9079 race condition in starting and ending condensing thread for indirect vdevs
[unleashed.git] / usr / src / uts / common / fs / zfs / vdev_indirect.c
blob29e99993bb731f95798153a6e7c0cd554e0c450a
1 /*
2 * CDDL HEADER START
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
13 * CDDL HEADER END
17 * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
20 #include <sys/zfs_context.h>
21 #include <sys/spa.h>
22 #include <sys/spa_impl.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/fs/zfs.h>
25 #include <sys/zio.h>
26 #include <sys/metaslab.h>
27 #include <sys/refcount.h>
28 #include <sys/dmu.h>
29 #include <sys/vdev_indirect_mapping.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dsl_synctask.h>
32 #include <sys/zap.h>
33 #include <sys/abd.h>
34 #include <sys/zthr.h>
37 * An indirect vdev corresponds to a vdev that has been removed. Since
38 * we cannot rewrite block pointers of snapshots, etc., we keep a
39 * mapping from old location on the removed device to the new location
40 * on another device in the pool and use this mapping whenever we need
41 * to access the DVA. Unfortunately, this mapping did not respect
42 * logical block boundaries when it was first created, and so a DVA on
43 * this indirect vdev may be "split" into multiple sections that each
44 * map to a different location. As a consequence, not all DVAs can be
45 * translated to an equivalent new DVA. Instead we must provide a
46 * "vdev_remap" operation that executes a callback on each contiguous
47 * segment of the new location. This function is used in multiple ways:
49 * - reads and repair writes to this device use the callback to create
50 * a child io for each mapped segment.
52 * - frees and claims to this device use the callback to free or claim
53 * each mapped segment. (Note that we don't actually need to claim
54 * log blocks on indirect vdevs, because we don't allocate to
55 * removing vdevs. However, zdb uses zio_claim() for its leak
56 * detection.)
60 * "Big theory statement" for how we mark blocks obsolete.
62 * When a block on an indirect vdev is freed or remapped, a section of
63 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
64 * keep track of how much of each mapping entry is obsolete. When
65 * an entry becomes completely obsolete, we can remove it, thus reducing
66 * the memory used by the mapping. The complete picture of obsolescence
67 * is given by the following data structures, described below:
68 * - the entry-specific obsolete count
69 * - the vdev-specific obsolete spacemap
70 * - the pool-specific obsolete bpobj
72 * == On disk data structures used ==
74 * We track the obsolete space for the pool using several objects. Each
75 * of these objects is created on demand and freed when no longer
76 * needed, and is assumed to be empty if it does not exist.
77 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
79 * - Each vic_mapping_object (associated with an indirect vdev) can
80 * have a vimp_counts_object. This is an array of uint32_t's
81 * with the same number of entries as the vic_mapping_object. When
82 * the mapping is condensed, entries from the vic_obsolete_sm_object
83 * (see below) are folded into the counts. Therefore, each
84 * obsolete_counts entry tells us the number of bytes in the
85 * corresponding mapping entry that were not referenced when the
86 * mapping was last condensed.
88 * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
89 * This is a space map containing an alloc entry for every DVA that
90 * has been obsoleted since the last time this indirect vdev was
91 * condensed. We use this object in order to improve performance
92 * when marking a DVA as obsolete. Instead of modifying an arbitrary
93 * offset of the vimp_counts_object, we only need to append an entry
94 * to the end of this object. When a DVA becomes obsolete, it is
95 * added to the obsolete space map. This happens when the DVA is
96 * freed, remapped and not referenced by a snapshot, or the last
97 * snapshot referencing it is destroyed.
99 * - Each dataset can have a ds_remap_deadlist object. This is a
100 * deadlist object containing all blocks that were remapped in this
101 * dataset but referenced in a previous snapshot. Blocks can *only*
102 * appear on this list if they were remapped (dsl_dataset_block_remapped);
103 * blocks that were killed in a head dataset are put on the normal
104 * ds_deadlist and marked obsolete when they are freed.
106 * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
107 * in the pool that need to be marked obsolete. When a snapshot is
108 * destroyed, we move some of the ds_remap_deadlist to the obsolete
109 * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
110 * asynchronously process the obsolete bpobj, moving its entries to
111 * the specific vdevs' obsolete space maps.
113 * == Summary of how we mark blocks as obsolete ==
115 * - When freeing a block: if any DVA is on an indirect vdev, append to
116 * vic_obsolete_sm_object.
117 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
118 * references; otherwise append to vic_obsolete_sm_object).
119 * - When freeing a snapshot: move parts of ds_remap_deadlist to
120 * dp_obsolete_bpobj (same algorithm as ds_deadlist).
121 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
122 * individual vdev's vic_obsolete_sm_object.
126 * "Big theory statement" for how we condense indirect vdevs.
128 * Condensing an indirect vdev's mapping is the process of determining
129 * the precise counts of obsolete space for each mapping entry (by
130 * integrating the obsolete spacemap into the obsolete counts) and
131 * writing out a new mapping that contains only referenced entries.
133 * We condense a vdev when we expect the mapping to shrink (see
134 * vdev_indirect_should_condense()), but only perform one condense at a
135 * time to limit the memory usage. In addition, we use a separate
136 * open-context thread (spa_condense_indirect_thread) to incrementally
137 * create the new mapping object in a way that minimizes the impact on
138 * the rest of the system.
140 * == Generating a new mapping ==
142 * To generate a new mapping, we follow these steps:
144 * 1. Save the old obsolete space map and create a new mapping object
145 * (see spa_condense_indirect_start_sync()). This initializes the
146 * spa_condensing_indirect_phys with the "previous obsolete space map",
147 * which is now read only. Newly obsolete DVAs will be added to a
148 * new (initially empty) obsolete space map, and will not be
149 * considered as part of this condense operation.
151 * 2. Construct in memory the precise counts of obsolete space for each
152 * mapping entry, by incorporating the obsolete space map into the
153 * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
155 * 3. Iterate through each mapping entry, writing to the new mapping any
156 * entries that are not completely obsolete (i.e. which don't have
157 * obsolete count == mapping length). (See
158 * spa_condense_indirect_generate_new_mapping().)
160 * 4. Destroy the old mapping object and switch over to the new one
161 * (spa_condense_indirect_complete_sync).
163 * == Restarting from failure ==
165 * To restart the condense when we import/open the pool, we must start
166 * at the 2nd step above: reconstruct the precise counts in memory,
167 * based on the space map + counts. Then in the 3rd step, we start
168 * iterating where we left off: at vimp_max_offset of the new mapping
169 * object.
172 boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
175 * Condense if at least this percent of the bytes in the mapping is
176 * obsolete. With the default of 25%, the amount of space mapped
177 * will be reduced to 1% of its original size after at most 16
178 * condenses. Higher values will condense less often (causing less
179 * i/o); lower values will reduce the mapping size more quickly.
181 int zfs_indirect_condense_obsolete_pct = 25;
184 * Condense if the obsolete space map takes up more than this amount of
185 * space on disk (logically). This limits the amount of disk space
186 * consumed by the obsolete space map; the default of 1GB is small enough
187 * that we typically don't mind "wasting" it.
189 uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
192 * Don't bother condensing if the mapping uses less than this amount of
193 * memory. The default of 128KB is considered a "trivial" amount of
194 * memory and not worth reducing.
196 uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
199 * This is used by the test suite so that it can ensure that certain
200 * actions happen while in the middle of a condense (which might otherwise
201 * complete too quickly). If used to reduce the performance impact of
202 * condensing in production, a maximum value of 1 should be sufficient.
204 int zfs_condense_indirect_commit_entry_delay_ticks = 0;
207 * Mark the given offset and size as being obsolete in the given txg.
209 void
210 vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
211 uint64_t txg)
213 spa_t *spa = vd->vdev_spa;
214 ASSERT3U(spa_syncing_txg(spa), ==, txg);
215 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
216 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
217 ASSERT(size > 0);
218 VERIFY(vdev_indirect_mapping_entry_for_offset(
219 vd->vdev_indirect_mapping, offset) != NULL);
221 if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
222 mutex_enter(&vd->vdev_obsolete_lock);
223 range_tree_add(vd->vdev_obsolete_segments, offset, size);
224 mutex_exit(&vd->vdev_obsolete_lock);
225 vdev_dirty(vd, 0, NULL, txg);
230 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
231 * wrapper is provided because the DMU does not know about vdev_t's and
232 * cannot directly call vdev_indirect_mark_obsolete.
234 void
235 spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
236 uint64_t size, dmu_tx_t *tx)
238 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
239 ASSERT(dmu_tx_is_syncing(tx));
241 /* The DMU can only remap indirect vdevs. */
242 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
243 vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
246 static spa_condensing_indirect_t *
247 spa_condensing_indirect_create(spa_t *spa)
249 spa_condensing_indirect_phys_t *scip =
250 &spa->spa_condensing_indirect_phys;
251 spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
252 objset_t *mos = spa->spa_meta_objset;
254 for (int i = 0; i < TXG_SIZE; i++) {
255 list_create(&sci->sci_new_mapping_entries[i],
256 sizeof (vdev_indirect_mapping_entry_t),
257 offsetof(vdev_indirect_mapping_entry_t, vime_node));
260 sci->sci_new_mapping =
261 vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
263 return (sci);
266 static void
267 spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
269 for (int i = 0; i < TXG_SIZE; i++)
270 list_destroy(&sci->sci_new_mapping_entries[i]);
272 if (sci->sci_new_mapping != NULL)
273 vdev_indirect_mapping_close(sci->sci_new_mapping);
275 kmem_free(sci, sizeof (*sci));
278 boolean_t
279 vdev_indirect_should_condense(vdev_t *vd)
281 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
282 spa_t *spa = vd->vdev_spa;
284 ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
286 if (!zfs_condense_indirect_vdevs_enable)
287 return (B_FALSE);
290 * We can only condense one indirect vdev at a time.
292 if (spa->spa_condensing_indirect != NULL)
293 return (B_FALSE);
295 if (spa_shutting_down(spa))
296 return (B_FALSE);
299 * The mapping object size must not change while we are
300 * condensing, so we can only condense indirect vdevs
301 * (not vdevs that are still in the middle of being removed).
303 if (vd->vdev_ops != &vdev_indirect_ops)
304 return (B_FALSE);
307 * If nothing new has been marked obsolete, there is no
308 * point in condensing.
310 if (vd->vdev_obsolete_sm == NULL) {
311 ASSERT0(vdev_obsolete_sm_object(vd));
312 return (B_FALSE);
315 ASSERT(vd->vdev_obsolete_sm != NULL);
317 ASSERT3U(vdev_obsolete_sm_object(vd), ==,
318 space_map_object(vd->vdev_obsolete_sm));
320 uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
321 uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
322 uint64_t mapping_size = vdev_indirect_mapping_size(vim);
323 uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
325 ASSERT3U(bytes_obsolete, <=, bytes_mapped);
328 * If a high percentage of the bytes that are mapped have become
329 * obsolete, condense (unless the mapping is already small enough).
330 * This has a good chance of reducing the amount of memory used
331 * by the mapping.
333 if (bytes_obsolete * 100 / bytes_mapped >=
334 zfs_indirect_condense_obsolete_pct &&
335 mapping_size > zfs_condense_min_mapping_bytes) {
336 zfs_dbgmsg("should condense vdev %llu because obsolete "
337 "spacemap covers %d%% of %lluMB mapping",
338 (u_longlong_t)vd->vdev_id,
339 (int)(bytes_obsolete * 100 / bytes_mapped),
340 (u_longlong_t)bytes_mapped / 1024 / 1024);
341 return (B_TRUE);
345 * If the obsolete space map takes up too much space on disk,
346 * condense in order to free up this disk space.
348 if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
349 zfs_dbgmsg("should condense vdev %llu because obsolete sm "
350 "length %lluMB >= max size %lluMB",
351 (u_longlong_t)vd->vdev_id,
352 (u_longlong_t)obsolete_sm_size / 1024 / 1024,
353 (u_longlong_t)zfs_condense_max_obsolete_bytes /
354 1024 / 1024);
355 return (B_TRUE);
358 return (B_FALSE);
362 * This sync task completes (finishes) a condense, deleting the old
363 * mapping and replacing it with the new one.
365 static void
366 spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
368 spa_condensing_indirect_t *sci = arg;
369 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
370 spa_condensing_indirect_phys_t *scip =
371 &spa->spa_condensing_indirect_phys;
372 vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
373 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
374 objset_t *mos = spa->spa_meta_objset;
375 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
376 uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
377 uint64_t new_count =
378 vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
380 ASSERT(dmu_tx_is_syncing(tx));
381 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
382 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
383 for (int i = 0; i < TXG_SIZE; i++) {
384 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
386 ASSERT(vic->vic_mapping_object != 0);
387 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
388 ASSERT(scip->scip_next_mapping_object != 0);
389 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
392 * Reset vdev_indirect_mapping to refer to the new object.
394 rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
395 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
396 vd->vdev_indirect_mapping = sci->sci_new_mapping;
397 rw_exit(&vd->vdev_indirect_rwlock);
399 sci->sci_new_mapping = NULL;
400 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
401 vic->vic_mapping_object = scip->scip_next_mapping_object;
402 scip->scip_next_mapping_object = 0;
404 space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
405 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
406 scip->scip_prev_obsolete_sm_object = 0;
408 scip->scip_vdev = 0;
410 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
411 DMU_POOL_CONDENSING_INDIRECT, tx));
412 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
413 spa->spa_condensing_indirect = NULL;
415 zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
416 "new mapping object %llu has %llu entries "
417 "(was %llu entries)",
418 vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
419 new_count, old_count);
421 vdev_config_dirty(spa->spa_root_vdev);
425 * This sync task appends entries to the new mapping object.
427 static void
428 spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
430 spa_condensing_indirect_t *sci = arg;
431 uint64_t txg = dmu_tx_get_txg(tx);
432 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
434 ASSERT(dmu_tx_is_syncing(tx));
435 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
437 vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
438 &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
439 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
443 * Open-context function to add one entry to the new mapping. The new
444 * entry will be remembered and written from syncing context.
446 static void
447 spa_condense_indirect_commit_entry(spa_t *spa,
448 vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
450 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
452 ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
454 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
455 dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
456 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
457 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
460 * If we are the first entry committed this txg, kick off the sync
461 * task to write to the MOS on our behalf.
463 if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
464 dsl_sync_task_nowait(dmu_tx_pool(tx),
465 spa_condense_indirect_commit_sync, sci,
466 0, ZFS_SPACE_CHECK_NONE, tx);
469 vdev_indirect_mapping_entry_t *vime =
470 kmem_alloc(sizeof (*vime), KM_SLEEP);
471 vime->vime_mapping = *vimep;
472 vime->vime_obsolete_count = count;
473 list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
475 dmu_tx_commit(tx);
478 static void
479 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
480 uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
482 spa_t *spa = vd->vdev_spa;
483 uint64_t mapi = start_index;
484 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
485 uint64_t old_num_entries =
486 vdev_indirect_mapping_num_entries(old_mapping);
488 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
489 ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
491 zfs_dbgmsg("starting condense of vdev %llu from index %llu",
492 (u_longlong_t)vd->vdev_id,
493 (u_longlong_t)mapi);
495 while (mapi < old_num_entries) {
497 if (zthr_iscancelled(zthr)) {
498 zfs_dbgmsg("pausing condense of vdev %llu "
499 "at index %llu", (u_longlong_t)vd->vdev_id,
500 (u_longlong_t)mapi);
501 break;
504 vdev_indirect_mapping_entry_phys_t *entry =
505 &old_mapping->vim_entries[mapi];
506 uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
507 ASSERT3U(obsolete_counts[mapi], <=, entry_size);
508 if (obsolete_counts[mapi] < entry_size) {
509 spa_condense_indirect_commit_entry(spa, entry,
510 obsolete_counts[mapi]);
513 * This delay may be requested for testing, debugging,
514 * or performance reasons.
516 delay(zfs_condense_indirect_commit_entry_delay_ticks);
519 mapi++;
523 /* ARGSUSED */
524 static boolean_t
525 spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
527 spa_t *spa = arg;
529 return (spa->spa_condensing_indirect != NULL);
532 /* ARGSUSED */
533 static int
534 spa_condense_indirect_thread(void *arg, zthr_t *zthr)
536 spa_t *spa = arg;
537 vdev_t *vd;
539 ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
540 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
541 vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
542 ASSERT3P(vd, !=, NULL);
543 spa_config_exit(spa, SCL_VDEV, FTAG);
545 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
546 spa_condensing_indirect_phys_t *scip =
547 &spa->spa_condensing_indirect_phys;
548 uint32_t *counts;
549 uint64_t start_index;
550 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
551 space_map_t *prev_obsolete_sm = NULL;
553 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
554 ASSERT(scip->scip_next_mapping_object != 0);
555 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
556 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
558 for (int i = 0; i < TXG_SIZE; i++) {
560 * The list must start out empty in order for the
561 * _commit_sync() sync task to be properly registered
562 * on the first call to _commit_entry(); so it's wise
563 * to double check and ensure we actually are starting
564 * with empty lists.
566 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
569 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
570 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
571 space_map_update(prev_obsolete_sm);
572 counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
573 if (prev_obsolete_sm != NULL) {
574 vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
575 counts, prev_obsolete_sm);
577 space_map_close(prev_obsolete_sm);
580 * Generate new mapping. Determine what index to continue from
581 * based on the max offset that we've already written in the
582 * new mapping.
584 uint64_t max_offset =
585 vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
586 if (max_offset == 0) {
587 /* We haven't written anything to the new mapping yet. */
588 start_index = 0;
589 } else {
591 * Pick up from where we left off. _entry_for_offset()
592 * returns a pointer into the vim_entries array. If
593 * max_offset is greater than any of the mappings
594 * contained in the table NULL will be returned and
595 * that indicates we've exhausted our iteration of the
596 * old_mapping.
599 vdev_indirect_mapping_entry_phys_t *entry =
600 vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
601 max_offset);
603 if (entry == NULL) {
605 * We've already written the whole new mapping.
606 * This special value will cause us to skip the
607 * generate_new_mapping step and just do the sync
608 * task to complete the condense.
610 start_index = UINT64_MAX;
611 } else {
612 start_index = entry - old_mapping->vim_entries;
613 ASSERT3U(start_index, <,
614 vdev_indirect_mapping_num_entries(old_mapping));
618 spa_condense_indirect_generate_new_mapping(vd, counts,
619 start_index, zthr);
621 vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
624 * If the zthr has received a cancellation signal while running
625 * in generate_new_mapping() or at any point after that, then bail
626 * early. We don't want to complete the condense if the spa is
627 * shutting down.
629 if (zthr_iscancelled(zthr))
630 return (0);
632 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
633 spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
635 return (0);
639 * Sync task to begin the condensing process.
641 void
642 spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
644 spa_t *spa = vd->vdev_spa;
645 spa_condensing_indirect_phys_t *scip =
646 &spa->spa_condensing_indirect_phys;
648 ASSERT0(scip->scip_next_mapping_object);
649 ASSERT0(scip->scip_prev_obsolete_sm_object);
650 ASSERT0(scip->scip_vdev);
651 ASSERT(dmu_tx_is_syncing(tx));
652 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
653 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
654 ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
656 uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
657 ASSERT(obsolete_sm_obj != 0);
659 scip->scip_vdev = vd->vdev_id;
660 scip->scip_next_mapping_object =
661 vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
663 scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
666 * We don't need to allocate a new space map object, since
667 * vdev_indirect_sync_obsolete will allocate one when needed.
669 space_map_close(vd->vdev_obsolete_sm);
670 vd->vdev_obsolete_sm = NULL;
671 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
672 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
674 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
675 DMU_POOL_DIRECTORY_OBJECT,
676 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
677 sizeof (*scip) / sizeof (uint64_t), scip, tx));
679 ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
680 spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
682 zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
683 "posm=%llu nm=%llu",
684 vd->vdev_id, dmu_tx_get_txg(tx),
685 (u_longlong_t)scip->scip_prev_obsolete_sm_object,
686 (u_longlong_t)scip->scip_next_mapping_object);
688 zthr_wakeup(spa->spa_condense_zthr);
692 * Sync to the given vdev's obsolete space map any segments that are no longer
693 * referenced as of the given txg.
695 * If the obsolete space map doesn't exist yet, create and open it.
697 void
698 vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
700 spa_t *spa = vd->vdev_spa;
701 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
703 ASSERT3U(vic->vic_mapping_object, !=, 0);
704 ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
705 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
706 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
708 if (vdev_obsolete_sm_object(vd) == 0) {
709 uint64_t obsolete_sm_object =
710 space_map_alloc(spa->spa_meta_objset, tx);
712 ASSERT(vd->vdev_top_zap != 0);
713 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
714 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
715 sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
716 ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
718 spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
719 VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
720 spa->spa_meta_objset, obsolete_sm_object,
721 0, vd->vdev_asize, 0));
722 space_map_update(vd->vdev_obsolete_sm);
725 ASSERT(vd->vdev_obsolete_sm != NULL);
726 ASSERT3U(vdev_obsolete_sm_object(vd), ==,
727 space_map_object(vd->vdev_obsolete_sm));
729 space_map_write(vd->vdev_obsolete_sm,
730 vd->vdev_obsolete_segments, SM_ALLOC, tx);
731 space_map_update(vd->vdev_obsolete_sm);
732 range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
736 spa_condense_init(spa_t *spa)
738 int error = zap_lookup(spa->spa_meta_objset,
739 DMU_POOL_DIRECTORY_OBJECT,
740 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
741 sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
742 &spa->spa_condensing_indirect_phys);
743 if (error == 0) {
744 if (spa_writeable(spa)) {
745 spa->spa_condensing_indirect =
746 spa_condensing_indirect_create(spa);
748 return (0);
749 } else if (error == ENOENT) {
750 return (0);
751 } else {
752 return (error);
756 void
757 spa_condense_fini(spa_t *spa)
759 if (spa->spa_condensing_indirect != NULL) {
760 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
761 spa->spa_condensing_indirect = NULL;
765 void
766 spa_start_indirect_condensing_thread(spa_t *spa)
768 ASSERT3P(spa->spa_condense_zthr, ==, NULL);
769 spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
770 spa_condense_indirect_thread, spa);
774 * Gets the obsolete spacemap object from the vdev's ZAP.
775 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
776 * exist yet.
779 vdev_obsolete_sm_object(vdev_t *vd)
781 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
782 if (vd->vdev_top_zap == 0) {
783 return (0);
786 uint64_t sm_obj = 0;
787 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
788 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
790 ASSERT(err == 0 || err == ENOENT);
792 return (sm_obj);
795 boolean_t
796 vdev_obsolete_counts_are_precise(vdev_t *vd)
798 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
799 if (vd->vdev_top_zap == 0) {
800 return (B_FALSE);
803 uint64_t val = 0;
804 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
805 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
807 ASSERT(err == 0 || err == ENOENT);
809 return (val != 0);
812 /* ARGSUSED */
813 static void
814 vdev_indirect_close(vdev_t *vd)
818 /* ARGSUSED */
819 static void
820 vdev_indirect_io_done(zio_t *zio)
824 /* ARGSUSED */
825 static int
826 vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
827 uint64_t *ashift)
829 *psize = *max_psize = vd->vdev_asize +
830 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
831 *ashift = vd->vdev_ashift;
832 return (0);
835 typedef struct remap_segment {
836 vdev_t *rs_vd;
837 uint64_t rs_offset;
838 uint64_t rs_asize;
839 uint64_t rs_split_offset;
840 list_node_t rs_node;
841 } remap_segment_t;
843 remap_segment_t *
844 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
846 remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
847 rs->rs_vd = vd;
848 rs->rs_offset = offset;
849 rs->rs_asize = asize;
850 rs->rs_split_offset = split_offset;
851 return (rs);
855 * Goes through the relevant indirect mappings until it hits a concrete vdev
856 * and issues the callback. On the way to the concrete vdev, if any other
857 * indirect vdevs are encountered, then the callback will also be called on
858 * each of those indirect vdevs. For example, if the segment is mapped to
859 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
860 * mapped to segment B on concrete vdev 2, then the callback will be called on
861 * both vdev 1 and vdev 2.
863 * While the callback passed to vdev_indirect_remap() is called on every vdev
864 * the function encounters, certain callbacks only care about concrete vdevs.
865 * These types of callbacks should return immediately and explicitly when they
866 * are called on an indirect vdev.
868 * Because there is a possibility that a DVA section in the indirect device
869 * has been split into multiple sections in our mapping, we keep track
870 * of the relevant contiguous segments of the new location (remap_segment_t)
871 * in a stack. This way we can call the callback for each of the new sections
872 * created by a single section of the indirect device. Note though, that in
873 * this scenario the callbacks in each split block won't occur in-order in
874 * terms of offset, so callers should not make any assumptions about that.
876 * For callbacks that don't handle split blocks and immediately return when
877 * they encounter them (as is the case for remap_blkptr_cb), the caller can
878 * assume that its callback will be applied from the first indirect vdev
879 * encountered to the last one and then the concrete vdev, in that order.
881 static void
882 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
883 void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
885 list_t stack;
886 spa_t *spa = vd->vdev_spa;
888 list_create(&stack, sizeof (remap_segment_t),
889 offsetof(remap_segment_t, rs_node));
891 for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
892 rs != NULL; rs = list_remove_head(&stack)) {
893 vdev_t *v = rs->rs_vd;
896 * Note: this can be called from open context
897 * (eg. zio_read()), so we need the rwlock to prevent
898 * the mapping from being changed by condensing.
900 rw_enter(&v->vdev_indirect_rwlock, RW_READER);
901 vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
902 ASSERT3P(vim, !=, NULL);
904 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
905 ASSERT(rs->rs_asize > 0);
907 vdev_indirect_mapping_entry_phys_t *mapping =
908 vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
909 ASSERT3P(mapping, !=, NULL);
911 while (rs->rs_asize > 0) {
913 * Note: the vdev_indirect_mapping can not change
914 * while we are running. It only changes while the
915 * removal is in progress, and then only from syncing
916 * context. While a removal is in progress, this
917 * function is only called for frees, which also only
918 * happen from syncing context.
921 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
922 uint64_t dst_offset =
923 DVA_GET_OFFSET(&mapping->vimep_dst);
924 uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);
926 ASSERT3U(rs->rs_offset, >=,
927 DVA_MAPPING_GET_SRC_OFFSET(mapping));
928 ASSERT3U(rs->rs_offset, <,
929 DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
930 ASSERT3U(dst_vdev, !=, v->vdev_id);
932 uint64_t inner_offset = rs->rs_offset -
933 DVA_MAPPING_GET_SRC_OFFSET(mapping);
934 uint64_t inner_size =
935 MIN(rs->rs_asize, size - inner_offset);
937 vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
938 ASSERT3P(dst_v, !=, NULL);
940 if (dst_v->vdev_ops == &vdev_indirect_ops) {
941 list_insert_head(&stack,
942 rs_alloc(dst_v, dst_offset + inner_offset,
943 inner_size, rs->rs_split_offset));
947 if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
948 IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
950 * Note: This clause exists only solely for
951 * testing purposes. We use it to ensure that
952 * split blocks work and that the callbacks
953 * using them yield the same result if issued
954 * in reverse order.
956 uint64_t inner_half = inner_size / 2;
958 func(rs->rs_split_offset + inner_half, dst_v,
959 dst_offset + inner_offset + inner_half,
960 inner_half, arg);
962 func(rs->rs_split_offset, dst_v,
963 dst_offset + inner_offset,
964 inner_half, arg);
965 } else {
966 func(rs->rs_split_offset, dst_v,
967 dst_offset + inner_offset,
968 inner_size, arg);
971 rs->rs_offset += inner_size;
972 rs->rs_asize -= inner_size;
973 rs->rs_split_offset += inner_size;
974 mapping++;
977 rw_exit(&v->vdev_indirect_rwlock);
978 kmem_free(rs, sizeof (remap_segment_t));
980 list_destroy(&stack);
983 static void
984 vdev_indirect_child_io_done(zio_t *zio)
986 zio_t *pio = zio->io_private;
988 mutex_enter(&pio->io_lock);
989 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
990 mutex_exit(&pio->io_lock);
992 abd_put(zio->io_abd);
995 static void
996 vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
997 uint64_t size, void *arg)
999 zio_t *zio = arg;
1001 ASSERT3P(vd, !=, NULL);
1003 if (vd->vdev_ops == &vdev_indirect_ops)
1004 return;
1006 zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
1007 abd_get_offset(zio->io_abd, split_offset),
1008 size, zio->io_type, zio->io_priority,
1009 0, vdev_indirect_child_io_done, zio));
1012 static void
1013 vdev_indirect_io_start(zio_t *zio)
1015 spa_t *spa = zio->io_spa;
1017 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1018 if (zio->io_type != ZIO_TYPE_READ) {
1019 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
1020 ASSERT((zio->io_flags &
1021 (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
1024 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
1025 vdev_indirect_io_start_cb, zio);
1027 zio_execute(zio);
1030 vdev_ops_t vdev_indirect_ops = {
1031 vdev_indirect_open,
1032 vdev_indirect_close,
1033 vdev_default_asize,
1034 vdev_indirect_io_start,
1035 vdev_indirect_io_done,
1036 NULL,
1037 NULL,
1038 NULL,
1039 vdev_indirect_remap,
1040 VDEV_TYPE_INDIRECT, /* name of this vdev type */
1041 B_FALSE /* leaf vdev */