/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;
/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
 */
int metaslab_smo_bonus_pct = 150;
/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}
/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */
static int
metaslab_segsize_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

static void
metaslab_pp_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_pp_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}

/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_pp_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}
/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}
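/*
 * Note on the cursor lookup above: size & -size isolates the lowest set bit
 * of the request, so a 12K (0x3000) allocation, for example, uses the
 * 4K-alignment cursor in sm_ppd. Each power-of-two alignment therefore
 * advances its own first-fit cursor independently through the map.
 */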
/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
	return (B_TRUE);
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ff_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ff_fragmented
};
/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}
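/*
 * For example, with metaslab_df_free_pct == 4 a 10GB metaslab keeps using
 * first-fit until less than roughly 400MB remains free, or until no
 * contiguous run of metaslab_df_alloc_threshold bytes is left, whichever
 * happens first; from then on requests fall through to the size-sorted
 * (best-fit) tree.
 */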
static boolean_t
metaslab_df_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	if (max_size >= metaslab_df_alloc_threshold &&
	    free_pct >= metaslab_df_free_pct)
		return (B_FALSE);

	return (B_TRUE);
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_df_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_df_fragmented
};
/*
 * ==========================================================================
 * Other experimental allocators
 * ==========================================================================
 */
static uint64_t
metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	uint64_t rsize = size;
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ASSERT3U(*extent_end, >=, *cursor);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if ((*cursor + size) > *extent_end) {
		t = sm->sm_pp_root;
		*cursor = *extent_end = 0;

		if (max_size > 2 * SPA_MAXBLOCKSIZE)
			rsize = MIN(metaslab_min_alloc_size, max_size);
		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
		if (offset != -1)
			*cursor = offset + size;
	} else {
		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
	}
	ASSERT3U(*cursor, <=, *extent_end);
	return (offset);
}

static boolean_t
metaslab_cdf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size * 10))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_cdf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_cdf_fragmented
};
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	avl_index_t where;
	space_seg_t *ss, ssearch;
	uint64_t hbit = highbit(size);
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
		t = sm->sm_pp_root;

		ssearch.ss_start = 0;
		ssearch.ss_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		ss = avl_find(t, &ssearch, &where);
		if (ss == NULL)
			ss = avl_nearest(t, where, AVL_AFTER);
		ASSERT(ss != NULL);
	}

	if (ss != NULL) {
		if (ss->ss_start + size <= ss->ss_end) {
			*cursor = ss->ss_start + size;
			return (ss->ss_start);
		}
	}
	return (-1ULL);
}

static boolean_t
metaslab_ndf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	space_map_create(&msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	if (metaslab_debug && smo->smo_object != 0) {
		mutex_enter(&msp->ms_lock);
		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
		mutex_exit(&msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}
void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd,
	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(&msp->ms_map);
	space_map_destroy(&msp->ms_map);

	for (int t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(&msp->ms_allocmap[t]);
		space_map_destroy(&msp->ms_freemap[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		space_map_destroy(&msp->ms_defermap[t]);

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}
#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);
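	/*
	 * For example, the metaslab at the lowest offset keeps weight ==
	 * 2 * space, one halfway up the vdev gets roughly 1.5 * space, and
	 * the highest one gets just over 1 * space, so the outer (faster)
	 * regions sort ahead when free space is equal.
	 */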
	/*
	 * For locality, assign higher weight to metaslabs which have
	 * a lower offset than what we've already activated.
	 */
	if (sm->sm_start <= mg->mg_bonus_area)
		weight *= (metaslab_smo_bonus_pct / 100);
	ASSERT(weight >= space &&
	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);

	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
		/*
		 * If this metaslab is one we're actively using, adjust its
		 * weight to make it preferable to any inactive metaslab so
		 * we'll polish it off.
		 */
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}
	return (weight);
}
static void
metaslab_prefetch(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m;

	mutex_enter(&mg->mg_lock);

	/*
	 * Prefetch the next potential metaslabs
	 */
	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
		space_map_t *sm = &msp->ms_map;
		space_map_obj_t *smo = &msp->ms_smo;

		/* If we have reached our prefetch limit then we're done */
		if (m >= metaslab_prefetch_limit)
			break;

		if (!sm->sm_loaded && smo->smo_object != 0) {
			mutex_exit(&mg->mg_lock);
			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
			    0ULL, smo->smo_objsize);
			mutex_enter(&mg->mg_lock);
		}
	}
	mutex_exit(&mg->mg_lock);
}
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		space_map_load_wait(sm);
		if (!sm->sm_loaded) {
			space_map_obj_t *smo = &msp->ms_smo;

			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				space_map_walk(&msp->ms_defermap[t],
				    space_map_claim, sm);
		}

		/*
		 * Track the bonus area as we activate new metaslabs.
		 */
		if (sm->sm_start > mg->mg_bonus_area) {
			mutex_enter(&mg->mg_lock);
			mg->mg_bonus_area = sm->sm_start;
			mutex_exit(&mg->mg_lock);
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}
static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again. In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
		return;
	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map. No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */
	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus deferred frees (ms_defermap[]),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_walk(&msp->ms_defermap[t],
			    space_map_remove, allocmap);

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (int t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_create(&msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = freed_map->sm_space - defer_map->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add defer_map (oldest deferred frees) to this map and
	 * transfer freed_map (this txg's frees) to defer_map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);

	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced. (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	int64_t failures = mg->mg_alloc_failures;

	/*
	 * Re-evaluate all metaslabs which have lower offsets than the
	 * bonus area.
	 */
	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_map.sm_start > mg->mg_bonus_area)
			break;

		mutex_enter(&msp->ms_lock);
		metaslab_group_sort(mg, msp, metaslab_weight(msp));
		mutex_exit(&msp->ms_lock);
	}

	atomic_add_64(&mg->mg_alloc_failures, -failures);

	/*
	 * Prefetch the next potential metaslabs
	 */
	metaslab_prefetch(mg);
}
static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map.sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "failures %llu, weight %llu",
				    spa_name(spa), mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize,
				    mg->mg_alloc_failures, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}
			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		/*
		 * If we've already reached the allowable number of failed
		 * allocation attempts on this metaslab group then we
		 * consider skipping it. We skip it only if we're allowed
		 * to "fast" gang, the physical size is larger than
		 * a gang block, and we're attempting to allocate from
		 * the primary metaslab.
		 */
		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			spa_dbgmsg(spa, "%s: skipping metaslab group: "
			    "vdev %llu, txg %llu, mg %p, psize %llu, "
			    "asize %llu, failures %llu", spa_name(spa),
			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
			    mg->mg_alloc_failures);
			return (-1ULL);
		}

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
			break;

		atomic_inc_64(&mg->mg_alloc_failures);

		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);

	mutex_exit(&msp->ms_lock);

	return (offset);
}
/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly. It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs. If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible. If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about. Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data. With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header. That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (ENOSPC);
}
/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(&msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}
/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error = 0;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (ENXIO);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
		error = ENOENT;

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(&msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (ENOSPC);
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}
void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}
int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}