Merge commit '7e3488dc6cdcb0c04e1ce167a1a3bfef83b5f2e0'
[unleashed.git] / kernel / fs / zfs / bpobj.c
blobbbdd765214fcbfba501850fa742d2d89d84f7509
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Integros [integros.com]
25 * Copyright (c) 2017 Datto Inc.
28 #include <sys/bpobj.h>
29 #include <sys/zfs_context.h>
30 #include <sys/refcount.h>
31 #include <sys/dsl_pool.h>
32 #include <sys/zfeature.h>
33 #include <sys/zap.h>
36 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
38 uint64_t
39 bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
41 spa_t *spa = dmu_objset_spa(os);
42 dsl_pool_t *dp = dmu_objset_pool(os);
44 if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
45 if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
46 ASSERT0(dp->dp_empty_bpobj);
47 dp->dp_empty_bpobj =
48 bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
49 VERIFY(zap_add(os,
50 DMU_POOL_DIRECTORY_OBJECT,
51 DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
52 &dp->dp_empty_bpobj, tx) == 0);
54 spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
55 ASSERT(dp->dp_empty_bpobj != 0);
56 return (dp->dp_empty_bpobj);
57 } else {
58 return (bpobj_alloc(os, blocksize, tx));
62 void
63 bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
65 dsl_pool_t *dp = dmu_objset_pool(os);
67 spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
68 if (!spa_feature_is_active(dmu_objset_spa(os),
69 SPA_FEATURE_EMPTY_BPOBJ)) {
70 VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
71 DMU_POOL_DIRECTORY_OBJECT,
72 DMU_POOL_EMPTY_BPOBJ, tx));
73 VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
74 dp->dp_empty_bpobj = 0;
78 uint64_t
79 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
81 int size;
83 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
84 size = BPOBJ_SIZE_V0;
85 else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
86 size = BPOBJ_SIZE_V1;
87 else
88 size = sizeof (bpobj_phys_t);
90 return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
91 DMU_OT_BPOBJ_HDR, size, tx));
94 void
95 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
97 int64_t i;
98 bpobj_t bpo;
99 dmu_object_info_t doi;
100 int epb;
101 dmu_buf_t *dbuf = NULL;
103 ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
104 VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
106 mutex_enter(&bpo.bpo_lock);
108 if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
109 goto out;
111 VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
112 epb = doi.doi_data_block_size / sizeof (uint64_t);
114 for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
115 uint64_t *objarray;
116 uint64_t offset, blkoff;
118 offset = i * sizeof (uint64_t);
119 blkoff = P2PHASE(i, epb);
121 if (dbuf == NULL || dbuf->db_offset > offset) {
122 if (dbuf)
123 dmu_buf_rele(dbuf, FTAG);
124 VERIFY3U(0, ==, dmu_buf_hold(os,
125 bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
128 ASSERT3U(offset, >=, dbuf->db_offset);
129 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
131 objarray = dbuf->db_data;
132 bpobj_free(os, objarray[blkoff], tx);
134 if (dbuf) {
135 dmu_buf_rele(dbuf, FTAG);
136 dbuf = NULL;
138 VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
140 out:
141 mutex_exit(&bpo.bpo_lock);
142 bpobj_close(&bpo);
144 VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
148 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
150 dmu_object_info_t doi;
151 int err;
153 err = dmu_object_info(os, object, &doi);
154 if (err)
155 return (err);
157 bzero(bpo, sizeof (*bpo));
158 mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
160 ASSERT(bpo->bpo_dbuf == NULL);
161 ASSERT(bpo->bpo_phys == NULL);
162 ASSERT(object != 0);
163 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
164 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
166 err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
167 if (err)
168 return (err);
170 bpo->bpo_os = os;
171 bpo->bpo_object = object;
172 bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
173 bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
174 bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
175 bpo->bpo_phys = bpo->bpo_dbuf->db_data;
176 return (0);
179 boolean_t
180 bpobj_is_open(const bpobj_t *bpo)
182 return (bpo->bpo_object != 0);
185 void
186 bpobj_close(bpobj_t *bpo)
188 /* Lame workaround for closing a bpobj that was never opened. */
189 if (bpo->bpo_object == 0)
190 return;
192 dmu_buf_rele(bpo->bpo_dbuf, bpo);
193 if (bpo->bpo_cached_dbuf != NULL)
194 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
195 bpo->bpo_dbuf = NULL;
196 bpo->bpo_phys = NULL;
197 bpo->bpo_cached_dbuf = NULL;
198 bpo->bpo_object = 0;
200 mutex_destroy(&bpo->bpo_lock);
203 boolean_t
204 bpobj_is_empty(bpobj_t *bpo)
206 return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
207 (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
210 static int
211 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
212 boolean_t free)
214 dmu_object_info_t doi;
215 int epb;
216 int64_t i;
217 int err = 0;
218 dmu_buf_t *dbuf = NULL;
220 ASSERT(bpobj_is_open(bpo));
221 mutex_enter(&bpo->bpo_lock);
223 if (free)
224 dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
226 for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
227 blkptr_t *bparray;
228 blkptr_t *bp;
229 uint64_t offset, blkoff;
231 offset = i * sizeof (blkptr_t);
232 blkoff = P2PHASE(i, bpo->bpo_epb);
234 if (dbuf == NULL || dbuf->db_offset > offset) {
235 if (dbuf)
236 dmu_buf_rele(dbuf, FTAG);
237 err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
238 FTAG, &dbuf, 0);
239 if (err)
240 break;
243 ASSERT3U(offset, >=, dbuf->db_offset);
244 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
246 bparray = dbuf->db_data;
247 bp = &bparray[blkoff];
248 err = func(arg, bp, tx);
249 if (err)
250 break;
251 if (free) {
252 bpo->bpo_phys->bpo_bytes -=
253 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
254 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
255 if (bpo->bpo_havecomp) {
256 bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
257 bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
259 bpo->bpo_phys->bpo_num_blkptrs--;
260 ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
263 if (dbuf) {
264 dmu_buf_rele(dbuf, FTAG);
265 dbuf = NULL;
267 if (free) {
268 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
269 (i + 1) * sizeof (blkptr_t), -1ULL, tx));
271 if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
272 goto out;
274 ASSERT(bpo->bpo_havecomp);
275 err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
276 if (err) {
277 mutex_exit(&bpo->bpo_lock);
278 return (err);
280 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
281 epb = doi.doi_data_block_size / sizeof (uint64_t);
283 for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
284 uint64_t *objarray;
285 uint64_t offset, blkoff;
286 bpobj_t sublist;
287 uint64_t used_before, comp_before, uncomp_before;
288 uint64_t used_after, comp_after, uncomp_after;
290 offset = i * sizeof (uint64_t);
291 blkoff = P2PHASE(i, epb);
293 if (dbuf == NULL || dbuf->db_offset > offset) {
294 if (dbuf)
295 dmu_buf_rele(dbuf, FTAG);
296 err = dmu_buf_hold(bpo->bpo_os,
297 bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
298 if (err)
299 break;
302 ASSERT3U(offset, >=, dbuf->db_offset);
303 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
305 objarray = dbuf->db_data;
306 err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
307 if (err)
308 break;
309 if (free) {
310 err = bpobj_space(&sublist,
311 &used_before, &comp_before, &uncomp_before);
312 if (err != 0) {
313 bpobj_close(&sublist);
314 break;
317 err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
318 if (free) {
319 VERIFY3U(0, ==, bpobj_space(&sublist,
320 &used_after, &comp_after, &uncomp_after));
321 bpo->bpo_phys->bpo_bytes -= used_before - used_after;
322 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
323 bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
324 bpo->bpo_phys->bpo_uncomp -=
325 uncomp_before - uncomp_after;
328 bpobj_close(&sublist);
329 if (err)
330 break;
331 if (free) {
332 err = dmu_object_free(bpo->bpo_os,
333 objarray[blkoff], tx);
334 if (err)
335 break;
336 bpo->bpo_phys->bpo_num_subobjs--;
337 ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
340 if (dbuf) {
341 dmu_buf_rele(dbuf, FTAG);
342 dbuf = NULL;
344 if (free) {
345 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
346 bpo->bpo_phys->bpo_subobjs,
347 (i + 1) * sizeof (uint64_t), -1ULL, tx));
350 out:
351 /* If there are no entries, there should be no bytes. */
352 if (bpobj_is_empty(bpo)) {
353 ASSERT0(bpo->bpo_phys->bpo_bytes);
354 ASSERT0(bpo->bpo_phys->bpo_comp);
355 ASSERT0(bpo->bpo_phys->bpo_uncomp);
358 mutex_exit(&bpo->bpo_lock);
359 return (err);
363 * Iterate and remove the entries. If func returns nonzero, iteration
364 * will stop and that entry will not be removed.
367 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
369 return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
373 * Iterate the entries. If func returns nonzero, iteration will stop.
376 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
378 return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
381 void
382 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
384 bpobj_t subbpo;
385 uint64_t used, comp, uncomp, subsubobjs;
387 ASSERT(bpobj_is_open(bpo));
388 ASSERT(subobj != 0);
389 ASSERT(bpo->bpo_havesubobj);
390 ASSERT(bpo->bpo_havecomp);
391 ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
393 if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
394 bpobj_decr_empty(bpo->bpo_os, tx);
395 return;
398 VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
399 VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
401 if (bpobj_is_empty(&subbpo)) {
402 /* No point in having an empty subobj. */
403 bpobj_close(&subbpo);
404 bpobj_free(bpo->bpo_os, subobj, tx);
405 return;
408 mutex_enter(&bpo->bpo_lock);
409 dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
410 if (bpo->bpo_phys->bpo_subobjs == 0) {
411 bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
412 DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
413 DMU_OT_NONE, 0, tx);
416 dmu_object_info_t doi;
417 ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
418 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
420 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
421 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
422 sizeof (subobj), &subobj, tx);
423 bpo->bpo_phys->bpo_num_subobjs++;
426 * If subobj has only one block of subobjs, then move subobj's
427 * subobjs to bpo's subobj list directly. This reduces
428 * recursion in bpobj_iterate due to nested subobjs.
430 subsubobjs = subbpo.bpo_phys->bpo_subobjs;
431 if (subsubobjs != 0) {
432 dmu_object_info_t doi;
434 VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
435 if (doi.doi_max_offset == doi.doi_data_block_size) {
436 dmu_buf_t *subdb;
437 uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
439 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
440 0, FTAG, &subdb, 0));
442 * Make sure that we are not asking dmu_write()
443 * to write more data than we have in our buffer.
445 VERIFY3U(subdb->db_size, >=,
446 numsubsub * sizeof (subobj));
447 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
448 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
449 numsubsub * sizeof (subobj), subdb->db_data, tx);
450 dmu_buf_rele(subdb, FTAG);
451 bpo->bpo_phys->bpo_num_subobjs += numsubsub;
453 dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
454 subbpo.bpo_phys->bpo_subobjs = 0;
455 VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
456 subsubobjs, tx));
459 bpo->bpo_phys->bpo_bytes += used;
460 bpo->bpo_phys->bpo_comp += comp;
461 bpo->bpo_phys->bpo_uncomp += uncomp;
462 mutex_exit(&bpo->bpo_lock);
464 bpobj_close(&subbpo);
467 void
468 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
470 blkptr_t stored_bp = *bp;
471 uint64_t offset;
472 int blkoff;
473 blkptr_t *bparray;
475 ASSERT(bpobj_is_open(bpo));
476 ASSERT(!BP_IS_HOLE(bp));
477 ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
479 if (BP_IS_EMBEDDED(bp)) {
481 * The bpobj will compress better without the payload.
483 * Note that we store EMBEDDED bp's because they have an
484 * uncompressed size, which must be accounted for. An
485 * alternative would be to add their size to bpo_uncomp
486 * without storing the bp, but that would create additional
487 * complications: bpo_uncomp would be inconsistent with the
488 * set of BP's stored, and bpobj_iterate() wouldn't visit
489 * all the space accounted for in the bpobj.
491 bzero(&stored_bp, sizeof (stored_bp));
492 stored_bp.blk_prop = bp->blk_prop;
493 stored_bp.blk_birth = bp->blk_birth;
494 } else if (!BP_GET_DEDUP(bp)) {
495 /* The bpobj will compress better without the checksum */
496 bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
499 /* We never need the fill count. */
500 stored_bp.blk_fill = 0;
502 mutex_enter(&bpo->bpo_lock);
504 offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
505 blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
507 if (bpo->bpo_cached_dbuf == NULL ||
508 offset < bpo->bpo_cached_dbuf->db_offset ||
509 offset >= bpo->bpo_cached_dbuf->db_offset +
510 bpo->bpo_cached_dbuf->db_size) {
511 if (bpo->bpo_cached_dbuf)
512 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
513 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
514 offset, bpo, &bpo->bpo_cached_dbuf, 0));
517 dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
518 bparray = bpo->bpo_cached_dbuf->db_data;
519 bparray[blkoff] = stored_bp;
521 dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
522 bpo->bpo_phys->bpo_num_blkptrs++;
523 bpo->bpo_phys->bpo_bytes +=
524 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
525 if (bpo->bpo_havecomp) {
526 bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
527 bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
529 mutex_exit(&bpo->bpo_lock);
532 struct space_range_arg {
533 spa_t *spa;
534 uint64_t mintxg;
535 uint64_t maxtxg;
536 uint64_t used;
537 uint64_t comp;
538 uint64_t uncomp;
541 /* ARGSUSED */
542 static int
543 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
545 struct space_range_arg *sra = arg;
547 if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
548 if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
549 sra->used += bp_get_dsize_sync(sra->spa, bp);
550 else
551 sra->used += bp_get_dsize(sra->spa, bp);
552 sra->comp += BP_GET_PSIZE(bp);
553 sra->uncomp += BP_GET_UCSIZE(bp);
555 return (0);
559 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
561 ASSERT(bpobj_is_open(bpo));
562 mutex_enter(&bpo->bpo_lock);
564 *usedp = bpo->bpo_phys->bpo_bytes;
565 if (bpo->bpo_havecomp) {
566 *compp = bpo->bpo_phys->bpo_comp;
567 *uncompp = bpo->bpo_phys->bpo_uncomp;
568 mutex_exit(&bpo->bpo_lock);
569 return (0);
570 } else {
571 mutex_exit(&bpo->bpo_lock);
572 return (bpobj_space_range(bpo, 0, UINT64_MAX,
573 usedp, compp, uncompp));
578 * Return the amount of space in the bpobj which is:
579 * mintxg < blk_birth <= maxtxg
582 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
583 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
585 struct space_range_arg sra = { 0 };
586 int err;
588 ASSERT(bpobj_is_open(bpo));
591 * As an optimization, if they want the whole txg range, just
592 * get bpo_bytes rather than iterating over the bps.
594 if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
595 return (bpobj_space(bpo, usedp, compp, uncompp));
597 sra.spa = dmu_objset_spa(bpo->bpo_os);
598 sra.mintxg = mintxg;
599 sra.maxtxg = maxtxg;
601 err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
602 *usedp = sra.used;
603 *compp = sra.comp;
604 *uncompp = sra.uncomp;
605 return (err);