/*
 * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>

#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include "hammer2.h"

#define H2FMBASE(key, radix)	((key) & ~(((hammer2_off_t)1 << (radix)) - 1))
#define H2FMSHIFT(radix)	((hammer2_off_t)1 << (radix))
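
/*
 * H2FMBASE() rounds a key down to the base of the radix-sized zone that
 * contains it (e.g. H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) yields the
 * level-1 aligned base offset), while H2FMSHIFT() is simply the size of
 * such a zone expressed as a 64-bit offset.
 */
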
/*
 * breadth-first search
 */
typedef struct hammer2_chain_save {
	TAILQ_ENTRY(hammer2_chain_save) entry;
	hammer2_chain_t	*chain;
} hammer2_chain_save_t;

TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;
typedef struct hammer2_bulkfree_info {
	hammer2_dev_t	*hmp;
	kmem_anon_desc_t kp;
	hammer2_off_t	sbase;		/* sub-loop iteration */
	hammer2_off_t	sstop;
	hammer2_bmap_data_t *bmap;
	int		depth;
	long		count_10_00;	/* staged->free	     */
	long		count_11_10;	/* allocated->staged */
	long		count_00_11;	/* (should not happen) */
	long		count_01_11;	/* (should not happen) */
	long		count_10_11;	/* staged->allocated */
	long		count_l0cleans;
	long		count_linadjusts;
	long		count_inodes_scanned;
	long		count_dedup_factor;
	long		bytes_scanned;
	hammer2_off_t	adj_free;
	hammer2_tid_t	mtid;
	hammer2_tid_t	saved_mirror_tid;
	time_t		save_time;
	hammer2_chain_save_list_t list;
	hammer2_dedup_t	*dedup;
	int		pri;
} hammer2_bulkfree_info_t;
static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
			hammer2_blockref_t *bref, int pri);
/*
 * General bulk scan function with callback.  Called with a referenced
 * but UNLOCKED parent.  The parent is returned in the same state.
 */
int
hammer2_bulk_scan(hammer2_chain_t *parent,
		  int (*func)(hammer2_bulkfree_info_t *info,
			      hammer2_blockref_t *bref),
		  hammer2_bulkfree_info_t *info)
{
	hammer2_blockref_t bref;
	hammer2_chain_t *chain;
	int cache_index = -1;
	int doabort = 0;
	int first = 1;

	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
				   HAMMER2_RESOLVE_SHARED);
	/*
	 * Generally loop on the contents if we have not been flagged
	 * for abort.
	 *
	 * Remember that these chains are completely isolated from
	 * the frontend, so we can release locks temporarily without
	 * imploding.
	 */
	while ((doabort & HAMMER2_BULK_ABORT) == 0 &&
	       hammer2_chain_scan(parent, &chain, &bref, &first,
				  &cache_index,
				  HAMMER2_LOOKUP_NODATA |
				  HAMMER2_LOOKUP_SHARED) != NULL) {
		/*
		 * Process bref, chain is only non-NULL if the bref
		 * might be recursable (it's possible that we sometimes get
		 * a non-NULL chain where the bref cannot be recursed).
		 */
#if 0
		kprintf("SCAN %016jx\n", bref.data_off);
		{
			int xerr = tsleep(&info->pri, PCATCH, "slp", hz / 10);
			if (xerr == EINTR || xerr == ERESTART) {
				doabort |= HAMMER2_BULK_ABORT;
			}
		}
#endif
		if (h2_bulkfree_test(info, &bref, 1))
			continue;

		doabort |= func(info, &bref);

		if (doabort & HAMMER2_BULK_ABORT)
			break;
		/*
		 * A non-null chain is always returned if it is
		 * recursive, otherwise a non-null chain might be
		 * returned but usually is not when not recursive.
		 */
		if (chain == NULL)
			continue;

		/*
		 * Else check type and setup depth-first scan.
		 *
		 * Account for bytes actually read.
		 */
		info->bytes_scanned += chain->bytes;
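
		/*
		 * Recurse into node-type brefs.  The recursion is capped at
		 * a depth of 16; deeper chains are queued on info->list
		 * instead and picked up later by the breadth-first drain
		 * loop in hammer2_bulkfree_pass().
		 */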
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_VOLUME:
		case HAMMER2_BREF_TYPE_FREEMAP:
			++info->depth;
			if (info->depth > 16) {
				hammer2_chain_save_t *save;

				save = kmalloc(sizeof(*save), M_HAMMER2,
					       M_WAITOK | M_ZERO);
				save->chain = chain;
				hammer2_chain_ref(chain);
				TAILQ_INSERT_TAIL(&info->list, save, entry);
			} else {
				int savepri = info->pri;

				hammer2_chain_unlock(chain);
				doabort |= hammer2_bulk_scan(chain, func, info);
				info->pri += savepri;
				hammer2_chain_lock(chain,
						   HAMMER2_RESOLVE_ALWAYS |
						   HAMMER2_RESOLVE_SHARED);
			}
			--info->depth;
			break;
		default:
			/* does not recurse */
			break;
		}
	}
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
	/*
	 * Save with higher pri now that we know what it is.
	 */
	h2_bulkfree_test(info, &parent->bref, info->pri + 1);

	hammer2_chain_unlock(parent);

	return doabort;
}
/*
 * Bulkfree algorithm:
 *
 *	Chain flush (partial synchronization)		XXX removed
 *	Scan the whole topology - build in-memory freemap (mark 11)
 *	Reconcile the in-memory freemap against the on-disk freemap.
 *		ondisk xx -> ondisk 11 (if allocated)
 *		ondisk 11 -> ondisk 10 (if free in-memory)
 *		ondisk 10 -> ondisk 00 (if free in-memory) - on next pass
 *
 * The topology scan may have to be performed multiple times to window
 * freemaps which are too large to fit in kernel memory.
 *
 * Races are handled using a double-transition (11->10, 10->00).  The bulkfree
 * scan snapshots the volume root's blockset and thus can run concurrent with
 * normal operations, as long as a full flush is made between each pass to
 * synchronize any modified chains (otherwise their blocks might be improperly
 * freed).
 *
 * Temporary memory in multiples of 64KB is required to reconstruct the leaf
 * hammer2_bmap_data blocks so they can later be compared against the live
 * freemap.  Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage.
 * A 32MB save area thus represents around ~1 TB.  The temporary memory
 * allocated can be specified.  If it is not sufficient multiple topology
 * passes will be made.
 */
/*
 * Bulkfree callback info
 */
static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size);
static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo,
			hammer2_blockref_t *bref);
static void h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo);
static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_off_t data_off, hammer2_bmap_data_t *live,
			hammer2_bmap_data_t *bmap, int nofree);
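
/*
 * Primary bulkfree pass.  Builds the in-memory freemap for one sbase..sstop
 * storage window at a time (sized by the temporary bmap allocation), scans
 * the snapshotted topology with h2_bulkfree_callback(), and then reconciles
 * the result against the live freemap via h2_bulkfree_sync().
 */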
int
hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_ioc_bulkfree_t *bfi)
{
	hammer2_bulkfree_info_t cbinfo;
	hammer2_chain_t *vchain;
	hammer2_chain_save_t *save;
	hammer2_off_t incr;
	size_t size;
	int doabort = 0;
	/*
	 * A bulkfree operations lock is required for the duration.  We
	 * must hold it across our flushes to guarantee that we never run
	 * two bulkfree passes in a row without a flush in the middle.
	 */
	lockmgr(&hmp->bulklk, LK_EXCLUSIVE);

	/*
	 * We have to clear the live dedup cache as it might have entries
	 * that are freeable as of now.  Any new entries in the dedup cache
	 * made after this point, even if they become freeable, will have
	 * previously been fully allocated and will be protected by the
	 * two-stage bulkfree.
	 */
	hammer2_dedup_clear(hmp);
	/*
	 * XXX This has been put back in.  The below check is currently
	 * disabled because it creates quite a bit of confusion, so we're
	 * going to try to fix this in a different way.
	 *
	 * XXX This has been removed.  Instead of trying to flush, which
	 * appears to have a ton of races against live chains even with
	 * the two-stage scan, we simply refuse to free any blocks
	 * related to freemap chains modified after the last filesystem
	 * sync.
	 *
	 * Do a quick flush so we can snapshot vchain for any blocks that
	 * have been allocated prior to this point.  We don't need to
	 * flush vnodes, logical buffers, or dirty inodes that have not
	 * allocated blocks yet.  We do not want to flush the device buffers
	 * nor do we want to flush the actual volume root to disk here,
	 * that is not needed to perform the snapshot.
	 */
	hammer2_flush_quick(hmp);
	/*
	 * Setup for free pass
	 */
	bzero(&cbinfo, sizeof(cbinfo));
	size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
	       ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);
	cbinfo.hmp = hmp;
	cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size, VM_SUBSYS_HAMMER);
	cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid;

	cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE,
			       M_HAMMER2, M_WAITOK | M_ZERO);
	/*
	 * Normalize start point to a 2GB boundary.  We operate on a
	 * 64KB leaf bitmap boundary which represents 2GB of storage.
	 */
	cbinfo.sbase = bfi->sbase;
	if (cbinfo.sbase > hmp->voldata.volu_size)
		cbinfo.sbase = hmp->voldata.volu_size;
	cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK;

	/*
	 * The primary storage scan must use a snapshot of the volume
	 * root to avoid racing renames and other frontend work.
	 *
	 * Note that snapshots only snap synchronized storage, so
	 * we have to flush between each pass or we risk freeing
	 * storage allocated by the frontend.
	 */
	vchain = hammer2_chain_bulksnap(&hmp->vchain);
	TAILQ_INIT(&cbinfo.list);
	/*
	 * Loop on a full meta-data scan as many times as required to
	 * get through all available storage.
	 */
	while (cbinfo.sbase < hmp->voldata.volu_size) {
		/*
		 * We have enough ram to represent (incr) bytes of storage.
		 * Each 64KB of ram represents 2GB of storage.
		 */
		cbinfo_bmap_init(&cbinfo, size);
		incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE *
		       HAMMER2_FREEMAP_LEVEL1_SIZE;
		if (hmp->voldata.volu_size - cbinfo.sbase < incr)
			cbinfo.sstop = hmp->voldata.volu_size;
		else
			cbinfo.sstop = cbinfo.sbase + incr;
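
		/*
		 * Per the sizing notes above, each 64KB of temporary bmap
		 * memory windows ~2GB of media, so e.g. a 32MB bmap
		 * allocation covers roughly 1TB per pass.
		 */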
		if (hammer2_debug & 1) {
			kprintf("bulkfree pass %016jx/%jdGB\n",
				(intmax_t)cbinfo.sbase,
				(intmax_t)incr / HAMMER2_FREEMAP_LEVEL1_SIZE);
		}
		/*
		 * Scan topology for stuff inside this range.
		 */
		hammer2_trans_init(hmp->spmp, 0);
		cbinfo.mtid = hammer2_trans_sub(hmp->spmp);
		doabort |= hammer2_bulk_scan(vchain, h2_bulkfree_callback,
					     &cbinfo);

		while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL &&
		       (doabort & HAMMER2_BULK_ABORT) == 0) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			doabort |= hammer2_bulk_scan(save->chain,
						     h2_bulkfree_callback,
						     &cbinfo);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
		}
		while (save) {
			TAILQ_REMOVE(&cbinfo.list, save, entry);
			hammer2_chain_drop(save->chain);
			kfree(save, M_HAMMER2);
			save = TAILQ_FIRST(&cbinfo.list);
		}

		kprintf("bulkfree lastdrop %d %d doabort=%d\n",
			vchain->refs, vchain->core.chain_count, doabort);
		/*
		 * If the complete scan succeeded we can synchronize our
		 * in-memory freemap against live storage.  If an abort
		 * occurred we cannot safely synchronize our partially
		 * filled-out in-memory freemap.
		 */
		if (doabort == 0) {
			h2_bulkfree_sync(&cbinfo);

			hammer2_voldata_lock(hmp);
			hammer2_voldata_modify(hmp);
			hmp->voldata.allocator_free += cbinfo.adj_free;
			hammer2_voldata_unlock(hmp);
		}
		/*
		 * Cleanup for next loop.
		 */
		hammer2_trans_done(hmp->spmp);
		if (doabort)
			break;
		cbinfo.sbase = cbinfo.sstop;
	}
	hammer2_chain_bulkdrop(vchain);
	kmem_free_swapbacked(&cbinfo.kp);
	kfree(cbinfo.dedup, M_HAMMER2);
	bfi->sstop = cbinfo.sbase;

	incr = bfi->sstop / (hmp->voldata.volu_size / 10000);
	kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n",
		(int)(incr / 100), (int)(incr % 100));
	kprintf("    transition->free   %ld\n", cbinfo.count_10_00);
	kprintf("    transition->staged %ld\n", cbinfo.count_11_10);
	kprintf("    ERR(00)->allocated %ld\n", cbinfo.count_00_11);
	kprintf("    ERR(01)->allocated %ld\n", cbinfo.count_01_11);
	kprintf("    staged->allocated  %ld\n", cbinfo.count_10_11);
	kprintf("    ~2MB segs cleaned  %ld\n", cbinfo.count_l0cleans);
	kprintf("    linear adjusts     %ld\n", cbinfo.count_linadjusts);
	kprintf("    dedup factor       %ld\n", cbinfo.count_dedup_factor);

	lockmgr(&hmp->bulklk, LK_RELEASE);
	/* hammer2_vfs_sync(mp, MNT_WAIT); sync needed */

	return doabort;
}
static void
cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size)
{
	hammer2_bmap_data_t *bmap = cbinfo->bmap;
	hammer2_key_t key = cbinfo->sbase;
	hammer2_key_t lokey;
	hammer2_key_t hikey;

	lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
		~HAMMER2_SEGMASK64;
	hikey = cbinfo->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64;

	bzero(bmap, size);
	while (size > 0) {
		if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
			    HAMMER2_ZONE_SEG64) {
			lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
				HAMMER2_ZONE_SEG64;
		}
		if (key < lokey || key >= hikey) {
			memset(bmap->bitmapq, -1,
			       sizeof(bmap->bitmapq));
			bmap->linear = HAMMER2_SEGSIZE;
		} else {
			bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
		}
		size -= sizeof(*bmap);
		key += HAMMER2_FREEMAP_LEVEL0_SIZE;
		++bmap;
	}
}
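
/*
 * Bulk scan callback.  Marks the in-memory bmap bits covering each scanned
 * blockref as fully allocated (11) so the later sync pass can reconcile the
 * result against the live freemap.  Returns 0 to continue the scan or
 * HAMMER2_BULK_ABORT to abort it.
 */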
static int
h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
{
	hammer2_bmap_data_t *bmap;
	hammer2_off_t data_off;
	size_t bytes;
	int radix;
	int class;

	/*
	 * Check for signal and allow yield to userland during scan
	 */
	if (hammer2_signal_check(&cbinfo->save_time))
		return HAMMER2_BULK_ABORT;
	if (bref->type == HAMMER2_BREF_TYPE_INODE) {
		++cbinfo->count_inodes_scanned;
		if ((cbinfo->count_inodes_scanned & 65535) == 0)
			kprintf(" inodes %6ld bytes %9ld\n",
				cbinfo->count_inodes_scanned,
				cbinfo->bytes_scanned);
	}
	/*
	 * Calculate the data offset and determine if it is within
	 * the current freemap range being gathered.
	 */
	data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
	if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop)
		return 0;
	if (data_off < cbinfo->hmp->voldata.allocator_beg)
		return 0;
	if (data_off >= cbinfo->hmp->voldata.volu_size)
		return 0;

	/*
	 * Calculate the information needed to generate the in-memory
	 * freemap record.
	 *
	 * Hammer2 does not allow allocations to cross the L1 (2GB) boundary,
	 * it's a problem if it does.  (Or L0 (2MB) for that matter).
	 */
	radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	bytes = (size_t)1 << radix;
	class = (bref->type << 8) | hammer2_devblkradix(radix);

	if (data_off + bytes >= cbinfo->sstop) {
		kprintf("hammer2_bulkfree_scan: illegal 2GB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = cbinfo->sstop - data_off;	/* XXX */
	}
	/*
	 * Convert to a storage offset relative to the beginning of the
	 * storage range we are collecting.  Then lookup the level0 bmap entry.
	 */
	data_off -= cbinfo->sbase;
	bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX);

	/*
	 * Convert data_off to a bmap-relative value (~2MB storage range).
	 * Adjust linear, class, and avail.
	 *
	 * Hammer2 does not allow allocations to cross the L0 (2MB) boundary,
	 * it's a problem if it does.
	 */
	data_off &= HAMMER2_FREEMAP_LEVEL0_MASK;
	if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) {
		kprintf("hammer2_bulkfree_scan: illegal 2MB boundary "
			"%016jx %016jx/%d\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits);
		bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off;
	}

	if (bmap->class == 0) {
		bmap->class = class;
		bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
	}
	if (bmap->class != class) {
		kprintf("hammer2_bulkfree_scan: illegal mixed class "
			"%016jx %016jx/%d (%04x vs %04x)\n",
			(intmax_t)bref->data_off,
			(intmax_t)bref->key,
			bref->keybits,
			class, bmap->class);
	}
	if (bmap->linear < (int32_t)data_off + (int32_t)bytes)
		bmap->linear = (int32_t)data_off + (int32_t)bytes;
	/*
	 * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS].
	 * 64-bit entries, 2 bits per entry, to code 11.
	 *
	 * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE.
	 */
	while (bytes > 0) {
		int bindex;
		hammer2_bitmap_t bmask;
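
		/*
		 * Each 64-bit bitmapq[] element holds 32 two-bit state
		 * fields, one per HAMMER2_FREEMAP_BLOCK_SIZE block.  bindex
		 * selects the bitmap element for this block and bmask
		 * isolates its two-bit field so the state can be forced
		 * to 11 (fully allocated).
		 */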
		bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX +
					   HAMMER2_BMAP_INDEX_RADIX);
		bmask = (hammer2_bitmap_t)3 <<
			((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >>
			  HAMMER2_FREEMAP_BLOCK_RADIX) << 1);

		/*
		 * NOTE! The (avail) calculation is bitmap-granular.  Multiple
		 *	 sub-granular records can wind up at the same bitmap
		 *	 position.
		 */
		if ((bmap->bitmapq[bindex] & bmask) == 0) {
			if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) {
				bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE;
			} else {
				bmap->avail -= bytes;
			}
			bmap->bitmapq[bindex] |= bmask;
		}
		data_off += HAMMER2_FREEMAP_BLOCK_SIZE;
		if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE)
			bytes = 0;
		else
			bytes -= HAMMER2_FREEMAP_BLOCK_SIZE;
	}
	return 0;
}
/*
 * Synchronize the in-memory bitmap with the live freemap.  This is not a
 * direct copy.  Instead the bitmaps must be compared:
 *
 *	In-memory	Live-freemap
 *	   00		  11 -> 10	(do nothing if live modified)
 *			  10 -> 00	(do nothing if live modified)
 *	   11		  10 -> 11	handles race against live
 *			  ** -> 11	nominally warn of corruption
 */
static void
h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
{
	hammer2_off_t data_off;
	hammer2_key_t key;
	hammer2_key_t key_dummy;
	hammer2_bmap_data_t *bmap;
	hammer2_bmap_data_t *live;
	hammer2_chain_t *live_parent;
	hammer2_chain_t *live_chain;
	int cache_index = -1;
	int bmapindex;
	int nofree;

	kprintf("hammer2_bulkfree - range ");

	if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg)
		kprintf("%016jx-",
			(intmax_t)cbinfo->hmp->voldata.allocator_beg);
	else
		kprintf("%016jx-",
			(intmax_t)cbinfo->sbase);

	if (cbinfo->sstop > cbinfo->hmp->voldata.volu_size)
		kprintf("%016jx\n",
			(intmax_t)cbinfo->hmp->voldata.volu_size);
	else
		kprintf("%016jx\n",
			(intmax_t)cbinfo->sstop);

	data_off = cbinfo->sbase;
	bmap = cbinfo->bmap;

	live_parent = &cbinfo->hmp->fchain;
	hammer2_chain_ref(live_parent);
	hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS);
	live_chain = NULL;
	/*
	 * Iterate each hammer2_bmap_data_t line (128 bytes) managing
	 * HAMMER2_FREEMAP_LEVEL0_SIZE of storage.
	 */
	while (data_off < cbinfo->sstop) {
		/*
		 * The freemap is not used below allocator_beg or beyond
		 * volu_size.
		 */
		if (data_off < cbinfo->hmp->voldata.allocator_beg)
			goto next;
		if (data_off >= cbinfo->hmp->voldata.volu_size)
			goto next;

		/*
		 * Locate the freemap leaf on the live filesystem
		 */
		key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK);

		if (live_chain == NULL || live_chain->bref.key != key) {
			if (live_chain) {
				hammer2_chain_unlock(live_chain);
				hammer2_chain_drop(live_chain);
			}
			live_chain = hammer2_chain_lookup(
					    &live_parent,
					    &key_dummy,
					    key,
					    key + HAMMER2_FREEMAP_LEVEL1_MASK,
					    &cache_index,
					    HAMMER2_LOOKUP_ALWAYS);
		}
		/*
		 * If recent allocations were made we avoid races by
		 * not staging or freeing any blocks.  We can still
		 * remark blocks as fully allocated.
		 */
		nofree = 0;
		if (live_chain) {
			if (hammer2_debug & 1) {
				kprintf("live_chain %016jx\n",
					(intmax_t)live_chain->bref.key);
			}
			if (live_chain->bref.mirror_tid >
			    cbinfo->saved_mirror_tid) {
				kprintf("hammer2_bulkfree: "
					"skip %016jx\n",
					(intmax_t)data_off);
				nofree = 1;
			}
		}
		if (live_chain == NULL) {
			/*
			 * XXX if we implement a full recovery mode we need
			 * to create/recreate missing freemap chains if our
			 * bmap has any allocated blocks.
			 */
			if (bmap->class &&
			    bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) {
				kprintf("hammer2_bulkfree: cannot locate "
					"live leaf for allocated data "
					"near %016jx\n",
					(intmax_t)data_off);
			}
			goto next;
		}
		if (live_chain->error) {
			kprintf("hammer2_bulkfree: error %s looking up "
				"live leaf for allocated data near %016jx\n",
				hammer2_error_str(live_chain->error),
				(intmax_t)data_off);
			hammer2_chain_unlock(live_chain);
			hammer2_chain_drop(live_chain);
			live_chain = NULL;
			goto next;
		}
		bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >>
			    HAMMER2_FREEMAP_LEVEL0_RADIX;
		live = &live_chain->data->bmdata[bmapindex];

		/*
		 * TODO - we could shortcut this by testing that both
		 * live->class and bmap->class are 0, and both avails are
		 * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB).
		 */
		if (bcmp(live->bitmapq, bmap->bitmapq,
			 sizeof(bmap->bitmapq)) == 0) {
			goto next;
		}
		if (hammer2_debug & 1) {
			kprintf("live %016jx %04d.%04x (avail=%d)\n",
				data_off, bmapindex, live->class, live->avail);
		}

		hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0);
		live = &live_chain->data->bmdata[bmapindex];

		h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap, nofree);
next:
		data_off += HAMMER2_FREEMAP_LEVEL0_SIZE;
		++bmap;
	}
	if (live_chain) {
		hammer2_chain_unlock(live_chain);
		hammer2_chain_drop(live_chain);
	}
	hammer2_chain_unlock(live_parent);
	hammer2_chain_drop(live_parent);
}
/*
 * When bulkfree is finally able to free a block it must make sure that
 * the INVALOK bit in any cached DIO is cleared prior to the block being
 * freed.
 */
static
void
fixup_dio(hammer2_dev_t *hmp, hammer2_off_t data_off, int bindex, int scount)
{
	data_off += (scount >> 1) * HAMMER2_FREEMAP_BLOCK_SIZE;
	data_off += bindex *
		(HAMMER2_FREEMAP_BLOCK_SIZE * HAMMER2_BMAP_BLOCKS_PER_ELEMENT);
	hammer2_io_resetinval(hmp, data_off);
}
/*
 * Merge the bulkfree bitmap against the existing bitmap.
 *
 * If nofree is non-zero the merge will only mark free blocks as allocated
 * and will refuse to free any blocks.
 */
static void
h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
			hammer2_off_t data_off, hammer2_bmap_data_t *live,
			hammer2_bmap_data_t *bmap, int nofree)
{
	int bindex;
	int scount;
	hammer2_bitmap_t lmask;
	hammer2_bitmap_t mmask;

	for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) {
		lmask = live->bitmapq[bindex];	/* live */
		mmask = bmap->bitmapq[bindex];	/* snapshotted bulkfree */

		for (scount = 0;
		     scount < HAMMER2_BMAP_BITS_PER_ELEMENT;
		     scount += 2) {
			if ((mmask & 3) == 0) {
				/*
				 * in-memory 00		live 11 -> 10
				 *			live 10 -> 00
				 *
				 * Storage might be marked allocated or
				 * staged and must be remarked staged or
				 * free.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					break;
				case 1:	/* 01 */
					kprintf("hammer2_bulkfree: cannot "
						"transition m=00/l=01\n");
					break;
				case 2:	/* 10 -> 00 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)2 << scount);
					live->avail +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					if (live->avail >
					    HAMMER2_FREEMAP_LEVEL0_SIZE) {
						live->avail =
						    HAMMER2_FREEMAP_LEVEL0_SIZE;
					}
					cbinfo->adj_free +=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					++cbinfo->count_10_00;
					break;
				case 3:	/* 11 -> 10 */
					if (nofree)
						break;
					live->bitmapq[bindex] &=
					    ~((hammer2_bitmap_t)1 << scount);
					++cbinfo->count_11_10;
					fixup_dio(cbinfo->hmp, data_off,
						  bindex, scount);
					break;
				}
			} else if ((mmask & 3) == 3) {
				/*
				 * in-memory 11		live 10 -> 11
				 *			live ** -> 11
				 *
				 * Storage might be incorrectly marked free
				 * or staged and must be remarked fully
				 * allocated.
				 */
				switch (lmask & 3) {
				case 0:	/* 00 */
					++cbinfo->count_00_11;
					cbinfo->adj_free -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					live->avail -=
						HAMMER2_FREEMAP_BLOCK_SIZE;
					if ((int32_t)live->avail < 0)
						live->avail = 0;
					break;
				case 1:	/* 01 */
					++cbinfo->count_01_11;
					break;
				case 2:	/* 10 -> 11 */
					++cbinfo->count_10_11;
					break;
				case 3:	/* 11 */
					break;
				}
				live->bitmapq[bindex] |=
					((hammer2_bitmap_t)3 << scount);
			}
			mmask >>= 2;
			lmask >>= 2;
		}
	}
	/*
	 * Determine if the live bitmap is completely free and reset its
	 * fields if so.  Otherwise check to see if we can reduce the linear
	 * allocation offset.
	 */
	for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) {
		if (live->bitmapq[bindex] != 0)
			break;
	}
	if (nofree) {
		/* do nothing */
	} else if (bindex < 0) {
		live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
		live->class = 0;
		live->linear = 0;
		++cbinfo->count_l0cleans;
	} else if (bindex < 7) {
		if (live->linear > bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}

		/*
		 * XXX this fine-grained measure still has some issues.
		 */
		if (live->linear < bindex * HAMMER2_FREEMAP_BLOCK_SIZE) {
			live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE;
			++cbinfo->count_linadjusts;
		}
	} else {
		live->linear = HAMMER2_SEGSIZE;
	}
#if 0
	kprintf("%016jx %04d.%04x (avail=%7d) "
		"%08x %08x %08x %08x %08x %08x %08x %08x\n",
		(intmax_t)data_off,
		(int)((data_off &
		       HAMMER2_FREEMAP_LEVEL1_MASK) >>
		      HAMMER2_FREEMAP_LEVEL0_RADIX),
		bmap->class,
		bmap->avail,
		bmap->bitmap[0], bmap->bitmap[1],
		bmap->bitmap[2], bmap->bitmap[3],
		bmap->bitmap[4], bmap->bitmap[5],
		bmap->bitmap[6], bmap->bitmap[7]);
#endif
}
/*
 * BULKFREE DEDUP HEURISTIC
 *
 * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
 *	    All fields must be loaded into locals and validated.
 */
static int
h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref,
		 int pri)
{
	hammer2_dedup_t *dedup;
	int best;
	int n;
	int i;

	n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off));
	dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7));
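
	/*
	 * The heuristic hashes the block's data_off into one of the 8-entry
	 * buckets making up the dedup array.  A matching entry means this
	 * bref was already visited at an equal or higher priority and the
	 * scan can skip it; otherwise the least-recently-bumped entry in
	 * the bucket (lowest ticks) is recycled for this data_off.
	 */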
	for (i = best = 0; i < 8; ++i) {
		if (dedup[i].data_off == bref->data_off) {
			if (dedup[i].ticks < pri)
				dedup[i].ticks = pri;
			if (pri == 1)
				cbinfo->count_dedup_factor += dedup[i].ticks;
			return 1;
		}
		if (dedup[i].ticks < dedup[best].ticks)
			best = i;
	}
	dedup[best].data_off = bref->data_off;
	dedup[best].ticks = pri;

	return 0;
}