From 0b8efeb7d8bb32a0e5c57bd87eef12569e81b294 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 24 Aug 2017 15:52:45 -0700 Subject: [PATCH] hammer2 - Work on concurrent bulkfree stability * The dedup bits in the DIO structure must be set atomically with the setting of the bitmap bits in the freemap to avoid racing bulkfree. * dedup bits are normally deleted on the 11->10 transition in bulkfree, and asserted to be deleted on the 10->00 transition. Also delete dedup bits when a modified chain with no parent is destroyed, and for a chain's prior data reference when it is being replaced by a new data reference. * Clean up the DIO dedup management code. * Clean up the DIO allocation code. * Cap the size of the DIO cache to 100000 elements for now, if nbuf * 2 is greater. --- sys/vfs/hammer2/hammer2.h | 35 +++++++-- sys/vfs/hammer2/hammer2_bulkfree.c | 9 +-- sys/vfs/hammer2/hammer2_chain.c | 22 ++++++ sys/vfs/hammer2/hammer2_flush.c | 7 +- sys/vfs/hammer2/hammer2_freemap.c | 12 ++++ sys/vfs/hammer2/hammer2_io.c | 142 +++++++++++++++++++++++++++++-------- sys/vfs/hammer2/hammer2_strategy.c | 79 +++------------------ sys/vfs/hammer2/hammer2_vfsops.c | 13 ++++ 8 files changed, 209 insertions(+), 110 deletions(-) diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h index dad7133fed..6791b192f5 100644 --- a/sys/vfs/hammer2/hammer2.h +++ b/sys/vfs/hammer2/hammer2.h @@ -1275,6 +1275,32 @@ MPTOPMP(struct mount *mp) return ((hammer2_pfs_t *)mp->mnt_data); } +#define HAMMER2_DEDUP_FRAG (HAMMER2_PBUFSIZE / 64) +#define HAMMER2_DEDUP_FRAGRADIX (HAMMER2_PBUFRADIX - 6) + +static __inline +uint64_t +hammer2_dedup_mask(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes) +{ + int bbeg; + int bits; + uint64_t mask; + + bbeg = (int)((data_off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) >> + HAMMER2_DEDUP_FRAGRADIX; + bits = (int)((bytes + (HAMMER2_DEDUP_FRAG - 1)) >> + HAMMER2_DEDUP_FRAGRADIX); + mask = ((uint64_t)1 << bbeg) - 1; + if (bbeg + bits == 64) + mask = (uint64_t)-1; + else + mask = ((uint64_t)1 << (bbeg + bits)) - 1; + + mask &= ~(((uint64_t)1 << bbeg) - 1); + + return mask; +} + extern struct vop_ops hammer2_vnode_vops; extern struct vop_ops hammer2_spec_vops; extern struct vop_ops hammer2_fifo_vops; @@ -1497,10 +1523,6 @@ hammer2_tid_t hammer2_trans_newinum(hammer2_pfs_t *pmp); void hammer2_trans_assert_strategy(hammer2_pfs_t *pmp); void hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio, char *data); -void hammer2_dedup_delete(hammer2_dev_t *hmp, hammer2_off_t data_off, - u_int bytes); -void hammer2_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, - u_int bytes); /* * hammer2_ioctl.c @@ -1519,6 +1541,11 @@ hammer2_io_t *hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize, int notgood); void hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, hammer2_iocb_t *iocb); +void hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref); +void hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype, + hammer2_off_t data_off, u_int bytes); +void hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, + u_int bytes); void hammer2_io_complete(hammer2_iocb_t *iocb); void hammer2_io_callback(struct bio *bio); void hammer2_iocb_wait(hammer2_iocb_t *iocb); diff --git a/sys/vfs/hammer2/hammer2_bulkfree.c b/sys/vfs/hammer2/hammer2_bulkfree.c index b5132e69ab..45d3213876 100644 --- a/sys/vfs/hammer2/hammer2_bulkfree.c +++ b/sys/vfs/hammer2/hammer2_bulkfree.c @@ -854,7 +854,7 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, cbinfo->adj_free += HAMMER2_FREEMAP_BLOCK_SIZE; ++cbinfo->count_10_00; - hammer2_dedup_assert( + hammer2_io_dedup_assert( cbinfo->hmp, tmp_off | HAMMER2_FREEMAP_BLOCK_RADIX, @@ -864,8 +864,9 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, live->bitmapq[bindex] &= ~((hammer2_bitmap_t)1 << scount); ++cbinfo->count_11_10; - hammer2_dedup_delete( + hammer2_io_dedup_delete( cbinfo->hmp, + HAMMER2_BREF_TYPE_DATA, tmp_off | HAMMER2_FREEMAP_BLOCK_RADIX, HAMMER2_FREEMAP_BLOCK_SIZE); @@ -925,7 +926,7 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, live->linear = 0; ++cbinfo->count_l0cleans; #if 0 - hammer2_dedup_assert(cbinfo->hmp, + hammer2_io_dedup_assert(cbinfo->hmp, data_off | HAMMER2_FREEMAP_LEVEL0_RADIX, HAMMER2_FREEMAP_LEVEL0_SIZE); @@ -938,7 +939,7 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, if (live->linear > bindex * HAMMER2_FREEMAP_BLOCK_SIZE) { nlinear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE; #if 0 - hammer2_dedup_assert(cbinfo->hmp, + hammer2_io_dedup_assert(cbinfo->hmp, data_off + nlinear, live->linear - nlinear); #endif diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c index b371982ba6..c0dc220a3a 100644 --- a/sys/vfs/hammer2/hammer2_chain.c +++ b/sys/vfs/hammer2/hammer2_chain.c @@ -465,6 +465,20 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain) hammer2_io_t *dio; /* + * On last drop if there is no parent and data_off is good (at + * least does not represent the volume root), the modified chain + * is probably going to be destroyed. We have to make sure that + * the data area is not registered for dedup. + */ + if (chain->parent == NULL && + (chain->flags & HAMMER2_CHAIN_MODIFIED) && + (chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX)) { + hmp = chain->hmp; + hammer2_io_dedup_delete(hmp, chain->bref.type, + chain->bref.data_off, chain->bytes); + } + + /* * Critical field access. */ hammer2_spin_ex(&chain->core.spin); @@ -528,6 +542,10 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain) /* * Otherwise we can scrap the MODIFIED bit if it is set, * and continue along the freeing path. + * + * Be sure to clean-out any dedup bits. Without a parent + * this chain will no longer be visible to the flush code. + * Easy check data_off to avoid the volume root. */ if (chain->flags & HAMMER2_CHAIN_MODIFIED) { atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); @@ -1630,6 +1648,10 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, if ((chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX) == 0 || newmod ) { + hammer2_io_dedup_delete(chain->hmp, + chain->bref.type, + chain->bref.data_off, + chain->bytes); if (dedup_off) { chain->bref.data_off = dedup_off; chain->bytes = 1 << (dedup_off & diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c index f096b6fd10..76e1ef862b 100644 --- a/sys/vfs/hammer2/hammer2_flush.c +++ b/sys/vfs/hammer2/hammer2_flush.c @@ -874,9 +874,10 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain, * potentially dirty portion of the I/O buffer. */ if (chain->flags & HAMMER2_CHAIN_DESTROY) { - hammer2_dedup_delete(hmp, - chain->bref.data_off, - chain->bytes); + hammer2_io_dedup_delete(hmp, + chain->bref.type, + chain->bref.data_off, + chain->bytes); #if 0 hammer2_io_t *dio; if (chain->dio) { diff --git a/sys/vfs/hammer2/hammer2_freemap.c b/sys/vfs/hammer2/hammer2_freemap.c index b00cef7c40..6dfa99940e 100644 --- a/sys/vfs/hammer2/hammer2_freemap.c +++ b/sys/vfs/hammer2/hammer2_freemap.c @@ -505,6 +505,18 @@ hammer2_freemap_try_alloc(hammer2_chain_t **parentp, key + bytes <= hmp->voldata.volu_size); KKASSERT((key & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG); bref->data_off = key | radix; + + /* + * Record dedupability. The dedup bits are cleared + * when bulkfree transitions the freemap from 11->10, + * and asserted to be clear on the 10->00 transition. + * + * We must record the bitmask with the chain locked + * at the time we set the allocation bits to avoid + * racing a bulkfree. + */ + if (bref->type == HAMMER2_BREF_TYPE_DATA) + hammer2_io_dedup_set(hmp, bref); #if 0 kprintf("alloc cp=%p %016jx %016jx using %016jx\n", chain, diff --git a/sys/vfs/hammer2/hammer2_io.c b/sys/vfs/hammer2/hammer2_io.c index 46c0540e3d..a2b11e1de2 100644 --- a/sys/vfs/hammer2/hammer2_io.c +++ b/sys/vfs/hammer2/hammer2_io.c @@ -93,32 +93,30 @@ hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes) #define HAMMER2_GETBLK_OWNED 2 /* - * Allocate/Locate the requested dio, reference it, issue or queue iocb. + * Returns the DIO corresponding to the data|radix, creating it if necessary. + * + * If createit is 0, NULL can be returned indicating that the DIO does not + * exist. (btype) is ignored when createit is 0. */ -void -hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, - hammer2_iocb_t *iocb) +static __inline +hammer2_io_t * +hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype, + int createit) { hammer2_io_t *dio; hammer2_io_t *xio; - off_t pbase; - off_t pmask; - - /* - * XXX after free, buffer reuse case w/ different size can clash - * with dio cache. Lets avoid it for now. Ultimate we need to - * invalidate the dio cache when freeing blocks to allow a mix - * of 16KB and 64KB block sizes). - */ - /*int psize = hammer2_devblksize(lsize);*/ - int psize = HAMMER2_PBUFSIZE; - uint64_t refs; + hammer2_key_t lbase; + hammer2_key_t pbase; + hammer2_key_t pmask; + int lsize; + int psize; + psize = HAMMER2_PBUFSIZE; pmask = ~(hammer2_off_t)(psize - 1); - - KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize); - lbase &= ~HAMMER2_OFF_MASK_RADIX; + lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX); + lbase = data_off & ~HAMMER2_OFF_MASK_RADIX; pbase = lbase & pmask; + if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) { kprintf("Illegal: %016jx %016jx+%08x / %016jx\n", pbase, lbase, lsize, pmask); @@ -136,13 +134,13 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, atomic_add_int(&dio->hmp->iofree_count, -1); } hammer2_spin_unsh(&hmp->io_spin); - } else { + } else if (createit) { hammer2_spin_unsh(&hmp->io_spin); dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO); dio->hmp = hmp; dio->pbase = pbase; dio->psize = psize; - dio->btype = iocb->btype; + dio->btype = btype; dio->refs = 1; dio->act = 5; hammer2_spin_init(&dio->spin, "h2dio"); @@ -161,16 +159,31 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, kfree(dio, M_HAMMER2); dio = xio; } + } else { + hammer2_spin_unsh(&hmp->io_spin); + return NULL; } - - /* - * Obtain/Validate the buffer. - */ - iocb->dio = dio; - dio->ticks = ticks; if (dio->act < 10) - ++dio->act; /* SMP race ok */ + ++dio->act; + + return dio; +} + +/* + * Allocate/Locate the requested dio, reference it, issue or queue iocb. + */ +void +hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, + hammer2_iocb_t *iocb) +{ + hammer2_io_t *dio; + uint64_t refs; + + KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize); + dio = hammer2_io_alloc(hmp, lbase, iocb->btype, 1); + + iocb->dio = dio; for (;;) { refs = dio->refs; @@ -665,7 +678,6 @@ hammer2_io_putblk(hammer2_io_t **diop) if (hmp->iofree_count > limit_dio) { struct hammer2_cleanupcb_info info; - kprintf("x"); RB_INIT(&info.tmptree); hammer2_spin_ex(&hmp->io_spin); if (hmp->iofree_count > limit_dio) { @@ -1091,6 +1103,7 @@ hammer2_io_setdirty(hammer2_io_t *dio) void hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes) { + /* NOP */ } void @@ -1111,6 +1124,77 @@ hammer2_io_isdirty(hammer2_io_t *dio) return((dio->refs & HAMMER2_DIO_DIRTY) != 0); } +/* + * Set dedup validation bits in a DIO. We do not need the buffer cache + * buffer for this. This must be done concurrent with setting bits in + * the freemap so as to interlock with bulkfree's clearing of those bits. + */ +void +hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref) +{ + hammer2_io_t *dio; + int lsize; + + dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1); + lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX); + atomic_set_64(&dio->dedup_ok_mask, + hammer2_dedup_mask(dio, bref->data_off, lsize)); + hammer2_io_putblk(&dio); +} + +/* + * Clear dedup validation bits in a DIO. This is typically done when + * a modified chain is destroyed or by the bulkfree code. No buffer + * is needed for this operation. If the DIO no longer exists it is + * equivalent to the bits not being set. + */ +void +hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype, + hammer2_off_t data_off, u_int bytes) +{ + hammer2_io_t *dio; + + if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) + return; + if (btype != HAMMER2_BREF_TYPE_DATA) + return; + dio = hammer2_io_alloc(hmp, data_off, btype, 0); + if (dio) { + if (data_off < dio->pbase || + (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes > + dio->pbase + dio->psize) { + panic("hammer2_dedup_delete: DATAOFF BAD " + "%016jx/%d %016jx\n", + data_off, bytes, dio->pbase); + } + atomic_clear_64(&dio->dedup_ok_mask, + hammer2_dedup_mask(dio, data_off, bytes)); + hammer2_io_putblk(&dio); + } +} + +/* + * Assert that dedup validation bits in a DIO are not set. This operation + * does not require a buffer. The DIO does not need to exist. + */ +void +hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes) +{ + hammer2_io_t *dio; + + dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA, 0); + if (dio) { + KASSERT((dio->dedup_ok_mask & + hammer2_dedup_mask(dio, data_off, bytes)) == 0, + ("hammer2_dedup_assert: %016jx/%d %016jx/%016jx", + data_off, + bytes, + hammer2_dedup_mask(dio, data_off, bytes), + dio->dedup_ok_mask)); + hammer2_io_putblk(&dio); + } +} + static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp) diff --git a/sys/vfs/hammer2/hammer2_strategy.c b/sys/vfs/hammer2/hammer2_strategy.c index 39dce35893..2f7e3a931c 100644 --- a/sys/vfs/hammer2/hammer2_strategy.c +++ b/sys/vfs/hammer2/hammer2_strategy.c @@ -1371,38 +1371,12 @@ hammer2_write_bp(hammer2_chain_t *chain, char *data, int ioflag, *errorp = error; } -#define HAMMER2_DEDUP_FRAG (HAMMER2_PBUFSIZE / 64) -#define HAMMER2_DEDUP_FRAGRADIX (HAMMER2_PBUFRADIX - 6) - -static __inline -uint64_t -hammer2_dedup_mask(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes) -{ - int bbeg; - int bits; - uint64_t mask; - - bbeg = (int)((data_off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) >> - HAMMER2_DEDUP_FRAGRADIX; - bits = (int)((bytes + (HAMMER2_DEDUP_FRAG - 1)) >> - HAMMER2_DEDUP_FRAGRADIX); - mask = ((uint64_t)1 << bbeg) - 1; - if (bbeg + bits == 64) - mask = (uint64_t)-1; - else - mask = ((uint64_t)1 << (bbeg + bits)) - 1; - - mask &= ~(((uint64_t)1 << bbeg) - 1); - - return mask; -} - /* * LIVE DEDUP HEURISTICS * - * Record that the media data area is available for dedup operation. This - * will set the appropriate dedup bits in the DIO. These bits will be cleared - * if the dedup area becomes unavailable. + * Record media and crc information for possible dedup operation. Note + * that the dedup mask bits must also be set in the related DIO for a dedup + * to be fully validated (which is handled in the freemap allocation code). * * WARNING! This code is SMP safe but the heuristic allows SMP collisions. * All fields must be loaded into locals and validated. @@ -1511,9 +1485,15 @@ hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio, char *data) dedup->data_off = chain->bref.data_off; dedup->data_crc = crc; +#if 0 + /* + * This is set by the allocator atomically with the freemap. + * Doing it here is too late. + */ atomic_set_64(&dio->dedup_ok_mask, hammer2_dedup_mask(dio, chain->bref.data_off, chain->bytes)); +#endif /* * Once we record the dedup the chain must be marked clean to @@ -1530,47 +1510,6 @@ hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio, char *data) } } -/* - * Remove the data range from dedup consideration. This has no effect on - * any dedups which have already occurred. We do not need a valid buffer - * for this operation and must clean out dedup_ok_mask even if the dio is - * cached without any buffer available. - */ -void -hammer2_dedup_delete(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes) -{ - hammer2_io_t *dio; - - dio = hammer2_io_getquick(hmp, data_off, bytes, 1); - if (dio) { - if (data_off < dio->pbase || - (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes > - dio->pbase + dio->psize) { - panic("DATAOFF BAD %016jx/%d %016jx\n", - data_off, bytes, dio->pbase); - } - atomic_clear_64(&dio->dedup_ok_mask, - hammer2_dedup_mask(dio, data_off, bytes)); - hammer2_io_putblk(&dio); - } -} - -/* - * Assert that the data range is not considered for dedup operation. - */ -void -hammer2_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes) -{ - hammer2_io_t *dio; - - dio = hammer2_io_getquick(hmp, data_off, bytes, 1); - if (dio) { - KKASSERT((dio->dedup_ok_mask & - hammer2_dedup_mask(dio, data_off, bytes)) == 0); - hammer2_io_putblk(&dio); - } -} - static hammer2_off_t hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize) diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index 65614131f7..29d889ad78 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -245,7 +245,20 @@ hammer2_vfs_init(struct vfsconf *conf) error = 0; + /* + * A large DIO cache is needed to retain dedup enablement masks. + * The bulkfree code clears related masks as part of the disk block + * recycling algorithm, preventing it from being used for a later + * dedup. + * + * NOTE: A large buffer cache can actually interfere with dedup + * operation because we dedup based on media physical buffers + * and not logical buffers. Try to make the DIO chace large + * enough to avoid this problem, but also cap it. + */ hammer2_limit_dio = nbuf * 2; + if (hammer2_limit_dio > 100000) + hammer2_limit_dio = 100000; if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref)) error = EINVAL; -- 2.11.4.GIT