2 * Copyright (c) 2013-2017 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * Operation codes for hammer2_io_getblk(), selecting how the underlying
 * buffer cache buffer is instantiated.
 */
#define HAMMER2_DOP_READ	1	/* read the backing buffer */
#define HAMMER2_DOP_NEW		2	/* instantiate + zero, mark dirty */
#define HAMMER2_DOP_NEWNZ	3	/* instantiate w/o zeroing, mark dirty */
#define HAMMER2_DOP_READQ	4	/* quick read; DIO not created if absent */
43 * Implements an abstraction layer for synchronous and asynchronous
44 * buffered device I/O. Can be used as an OS-abstraction but the main
45 * purpose is to allow larger buffers to be used against hammer2_chain's
46 * using smaller allocations, without causing deadlocks.
48 * The DIOs also record temporary state with limited persistence. This
49 * feature is used to keep track of dedupable blocks.
51 static int hammer2_io_cleanup_callback(hammer2_io_t
*dio
, void *arg
);
52 static void dio_write_stats_update(hammer2_io_t
*dio
, struct buf
*bp
);
55 hammer2_io_cmp(hammer2_io_t
*io1
, hammer2_io_t
*io2
)
57 if (io1
->pbase
< io2
->pbase
)
59 if (io1
->pbase
> io2
->pbase
)
RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
	     off_t, pbase);
68 struct hammer2_cleanupcb_info
{
69 struct hammer2_io_tree tmptree
;
76 hammer2_io_mask(hammer2_io_t
*dio
, hammer2_off_t off
, u_int bytes
)
81 if (bytes
< 1024) /* smaller chunks not supported */
85 * Calculate crc check mask for larger chunks
87 i
= (((off
& ~HAMMER2_OFF_MASK_RADIX
) - dio
->pbase
) &
88 HAMMER2_PBUFMASK
) >> 10;
89 if (i
== 0 && bytes
== HAMMER2_PBUFSIZE
)
91 mask
= ((uint64_t)1U << (bytes
>> 10)) - 1;
99 * Returns the DIO corresponding to the data|radix, creating it if necessary.
101 * If createit is 0, NULL can be returned indicating that the DIO does not
102 * exist. (btype) is ignored when createit is 0.
106 hammer2_io_alloc(hammer2_dev_t
*hmp
, hammer2_key_t data_off
, uint8_t btype
,
107 int createit
, int *isgoodp
)
118 psize
= HAMMER2_PBUFSIZE
;
119 pmask
= ~(hammer2_off_t
)(psize
- 1);
120 lsize
= 1 << (int)(data_off
& HAMMER2_OFF_MASK_RADIX
);
121 lbase
= data_off
& ~HAMMER2_OFF_MASK_RADIX
;
122 pbase
= lbase
& pmask
;
124 if (pbase
== 0 || ((lbase
+ lsize
- 1) & pmask
) != pbase
) {
125 kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
126 pbase
, lbase
, lsize
, pmask
);
128 KKASSERT(pbase
!= 0 && ((lbase
+ lsize
- 1) & pmask
) == pbase
);
132 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
134 hammer2_spin_sh(&hmp
->io_spin
);
135 dio
= RB_LOOKUP(hammer2_io_tree
, &hmp
->iotree
, pbase
);
137 refs
= atomic_fetchadd_64(&dio
->refs
, 1);
138 if ((refs
& HAMMER2_DIO_MASK
) == 0) {
139 atomic_add_int(&dio
->hmp
->iofree_count
, -1);
141 if (refs
& HAMMER2_DIO_GOOD
)
143 hammer2_spin_unsh(&hmp
->io_spin
);
144 } else if (createit
) {
146 hammer2_spin_unsh(&hmp
->io_spin
);
147 dio
= kmalloc(sizeof(*dio
), M_HAMMER2
, M_INTWAIT
| M_ZERO
);
152 dio
->refs
= refs
+ 1;
154 hammer2_spin_ex(&hmp
->io_spin
);
155 xio
= RB_INSERT(hammer2_io_tree
, &hmp
->iotree
, dio
);
157 atomic_add_int(&hammer2_dio_count
, 1);
158 hammer2_spin_unex(&hmp
->io_spin
);
160 refs
= atomic_fetchadd_64(&xio
->refs
, 1);
161 if ((refs
& HAMMER2_DIO_MASK
) == 0)
162 atomic_add_int(&xio
->hmp
->iofree_count
, -1);
163 if (refs
& HAMMER2_DIO_GOOD
)
165 hammer2_spin_unex(&hmp
->io_spin
);
166 kfree(dio
, M_HAMMER2
);
170 hammer2_spin_unsh(&hmp
->io_spin
);
181 * Acquire the requested dio. If DIO_GOOD is not set we must instantiate
182 * a buffer. If set the buffer already exists and is good to go.
185 hammer2_io_getblk(hammer2_dev_t
*hmp
, int btype
, off_t lbase
, int lsize
, int op
)
196 bflags
= ((btype
== HAMMER2_BREF_TYPE_DATA
) ? B_NOTMETA
: 0);
199 KKASSERT((1 << (int)(lbase
& HAMMER2_OFF_MASK_RADIX
)) == lsize
);
201 if (op
== HAMMER2_DOP_READQ
) {
202 dio
= hammer2_io_alloc(hmp
, lbase
, btype
, 0, &isgood
);
205 op
= HAMMER2_DOP_READ
;
207 dio
= hammer2_io_alloc(hmp
, lbase
, btype
, 1, &isgood
);
215 * Buffer is already good, handle the op and return.
217 if (orefs
& HAMMER2_DIO_GOOD
) {
223 case HAMMER2_DOP_NEW
:
224 bzero(hammer2_io_data(dio
, lbase
), lsize
);
226 case HAMMER2_DOP_NEWNZ
:
227 atomic_set_long(&dio
->refs
, HAMMER2_DIO_DIRTY
);
229 case HAMMER2_DOP_READ
:
240 if (orefs
& HAMMER2_DIO_INPROG
) {
241 nrefs
= orefs
| HAMMER2_DIO_WAITING
;
242 tsleep_interlock(dio
, 0);
243 if (atomic_cmpset_64(&dio
->refs
, orefs
, nrefs
)) {
244 tsleep(dio
, PINTERLOCKED
, "h2dio", hz
);
248 nrefs
= orefs
| HAMMER2_DIO_INPROG
;
249 if (atomic_cmpset_64(&dio
->refs
, orefs
, nrefs
)) {
256 * We break to here if GOOD is not set and we acquired INPROG for
259 KKASSERT(dio
->bp
== NULL
);
260 if (btype
== HAMMER2_BREF_TYPE_DATA
)
261 hce
= hammer2_cluster_data_read
;
263 hce
= hammer2_cluster_meta_read
;
266 if (dio
->pbase
== (lbase
& ~HAMMER2_OFF_MASK_RADIX
) &&
267 dio
->psize
== lsize
) {
269 case HAMMER2_DOP_NEW
:
270 case HAMMER2_DOP_NEWNZ
:
271 dio
->bp
= getblk(dio
->hmp
->devvp
,
272 dio
->pbase
, dio
->psize
,
274 if (op
== HAMMER2_DOP_NEW
) {
276 bzero(dio
->bp
->b_data
, dio
->psize
);
278 atomic_set_long(&dio
->refs
, HAMMER2_DIO_DIRTY
);
280 case HAMMER2_DOP_READ
:
284 * Synchronous cluster I/O for now.
286 peof
= (dio
->pbase
+ HAMMER2_SEGMASK64
) &
289 error
= cluster_readx(dio
->hmp
->devvp
,
293 HAMMER2_PBUFSIZE
*hce
,
297 error
= breadnx(dio
->hmp
->devvp
, dio
->pbase
,
299 NULL
, NULL
, 0, &dio
->bp
);
305 * Synchronous cluster I/O for now.
307 peof
= (dio
->pbase
+ HAMMER2_SEGMASK64
) &
309 error
= cluster_readx(dio
->hmp
->devvp
,
310 peof
, dio
->pbase
, dio
->psize
,
312 dio
->psize
, HAMMER2_PBUFSIZE
*hce
,
315 error
= breadnx(dio
->hmp
->devvp
, dio
->pbase
,
317 NULL
, NULL
, 0, &dio
->bp
);
324 case HAMMER2_DOP_NEW
:
326 bzero(hammer2_io_data(dio
, lbase
), lsize
);
328 case HAMMER2_DOP_NEWNZ
:
329 atomic_set_long(&dio
->refs
, HAMMER2_DIO_DIRTY
);
331 case HAMMER2_DOP_READ
:
337 * Tell the kernel that the buffer cache is not
338 * meta-data based on the btype. This allows
339 * swapcache to distinguish between data and
343 case HAMMER2_BREF_TYPE_DATA
:
344 dio
->bp
->b_flags
|= B_NOTMETA
;
354 BUF_KERNPROC(dio
->bp
);
355 dio
->bp
->b_flags
&= ~B_AGE
;
360 * Clear INPROG and WAITING, set GOOD wake up anyone waiting.
365 nrefs
= orefs
& ~(HAMMER2_DIO_INPROG
| HAMMER2_DIO_WAITING
);
367 nrefs
|= HAMMER2_DIO_GOOD
;
368 if (atomic_cmpset_64(&dio
->refs
, orefs
, nrefs
)) {
369 if (orefs
& HAMMER2_DIO_WAITING
)
376 /* XXX error handling */
382 * Release our ref on *diop.
384 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
385 * of dio->bp. Then we clean up DIO_INPROG and DIO_WAITING.
388 hammer2_io_putblk(hammer2_io_t
**diop
)
403 KKASSERT((dio
->refs
& HAMMER2_DIO_MASK
) != 0);
408 * On the 1->0 transition clear GOOD and set INPROG, and break.
409 * On any other transition we can return early.
415 if ((orefs
& HAMMER2_DIO_MASK
) == 1 &&
416 (orefs
& HAMMER2_DIO_INPROG
) == 0) {
418 * Lastdrop case, INPROG can be set.
421 nrefs
&= ~(HAMMER2_DIO_GOOD
| HAMMER2_DIO_DIRTY
);
422 nrefs
|= HAMMER2_DIO_INPROG
;
423 if (atomic_cmpset_64(&dio
->refs
, orefs
, nrefs
))
425 } else if ((orefs
& HAMMER2_DIO_MASK
) == 1) {
427 * Lastdrop case, INPROG already set. We must
428 * wait for INPROG to clear.
430 nrefs
= orefs
| HAMMER2_DIO_WAITING
;
431 tsleep_interlock(dio
, 0);
432 if (atomic_cmpset_64(&dio
->refs
, orefs
, nrefs
)) {
433 tsleep(dio
, PINTERLOCKED
, "h2dio", hz
);
441 if (atomic_cmpset_64(&dio
->refs
, orefs
, nrefs
))
450 * Lastdrop (1->0 transition). INPROG has been set, GOOD and DIRTY
451 * have been cleared. iofree_count has not yet been incremented,
452 * note that another accessor race will decrement iofree_count so
453 * we have to increment it regardless.
455 * We can now dispose of the buffer, and should do it before calling
456 * io_complete() in case there's a race against a new reference
457 * which causes io_complete() to chain and instantiate the bp again.
464 if ((orefs
& HAMMER2_DIO_GOOD
) && bp
) {
466 * Non-errored disposal of bp
468 if (orefs
& HAMMER2_DIO_DIRTY
) {
469 dio_write_stats_update(dio
, bp
);
472 * Allows dirty buffers to accumulate and
473 * possibly be canceled (e.g. by a 'rm'),
474 * will burst-write later. Allow the kernel
475 * to cluster the dirty buffers.
477 * NOTE: Do not use cluster_write() here. The
478 * problem is that due to the way chains
479 * are locked, buffers are cycled in and out
480 * quite often so the disposal here is not
481 * necessarily the final disposal. Avoid
482 * excessive rewriting of the same blocks
483 * by using bdwrite().
489 if ((hce
= hammer2_cluster_write
) > 0) {
491 * Allows write-behind to keep the buffer
494 peof
= (pbase
+ HAMMER2_SEGMASK64
) &
496 bp
->b_flags
|= B_CLUSTEROK
;
497 cluster_write(bp
, peof
, psize
, hce
);
501 bp
->b_flags
|= B_CLUSTEROK
;
504 } else if (bp
->b_flags
& (B_ERROR
| B_INVAL
| B_RELBUF
)) {
511 * Errored disposal of bp
517 * Update iofree_count before disposing of the dio
520 atomic_add_int(&hmp
->iofree_count
, 1);
523 * Clear INPROG, GOOD, and WAITING
528 nrefs
= orefs
& ~(HAMMER2_DIO_INPROG
| HAMMER2_DIO_GOOD
|
529 HAMMER2_DIO_WAITING
);
530 if (atomic_cmpset_64(&dio
->refs
, orefs
, nrefs
)) {
531 if (orefs
& HAMMER2_DIO_WAITING
)
539 * We cache free buffers so re-use cases can use a shared lock, but
540 * if too many build up we have to clean them out.
542 limit_dio
= hammer2_limit_dio
;
545 if (limit_dio
> 1024*1024)
546 limit_dio
= 1024*1024;
547 if (hmp
->iofree_count
> limit_dio
) {
548 struct hammer2_cleanupcb_info info
;
550 RB_INIT(&info
.tmptree
);
551 hammer2_spin_ex(&hmp
->io_spin
);
552 if (hmp
->iofree_count
> limit_dio
) {
553 info
.count
= hmp
->iofree_count
/ 5;
554 RB_SCAN(hammer2_io_tree
, &hmp
->iotree
, NULL
,
555 hammer2_io_cleanup_callback
, &info
);
557 hammer2_spin_unex(&hmp
->io_spin
);
558 hammer2_io_cleanup(hmp
, &info
.tmptree
);
563 * Cleanup any dio's with (INPROG | refs) == 0.
565 * Called to clean up cached DIOs on umount after all activity has been
570 hammer2_io_cleanup_callback(hammer2_io_t
*dio
, void *arg
)
572 struct hammer2_cleanupcb_info
*info
= arg
;
575 if ((dio
->refs
& (HAMMER2_DIO_MASK
| HAMMER2_DIO_INPROG
)) == 0) {
579 act
= dio
->act
- (ticks
- dio
->ticks
) / hz
- 1;
586 KKASSERT(dio
->bp
== NULL
);
587 if (info
->count
> 0) {
588 RB_REMOVE(hammer2_io_tree
, &dio
->hmp
->iotree
, dio
);
589 xio
= RB_INSERT(hammer2_io_tree
, &info
->tmptree
, dio
);
590 KKASSERT(xio
== NULL
);
598 hammer2_io_cleanup(hammer2_dev_t
*hmp
, struct hammer2_io_tree
*tree
)
602 while ((dio
= RB_ROOT(tree
)) != NULL
) {
603 RB_REMOVE(hammer2_io_tree
, tree
, dio
);
604 KKASSERT(dio
->bp
== NULL
&&
605 (dio
->refs
& (HAMMER2_DIO_MASK
| HAMMER2_DIO_INPROG
)) == 0);
606 if (dio
->refs
& HAMMER2_DIO_DIRTY
) {
607 kprintf("hammer2_io_cleanup: Dirty buffer "
608 "%016jx/%d (bp=%p)\n",
609 dio
->pbase
, dio
->psize
, dio
->bp
);
611 kfree(dio
, M_HAMMER2
);
612 atomic_add_int(&hammer2_dio_count
, -1);
613 atomic_add_int(&hmp
->iofree_count
, -1);
618 * Returns a pointer to the requested data.
621 hammer2_io_data(hammer2_io_t
*dio
, off_t lbase
)
627 KKASSERT(bp
!= NULL
);
629 off
= (lbase
& ~HAMMER2_OFF_MASK_RADIX
) - bp
->b_loffset
;
630 KKASSERT(off
>= 0 && off
< bp
->b_bufsize
);
631 return(bp
->b_data
+ off
);
635 hammer2_io_new(hammer2_dev_t
*hmp
, int btype
, off_t lbase
, int lsize
,
638 *diop
= hammer2_io_getblk(hmp
, btype
, lbase
, lsize
, HAMMER2_DOP_NEW
);
639 return ((*diop
)->error
);
643 hammer2_io_newnz(hammer2_dev_t
*hmp
, int btype
, off_t lbase
, int lsize
,
646 *diop
= hammer2_io_getblk(hmp
, btype
, lbase
, lsize
, HAMMER2_DOP_NEWNZ
);
647 return ((*diop
)->error
);
651 hammer2_io_bread(hammer2_dev_t
*hmp
, int btype
, off_t lbase
, int lsize
,
654 *diop
= hammer2_io_getblk(hmp
, btype
, lbase
, lsize
, HAMMER2_DOP_READ
);
655 return ((*diop
)->error
);
659 hammer2_io_getquick(hammer2_dev_t
*hmp
, off_t lbase
, int lsize
)
663 dio
= hammer2_io_getblk(hmp
, 0, lbase
, lsize
, HAMMER2_DOP_READQ
);
668 hammer2_io_bawrite(hammer2_io_t
**diop
)
670 atomic_set_64(&(*diop
)->refs
, HAMMER2_DIO_DIRTY
);
671 hammer2_io_putblk(diop
);
675 hammer2_io_bdwrite(hammer2_io_t
**diop
)
677 atomic_set_64(&(*diop
)->refs
, HAMMER2_DIO_DIRTY
);
678 hammer2_io_putblk(diop
);
682 hammer2_io_bwrite(hammer2_io_t
**diop
)
684 atomic_set_64(&(*diop
)->refs
, HAMMER2_DIO_DIRTY
);
685 hammer2_io_putblk(diop
);
686 return (0); /* XXX */
690 hammer2_io_setdirty(hammer2_io_t
*dio
)
692 atomic_set_64(&dio
->refs
, HAMMER2_DIO_DIRTY
);
696 * This routine is called when a MODIFIED chain is being DESTROYED,
697 * in an attempt to allow the related buffer cache buffer to be
698 * invalidated and discarded instead of flushing it to disk.
700 * At the moment this case is only really useful for file meta-data.
701 * File data is already handled via the logical buffer cache associated
702 * with the vnode, and will be discarded if it was never flushed to disk.
703 * File meta-data may include inodes, directory entries, and indirect blocks.
706 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
707 * invalidated might be smaller. Most of the meta-data structures above
708 * are in the 'smaller' category. For now, don't try to invalidate the
712 hammer2_io_inval(hammer2_io_t
*dio
, hammer2_off_t data_off
, u_int bytes
)
718 hammer2_io_brelse(hammer2_io_t
**diop
)
720 hammer2_io_putblk(diop
);
724 hammer2_io_bqrelse(hammer2_io_t
**diop
)
726 hammer2_io_putblk(diop
);
730 * Set dedup validation bits in a DIO. We do not need the buffer cache
731 * buffer for this. This must be done concurrent with setting bits in
732 * the freemap so as to interlock with bulkfree's clearing of those bits.
735 hammer2_io_dedup_set(hammer2_dev_t
*hmp
, hammer2_blockref_t
*bref
)
742 dio
= hammer2_io_alloc(hmp
, bref
->data_off
, bref
->type
, 1, &isgood
);
743 lsize
= 1 << (int)(bref
->data_off
& HAMMER2_OFF_MASK_RADIX
);
744 mask
= hammer2_dedup_mask(dio
, bref
->data_off
, lsize
);
745 atomic_clear_64(&dio
->dedup_valid
, mask
);
746 atomic_set_64(&dio
->dedup_alloc
, mask
);
747 hammer2_io_putblk(&dio
);
751 * Clear dedup validation bits in a DIO. This is typically done when
752 * a modified chain is destroyed or by the bulkfree code. No buffer
753 * is needed for this operation. If the DIO no longer exists it is
754 * equivalent to the bits not being set.
757 hammer2_io_dedup_delete(hammer2_dev_t
*hmp
, uint8_t btype
,
758 hammer2_off_t data_off
, u_int bytes
)
764 if ((data_off
& ~HAMMER2_OFF_MASK_RADIX
) == 0)
766 if (btype
!= HAMMER2_BREF_TYPE_DATA
)
768 dio
= hammer2_io_alloc(hmp
, data_off
, btype
, 0, &isgood
);
770 if (data_off
< dio
->pbase
||
771 (data_off
& ~HAMMER2_OFF_MASK_RADIX
) + bytes
>
772 dio
->pbase
+ dio
->psize
) {
773 panic("hammer2_dedup_delete: DATAOFF BAD "
774 "%016jx/%d %016jx\n",
775 data_off
, bytes
, dio
->pbase
);
777 mask
= hammer2_dedup_mask(dio
, data_off
, bytes
);
778 atomic_clear_64(&dio
->dedup_alloc
, mask
);
779 atomic_clear_64(&dio
->dedup_valid
, mask
);
780 hammer2_io_putblk(&dio
);
785 * Assert that dedup allocation bits in a DIO are not set. This operation
786 * does not require a buffer. The DIO does not need to exist.
789 hammer2_io_dedup_assert(hammer2_dev_t
*hmp
, hammer2_off_t data_off
, u_int bytes
)
794 dio
= hammer2_io_alloc(hmp
, data_off
, HAMMER2_BREF_TYPE_DATA
,
797 KASSERT((dio
->dedup_alloc
&
798 hammer2_dedup_mask(dio
, data_off
, bytes
)) == 0,
799 ("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
802 hammer2_dedup_mask(dio
, data_off
, bytes
),
804 hammer2_io_putblk(&dio
);
810 dio_write_stats_update(hammer2_io_t
*dio
, struct buf
*bp
)
814 if (bp
->b_flags
& B_DELWRI
)
820 case HAMMER2_BREF_TYPE_DATA
:
821 counterp
= &hammer2_iod_file_write
;
823 case HAMMER2_BREF_TYPE_DIRENT
:
824 case HAMMER2_BREF_TYPE_INODE
:
825 counterp
= &hammer2_iod_meta_write
;
827 case HAMMER2_BREF_TYPE_INDIRECT
:
828 counterp
= &hammer2_iod_indr_write
;
830 case HAMMER2_BREF_TYPE_FREEMAP_NODE
:
831 case HAMMER2_BREF_TYPE_FREEMAP_LEAF
:
832 counterp
= &hammer2_iod_fmap_write
;
835 counterp
= &hammer2_iod_volu_write
;
838 *counterp
+= dio
->psize
;
842 hammer2_io_bkvasync(hammer2_io_t
*dio
)
844 KKASSERT(dio
->bp
!= NULL
);