hammer2 - Work on concurrent bulkfree stability
dragonfly.git: sys/vfs/hammer2/hammer2_io.c
1 /*
2 * Copyright (c) 2013-2014 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 #include "hammer2.h"
38 * Implements an abstraction layer for synchronous and asynchronous
39 * buffered device I/O. Can be used as an OS-abstraction but the main
40 * purpose is to allow larger buffers to be used against hammer2_chain's
41 * using smaller allocations, without causing deadlocks.
43 * The DIOs also record temporary state with limited persistence. This
44  * feature is used to keep track of dedupable blocks.
45  */
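/*
 * Illustrative sketch: the typical synchronous consumer pattern built on
 * top of this layer.  The helper name below is hypothetical and exists
 * only for illustration; the real entry points are hammer2_io_bread(),
 * hammer2_io_data() and hammer2_io_bqrelse(), defined later in this file.
 * (data_off) is assumed to be a blockref-style offset with the size radix
 * encoded in its low bits.
 */
#if 0
/* hypothetical example, not compiled */
static int
example_read_media(hammer2_dev_t *hmp, hammer2_off_t data_off)
{
	hammer2_io_t *dio;
	char *data;
	int lsize;
	int error;

	/* decode the size radix, as the real callers do */
	lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);

	/* synchronous read through the DIO layer */
	error = hammer2_io_bread(hmp, HAMMER2_BREF_TYPE_DATA,
				 data_off, lsize, &dio);
	if (error == 0) {
		data = hammer2_io_data(dio, data_off);
		/* ... consume lsize bytes at data ... */
	}
	if (dio)
		hammer2_io_bqrelse(&dio);
	return error;
}
#endif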
46 static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
47 static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);
49 static int
50 hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
52 if (io1->pbase < io2->pbase)
53 return(-1);
54 if (io1->pbase > io2->pbase)
55 return(1);
56 return(0);
59 RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
60 RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
61 off_t, pbase);
63 struct hammer2_cleanupcb_info {
64 struct hammer2_io_tree tmptree;
65 int count;
68 static __inline
69 uint64_t
70 hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
72 uint64_t mask;
73 int i;
75 if (bytes < 1024) /* smaller chunks not supported */
76 return 0;
79 * Calculate crc check mask for larger chunks
81 i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
82 HAMMER2_PBUFMASK) >> 10;
83 if (i == 0 && bytes == HAMMER2_PBUFSIZE)
84 return((uint64_t)-1);
85 mask = ((uint64_t)1U << (bytes >> 10)) - 1;
86 mask <<= i;
88 return mask;
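/*
 * Worked example: a 16KB chunk beginning 32KB into its 64KB physical
 * buffer yields (bytes >> 10) = 16 consecutive bits shifted left by
 * i = (32768 >> 10) = 32, i.e. a mask of 0x0000ffff00000000.  Chunks
 * smaller than 1KB are not tracked (mask 0), and a full 64KB chunk at
 * offset 0 returns an all-ones mask.
 */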
91 #define HAMMER2_GETBLK_GOOD 0
92 #define HAMMER2_GETBLK_QUEUED 1
93 #define HAMMER2_GETBLK_OWNED 2
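/*
 * NOTE: dio->refs is a single 64-bit word multiplexing the active
 * reference count (the HAMMER2_DIO_MASK bits) with the DIO_GOOD,
 * DIO_INPROG, DIO_WAITING and DIO_DIRTY state flags.  The state
 * transitions below use 64-bit compare-and-swap loops on this word so
 * ref counting and state changes remain atomic with respect to each
 * other.
 */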
96 * Returns the DIO corresponding to the data|radix, creating it if necessary.
98 * If createit is 0, NULL can be returned indicating that the DIO does not
99 * exist. (btype) is ignored when createit is 0.
101 static __inline
102 hammer2_io_t *
103 hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
104 int createit)
106 hammer2_io_t *dio;
107 hammer2_io_t *xio;
108 hammer2_key_t lbase;
109 hammer2_key_t pbase;
110 hammer2_key_t pmask;
111 int lsize;
112 int psize;
114 psize = HAMMER2_PBUFSIZE;
115 pmask = ~(hammer2_off_t)(psize - 1);
116 lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
117 lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
118 pbase = lbase & pmask;
120 if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
121 kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
122 pbase, lbase, lsize, pmask);
124 KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
127 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
129 hammer2_spin_sh(&hmp->io_spin);
130 dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
131 if (dio) {
132 if ((atomic_fetchadd_64(&dio->refs, 1) &
133 HAMMER2_DIO_MASK) == 0) {
134 atomic_add_int(&dio->hmp->iofree_count, -1);
136 hammer2_spin_unsh(&hmp->io_spin);
137 } else if (createit) {
138 hammer2_spin_unsh(&hmp->io_spin);
139 dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
140 dio->hmp = hmp;
141 dio->pbase = pbase;
142 dio->psize = psize;
143 dio->btype = btype;
144 dio->refs = 1;
145 dio->act = 5;
146 hammer2_spin_init(&dio->spin, "h2dio");
147 TAILQ_INIT(&dio->iocbq);
148 hammer2_spin_ex(&hmp->io_spin);
149 xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
150 if (xio == NULL) {
151 atomic_add_int(&hammer2_dio_count, 1);
152 hammer2_spin_unex(&hmp->io_spin);
153 } else {
154 if ((atomic_fetchadd_64(&xio->refs, 1) &
155 HAMMER2_DIO_MASK) == 0) {
156 atomic_add_int(&xio->hmp->iofree_count, -1);
158 hammer2_spin_unex(&hmp->io_spin);
159 kfree(dio, M_HAMMER2);
160 dio = xio;
162 } else {
163 hammer2_spin_unsh(&hmp->io_spin);
164 return NULL;
166 dio->ticks = ticks;
167 if (dio->act < 10)
168 ++dio->act;
170 return dio;
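/*
 * NOTE: two threads can race to create the same DIO.  The RB_INSERT
 * loser above simply takes a ref on the winner's structure (xio),
 * frees its own allocation and continues with the shared DIO, so no
 * lock needs to be held across the kmalloc().
 */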
174 * Allocate/Locate the requested dio, reference it, issue or queue iocb.
176 void
177 hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
178 hammer2_iocb_t *iocb)
180 hammer2_io_t *dio;
181 uint64_t refs;
183 KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
184 dio = hammer2_io_alloc(hmp, lbase, iocb->btype, 1);
186 iocb->dio = dio;
188 for (;;) {
189 refs = dio->refs;
190 cpu_ccfence();
193 * Issue the iocb immediately if the buffer is already good.
194 * Once set GOOD cannot be cleared until refs drops to 0.
196 * lfence required because dio's are not interlocked for
197 * the DIO_GOOD test.
199 if (refs & HAMMER2_DIO_GOOD) {
200 cpu_lfence();
201 iocb->callback(iocb);
202 break;
206 * Try to own the DIO by setting INPROG so we can issue
207 * I/O on it.
209 if (refs & HAMMER2_DIO_INPROG) {
211 * If DIO_INPROG is already set then set WAITING and
212 * queue the iocb.
214 hammer2_spin_ex(&dio->spin);
215 if (atomic_cmpset_64(&dio->refs, refs,
216 refs | HAMMER2_DIO_WAITING)) {
217 iocb->flags |= HAMMER2_IOCB_ONQ |
218 HAMMER2_IOCB_INPROG;
219 TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
220 hammer2_spin_unex(&dio->spin);
221 break;
223 hammer2_spin_unex(&dio->spin);
224 /* retry */
225 } else {
227 * If DIO_INPROG is not set then set it and issue the
228 * callback immediately to start I/O.
230 if (atomic_cmpset_64(&dio->refs, refs,
231 refs | HAMMER2_DIO_INPROG)) {
232 iocb->flags |= HAMMER2_IOCB_INPROG;
233 iocb->callback(iocb);
234 break;
236 /* retry */
238 /* retry */
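/*
 * NOTE: callers normally drive hammer2_io_getblk() with a stack-allocated
 * iocb: fill in the callback, btype, lbase and lsize, call
 * hammer2_io_getblk(), then hammer2_iocb_wait() until HAMMER2_IOCB_DONE
 * is set and pick up iocb.dio / iocb.error.  See _hammer2_io_new() and
 * hammer2_io_bread() later in this file for the canonical synchronous
 * wrappers.
 */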
243 * Quickly obtain a good DIO buffer, return NULL if the system no longer
244 * caches the data.
246 hammer2_io_t *
247 hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize, int notgood)
249 hammer2_iocb_t iocb;
250 hammer2_io_t *dio;
251 struct buf *bp;
252 off_t pbase;
253 off_t pmask;
254 int psize = HAMMER2_PBUFSIZE;
255 uint64_t orefs;
256 uint64_t nrefs;
258 pmask = ~(hammer2_off_t)(psize - 1);
260 KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
261 lbase &= ~HAMMER2_OFF_MASK_RADIX;
262 pbase = lbase & pmask;
263 if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
264 kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
265 pbase, lbase, lsize, pmask);
267 KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
270 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
272 hammer2_spin_sh(&hmp->io_spin);
273 dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
274 if (dio == NULL) {
275 hammer2_spin_unsh(&hmp->io_spin);
276 return NULL;
279 if ((atomic_fetchadd_64(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
280 atomic_add_int(&dio->hmp->iofree_count, -1);
281 hammer2_spin_unsh(&hmp->io_spin);
283 dio->ticks = ticks;
284 if (dio->act < 10)
285 ++dio->act; /* SMP race ok */
288 * Obtain/validate the buffer. Do NOT issue I/O. Discard if
289 * the system does not have the data already cached.
291 nrefs = (uint64_t)-1;
292 for (;;) {
293 orefs = dio->refs;
294 cpu_ccfence();
297 * Issue the iocb immediately if the buffer is already good.
298 * Once set GOOD cannot be cleared until refs drops to 0.
300  * lfence required because the dio is not interlocked for
301 * the DIO_GOOD test.
303 if (orefs & HAMMER2_DIO_GOOD) {
304 cpu_lfence();
305 break;
309 * Try to own the DIO by setting INPROG so we can issue
310 * I/O on it. INPROG might already be set, in which case
311 * there is no way we can do this non-blocking so we punt.
313 if ((orefs & HAMMER2_DIO_INPROG))
314 break;
315 nrefs = orefs | HAMMER2_DIO_INPROG;
316 if (atomic_cmpset_64(&dio->refs, orefs, nrefs) == 0)
317 continue;
320 * We own DIO_INPROG, try to set DIO_GOOD.
322 * If (notgood) specified caller just wants the dio and doesn't
323 * care about the buffer a whole lot. However, if the buffer
324 * is good (or dirty), we still want to return it.
326 * Otherwise we are trying to resolve a dedup and bread()
327 * is expected to always be better than building a new buffer
328 * that will be written. Use bread() for better determinism
329 * than getblk().
331 bp = dio->bp;
332 dio->bp = NULL;
333 if (bp == NULL) {
334 if (notgood)
335 bp = getblk(hmp->devvp, dio->pbase,
336 dio->psize, 0, 0);
337 else
338 bread(hmp->devvp, dio->pbase, dio->psize, &bp);
342 * System buffer must also have remained cached.
344 if (bp) {
345 if ((bp->b_flags & B_ERROR) == 0 &&
346 (bp->b_flags & B_CACHE)) {
347 dio->bp = bp; /* assign BEFORE setting flag */
348 atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
349 } else {
350 bqrelse(bp);
351 bp = NULL;
356 * Clear DIO_INPROG.
358 * This is actually a bit complicated, see
359 * hammer2_io_complete() for more information.
361 iocb.dio = dio;
362 iocb.flags = HAMMER2_IOCB_INPROG;
363 hammer2_io_complete(&iocb);
364 break;
368 * Only return the dio if its buffer is good. If notgood != 0,
369  * we return the buffer regardless (so ephemeral dedup bits can be
370 * cleared).
372 if (notgood == 0 && (dio->refs & HAMMER2_DIO_GOOD) == 0) {
373 hammer2_io_putblk(&dio);
375 return dio;
379 * The originator of the iocb is finished with it.
381 * WARNING: iocb may be partially initialized with only iocb->dio and
382 * iocb->flags.
384 void
385 hammer2_io_complete(hammer2_iocb_t *iocb)
387 hammer2_io_t *dio = iocb->dio;
388 hammer2_iocb_t *cbtmp;
389 uint64_t orefs;
390 uint64_t nrefs;
391 uint32_t oflags;
392 uint32_t nflags;
395 * If IOCB_INPROG was not set completion is synchronous due to the
396 * buffer already being good. We can simply set IOCB_DONE and return.
398 * In this situation DIO_INPROG is not set and we have no visibility
399 * on dio->bp. We should not try to mess with dio->bp because another
400 * thread may be finishing up its processing. dio->bp should already
401 * be set to BUF_KERNPROC()!
403 if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
404 atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
405 return;
409 * The iocb was queued, obtained DIO_INPROG, and its callback was
410 * made. The callback is now complete. We still own DIO_INPROG.
412 * We can set DIO_GOOD if no error occurred, which gives certain
413 * stability guarantees to dio->bp and allows other accessors to
414 * short-cut access. DIO_GOOD cannot be cleared until the last
415 * ref is dropped.
417 KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
418 if (dio->bp) {
419 BUF_KERNPROC(dio->bp);
420 if ((dio->bp->b_flags & B_ERROR) == 0) {
421 KKASSERT(dio->bp->b_flags & B_CACHE);
422 atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
427 * Clean up the dio before marking the iocb as being done. If another
428 * iocb is pending we chain to it while leaving DIO_INPROG set (it
429 * will call io completion and presumably clear DIO_INPROG).
431 * Otherwise if no other iocbs are pending we clear DIO_INPROG before
432 * finishing up the cbio. This means that DIO_INPROG is cleared at
433 * the end of the chain before ANY of the cbios are marked done.
435 * NOTE: The TAILQ is not stable until the spin-lock is held.
437 for (;;) {
438 orefs = dio->refs;
439 nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);
441 if (orefs & HAMMER2_DIO_WAITING) {
442 hammer2_spin_ex(&dio->spin);
443 cbtmp = TAILQ_FIRST(&dio->iocbq);
444 if (cbtmp) {
446 * NOTE: flags not adjusted in this case.
447 * Flags will be adjusted by the last
448 * iocb.
450 TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
451 hammer2_spin_unex(&dio->spin);
452 cbtmp->callback(cbtmp); /* chained */
453 break;
454 } else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
455 hammer2_spin_unex(&dio->spin);
456 break;
458 hammer2_spin_unex(&dio->spin);
459 /* retry */
460 } else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
461 break;
462 } /* else retry */
463 /* retry */
467 * Mark the iocb as done and wakeup any waiters. This is done after
468 * all iocb chains have been called back and after DIO_INPROG has been
469 * cleared. This avoids races against ref count drops by the waiting
470 * threads (a hard but not impossible SMP race) which might result in
471 * a 1->0 transition of the refs while DIO_INPROG is still set.
473 for (;;) {
474 oflags = iocb->flags;
475 cpu_ccfence();
476 nflags = oflags;
477 nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
478 nflags |= HAMMER2_IOCB_DONE;
480 if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
481 if (oflags & HAMMER2_IOCB_WAKEUP)
482 wakeup(iocb);
483 /* SMP: iocb is now stale */
484 break;
486 /* retry */
488 iocb = NULL;
493 * Wait for an iocb's I/O to finish.
495 void
496 hammer2_iocb_wait(hammer2_iocb_t *iocb)
498 uint32_t oflags;
499 uint32_t nflags;
501 for (;;) {
502 oflags = iocb->flags;
503 cpu_ccfence();
504 nflags = oflags | HAMMER2_IOCB_WAKEUP;
505 if (oflags & HAMMER2_IOCB_DONE)
506 break;
507 tsleep_interlock(iocb, 0);
508 if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
509 tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
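/*
 * NOTE: the tsleep_interlock() / atomic_cmpset / tsleep(PINTERLOCKED)
 * sequence above closes the race where hammer2_io_complete() sets
 * IOCB_DONE and issues the wakeup between our DONE test and the sleep;
 * the interlock guarantees that wakeup is not lost.
 */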
516 * Release our ref on *diop.
518 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
519 * then dispose of the underlying buffer.
521 void
522 hammer2_io_putblk(hammer2_io_t **diop)
524 hammer2_dev_t *hmp;
525 hammer2_io_t *dio;
526 hammer2_iocb_t iocb;
527 struct buf *bp;
528 off_t peof;
529 off_t pbase;
530 int psize;
531 int limit_dio;
532 uint64_t orefs;
533 uint64_t nrefs;
535 dio = *diop;
536 *diop = NULL;
537 hmp = dio->hmp;
539 KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);
542 * Drop refs.
544 * On the 1->0 transition clear flags and set INPROG.
546 * On the 1->0 transition if INPROG is already set, another thread
547 * is in lastdrop and we can just return after the transition.
549 * On any other transition we can generally just return.
551 for (;;) {
552 orefs = dio->refs;
553 cpu_ccfence();
554 nrefs = orefs - 1;
556 if ((orefs & HAMMER2_DIO_MASK) == 1 &&
557 (orefs & HAMMER2_DIO_INPROG) == 0) {
559 * Lastdrop case, INPROG can be set.
561 nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
562 nrefs |= HAMMER2_DIO_INPROG;
563 if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
564 break;
565 } else if ((orefs & HAMMER2_DIO_MASK) == 1) {
567 * Lastdrop case, INPROG already set.
569 if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
570 atomic_add_int(&hmp->iofree_count, 1);
571 return;
573 } else {
575 * Normal drop case.
577 if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
578 return;
580 cpu_pause();
581 /* retry */
585 * Lastdrop (1->0 transition). INPROG has been set, GOOD and DIRTY
586 * have been cleared. iofree_count has not yet been incremented,
587 * note that another accessor race will decrement iofree_count so
588 * we have to increment it regardless.
590 * We can now dispose of the buffer, and should do it before calling
591 * io_complete() in case there's a race against a new reference
592 * which causes io_complete() to chain and instantiate the bp again.
594 pbase = dio->pbase;
595 psize = dio->psize;
596 bp = dio->bp;
597 dio->bp = NULL;
599 if (orefs & HAMMER2_DIO_GOOD) {
600 KKASSERT(bp != NULL);
601 #if 0
602 if (hammer2_inval_enable &&
603 (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
604 ++hammer2_iod_invals;
605 bp->b_flags |= B_INVAL | B_RELBUF;
606 brelse(bp);
607 } else
608 #endif
609 if (orefs & HAMMER2_DIO_DIRTY) {
610 int hce;
612 dio_write_stats_update(dio, bp);
613 if ((hce = hammer2_cluster_write) > 0) {
615 * Allows write-behind to keep the buffer
616 * cache sane.
618 peof = (pbase + HAMMER2_SEGMASK64) &
619 ~HAMMER2_SEGMASK64;
620 bp->b_flags |= B_CLUSTEROK;
621 cluster_write(bp, peof, psize, hce);
622 } else {
624 * Allows dirty buffers to accumulate and
625 * possibly be canceled (e.g. by a 'rm'),
626 * will burst-write later.
628 bp->b_flags |= B_CLUSTEROK;
629 bdwrite(bp);
631 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
632 brelse(bp);
633 } else {
634 bqrelse(bp);
636 } else if (bp) {
637 #if 0
638 if (hammer2_inval_enable &&
639 (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
640 ++hammer2_iod_invals;
641 bp->b_flags |= B_INVAL | B_RELBUF;
642 brelse(bp);
643 } else
644 #endif
645 if (orefs & HAMMER2_DIO_DIRTY) {
646 dio_write_stats_update(dio, bp);
647 bdwrite(bp);
648 } else {
649 bqrelse(bp);
654 * The instant we call io_complete dio is a free agent again and
655 * can be ripped out from under us.
657 * we can cleanup our final DIO_INPROG by simulating an iocb
658 * completion.
660 hmp = dio->hmp; /* extract fields */
661 atomic_add_int(&hmp->iofree_count, 1);
662 cpu_ccfence();
664 iocb.dio = dio;
665 iocb.flags = HAMMER2_IOCB_INPROG;
666 hammer2_io_complete(&iocb);
667 dio = NULL; /* dio stale */
670 * We cache free buffers so re-use cases can use a shared lock, but
671 * if too many build up we have to clean them out.
673 limit_dio = hammer2_limit_dio;
674 if (limit_dio < 256)
675 limit_dio = 256;
676 if (limit_dio > 1024*1024)
677 limit_dio = 1024*1024;
678 if (hmp->iofree_count > limit_dio) {
679 struct hammer2_cleanupcb_info info;
681 RB_INIT(&info.tmptree);
682 hammer2_spin_ex(&hmp->io_spin);
683 if (hmp->iofree_count > limit_dio) {
684 info.count = hmp->iofree_count / 5;
685 RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
686 hammer2_io_cleanup_callback, &info);
688 hammer2_spin_unex(&hmp->io_spin);
689 hammer2_io_cleanup(hmp, &info.tmptree);
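/*
 * NOTE: when the number of cached-but-unreferenced DIOs exceeds the
 * clamped hammer2_limit_dio threshold, roughly one fifth of them are
 * reclaimed in a single pass.  hammer2_io_cleanup_callback() ages each
 * candidate via its act/ticks fields, so recently referenced DIOs
 * survive the trim.
 */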
694 * Cleanup any dio's with (INPROG | refs) == 0.
696 * Called to clean up cached DIOs on umount after all activity has been
697 * flushed.
699 static
701 hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
703 struct hammer2_cleanupcb_info *info = arg;
704 hammer2_io_t *xio;
706 if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
707 if (dio->act > 0) {
708 int act;
710 act = dio->act - (ticks - dio->ticks) / hz - 1;
711 if (act > 0) {
712 dio->act = act;
713 return 0;
715 dio->act = 0;
717 KKASSERT(dio->bp == NULL);
718 if (info->count > 0) {
719 RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
720 xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
721 KKASSERT(xio == NULL);
722 --info->count;
725 return 0;
728 void
729 hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
731 hammer2_io_t *dio;
733 while ((dio = RB_ROOT(tree)) != NULL) {
734 RB_REMOVE(hammer2_io_tree, tree, dio);
735 KKASSERT(dio->bp == NULL &&
736 (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
737 kfree(dio, M_HAMMER2);
738 atomic_add_int(&hammer2_dio_count, -1);
739 atomic_add_int(&hmp->iofree_count, -1);
744 * Returns a pointer to the requested data.
746 char *
747 hammer2_io_data(hammer2_io_t *dio, off_t lbase)
749 struct buf *bp;
750 int off;
752 bp = dio->bp;
753 KKASSERT(bp != NULL);
754 off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
755 KKASSERT(off >= 0 && off < bp->b_bufsize);
756 return(bp->b_data + off);
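/*
 * Worked example: bp->b_loffset is the DIO's pbase, so a logical block
 * located 0x8000 bytes into a 64KB physical buffer resolves to
 * bp->b_data + 0x8000.  The low radix bits of lbase are masked off
 * before the subtraction, so a blockref-encoded offset can be passed
 * directly.
 */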
759 #if 0
761 * Keep track of good CRCs in dio->good_crc_mask. XXX needs to be done
762 * in the chain structure, but chain structure needs to be persistent as
763 * well on refs=0 and it isn't.
766 hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp)
768 hammer2_io_t *dio;
769 uint64_t mask;
771 if ((dio = chain->dio) != NULL && chain->bytes >= 1024) {
772 mask = hammer2_io_mask(dio, chain->bref.data_off, chain->bytes);
773 *maskp = mask;
774 if ((dio->crc_good_mask & mask) == mask)
775 return 1;
776 return 0;
778 *maskp = 0;
780 return 0;
783 void
784 hammer2_io_crc_setmask(hammer2_io_t *dio, uint64_t mask)
786 if (dio) {
787 if (sizeof(long) == 8) {
788 atomic_set_long(&dio->crc_good_mask, mask);
789 } else {
790 #if _BYTE_ORDER == _LITTLE_ENDIAN
791 atomic_set_int(&((int *)&dio->crc_good_mask)[0],
792 (uint32_t)mask);
793 atomic_set_int(&((int *)&dio->crc_good_mask)[1],
794 (uint32_t)(mask >> 32));
795 #else
796 atomic_set_int(&((int *)&dio->crc_good_mask)[0],
797 (uint32_t)(mask >> 32));
798 atomic_set_int(&((int *)&dio->crc_good_mask)[1],
799 (uint32_t)mask);
800 #endif
805 void
806 hammer2_io_crc_clrmask(hammer2_io_t *dio, uint64_t mask)
808 if (dio) {
809 if (sizeof(long) == 8) {
810 atomic_clear_long(&dio->crc_good_mask, mask);
811 } else {
812 #if _BYTE_ORDER == _LITTLE_ENDIAN
813 atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
814 (uint32_t)mask);
815 atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
816 (uint32_t)(mask >> 32));
817 #else
818 atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
819 (uint32_t)(mask >> 32));
820 atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
821 (uint32_t)mask);
822 #endif
826 #endif
829 * Helpers for hammer2_io_new*() functions
831 static
832 void
833 hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
835 hammer2_io_t *dio = iocb->dio;
836 int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;
839 * If IOCB_INPROG is not set the dio already has a good buffer and we
840 * can't mess with it other than zero the requested range.
842 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
843 * do what needs to be done with dio->bp.
845 if (iocb->flags & HAMMER2_IOCB_INPROG) {
846 if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
847 if (iocb->lsize == dio->psize) {
849 * Fully covered buffer, try to optimize to
850 * avoid any I/O. We might already have the
851 * buffer due to iocb chaining.
853 if (dio->bp == NULL) {
854 dio->bp = getblk(dio->hmp->devvp,
855 dio->pbase, dio->psize,
856 gbctl, 0);
858 if (dio->bp) {
859 vfs_bio_clrbuf(dio->bp);
860 dio->bp->b_flags |= B_CACHE;
862 } else if (iocb->flags & HAMMER2_IOCB_QUICK) {
864 * Partial buffer, quick mode. Do nothing.
865 * Do not instantiate the buffer or try to
866 * mark it B_CACHE because other portions of
867 * the buffer might have to be read by other
868 * accessors.
870 } else if (dio->bp == NULL ||
871 (dio->bp->b_flags & B_CACHE) == 0) {
873 * Partial buffer, normal mode, requires
874 * read-before-write. Chain the read.
876 * We might already have the buffer due to
877 * iocb chaining. XXX unclear if we really
878 * need to write/release it and reacquire
879 * in that case.
881 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
883 if (dio->bp) {
884 if (dio->refs & HAMMER2_DIO_DIRTY) {
885 dio_write_stats_update(dio,
886 dio->bp);
887 bdwrite(dio->bp);
888 } else {
889 bqrelse(dio->bp);
891 dio->bp = NULL;
893 atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
894 breadcb(dio->hmp->devvp,
895 dio->pbase, dio->psize,
896 hammer2_io_callback, iocb);
897 return;
898 } /* else buffer is good */
899 } /* else callback from breadcb is complete */
901 if (dio->bp) {
902 if (iocb->flags & HAMMER2_IOCB_ZERO)
903 bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
904 atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
906 hammer2_io_complete(iocb);
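/*
 * NOTE: the three write-setup cases above are (1) the request covers the
 * whole physical buffer, so getblk() + vfs_bio_clrbuf() avoids any media
 * read; (2) a partial buffer in QUICK mode, which is left untouched; and
 * (3) a partial buffer in normal mode, which queues an asynchronous
 * read-before-write via breadcb() and re-enters this callback through
 * hammer2_io_callback() once the read finishes.
 */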
909 static
911 _hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
912 hammer2_io_t **diop, int flags)
914 hammer2_iocb_t iocb;
916 iocb.callback = hammer2_iocb_new_callback;
917 iocb.chain = NULL;
918 iocb.ptr = NULL;
919 iocb.lbase = lbase;
920 iocb.lsize = lsize;
921 iocb.flags = flags;
922 iocb.btype = btype;
923 iocb.error = 0;
924 hammer2_io_getblk(hmp, lbase, lsize, &iocb);
925 if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
926 hammer2_iocb_wait(&iocb);
927 *diop = iocb.dio;
929 return (iocb.error);
933 hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
934 hammer2_io_t **diop)
936 return(_hammer2_io_new(hmp, btype, lbase, lsize,
937 diop, HAMMER2_IOCB_ZERO));
941 hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
942 hammer2_io_t **diop)
944 return(_hammer2_io_new(hmp, btype, lbase, lsize, diop, 0));
948 * This is called from the freemap to pre-validate a full-sized buffer
949  * whose contents we don't care about, in order to prevent an unnecessary
950 * read-before-write.
952 void
953 hammer2_io_newq(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize)
955 hammer2_io_t *dio = NULL;
957 _hammer2_io_new(hmp, btype, lbase, lsize, &dio, HAMMER2_IOCB_QUICK);
958 hammer2_io_bqrelse(&dio);
961 static
962 void
963 hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
965 hammer2_io_t *dio = iocb->dio;
966 off_t peof;
967 int error;
970 * If IOCB_INPROG is not set the dio already has a good buffer and we
971 * can't mess with it other than zero the requested range.
973 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
974 * do what needs to be done with dio->bp.
976 if (iocb->flags & HAMMER2_IOCB_INPROG) {
977 int hce;
979 if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
981 * Already good, likely due to being chained from
982 * another iocb.
984 error = 0;
985 } else if ((hce = hammer2_cluster_read) > 0) {
987 * Synchronous cluster I/O for now.
989 if (dio->bp) {
990 bqrelse(dio->bp);
991 dio->bp = NULL;
993 peof = (dio->pbase + HAMMER2_SEGMASK64) &
994 ~HAMMER2_SEGMASK64;
995 error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
996 dio->psize,
997 dio->psize, HAMMER2_PBUFSIZE*hce,
998 &dio->bp);
999 } else {
1001 * Synchronous I/O for now.
1003 if (dio->bp) {
1004 bqrelse(dio->bp);
1005 dio->bp = NULL;
1007 error = bread(dio->hmp->devvp, dio->pbase,
1008 dio->psize, &dio->bp);
1010 if (error) {
1011 brelse(dio->bp);
1012 dio->bp = NULL;
1015 hammer2_io_complete(iocb);
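/*
 * NOTE: when the hammer2_cluster_read knob is non-zero the read-ahead
 * window is hce * HAMMER2_PBUFSIZE bytes, bounded by the segment-aligned
 * peof; otherwise a plain synchronous bread() is issued.  A buffer
 * already present with B_CACHE set (typically chained from another iocb)
 * short-circuits both paths.
 */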
1019 hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
1020 hammer2_io_t **diop)
1022 hammer2_iocb_t iocb;
1024 iocb.callback = hammer2_iocb_bread_callback;
1025 iocb.chain = NULL;
1026 iocb.ptr = NULL;
1027 iocb.lbase = lbase;
1028 iocb.lsize = lsize;
1029 iocb.btype = btype;
1030 iocb.flags = 0;
1031 iocb.error = 0;
1032 hammer2_io_getblk(hmp, lbase, lsize, &iocb);
1033 if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
1034 hammer2_iocb_wait(&iocb);
1035 *diop = iocb.dio;
1037 return (iocb.error);
1041 * System buf/bio async callback extracts the iocb and chains
1042 * to the iocb callback.
1044 void
1045 hammer2_io_callback(struct bio *bio)
1047 struct buf *dbp = bio->bio_buf;
1048 hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
1049 hammer2_io_t *dio;
1051 dio = iocb->dio;
1052 if ((bio->bio_flags & BIO_DONE) == 0)
1053 bpdone(dbp, 0);
1054 bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
1055 dio->bp = bio->bio_buf;
1056 iocb->callback(iocb);
1059 void
1060 hammer2_io_bawrite(hammer2_io_t **diop)
1062 atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
1063 hammer2_io_putblk(diop);
1066 void
1067 hammer2_io_bdwrite(hammer2_io_t **diop)
1069 atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
1070 hammer2_io_putblk(diop);
1074 hammer2_io_bwrite(hammer2_io_t **diop)
1076 atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
1077 hammer2_io_putblk(diop);
1078 return (0); /* XXX */
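/*
 * NOTE: hammer2_io_bawrite(), hammer2_io_bdwrite() and hammer2_io_bwrite()
 * currently differ in name only -- each simply marks the DIO dirty and
 * drops its ref; the actual write strategy (cluster_write() vs bdwrite())
 * is chosen on the final ref drop in hammer2_io_putblk().
 */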
1081 void
1082 hammer2_io_setdirty(hammer2_io_t *dio)
1084 atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
1088 * This routine is called when a MODIFIED chain is being DESTROYED,
1089 * in an attempt to allow the related buffer cache buffer to be
1090 * invalidated and discarded instead of flushing it to disk.
1092 * At the moment this case is only really useful for file meta-data.
1093 * File data is already handled via the logical buffer cache associated
1094 * with the vnode, and will be discarded if it was never flushed to disk.
1095 * File meta-data may include inodes, directory entries, and indirect blocks.
1097 * XXX
1098 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
1099 * invalidated might be smaller. Most of the meta-data structures above
1100 * are in the 'smaller' category. For now, don't try to invalidate the
1101 * data areas.
1103 void
1104 hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
1106 /* NOP */
1109 void
1110 hammer2_io_brelse(hammer2_io_t **diop)
1112 hammer2_io_putblk(diop);
1115 void
1116 hammer2_io_bqrelse(hammer2_io_t **diop)
1118 hammer2_io_putblk(diop);
1122 hammer2_io_isdirty(hammer2_io_t *dio)
1124 return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
1128 * Set dedup validation bits in a DIO. We do not need the buffer cache
1129 * buffer for this. This must be done concurrent with setting bits in
1130 * the freemap so as to interlock with bulkfree's clearing of those bits.
1132 void
1133 hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
1135 hammer2_io_t *dio;
1136 int lsize;
1138 dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1);
1139 lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
1140 atomic_set_64(&dio->dedup_ok_mask,
1141 hammer2_dedup_mask(dio, bref->data_off, lsize));
1142 hammer2_io_putblk(&dio);
1146 * Clear dedup validation bits in a DIO. This is typically done when
1147 * a modified chain is destroyed or by the bulkfree code. No buffer
1148 * is needed for this operation. If the DIO no longer exists it is
1149 * equivalent to the bits not being set.
1151 void
1152 hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
1153 hammer2_off_t data_off, u_int bytes)
1155 hammer2_io_t *dio;
1157 if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
1158 return;
1159 if (btype != HAMMER2_BREF_TYPE_DATA)
1160 return;
1161 dio = hammer2_io_alloc(hmp, data_off, btype, 0);
1162 if (dio) {
1163 if (data_off < dio->pbase ||
1164 (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
1165 dio->pbase + dio->psize) {
1166 panic("hammer2_dedup_delete: DATAOFF BAD "
1167 "%016jx/%d %016jx\n",
1168 data_off, bytes, dio->pbase);
1170 atomic_clear_64(&dio->dedup_ok_mask,
1171 hammer2_dedup_mask(dio, data_off, bytes));
1172 hammer2_io_putblk(&dio);
1177 * Assert that dedup validation bits in a DIO are not set. This operation
1178 * does not require a buffer. The DIO does not need to exist.
1180 void
1181 hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
1183 hammer2_io_t *dio;
1185 dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA, 0);
1186 if (dio) {
1187 KASSERT((dio->dedup_ok_mask &
1188 hammer2_dedup_mask(dio, data_off, bytes)) == 0,
1189 ("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
1190 data_off,
1191 bytes,
1192 hammer2_dedup_mask(dio, data_off, bytes),
1193 dio->dedup_ok_mask));
1194 hammer2_io_putblk(&dio);
1198 static
1199 void
1200 dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
1202 long *counterp;
1204 if (bp->b_flags & B_DELWRI)
1205 return;
1207 switch(dio->btype) {
1208 case 0:
1209 return;
1210 case HAMMER2_BREF_TYPE_DATA:
1211 counterp = &hammer2_iod_file_write;
1212 break;
1213 case HAMMER2_BREF_TYPE_DIRENT:
1214 case HAMMER2_BREF_TYPE_INODE:
1215 counterp = &hammer2_iod_meta_write;
1216 break;
1217 case HAMMER2_BREF_TYPE_INDIRECT:
1218 counterp = &hammer2_iod_indr_write;
1219 break;
1220 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1221 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1222 counterp = &hammer2_iod_fmap_write;
1223 break;
1224 default:
1225 counterp = &hammer2_iod_volu_write;
1226 break;
1228 *counterp += dio->psize;