/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.31 2008/06/28 23:50:37 dillon Exp $
 */
/*
 * HAMMER dependency flusher thread
 *
 * Meta data updates create buffer dependencies which are arranged as a
 * hierarchy of lists.
 */

#include "hammer.h"
static void hammer_flusher_master_thread(void *arg);
static void hammer_flusher_slave_thread(void *arg);
static void hammer_flusher_clean_loose_ios(hammer_mount_t hmp);
static void hammer_flusher_flush(hammer_mount_t hmp);
static void hammer_flusher_flush_inode(hammer_inode_t ip,
					hammer_transaction_t trans);
static void hammer_flusher_finalize(hammer_transaction_t trans, int final);
/*
 * Support structures for the flusher threads.
 */
struct hammer_flusher_info {
	struct hammer_mount *hmp;
	thread_t	td;
	int		startit;
	hammer_inode_t	work_array[HAMMER_FLUSH_GROUP_SIZE];
};

typedef struct hammer_flusher_info *hammer_flusher_info_t;
/*
 * Sync all inodes pending on the flusher.  This routine may have to be
 * called twice to get them all as some may be queued to a later flush group.
 */
void
hammer_flusher_sync(hammer_mount_t hmp)
{
	int seq;

	if (hmp->flusher.td) {
		seq = hmp->flusher.next;
		if (hmp->flusher.signal++ == 0)
			wakeup(&hmp->flusher.signal);
		while ((int)(seq - hmp->flusher.done) > 0)
			tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
	}
}
/*
 * Sync all inodes pending on the flusher - return immediately.
 */
void
hammer_flusher_async(hammer_mount_t hmp)
{
	if (hmp->flusher.td) {
		if (hmp->flusher.signal++ == 0)
			wakeup(&hmp->flusher.signal);
	}
}
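
/*
 * In both entry points above only the 0->1 transition of the signal
 * counter issues a wakeup(); requests made while the master flusher
 * already has work pending just bump the counter, avoiding redundant
 * wakeups.
 */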
void
hammer_flusher_create(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;
	int i;

	hmp->flusher.signal = 0;
	hmp->flusher.act = 0;
	hmp->flusher.done = 0;
	hmp->flusher.next = 1;
	hmp->flusher.count = 0;
	hammer_ref(&hmp->flusher.finalize_lock);

	lwkt_create(hammer_flusher_master_thread, hmp,
		    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
	for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
		info = kmalloc(sizeof(*info), M_HAMMER, M_WAITOK|M_ZERO);
		info->hmp = hmp;
		++hmp->flusher.count;
		hmp->flusher.info[i] = info;
		lwkt_create(hammer_flusher_slave_thread, info,
			    &info->td, NULL, 0, -1, "hammer-S%d", i);
	}
}
void
hammer_flusher_destroy(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;
	int i;

	hmp->flusher.exiting = 1;
	while (hmp->flusher.td) {
		++hmp->flusher.signal;
		wakeup(&hmp->flusher.signal);
		tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
	}
	for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
		if ((info = hmp->flusher.info[i]) != NULL) {
			KKASSERT(info->startit == 0);
			info->startit = -1;
			wakeup(&info->startit);
			while (info->td)
				tsleep(&info->td, 0, "hmrwwc", 0);
			hmp->flusher.info[i] = NULL;
			kfree(info, M_HAMMER);
			--hmp->flusher.count;
		}
	}
	KKASSERT(hmp->flusher.count == 0);
}
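
/*
 * Shutdown handshake: the master notices hmp->flusher.exiting, clears
 * hmp->flusher.td and wakes &hmp->flusher.exiting; each slave is told
 * to exit via a negative startit value and announces its own death by
 * clearing info->td and waking &info->td.
 */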
/*
 * The master flusher thread manages the flusher sequence id and
 * synchronization with the slave work threads.
 */
static void
hammer_flusher_master_thread(void *arg)
{
	hammer_mount_t hmp = arg;

	for (;;) {
		while (hmp->flusher.group_lock)
			tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
		hmp->flusher.act = hmp->flusher.next;
		++hmp->flusher.next;
		hammer_flusher_clean_loose_ios(hmp);
		hammer_flusher_flush(hmp);
		hmp->flusher.done = hmp->flusher.act;
		wakeup(&hmp->flusher.done);

		if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_list))
			break;

		/*
		 * This is a hack until we can dispose of frontend buffer
		 * cache buffers on the frontend.
		 */
		while (hmp->flusher.signal == 0)
			tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
		hmp->flusher.signal = 0;
	}

	hmp->flusher.td = NULL;
	wakeup(&hmp->flusher.exiting);
	lwkt_exit();
}
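
/*
 * Sequence protocol: each pass the master latches the pending sequence
 * number (act = next, then next is advanced), runs the flush, and
 * publishes completion by setting done = act and waking any waiters
 * blocked in hammer_flusher_sync().
 */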
/*
 * The slave flusher thread pulls work off the master flush_list until no
 * more work is available.
 */
static void
hammer_flusher_slave_thread(void *arg)
{
	hammer_flusher_info_t info;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	int c;
	int i;
	int n;

	info = arg;
	hmp = info->hmp;

	for (;;) {
		while (info->startit == 0)
			tsleep(&info->startit, 0, "hmrssw", 0);
		if (info->startit < 0)
			break;
		info->startit = 0;

		/*
		 * Try to pull out around ~64 inodes at a time to flush.
		 * The idea is to try to avoid deadlocks between the slaves.
		 */
		n = c = 0;
		while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
			if (ip->flush_group != hmp->flusher.act)
				break;
			TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
			info->work_array[n++] = ip;
			c += ip->rsv_recs;
			if (n < HAMMER_FLUSH_GROUP_SIZE &&
			    c < HAMMER_FLUSH_GROUP_SIZE * 8) {
				continue;
			}
			for (i = 0; i < n; ++i) {
				hammer_flusher_flush_inode(info->work_array[i],
							&hmp->flusher.trans);
			}
			n = c = 0;
		}
		for (i = 0; i < n; ++i) {
			hammer_flusher_flush_inode(info->work_array[i],
						&hmp->flusher.trans);
		}
		if (--hmp->flusher.running == 0)
			wakeup(&hmp->flusher.running);
	}
	info->td = NULL;
	wakeup(&info->td);
	lwkt_exit();
}
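
/*
 * The batching above bounds each slave's bite two ways: by inode count
 * (HAMMER_FLUSH_GROUP_SIZE) and by accumulated reserved records
 * (HAMMER_FLUSH_GROUP_SIZE * 8).  Whichever limit trips first causes
 * the collected inodes to be flushed before more are dequeued.
 */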
static void
hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
{
	hammer_buffer_t buffer;
	hammer_io_t io;
	int panic_count = 1000000;

	/*
	 * loose ends - buffers without bp's aren't tracked by the kernel
	 * and can build up, so clean them out.  This can occur when an
	 * IO completes on a buffer with no references left.
	 */
	crit_enter();	/* biodone() race */
	while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
		KKASSERT(--panic_count > 0);
		KKASSERT(io->mod_list == &hmp->lose_list);
		TAILQ_REMOVE(&hmp->lose_list, io, mod_entry);
		io->mod_list = NULL;
		if (io->lock.refs == 0)
			++hammer_count_refedbufs;
		hammer_ref(&io->lock);
		buffer = (void *)io;
		hammer_rel_buffer(buffer, 0);
	}
	crit_exit();
}
/*
 * Flush all inodes in the current flush group.
 */
static void
hammer_flusher_flush(hammer_mount_t hmp)
{
	hammer_flusher_info_t info;
	hammer_reserve_t resv;
	int i;
	int n;

	hammer_start_transaction_fls(&hmp->flusher.trans, hmp);

	/*
	 * If the previous flush cycle just about exhausted our UNDO space
	 * we may have to do a dummy cycle to move the first_offset up
	 * before actually digging into a new cycle, or the new cycle will
	 * not have sufficient undo space.
	 */
	if (hammer_flusher_undo_exhausted(&hmp->flusher.trans, 3)) {
		hammer_lock_ex(&hmp->flusher.finalize_lock);
		hammer_flusher_finalize(&hmp->flusher.trans, 0);
		hammer_unlock(&hmp->flusher.finalize_lock);
	}

	/*
	 * Start work threads.
	 */
	n = hmp->count_iqueued / HAMMER_FLUSH_GROUP_SIZE;
	if (TAILQ_FIRST(&hmp->flush_list)) {
		for (i = 0; i <= n; ++i) {
			if (i == HAMMER_MAX_FLUSHERS ||
			    hmp->flusher.info[i] == NULL) {
				break;
			}
			info = hmp->flusher.info[i];
			if (info->startit == 0) {
				++hmp->flusher.running;
				info->startit = 1;
				wakeup(&info->startit);
			}
		}
	}
	while (hmp->flusher.running)
		tsleep(&hmp->flusher.running, 0, "hmrfcc", 0);

	hammer_flusher_finalize(&hmp->flusher.trans, 1);
	hmp->flusher.tid = hmp->flusher.trans.tid;

	/*
	 * Clean up any freed big-blocks (typically zone-2).
	 *
	 * resv->flush_group is typically set several flush groups ahead
	 * of the free so the freed big-block is not reused prematurely.
	 */
	while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
		if (resv->flush_group != hmp->flusher.act)
			break;
		hammer_reserve_clrdelay(hmp, resv);
	}
	hammer_done_transaction(&hmp->flusher.trans);
}
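
/*
 * Note the two finalize calls above: the early one passes final=0 and
 * only pushes out UNDOs and the volume header to reclaim FIFO space,
 * while the one issued after the slaves complete passes final=1 and
 * also syncs the cached blockmap and resets the cached UNDO
 * first_offset for the next flush group (see hammer_flusher_finalize()
 * below).
 */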
/*
 * Flush a single inode that is part of a flush group.
 *
 * NOTE!  The sync code can return EWOULDBLOCK if the flush operation
 * would otherwise blow out the buffer cache.  hammer_flush_inode_done()
 * will re-queue the inode for the next flush sequence and force the
 * flusher to run again if this occurs.
 */
static void
hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
{
	hammer_mount_t hmp = ip->hmp;
	int error;

	hammer_lock_sh(&hmp->flusher.finalize_lock);
	error = hammer_sync_inode(ip);
	if (error != EWOULDBLOCK)
		ip->error = error;
	hammer_flush_inode_done(ip);
	hammer_unlock(&hmp->flusher.finalize_lock);
	while (hmp->flusher.finalize_want)
		tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
	if (hammer_flusher_undo_exhausted(trans, 1)) {
		hmp->flusher.finalize_want = 1;
		hammer_lock_ex(&hmp->flusher.finalize_lock);
		kprintf("HAMMER: Warning: UNDO area too small!\n");
		hammer_flusher_finalize(trans, 1);
		hammer_unlock(&hmp->flusher.finalize_lock);
		hmp->flusher.finalize_want = 0;
		wakeup(&hmp->flusher.finalize_want);
	} else if (hammer_flusher_meta_limit(trans->hmp)) {
		hmp->flusher.finalize_want = 1;
		hammer_lock_ex(&hmp->flusher.finalize_lock);
		hammer_flusher_finalize(trans, 0);
		hammer_unlock(&hmp->flusher.finalize_lock);
		hmp->flusher.finalize_want = 0;
		wakeup(&hmp->flusher.finalize_want);
	}
}
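
/*
 * Locking pattern: inode syncs run holding finalize_lock shared, so any
 * number of flushers can proceed concurrently.  A thread that must
 * finalize sets finalize_want and takes the lock exclusively, draining
 * the other flushers out; they in turn stall in the finalize_want loop
 * above until the finalization completes.
 */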
/*
 * Return non-zero if the UNDO area has less than (QUARTER / 4) of its
 * space left.
 *
 * 1/4 - Emergency free undo space level.  Below this point the flusher
 *	 will finalize even if directory dependencies have not been resolved.
 *
 * 2/4 - Used by the pruning and reblocking code.  These functions may be
 *	 running in parallel with a flush and cannot be allowed to drop
 *	 available undo space to emergency levels.
 *
 * 3/4 - Used at the beginning of a flush to force-sync the volume header
 *	 to give the flush plenty of runway to work in.
 */
int
hammer_flusher_undo_exhausted(hammer_transaction_t trans, int quarter)
{
	if (hammer_undo_space(trans) <
	    hammer_undo_max(trans->hmp) * quarter / 4) {
		kprintf("%c", '0' + quarter);
		return(1);
	} else {
		return(0);
	}
}
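
/*
 * For example, the quarter=3 check at the start of a flush trips while
 * free UNDO space is below three quarters of hammer_undo_max(), i.e. as
 * soon as more than a quarter of the FIFO is occupied; the quarter=1
 * emergency check only trips once free space falls below one quarter.
 */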
/*
 * Flush all pending UNDOs, wait for write completion, update the volume
 * header with the new UNDO end position, and flush it.  Then
 * asynchronously flush the meta-data.
 *
 * If this is the last finalization in a flush group we also synchronize
 * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 * fifo first_offset so the next flush resets the FIFO pointers.
 */
static void
hammer_flusher_finalize(hammer_transaction_t trans, int final)
{
	hammer_volume_t root_volume;
	hammer_blockmap_t cundomap, dundomap;
	hammer_mount_t hmp;
	hammer_io_t io;
	int i;

	hmp = trans->hmp;
	root_volume = trans->rootvol;

	/*
	 * Flush data buffers.  This can occur asynchronously and at any
	 * time.  We must interlock against the frontend direct-data write
	 * but do not have to acquire the sync-lock yet.
	 */
	while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
		if (io->lock.refs == 0)
			++hammer_count_refedbufs;
		hammer_ref(&io->lock);
		hammer_io_write_interlock(io);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io);
		hammer_io_done_interlock(io);
		hammer_rel_buffer((hammer_buffer_t)io, 0);
	}

	/*
	 * The sync-lock is required for the remaining sequence.  This lock
	 * prevents meta-data from being modified.
	 */
	hammer_sync_lock_ex(trans);

	/*
	 * If we have been asked to finalize the volume header sync the
	 * cached blockmap to the on-disk blockmap.  Generate an UNDO
	 * record for the update.
	 */
	if (final) {
		cundomap = &hmp->blockmap[0];
		dundomap = &root_volume->ondisk->vol0_blockmap[0];
		if (root_volume->io.modified) {
			hammer_modify_volume(trans, root_volume,
					     dundomap, sizeof(hmp->blockmap));
			for (i = 0; i < HAMMER_MAX_ZONES; ++i)
				hammer_crc_set_blockmap(&cundomap[i]);
			bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
			hammer_modify_volume_done(root_volume);
		}
	}
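
	/*
	 * Flush the UNDO buffers accumulated for this cycle before any
	 * meta-data is allowed out, preserving the write-ahead ordering
	 * the recovery code depends on.
	 */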
	while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
		KKASSERT(io->modify_refs == 0);
		if (io->lock.refs == 0)
			++hammer_count_refedbufs;
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io);
		hammer_rel_buffer((hammer_buffer_t)io, 0);
	}

	/*
	 * Wait for I/Os to complete
	 */
	hammer_flusher_clean_loose_ios(hmp);
	hammer_io_wait_all(hmp, "hmrfl1");
	/*
	 * Update the on-disk volume header with new UNDO FIFO end position
	 * (do not generate new UNDO records for this change).  We have to
	 * do this for the UNDO FIFO whether (final) is set or not.
	 *
	 * Also update the on-disk next_tid field.  This does not require
	 * an UNDO.  However, because our TID is generated before we get
	 * the sync lock another sync may have beat us to the punch.
	 *
	 * This also has the side effect of updating first_offset based on
	 * a prior finalization when the first finalization of the next flush
	 * cycle occurs, removing any undo info from the prior finalization
	 * from consideration.
	 *
	 * The volume header will be flushed out synchronously.
	 */
	dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

	if (dundomap->first_offset != cundomap->first_offset ||
	    dundomap->next_offset != cundomap->next_offset) {
		hammer_modify_volume(NULL, root_volume, NULL, 0);
		dundomap->first_offset = cundomap->first_offset;
		dundomap->next_offset = cundomap->next_offset;
		hammer_crc_set_blockmap(dundomap);
		hammer_crc_set_volume(root_volume->ondisk);
		if (root_volume->ondisk->vol0_next_tid < trans->tid)
			root_volume->ondisk->vol0_next_tid = trans->tid;
		hammer_modify_volume_done(root_volume);
	}

	if (root_volume->io.modified) {
		hammer_io_flush(&root_volume->io);
	}

	/*
	 * Wait for I/Os to complete
	 */
	hammer_flusher_clean_loose_ios(hmp);
	hammer_io_wait_all(hmp, "hmrfl2");
	/*
	 * Flush meta-data.  The meta-data will be undone if we crash
	 * so we can safely flush it asynchronously.
	 *
	 * Repeated catchups will wind up flushing this update's meta-data
	 * and the UNDO buffers for the next update simultaneously.
	 */
	while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) {
		KKASSERT(io->modify_refs == 0);
		if (io->lock.refs == 0)
			++hammer_count_refedbufs;
		hammer_ref(&io->lock);
		KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
		hammer_io_flush(io);
		hammer_rel_buffer((hammer_buffer_t)io, 0);
	}
	/*
	 * If this is the final finalization for the flush group set
	 * up for the next sequence by setting a new first_offset in
	 * our cached blockmap and clearing the undo history.
	 *
	 * Even though we have updated our cached first_offset, the on-disk
	 * first_offset still governs available-undo-space calculations.
	 */
	if (final) {
		cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
		cundomap->first_offset = cundomap->next_offset;
		hammer_clear_undo_history(hmp);
	}

	hammer_sync_unlock(trans);
}
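
/*
 * Summary of the ordering enforced by hammer_flusher_finalize(): data,
 * then UNDO, then the volume header (synchronously), and only then
 * meta-data.  A crash at any point therefore either finds the meta-data
 * untouched or finds UNDO records sufficient to roll it back.
 */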
/*
 * Return non-zero if too many dirty meta-data buffers have built up.
 *
 * Since we cannot allow such buffers to flush until we have dealt with
 * the UNDOs, we risk deadlocking the kernel's buffer cache.
 */
int
hammer_flusher_meta_limit(hammer_mount_t hmp)
{
	if (hmp->locked_dirty_space + hmp->io_running_space >
	    hammer_limit_dirtybufspace) {
		return(1);
	}
	return(0);
}