From: Abutalib Aghayev <agayev@cs.cmu.edu>

An experimental cleaner.  Copy the live blocks from the transaction at the
tail in batches to the transaction at the head.  After a commit ends, check
if free space is below the low watermark and, if so, start cleaning until
free space rises above the high watermark.
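
In outline, the cycle implemented below is roughly the following (a
simplified sketch; in this patch the watermarks are approximated by
transaction counts rather than by actual free block counts):

    after every commit:
        if the cleaner is engaged:
            advance the tail past transactions with no live blocks
            if enough transactions have been cleaned:
                disengage the cleaner
            else:
                schedule a batch: read live blocks from the tail
                transaction and attach them to the running transaction
        else if low on space:
            engage the cleaner and schedule a batch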

Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/Makefile     |   2 +-
 fs/jbd2/checkpoint.c |   3 +
 fs/jbd2/cleaner.c    | 368 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/jmap.c       |  34 ++++++
 fs/jbd2/jmap.h       |  77 +++++++++++
 fs/jbd2/journal.c    |  23 +++-
 include/linux/jbd2.h |   9 ++-
 7 files changed, 512 insertions(+), 4 deletions(-)

diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
index a54f50b3a06e..b6a2dddcc0a7 100644
--- a/fs/jbd2/Makefile
+++ b/fs/jbd2/Makefile
@@ -5,4 +5,4 @@
 obj-$(CONFIG_JBD2) += jbd2.o
 
 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
-		jmap.o
+		jmap.o cleaner.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 4055f51617ef..b60bbf58e8f7 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -389,6 +389,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	tid_t		first_tid;
 	unsigned long	blocknr;
 
+	if (journal->j_flags & JBD2_LAZY)
+		return 0;
+
 	if (is_journal_aborted(journal))
 		return -EIO;
diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c
new file mode 100644
index 000000000000..06ec11e1d2dd
--- /dev/null
+++ b/fs/jbd2/cleaner.c
@@ -0,0 +1,368 @@
+#include <linux/blk_types.h>
+#include <linux/jbd2.h>
+#include "jmap.h"
+#include <linux/list.h>
+#include <linux/blkdev.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <trace/events/jbd2.h>
+
+static inline int jbd2_low_on_space(journal_t *journal)
+{
+	int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed);
+
+	if (x > 10) {
+		trace_jbd2_jmap_printf1("low on space", x);
+		return true;
+	}
+	trace_jbd2_jmap_printf1("not low on space", x);
+	return false;
+}
+
+static inline int jbd2_high_on_space(journal_t *journal)
+{
+	if (atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) {
+		trace_jbd2_jmap_printf("not enough cleaned");
+		return false;
+	}
+	trace_jbd2_jmap_printf("enough cleaned");
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+	return true;
+}
+
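+/*
+ * Note that neither of the helpers above looks at actual free space in the
+ * journal: the low and high watermarks are approximated by the number of
+ * transactions committed since the last cleaning pass and by the number of
+ * transactions cleaned, respectively.
+ */
+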
+/*
+ * Tries to move the tail forward (hence freeing space) as long as the
+ * transaction at the tail has only stale blocks.  Returns true if it manages
+ * to free a transaction, false otherwise.
+ */
+static bool jbd2_try_to_move_tail(journal_t *journal)
+{
+	struct transaction_infos *tis = journal->j_transaction_infos;
+	struct transaction_info *ti, *ti1;
+
+	/*
+	 * Advance the tail as far as possible by skipping over transactions
+	 * with no live blocks.
+	 */
+	write_lock(&journal->j_jmap_lock);
+	ti = ti1 = &tis->buf[tis->tail];
+
+	for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) {
+		trace_jbd2_jmap_printf2("cleaned a transaction",
+					tis->tail, ti->tid);
+		tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1);
+		atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned);
+	}
+	write_unlock(&journal->j_jmap_lock);
+
+	if (ti == ti1)
+		return false;
+	/*
+	 * In the worst case, this will end up updating the journal superblock
+	 * after cleaning up every transaction.  Should we avoid it?
+	 */
+	write_unlock(&journal->j_state_lock);
+	jbd2_update_log_tail(journal, ti->tid, ti->offset);
+	write_lock(&journal->j_state_lock);
+
+	return true;
+}
+
+/*
+ * Finds the live blocks at the tail transaction and copies the corresponding
+ * mappings to |ctx->mappings|.  Returns the number of live block mappings
+ * copied.  Should be called with a read lock on |j_jmap_lock|.
+ */
+static int find_live_blocks(struct cleaner_ctx *ctx)
+{
+	journal_t *journal = ctx->journal;
+	struct transaction_infos *tis = journal->j_transaction_infos;
+	struct transaction_info *ti = &tis->buf[tis->tail];
+	struct jmap_entry *je = NULL;
+	int i, nr_live = 0;
+
+	if (unlikely(list_empty(&ti->live_blks)))
+		goto done;
+
+	spin_lock(&ctx->pos_lock);
+	if (!ctx->pos)
+		ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list);
+	je = ctx->pos;
+	spin_unlock(&ctx->pos_lock);
+
+	list_for_each_entry_from(je, &ti->live_blks, list) {
+		if (je->revoked)
+			continue;
+		ctx->mappings[nr_live++] = je->mapping;
+		if (nr_live == CLEANER_BATCH_SIZE)
+			break;
+	}
+
+done:
+	trace_jbd2_jmap_printf1("found live blocks", nr_live);
+	for (i = 0; i < nr_live; ++i)
+		trace_jbd2_jmap_printf2("m",
+					ctx->mappings[i].fsblk,
+					ctx->mappings[i].logblk);
+
+	return nr_live;
+}
+
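+/*
+ * End_io handler for the reads submitted by read_live_blocks() below: marks
+ * the buffer uptodate (or not) and wakes the cleaner once the last pending
+ * read of a batch completes.
+ */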
+static void live_block_read_end_io(struct buffer_head *bh, int uptodate)
+{
+	struct cleaner_ctx *ctx = bh->b_private;
+
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+
+	if (atomic_dec_and_test(&ctx->nr_pending_reads))
+		wake_up(&ctx->live_block_reads);
+
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+/*
+ * Reads live blocks in |ctx->mappings| populated by find_live_blocks into
+ * buffer heads in |ctx->bhs|.  Returns true if at least one of the reads goes
+ * out to disk and false otherwise.  If this function returns true then the
+ * client should sleep on the condition variable |ctx->live_block_reads|.  The
+ * client will be woken up when all reads are complete, through the end_io
+ * handler attached to buffer heads read from disk.
+ */
+static bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live)
+{
+	journal_t *journal = ctx->journal;
+	bool submitted = false;
+	struct blk_plug plug;
+	bool plugged = false;
+	int i, rc;
+
+	for (i = 0; i < nr_live; ++i) {
+		ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk,
+				       journal->j_blocksize);
+		if (unlikely(!ctx->bhs[i])) {
+			rc = -ENOMEM;
+			goto out_err;
+		}
+		if (buffer_uptodate(ctx->bhs[i]))
+			continue;
+
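+		/*
+		 * Plug the queue so that the reads issued for this batch can
+		 * be merged before the block layer dispatches them to the
+		 * device.
+		 */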
+		if (!plugged) {
+			plugged = true;
+			blk_start_plug(&plug);
+		}
+		lock_buffer(ctx->bhs[i]);
+		if (buffer_uptodate(ctx->bhs[i])) {
+			unlock_buffer(ctx->bhs[i]);
+			continue;
+		}
+		ctx->bhs[i]->b_private = ctx;
+		ctx->bhs[i]->b_end_io = live_block_read_end_io;
+		get_bh(ctx->bhs[i]);
+		rc = read_block_from_log(ctx->journal, ctx->bhs[i],
+					 REQ_RAHEAD, ctx->mappings[i].logblk);
+		if (unlikely(rc < 0))
+			goto out_err;
+		atomic_inc(&ctx->nr_pending_reads);
+		if (rc) {
+			submitted = true;
+			trace_jbd2_jmap_printf2("reading from disk",
+						ctx->mappings[i].fsblk,
+						ctx->mappings[i].logblk);
+		} else {
+			trace_jbd2_jmap_printf2("cached",
+						ctx->mappings[i].fsblk,
+						ctx->mappings[i].logblk);
+		}
+	}
+	if (plugged)
+		blk_finish_plug(&plug);
+	return submitted;
+
+out_err:
+	if (plugged)
+		blk_finish_plug(&plug);
+	jbd2_journal_abort(ctx->journal, rc);
+	return false;
+}
+
+/*
+ * This function finds the live blocks that became stale between the call to
+ * find_live_blocks and now, and discards them.  It returns true if there are
+ * no more live blocks left at the tail transaction.
+ */
+static bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live)
+{
+	journal_t *journal = ctx->journal;
+	struct transaction_infos *tis = journal->j_transaction_infos;
+	struct transaction_info *ti = &tis->buf[tis->tail];
+	struct jmap_entry *je = NULL;
+	int i = 0, j = 0, next = 0;
+
+	trace_jbd2_jmap_printf(__func__);
+	spin_lock(&ctx->pos_lock);
+	BUG_ON(!ctx->pos);
+	je = ctx->pos;
+	list_for_each_entry_from(je, &ti->live_blks, list) {
+		for (j = next; j < nr_live; ++j) {
+			if (je->mapping.fsblk == ctx->mappings[j].fsblk) {
+				next = j + 1;
+				ctx->pos = list_next_entry(je, list);
+				if (je->mapping.logblk != ctx->mappings[j].logblk) {
+					brelse(ctx->bhs[j]);
+					ctx->bhs[j] = NULL;
+					trace_jbd2_jmap_printf2(
+						"became stale",
+						ctx->mappings[i].fsblk,
+						ctx->mappings[i].logblk);
+				}
+				break;
+			} else {
+				trace_jbd2_jmap_printf2(
+					"moved to another list",
+					ctx->mappings[i].fsblk,
+					ctx->mappings[i].logblk);
+				brelse(ctx->bhs[j]);
+				ctx->bhs[j] = NULL;
+			}
+		}
+		if (++i == nr_live || j == nr_live)
+			break;
+	}
+	spin_unlock(&ctx->pos_lock);
+
+	/*
+	 * We have exited the loop.  If we haven't processed all the entries in
+	 * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have
+	 * not processed |nr_live| entries from the live blocks list at the
+	 * tail, that is if (i < nr_live) at the exit, then the live blocks
+	 * list has shrunk and the tail transaction has no live blocks left.
+	 */
+	return j < nr_live && i < nr_live;
+}
+
+static void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle,
+			       int nr_live)
+{
+	int i, err;
+
+	trace_jbd2_jmap_printf(__func__);
+	for (i = 0; i < nr_live; ++i) {
+		if (!ctx->bhs[i])
+			continue;
+		trace_jbd2_jmap_printf2("attaching",
+					ctx->mappings[i].fsblk,
+					ctx->mappings[i].logblk);
+		err = jbd2_journal_get_write_access(handle, ctx->bhs[i]);
+		if (!err)
+			err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]);
+		if (err) {
+			jbd2_journal_abort(ctx->journal, err);
+			return;
+		}
+	}
+}
+
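+/*
+ * Note that attach_live_blocks() does not copy anything itself: joining the
+ * buffers to the running transaction through get_write_access() and
+ * dirty_metadata() makes the next commit write them out near the head of the
+ * log, which is what moves the live blocks.
+ */
+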
+/*
+ * Read the live blocks from the tail transaction and attach them to the
+ * current transaction.
+ */
+void jbd2_jmap_do_clean_batch(struct work_struct *work)
+{
+	struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);
+	journal_t *journal = ctx->journal;
+	bool wake_up_commit_thread = true;
+	handle_t *handle = NULL;
+	int nr_live, err;
+
+	read_lock(&journal->j_jmap_lock);
+	nr_live = find_live_blocks(ctx);
+	read_unlock(&journal->j_jmap_lock);
+
+	if (nr_live < CLEANER_BATCH_SIZE)
+		wake_up_commit_thread = false;
+	if (!nr_live)
+		goto done;
+
+	read_live_blocks(ctx, nr_live);
+	wait_event(ctx->live_block_reads,
+		   atomic_read(&ctx->nr_pending_reads) <= 0);
+
+	handle = jbd2_journal_start(journal, nr_live);
+	if (IS_ERR(handle)) {
+		jbd2_journal_abort(journal, PTR_ERR(handle));
+		goto done;
+	}
+
+	read_lock(&journal->j_jmap_lock);
+	if (discard_stale_blocks(ctx, nr_live))
+		wake_up_commit_thread = false;
+	read_unlock(&journal->j_jmap_lock);
+	/*
+	 * I'm not sure why this function was under the jmap_lock
+	 * previously, but it can't be, since it calls functions that
+	 * can block due to memory allocation.  I don't think it needs
+	 * to be protected, since it appears that ctx->mappings is only
+	 * used by the cleaner code, and so it can't be run multiple
+	 * times in parallel.
+	 */
+	attach_live_blocks(ctx, handle, nr_live);
+
+	err = jbd2_journal_stop(handle);
+	if (err)
+		jbd2_journal_abort(journal, err);
+
+done:
+	atomic_set(&ctx->batch_in_progress, 0);
+	atomic_inc(&ctx->nr_txns_cleaned);
+	if (wake_up_commit_thread) {
+		trace_jbd2_jmap_printf("waking up commit thread");
+		wake_up(&journal->j_wait_commit);
+	} else {
+		trace_jbd2_jmap_printf("not waking up commit thread");
+		spin_lock(&ctx->pos_lock);
+		ctx->pos = NULL;
+		spin_unlock(&ctx->pos_lock);
+	}
+	write_lock(&journal->j_state_lock);
+	journal->j_flags &= ~JBD2_CLEANING;
+	write_unlock(&journal->j_state_lock);
+}
+
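+/*
+ * The cleaner state machine, driven from the commit thread: the cleaner is
+ * engaged when free space runs low, batches are scheduled on the workqueue
+ * until the tail has moved far enough forward, and then the cleaner is
+ * disengaged again (or stopped outright at unmount time).
+ */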
+/*
+ * Called by the commit thread to see if we need to do any cleaning work.
+ * Called with j_state_lock write locked.
+ */
+void jbd2_check_cleaner(journal_t *journal)
+{
+	/*
+	 * If there is cleaning going on in the workqueue, don't check
+	 * until we're done.
+	 */
+	if (journal->j_flags & JBD2_CLEANING)
+		return;
+
+	if (journal->j_flags & JBD2_STOP_CLEANING) {
+	disengage_cleaner:
+		journal->j_flags &= ~JBD2_CLEANER_ENGAGED;
+		return;
+	}
+
+	if (journal->j_flags & JBD2_CLEANER_ENGAGED) {
+		if (jbd2_try_to_move_tail(journal) &&
+		    jbd2_high_on_space(journal))
+			goto disengage_cleaner;
+	schedule_batch:
+		journal->j_flags |= JBD2_CLEANING;
+		schedule_work(&journal->j_cleaner_ctx->work);
+		return;
+	}
+
+	if (jbd2_low_on_space(journal)) {
+		journal->j_flags |= JBD2_CLEANER_ENGAGED;
+		goto schedule_batch;
+	}
+}
diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
index 7de6f4a0a1dc..0e759cc095f5 100644
--- a/fs/jbd2/jmap.c
+++ b/fs/jbd2/jmap.c
@@ -91,8 +91,17 @@ static int process_existing_mappings(journal_t *journal,
 		 * We are either deleting the entry because it was revoked, or
 		 * we are moving it to the live blocks list of this transaction.
 		 * In either case, we remove it from its existing list.
+		 * However, before removing it we check whether this is the
+		 * entry in the live blocks list of the tail transaction whose
+		 * pointer is cached by the cleaner, and update the cached
+		 * pointer if so.
 		 */
+		spin_lock(&journal->j_cleaner_ctx->pos_lock);
+		if (je == journal->j_cleaner_ctx->pos) {
+			journal->j_cleaner_ctx->pos = list_next_entry(je, list);
+		}
+		spin_unlock(&journal->j_cleaner_ctx->pos_lock);
+
 		list_del(&je->list);
 		rb_erase(&je->rb_node, &journal->j_jmap);
@@ -216,6 +225,8 @@ void jbd2_finish_transaction_infos(journal_t *journal)
 {
 	struct transaction_infos *tis = journal->j_transaction_infos;
 
+	atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
+
 	write_lock(&journal->j_jmap_lock);
 	tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1);
 	write_unlock(&journal->j_jmap_lock);
@@ -243,6 +254,8 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
 	BUG_ON(!list_empty(&ti->live_blks));
 
+	atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
+
 	write_lock(&journal->j_jmap_lock);
 	nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
@@ -489,11 +502,32 @@ int jbd2_smr_journal_init(journal_t *journal)
 	journal->j_jmap = RB_ROOT;
 	rwlock_init(&journal->j_jmap_lock);
+	journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
+					 GFP_KERNEL);
+	if (!journal->j_cleaner_ctx)
+		return -ENOMEM;
+
+	journal->j_cleaner_ctx->journal = journal;
+	journal->j_cleaner_ctx->pos = NULL;
+	spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
+	atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+	atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+	init_waitqueue_head(&journal->j_cleaner_ctx->live_block_reads);
+	INIT_WORK(&journal->j_cleaner_ctx->work, jbd2_jmap_do_clean_batch);
+
 	return jbd2_init_transaction_infos(journal);
 }
 
 void jbd2_smr_journal_exit(journal_t *journal)
 {
+	if (journal->j_cleaner_ctx) {
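+		/*
+		 * flush_work() below waits for a cleaning batch that may
+		 * still be executing on the workqueue, so that the context
+		 * is not freed under it.
+		 */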
+		atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+		flush_work(&journal->j_cleaner_ctx->work);
+		kfree(journal->j_cleaner_ctx);
+		journal->j_cleaner_ctx = NULL;
+	}
 	jbd2_free_transaction_infos(journal);
 }
diff --git a/fs/jbd2/jmap.h b/fs/jbd2/jmap.h
index 91564ce9bbda..a44f15152536 100644
--- a/fs/jbd2/jmap.h
+++ b/fs/jbd2/jmap.h
@@ -125,4 +125,81 @@ extern void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk);
 extern int read_block_from_log(journal_t *journal, struct buffer_head *bh,
 			       int op_flags, sector_t blk);
 
+extern void jbd2_jmap_do_clean_batch(struct work_struct *work);
+
+/*
+ * Cleaner stuff is below.
+ */
+
+/*
+ * Number of blocks to read at once, for cleaning.
+ */
+#define CLEANER_BATCH_SIZE 16
+
+/*
+ * Context structure for the cleaner.
+ */
+struct cleaner_ctx {
+	/*
+	 * We set it to true once we drop below the low watermark and it stays
+	 * so until we rise above the high watermark.  It is accessed by the
+	 * commit thread and the foreground kernel threads during the journal
+	 * destruction, therefore it is atomic.
+	 */
+	atomic_t cleaning;
+
+	/*
+	 * We clean in batches of blocks.  This flag indicates if we are
+	 * currently cleaning a batch.  It is accessed by the commit thread
+	 * and the cleaner thread, therefore it is atomic.
+	 */
+	atomic_t batch_in_progress;
+
+	/*
+	 * We find live blocks to clean from the live blocks list of the
+	 * transaction at the tail.  This list can be larger than our batch
+	 * size and we may need several attempts to process it.  We cache the
+	 * position of the next entry to start from in |pos|.  Since the
+	 * cleaner thread can run concurrently with the commit thread that can
+	 * modify the live blocks list of the transaction at the tail (for
+	 * example, if it needs to drop a revoked entry or if |pos| points to
+	 * an entry that has been updated and should move from the live blocks
+	 * list of the transaction at the tail to the live blocks list of the
+	 * current transaction), we protect |pos| with |pos_lock|.
+	 */
+	struct jmap_entry *pos;
+	spinlock_t pos_lock;
+
+	/*
+	 * Live block mappings for the blocks that we copy in a batch.
+	 */
+	struct blk_mapping mappings[CLEANER_BATCH_SIZE];
+
+	/*
+	 * Buffer heads for the live blocks read in a batch.
+	 */
+	struct buffer_head *bhs[CLEANER_BATCH_SIZE];
+
+	/*
+	 * Number of pending reads in a batch.  Every submitted read increments
+	 * it and every completed read decrements it.
+	 */
+	atomic_t nr_pending_reads;
+
+	/*
+	 * The cleaner thread sleeps on this wait queue until the last
+	 * completed read wakes up the cleaner thread.
+	 */
+	wait_queue_head_t live_block_reads;
+
+	/* TODO: temporary for debugging, remove once done. */
+	atomic_t nr_txns_committed;
+	atomic_t nr_txns_cleaned;
+
+	journal_t *journal;
+	struct work_struct work;
+};
+
+void jbd2_check_cleaner(journal_t *journal);
+
 #endif
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 50ab9b5bc7f4..221e168c7c58 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -225,10 +225,16 @@ static int kjournald2(void *arg)
 		del_timer_sync(&journal->j_commit_timer);
 		jbd2_journal_commit_transaction(journal);
 		write_lock(&journal->j_state_lock);
-		goto loop;
 	}
+
 	wake_up(&journal->j_wait_done_commit);
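+
+	/* Under lazy journalling, the commit thread also drives the cleaner. */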
+	if (journal->j_flags & JBD2_LAZY)
+		jbd2_check_cleaner(journal);
+
+	if (journal->j_commit_sequence != journal->j_commit_request)
+		goto loop;
+
 	if (freezing(current)) {
 		/*
 		 * The simpler the better. Flushing journal isn't a
@@ -257,6 +263,9 @@ static int kjournald2(void *arg)
 		if (journal->j_flags & JBD2_UNMOUNT)
 			should_sleep = 0;
+		if ((journal->j_flags & JBD2_CLEANER_ENGAGED) &&
+		    !(journal->j_flags & JBD2_CLEANING))
+			should_sleep = 0;
 		if (should_sleep) {
 			write_unlock(&journal->j_state_lock);
@@ -302,14 +311,24 @@ static int jbd2_journal_start_thread(journal_t *journal)
 static void journal_kill_thread(journal_t *journal)
 {
 	write_lock(&journal->j_state_lock);
-	journal->j_flags |= JBD2_UNMOUNT;
+
+	journal->j_flags |= JBD2_STOP_CLEANING;
+	while (journal->j_flags & JBD2_CLEANING) {
+		write_unlock(&journal->j_state_lock);
+		wake_up(&journal->j_wait_commit);
+		wait_event(journal->j_wait_done_commit,
+			   (journal->j_flags & JBD2_CLEANING) == 0);
+		write_lock(&journal->j_state_lock);
+	}
+
+	journal->j_flags |= JBD2_UNMOUNT;
 	while (journal->j_task) {
 		write_unlock(&journal->j_state_lock);
 		wake_up(&journal->j_wait_commit);
 		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
 		write_lock(&journal->j_state_lock);
 	}
 	write_unlock(&journal->j_state_lock);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a1d56bb9fa4f..bc28c585af54 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -734,7 +734,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
  * @j_superblock: Second part of superblock buffer
  * @j_map: A map from file system blocks to log blocks
  * @j_transaction_infos: An array of information structures per live transaction
- * @j_map_lock: Protect j_jmap and j_transaction_infos
+ * @j_jmap_lock: Protect j_jmap and j_transaction_infos
+ * @j_cleaner_ctx: Cleaner state
  * @j_format_version: Version of the superblock format
  * @j_state_lock: Protect the various scalars in the journal
  * @j_barrier_count: Number of processes waiting to create a barrier lock
@@ -819,6 +820,9 @@ struct journal_s
 	/* Protect j_jmap and j_transaction_infos */
 	rwlock_t j_jmap_lock;
 
+	/* Cleaner state */
+	struct cleaner_ctx *j_cleaner_ctx;
+
 	/* Version of the superblock format */
 	int j_format_version;
@@ -1142,6 +1146,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3)
 #define JBD2_REC_ERR	0x080	/* The errno in the sb has been recorded */
 #define JBD2_NO_CLEANUP	0x100	/* Don't flush empty the journal on shutdown */
 #define JBD2_LAZY	0x200	/* Do lazy journalling */
+#define JBD2_CLEANING	0x400	/* Lazy journalling cleaning in progress */
+#define JBD2_CLEANER_ENGAGED	0x800	/* Cleaner has been engaged */
+#define JBD2_STOP_CLEANING	0x1000	/* Request the cleaning thread to stop */
 
 /*
  * Function declarations for the journaling transaction and buffer