3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 An experimental cleaner. Copy the live blocks from the transaction at the
6 tail in batches to the transaction at the head. After a commit ends, check
7 if free space is below the low watermark and start cleaning until free space is
8 above the high watermark.
10 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
13 fs/jbd2/Makefile | 2 +-
14 fs/jbd2/cleaner.c | 358 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 fs/jbd2/jmap.c | 42 +++++++++--
16 fs/jbd2/journal.c | 12 ++++
17 include/linux/jbd2.h | 6 +-
18 include/linux/jmap.h | 111 +++++++++++++++++++++++++----
19 6 files changed, 510 insertions(+), 21 deletions(-)
21 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
22 index a54f50b3a06e..b6a2dddcc0a7 100644
23 --- a/fs/jbd2/Makefile
24 +++ b/fs/jbd2/Makefile
26 obj-$(CONFIG_JBD2) += jbd2.o
28 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
31 diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c
33 index 000000000000..6d02aafcded9
35 +++ b/fs/jbd2/cleaner.c
37 +#include <linux/blk_types.h>
38 +#include <linux/jbd2.h>
39 +#include <linux/jmap.h>
40 +#include <linux/list.h>
41 +#include <linux/blkdev.h>
42 +#include <linux/completion.h>
43 +#include <trace/events/jbd2.h>
45 +inline int low_on_space(journal_t *journal)
47 + int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed);
49 + trace_jbd2_jmap_printf1("low on space", x);
52 + trace_jbd2_jmap_printf1("not low on space", x);
56 +inline int high_on_space(journal_t *journal)
58 + if (atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) {
59 + trace_jbd2_jmap_printf("not enough cleaned");
62 + trace_jbd2_jmap_printf("enough cleaned");
63 + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
64 + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
68 +inline bool cleaning(journal_t *journal)
70 + return atomic_read(&journal->j_cleaner_ctx->cleaning);
73 +inline void stop_cleaning(journal_t *journal)
75 + trace_jbd2_jmap_printf("stopped cleaning");
76 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
79 +inline void start_cleaning(journal_t *journal)
81 + trace_jbd2_jmap_printf("started cleaning");
82 + atomic_set(&journal->j_cleaner_ctx->cleaning, 1);
83 + clean_next_batch(journal);
86 +inline bool cleaning_batch_complete(journal_t *journal)
88 + return cleaning(journal) &&
89 + atomic_read(&journal->j_cleaner_ctx->batch_in_progress) == 0;
93 + * Tries to move the tail forward (hence free space) as long as the transaction
94 + * at the tail has only stale blocks. Returns true if it manages to free a
95 + * transaction, false otherwise.
97 +bool try_to_move_tail(journal_t *journal)
99 + struct transaction_infos *tis = journal->j_transaction_infos;
100 + struct transaction_info *ti, *ti1;
103 + * Advance the tail as far as possible by skipping over transactions
104 + * with no live blocks.
106 + write_lock(&journal->j_jmap_lock);
107 + ti = ti1 = &tis->buf[tis->tail];
109 + for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) {
110 + trace_jbd2_jmap_printf2("cleaned a transaction",
111 + tis->tail, ti->tid);
112 + tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1);
113 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned);
115 + write_unlock(&journal->j_jmap_lock);
120 + * In the worst case, this will end up updating the journal superblock
121 + * after cleaning up every transaction. Should we avoid it?
123 + write_unlock(&journal->j_state_lock);
124 + jbd2_update_log_tail(journal, ti->tid, ti->offset);
125 + write_lock(&journal->j_state_lock);
131 + * Finds the live blocks at the tail transaction and copies the corresponding
132 + * mappings to |ctx->mappings|. Returns the number of live block mappings
133 + * copied. Should be called with a read lock on |j_jmap_lock|.
135 +int find_live_blocks(struct cleaner_ctx *ctx)
137 + journal_t *journal = ctx->journal;
138 + struct transaction_infos *tis = journal->j_transaction_infos;
139 + struct transaction_info *ti = &tis->buf[tis->tail];
140 + struct jmap_entry *je = NULL;
141 + int i, nr_live = 0;
143 + if (unlikely(list_empty(&ti->live_blks)))
146 + spin_lock(&ctx->pos_lock);
148 + ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list);
150 + spin_unlock(&ctx->pos_lock);
152 + list_for_each_entry_from(je, &ti->live_blks, list) {
155 + ctx->mappings[nr_live++] = je->mapping;
156 + if (nr_live == CLEANER_BATCH_SIZE)
161 + trace_jbd2_jmap_printf1("found live blocks", nr_live);
162 + for (i = 0; i < nr_live; ++i)
163 + trace_jbd2_jmap_printf2("m",
164 + ctx->mappings[i].fsblk,
165 + ctx->mappings[i].logblk);
169 +void live_block_read_end_io(struct buffer_head *bh, int uptodate)
171 + struct cleaner_ctx *ctx = bh->b_private;
174 + set_buffer_uptodate(bh);
175 + if (atomic_dec_and_test(&ctx->nr_pending_reads))
176 + complete(&ctx->live_block_reads);
179 + clear_buffer_uptodate(bh);
187 + * Reads live blocks in |ctx->mappings| populated by find_live_blocks into
188 + * buffer heads in |ctx->bhs|. Returns true if at least one of the reads goes
189 + * out to disk and false otherwise. If this function returns true then the
190 + * client should sleep on the condition variable |ctx->live_block_reads|. The
191 + * client will be woken up when all reads are complete, through the end_io
192 + * handler attached to buffer heads read from disk.
194 +bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live)
196 + journal_t *journal = ctx->journal;
198 + struct blk_plug plug;
199 + bool plugged = false;
202 + for (i = 0; i < nr_live; ++i) {
203 + ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk,
204 + journal->j_blocksize);
205 + if (unlikely(!ctx->bhs[i]))
207 + if (buffer_uptodate(ctx->bhs[i]))
210 + blk_start_plug(&plug);
211 + lock_buffer(ctx->bhs[i]);
212 + ctx->bhs[i]->b_private = ctx;
213 + ctx->bhs[i]->b_end_io = live_block_read_end_io;
214 + atomic_inc(&ctx->nr_pending_reads);
215 + get_bh(ctx->bhs[i]);
216 + rc = read_block_from_log(ctx->journal, ctx->bhs[i],
217 + REQ_RAHEAD, ctx->mappings[i].logblk);
218 + if (unlikely(rc < 0))
222 + trace_jbd2_jmap_printf2("reading from disk",
223 + ctx->mappings[i].fsblk,
224 + ctx->mappings[i].logblk);
226 + trace_jbd2_jmap_printf2("cached",
227 + ctx->mappings[i].fsblk,
228 + ctx->mappings[i].logblk);
232 + blk_finish_plug(&plug);
236 + jbd2_journal_abort(ctx->journal, -ENOMEM);
241 + * This function finds the live blocks that became stale between the call to
242 + * find_live_blocks and now, and discards them. It returns true if there are no
243 + * more live blocks left at the tail transaction.
245 +bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live)
247 + journal_t *journal = ctx->journal;
248 + struct transaction_infos *tis = journal->j_transaction_infos;
249 + struct transaction_info *ti = &tis->buf[tis->tail];
250 + struct jmap_entry *je = NULL;
251 + int i = 0, j = 0, next = 0;
253 + trace_jbd2_jmap_printf(__func__);
254 + spin_lock(&ctx->pos_lock);
257 + list_for_each_entry_from(je, &ti->live_blks, list) {
258 + for (j = next; j < nr_live; ++j) {
259 + if (je->mapping.fsblk == ctx->mappings[j].fsblk) {
261 + ctx->pos = list_next_entry(je, list);
263 + brelse(ctx->bhs[j]);
264 + ctx->bhs[j] = NULL;
265 + trace_jbd2_jmap_printf2(
267 + ctx->mappings[i].fsblk,
268 + ctx->mappings[i].logblk);
272 + trace_jbd2_jmap_printf2(
273 + "moved to another list",
274 + ctx->mappings[i].fsblk,
275 + ctx->mappings[i].logblk);
276 + brelse(ctx->bhs[j]);
277 + ctx->bhs[j] = NULL;
280 + if (++i == nr_live || j == nr_live)
283 + spin_unlock(&ctx->pos_lock);
286 + * We have exited the loop. If we haven't processed all the entries in
287 + * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have
288 + * not processed |nr_live| entries from the live blocks list at the
289 + * tail, that is if (i < nr_live) at the exit, then the live blocks list
290 + * has shrunk and the tail transaction has no live blocks left.
292 + return j < nr_live && i < nr_live;
295 +void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle, int nr_live)
299 + trace_jbd2_jmap_printf(__func__);
300 + for (i = 0; i < nr_live; ++i) {
303 + trace_jbd2_jmap_printf2("attaching",
304 + ctx->mappings[i].fsblk,
305 + ctx->mappings[i].logblk);
306 + err = jbd2_journal_get_write_access(handle, ctx->bhs[i]);
308 + err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]);
310 + jbd2_journal_abort(ctx->journal, err);
317 + * Read the live blocks from the tail transaction and attach them to the current
320 +static void do_clean_batch(struct work_struct *work)
322 + struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);
323 + bool wake_up_commit_thread = true;
324 + handle_t *handle = NULL;
327 + read_lock(&ctx->journal->j_jmap_lock);
328 + nr_live = find_live_blocks(ctx);
329 + read_unlock(&ctx->journal->j_jmap_lock);
331 + if (nr_live < CLEANER_BATCH_SIZE)
332 + wake_up_commit_thread = false;
336 + reinit_completion(&ctx->live_block_reads);
337 + if (read_live_blocks(ctx, nr_live)) {
338 + trace_jbd2_jmap_printf("waiting for completion");
339 + wait_for_completion(&ctx->live_block_reads);
341 + trace_jbd2_jmap_printf("not waiting for completion");
344 + handle = jbd2_journal_start(ctx->journal, nr_live);
345 + if (IS_ERR(handle)) {
346 + jbd2_journal_abort(ctx->journal, PTR_ERR(handle));
350 + read_lock(&ctx->journal->j_jmap_lock);
351 + if (discard_stale_blocks(ctx, nr_live))
352 + wake_up_commit_thread = false;
353 + attach_live_blocks(ctx, handle, nr_live);
354 + read_unlock(&ctx->journal->j_jmap_lock);
356 + err = jbd2_journal_stop(handle);
358 + jbd2_journal_abort(ctx->journal, err);
363 + atomic_set(&ctx->batch_in_progress, 0);
364 + atomic_inc(&ctx->nr_txns_cleaned);
365 + if (wake_up_commit_thread) {
366 + trace_jbd2_jmap_printf("waking up commit thread");
367 + wake_up(&ctx->journal->j_wait_commit);
369 + trace_jbd2_jmap_printf("not waking up commit thread");
370 + spin_lock(&ctx->pos_lock);
372 + spin_unlock(&ctx->pos_lock);
377 + * Schedules the next batch of cleaning.
379 +void clean_next_batch(journal_t *journal)
381 + struct cleaner_ctx *ctx = journal->j_cleaner_ctx;
383 + if (!cleaning_batch_complete(journal)) {
384 + trace_jbd2_jmap_printf("not scheduling a new batch");
388 + trace_jbd2_jmap_printf("scheduling a batch");
389 + BUG_ON(atomic_read(&ctx->nr_pending_reads));
391 + atomic_set(&ctx->batch_in_progress, 1);
392 + INIT_WORK(&ctx->work, do_clean_batch);
393 + schedule_work(&ctx->work);
395 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
396 index ddaa7d5f5634..08bfbd6f3587 100644
399 @@ -39,7 +39,7 @@ int jbd2_init_transaction_infos(journal_t *journal)
402 for (i = 0; i < MAX_LIVE_TRANSACTIONS; ++i)
403 - INIT_LIST_HEAD(&tis->buf[i].live_logblks);
404 + INIT_LIST_HEAD(&tis->buf[i].live_blks);
406 journal->j_transaction_infos = tis;
408 @@ -92,15 +92,26 @@ static int process_existing_mappings(journal_t *journal,
409 * We are either deleting the entry because it was revoked, or
410 * we are moving it to the live blocks list of this transaction.
411 * In either case, we remove it from its existing list.
412 + * However, before removing it we check to see if this is an
413 + * entry in the live blocks list of the tail transaction, a
414 + * pointer to which is cached by the cleaner, and update the
415 + * cached pointer if so.
417 + spin_lock(&journal->j_cleaner_ctx->pos_lock);
418 + if (je == journal->j_cleaner_ctx->pos) {
419 + journal->j_cleaner_ctx->pos = list_next_entry(je, list);
420 + trace_jbd2_jmap_printf1("updating pos to",
421 + (unsigned long long) journal->j_cleaner_ctx->pos);
424 + spin_unlock(&journal->j_cleaner_ctx->pos_lock);
427 rb_erase(&je->rb_node, &journal->j_jmap);
428 kmem_cache_free(jbd2_jmap_cache, je);
430 trace_jbd2_jmap_replace(je, &mappings[i], t_idx);
431 - fill_entry(je, &mappings[i], t_idx, &ti->live_logblks);
432 + fill_entry(je, &mappings[i], t_idx, &ti->live_blks);
436 @@ -162,8 +173,7 @@ static void add_new_mappings(journal_t *journal, struct transaction_info *ti,
440 - fill_entry(new_entries[i], &mappings[i], t_idx,
441 - &ti->live_logblks);
442 + fill_entry(new_entries[i], &mappings[i], t_idx, &ti->live_blks);
443 rb_link_node(&new_entries[i]->rb_node, parent, p);
444 rb_insert_color(&new_entries[i]->rb_node, &journal->j_jmap);
445 trace_jbd2_jmap_insert(&mappings[i], t_idx);
446 @@ -190,7 +200,9 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
447 * We are possibly reusing space of an old transaction_info. The old
448 * transaction should not have any live blocks in it.
450 - BUG_ON(!list_empty(&ti->live_logblks));
451 + BUG_ON(!list_empty(&ti->live_blks));
453 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
455 write_lock(&journal->j_jmap_lock);
456 nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
457 @@ -435,11 +447,31 @@ int jbd2_smr_journal_init(journal_t *journal)
459 journal->j_jmap = RB_ROOT;
460 rwlock_init(&journal->j_jmap_lock);
461 + journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
463 + if (!journal->j_cleaner_ctx)
466 + journal->j_cleaner_ctx->journal = journal;
467 + journal->j_cleaner_ctx->pos = NULL;
468 + spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
469 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
470 + atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
471 + atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
472 + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
473 + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
474 + init_completion(&journal->j_cleaner_ctx->live_block_reads);
475 return jbd2_init_transaction_infos(journal);
478 void jbd2_smr_journal_exit(journal_t *journal)
480 + if (journal->j_cleaner_ctx) {
481 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
482 + flush_work(&journal->j_cleaner_ctx->work);
483 + kfree(journal->j_cleaner_ctx);
484 + journal->j_cleaner_ctx = NULL;
486 jbd2_free_transaction_infos(journal);
489 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
490 index 0fb2cd419ee3..efe53b83e2e2 100644
491 --- a/fs/jbd2/journal.c
492 +++ b/fs/jbd2/journal.c
493 @@ -227,6 +227,15 @@ static int kjournald2(void *arg)
496 wake_up(&journal->j_wait_done_commit);
498 + if ((journal->j_flags & JBD2_LAZY) &&
499 + (cleaning(journal) || low_on_space(journal))) {
500 + if (try_to_move_tail(journal) && high_on_space(journal))
501 + stop_cleaning(journal);
503 + start_cleaning(journal);
506 if (freezing(current)) {
508 * The simpler the better. Flushing journal isn't a
509 @@ -255,6 +264,9 @@ static int kjournald2(void *arg)
511 if (journal->j_flags & JBD2_UNMOUNT)
513 + if ((journal->j_flags & JBD2_LAZY) &&
514 + cleaning_batch_complete(journal))
517 write_unlock(&journal->j_state_lock);
519 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
520 index 771588026353..3112fba26598 100644
521 --- a/include/linux/jbd2.h
522 +++ b/include/linux/jbd2.h
523 @@ -735,7 +735,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
524 * @j_superblock: Second part of superblock buffer
525 * @j_map: A map from file system blocks to log blocks
526 * @j_transaction_infos: An array of information structures per live transaction
527 - * @j_map_lock: Protect j_jmap and j_transaction_infos
528 + * @j_jmap_lock: Protect j_jmap and j_transaction_infos
529 + * @j_cleaner_ctx: Cleaner state
530 * @j_format_version: Version of the superblock format
531 * @j_state_lock: Protect the various scalars in the journal
532 * @j_barrier_count: Number of processes waiting to create a barrier lock
533 @@ -820,6 +821,9 @@ struct journal_s
534 /* Protect j_jmap and j_transaction_infos */
535 rwlock_t j_jmap_lock;
537 + /* Cleaner state */
538 + struct cleaner_ctx *j_cleaner_ctx;
540 /* Version of the superblock format */
541 int j_format_version;
543 diff --git a/include/linux/jmap.h b/include/linux/jmap.h
544 index a602ece7cc89..acd588b4c68b 100644
545 --- a/include/linux/jmap.h
546 +++ b/include/linux/jmap.h
548 #include <linux/journal-head.h>
549 #include <linux/list.h>
550 #include <linux/circ_buf.h>
551 +#include <linux/completion.h>
554 + * Forward declaration for journal_t so that we don't get circular dependency
555 + * between jbd2.h and jmap.h
558 +typedef struct journal_s journal_t;
561 * Maximum number of transactions. This guides the size of the circular buffer
563 #define MAX_LIVE_TRANSACTIONS 65536
566 - * Forward declaration for journal_t so that we don't get circular dependency
567 - * between jbd2.h and jmap.h
570 -typedef struct journal_s journal_t;
573 * A mapping from file system block to log block.
576 @@ -79,14 +80,14 @@ struct transaction_info {
580 - * A list of live log blocks referenced in the RB-tree that belong to
581 - * this transaction. It is used during cleaning to locate live blocks
582 - * and migrate them to appropriate location. If this list is empty,
583 - * then the transaction does not contain any live blocks and we can
584 - * reuse its space. If this list is not empty, then we can quickly
585 - * locate all the live blocks in this transaction.
586 + * A list of live blocks referenced in the RB-tree that belong to this
587 + * transaction. It is used during cleaning to locate live blocks and
588 + * migrate them to appropriate location. If this list is empty, then
589 + * the transaction does not contain any live blocks and we can reuse its
590 + * space. If this list is not empty, then we can quickly locate all the
591 + * live blocks in this transaction.
593 - struct list_head live_logblks;
594 + struct list_head live_blks;
598 @@ -128,4 +129,86 @@ extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
599 extern void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,
603 + * Cleaner stuff is below.
607 + * Number of blocks to read at once, for cleaning.
609 +#define CLEANER_BATCH_SIZE 16
612 + * Context structure for the cleaner.
614 +struct cleaner_ctx {
616 + * We set to true once we drop below low watermark and it stays so until
617 + * we rise above the high watermark. It is accessed by the commit
618 + * thread and the foreground kernel threads during the journal
619 + * destruction, therefore it is atomic.
624 + * We clean in batches of blocks. This flag indicates if we are
625 + * currently cleaning a batch. It is accessed by the commit thread and
626 + * the cleaner thread, therefore it is atomic.
628 + atomic_t batch_in_progress;
631 + * We find live blocks to clean from the live blocks list of the
632 + * transaction at the tail. This list can be larger than our batch size
633 + * and we may need several attempts to process it. We cache the
634 + * position of the next entry to start from in |pos|. Since cleaner
635 + * thread can run concurrently with the commit thread that can modify
636 + * the live blocks list of the transaction at the tail (for example, if
637 + * it needs to drop a revoked entry or if |pos| points to an entry that
638 + * has been updated and should move from the live blocks list of the
639 + * transaction at the tail to the live blocks list of current
640 + * transaction) we protect |pos| with |pos_lock|.
642 + struct jmap_entry *pos;
643 + spinlock_t pos_lock;
646 + * Live block mappings for the blocks that we copy in a batch.
648 + struct blk_mapping mappings[CLEANER_BATCH_SIZE];
651 + * Buffer heads for the live blocks read in a batch.
653 + struct buffer_head *bhs[CLEANER_BATCH_SIZE];
656 + * Number of pending reads in a batch. Every submitted read increments
657 + * it and every completed read decrements it.
659 + atomic_t nr_pending_reads;
662 + * The cleaner thread sleeps on this condition variable until the last
663 + * completed read wakes up the cleaner thread.
665 + struct completion live_block_reads;
667 + /* TODO: temporary for debugging, remove once done. */
668 + atomic_t nr_txns_committed;
669 + atomic_t nr_txns_cleaned;
671 + journal_t *journal;
672 + struct work_struct work;
675 +extern int low_on_space(journal_t *journal);
676 +extern int high_on_space(journal_t *journal);
677 +extern bool cleaning(journal_t *journal);
678 +extern void stop_cleaning(journal_t *journal);
679 +extern void start_cleaning(journal_t *journal);
680 +extern void clean_next_batch(journal_t *journal);
681 +extern bool cleaning_batch_complete(journal_t *journal);
682 +extern bool try_to_move_tail(journal_t *journal);