3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 An experimental cleaner. Copy the live blocks from the transaction at the
6 tail in batches to the transaction at the head. After a commit ends, check
7 if free space is below watermark and start cleaning until free space is
10 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
11 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
13 fs/jbd2/Makefile | 2 +-
14 fs/jbd2/checkpoint.c | 3 +
15 fs/jbd2/cleaner.c | 367 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
16 fs/jbd2/jmap.c | 34 +++++++++
17 fs/jbd2/jmap.h | 82 +++++++++++++++++++++
18 fs/jbd2/journal.c | 12 +++
19 include/linux/jbd2.h | 6 +-
20 7 files changed, 504 insertions(+), 2 deletions(-)
22 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
23 index a54f50b3a06e..b6a2dddcc0a7 100644
24 --- a/fs/jbd2/Makefile
25 +++ b/fs/jbd2/Makefile
27 obj-$(CONFIG_JBD2) += jbd2.o
29 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
32 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
33 index 4055f51617ef..b60bbf58e8f7 100644
34 --- a/fs/jbd2/checkpoint.c
35 +++ b/fs/jbd2/checkpoint.c
36 @@ -389,6 +389,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
38 unsigned long blocknr;
40 + if (journal->j_flags & JBD2_LAZY)
43 if (is_journal_aborted(journal))
46 diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c
48 index 000000000000..1ab9d2e50702
50 +++ b/fs/jbd2/cleaner.c
52 +#include <linux/blk_types.h>
53 +#include <linux/jbd2.h>
55 +#include <linux/list.h>
56 +#include <linux/blkdev.h>
57 +#include <linux/completion.h>
58 +#include <linux/delay.h>
59 +#include <trace/events/jbd2.h>
61 +inline int jbd2_low_on_space(journal_t *journal)
63 + int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed);
65 + trace_jbd2_jmap_printf1("low on space", x);
68 + trace_jbd2_jmap_printf1("not low on space", x);
72 +inline int jbd2_high_on_space(journal_t *journal)
74 + if (atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) {
75 + trace_jbd2_jmap_printf("not enough cleaned");
78 + trace_jbd2_jmap_printf("enough cleaned");
79 + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
80 + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
84 +inline bool jbd2_cleaning(journal_t *journal)
86 + return atomic_read(&journal->j_cleaner_ctx->cleaning);
89 +inline void jbd2_stop_cleaning(journal_t *journal)
91 + trace_jbd2_jmap_printf("stopped cleaning");
92 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
95 +inline void jbd2_start_cleaning(journal_t *journal)
97 + struct cleaner_ctx *ctx = journal->j_cleaner_ctx;
99 + trace_jbd2_jmap_printf("started cleaning");
100 + atomic_set(&journal->j_cleaner_ctx->cleaning, 1);
102 + /* Schedule the next batch of cleaning */
103 + if (!jbd2_cleaning_batch_complete(journal)) {
104 + trace_jbd2_jmap_printf("not scheduling a new batch");
108 + trace_jbd2_jmap_printf("scheduling a batch");
109 + BUG_ON(atomic_read(&ctx->nr_pending_reads));
111 + atomic_set(&ctx->batch_in_progress, 1);
112 + schedule_work(&ctx->work);
117 +inline bool jbd2_cleaning_batch_complete(journal_t *journal)
119 + return jbd2_cleaning(journal) &&
120 + atomic_read(&journal->j_cleaner_ctx->batch_in_progress) == 0;
124 + * Tries to move the tail forward (hence free space) as long as the transaction
125 + * at the tail has only stale blocks. Returns true if it manages to free a
126 + * transaction, false otherwise.
128 +bool jbd2_try_to_move_tail(journal_t *journal)
130 + struct transaction_infos *tis = journal->j_transaction_infos;
131 + struct transaction_info *ti, *ti1;
134 + * Advance the tail as far as possible by skipping over transactions
135 + * with no live blocks.
137 + write_lock(&journal->j_jmap_lock);
138 + ti = ti1 = &tis->buf[tis->tail];
140 + for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) {
141 + trace_jbd2_jmap_printf2("cleaned a transaction",
142 + tis->tail, ti->tid);
143 + tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1);
144 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned);
146 + write_unlock(&journal->j_jmap_lock);
151 + * In the worst case, this will end up updating the journal superblock
152 + * after cleaning up every transaction. Should we avoid it?
154 + write_unlock(&journal->j_state_lock);
155 + jbd2_update_log_tail(journal, ti->tid, ti->offset);
156 + write_lock(&journal->j_state_lock);
162 + * Finds the live blocks at the tail transaction and copies the corresponding
163 + * mappings to |ctx->mappings|. Returns the number of live block mappings
164 + * copied. Should be called with a read lock on |j_jmap_lock|.
166 +static int find_live_blocks(struct cleaner_ctx *ctx)
168 + journal_t *journal = ctx->journal;
169 + struct transaction_infos *tis = journal->j_transaction_infos;
170 + struct transaction_info *ti = &tis->buf[tis->tail];
171 + struct jmap_entry *je = NULL;
172 + int i, nr_live = 0;
174 + if (unlikely(list_empty(&ti->live_blks)))
177 + spin_lock(&ctx->pos_lock);
179 + ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list);
181 + spin_unlock(&ctx->pos_lock);
183 + list_for_each_entry_from(je, &ti->live_blks, list) {
186 + ctx->mappings[nr_live++] = je->mapping;
187 + if (nr_live == CLEANER_BATCH_SIZE)
192 + trace_jbd2_jmap_printf1("found live blocks", nr_live);
193 + for (i = 0; i < nr_live; ++i)
194 + trace_jbd2_jmap_printf2("m",
195 + ctx->mappings[i].fsblk,
196 + ctx->mappings[i].logblk);
200 +static void live_block_read_end_io(struct buffer_head *bh, int uptodate)
202 + struct cleaner_ctx *ctx = bh->b_private;
205 + set_buffer_uptodate(bh);
206 + if (atomic_dec_and_test(&ctx->nr_pending_reads))
207 + wake_up(&ctx->live_block_reads);
210 + clear_buffer_uptodate(bh);
218 + * Reads live blocks in |ctx->mappings| populated by find_live_blocks into
219 + * buffer heads in |ctx->bhs|. Returns true if at least one of the reads goes
220 + * out to disk and false otherwise. If this function returns true then the
221 + * client should sleep on the condition variable |ctx->live_block_reads|. The
222 + * client will be woken up when all reads are complete, through the end_io
223 + * handler attached to buffer heads read from disk.
225 +static bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live)
227 + journal_t *journal = ctx->journal;
229 + struct blk_plug plug;
230 + bool plugged = false;
233 + for (i = 0; i < nr_live; ++i) {
234 + ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk,
235 + journal->j_blocksize);
236 + if (unlikely(!ctx->bhs[i])) {
240 + if (buffer_uptodate(ctx->bhs[i]))
244 + blk_start_plug(&plug);
246 + lock_buffer(ctx->bhs[i]);
247 + if (buffer_uptodate(ctx->bhs[i]))
249 + ctx->bhs[i]->b_private = ctx;
250 + ctx->bhs[i]->b_end_io = live_block_read_end_io;
251 + get_bh(ctx->bhs[i]);
252 + rc = read_block_from_log(ctx->journal, ctx->bhs[i],
253 + REQ_RAHEAD, ctx->mappings[i].logblk);
254 + if (unlikely(rc < 0))
256 + atomic_inc(&ctx->nr_pending_reads);
259 + trace_jbd2_jmap_printf2("reading from disk",
260 + ctx->mappings[i].fsblk,
261 + ctx->mappings[i].logblk);
263 + trace_jbd2_jmap_printf2("cached",
264 + ctx->mappings[i].fsblk,
265 + ctx->mappings[i].logblk);
269 + blk_finish_plug(&plug);
274 + blk_finish_plug(&plug);
275 + jbd2_journal_abort(ctx->journal, rc);
280 + * This function finds the live blocks that became stale between the call to
281 + * find_live_blocks and now, and discards them. It returns true if there are no
282 + * more live blocks left at the tail transaction.
284 +static bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live)
286 + journal_t *journal = ctx->journal;
287 + struct transaction_infos *tis = journal->j_transaction_infos;
288 + struct transaction_info *ti = &tis->buf[tis->tail];
289 + struct jmap_entry *je = NULL;
290 + int i = 0, j = 0, next = 0;
292 + trace_jbd2_jmap_printf(__func__);
293 + spin_lock(&ctx->pos_lock);
296 + list_for_each_entry_from(je, &ti->live_blks, list) {
297 + for (j = next; j < nr_live; ++j) {
298 + if (je->mapping.fsblk == ctx->mappings[j].fsblk) {
300 + ctx->pos = list_next_entry(je, list);
302 + brelse(ctx->bhs[j]);
303 + ctx->bhs[j] = NULL;
304 + trace_jbd2_jmap_printf2(
306 + ctx->mappings[i].fsblk,
307 + ctx->mappings[i].logblk);
311 + trace_jbd2_jmap_printf2(
312 + "moved to another list",
313 + ctx->mappings[i].fsblk,
314 + ctx->mappings[i].logblk);
315 + brelse(ctx->bhs[j]);
316 + ctx->bhs[j] = NULL;
319 + if (++i == nr_live || j == nr_live)
322 + spin_unlock(&ctx->pos_lock);
325 + * We have exited the loop. If we haven't processed all the entries in
326 + * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have
327 + * not processed |nr_live| entries from the live blocks list at the
328 + * tail, that is if (i < nr_live) at the exit, then the live blocks list
329 + * has shrunk and the tail transaction has no live blocks left.
331 + return j < nr_live && i < nr_live;
334 +static void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle,
339 + trace_jbd2_jmap_printf(__func__);
340 + for (i = 0; i < nr_live; ++i) {
343 + trace_jbd2_jmap_printf2("attaching",
344 + ctx->mappings[i].fsblk,
345 + ctx->mappings[i].logblk);
346 + err = jbd2_journal_get_write_access(handle, ctx->bhs[i]);
348 + err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]);
350 + jbd2_journal_abort(ctx->journal, err);
357 + * Read the live blocks from the tail transaction and attach them to the current
360 +void jbd2_jmap_do_clean_batch(struct work_struct *work)
362 + struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);
363 + bool wake_up_commit_thread = true;
364 + handle_t *handle = NULL;
367 + read_lock(&ctx->journal->j_jmap_lock);
368 + nr_live = find_live_blocks(ctx);
369 + read_unlock(&ctx->journal->j_jmap_lock);
371 + if (nr_live < CLEANER_BATCH_SIZE)
372 + wake_up_commit_thread = false;
376 + read_live_blocks(ctx, nr_live);
377 + wait_event(ctx->live_block_reads,
378 + atomic_read(&ctx->nr_pending_reads) <= 0);
380 + handle = jbd2_journal_start(ctx->journal, nr_live);
381 + if (IS_ERR(handle)) {
382 + jbd2_journal_abort(ctx->journal, PTR_ERR(handle));
386 + read_lock(&ctx->journal->j_jmap_lock);
387 + if (discard_stale_blocks(ctx, nr_live))
388 + wake_up_commit_thread = false;
389 + read_unlock(&ctx->journal->j_jmap_lock);
391 + * I'm not sure why this function was under the jmap_lock
392 + * previously, but it can't be, since it calls functions that
393 + * can block due to memory allocation. I don't think it needs
394 + * to be protected, since it appears that ctx->mapping is only
395 + * used by the cleaner code, and so it can't be run multiple
398 + attach_live_blocks(ctx, handle, nr_live);
400 + err = jbd2_journal_stop(handle);
402 + jbd2_journal_abort(ctx->journal, err);
407 + atomic_set(&ctx->batch_in_progress, 0);
408 + atomic_inc(&ctx->nr_txns_cleaned);
409 + if (wake_up_commit_thread) {
410 + trace_jbd2_jmap_printf("waking up commit thread");
411 + wake_up(&ctx->journal->j_wait_commit);
413 + trace_jbd2_jmap_printf("not waking up commit thread");
414 + spin_lock(&ctx->pos_lock);
416 + spin_unlock(&ctx->pos_lock);
419 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
420 index 7de6f4a0a1dc..0e759cc095f5 100644
423 @@ -91,8 +91,17 @@ static int process_existing_mappings(journal_t *journal,
424 * We are either deleting the entry because it was revoked, or
425 * we are moving it to the live blocks list of this transaction.
426 * In either case, we remove it from its existing list.
427 + * However, before removing it we check to see if this is an
428 + * entry in the live blocks list of the tail transaction, a
429 + * pointer to which is cached by the cleaner, and update the
430 + * cached pointer if so.
432 + spin_lock(&journal->j_cleaner_ctx->pos_lock);
433 + if (je == journal->j_cleaner_ctx->pos) {
434 + journal->j_cleaner_ctx->pos = list_next_entry(je, list);
437 + spin_unlock(&journal->j_cleaner_ctx->pos_lock);
440 rb_erase(&je->rb_node, &journal->j_jmap);
441 @@ -216,6 +225,8 @@ void jbd2_finish_transaction_infos(journal_t *journal)
443 struct transaction_infos *tis = journal->j_transaction_infos;
445 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
447 write_lock(&journal->j_jmap_lock);
448 tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1);
449 write_unlock(&journal->j_jmap_lock);
450 @@ -243,6 +254,8 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
452 BUG_ON(!list_empty(&ti->live_blks));
454 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
456 write_lock(&journal->j_jmap_lock);
457 nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
459 @@ -489,11 +502,32 @@ int jbd2_smr_journal_init(journal_t *journal)
461 journal->j_jmap = RB_ROOT;
462 rwlock_init(&journal->j_jmap_lock);
463 + journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
465 + if (!journal->j_cleaner_ctx)
468 + journal->j_cleaner_ctx->journal = journal;
469 + journal->j_cleaner_ctx->pos = NULL;
470 + spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
471 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
472 + atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
473 + atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
474 + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
475 + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
476 + init_waitqueue_head(&journal->j_cleaner_ctx->live_block_reads);
477 + INIT_WORK(&journal->j_cleaner_ctx->work, jbd2_jmap_do_clean_batch);
478 return jbd2_init_transaction_infos(journal);
481 void jbd2_smr_journal_exit(journal_t *journal)
483 + if (journal->j_cleaner_ctx) {
484 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
485 + flush_work(&journal->j_cleaner_ctx->work);
486 + kfree(journal->j_cleaner_ctx);
487 + journal->j_cleaner_ctx = NULL;
489 jbd2_free_transaction_infos(journal);
492 diff --git a/fs/jbd2/jmap.h b/fs/jbd2/jmap.h
493 index 91564ce9bbda..5ae3dc52746f 100644
496 @@ -125,4 +125,86 @@ extern void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk);
497 extern int read_block_from_log(journal_t *journal, struct buffer_head *bh,
498 int op_flags, sector_t blk);
500 +extern void jbd2_jmap_do_clean_batch(struct work_struct *work);
503 + * Cleaner stuff is below.
507 + * Number of blocks to read at once, for cleaning.
509 +#define CLEANER_BATCH_SIZE 16
512 + * Context structure for the cleaner.
514 +struct cleaner_ctx {
516 + * Set to true once we drop below the low watermark; it stays so until
517 + * we rise above the high watermark. It is accessed by the commit
518 + * thread and the foreground kernel threads during the journal
519 + * destruction, therefore it is atomic.
524 + * We clean in batches of blocks. This flag indicates if we are
525 + * currently cleaning a batch. It is accessed by the commit thread and
526 + * the cleaner thread, therefore it is atomic.
528 + atomic_t batch_in_progress;
531 + * We find live blocks to clean from the live blocks list of the
532 + * transaction at the tail. This list can be larger than our batch size
533 + * and we may need several attempts to process it. We cache the
534 + * position of the next entry to start from in |pos|. Since cleaner
535 + * thread can run concurrently with the commit thread that can modify
536 + * the live blocks list of the transaction at the tail (for example, if
537 + * it needs to drop a revoked entry or if |pos| points to an entry that
538 + * has been updated and should move from the live blocks list of the
539 + * transaction at the tail to the live blocks list of current
540 + * transaction) we protect |pos| with |pos_lock|.
542 + struct jmap_entry *pos;
543 + spinlock_t pos_lock;
546 + * Live block mappings for the blocks that we copy in a batch.
548 + struct blk_mapping mappings[CLEANER_BATCH_SIZE];
551 + * Buffer heads for the live blocks read in a batch.
553 + struct buffer_head *bhs[CLEANER_BATCH_SIZE];
556 + * Number of pending reads in a batch. Every submitted read increments
557 + * it and every completed read decrements it.
559 + atomic_t nr_pending_reads;
562 + * The cleaner thread sleeps on this wait queue until the last
563 + * completed read wakes up the cleaner thread.
565 + wait_queue_head_t live_block_reads;
567 + /* TODO: temporary for debugging, remove once done. */
568 + atomic_t nr_txns_committed;
569 + atomic_t nr_txns_cleaned;
571 + journal_t *journal;
572 + struct work_struct work;
575 +extern int jbd2_low_on_space(journal_t *journal);
576 +extern int jbd2_high_on_space(journal_t *journal);
577 +extern bool jbd2_cleaning(journal_t *journal);
578 +extern void jbd2_stop_cleaning(journal_t *journal);
579 +extern void jbd2_start_cleaning(journal_t *journal);
580 +extern bool jbd2_cleaning_batch_complete(journal_t *journal);
581 +extern bool jbd2_try_to_move_tail(journal_t *journal);
583 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
584 index 50ab9b5bc7f4..9c097ddfe63f 100644
585 --- a/fs/jbd2/journal.c
586 +++ b/fs/jbd2/journal.c
587 @@ -229,6 +229,15 @@ static int kjournald2(void *arg)
590 wake_up(&journal->j_wait_done_commit);
592 + if ((journal->j_flags & JBD2_LAZY) &&
593 + (jbd2_cleaning(journal) || jbd2_low_on_space(journal))) {
594 + if (jbd2_try_to_move_tail(journal) && jbd2_high_on_space(journal))
595 + jbd2_stop_cleaning(journal);
597 + jbd2_start_cleaning(journal);
600 if (freezing(current)) {
602 * The simpler the better. Flushing journal isn't a
603 @@ -257,6 +266,9 @@ static int kjournald2(void *arg)
605 if (journal->j_flags & JBD2_UNMOUNT)
607 + if ((journal->j_flags & JBD2_LAZY) &&
608 + jbd2_cleaning_batch_complete(journal))
611 write_unlock(&journal->j_state_lock);
613 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
614 index a1d56bb9fa4f..fa6094124bcb 100644
615 --- a/include/linux/jbd2.h
616 +++ b/include/linux/jbd2.h
617 @@ -734,7 +734,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
618 * @j_superblock: Second part of superblock buffer
619 * @j_map: A map from file system blocks to log blocks
620 * @j_transaction_infos: An array of information structures per live transaction
621 - * @j_map_lock: Protect j_jmap and j_transaction_infos
622 + * @j_jmap_lock: Protect j_jmap and j_transaction_infos
623 + * @j_cleaner_ctx: Cleaner state
624 * @j_format_version: Version of the superblock format
625 * @j_state_lock: Protect the various scalars in the journal
626 * @j_barrier_count: Number of processes waiting to create a barrier lock
627 @@ -819,6 +820,9 @@ struct journal_s
628 /* Protect j_jmap and j_transaction_infos */
629 rwlock_t j_jmap_lock;
631 + /* Cleaner state */
632 + struct cleaner_ctx *j_cleaner_ctx;
634 /* Version of the superblock format */
635 int j_format_version;