From: Abutalib Aghayev <agayev@cs.cmu.edu>

An experimental cleaner. Copy the live blocks from the transaction at the
tail in batches to the transaction at the head. After a commit ends, check
whether free space is below the low watermark, and if so start cleaning until
free space rises above the high watermark.
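
In rough terms, the hook added to kjournald2() below amounts to the
following simplified sketch (locking and the unmount path omitted):

    if ((journal->j_flags & JBD2_LAZY) &&
        (jbd2_cleaning(journal) || jbd2_low_on_space(journal))) {
            /* Tail moved forward and we are back above the high watermark? */
            if (jbd2_try_to_move_tail(journal) && jbd2_high_on_space(journal))
                    jbd2_stop_cleaning(journal);
            else
                    jbd2_start_cleaning(journal);   /* schedule another batch */
    }
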
Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/Makefile     |   2 +-
 fs/jbd2/checkpoint.c |   3 +
 fs/jbd2/cleaner.c    | 371 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/jmap.c       |  32 +++
 fs/jbd2/journal.c    |  12 +++
 include/linux/jbd2.h |   6 +-
 include/linux/jmap.h |  82 ++++++++++++++++++
 7 files changed, 506 insertions(+), 2 deletions(-)
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
index a54f50b3a06e..b6a2dddcc0a7 100644
--- a/fs/jbd2/Makefile
+++ b/fs/jbd2/Makefile
 obj-$(CONFIG_JBD2) += jbd2.o

 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 4055f51617ef..b60bbf58e8f7 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -389,6 +389,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 unsigned long blocknr;

+ if (journal->j_flags & JBD2_LAZY)
+	return 0;
+
 if (is_journal_aborted(journal))
diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c
new file mode 100644
index 000000000000..31abb4fa2706
--- /dev/null
+++ b/fs/jbd2/cleaner.c
@@ -0,0 +1,371 @@
+#include <linux/blk_types.h>
+#include <linux/jbd2.h>
+#include <linux/jmap.h>
+#include <linux/list.h>
+#include <linux/blkdev.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <trace/events/jbd2.h>
+
+inline int jbd2_low_on_space(journal_t *journal)
+ int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed);
+ trace_jbd2_jmap_printf1("low on space", x);
+ trace_jbd2_jmap_printf1("not low on space", x);
+inline int jbd2_high_on_space(journal_t *journal)
+ if (atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) {
+ trace_jbd2_jmap_printf("not enough cleaned");
+ trace_jbd2_jmap_printf("enough cleaned");
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+
+inline bool jbd2_cleaning(journal_t *journal)
+ return atomic_read(&journal->j_cleaner_ctx->cleaning);
+
+inline void jbd2_stop_cleaning(journal_t *journal)
+ trace_jbd2_jmap_printf("stopped cleaning");
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+
+inline void jbd2_start_cleaning(journal_t *journal)
+ struct cleaner_ctx *ctx = journal->j_cleaner_ctx;
+ trace_jbd2_jmap_printf("started cleaning");
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 1);
+ /* Schedule the next batch of cleaning */
+ if (!jbd2_cleaning_batch_complete(journal)) {
+ trace_jbd2_jmap_printf("not scheduling a new batch");
+ trace_jbd2_jmap_printf("scheduling a batch");
+ BUG_ON(atomic_read(&ctx->nr_pending_reads));
+ atomic_set(&ctx->batch_in_progress, 1);
+ schedule_work(&ctx->work);
+
+inline bool jbd2_cleaning_batch_complete(journal_t *journal)
+ return jbd2_cleaning(journal) &&
+ atomic_read(&journal->j_cleaner_ctx->batch_in_progress) == 0;
+ * Tries to move the tail forward (and hence free space) as long as the
+ * transaction at the tail has only stale blocks. Returns true if it manages
+ * to free a transaction, false otherwise.
+bool jbd2_try_to_move_tail(journal_t *journal)
+ struct transaction_infos *tis = journal->j_transaction_infos;
+ struct transaction_info *ti, *ti1;
+ * Advance the tail as far as possible by skipping over transactions
+ * with no live blocks.
+ write_lock(&journal->j_jmap_lock);
+ ti = ti1 = &tis->buf[tis->tail];
+ for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) {
+ trace_jbd2_jmap_printf2("cleaned a transaction",
+ tis->tail, ti->tid);
+ tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1);
+ atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned);
+ write_unlock(&journal->j_jmap_lock);
+ * In the worst case, this will end up updating the journal superblock
+ * after cleaning up every transaction. Should we avoid it?
+ write_unlock(&journal->j_state_lock);
+ jbd2_update_log_tail(journal, ti->tid, ti->offset);
+ write_lock(&journal->j_state_lock);
+ * Finds the live blocks at the tail transaction and copies the corresponding
+ * mappings to |ctx->mappings|. Returns the number of live block mappings
+ * copied. Should be called with a read lock on |j_jmap_lock|.
+static int find_live_blocks(struct cleaner_ctx *ctx)
+ journal_t *journal = ctx->journal;
+ struct transaction_infos *tis = journal->j_transaction_infos;
+ struct transaction_info *ti = &tis->buf[tis->tail];
+ struct jmap_entry *je = NULL;
+ int i, nr_live = 0;
+ if (unlikely(list_empty(&ti->live_blks)))
+ spin_lock(&ctx->pos_lock);
+ ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list);
+ spin_unlock(&ctx->pos_lock);
+ list_for_each_entry_from(je, &ti->live_blks, list) {
+ ctx->mappings[nr_live++] = je->mapping;
+ if (nr_live == CLEANER_BATCH_SIZE)
+ trace_jbd2_jmap_printf1("found live blocks", nr_live);
+ for (i = 0; i < nr_live; ++i)
+ trace_jbd2_jmap_printf2("m",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+static void live_block_read_end_io(struct buffer_head *bh, int uptodate)
+ struct cleaner_ctx *ctx = bh->b_private;
+ set_buffer_uptodate(bh);
+ if (atomic_dec_and_test(&ctx->nr_pending_reads))
+ complete(&ctx->live_block_reads);
+ clear_buffer_uptodate(bh);
+
+ * Reads live blocks in |ctx->mappings| populated by find_live_blocks into
+ * buffer heads in |ctx->bhs|. Returns true if at least one of the reads goes
+ * out to disk and false otherwise. If this function returns true then the
+ * client should sleep on the condition variable |ctx->live_block_reads|. The
+ * client will be woken up when all reads are complete, through the end_io
+ * handler attached to buffer heads read from disk.
+static bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live)
+ journal_t *journal = ctx->journal;
+ struct blk_plug plug;
+ bool plugged = false;
+ for (i = 0; i < nr_live; ++i) {
+ ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk,
+ journal->j_blocksize);
+ if (unlikely(!ctx->bhs[i]))
+ if (buffer_uptodate(ctx->bhs[i]))
+ blk_start_plug(&plug);
+ lock_buffer(ctx->bhs[i]);
+ ctx->bhs[i]->b_private = ctx;
+ ctx->bhs[i]->b_end_io = live_block_read_end_io;
+ atomic_inc(&ctx->nr_pending_reads);
+ get_bh(ctx->bhs[i]);
+ rc = read_block_from_log(ctx->journal, ctx->bhs[i],
+ REQ_RAHEAD, ctx->mappings[i].logblk);
+ if (unlikely(rc < 0))
+ trace_jbd2_jmap_printf2("reading from disk",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ trace_jbd2_jmap_printf2("cached",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ blk_finish_plug(&plug);
+ jbd2_journal_abort(ctx->journal, -ENOMEM);
+ * This function finds the live blocks that became stale between the call to
+ * find_live_blocks and now, and discards them. It returns true if there are no
+ * more live blocks left at the tail transaction.
+static bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live)
+ journal_t *journal = ctx->journal;
+ struct transaction_infos *tis = journal->j_transaction_infos;
+ struct transaction_info *ti = &tis->buf[tis->tail];
+ struct jmap_entry *je = NULL;
+ int i = 0, j = 0, next = 0;
+ trace_jbd2_jmap_printf(__func__);
+ spin_lock(&ctx->pos_lock);
+ list_for_each_entry_from(je, &ti->live_blks, list) {
+ for (j = next; j < nr_live; ++j) {
+ if (je->mapping.fsblk == ctx->mappings[j].fsblk) {
+ ctx->pos = list_next_entry(je, list);
+ brelse(ctx->bhs[j]);
+ ctx->bhs[j] = NULL;
+ trace_jbd2_jmap_printf2(
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ trace_jbd2_jmap_printf2(
+ "moved to another list",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ brelse(ctx->bhs[j]);
+ ctx->bhs[j] = NULL;
+ if (++i == nr_live || j == nr_live)
+ spin_unlock(&ctx->pos_lock);
+ * We have exited the loop. If we haven't processed all the entries in
+ * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have
+ * not processed |nr_live| entries from the live blocks list at the
+ * tail, that is if (i < nr_live) at the exit, then the live blocks list
+ * has shrunk and the tail transaction has no live blocks left.
+ return j < nr_live && i < nr_live;
+
+static void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle,
+ trace_jbd2_jmap_printf(__func__);
+ for (i = 0; i < nr_live; ++i) {
+ trace_jbd2_jmap_printf2("attaching",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ err = jbd2_journal_get_write_access(handle, ctx->bhs[i]);
+ err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]);
+ jbd2_journal_abort(ctx->journal, err);
+ * Read the live blocks from the tail transaction and attach them to the
+ * current transaction.
+void jbd2_jmap_do_clean_batch(struct work_struct *work)
+ struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);
+ bool wake_up_commit_thread = true;
+ handle_t *handle = NULL;
+ read_lock(&ctx->journal->j_jmap_lock);
+ nr_live = find_live_blocks(ctx);
+ read_unlock(&ctx->journal->j_jmap_lock);
+ if (nr_live < CLEANER_BATCH_SIZE)
+ wake_up_commit_thread = false;
+ reinit_completion(&ctx->live_block_reads);
+ if (read_live_blocks(ctx, nr_live)) {
+ trace_jbd2_jmap_printf("waiting for completion");
+ wait_for_completion(&ctx->live_block_reads);
+ trace_jbd2_jmap_printf("not waiting for completion");
+ while (atomic_read(&ctx->nr_pending_reads)) {
+ /* Should never trigger, but we could do better by
+ * converting to a wait channel instead of the completion */
+ pr_err("JBD2: clean_batch: completion failed, recovering\n");
+ handle = jbd2_journal_start(ctx->journal, nr_live);
+ if (IS_ERR(handle)) {
+ jbd2_journal_abort(ctx->journal, PTR_ERR(handle));
+ read_lock(&ctx->journal->j_jmap_lock);
+ if (discard_stale_blocks(ctx, nr_live))
+ wake_up_commit_thread = false;
+ read_unlock(&ctx->journal->j_jmap_lock);
+ * I'm not sure why this function was under the jmap_lock
+ * previously, but it can't be, since it calls functions that
+ * can block due to memory allocation. I don't think it needs
+ * to be protected, since it appears that ctx->mappings is only
+ * used by the cleaner code, and so it can't be run multiple
+ * times in parallel.
+ attach_live_blocks(ctx, handle, nr_live);
+ err = jbd2_journal_stop(handle);
+ jbd2_journal_abort(ctx->journal, err);
+ atomic_set(&ctx->batch_in_progress, 0);
+ atomic_inc(&ctx->nr_txns_cleaned);
+ if (wake_up_commit_thread) {
+ trace_jbd2_jmap_printf("waking up commit thread");
+ wake_up(&ctx->journal->j_wait_commit);
+ trace_jbd2_jmap_printf("not waking up commit thread");
+ spin_lock(&ctx->pos_lock);
+ spin_unlock(&ctx->pos_lock);
diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
index 18dd0e127aff..9b317608f70a 100644
--- a/fs/jbd2/jmap.c
+++ b/fs/jbd2/jmap.c
@@ -91,8 +91,17 @@ static int process_existing_mappings(journal_t *journal,
 * We are either deleting the entry because it was revoked, or
 * we are moving it to the live blocks list of this transaction.
 * In either case, we remove it from its existing list.
+ * However, before removing it, we check whether it is the entry in
+ * the live blocks list of the tail transaction whose position is
+ * cached by the cleaner, and if so we advance the cleaner's cached
+ * pointer past it.
+ spin_lock(&journal->j_cleaner_ctx->pos_lock);
+ if (je == journal->j_cleaner_ctx->pos) {
+ journal->j_cleaner_ctx->pos = list_next_entry(je, list);
+ spin_unlock(&journal->j_cleaner_ctx->pos_lock);
 rb_erase(&je->rb_node, &journal->j_jmap);
@@ -245,6 +254,8 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
 BUG_ON(!list_empty(&ti->live_blks));

+ atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);

 write_lock(&journal->j_jmap_lock);
 nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
@@ -489,11 +500,32 @@ int jbd2_smr_journal_init(journal_t *journal)
 journal->j_jmap = RB_ROOT;
 rwlock_init(&journal->j_jmap_lock);
+ journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
+ if (!journal->j_cleaner_ctx)
+ journal->j_cleaner_ctx->journal = journal;
+ journal->j_cleaner_ctx->pos = NULL;
+ spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+ atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+ init_completion(&journal->j_cleaner_ctx->live_block_reads);
+ INIT_WORK(&journal->j_cleaner_ctx->work, jbd2_jmap_do_clean_batch);
 return jbd2_init_transaction_infos(journal);

 void jbd2_smr_journal_exit(journal_t *journal)
+ if (journal->j_cleaner_ctx) {
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+ flush_work(&journal->j_cleaner_ctx->work);
+ kfree(journal->j_cleaner_ctx);
+ journal->j_cleaner_ctx = NULL;
 jbd2_free_transaction_infos(journal);

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 493b72c60335..ab7b9bbc9296 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -227,6 +227,15 @@ static int kjournald2(void *arg)
 wake_up(&journal->j_wait_done_commit);

+ if ((journal->j_flags & JBD2_LAZY) &&
+ (jbd2_cleaning(journal) || jbd2_low_on_space(journal))) {
+ if (jbd2_try_to_move_tail(journal) && jbd2_high_on_space(journal))
+ jbd2_stop_cleaning(journal);
+ else
+ jbd2_start_cleaning(journal);
+ }
 if (freezing(current)) {
 * The simpler the better. Flushing journal isn't a
@@ -255,6 +264,9 @@ static int kjournald2(void *arg)
 if (journal->j_flags & JBD2_UNMOUNT)
 break;

+ if ((journal->j_flags & JBD2_LAZY) &&
+ jbd2_cleaning_batch_complete(journal))
+ break;

 write_unlock(&journal->j_state_lock);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 771588026353..3112fba26598 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -735,7 +735,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
 * @j_superblock: Second part of superblock buffer
 * @j_map: A map from file system blocks to log blocks
 * @j_transaction_infos: An array of information structures per live transaction
- * @j_map_lock: Protect j_jmap and j_transaction_infos
+ * @j_jmap_lock: Protect j_jmap and j_transaction_infos
+ * @j_cleaner_ctx: Cleaner state
 * @j_format_version: Version of the superblock format
 * @j_state_lock: Protect the various scalars in the journal
 * @j_barrier_count: Number of processes waiting to create a barrier lock
@@ -820,6 +821,9 @@ struct journal_s
 /* Protect j_jmap and j_transaction_infos */
 rwlock_t j_jmap_lock;

+ /* Cleaner state */
+ struct cleaner_ctx *j_cleaner_ctx;

 /* Version of the superblock format */
 int j_format_version;
diff --git a/include/linux/jmap.h b/include/linux/jmap.h
index 638f25df8302..5af1fec4ab95 100644
--- a/include/linux/jmap.h
+++ b/include/linux/jmap.h
@@ -132,5 +132,87 @@ extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
 extern void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,

+extern void jbd2_jmap_do_clean_batch(struct work_struct *work);
+ * Cleaner stuff is below.
+ * Number of blocks to read at once, for cleaning.
+#define CLEANER_BATCH_SIZE 16
+ * Context structure for the cleaner.
+struct cleaner_ctx {
+ * Set to true once we drop below the low watermark, and stays true until
+ * we rise above the high watermark. It is accessed by the commit
+ * thread and by foreground kernel threads during journal
+ * destruction, therefore it is atomic.
+ * We clean in batches of blocks. This flag indicates if we are
+ * currently cleaning a batch. It is accessed by the commit thread and
+ * the cleaner thread, therefore it is atomic.
+ atomic_t batch_in_progress;
+ * We find live blocks to clean from the live blocks list of the
+ * transaction at the tail. This list can be larger than our batch size
+ * and we may need several attempts to process it. We cache the
+ * position of the next entry to start from in |pos|. Since the cleaner
+ * thread can run concurrently with the commit thread, which can modify
+ * the live blocks list of the transaction at the tail (for example, if
+ * it needs to drop a revoked entry, or if |pos| points to an entry that
+ * has been updated and should move from the live blocks list of the
+ * transaction at the tail to the live blocks list of the current
+ * transaction), we protect |pos| with |pos_lock|.
+ struct jmap_entry *pos;
+ spinlock_t pos_lock;
+ * Live block mappings for the blocks that we copy in a batch.
+ struct blk_mapping mappings[CLEANER_BATCH_SIZE];
+ * Buffer heads for the live blocks read in a batch.
+ struct buffer_head *bhs[CLEANER_BATCH_SIZE];
+ * Number of pending reads in a batch. Every submitted read increments
+ * it and every completed read decrements it.
+ atomic_t nr_pending_reads;
+ * The cleaner thread sleeps on this condition variable until the last
+ * completed read wakes up the cleaner thread.
+ struct completion live_block_reads;
+ /* TODO: temporary for debugging, remove once done. */
+ atomic_t nr_txns_committed;
+ atomic_t nr_txns_cleaned;
+ journal_t *journal;
+ struct work_struct work;
+
+extern int jbd2_low_on_space(journal_t *journal);
+extern int jbd2_high_on_space(journal_t *journal);
+extern bool jbd2_cleaning(journal_t *journal);
+extern void jbd2_stop_cleaning(journal_t *journal);
+extern void jbd2_start_cleaning(journal_t *journal);
+extern bool jbd2_cleaning_batch_complete(journal_t *journal);
+extern bool jbd2_try_to_move_tail(journal_t *journal);
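
For reference, one cleaning batch performed by jbd2_jmap_do_clean_batch()
boils down to the following simplified sketch (tracing, error handling, and
locking omitted; see fs/jbd2/cleaner.c above for the real flow):

    /* Copy up to CLEANER_BATCH_SIZE live mappings from the tail transaction. */
    nr_live = find_live_blocks(ctx);

    /* Read those blocks from the log and wait for any reads that hit disk. */
    reinit_completion(&ctx->live_block_reads);
    if (read_live_blocks(ctx, nr_live))
            wait_for_completion(&ctx->live_block_reads);

    /* Re-log the blocks that are still live under a regular handle. */
    handle = jbd2_journal_start(ctx->journal, nr_live);
    discard_stale_blocks(ctx, nr_live);      /* drop mappings that went stale meanwhile */
    attach_live_blocks(ctx, handle, nr_live);
    jbd2_journal_stop(handle);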