From: Abutalib Aghayev <agayev@cs.cmu.edu>

An experimental cleaner.  Copy the live blocks from the transaction at the
tail in batches to the transaction at the head.  After a commit ends, check
if free space is below the low watermark, and if so, start cleaning until
free space is above the high watermark.

Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/Makefile     |   2 +-
 fs/jbd2/checkpoint.c |   3 +
 fs/jbd2/cleaner.c    | 368 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/jmap.c       |  34 +++++
 fs/jbd2/jmap.h       |  77 ++++++++++++
 fs/jbd2/journal.c    |  23 ++++-
 include/linux/jbd2.h |   8 ++
 7 files changed, 512 insertions(+), 3 deletions(-)
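
In outline, the lifecycle this patch implements looks like the sketch below.
It is a condensed paraphrase of jbd2_check_cleaner() from cleaner.c further
down, not additional patch code; the function name is invented for
illustration.

/* Condensed paraphrase of the cleaner lifecycle (illustrative only). */
static void cleaner_lifecycle_sketch(journal_t *journal)
{
	if (journal->j_flags & JBD2_CLEANER_ENGAGED) {
		/* Engaged: keep cleaning batches until the tail has moved
		 * past enough transactions (the high watermark). */
		if (jbd2_try_to_move_tail(journal) &&
		    jbd2_high_on_space(journal)) {
			journal->j_flags &= ~JBD2_CLEANER_ENGAGED;
			return;
		}
		journal->j_flags |= JBD2_CLEANING;
		schedule_work(&journal->j_cleaner_ctx->work);
	} else if (jbd2_low_on_space(journal)) {
		/* Dropped below the low watermark: engage the cleaner. */
		journal->j_flags |= JBD2_CLEANER_ENGAGED;
	}
}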
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
index a54f50b3a06e..b6a2dddcc0a7 100644
--- a/fs/jbd2/Makefile
+++ b/fs/jbd2/Makefile
 obj-$(CONFIG_JBD2) += jbd2.o

 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
-		jmap.o
+		jmap.o cleaner.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c125d662777c..b2468698f566 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -386,6 +386,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	tid_t		first_tid;
 	unsigned long	blocknr;

+	if (journal->j_flags & JBD2_LAZY)
+		return 0;
+
 	if (is_journal_aborted(journal))
 		return -EIO;
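
With JBD2_LAZY set, checkpointing no longer drives the tail:
jbd2_cleanup_journal_tail() reports success immediately, and the on-disk
tail advances only when the cleaner below frees whole transactions through
jbd2_update_log_tail().  A minimal standalone sketch of this early-out
pattern (hypothetical names and types, not patch code):

#include <stdbool.h>

/* Hypothetical stand-in types, for illustration only. */
struct jnl { unsigned int flags; };
#define JNL_LAZY 0x200

/* Sketch: a mode flag turns the checkpoint-driven tail update into a
 * no-op, handing ownership of the tail to a background cleaner. */
static int cleanup_tail_sketch(struct jnl *j)
{
	if (j->flags & JNL_LAZY)
		return 0;	/* the cleaner owns the tail in lazy mode */
	/* ... otherwise scan checkpoint lists and advance the tail ... */
	return 0;
}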
diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c
new file mode 100644
index 000000000000..06ec11e1d2dd
--- /dev/null
+++ b/fs/jbd2/cleaner.c
@@ -0,0 +1,368 @@
+#include <linux/blk_types.h>
+#include <linux/jbd2.h>
+#include "jmap.h"
+#include <linux/list.h>
+#include <linux/blkdev.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <trace/events/jbd2.h>
+
+static inline int jbd2_low_on_space(journal_t *journal)
+{
+	int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed);
+	if (x > 10) {
+		trace_jbd2_jmap_printf1("low on space", x);
+		return true;
+	}
+	trace_jbd2_jmap_printf1("not low on space", x);
+	return false;
+}
+
+static inline int jbd2_high_on_space(journal_t *journal)
+{
+	if (atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) {
+		trace_jbd2_jmap_printf("not enough cleaned");
+		return false;
+	}
+	trace_jbd2_jmap_printf("enough cleaned");
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+	return true;
+}
+
+/*
+ * Tries to move the tail forward (hence free space) as long as the transaction
+ * at the tail has only stale blocks.  Returns true if it manages to free a
+ * transaction, false otherwise.
+ */
+static bool jbd2_try_to_move_tail(journal_t *journal)
+{
+	struct transaction_infos *tis = journal->j_transaction_infos;
+	struct transaction_info *ti, *ti1;
+
+	/*
+	 * Advance the tail as far as possible by skipping over transactions
+	 * with no live blocks.
+	 */
+	write_lock(&journal->j_jmap_lock);
+	ti = ti1 = &tis->buf[tis->tail];
+
+	for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) {
+		trace_jbd2_jmap_printf2("cleaned a transaction",
+					tis->tail, ti->tid);
+		tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1);
+		atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned);
+	}
+	write_unlock(&journal->j_jmap_lock);
+
+	if (ti == ti1)
+		return false;
+	/*
+	 * In the worst case, this will end up updating the journal superblock
+	 * after cleaning up every transaction.  Should we avoid it?
+	 */
+	write_unlock(&journal->j_state_lock);
+	jbd2_update_log_tail(journal, ti->tid, ti->offset);
+	write_lock(&journal->j_state_lock);
+
+	return true;
+}
+
+/*
+ * Finds the live blocks at the tail transaction and copies the corresponding
+ * mappings to |ctx->mappings|.  Returns the number of live block mappings
+ * copied.  Should be called with a read lock on |j_jmap_lock|.
+ */
+static int find_live_blocks(struct cleaner_ctx *ctx)
+{
+	journal_t *journal = ctx->journal;
+	struct transaction_infos *tis = journal->j_transaction_infos;
+	struct transaction_info *ti = &tis->buf[tis->tail];
+	struct jmap_entry *je = NULL;
+	int i, nr_live = 0;
+
+	if (unlikely(list_empty(&ti->live_blks)))
+		goto done;
+
+	spin_lock(&ctx->pos_lock);
+	if (!ctx->pos)
+		ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list);
+	je = ctx->pos;
+	spin_unlock(&ctx->pos_lock);
+
+	list_for_each_entry_from(je, &ti->live_blks, list) {
+		if (je->revoked)
+			continue;
+		ctx->mappings[nr_live++] = je->mapping;
+		if (nr_live == CLEANER_BATCH_SIZE)
+			break;
+	}
+
+done:
+	trace_jbd2_jmap_printf1("found live blocks", nr_live);
+	for (i = 0; i < nr_live; ++i)
+		trace_jbd2_jmap_printf2("m",
+					ctx->mappings[i].fsblk,
+					ctx->mappings[i].logblk);
+
+	return nr_live;
+}
+
+static void live_block_read_end_io(struct buffer_head *bh, int uptodate)
+{
+	struct cleaner_ctx *ctx = bh->b_private;
+
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+		if (atomic_dec_and_test(&ctx->nr_pending_reads))
+			wake_up(&ctx->live_block_reads);
+	} else {
+		/* TODO: handle read errors */
+		clear_buffer_uptodate(bh);
+	}
+
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+/*
+ * Reads live blocks in |ctx->mappings| populated by find_live_blocks into
+ * buffer heads in |ctx->bhs|.  Returns true if at least one of the reads goes
+ * out to disk and false otherwise.  If this function returns true then the
+ * client should sleep on the condition variable |ctx->live_block_reads|.  The
+ * client will be woken up when all reads are complete, through the end_io
+ * handler attached to buffer heads read from disk.
+ */
+static bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live)
+{
+	journal_t *journal = ctx->journal;
+	bool plugged = false;
+	struct blk_plug plug;
+	int i, rc;
+
+	for (i = 0; i < nr_live; ++i) {
+		ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk,
+				       journal->j_blocksize);
+		if (unlikely(!ctx->bhs[i])) {
+			rc = -ENOMEM;
+			goto out_err;
+		}
+		if (buffer_uptodate(ctx->bhs[i]))
+			continue;
+		if (!plugged) {
+			plugged = true;
+			blk_start_plug(&plug);
+		}
+		lock_buffer(ctx->bhs[i]);
+		if (buffer_uptodate(ctx->bhs[i])) {
+			unlock_buffer(ctx->bhs[i]);
+			continue;
+		}
+		ctx->bhs[i]->b_private = ctx;
+		ctx->bhs[i]->b_end_io = live_block_read_end_io;
+		get_bh(ctx->bhs[i]);
+		rc = read_block_from_log(ctx->journal, ctx->bhs[i],
+					 REQ_RAHEAD, ctx->mappings[i].logblk);
+		if (unlikely(rc < 0))
+			goto out_err;
+		atomic_inc(&ctx->nr_pending_reads);
+		if (rc)
+			trace_jbd2_jmap_printf2("reading from disk",
+						ctx->mappings[i].fsblk,
+						ctx->mappings[i].logblk);
+		else
+			trace_jbd2_jmap_printf2("cached",
+						ctx->mappings[i].fsblk,
+						ctx->mappings[i].logblk);
+	}
+	if (plugged)
+		blk_finish_plug(&plug);
+	return plugged;
+
+out_err:
+	if (plugged)
+		blk_finish_plug(&plug);
+	jbd2_journal_abort(ctx->journal, rc);
+	return false;
+}
+
+/*
+ * This function finds the live blocks that became stale between the call to
+ * find_live_blocks and now, and discards them.  It returns true if there are
+ * no more live blocks left at the tail transaction.
+ */
+static bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live)
+{
+	journal_t *journal = ctx->journal;
+	struct transaction_infos *tis = journal->j_transaction_infos;
+	struct transaction_info *ti = &tis->buf[tis->tail];
+	struct jmap_entry *je = NULL;
+	int i = 0, j = 0, next = 0;
+
+	trace_jbd2_jmap_printf(__func__);
+	spin_lock(&ctx->pos_lock);
+	BUG_ON(!ctx->pos);
+	je = ctx->pos;
+	list_for_each_entry_from(je, &ti->live_blks, list) {
+		for (j = next; j < nr_live; ++j) {
+			if (je->mapping.fsblk == ctx->mappings[j].fsblk) {
+				next = j + 1;
+				ctx->pos = list_next_entry(je, list);
+				if (je->revoked) {
+					brelse(ctx->bhs[j]);
+					ctx->bhs[j] = NULL;
+					trace_jbd2_jmap_printf2(
+						"revoked",
+						ctx->mappings[j].fsblk,
+						ctx->mappings[j].logblk);
+				}
+				break;
+			} else {
+				trace_jbd2_jmap_printf2(
+					"moved to another list",
+					ctx->mappings[j].fsblk,
+					ctx->mappings[j].logblk);
+				brelse(ctx->bhs[j]);
+				ctx->bhs[j] = NULL;
+			}
+		}
+		if (++i == nr_live || j == nr_live)
+			break;
+	}
+	spin_unlock(&ctx->pos_lock);
+
+	/*
+	 * We have exited the loop.  If we haven't processed all the entries in
+	 * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have
+	 * not processed |nr_live| entries from the live blocks list at the
+	 * tail, that is if (i < nr_live) at the exit, then the live blocks
+	 * list has shrunk and the tail transaction has no live blocks left.
+	 */
+	return j < nr_live && i < nr_live;
+}
+
+static void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle,
+			       int nr_live)
+{
+	int err, i;
+
+	trace_jbd2_jmap_printf(__func__);
+	for (i = 0; i < nr_live; ++i) {
+		if (!ctx->bhs[i])
+			continue;
+		trace_jbd2_jmap_printf2("attaching",
+					ctx->mappings[i].fsblk,
+					ctx->mappings[i].logblk);
+		err = jbd2_journal_get_write_access(handle, ctx->bhs[i]);
+		if (!err)
+			err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]);
+		if (err) {
+			jbd2_journal_abort(ctx->journal, err);
+			break;
+		}
+	}
+}
+
+/*
+ * Read the live blocks from the tail transaction and attach them to the
+ * current transaction.
+ */
+void jbd2_jmap_do_clean_batch(struct work_struct *work)
+{
+	struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);
+	journal_t *journal = ctx->journal;
+	bool wake_up_commit_thread = true;
+	handle_t *handle = NULL;
+	int nr_live, err;
+
+	read_lock(&journal->j_jmap_lock);
+	nr_live = find_live_blocks(ctx);
+	read_unlock(&journal->j_jmap_lock);
+
+	if (nr_live < CLEANER_BATCH_SIZE)
+		wake_up_commit_thread = false;
+	if (!nr_live)
+		goto done;
+
+	read_live_blocks(ctx, nr_live);
+	wait_event(ctx->live_block_reads,
+		   atomic_read(&ctx->nr_pending_reads) <= 0);
+
+	handle = jbd2_journal_start(journal, nr_live);
+	if (IS_ERR(handle)) {
+		jbd2_journal_abort(journal, PTR_ERR(handle));
+		goto done;
+	}
+
+	read_lock(&journal->j_jmap_lock);
+	if (discard_stale_blocks(ctx, nr_live))
+		wake_up_commit_thread = false;
+	read_unlock(&journal->j_jmap_lock);
+	/*
+	 * I'm not sure why this function was under the jmap_lock
+	 * previously, but it can't be, since it calls functions that
+	 * can block due to memory allocation.  I don't think it needs
+	 * to be protected, since it appears that ctx->mapping is only
+	 * used by the cleaner code, and so it can't be run multiple
+	 * times in parallel.
+	 */
+	attach_live_blocks(ctx, handle, nr_live);
+
+	err = jbd2_journal_stop(handle);
+	if (err)
+		jbd2_journal_abort(journal, err);
+
+done:
+	atomic_set(&ctx->batch_in_progress, 0);
+	atomic_inc(&ctx->nr_txns_cleaned);
+	if (wake_up_commit_thread) {
+		trace_jbd2_jmap_printf("waking up commit thread");
+		wake_up(&journal->j_wait_commit);
+	} else {
+		trace_jbd2_jmap_printf("not waking up commit thread");
+		spin_lock(&ctx->pos_lock);
+		ctx->pos = NULL;
+		spin_unlock(&ctx->pos_lock);
+	}
+	write_lock(&journal->j_state_lock);
+	journal->j_flags &= ~JBD2_CLEANING;
+	write_unlock(&journal->j_state_lock);
+}
+
+/*
+ * Called by the commit thread to see if we need to do any cleaning work.
+ * Called with j_state_lock write locked.
+ */
+void jbd2_check_cleaner(journal_t *journal)
+{
+	/*
+	 * If there is cleaning going on in the workqueue, don't check
+	 * until we're done.
+	 */
+	if (journal->j_flags & JBD2_CLEANING)
+		return;
+
+	if (journal->j_flags & JBD2_STOP_CLEANING) {
+disengage_cleaner:
+		journal->j_flags &= ~JBD2_CLEANER_ENGAGED;
+		return;
+	}
+
+	if (journal->j_flags & JBD2_CLEANER_ENGAGED) {
+		if (jbd2_try_to_move_tail(journal) &&
+		    jbd2_high_on_space(journal))
+			goto disengage_cleaner;
+schedule_batch:
+		journal->j_flags |= JBD2_CLEANING;
+		schedule_work(&journal->j_cleaner_ctx->work);
+		return;
+	}
+
+	if (jbd2_low_on_space(journal)) {
+		journal->j_flags |= JBD2_CLEANER_ENGAGED;
+		goto schedule_batch;
+	}
+}
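
Taken together, a clean batch is a four-stage pipeline: find_live_blocks()
snapshots up to CLEANER_BATCH_SIZE live mappings under j_jmap_lock,
read_live_blocks() pulls them in from the log under a block plug,
discard_stale_blocks() drops whatever went stale while the reads were in
flight, and attach_live_blocks() re-dirties the survivors in a fresh handle
so the next commit rewrites them at the head.  The completion handshake
that bridges the second and third stages can be shown in isolation; this is
a sketch with standalone names, not patch code:

/* Standalone sketch of the submit/complete handshake used by
 * read_live_blocks() and live_block_read_end_io(). */
static atomic_t nr_pending = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(reads_done);

static void read_submitted(void)
{
	atomic_inc(&nr_pending);	/* taken before completion can fire */
}

static void read_completed(void)
{
	/* Only the final completion performs the wakeup. */
	if (atomic_dec_and_test(&nr_pending))
		wake_up(&reads_done);
}

static void wait_for_batch(void)
{
	/* Returns immediately if every read was satisfied from cache. */
	wait_event(reads_done, atomic_read(&nr_pending) <= 0);
}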
diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
index 7de6f4a0a1dc..0e759cc095f5 100644
--- a/fs/jbd2/jmap.c
+++ b/fs/jbd2/jmap.c
@@ -91,8 +91,17 @@ static int process_existing_mappings(journal_t *journal,
 		 * We are either deleting the entry because it was revoked, or
 		 * we are moving it to the live blocks list of this transaction.
 		 * In either case, we remove it from its existing list.
+		 * However, before removing it, we check whether this is the
+		 * entry in the live blocks list of the tail transaction whose
+		 * pointer is cached by the cleaner, and if so, we update the
+		 * cached pointer.
 		 */
+		spin_lock(&journal->j_cleaner_ctx->pos_lock);
+		if (je == journal->j_cleaner_ctx->pos) {
+			journal->j_cleaner_ctx->pos = list_next_entry(je, list);
+		}
+		spin_unlock(&journal->j_cleaner_ctx->pos_lock);
 		rb_erase(&je->rb_node, &journal->j_jmap);
@@ -216,6 +225,8 @@ void jbd2_finish_transaction_infos(journal_t *journal)
 	struct transaction_infos *tis = journal->j_transaction_infos;

+	atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
+
 	write_lock(&journal->j_jmap_lock);
 	tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1);
 	write_unlock(&journal->j_jmap_lock);
@@ -243,6 +254,8 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
 	BUG_ON(!list_empty(&ti->live_blks));

+	atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
+
 	write_lock(&journal->j_jmap_lock);
 	nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
@@ -489,11 +502,32 @@ int jbd2_smr_journal_init(journal_t *journal)
 	journal->j_jmap = RB_ROOT;
 	rwlock_init(&journal->j_jmap_lock);
+	journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
+					 GFP_KERNEL);
+	if (!journal->j_cleaner_ctx)
+		return -ENOMEM;
+
+	journal->j_cleaner_ctx->journal = journal;
+	journal->j_cleaner_ctx->pos = NULL;
+	spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
+	atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+	atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+	atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+	init_waitqueue_head(&journal->j_cleaner_ctx->live_block_reads);
+	INIT_WORK(&journal->j_cleaner_ctx->work, jbd2_jmap_do_clean_batch);
+
 	return jbd2_init_transaction_infos(journal);
 }

 void jbd2_smr_journal_exit(journal_t *journal)
 {
+	if (journal->j_cleaner_ctx) {
+		atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+		flush_work(&journal->j_cleaner_ctx->work);
+		kfree(journal->j_cleaner_ctx);
+		journal->j_cleaner_ctx = NULL;
+	}
 	jbd2_free_transaction_infos(journal);
 }
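
The |pos| handoff added to process_existing_mappings() is the one place the
commit path and the cleaner share mutable state.  The invariant, as a
standalone sketch (names taken from the patch, body condensed, function
name invented):

/* Condensed sketch: before the commit path unlinks a jmap entry, it
 * slides the cleaner's cached cursor off that entry under pos_lock, so
 * the cleaner never resumes iteration from a freed list node. */
static void invalidate_cleaner_pos(struct cleaner_ctx *ctx,
				   struct jmap_entry *je)
{
	spin_lock(&ctx->pos_lock);
	if (ctx->pos == je)
		ctx->pos = list_next_entry(je, list);
	spin_unlock(&ctx->pos_lock);
}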
diff --git a/fs/jbd2/jmap.h b/fs/jbd2/jmap.h
index 91564ce9bbda..a44f15152536 100644
--- a/fs/jbd2/jmap.h
+++ b/fs/jbd2/jmap.h
@@ -125,4 +125,81 @@ extern void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk);
 extern int read_block_from_log(journal_t *journal, struct buffer_head *bh,
 			       int op_flags, sector_t blk);

+extern void jbd2_jmap_do_clean_batch(struct work_struct *work);
+
+/*
+ * Cleaner stuff is below.
+ */
+
+/*
+ * Number of blocks to read at once, for cleaning.
+ */
+#define CLEANER_BATCH_SIZE 16
+
+/*
+ * Context structure for the cleaner.
+ */
+struct cleaner_ctx {
+	/*
+	 * We set |cleaning| to true once we drop below the low watermark, and
+	 * it stays true until we rise above the high watermark.  It is
+	 * accessed by the commit thread and the foreground kernel threads
+	 * during journal destruction, therefore it is atomic.
+	 */
+	atomic_t cleaning;
+
+	/*
+	 * We clean in batches of blocks.  This flag indicates if we are
+	 * currently cleaning a batch.  It is accessed by the commit thread
+	 * and the cleaner thread, therefore it is atomic.
+	 */
+	atomic_t batch_in_progress;
+
+	/*
+	 * We find live blocks to clean from the live blocks list of the
+	 * transaction at the tail.  This list can be larger than our batch
+	 * size and we may need several attempts to process it.  We cache the
+	 * position of the next entry to start from in |pos|.  Since the
+	 * cleaner thread can run concurrently with the commit thread that can
+	 * modify the live blocks list of the transaction at the tail (for
+	 * example, if it needs to drop a revoked entry, or if |pos| points to
+	 * an entry that has been updated and should move from the live blocks
+	 * list of the transaction at the tail to the live blocks list of the
+	 * current transaction), we protect |pos| with |pos_lock|.
+	 */
+	struct jmap_entry *pos;
+	spinlock_t pos_lock;
+
+	/*
+	 * Live block mappings for the blocks that we copy in a batch.
+	 */
+	struct blk_mapping mappings[CLEANER_BATCH_SIZE];
+
+	/*
+	 * Buffer heads for the live blocks read in a batch.
+	 */
+	struct buffer_head *bhs[CLEANER_BATCH_SIZE];
+
+	/*
+	 * Number of pending reads in a batch.  Every submitted read
+	 * increments it and every completed read decrements it.
+	 */
+	atomic_t nr_pending_reads;
+
+	/*
+	 * The cleaner thread sleeps on this wait queue until the last
+	 * completed read wakes it up.
+	 */
+	wait_queue_head_t live_block_reads;
+
+	/* TODO: temporary for debugging, remove once done. */
+	atomic_t nr_txns_committed;
+	atomic_t nr_txns_cleaned;
+
+	journal_t *journal;
+	struct work_struct work;
+};
+
+void jbd2_check_cleaner(journal_t *journal);
+
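
One implicit assumption worth noting: the tail advance in
jbd2_try_to_move_tail() wraps with `& (MAX_LIVE_TRANSACTIONS - 1)`, which
is only correct when MAX_LIVE_TRANSACTIONS is a power of two.  A
compile-time guard along these lines (not part of the patch) would enforce
that:

static inline void check_live_txn_ring_size(void)
{
	/* (N & (N - 1)) == 0 iff N is a nonzero power of two. */
	BUILD_BUG_ON(MAX_LIVE_TRANSACTIONS & (MAX_LIVE_TRANSACTIONS - 1));
}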
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 114c7636d706..5fdcaff927cf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -230,10 +230,16 @@ static int kjournald2(void *arg)
 		del_timer_sync(&journal->j_commit_timer);
 		jbd2_journal_commit_transaction(journal);
 		write_lock(&journal->j_state_lock);
-		goto loop;
 	}

 	wake_up(&journal->j_wait_done_commit);
+
+	if (journal->j_flags & JBD2_LAZY)
+		jbd2_check_cleaner(journal);
+
+	if (journal->j_commit_sequence != journal->j_commit_request)
+		goto loop;
+
 	if (freezing(current)) {
 		/*
 		 * The simpler the better.  Flushing journal isn't a
@@ -262,6 +268,9 @@ static int kjournald2(void *arg)
 		if (journal->j_flags & JBD2_UNMOUNT)
 			should_sleep = 0;
+		if ((journal->j_flags & JBD2_CLEANER_ENGAGED) &&
+		    !(journal->j_flags & JBD2_CLEANING))
+			should_sleep = 0;
 		if (should_sleep) {
 			write_unlock(&journal->j_state_lock);
@@ -307,14 +316,24 @@ static int jbd2_journal_start_thread(journal_t *journal)
 static void journal_kill_thread(journal_t *journal)
 {
 	write_lock(&journal->j_state_lock);
-	journal->j_flags |= JBD2_UNMOUNT;
+
+	journal->j_flags |= JBD2_STOP_CLEANING;
+	while (journal->j_flags & JBD2_CLEANING) {
+		write_unlock(&journal->j_state_lock);
+		wake_up(&journal->j_wait_commit);
+		wait_event(journal->j_wait_done_commit,
+			   (journal->j_flags & JBD2_CLEANING) == 0);
+		write_lock(&journal->j_state_lock);
+	}
+
+	journal->j_flags |= JBD2_UNMOUNT;

 	while (journal->j_task) {
 		write_unlock(&journal->j_state_lock);
 		wake_up(&journal->j_wait_commit);
 		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
 		write_lock(&journal->j_state_lock);
 	}
 	write_unlock(&journal->j_state_lock);
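
The two phases in journal_kill_thread() must stay in this order: a clean
batch runs in the workqueue and holds a journal handle, so it has to drain
before JBD2_UNMOUNT stops the commit thread that would service it.
Schematically, with the locking elided (a condensed paraphrase of the hunk
above, not additional patch code):

/* Condensed paraphrase of the shutdown ordering (illustrative only). */
static void shutdown_order_sketch(journal_t *journal)
{
	/* Phase 1: forbid new batches and drain the in-flight one. */
	journal->j_flags |= JBD2_STOP_CLEANING;
	wait_event(journal->j_wait_done_commit,
		   (journal->j_flags & JBD2_CLEANING) == 0);

	/* Phase 2: only then stop kjournald2 itself. */
	journal->j_flags |= JBD2_UNMOUNT;
	wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
}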
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a53c7d333199..bb994983cdba 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -786,6 +786,11 @@ struct journal_s
 	rwlock_t		j_jmap_lock;

+	/**
+	 * @j_cleaner_ctx: Cleaner state
+	 */
+	struct cleaner_ctx	*j_cleaner_ctx;
+
 	/**
 	 * @j_format_version: Version of the superblock format.
 	 */
@@ -1254,6 +1259,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3)
 #define JBD2_REC_ERR	0x080	/* The errno in the sb has been recorded */
 #define JBD2_NO_CLEANUP	0x100	/* Don't flush empty the journal on shutdown */
 #define JBD2_LAZY	0x200	/* Do lazy journalling */
+#define JBD2_CLEANING	0x400	/* Lazy journalling cleaning in progress */
+#define JBD2_CLEANER_ENGAGED	0x800	/* Cleaner has been engaged */
+#define JBD2_STOP_CLEANING	0x1000	/* Request the cleaning thread to stop */

 /*
  * Function declarations for the journaling transaction and buffer
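
All three new flags live in journal->j_flags next to JBD2_LAZY and must be
distinct bits; they are read and written under j_state_lock.  For instance,
a helper testing whether a clean batch is in flight might look like this
(illustrative, not part of the patch):

/* Illustrative helper (not in the patch): j_state_lock is an rwlock,
 * so a pure test of j_flags can take the read side. */
static bool jbd2_cleaning_in_flight(journal_t *journal)
{
	bool in_flight;

	read_lock(&journal->j_state_lock);
	in_flight = (journal->j_flags & JBD2_CLEANING) != 0;
	read_unlock(&journal->j_state_lock);

	return in_flight;
}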