From: Abutalib Aghayev <agayev@cs.cmu.edu>

An experimental cleaner. Copy the live blocks from the transaction at the
tail in batches to the transaction at the head. After a commit ends, check
whether free space is below the low watermark, and if so start cleaning until
free space rises above the high watermark.
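
In rough terms, the hook added to kjournald2() below amounts to the
following simplified sketch (locking and the unmount path omitted):

    if ((journal->j_flags & JBD2_LAZY) &&
        (jbd2_cleaning(journal) || jbd2_low_on_space(journal))) {
            /* Tail moved forward and we are back above the high watermark? */
            if (jbd2_try_to_move_tail(journal) && jbd2_high_on_space(journal))
                    jbd2_stop_cleaning(journal);
            else
                    jbd2_start_cleaning(journal);   /* schedule another batch */
    }
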
Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/Makefile     |   2 +-
 fs/jbd2/checkpoint.c |   3 +
 fs/jbd2/cleaner.c    | 371 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/jmap.c       |  32 +++
 fs/jbd2/journal.c    |  12 +++
 include/linux/jbd2.h |   6 +-
 include/linux/jmap.h |  82 ++++++++++++++++++
 7 files changed, 506 insertions(+), 2 deletions(-)
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
index a54f50b3a06e..b6a2dddcc0a7 100644
--- a/fs/jbd2/Makefile
+++ b/fs/jbd2/Makefile
 obj-$(CONFIG_JBD2) += jbd2.o

 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 4055f51617ef..b60bbf58e8f7 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -389,6 +389,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 unsigned long blocknr;

+ if (journal->j_flags & JBD2_LAZY)
+	return 0;
+
 if (is_journal_aborted(journal))
diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c
new file mode 100644
index 000000000000..31abb4fa2706
--- /dev/null
+++ b/fs/jbd2/cleaner.c
@@ -0,0 +1,371 @@
+#include <linux/blk_types.h>
+#include <linux/jbd2.h>
+#include <linux/jmap.h>
+#include <linux/list.h>
+#include <linux/blkdev.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <trace/events/jbd2.h>
+
+inline int jbd2_low_on_space(journal_t *journal)
+ int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed);
+ trace_jbd2_jmap_printf1("low on space", x);
+ trace_jbd2_jmap_printf1("not low on space", x);
+inline int jbd2_high_on_space(journal_t *journal)
+ if (atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) {
+ trace_jbd2_jmap_printf("not enough cleaned");
+ trace_jbd2_jmap_printf("enough cleaned");
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+
+inline bool jbd2_cleaning(journal_t *journal)
+ return atomic_read(&journal->j_cleaner_ctx->cleaning);
+
+inline void jbd2_stop_cleaning(journal_t *journal)
+ trace_jbd2_jmap_printf("stopped cleaning");
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+
+inline void jbd2_start_cleaning(journal_t *journal)
+ struct cleaner_ctx *ctx = journal->j_cleaner_ctx;
+ trace_jbd2_jmap_printf("started cleaning");
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 1);
+ /* Schedule the next batch of cleaning */
+ if (!jbd2_cleaning_batch_complete(journal)) {
+ trace_jbd2_jmap_printf("not scheduling a new batch");
+ trace_jbd2_jmap_printf("scheduling a batch");
+ BUG_ON(atomic_read(&ctx->nr_pending_reads));
+ atomic_set(&ctx->batch_in_progress, 1);
+ schedule_work(&ctx->work);
+
+inline bool jbd2_cleaning_batch_complete(journal_t *journal)
+ return jbd2_cleaning(journal) &&
+ atomic_read(&journal->j_cleaner_ctx->batch_in_progress) == 0;
+ * Tries to move the tail forward (and hence free space) as long as the
+ * transaction at the tail has only stale blocks. Returns true if it manages
+ * to free a transaction, false otherwise.
+bool jbd2_try_to_move_tail(journal_t *journal)
+ struct transaction_infos *tis = journal->j_transaction_infos;
+ struct transaction_info *ti, *ti1;
+ * Advance the tail as far as possible by skipping over transactions
+ * with no live blocks.
+ write_lock(&journal->j_jmap_lock);
+ ti = ti1 = &tis->buf[tis->tail];
+ for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) {
+ trace_jbd2_jmap_printf2("cleaned a transaction",
+ tis->tail, ti->tid);
+ tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1);
+ atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned);
+ write_unlock(&journal->j_jmap_lock);
+ * In the worst case, this will end up updating the journal superblock
+ * after cleaning up every transaction. Should we avoid it?
+ write_unlock(&journal->j_state_lock);
+ jbd2_update_log_tail(journal, ti->tid, ti->offset);
+ write_lock(&journal->j_state_lock);
+ * Finds the live blocks at the tail transaction and copies the corresponding
+ * mappings to |ctx->mappings|. Returns the number of live block mappings
+ * copied. Should be called with a read lock on |j_jmap_lock|.
+static int find_live_blocks(struct cleaner_ctx *ctx)
+ journal_t *journal = ctx->journal;
+ struct transaction_infos *tis = journal->j_transaction_infos;
+ struct transaction_info *ti = &tis->buf[tis->tail];
+ struct jmap_entry *je = NULL;
+ int i, nr_live = 0;
+ if (unlikely(list_empty(&ti->live_blks)))
+ spin_lock(&ctx->pos_lock);
+ ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list);
+ spin_unlock(&ctx->pos_lock);
+ list_for_each_entry_from(je, &ti->live_blks, list) {
+ ctx->mappings[nr_live++] = je->mapping;
+ if (nr_live == CLEANER_BATCH_SIZE)
+ trace_jbd2_jmap_printf1("found live blocks", nr_live);
+ for (i = 0; i < nr_live; ++i)
+ trace_jbd2_jmap_printf2("m",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+static void live_block_read_end_io(struct buffer_head *bh, int uptodate)
+ struct cleaner_ctx *ctx = bh->b_private;
+ set_buffer_uptodate(bh);
+ if (atomic_dec_and_test(&ctx->nr_pending_reads))
+ complete(&ctx->live_block_reads);
+ clear_buffer_uptodate(bh);
+
+ * Reads live blocks in |ctx->mappings| populated by find_live_blocks into
+ * buffer heads in |ctx->bhs|. Returns true if at least one of the reads goes
+ * out to disk and false otherwise. If this function returns true then the
+ * client should sleep on the condition variable |ctx->live_block_reads|. The
+ * client will be woken up when all reads are complete, through the end_io
+ * handler attached to buffer heads read from disk.
+static bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live)
+ journal_t *journal = ctx->journal;
+ struct blk_plug plug;
+ bool plugged = false;
+ for (i = 0; i < nr_live; ++i) {
+ ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk,
+ journal->j_blocksize);
+ if (unlikely(!ctx->bhs[i]))
+ if (buffer_uptodate(ctx->bhs[i]))
+ blk_start_plug(&plug);
+ lock_buffer(ctx->bhs[i]);
+ ctx->bhs[i]->b_private = ctx;
+ ctx->bhs[i]->b_end_io = live_block_read_end_io;
+ atomic_inc(&ctx->nr_pending_reads);
+ get_bh(ctx->bhs[i]);
+ rc = read_block_from_log(ctx->journal, ctx->bhs[i],
+ REQ_RAHEAD, ctx->mappings[i].logblk);
+ if (unlikely(rc < 0))
+ trace_jbd2_jmap_printf2("reading from disk",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ trace_jbd2_jmap_printf2("cached",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ blk_finish_plug(&plug);
+ jbd2_journal_abort(ctx->journal, -ENOMEM);
+ * This function finds the live blocks that became stale between the call to
+ * find_live_blocks and now, and discards them. It returns true if there are no
+ * more live blocks left at the tail transaction.
+static bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live)
+ journal_t *journal = ctx->journal;
+ struct transaction_infos *tis = journal->j_transaction_infos;
+ struct transaction_info *ti = &tis->buf[tis->tail];
+ struct jmap_entry *je = NULL;
+ int i = 0, j = 0, next = 0;
+ trace_jbd2_jmap_printf(__func__);
+ spin_lock(&ctx->pos_lock);
+ list_for_each_entry_from(je, &ti->live_blks, list) {
+ for (j = next; j < nr_live; ++j) {
+ if (je->mapping.fsblk == ctx->mappings[j].fsblk) {
+ ctx->pos = list_next_entry(je, list);
+ brelse(ctx->bhs[j]);
+ ctx->bhs[j] = NULL;
+ trace_jbd2_jmap_printf2(
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ trace_jbd2_jmap_printf2(
+ "moved to another list",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ brelse(ctx->bhs[j]);
+ ctx->bhs[j] = NULL;
+ if (++i == nr_live || j == nr_live)
+ spin_unlock(&ctx->pos_lock);
+ * We have exited the loop. If we haven't processed all the entries in
+ * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have
+ * not processed |nr_live| entries from the live blocks list at the
+ * tail, that is if (i < nr_live) at the exit, then the live blocks list
+ * has shrunk and the tail transaction has no live blocks left.
+ return j < nr_live && i < nr_live;
+
+static void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle,
+ trace_jbd2_jmap_printf(__func__);
+ for (i = 0; i < nr_live; ++i) {
+ trace_jbd2_jmap_printf2("attaching",
+ ctx->mappings[i].fsblk,
+ ctx->mappings[i].logblk);
+ err = jbd2_journal_get_write_access(handle, ctx->bhs[i]);
+ err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]);
+ jbd2_journal_abort(ctx->journal, err);
+ * Read the live blocks from the tail transaction and attach them to the
+ * current transaction.
+void jbd2_jmap_do_clean_batch(struct work_struct *work)
+ struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);
+ bool wake_up_commit_thread = true;
+ handle_t *handle = NULL;
+ read_lock(&ctx->journal->j_jmap_lock);
+ nr_live = find_live_blocks(ctx);
+ read_unlock(&ctx->journal->j_jmap_lock);
+ if (nr_live < CLEANER_BATCH_SIZE)
+ wake_up_commit_thread = false;
+ reinit_completion(&ctx->live_block_reads);
+ if (read_live_blocks(ctx, nr_live)) {
+ trace_jbd2_jmap_printf("waiting for completion");
+ wait_for_completion(&ctx->live_block_reads);
+ trace_jbd2_jmap_printf("not waiting for completion");
+ while (atomic_read(&ctx->nr_pending_reads)) {
+ /* Should never trigger, but we could do better by
+ * converting to a wait channel instead of the completion */
+ pr_err("JBD2: clean_batch: completion failed, recovering\n");
+ handle = jbd2_journal_start(ctx->journal, nr_live);
+ if (IS_ERR(handle)) {
+ jbd2_journal_abort(ctx->journal, PTR_ERR(handle));
+ read_lock(&ctx->journal->j_jmap_lock);
+ if (discard_stale_blocks(ctx, nr_live))
+ wake_up_commit_thread = false;
+ read_unlock(&ctx->journal->j_jmap_lock);
+ * I'm not sure why this function was under the jmap_lock
+ * previously, but it can't be, since it calls functions that
+ * can block due to memory allocation. I don't think it needs
+ * to be protected, since it appears that ctx->mappings is only
+ * used by the cleaner code, and so it can't be run multiple
+ * times in parallel.
+ attach_live_blocks(ctx, handle, nr_live);
+ err = jbd2_journal_stop(handle);
+ jbd2_journal_abort(ctx->journal, err);
+ atomic_set(&ctx->batch_in_progress, 0);
+ atomic_inc(&ctx->nr_txns_cleaned);
+ if (wake_up_commit_thread) {
+ trace_jbd2_jmap_printf("waking up commit thread");
+ wake_up(&ctx->journal->j_wait_commit);
+ trace_jbd2_jmap_printf("not waking up commit thread");
+ spin_lock(&ctx->pos_lock);
+ spin_unlock(&ctx->pos_lock);
diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
index 18dd0e127aff..9b317608f70a 100644
--- a/fs/jbd2/jmap.c
+++ b/fs/jbd2/jmap.c
@@ -91,8 +91,17 @@ static int process_existing_mappings(journal_t *journal,
 * We are either deleting the entry because it was revoked, or
 * we are moving it to the live blocks list of this transaction.
 * In either case, we remove it from its existing list.
+ * However, before removing it, we check whether it is the entry in
+ * the live blocks list of the tail transaction whose position is
+ * cached by the cleaner, and if so we advance the cleaner's cached
+ * pointer past it.
+ spin_lock(&journal->j_cleaner_ctx->pos_lock);
+ if (je == journal->j_cleaner_ctx->pos) {
+ journal->j_cleaner_ctx->pos = list_next_entry(je, list);
+ spin_unlock(&journal->j_cleaner_ctx->pos_lock);
 rb_erase(&je->rb_node, &journal->j_jmap);
@@ -245,6 +254,8 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
 BUG_ON(!list_empty(&ti->live_blks));

+ atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);

 write_lock(&journal->j_jmap_lock);
 nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
@@ -489,11 +500,32 @@ int jbd2_smr_journal_init(journal_t *journal)
 journal->j_jmap = RB_ROOT;
 rwlock_init(&journal->j_jmap_lock);
+ journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
+ if (!journal->j_cleaner_ctx)
+ journal->j_cleaner_ctx->journal = journal;
+ journal->j_cleaner_ctx->pos = NULL;
+ spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+ atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
+ atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
+ init_completion(&journal->j_cleaner_ctx->live_block_reads);
+ INIT_WORK(&journal->j_cleaner_ctx->work, jbd2_jmap_do_clean_batch);
 return jbd2_init_transaction_infos(journal);

 void jbd2_smr_journal_exit(journal_t *journal)
+ if (journal->j_cleaner_ctx) {
+ atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
+ flush_work(&journal->j_cleaner_ctx->work);
+ kfree(journal->j_cleaner_ctx);
+ journal->j_cleaner_ctx = NULL;
 jbd2_free_transaction_infos(journal);

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 493b72c60335..ab7b9bbc9296 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -227,6 +227,15 @@ static int kjournald2(void *arg)
 wake_up(&journal->j_wait_done_commit);

+ if ((journal->j_flags & JBD2_LAZY) &&
+ (jbd2_cleaning(journal) || jbd2_low_on_space(journal))) {
+ if (jbd2_try_to_move_tail(journal) && jbd2_high_on_space(journal))
+ jbd2_stop_cleaning(journal);
+ else
+ jbd2_start_cleaning(journal);
+ }
 if (freezing(current)) {
 * The simpler the better. Flushing journal isn't a
@@ -255,6 +264,9 @@ static int kjournald2(void *arg)
 if (journal->j_flags & JBD2_UNMOUNT)
 break;

+ if ((journal->j_flags & JBD2_LAZY) &&
+ jbd2_cleaning_batch_complete(journal))
+ break;

 write_unlock(&journal->j_state_lock);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 771588026353..3112fba26598 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -735,7 +735,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
 * @j_superblock: Second part of superblock buffer
 * @j_map: A map from file system blocks to log blocks
 * @j_transaction_infos: An array of information structures per live transaction
- * @j_map_lock: Protect j_jmap and j_transaction_infos
+ * @j_jmap_lock: Protect j_jmap and j_transaction_infos
+ * @j_cleaner_ctx: Cleaner state
 * @j_format_version: Version of the superblock format
 * @j_state_lock: Protect the various scalars in the journal
 * @j_barrier_count: Number of processes waiting to create a barrier lock
@@ -820,6 +821,9 @@ struct journal_s
 /* Protect j_jmap and j_transaction_infos */
 rwlock_t j_jmap_lock;

+ /* Cleaner state */
+ struct cleaner_ctx *j_cleaner_ctx;

 /* Version of the superblock format */
 int j_format_version;
diff --git a/include/linux/jmap.h b/include/linux/jmap.h
index 638f25df8302..5af1fec4ab95 100644
--- a/include/linux/jmap.h
+++ b/include/linux/jmap.h
@@ -132,5 +132,87 @@ extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
 extern void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,

+extern void jbd2_jmap_do_clean_batch(struct work_struct *work);
+ * Cleaner stuff is below.
+ * Number of blocks to read at once, for cleaning.
+#define CLEANER_BATCH_SIZE 16
+ * Context structure for the cleaner.
+struct cleaner_ctx {
+ * Set to true once we drop below the low watermark, and stays true until
+ * we rise above the high watermark. It is accessed by the commit
+ * thread and by foreground kernel threads during journal
+ * destruction, therefore it is atomic.
+ * We clean in batches of blocks. This flag indicates if we are
+ * currently cleaning a batch. It is accessed by the commit thread and
+ * the cleaner thread, therefore it is atomic.
+ atomic_t batch_in_progress;
+ * We find live blocks to clean from the live blocks list of the
+ * transaction at the tail. This list can be larger than our batch size
+ * and we may need several attempts to process it. We cache the
+ * position of the next entry to start from in |pos|. Since the cleaner
+ * thread can run concurrently with the commit thread, which can modify
+ * the live blocks list of the transaction at the tail (for example, if
+ * it needs to drop a revoked entry, or if |pos| points to an entry that
+ * has been updated and should move from the live blocks list of the
+ * transaction at the tail to the live blocks list of the current
+ * transaction), we protect |pos| with |pos_lock|.
+ struct jmap_entry *pos;
+ spinlock_t pos_lock;
+ * Live block mappings for the blocks that we copy in a batch.
+ struct blk_mapping mappings[CLEANER_BATCH_SIZE];
+ * Buffer heads for the live blocks read in a batch.
+ struct buffer_head *bhs[CLEANER_BATCH_SIZE];
+ * Number of pending reads in a batch. Every submitted read increments
+ * it and every completed read decrements it.
+ atomic_t nr_pending_reads;
+ * The cleaner thread sleeps on this condition variable until the last
+ * completed read wakes up the cleaner thread.
+ struct completion live_block_reads;
+ /* TODO: temporary for debugging, remove once done. */
+ atomic_t nr_txns_committed;
+ atomic_t nr_txns_cleaned;
+ journal_t *journal;
+ struct work_struct work;
+
+extern int jbd2_low_on_space(journal_t *journal);
+extern int jbd2_high_on_space(journal_t *journal);
+extern bool jbd2_cleaning(journal_t *journal);
+extern void jbd2_stop_cleaning(journal_t *journal);
+extern void jbd2_start_cleaning(journal_t *journal);
+extern bool jbd2_cleaning_batch_complete(journal_t *journal);
+extern bool jbd2_try_to_move_tail(journal_t *journal);
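
For reference, one cleaning batch performed by jbd2_jmap_do_clean_batch()
boils down to the following simplified sketch (tracing, error handling, and
locking omitted; see fs/jbd2/cleaner.c above for the real flow):

    /* Copy up to CLEANER_BATCH_SIZE live mappings from the tail transaction. */
    nr_live = find_live_blocks(ctx);

    /* Read those blocks from the log and wait for any reads that hit disk. */
    reinit_completion(&ctx->live_block_reads);
    if (read_live_blocks(ctx, nr_live))
            wait_for_completion(&ctx->live_block_reads);

    /* Re-log the blocks that are still live under a regular handle. */
    handle = jbd2_journal_start(ctx->journal, nr_live);
    discard_stale_blocks(ctx, nr_live);      /* drop mappings that went stale meanwhile */
    attach_live_blocks(ctx, handle, nr_live);
    jbd2_journal_stop(handle);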