3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 An experimental cleaner. Copy the live blocks from the transaction at the
6 tail in batches to the transaction at the head. After a commit ends, check
7 if free space is below the low watermark and start cleaning until free space is
8 above the high watermark.
10 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
13 fs/jbd2/Makefile | 2 +-
14 fs/jbd2/cleaner.c | 358 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 fs/jbd2/jmap.c | 42 +++++++++--
16 fs/jbd2/journal.c | 12 ++++
17 include/linux/jbd2.h | 6 +-
18 include/linux/jmap.h | 111 +++++++++++++++++++++++++----
19 6 files changed, 510 insertions(+), 21 deletions(-)
21 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
22 index a54f50b3a06e..b6a2dddcc0a7 100644
23 --- a/fs/jbd2/Makefile
24 +++ b/fs/jbd2/Makefile
26 obj-$(CONFIG_JBD2) += jbd2.o
28 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
31 diff --git a/fs/jbd2/cleaner.c b/fs/jbd2/cleaner.c
33 index 000000000000..6d02aafcded9
35 +++ b/fs/jbd2/cleaner.c
37 +#include <linux/blk_types.h>
38 +#include <linux/jbd2.h>
39 +#include <linux/jmap.h>
40 +#include <linux/list.h>
41 +#include <linux/blkdev.h>
42 +#include <linux/completion.h>
43 +#include <trace/events/jbd2.h>
45 +inline int low_on_space(journal_t *journal)
47 + int x = atomic_read(&journal->j_cleaner_ctx->nr_txns_committed);
49 + trace_jbd2_jmap_printf1("low on space", x);
52 + trace_jbd2_jmap_printf1("not low on space", x);
56 +inline int high_on_space(journal_t *journal)
58 + if (atomic_read(&journal->j_cleaner_ctx->nr_txns_cleaned) < 2) {
59 + trace_jbd2_jmap_printf("not enough cleaned");
62 + trace_jbd2_jmap_printf("enough cleaned");
63 + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
64 + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
68 +inline bool cleaning(journal_t *journal)
70 + return atomic_read(&journal->j_cleaner_ctx->cleaning);
73 +inline void stop_cleaning(journal_t *journal)
75 + trace_jbd2_jmap_printf("stopped cleaning");
76 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
79 +inline void start_cleaning(journal_t *journal)
81 + trace_jbd2_jmap_printf("started cleaning");
82 + atomic_set(&journal->j_cleaner_ctx->cleaning, 1);
83 + clean_next_batch(journal);
86 +inline bool cleaning_batch_complete(journal_t *journal)
88 + return cleaning(journal) &&
89 + atomic_read(&journal->j_cleaner_ctx->batch_in_progress) == 0;
93 + * Tries to move the tail forward (hence free space) as long as the transaction
94 + * at the tail has only stale blocks. Returns true if it manages to free a
95 + * transaction, false otherwise.
97 +bool try_to_move_tail(journal_t *journal)
99 + struct transaction_infos *tis = journal->j_transaction_infos;
100 + struct transaction_info *ti, *ti1;
103 + * Advance the tail as far as possible by skipping over transactions
104 + * with no live blocks.
106 + write_lock(&journal->j_jmap_lock);
107 + ti = ti1 = &tis->buf[tis->tail];
109 + for ( ; list_empty(&ti->live_blks); ti = &tis->buf[tis->tail]) {
110 + trace_jbd2_jmap_printf2("cleaned a transaction",
111 + tis->tail, ti->tid);
112 + tis->tail = (tis->tail + 1) & (MAX_LIVE_TRANSACTIONS - 1);
113 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_cleaned);
115 + write_unlock(&journal->j_jmap_lock);
120 + * In the worst case, this will end up updating the journal superblock
121 + * after cleaning up every transaction. Should we avoid it?
123 + write_unlock(&journal->j_state_lock);
124 + jbd2_update_log_tail(journal, ti->tid, ti->offset);
125 + write_lock(&journal->j_state_lock);
131 + * Finds the live blocks at the tail transaction and copies the corresponding
132 + * mappings to |ctx->mappings|. Returns the number of live block mappings
133 + * copied. Should be called with a read lock on |j_jmap_lock|.
135 +int find_live_blocks(struct cleaner_ctx *ctx)
137 + journal_t *journal = ctx->journal;
138 + struct transaction_infos *tis = journal->j_transaction_infos;
139 + struct transaction_info *ti = &tis->buf[tis->tail];
140 + struct jmap_entry *je = NULL;
141 + int i, nr_live = 0;
143 + if (unlikely(list_empty(&ti->live_blks)))
146 + spin_lock(&ctx->pos_lock);
148 + ctx->pos = list_first_entry(&ti->live_blks, typeof(*je), list);
150 + spin_unlock(&ctx->pos_lock);
152 + list_for_each_entry_from(je, &ti->live_blks, list) {
155 + ctx->mappings[nr_live++] = je->mapping;
156 + if (nr_live == CLEANER_BATCH_SIZE)
161 + trace_jbd2_jmap_printf1("found live blocks", nr_live);
162 + for (i = 0; i < nr_live; ++i)
163 + trace_jbd2_jmap_printf2("m",
164 + ctx->mappings[i].fsblk,
165 + ctx->mappings[i].logblk);
169 +void live_block_read_end_io(struct buffer_head *bh, int uptodate)
171 + struct cleaner_ctx *ctx = bh->b_private;
174 + set_buffer_uptodate(bh);
175 + if (atomic_dec_and_test(&ctx->nr_pending_reads))
176 + complete(&ctx->live_block_reads);
179 + clear_buffer_uptodate(bh);
187 + * Reads live blocks in |ctx->mappings| populated by find_live_blocks into
188 + * buffer heads in |ctx->bhs|. Returns true if at least one of the reads goes
189 + * out to disk and false otherwise. If this function returns true then the
190 + * client should sleep on the condition variable |ctx->live_block_reads|. The
191 + * client will be woken up when all reads are complete, through the end_io
192 + * handler attached to buffer heads read from disk.
194 +bool read_live_blocks(struct cleaner_ctx *ctx, int nr_live)
196 + journal_t *journal = ctx->journal;
198 + struct blk_plug plug;
199 + bool plugged = false;
202 + for (i = 0; i < nr_live; ++i) {
203 + ctx->bhs[i] = __getblk(journal->j_dev, ctx->mappings[i].fsblk,
204 + journal->j_blocksize);
205 + if (unlikely(!ctx->bhs[i]))
207 + if (buffer_uptodate(ctx->bhs[i]))
210 + blk_start_plug(&plug);
211 + lock_buffer(ctx->bhs[i]);
212 + ctx->bhs[i]->b_private = ctx;
213 + ctx->bhs[i]->b_end_io = live_block_read_end_io;
214 + atomic_inc(&ctx->nr_pending_reads);
215 + get_bh(ctx->bhs[i]);
216 + rc = read_block_from_log(ctx->journal, ctx->bhs[i],
217 + REQ_RAHEAD, ctx->mappings[i].logblk);
218 + if (unlikely(rc < 0))
222 + trace_jbd2_jmap_printf2("reading from disk",
223 + ctx->mappings[i].fsblk,
224 + ctx->mappings[i].logblk);
226 + trace_jbd2_jmap_printf2("cached",
227 + ctx->mappings[i].fsblk,
228 + ctx->mappings[i].logblk);
232 + blk_finish_plug(&plug);
236 + jbd2_journal_abort(ctx->journal, -ENOMEM);
241 + * This function finds the live blocks that became stale between the call to
242 + * find_live_blocks and now, and discards them. It returns true if there are no
243 + * more live blocks left at the tail transaction.
245 +bool discard_stale_blocks(struct cleaner_ctx *ctx, int nr_live)
247 + journal_t *journal = ctx->journal;
248 + struct transaction_infos *tis = journal->j_transaction_infos;
249 + struct transaction_info *ti = &tis->buf[tis->tail];
250 + struct jmap_entry *je = NULL;
251 + int i = 0, j = 0, next = 0;
253 + trace_jbd2_jmap_printf(__func__);
254 + spin_lock(&ctx->pos_lock);
257 + list_for_each_entry_from(je, &ti->live_blks, list) {
258 + for (j = next; j < nr_live; ++j) {
259 + if (je->mapping.fsblk == ctx->mappings[j].fsblk) {
261 + ctx->pos = list_next_entry(je, list);
263 + brelse(ctx->bhs[j]);
264 + ctx->bhs[j] = NULL;
265 + trace_jbd2_jmap_printf2(
267 + ctx->mappings[i].fsblk,
268 + ctx->mappings[i].logblk);
272 + trace_jbd2_jmap_printf2(
273 + "moved to another list",
274 + ctx->mappings[i].fsblk,
275 + ctx->mappings[i].logblk);
276 + brelse(ctx->bhs[j]);
277 + ctx->bhs[j] = NULL;
280 + if (++i == nr_live || j == nr_live)
283 + spin_unlock(&ctx->pos_lock);
286 + * We have exited the loop. If we haven't processed all the entries in
287 + * |ctx->mappings|, that is if (j < nr_live) at the exit, and we have
288 + * not processed |nr_live| entries from the live blocks list at the
289 + * tail, that is if (i < nr_live) at the exit, then the live blocks list
290 + * has shrunk and the tail transaction has no live blocks left.
292 + return j < nr_live && i < nr_live;
295 +void attach_live_blocks(struct cleaner_ctx *ctx, handle_t *handle, int nr_live)
299 + trace_jbd2_jmap_printf(__func__);
300 + for (i = 0; i < nr_live; ++i) {
303 + trace_jbd2_jmap_printf2("attaching",
304 + ctx->mappings[i].fsblk,
305 + ctx->mappings[i].logblk);
306 + err = jbd2_journal_get_write_access(handle, ctx->bhs[i]);
308 + err = jbd2_journal_dirty_metadata(handle, ctx->bhs[i]);
310 + jbd2_journal_abort(ctx->journal, err);
317 + * Read the live blocks from the tail transaction and attach them to the current
320 +static void do_clean_batch(struct work_struct *work)
322 + struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);
323 + bool wake_up_commit_thread = true;
324 + handle_t *handle = NULL;
327 + read_lock(&ctx->journal->j_jmap_lock);
328 + nr_live = find_live_blocks(ctx);
329 + read_unlock(&ctx->journal->j_jmap_lock);
331 + if (nr_live < CLEANER_BATCH_SIZE)
332 + wake_up_commit_thread = false;
336 + reinit_completion(&ctx->live_block_reads);
337 + if (read_live_blocks(ctx, nr_live)) {
338 + trace_jbd2_jmap_printf("waiting for completion");
339 + wait_for_completion(&ctx->live_block_reads);
341 + trace_jbd2_jmap_printf("not waiting for completion");
344 + handle = jbd2_journal_start(ctx->journal, nr_live);
345 + if (IS_ERR(handle)) {
346 + jbd2_journal_abort(ctx->journal, PTR_ERR(handle));
350 + read_lock(&ctx->journal->j_jmap_lock);
351 + if (discard_stale_blocks(ctx, nr_live))
352 + wake_up_commit_thread = false;
353 + attach_live_blocks(ctx, handle, nr_live);
354 + read_unlock(&ctx->journal->j_jmap_lock);
356 + err = jbd2_journal_stop(handle);
358 + jbd2_journal_abort(ctx->journal, err);
363 + atomic_set(&ctx->batch_in_progress, 0);
364 + atomic_inc(&ctx->nr_txns_cleaned);
365 + if (wake_up_commit_thread) {
366 + trace_jbd2_jmap_printf("waking up commit thread");
367 + wake_up(&ctx->journal->j_wait_commit);
369 + trace_jbd2_jmap_printf("not waking up commit thread");
370 + spin_lock(&ctx->pos_lock);
372 + spin_unlock(&ctx->pos_lock);
377 + * Schedules the next batch of cleaning.
379 +void clean_next_batch(journal_t *journal)
381 + struct cleaner_ctx *ctx = journal->j_cleaner_ctx;
383 + if (!cleaning_batch_complete(journal)) {
384 + trace_jbd2_jmap_printf("not scheduling a new batch");
388 + trace_jbd2_jmap_printf("scheduling a batch");
389 + BUG_ON(atomic_read(&ctx->nr_pending_reads));
391 + atomic_set(&ctx->batch_in_progress, 1);
392 + INIT_WORK(&ctx->work, do_clean_batch);
393 + schedule_work(&ctx->work);
395 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
396 index ddaa7d5f5634..08bfbd6f3587 100644
399 @@ -39,7 +39,7 @@ int jbd2_init_transaction_infos(journal_t *journal)
402 for (i = 0; i < MAX_LIVE_TRANSACTIONS; ++i)
403 - INIT_LIST_HEAD(&tis->buf[i].live_logblks);
404 + INIT_LIST_HEAD(&tis->buf[i].live_blks);
406 journal->j_transaction_infos = tis;
408 @@ -92,15 +92,26 @@ static int process_existing_mappings(journal_t *journal,
409 * We are either deleting the entry because it was revoked, or
410 * we are moving it to the live blocks list of this transaction.
411 * In either case, we remove it from its existing list.
412 + * However, before removing it we check to see if this is an
413 + * entry in the live blocks list of the tail transaction, a
414 + * pointer to which is cached by the cleaner, and update the
415 + * cached pointer if so.
417 + spin_lock(&journal->j_cleaner_ctx->pos_lock);
418 + if (je == journal->j_cleaner_ctx->pos) {
419 + journal->j_cleaner_ctx->pos = list_next_entry(je, list);
420 + trace_jbd2_jmap_printf1("updating pos to",
421 + (unsigned long long) journal->j_cleaner_ctx->pos);
424 + spin_unlock(&journal->j_cleaner_ctx->pos_lock);
427 rb_erase(&je->rb_node, &journal->j_jmap);
428 kmem_cache_free(jbd2_jmap_cache, je);
430 trace_jbd2_jmap_replace(je, &mappings[i], t_idx);
431 - fill_entry(je, &mappings[i], t_idx, &ti->live_logblks);
432 + fill_entry(je, &mappings[i], t_idx, &ti->live_blks);
436 @@ -162,8 +173,7 @@ static void add_new_mappings(journal_t *journal, struct transaction_info *ti,
440 - fill_entry(new_entries[i], &mappings[i], t_idx,
441 - &ti->live_logblks);
442 + fill_entry(new_entries[i], &mappings[i], t_idx, &ti->live_blks);
443 rb_link_node(&new_entries[i]->rb_node, parent, p);
444 rb_insert_color(&new_entries[i]->rb_node, &journal->j_jmap);
445 trace_jbd2_jmap_insert(&mappings[i], t_idx);
446 @@ -190,7 +200,9 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
447 * We are possibly reusing space of an old transaction_info. The old
448 * transaction should not have any live blocks in it.
450 - BUG_ON(!list_empty(&ti->live_logblks));
451 + BUG_ON(!list_empty(&ti->live_blks));
453 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
455 write_lock(&journal->j_jmap_lock);
456 nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
457 @@ -435,11 +447,31 @@ int jbd2_smr_journal_init(journal_t *journal)
459 journal->j_jmap = RB_ROOT;
460 rwlock_init(&journal->j_jmap_lock);
461 + journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
463 + if (!journal->j_cleaner_ctx)
466 + journal->j_cleaner_ctx->journal = journal;
467 + journal->j_cleaner_ctx->pos = NULL;
468 + spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
469 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
470 + atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
471 + atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
472 + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
473 + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
474 + init_completion(&journal->j_cleaner_ctx->live_block_reads);
475 return jbd2_init_transaction_infos(journal);
478 void jbd2_smr_journal_exit(journal_t *journal)
480 + if (journal->j_cleaner_ctx) {
481 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
482 + flush_work(&journal->j_cleaner_ctx->work);
483 + kfree(journal->j_cleaner_ctx);
484 + journal->j_cleaner_ctx = NULL;
486 jbd2_free_transaction_infos(journal);
489 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
490 index 0fb2cd419ee3..efe53b83e2e2 100644
491 --- a/fs/jbd2/journal.c
492 +++ b/fs/jbd2/journal.c
493 @@ -227,6 +227,15 @@ static int kjournald2(void *arg)
496 wake_up(&journal->j_wait_done_commit);
498 + if ((journal->j_flags & JBD2_LAZY) &&
499 + (cleaning(journal) || low_on_space(journal))) {
500 + if (try_to_move_tail(journal) && high_on_space(journal))
501 + stop_cleaning(journal);
503 + start_cleaning(journal);
506 if (freezing(current)) {
508 * The simpler the better. Flushing journal isn't a
509 @@ -255,6 +264,9 @@ static int kjournald2(void *arg)
511 if (journal->j_flags & JBD2_UNMOUNT)
513 + if ((journal->j_flags & JBD2_LAZY) &&
514 + cleaning_batch_complete(journal))
517 write_unlock(&journal->j_state_lock);
519 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
520 index 771588026353..3112fba26598 100644
521 --- a/include/linux/jbd2.h
522 +++ b/include/linux/jbd2.h
523 @@ -735,7 +735,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
524 * @j_superblock: Second part of superblock buffer
525 * @j_map: A map from file system blocks to log blocks
526 * @j_transaction_infos: An array of information structures per live transaction
527 - * @j_map_lock: Protect j_jmap and j_transaction_infos
528 + * @j_jmap_lock: Protect j_jmap and j_transaction_infos
529 + * @j_cleaner_ctx: Cleaner state
530 * @j_format_version: Version of the superblock format
531 * @j_state_lock: Protect the various scalars in the journal
532 * @j_barrier_count: Number of processes waiting to create a barrier lock
533 @@ -820,6 +821,9 @@ struct journal_s
534 /* Protect j_jmap and j_transaction_infos */
535 rwlock_t j_jmap_lock;
537 + /* Cleaner state */
538 + struct cleaner_ctx *j_cleaner_ctx;
540 /* Version of the superblock format */
541 int j_format_version;
543 diff --git a/include/linux/jmap.h b/include/linux/jmap.h
544 index a602ece7cc89..acd588b4c68b 100644
545 --- a/include/linux/jmap.h
546 +++ b/include/linux/jmap.h
548 #include <linux/journal-head.h>
549 #include <linux/list.h>
550 #include <linux/circ_buf.h>
551 +#include <linux/completion.h>
554 + * Forward declaration for journal_t so that we don't get circular dependency
555 + * between jbd2.h and jmap.h
558 +typedef struct journal_s journal_t;
561 * Maximum number of transactions. This guides the size of the circular buffer
563 #define MAX_LIVE_TRANSACTIONS 65536
566 - * Forward declaration for journal_t so that we don't get circular dependency
567 - * between jbd2.h and jmap.h
570 -typedef struct journal_s journal_t;
573 * A mapping from file system block to log block.
576 @@ -79,14 +80,14 @@ struct transaction_info {
580 - * A list of live log blocks referenced in the RB-tree that belong to
581 - * this transaction. It is used during cleaning to locate live blocks
582 - * and migrate them to appropriate location. If this list is empty,
583 - * then the transaction does not contain any live blocks and we can
584 - * reuse its space. If this list is not empty, then we can quickly
585 - * locate all the live blocks in this transaction.
586 + * A list of live blocks referenced in the RB-tree that belong to this
587 + * transaction. It is used during cleaning to locate live blocks and
588 + * migrate them to appropriate location. If this list is empty, then
589 + * the transaction does not contain any live blocks and we can reuse its
590 + * space. If this list is not empty, then we can quickly locate all the
591 + * live blocks in this transaction.
593 - struct list_head live_logblks;
594 + struct list_head live_blks;
598 @@ -128,4 +129,86 @@ extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
599 extern void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,
603 + * Cleaner stuff is below.
607 + * Number of blocks to read at once, for cleaning.
609 +#define CLEANER_BATCH_SIZE 16
612 + * Context structure for the cleaner.
614 +struct cleaner_ctx {
616 + * We set to true once we drop below low watermark and it stays so until
617 + * we rise above the high watermark. It is accessed by the commit
618 + * thread and the foreground kernel threads during the journal
619 + * destruction, therefore it is atomic.
624 + * We clean in batches of blocks. This flag indicates if we are
625 + * currently cleaning a batch. It is accessed by the commit thread and
626 + * the cleaner thread, therefore it is atomic.
628 + atomic_t batch_in_progress;
631 + * We find live blocks to clean from the live blocks list of the
632 + * transaction at the tail. This list can be larger than our batch size
633 + * and we may need several attempts to process it. We cache the
634 + * position of the next entry to start from in |pos|. Since cleaner
635 + * thread can run concurrently with the commit thread that can modify
636 + * the live blocks list of the transaction at the tail (for example, if
637 + * it needs to drop a revoked entry or if |pos| points to an entry that
638 + * has been updated and should move from the live blocks list of the
639 + * transaction at the tail to the live blocks list of current
640 + * transaction) we protect |pos| with |pos_lock|.
642 + struct jmap_entry *pos;
643 + spinlock_t pos_lock;
646 + * Live block mappings for the blocks that we copy in a batch.
648 + struct blk_mapping mappings[CLEANER_BATCH_SIZE];
651 + * Buffer heads for the live blocks read in a batch.
653 + struct buffer_head *bhs[CLEANER_BATCH_SIZE];
656 + * Number of pending reads in a batch. Every submitted read increments
657 + * it and every completed read decrements it.
659 + atomic_t nr_pending_reads;
662 + * The cleaner thread sleeps on this condition variable until the last
663 + * completed read wakes up the cleaner thread.
665 + struct completion live_block_reads;
667 + /* TODO: temporary for debugging, remove once done. */
668 + atomic_t nr_txns_committed;
669 + atomic_t nr_txns_cleaned;
671 + journal_t *journal;
672 + struct work_struct work;
675 +extern int low_on_space(journal_t *journal);
676 +extern int high_on_space(journal_t *journal);
677 +extern bool cleaning(journal_t *journal);
678 +extern void stop_cleaning(journal_t *journal);
679 +extern void start_cleaning(journal_t *journal);
680 +extern void clean_next_batch(journal_t *journal);
681 +extern bool cleaning_batch_complete(journal_t *journal);
682 +extern bool try_to_move_tail(journal_t *journal);