add patch dont-mark-mmp-buffer-head-dirty
[ext4-patch-queue.git] / add-support-for-log-metadata-block-tracking-in-log
blobb53a2ca21d8694ba1a857510710ea5339a6bd5d2
1 Add support for tracking metadata blocks in the log.
3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 This patch adds two important data structures, jmap and transaction_infos,
6 and supporting functions.  Jmap is a map from a metadata block number to
7 the log block number.  When a transaction commits, jmap is updated with new
8 mappings; when a block is revoked, the mapping for the block is removed
9 from the jmap.  Transaction_infos is an array of transaction_info
10 structures that contain information about transactions currently present in
11 the log.  It contains a linked list of live blocks in a transaction, and it
12 is updated after every commit to keep the list up-to-date.
13 The transaction_infos array will be used by the cleaner for identifying live
14 blocks and migrating them to an appropriate location.
16 [ Modified by tytso to conditionalize changes on the JBD2_LAZY journal flag ]
18 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
19 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
20 ---
21  fs/jbd2/Makefile            |   3 +-
22  fs/jbd2/commit.c            |  25 ++++
23  fs/jbd2/jmap.c              | 510 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
24  fs/jbd2/jmap.h              | 128 +++++++++++++++++++
25  fs/jbd2/journal.c           |  13 ++
26  include/linux/jbd2.h        |  29 +++++
27  include/trace/events/jbd2.h | 196 +++++++++++++++++++++++++++++
28  7 files changed, 903 insertions(+), 1 deletion(-)
30 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
31 index 802a3413872a..a54f50b3a06e 100644
32 --- a/fs/jbd2/Makefile
33 +++ b/fs/jbd2/Makefile
34 @@ -4,4 +4,5 @@
36  obj-$(CONFIG_JBD2) += jbd2.o
38 -jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
39 +jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
40 +               jmap.o
41 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
42 index 8de0e7723316..82aaaf3f63fe 100644
43 --- a/fs/jbd2/commit.c
44 +++ b/fs/jbd2/commit.c
45 @@ -26,6 +26,8 @@
46  #include <linux/bitops.h>
47  #include <trace/events/jbd2.h>
49 +#include "jmap.h"
51  /*
52   * IO end handler for temporary buffer_heads handling writes to the journal.
53   */
54 @@ -351,6 +353,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
55         int flags;
56         int err;
57         unsigned long long blocknr;
58 +       struct blk_mapping *mappings = NULL;
59 +       struct blk_mapping *map_ptr = NULL;
60         ktime_t start_time;
61         u64 commit_time;
62         char *tagp = NULL;
63 @@ -552,6 +556,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
64         J_ASSERT(commit_transaction->t_nr_buffers <=
65                  atomic_read(&commit_transaction->t_outstanding_credits));
67 +       if (journal->j_flags & JBD2_LAZY) {
68 +               int nr_mappings = commit_transaction->t_nr_buffers;
70 +               map_ptr = mappings = kmalloc(sizeof(*mappings) * nr_mappings, GFP_NOFS);
71 +               if (!mappings)
72 +                       jbd2_journal_abort(journal, -ENOMEM);
73 +       }
75         err = 0;
76         bufs = 0;
77         descriptor = NULL;
78 @@ -650,6 +662,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
79                         continue;
80                 }
81                 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
82 +               if (map_ptr) {
83 +                       map_ptr->fsblk = jh2bh(jh)->b_blocknr;
84 +                       map_ptr->logblk = blocknr;
85 +                       map_ptr++;
86 +               }
88                 /* Record the new block's tag in the current descriptor
89                     buffer */
90 @@ -884,6 +901,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
91             transaction can be removed from any checkpoint list it was on
92             before. */
94 +       if (mappings) {
95 +               err = jbd2_transaction_infos_add(journal, commit_transaction,
96 +                                                mappings, map_ptr - mappings);
97 +               if (err)
98 +                       jbd2_journal_abort(journal, -ENOMEM);
99 +               kfree(mappings);
100 +       }
102         jbd_debug(3, "JBD2: commit phase 6\n");
104         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
105 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
106 new file mode 100644
107 index 000000000000..7de6f4a0a1dc
108 --- /dev/null
109 +++ b/fs/jbd2/jmap.c
110 @@ -0,0 +1,510 @@
111 +#include <linux/blk_types.h>
112 +#include <linux/jbd2.h>
113 +#include "jmap.h"
114 +#include <trace/events/jbd2.h>
116 +static struct kmem_cache *jbd2_jmap_cache;
118 +int jbd2_journal_init_jmap_cache(void)
120 +       jbd2_jmap_cache = KMEM_CACHE(jmap_entry, SLAB_RECLAIM_ACCOUNT);
121 +       if (!jbd2_jmap_cache)
122 +               return -ENOMEM;
123 +       return 0;
126 +void jbd2_journal_destroy_jmap_cache(void)
128 +       kmem_cache_destroy(jbd2_jmap_cache);
129 +       jbd2_jmap_cache = NULL;
133 + * Allocate an array of transaction_info structures and initialize the list
134 + * heads inside them.
135 + */
136 +int jbd2_init_transaction_infos(journal_t *journal)
138 +       int i;
139 +       struct transaction_infos *tis = kzalloc(sizeof(*tis), GFP_KERNEL);
140 +       if (!tis)
141 +               return -ENOMEM;
143 +       tis->buf = kzalloc(sizeof(*tis->buf) * MAX_LIVE_TRANSACTIONS,
144 +                       GFP_KERNEL);
145 +       if (!tis->buf) {
146 +               kfree(tis);
147 +               return -ENOMEM;
148 +       }
150 +       for (i = 0; i < MAX_LIVE_TRANSACTIONS; ++i)
151 +               INIT_LIST_HEAD(&tis->buf[i].live_blks);
153 +       journal->j_transaction_infos = tis;
154 +       return 0;
158 + * Free the array of transaction_info structures.
159 + */
160 +void jbd2_free_transaction_infos(journal_t *journal)
162 +       struct transaction_infos *tis = journal->j_transaction_infos;
163 +       if (!tis)
164 +               return;
165 +       kfree(tis->buf);
166 +       kfree(tis);
170 + * Fill an entry to be stored in jmap.
171 + */
172 +static void fill_entry(struct jmap_entry *entry, struct blk_mapping *mapping,
173 +                       int t_idx, struct list_head *list)
175 +       entry->mapping = *mapping;
176 +       entry->fsblk_last_modified = jiffies;
177 +       entry->t_idx = t_idx;
178 +       list_add(&entry->list, list);
182 + * A helper function for jbd2_transaction_infos_add.  Scans through the mappings
183 + * array, dropping revoked entries from jmap and updating existing entries.
184 + * Moves the new mappings to the beginning of the mappings array and returns the
185 + * number of new mappings.  Should be called with a write lock on j_jmap_lock.
186 + */
187 +static int process_existing_mappings(journal_t *journal,
188 +                               struct transaction_info *ti, int t_idx,
189 +                               struct blk_mapping *mappings, int nr_mappings)
191 +       struct jmap_entry *je;
192 +       int i, nr_new = 0;
194 +       for (i = 0; i < nr_mappings; ++i) {
195 +               je = jbd2_jmap_lookup(journal, mappings[i].fsblk, __func__);
196 +               if (!je) {
197 +                       mappings[nr_new++] = mappings[i];
198 +                       continue;
199 +               }
200 +               /*
201 +                * We are either deleting the entry because it was revoked, or
202 +                * we are moving it to the live blocks list of this transaction.
203 +                * In either case, we remove it from its existing list.
204 +                */
205 +               list_del(&je->list);
207 +               if (je->revoked) {
208 +                       rb_erase(&je->rb_node, &journal->j_jmap);
209 +                       kmem_cache_free(jbd2_jmap_cache, je);
210 +               } else {
211 +                       trace_jbd2_jmap_replace(je, &mappings[i], t_idx);
212 +                       fill_entry(je, &mappings[i], t_idx, &ti->live_blks);
213 +               }
214 +       }
215 +       return nr_new;
219 + * A helper function for jbd2_transaction_infos_add.  Allocates an array of
220 + * pointers to newly allocated jmap_entry structures and returns the array if
221 + * successful.  Otherwise, frees any partial allocation and returns NULL.
222 + */
223 +static struct jmap_entry **alloc_jmap_entries(int nr_entries)
225 +       struct jmap_entry **jmap_entries;
226 +       int i;
228 +       jmap_entries = kmalloc(sizeof(struct jmap_entry *) * nr_entries,
229 +                       GFP_NOFS);
230 +       if (!jmap_entries)
231 +               return NULL;
233 +       for (i = 0; i < nr_entries; i++) {
234 +               jmap_entries[i] = kmem_cache_zalloc(jbd2_jmap_cache, GFP_NOFS);
235 +               if (!jmap_entries[i])
236 +                       goto out_err;
237 +       }
238 +       return jmap_entries;
240 +out_err:
241 +       for (i = 0; i < nr_entries && jmap_entries[i]; ++i)
242 +               kmem_cache_free(jbd2_jmap_cache, jmap_entries[i]);
243 +       kfree(jmap_entries);
244 +       return NULL;
248 + * A helper function for jbd2_transaction_infos_add.  Adds new mappings to jmap
249 + * and updates the linked list of live logblks of the new transaction.  Should
250 + * be called with write lock on j_jmap_lock.
251 + */
252 +static void add_new_mappings(journal_t *journal, struct transaction_info *ti,
253 +                       int t_idx, struct blk_mapping *mappings,
254 +                       struct jmap_entry **new_entries, int nr_new)
256 +       struct rb_node **p;
257 +       struct rb_node *parent = NULL;
258 +       struct jmap_entry *je;
259 +       int i;
261 +       for (i = 0; i < nr_new; ++i) {
262 +               p = &journal->j_jmap.rb_node;
263 +               while (*p) {
264 +                       parent = *p;
265 +                       je = rb_entry(parent, struct jmap_entry, rb_node);
267 +                       if (mappings[i].fsblk < je->mapping.fsblk)
268 +                               p = &(*p)->rb_left;
269 +                       else if (mappings[i].fsblk > je->mapping.fsblk)
270 +                               p = &(*p)->rb_right;
271 +                       else
272 +                               BUG_ON(1);
273 +               }
274 +               fill_entry(new_entries[i], &mappings[i], t_idx, &ti->live_blks);
275 +               rb_link_node(&new_entries[i]->rb_node, parent, p);
276 +               rb_insert_color(&new_entries[i]->rb_node, &journal->j_jmap);
277 +               trace_jbd2_jmap_insert(&mappings[i], t_idx);
278 +       }
281 +void jbd2_add_new_transaction_infos(journal_t *journal, tid_t tid,
282 +                                  unsigned long log_start)
284 +       struct transaction_infos *tis = journal->j_transaction_infos;
285 +       int t_idx = tis->head;
286 +       struct transaction_info *ti = &tis->buf[t_idx];
288 +       /*
289 +        * We are possibly reusing space of an old transaction_info.  The old
290 +        * transaction should not have any live blocks in it.
291 +        */
292 +       BUG_ON(!list_empty(&ti->live_blks));
294 +       write_lock(&journal->j_jmap_lock);
295 +       ti->tid = tid;
296 +       ti->offset = log_start;
297 +       write_unlock(&journal->j_jmap_lock);
300 +int jbd2_add_mapping(journal_t *journal, struct blk_mapping *mapping)
302 +       struct transaction_infos *tis = journal->j_transaction_infos;
303 +       int t_idx = tis->head;
304 +       struct transaction_info *ti = &tis->buf[t_idx];
305 +       struct jmap_entry *new_entry;
306 +       int nr_new = 0;
308 +       write_lock(&journal->j_jmap_lock);
309 +       nr_new = process_existing_mappings(journal, ti, t_idx, mapping, 1);
310 +       write_unlock(&journal->j_jmap_lock);
312 +       if (nr_new == 0)
313 +               return 0;
315 +       new_entry = kmem_cache_zalloc(jbd2_jmap_cache, GFP_NOFS);
316 +       if (!new_entry)
317 +               return -ENOMEM;
319 +       write_lock(&journal->j_jmap_lock);
320 +       add_new_mappings(journal, ti, t_idx, mapping, &new_entry, 1);
321 +       write_unlock(&journal->j_jmap_lock);
322 +       return 0;
325 +void jbd2_finish_transaction_infos(journal_t *journal)
327 +       struct transaction_infos *tis = journal->j_transaction_infos;
329 +       write_lock(&journal->j_jmap_lock);
330 +       tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1);
331 +       write_unlock(&journal->j_jmap_lock);
335 + * This function is called after a transaction commits.  It adds new
336 + * transaction_info structure to transaction_infos and populates jmap map with
337 + * the new mappings that are part of the committed transaction.  It also adds
338 + * all the mappings to the linked list that is part of the transaction_info
339 + * structure.
340 + */
341 +int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
342 +                       struct blk_mapping *mappings, int nr_mappings)
344 +       struct transaction_infos *tis = journal->j_transaction_infos;
345 +       int t_idx = tis->head;
346 +       struct transaction_info *ti = &tis->buf[t_idx];
347 +       struct jmap_entry **new_entries = NULL;
348 +       int nr_new = 0;
350 +       /*
351 +        * We are possibly reusing space of an old transaction_info.  The old
352 +        * transaction should not have any live blocks in it.
353 +        */
354 +       BUG_ON(!list_empty(&ti->live_blks));
356 +       write_lock(&journal->j_jmap_lock);
357 +       nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
358 +                                       nr_mappings);
359 +       write_unlock(&journal->j_jmap_lock);
361 +       if (nr_new == 0)
362 +               goto move_head;
364 +       new_entries = alloc_jmap_entries(nr_new);
365 +       if (!new_entries)
366 +               return -ENOMEM;
368 +       write_lock(&journal->j_jmap_lock);
369 +       add_new_mappings(journal, ti, t_idx, mappings, new_entries, nr_new);
370 +       write_unlock(&journal->j_jmap_lock);
372 +       kfree(new_entries);
374 +move_head:
375 +       write_lock(&journal->j_jmap_lock);
376 +       ti->tid = transaction->t_tid;
377 +       ti->offset = transaction->t_log_start;
378 +       tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1);
379 +       write_unlock(&journal->j_jmap_lock);
381 +       trace_jbd2_transaction_infos_add(t_idx, ti, nr_mappings);
382 +       return 0;
386 + * Look up fsblk in the jmap and return the corresponding jmap entry if found.
387 + * Should be called with a read lock on j_jmap_lock.
388 + */
389 +struct jmap_entry *jbd2_jmap_lookup(journal_t *journal, sector_t fsblk,
390 +                               const char *func)
392 +       struct rb_node *p;
394 +       BUG_ON(!journal);
396 +       for (p = journal->j_jmap.rb_node; p; ) {
397 +               struct jmap_entry *je = rb_entry(p, struct jmap_entry, rb_node);
398 +               if (je->mapping.fsblk > fsblk)
399 +                       p = p->rb_left;
400 +               else if (je->mapping.fsblk < fsblk)
401 +                       p = p->rb_right;
402 +               else {
403 +                       trace_jbd2_jmap_lookup(fsblk, je->mapping.logblk, func);
404 +                       return je;
405 +               }
406 +       }
407 +       trace_jbd2_jmap_lookup(fsblk, 0, func);
408 +       return NULL;
412 + * Revoke a mapping for the fsblk in the jmap.  A lookup for fsblk will return
413 + * NULL and the mapping will be removed from the jmap during commit, unless
414 + * fsblk is reallocated as a metadata block.
415 + */
416 +void jbd2_jmap_revoke(journal_t *journal, sector_t fsblk)
418 +       struct jmap_entry *je;
420 +       write_lock(&journal->j_jmap_lock);
421 +       je = jbd2_jmap_lookup(journal, fsblk, __func__);
422 +       /*
423 +        * For now, since we do not construct jmap from the journal, it is
424 +        * possible that a metadata block that was revoked is not in the jmap.
425 +        * Eventually, this should not be the case and we should have a
426 +        * BUG_ON(!je) here.
427 +        */
428 +       if (je) {
429 +               if (WARN_ON(je->revoked))
430 +                       pr_err("JBD2: block %llu already revoked!\n",
431 +                              (unsigned long long) fsblk);
432 +               je->revoked = true;
433 +       }
434 +       write_unlock(&journal->j_jmap_lock);
438 + * Cancel a revoke for the fsblk in the jmap.
439 + */
440 +void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk)
442 +       struct jmap_entry *je;
444 +       write_lock(&journal->j_jmap_lock);
445 +       je = jbd2_jmap_lookup(journal, fsblk, __func__);
446 +       BUG_ON(!je);
447 +       BUG_ON(!je->revoked);
448 +       je->revoked = false;
449 +       write_unlock(&journal->j_jmap_lock);
453 + * Read bh from its most up-to-date location, either from the file system or
454 + * from the log.
455 + *
456 + * If there is no mapping for the bh in jmap, this function acts like submit_bh.
457 + * Otherwise, it submits a read for the block pointed by the mapping located in
458 + * the log.  Upon completion, bh will be filled with the contents of the block
459 + * read from the log.
460 + */
461 +void jbd2_submit_bh(journal_t *journal, int rw, int op_flags,
462 +                   struct buffer_head *bh, const char *func)
464 +       sector_t fsblk = bh->b_blocknr;
465 +       sector_t logblk;
466 +       struct jmap_entry *je;
468 +       BUG_ON(!buffer_locked(bh));
470 +       if (!journal || !(journal->j_flags & JBD2_LAZY)) {
471 +               submit_bh(rw, op_flags, bh);
472 +               return;
473 +       }
475 +       read_lock(&journal->j_jmap_lock);
476 +       je = jbd2_jmap_lookup(journal, fsblk, func);
477 +       if (!je) {
478 +               read_unlock(&journal->j_jmap_lock);
479 +               submit_bh(rw, op_flags, bh);
480 +               return;
481 +       }
482 +       logblk = je->mapping.logblk;
483 +       read_unlock(&journal->j_jmap_lock);
485 +       BUG_ON(rw == WRITE);
486 +       read_block_from_log(journal, bh, op_flags, logblk);
488 +EXPORT_SYMBOL(jbd2_submit_bh);
491 + * End_io handler for read_block_from_log that copies the contents of
492 + * log_bh read from log to the embedded bh.
493 + */
494 +static void jbd2_end_log_read(struct buffer_head *log_bh, int uptodate)
496 +       struct buffer_head *bh = log_bh->b_private;
498 +       trace_jbd2_jmap_read_from_log(bh->b_blocknr, log_bh->b_blocknr,
499 +                                     uptodate);
500 +       if (uptodate)
501 +               memcpy(bh->b_data, log_bh->b_data, log_bh->b_size);
503 +       unlock_buffer(log_bh);
504 +       put_bh(log_bh);
505 +       brelse(log_bh);
507 +       bh->b_end_io(bh, uptodate);
511 + * This function fills |bh| with the contents of log block |blk|.  Assume
512 + * jmap maps metadata block 123 to log block 100123.  To read the
513 + * metadata block 123, we obtain a buffer head for it and call
514 + * read_block_from_log passing the obtained buffer head as |bh| and
515 + * 100123 as |blk|.  If block 100123 is cached, then we copy the
516 + * contents to |bh| and return.  Otherwise, we submit a request and
517 + * end_io handler copies the contents of block 100123 to |bh|.
518 + * Returns -ENOMEM if getblk fails, 1 if block is not cached, 0 if
519 + * block is cached.
520 + */
521 +int read_block_from_log(journal_t *journal, struct buffer_head *bh,
522 +                       int op_flags, sector_t blk)
524 +       struct buffer_head *log_bh;
526 +       BUG_ON(!buffer_locked(bh));
528 +       log_bh = __getblk(journal->j_fs_dev, blk, bh->b_size);
529 +       if (unlikely(!log_bh)) {
530 +               bh->b_end_io(bh, 0);
531 +               return -ENOMEM;
532 +       }
534 +       lock_buffer(log_bh);
535 +       if (buffer_uptodate(log_bh)) {
536 +               memcpy(bh->b_data, log_bh->b_data, bh->b_size);
537 +               unlock_buffer(log_bh);
538 +               brelse(log_bh);
539 +               bh->b_end_io(bh, 1);
540 +               return 0;
541 +       }
543 +       log_bh->b_end_io = jbd2_end_log_read;
544 +       log_bh->b_private = bh;
545 +       get_bh(log_bh);
546 +       submit_bh(READ, op_flags, log_bh);
547 +       return 1;
551 + * Copy of ll_rw_block that uses jbd2_submit_bh instead of submit_bh.
552 + */
553 +void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags,
554 +                     int nr, struct buffer_head *bhs[], const char *func)
556 +       int i;
558 +       for (i = 0; i < nr; i++) {
559 +               struct buffer_head *bh = bhs[i];
561 +               if (!trylock_buffer(bh))
562 +                       continue;
563 +               BUG_ON(rw == WRITE);
564 +               if (!buffer_uptodate(bh)) {
565 +                       bh->b_end_io = end_buffer_read_sync;
566 +                       get_bh(bh);
567 +                       jbd2_submit_bh(journal, rw, op_flags, bh, func);
568 +                       continue;
569 +               }
570 +               unlock_buffer(bh);
571 +       }
573 +EXPORT_SYMBOL(jbd2_ll_rw_block);
576 + * Copy of bh_submit_read that uses jbd2_submit_bh instead of submit_bh.
577 + */
578 +int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
579 +                       const char *func)
581 +       BUG_ON(!buffer_locked(bh));
583 +       if (buffer_uptodate(bh)) {
584 +               unlock_buffer(bh);
585 +               return 0;
586 +       }
588 +       get_bh(bh);
589 +       bh->b_end_io = end_buffer_read_sync;
590 +       jbd2_submit_bh(journal, READ, 0, bh, func);
591 +       wait_on_buffer(bh);
592 +       if (buffer_uptodate(bh))
593 +               return 0;
594 +       return -EIO;
596 +EXPORT_SYMBOL(jbd2_bh_submit_read);
598 +int jbd2_smr_journal_init(journal_t *journal)
600 +       journal->j_jmap = RB_ROOT;
601 +       rwlock_init(&journal->j_jmap_lock);
602 +       return jbd2_init_transaction_infos(journal);
605 +void jbd2_smr_journal_exit(journal_t *journal)
607 +       jbd2_free_transaction_infos(journal);
610 +void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,
611 +                       sector_t block)
613 +       struct buffer_head *bh = __getblk(sb->s_bdev, block, sb->s_blocksize);
614 +       if (likely(bh)) {
615 +               jbd2_ll_rw_block(journal, REQ_OP_READ, REQ_RAHEAD, 1,
616 +                                &bh, __func__);
617 +               brelse(bh);
618 +       }
620 +EXPORT_SYMBOL(jbd2_sb_breadahead);
621 diff --git a/fs/jbd2/jmap.h b/fs/jbd2/jmap.h
622 new file mode 100644
623 index 000000000000..91564ce9bbda
624 --- /dev/null
625 +++ b/fs/jbd2/jmap.h
626 @@ -0,0 +1,128 @@
627 +#ifndef _LINUX_JMAP_H
628 +#define _LINUX_JMAP_H
630 +#include <linux/buffer_head.h>
631 +#include <linux/journal-head.h>
632 +#include <linux/list.h>
633 +#include <linux/circ_buf.h>
634 +#include <linux/completion.h>
637 + * Forward declaration for journal_t so that we don't get circular dependency
638 + * between jbd2.h and jmap.h
639 + */
640 +struct journal_s;
641 +typedef struct journal_s journal_t;
644 + * Maximum number of transactions.  This guides the size of the circular buffer
645 + * in which we store housekeeping information per transaction.  We start
646 + * cleaning either when the circular buffer is full or when we hit the free
647 + * space threshold, whichever happens first.  For starters, we make this
648 + * constant large to make sure that we start cleaning only when we hit the free
649 + * space threshold.  Later we can empirically determine a sensible value.
650 + */
651 +#define MAX_LIVE_TRANSACTIONS 65536
654 + * A mapping from file system block to log block.
655 + */
656 +struct blk_mapping {
657 +       sector_t fsblk;
658 +       sector_t logblk;
662 + * An RB-tree entry wrapper for blk_mapping with extra housekeeping information.
663 + */
664 +struct jmap_entry {
665 +       struct rb_node rb_node;
667 +       /* The actual mapping information. */
668 +       struct blk_mapping mapping;
670 +       /*
671 +        * If a block that is mapped gets deleted, the revoked bit is set.  A
672 +        * lookup for a deleted block fails.  If a deleted block gets
673 +        * re-allocated as a metadata block, the mapping is updated and revoked
674 +        * bit is cleared.
675 +        */
676 +       bool revoked;
678 +       /*
679 +        * All log blocks that are part of the same transaction in the log are
680 +        * chained with a linked list.  The root of the list is stored in the
681 +        * transaction_info structure described below.
682 +        */
683 +       struct list_head list;
685 +       /*
686 +        * The last time when fsblk was written again to the journal and
687 +        * therefore was remapped to a different log block.
688 +        */
689 +       unsigned long fsblk_last_modified;
691 +       /*
692 +        * Index, in the transaction_infos circular buffer (described below),
693 +        * of the transaction that the log block is part of.
694 +        */
695 +       int t_idx;
699 + * Housekeeping information about committed transaction.
700 + */
701 +struct transaction_info {
702 +       /* Id of the transaction */
703 +       tid_t tid;
705 +       /* Offset where the transaction starts in the log */
706 +       sector_t offset;
708 +       /*
709 +        * A list of live blocks referenced in the RB-tree that belong to this
710 +        * transaction.  It is used during cleaning to locate live blocks and
711 +        * migrate them to an appropriate location.  If this list is empty, then
712 +        * the transaction does not contain any live blocks and we can reuse its
713 +        * space.  If this list is not empty, then we can quickly locate all the
714 +        * live blocks in this transaction.
715 +        */
716 +       struct list_head live_blks;
720 + * An array of transaction_info structures about all the transactions in the
721 + * log.  Since there can only be a limited number of transactions in the log, we
722 + * use a circular buffer to store housekeeping information about transactions.
723 + */
724 +struct transaction_infos {
725 +       struct transaction_info *buf;
726 +       int head;
727 +       int tail;
730 +extern int jbd2_smr_journal_init(journal_t *journal);
731 +extern void jbd2_smr_journal_exit(journal_t *journal);
733 +extern int jbd2_journal_init_jmap_cache(void);
734 +extern void jbd2_journal_destroy_jmap_cache(void);
736 +extern int jbd2_init_transaction_infos(journal_t *journal);
737 +extern void jbd2_free_transaction_infos(journal_t *journal);
738 +extern void jbd2_add_new_transaction_infos(journal_t *journal, tid_t t_tid,
739 +                                         unsigned long log_start);
740 +extern int jbd2_add_mapping(journal_t *journal, struct blk_mapping *mapping);
741 +extern void jbd2_finish_transaction_infos(journal_t *journal);
742 +extern int jbd2_transaction_infos_add(journal_t *journal,
743 +                               transaction_t *transaction,
744 +                               struct blk_mapping *mappings,
745 +                               int nr_mappings);
747 +extern struct jmap_entry *jbd2_jmap_lookup(journal_t *journal, sector_t fsblk,
748 +                                       const char *func);
749 +extern void jbd2_jmap_revoke(journal_t *journal, sector_t fsblk);
750 +extern void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk);
751 +extern int read_block_from_log(journal_t *journal, struct buffer_head *bh,
752 +                              int op_flags, sector_t blk);
754 +#endif
755 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
756 index 6a6f75834cca..114c7636d706 100644
757 --- a/fs/jbd2/journal.c
758 +++ b/fs/jbd2/journal.c
759 @@ -42,6 +42,8 @@
760  #include <linux/ratelimit.h>
761  #include <linux/sched/mm.h>
763 +#include "jmap.h"
765  #define CREATE_TRACE_POINTS
766  #include <trace/events/jbd2.h>
768 @@ -1156,6 +1158,10 @@ static journal_t *journal_init_common(struct block_device *bdev,
769         journal->j_max_batch_time = 15000; /* 15ms */
770         atomic_set(&journal->j_reserved_credits, 0);
772 +       err = jbd2_smr_journal_init(journal);
773 +       if (err)
774 +               goto err_cleanup;
776         /* The journal is marked for error until we succeed with recovery! */
777         journal->j_flags = JBD2_ABORT;
779 @@ -1727,6 +1733,9 @@ int jbd2_journal_destroy(journal_t *journal)
780         if (journal->j_running_transaction)
781                 jbd2_journal_commit_transaction(journal);
783 +       if (journal->j_flags & JBD2_LAZY)
784 +               journal->j_flags |= JBD2_NO_CLEANUP;
786         if (journal->j_flags & JBD2_NO_CLEANUP) {
787                 jbd2_journal_destroy_checkpoint(journal);
788                 journal->j_checkpoint_transactions = NULL;
789 @@ -1783,6 +1792,7 @@ int jbd2_journal_destroy(journal_t *journal)
790                 jbd2_journal_destroy_revoke(journal);
791         if (journal->j_chksum_driver)
792                 crypto_free_shash(journal->j_chksum_driver);
793 +       jbd2_smr_journal_exit(journal);
794         kfree(journal->j_wbuf);
795         kfree(journal);
797 @@ -2693,6 +2703,8 @@ static int __init journal_init_caches(void)
798                 ret = jbd2_journal_init_handle_cache();
799         if (ret == 0)
800                 ret = jbd2_journal_init_transaction_cache();
801 +       if (ret == 0)
802 +               ret = jbd2_journal_init_jmap_cache();
803         return ret;
806 @@ -2702,6 +2714,7 @@ static void jbd2_journal_destroy_caches(void)
807         jbd2_journal_destroy_journal_head_cache();
808         jbd2_journal_destroy_handle_cache();
809         jbd2_journal_destroy_transaction_cache();
810 +       jbd2_journal_destroy_jmap_cache();
811         jbd2_journal_destroy_slabs();
814 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
815 index 81fa9fa7ce9c..a53c7d333199 100644
816 --- a/include/linux/jbd2.h
817 +++ b/include/linux/jbd2.h
818 @@ -769,6 +769,23 @@ struct journal_s
819          */
820         journal_superblock_t    *j_superblock;
822 +       /**
823 +        * @j_jmap: A map from file system blocks to journal blocks
824 +        */
825 +       struct rb_root          j_jmap;
827 +       /**
828 +        * @j_transaction_infos:
829 +        *
830 +        * An array of housekeeping information about live transactions
831 +        */
832 +       struct transaction_infos *j_transaction_infos;
834 +       /**
835 +        * @j_jmap_lock: Protect j_jmap and j_transaction_infos
836 +        */
837 +       rwlock_t                j_jmap_lock;
839         /**
840          * @j_format_version: Version of the superblock format.
841          */
842 @@ -1236,6 +1253,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3,                CSUM_V3)
843                                                  * mode */
844  #define JBD2_REC_ERR   0x080   /* The errno in the sb has been recorded */
845  #define JBD2_NO_CLEANUP        0x100   /* Don't flush empty the journal on shutdown  */
846 +#define JBD2_LAZY      0x200   /* Do lazy journalling  */
848  /*
849   * Function declarations for the journaling transaction and buffer
850 @@ -1509,6 +1527,17 @@ static inline void jbd2_journal_abort_handle(handle_t *handle)
851         handle->h_aborted = 1;
854 +/* Lazy journalling redirection: buffer I/O entry points that take the journal; @func is a caller-supplied name string (presumably used for tracing -- confirm in jmap.c) */
855 +extern void jbd2_submit_bh(journal_t *journal, int rw, int op_flags,
856 +                          struct buffer_head *bh, const char *func);
857 +extern void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags, int nr,
858 +                            struct buffer_head *bhs[], const char *func);
859 +extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
860 +                              const char *func);
861 +extern void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,
862 +                              sector_t block);
865  #endif /* __KERNEL__   */
867  /* Comparison functions for transaction IDs: perform comparisons using
868 diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
869 index 2310b259329f..c9c2af4f80e5 100644
870 --- a/include/trace/events/jbd2.h
871 +++ b/include/trace/events/jbd2.h
872 @@ -10,6 +10,9 @@
874  struct transaction_chp_stats_s;
875  struct transaction_run_stats_s;
876 +struct blk_mapping;       /* argument type of the jbd2_jmap_replace and jbd2_jmap_insert events */
877 +struct jmap_entry;        /* argument type of the jbd2_jmap_replace event */
878 +struct transaction_info;  /* argument type of the jbd2_transaction_infos_add event */
880  TRACE_EVENT(jbd2_checkpoint,
882 @@ -380,6 +383,199 @@ TRACE_EVENT(jbd2_lock_buffer_stall,
883                 __entry->stall_ms)
884  );
886 +TRACE_EVENT(jbd2_jmap_replace,     /* logs an existing jmap entry being remapped to a new log block */
888 +       TP_PROTO(struct jmap_entry *jentry, struct blk_mapping *mapping,
889 +               int t_idx),     /* jentry: entry being replaced; mapping/t_idx: its replacement */
891 +       TP_ARGS(jentry, mapping, t_idx),
893 +       TP_STRUCT__entry(
894 +               __field(sector_t, fsblk         )
895 +               __field(sector_t, old_logblk    )
896 +               __field(sector_t, new_logblk    )
897 +               __field(int, old_t_idx          )
898 +               __field(int, new_t_idx          )
899 +       ),
901 +       TP_fast_assign(
902 +               __entry->fsblk          = mapping->fsblk;
903 +               __entry->old_logblk     = jentry->mapping.logblk;       /* "old" values come from the existing entry */
904 +               __entry->new_logblk     = mapping->logblk;              /* "new" values come from the arguments */
905 +               __entry->old_t_idx       = jentry->t_idx;
906 +               __entry->new_t_idx       = t_idx;
907 +       ),
909 +       TP_printk("remap %llu from %llu to %llu, move from transaction at index %d to transaction at index %d",
910 +                 (unsigned long long) __entry->fsblk,
911 +                 (unsigned long long) __entry->old_logblk,
912 +                 (unsigned long long) __entry->new_logblk,
913 +                 __entry->old_t_idx,
914 +                 __entry->new_t_idx)
917 +TRACE_EVENT(jbd2_jmap_insert,      /* logs a new fsblk -> logblk mapping and the transaction index it is inserted under */
919 +       TP_PROTO(struct blk_mapping *mapping, int t_idx),
921 +       TP_ARGS(mapping, t_idx),
923 +       TP_STRUCT__entry(
924 +               __field(sector_t, fsblk )
925 +               __field(sector_t, logblk)
926 +               __field(int, t_idx)
927 +       ),
929 +       TP_fast_assign(
930 +               __entry->fsblk  = mapping->fsblk;
931 +               __entry->logblk = mapping->logblk;
932 +               __entry->t_idx = t_idx;
933 +       ),
935 +       TP_printk("map %llu to %llu, insert to transaction %d",
936 +                 (unsigned long long) __entry->fsblk,
937 +                 (unsigned long long) __entry->logblk,
938 +                 __entry->t_idx)
941 +TRACE_EVENT(jbd2_jmap_lookup,      /* logs a lookup as "<func>: lookup <fsblk> -> <logblk>" */
943 +       TP_PROTO(sector_t fsblk, sector_t logblk, const char *func),
945 +       TP_ARGS(fsblk, logblk, func),
947 +       TP_STRUCT__entry(
948 +               __field(sector_t, fsblk )
949 +               __field(sector_t, logblk)
950 +               __string(func, func)
951 +       ),
953 +       TP_fast_assign(
954 +               __entry->fsblk  = fsblk;
955 +               __entry->logblk = logblk;
956 +               __assign_str(func, func);       /* func is stored as a string field and printed as the message prefix */
957 +       ),
959 +       TP_printk("%s: lookup %llu -> %llu",
960 +                 __get_str(func),
961 +                 (unsigned long long) __entry->fsblk,
962 +                 (unsigned long long) __entry->logblk)
965 +TRACE_EVENT(jbd2_jmap_read_from_log,       /* logs fsblk, logblk and an uptodate flag; presumably emitted when a block is read from its log copy -- confirm at the call site */
967 +       TP_PROTO(sector_t fsblk, sector_t logblk, int uptodate),
969 +       TP_ARGS(fsblk, logblk, uptodate),
971 +       TP_STRUCT__entry(
972 +               __field(sector_t, fsblk )
973 +               __field(sector_t, logblk)
974 +               __field(int, uptodate)
975 +       ),
977 +       TP_fast_assign(
978 +               __entry->fsblk  = fsblk;
979 +               __entry->logblk = logblk;
980 +               __entry->uptodate = uptodate;
981 +       ),
983 +       TP_printk("fsblk %llu logblk %llu uptodate %d",
984 +                 (unsigned long long) __entry->fsblk,
985 +                 (unsigned long long) __entry->logblk,
986 +                 __entry->uptodate)
989 +TRACE_EVENT(jbd2_jmap_printf,      /* free-form event: logs the string @s verbatim */
991 +       TP_PROTO(const char *s),
993 +       TP_ARGS(s),
995 +       TP_STRUCT__entry(
996 +               __string(s, s)
997 +       ),
999 +       TP_fast_assign(
1000 +               __assign_str(s, s);
1001 +       ),
1003 +       TP_printk("%s",
1004 +               __get_str(s))
1007 +TRACE_EVENT(jbd2_jmap_printf1,    /* free-form event: logs the string @s followed by one block number */
1009 +       TP_PROTO(const char *s, sector_t fsblk),
1011 +       TP_ARGS(s, fsblk),
1013 +       TP_STRUCT__entry(
1014 +               __string(s, s)
1015 +               __field(sector_t, fsblk )
1016 +       ),
1018 +       TP_fast_assign(
1019 +               __assign_str(s, s);
1020 +               __entry->fsblk  = fsblk;
1021 +       ),
1023 +       TP_printk("%s: %llu",
1024 +                 __get_str(s),
1025 +                 (unsigned long long) __entry->fsblk)
1028 +TRACE_EVENT(jbd2_jmap_printf2,    /* free-form event: logs the string @s followed by an fsblk:logblk pair */
1030 +       TP_PROTO(const char *s, sector_t fsblk, sector_t logblk),
1032 +       TP_ARGS(s, fsblk, logblk),
1034 +       TP_STRUCT__entry(
1035 +               __string(s, s)
1036 +               __field(sector_t, fsblk )
1037 +               __field(sector_t, logblk)
1038 +       ),
1040 +       TP_fast_assign(
1041 +               __assign_str(s, s);
1042 +               __entry->fsblk  = fsblk;
1043 +               __entry->logblk = logblk;
1044 +       ),
1046 +       TP_printk("%s: %llu:%llu",
1047 +                 __get_str(s),
1048 +                 (unsigned long long) __entry->fsblk,
1049 +                 (unsigned long long) __entry->logblk)
1052 +TRACE_EVENT(jbd2_transaction_infos_add,   /* logs a transaction_info being inserted at index t_idx, with its tid, offset and mapping count */
1054 +       TP_PROTO(int t_idx, struct transaction_info *ti, int nr_mappings),
1056 +       TP_ARGS(t_idx, ti, nr_mappings),
1058 +       TP_STRUCT__entry(
1059 +               __field(int, t_idx      )
1060 +               __field(tid_t, tid      )
1061 +               __field(sector_t, offset)
1062 +               __field(int, nr_mappings)
1063 +       ),
1065 +       TP_fast_assign(
1066 +               __entry->t_idx  = t_idx;
1067 +               __entry->tid    = ti->tid;              /* tid and offset are copied out of @ti */
1068 +               __entry->offset = ti->offset;
1069 +               __entry->nr_mappings = nr_mappings;
1070 +       ),
1072 +       TP_printk("inserted transaction %u (offset %llu) at index %d with %d mappings",
1073 +                 __entry->tid,
1074 +                 (unsigned long long) __entry->offset,
1075 +                 __entry->t_idx,
1076 +                 __entry->nr_mappings)
1079  #endif /* _TRACE_JBD2_H */
1081  /* This part must be outside protection */