Update propagate-error-values-from-ext4_inline_data_truncate
[ext4-patch-queue.git] / add-support-for-log-metadata-block-tracking-in-log
blob1eaccde38bcc3fae3e36ac54a779483f37adae5a
1 Add support for tracking metadata blocks in the log.
3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 This patch adds two important data structures, jmap and transaction_infos,
6 and supporting functions.  Jmap is a map from a metadata block number to
7 the log block number.  When a transaction commits, jmap is updated with new
8 mappings; when a block is revoked, the mapping for the block is removed
9 from the jmap.  Transaction_infos is an array of transaction_info
10 structures that contain information about transactions currently present in
11 the log.  Each transaction_info contains a linked list of the live blocks in
12 its transaction, and the list is updated after every commit to stay current.
13 The transaction_infos array will be used by the cleaner for identifying live
14 blocks and migrating them to an appropriate location.
16 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
18 ---
19  fs/jbd2/Makefile            |   3 +-
20  fs/jbd2/commit.c            |  17 ++++
21  fs/jbd2/jmap.c              | 440 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
22  fs/jbd2/journal.c           |  17 +++-
23  include/linux/jbd2.h        |  13 +++
24  include/linux/jmap.h        | 129 +++++++++++++++++++++++++
25  include/trace/events/jbd2.h | 169 ++++++++++++++++++++++++++++++++
26  7 files changed, 783 insertions(+), 5 deletions(-)
28 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
29 index 802a3413872a..a54f50b3a06e 100644
30 --- a/fs/jbd2/Makefile
31 +++ b/fs/jbd2/Makefile
32 @@ -4,4 +4,5 @@
34  obj-$(CONFIG_JBD2) += jbd2.o
36 -jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
37 +jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
38 +               jmap.o
39 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
40 index 31f8ca046639..4a249ec74b5c 100644
41 --- a/fs/jbd2/commit.c
42 +++ b/fs/jbd2/commit.c
43 @@ -361,6 +361,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
44         int flags;
45         int err;
46         unsigned long long blocknr;
47 +       struct blk_mapping *mappings;
48 +       int nr_mappings;
49         ktime_t start_time;
50         u64 commit_time;
51         char *tagp = NULL;
52 @@ -562,8 +564,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
53         J_ASSERT(commit_transaction->t_nr_buffers <=
54                  atomic_read(&commit_transaction->t_outstanding_credits));
56 +       nr_mappings = commit_transaction->t_nr_buffers;
57 +       mappings = kmalloc(sizeof(*mappings) * nr_mappings, GFP_NOFS);
58 +       if (!mappings)
59 +               jbd2_journal_abort(journal, -ENOMEM);
61         err = 0;
62         bufs = 0;
63 +       nr_mappings = 0;
64         descriptor = NULL;
65         while (commit_transaction->t_buffers) {
67 @@ -660,6 +668,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
68                         continue;
69                 }
70                 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
71 +               mappings[nr_mappings++] = (struct blk_mapping) {
72 +                       jh2bh(jh)->b_blocknr, blocknr
73 +               };
75                 /* Record the new block's tag in the current descriptor
76                     buffer */
77 @@ -894,6 +905,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
78             transaction can be removed from any checkpoint list it was on
79             before. */
81 +       err = jbd2_transaction_infos_add(journal, commit_transaction,
82 +                                       mappings, nr_mappings);
83 +       if (err)
84 +               jbd2_journal_abort(journal, -ENOMEM);
85 +       kfree(mappings);
87         jbd_debug(3, "JBD2: commit phase 6\n");
89         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
90 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
91 new file mode 100644
92 index 000000000000..7d7b4eb389ed
93 --- /dev/null
94 +++ b/fs/jbd2/jmap.c
95 @@ -0,0 +1,440 @@
96 +#include <linux/jbd2.h>
97 +#include <linux/jmap.h>
98 +#include <trace/events/jbd2.h>
100 +static struct kmem_cache *jbd2_jmap_cache;
102 +int jbd2_journal_init_jmap_cache(void)
104 +       jbd2_jmap_cache = KMEM_CACHE(jmap_entry, SLAB_RECLAIM_ACCOUNT);
105 +       if (!jbd2_jmap_cache)
106 +               return -ENOMEM;
107 +       return 0;
110 +void jbd2_journal_destroy_jmap_cache(void)
112 +       if (jbd2_jmap_cache)
113 +               kmem_cache_destroy(jbd2_jmap_cache);
114 +       jbd2_jmap_cache = NULL;
118 + * Allocate an array of transaction_info structures and initialize the list
119 + * heads inside them.
120 + */
121 +int jbd2_init_transaction_infos(journal_t *journal)
123 +       int i;
124 +       struct transaction_infos *tis = kzalloc(sizeof(*tis), GFP_KERNEL);
125 +       if (!tis)
126 +               return -ENOMEM;
128 +       tis->buf = kzalloc(sizeof(*tis->buf) * MAX_LIVE_TRANSACTIONS,
129 +                       GFP_KERNEL);
130 +       if (!tis->buf) {
131 +               kfree(tis);
132 +               return -ENOMEM;
133 +       }
135 +       for (i = 0; i < MAX_LIVE_TRANSACTIONS; ++i)
136 +               INIT_LIST_HEAD(&tis->buf[i].live_logblks);
138 +       journal->j_transaction_infos = tis;
139 +       return 0;
143 + * Free the array of transaction_info structures.
144 + */
145 +void jbd2_free_transaction_infos(journal_t *journal)
147 +       struct transaction_infos *tis = journal->j_transaction_infos;
148 +       if (!tis)
149 +               return;
150 +       kfree(tis->buf);
151 +       kfree(tis);
155 + * Fill an entry to be stored in jmap.
156 + */
157 +static void fill_entry(struct jmap_entry *entry, struct blk_mapping *mapping,
158 +                       int t_idx, struct list_head *list)
160 +       entry->mapping = *mapping;
161 +       entry->fsblk_last_modified = jiffies;
162 +       entry->t_idx = t_idx;
163 +       list_add(&entry->list, list);
167 + * A helper function for jbd2_transaction_infos_add.  Scans through the mappings
168 + * array, dropping revoked entries from jmap and updating existing entries.
169 + * Moves the new mappings to the beginning of the mappings array and returns the
170 + * number of new mappings.  Should be called with a write lock on j_jmap_lock.
171 + */
172 +static int process_existing_mappings(journal_t *journal,
173 +                               struct transaction_info *ti, int t_idx,
174 +                               struct blk_mapping *mappings, int nr_mappings)
176 +       struct jmap_entry *je;
177 +       int i, nr_new = 0;
179 +       for (i = 0; i < nr_mappings; ++i) {
180 +               je = jbd2_jmap_lookup(journal, mappings[i].fsblk, __func__);
181 +               if (!je) {
182 +                       mappings[nr_new++] = mappings[i];
183 +                       continue;
184 +               }
185 +               if (je->revoked) {
186 +                       rb_erase(&je->rb_node, &journal->j_jmap);
187 +                       kmem_cache_free(jbd2_jmap_cache, je);
188 +               } else {
189 +                       /*
190 +                        * Delete jmap entry from the old transaction's list
191 +                        * before adding it to the new transaction's list.
192 +                        */
193 +                       list_del(&je->list);
194 +                       fill_entry(je, &mappings[i], t_idx, &ti->live_logblks);
195 +                       trace_jbd2_jmap_replace(je, &mappings[i], t_idx);
196 +               }
197 +       }
198 +       return nr_new;
202 + * A helper function for jbd2_transaction_infos_add.  Allocates an array of
203 + * jmap_entry structures and returns the pointer to array if successful.
204 + * Otherwise, returns NULL.
205 + */
206 +static struct jmap_entry **alloc_jmap_entries(int nr_entries)
208 +       struct jmap_entry **jmap_entries;
209 +       int i;
211 +       jmap_entries = kmalloc(sizeof(struct jmap_entry *) * nr_entries,
212 +                       GFP_NOFS);
213 +       if (!jmap_entries)
214 +               return NULL;
216 +       for (i = 0; i < nr_entries; i++) {
217 +               jmap_entries[i] = kmem_cache_zalloc(jbd2_jmap_cache, GFP_NOFS);
218 +               if (!jmap_entries[i])
219 +                       goto out_err;
220 +       }
221 +       return jmap_entries;
223 +out_err:
224 +       for (i = 0; i < nr_entries && jmap_entries[i]; ++i)
225 +               kmem_cache_free(jbd2_jmap_cache, jmap_entries[i]);
226 +       kfree(jmap_entries);
227 +       return NULL;
231 + * A helper function for jbd2_transaction_infos_add.  Adds new mappings to jmap
232 + * and updates the linked list of live logblks of the new transaction.  Should
233 + * be called with write lock on j_jmap_lock.
234 + */
235 +static void add_new_mappings(journal_t *journal, struct transaction_info *ti,
236 +                       int t_idx, struct blk_mapping *mappings,
237 +                       struct jmap_entry **new_entries, int nr_new)
239 +       struct rb_node **p = &journal->j_jmap.rb_node;
240 +       struct rb_node *parent = NULL;
241 +       struct jmap_entry *je;
242 +       int i;
244 +       for (i = 0; i < nr_new; ++i) {
245 +               while (*p) {
246 +                       parent = *p;
247 +                       je = rb_entry(parent, struct jmap_entry, rb_node);
249 +                       if (mappings[i].fsblk < je->mapping.fsblk)
250 +                               p = &(*p)->rb_left;
251 +                       else if (mappings[i].fsblk > je->mapping.fsblk)
252 +                               p = &(*p)->rb_right;
253 +                       else
254 +                               BUG_ON(1);
255 +               }
256 +               fill_entry(new_entries[i], &mappings[i], t_idx,
257 +                       &ti->live_logblks);
258 +               rb_link_node(&new_entries[i]->rb_node, parent, p);
259 +               rb_insert_color(&new_entries[i]->rb_node, &journal->j_jmap);
260 +               trace_jbd2_jmap_insert(&mappings[i], t_idx);
261 +       }
265 + * This function is called after a transaction commits.  It adds new
266 + * transaction_info structure to transaction_infos and populates jmap map with
267 + * the new mappings that are part of the committed transaction.  It also adds
268 + * all the mappings to the linked list that is part of the transaction_info
269 + * structure.
270 + */
271 +int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
272 +                       struct blk_mapping *mappings, int nr_mappings)
274 +       struct transaction_infos *tis = journal->j_transaction_infos;
275 +       int t_idx = tis->head;
276 +       struct transaction_info *ti = &tis->buf[t_idx];
277 +       struct jmap_entry **new_entries = NULL;
278 +       int nr_new = 0;
280 +       /*
281 +        * We are possibly reusing space of an old transaction_info.  The old
282 +        * transaction should not have any live blocks in it.
283 +        */
284 +       BUG_ON(!list_empty(&ti->live_logblks));
286 +       write_lock(&journal->j_jmap_lock);
287 +       nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
288 +                                       nr_mappings);
289 +       write_unlock(&journal->j_jmap_lock);
291 +       if (nr_new == 0)
292 +               goto move_head;
294 +       new_entries = alloc_jmap_entries(nr_new);
295 +       if (!new_entries)
296 +               return -ENOMEM;
298 +       write_lock(&journal->j_jmap_lock);
299 +       add_new_mappings(journal, ti, t_idx, mappings, new_entries, nr_new);
300 +       write_unlock(&journal->j_jmap_lock);
302 +       kfree(new_entries);
304 +move_head:
305 +       write_lock(&journal->j_jmap_lock);
306 +       ti->tid = transaction->t_tid;
307 +       ti->offset = transaction->t_log_start;
308 +       tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1);
309 +       write_unlock(&journal->j_jmap_lock);
311 +       trace_jbd2_transaction_infos_add(t_idx, ti, nr_mappings);
312 +       return 0;
316 + * Look up fsblk in the jmap and return the corresponding jmap entry if found.
317 + * Should be called with a read lock on j_jmap_lock.
318 + */
319 +struct jmap_entry *jbd2_jmap_lookup(journal_t *journal, sector_t fsblk,
320 +                               const char *func)
322 +       struct rb_node *p;
324 +       BUG_ON(!journal);
326 +       for (p = journal->j_jmap.rb_node; p; ) {
327 +               struct jmap_entry *je = rb_entry(p, struct jmap_entry, rb_node);
328 +               if (je->mapping.fsblk > fsblk)
329 +                       p = p->rb_left;
330 +               else if (je->mapping.fsblk < fsblk)
331 +                       p = p->rb_right;
332 +               else {
333 +                       trace_jbd2_jmap_lookup(fsblk, je->mapping.logblk, func);
334 +                       return je;
335 +               }
336 +       }
337 +       trace_jbd2_jmap_lookup(fsblk, 0, func);
338 +       return NULL;
342 + * Revoke a mapping for the fsblk in the jmap.  A lookup for fsblk will return
343 + * NULL and the mapping will be removed from the jmap during commit, unless
344 + * fsblk is reallocated as a metadata block.
345 + */
346 +void jbd2_jmap_revoke(journal_t *journal, sector_t fsblk)
348 +       struct jmap_entry *je;
350 +       write_lock(&journal->j_jmap_lock);
351 +       je = jbd2_jmap_lookup(journal, fsblk, __func__);
352 +       /*
353 +        * For now, since we do not construct jmap from the journal, it is
354 +        * possible that a metadata block that was revoked is not in the jmap.
355 +        * Eventually, this should not be the case and we should have a
356 +        * BUG_ON(!je) here.
357 +        */
358 +       if (je) {
359 +               BUG_ON(je->revoked);
360 +               je->revoked = true;
361 +       }
362 +       write_unlock(&journal->j_jmap_lock);
366 + * Cancel a revoke for the fsblk in the jmap.
367 + */
368 +void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk)
370 +       struct jmap_entry *je;
372 +       write_lock(&journal->j_jmap_lock);
373 +       je = jbd2_jmap_lookup(journal, fsblk, __func__);
374 +       BUG_ON(!je);
375 +       BUG_ON(!je->revoked);
376 +       je->revoked = false;
377 +       write_unlock(&journal->j_jmap_lock);
381 + * Read bh from its most up-to-date location, either from the file system or
382 + * from the log.
383 + *
384 + * If there is no mapping for the bh in jmap, this function acts like submit_bh.
385 + * Otherwise, it submits a read for the block pointed by the mapping located in
386 + * the log.  Upon completion, bh will be filled with the contents of the block
387 + * read from the log.
388 + */
389 +void jbd2_submit_bh(journal_t *journal, int rw, int op_flags,
390 +                   struct buffer_head *bh, const char *func)
392 +       sector_t fsblk = bh->b_blocknr;
393 +       sector_t logblk;
394 +       struct jmap_entry *je;
396 +       BUG_ON(!buffer_locked(bh));
398 +       if (!journal) {
399 +               submit_bh(rw, op_flags, bh);
400 +               return;
401 +       }
403 +       read_lock(&journal->j_jmap_lock);
404 +       je = jbd2_jmap_lookup(journal, fsblk, func);
405 +       if (!je) {
406 +               read_unlock(&journal->j_jmap_lock);
407 +               submit_bh(rw, op_flags, bh);
408 +               return;
409 +       }
410 +       logblk = je->mapping.logblk;
411 +       read_unlock(&journal->j_jmap_lock);
413 +       BUG_ON(rw == WRITE);
414 +       read_block_from_log(journal, bh, op_flags, logblk);
418 + * End_io handler for read_block_from_log that copies the contents of
419 + * log_bh read from log to the embedded bh.
420 + */
421 +static void jbd2_end_log_read(struct buffer_head *log_bh, int uptodate)
423 +       struct buffer_head *bh = log_bh->b_private;
425 +       if (uptodate) {
426 +               trace_jbd2_jmap_printf1("read from log", bh->b_blocknr);
427 +               memcpy(bh->b_data, log_bh->b_data, log_bh->b_size);
428 +       } else {
429 +               trace_jbd2_jmap_printf1("failed to read from log", bh->b_blocknr);
430 +       }
432 +       unlock_buffer(log_bh);
433 +       put_bh(log_bh);
434 +       brelse(log_bh);
436 +       bh->b_end_io(bh, uptodate);
440 + * This function fills |bh| with the contents of the |blk|.  Assume
441 + * jmap maps metadata block 123 to log block 100123.  To read the
442 + * metadata block 123, we obtain a buffer head for it and call
443 + * read_block_from_log passing the obtained buffer head as |bh| and
444 + * 100123 as |blk|.  If block 100123 is cached, then we copy the
445 + * contents to |bh| and return.  Otherwise, we submit a request and
446 + * end_io handler copies the contents of block 100123 to |bh|.
447 + * Returns -ENOMEM if getblk fails, 1 if block is not cached, 0 if
448 + * block is cached.
449 + */
450 +int read_block_from_log(journal_t *journal, struct buffer_head *bh,
451 +                       int op_flags, sector_t blk)
453 +       struct buffer_head *log_bh;
455 +       BUG_ON(!buffer_locked(bh));
457 +       log_bh = __getblk(journal->j_fs_dev, blk, bh->b_size);
458 +       if (unlikely(!log_bh)) {
459 +               bh->b_end_io(bh, 0);
460 +               return -ENOMEM;
461 +       }
463 +       lock_buffer(log_bh);
464 +       if (buffer_uptodate(log_bh)) {
465 +               memcpy(bh->b_data, log_bh->b_data, bh->b_size);
466 +               unlock_buffer(log_bh);
467 +               brelse(log_bh);
468 +               bh->b_end_io(bh, 1);
469 +               return 0;
470 +       }
472 +       log_bh->b_end_io = jbd2_end_log_read;
473 +       log_bh->b_private = bh;
474 +       get_bh(log_bh);
475 +       submit_bh(READ, op_flags, log_bh);
476 +       return 1;
480 + * Copy of ll_rw_block that uses jbd2_submit_bh instead of submit_bh.
481 + */
482 +void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags,
483 +                     int nr, struct buffer_head *bhs[], const char *func)
485 +       int i;
487 +       for (i = 0; i < nr; i++) {
488 +               struct buffer_head *bh = bhs[i];
490 +               if (!trylock_buffer(bh))
491 +                       continue;
492 +               BUG_ON(rw == WRITE);
493 +               if (!buffer_uptodate(bh)) {
494 +                       bh->b_end_io = end_buffer_read_sync;
495 +                       get_bh(bh);
496 +                       jbd2_submit_bh(journal, rw, op_flags, bh, func);
497 +                       continue;
498 +               }
499 +               unlock_buffer(bh);
500 +       }
504 + * Copy of bh_submit_read that uses jbd2_submit_bh instead of submit_bh.
505 + */
506 +int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
507 +                       const char *func)
509 +       BUG_ON(!buffer_locked(bh));
511 +       if (buffer_uptodate(bh)) {
512 +               unlock_buffer(bh);
513 +               return 0;
514 +       }
516 +       get_bh(bh);
517 +       bh->b_end_io = end_buffer_read_sync;
518 +       jbd2_submit_bh(journal, READ, 0, bh, func);
519 +       wait_on_buffer(bh);
520 +       if (buffer_uptodate(bh))
521 +               return 0;
522 +       return -EIO;
525 +int jbd2_smr_journal_init(journal_t *journal)
527 +       journal->j_jmap = RB_ROOT;
528 +       rwlock_init(&journal->j_jmap_lock);
529 +       return jbd2_init_transaction_infos(journal);
532 +void jbd2_smr_journal_exit(journal_t *journal)
534 +       jbd2_free_transaction_infos(journal);
536 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
537 index 927da4956a89..0cbfb7fdc45d 100644
538 --- a/fs/jbd2/journal.c
539 +++ b/fs/jbd2/journal.c
540 @@ -1120,15 +1120,17 @@ static journal_t *journal_init_common(struct block_device *bdev,
541         journal->j_max_batch_time = 15000; /* 15ms */
542         atomic_set(&journal->j_reserved_credits, 0);
544 +       err = jbd2_smr_journal_init(journal);
545 +       if (err)
546 +               goto out_err;
548         /* The journal is marked for error until we succeed with recovery! */
549         journal->j_flags = JBD2_ABORT;
551         /* Set up a default-sized revoke table for the new mount. */
552         err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
553 -       if (err) {
554 -               kfree(journal);
555 -               return NULL;
556 -       }
557 +       if (err)
558 +               goto out_err;
560         spin_lock_init(&journal->j_history_lock);
562 @@ -1162,6 +1164,9 @@ static journal_t *journal_init_common(struct block_device *bdev,
563         journal->j_superblock = (journal_superblock_t *)bh->b_data;
565         return journal;
566 +out_err:
567 +       kfree(journal);
568 +       return NULL;
571  /* jbd2_journal_init_dev and jbd2_journal_init_inode:
572 @@ -1734,6 +1739,7 @@ int jbd2_journal_destroy(journal_t *journal)
573                 jbd2_journal_destroy_revoke(journal);
574         if (journal->j_chksum_driver)
575                 crypto_free_shash(journal->j_chksum_driver);
576 +       jbd2_smr_journal_exit(journal);
577         kfree(journal->j_wbuf);
578         kfree(journal);
580 @@ -2634,6 +2640,8 @@ static int __init journal_init_caches(void)
581                 ret = jbd2_journal_init_handle_cache();
582         if (ret == 0)
583                 ret = jbd2_journal_init_transaction_cache();
584 +       if (ret == 0)
585 +               ret = jbd2_journal_init_jmap_cache();
586         return ret;
589 @@ -2643,6 +2651,7 @@ static void jbd2_journal_destroy_caches(void)
590         jbd2_journal_destroy_journal_head_cache();
591         jbd2_journal_destroy_handle_cache();
592         jbd2_journal_destroy_transaction_cache();
593 +       jbd2_journal_destroy_jmap_cache();
594         jbd2_journal_destroy_slabs();
597 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
598 index dfaa1f4dcb0c..317efb491569 100644
599 --- a/include/linux/jbd2.h
600 +++ b/include/linux/jbd2.h
601 @@ -25,6 +25,7 @@
602  #include <linux/types.h>
603  #include <linux/buffer_head.h>
604  #include <linux/journal-head.h>
605 +#include <linux/jmap.h>
606  #include <linux/stddef.h>
607  #include <linux/mutex.h>
608  #include <linux/timer.h>
609 @@ -732,6 +733,9 @@ jbd2_time_diff(unsigned long start, unsigned long end)
610   *     prior abort)?
611   * @j_sb_buffer: First part of superblock buffer
612   * @j_superblock: Second part of superblock buffer
613 + * @j_jmap: A map from file system blocks to log blocks
614 + * @j_transaction_infos: An array of information structures per live transaction
615 + * @j_jmap_lock: Protect j_jmap and j_transaction_infos
616   * @j_format_version: Version of the superblock format
617   * @j_state_lock: Protect the various scalars in the journal
618   * @j_barrier_count:  Number of processes waiting to create a barrier lock
619 @@ -807,6 +811,15 @@ struct journal_s
620         struct buffer_head      *j_sb_buffer;
621         journal_superblock_t    *j_superblock;
623 +       /* A map from file system blocks to journal blocks */
624 +       struct rb_root          j_jmap;
626 +       /* An array of housekeeping information about live transactions */
627 +       struct transaction_infos *j_transaction_infos;
629 +       /* Protect j_jmap and j_transaction_infos */
630 +       rwlock_t                j_jmap_lock;
632         /* Version of the superblock format */
633         int                     j_format_version;
635 diff --git a/include/linux/jmap.h b/include/linux/jmap.h
636 new file mode 100644
637 index 000000000000..d068358380b0
638 --- /dev/null
639 +++ b/include/linux/jmap.h
640 @@ -0,0 +1,129 @@
641 +#ifndef _LINUX_JMAP_H
642 +#define _LINUX_JMAP_H
644 +#include <linux/buffer_head.h>
645 +#include <linux/journal-head.h>
646 +#include <linux/list.h>
647 +#include <linux/circ_buf.h>
650 + * Maximum number of transactions.  This guides the size of the circular buffer
651 + * in which we store housekeeping information per transaction.  We start
652 + * cleaning either when the circular buffer is full or when we hit the free
653 + * space threshold, whichever happens first.  For now we make it large so
654 + * that cleaning starts only at the free space threshold; a sensible value can
655 + * be found empirically later.  Must be a power of two (head wraps by mask).
656 + */
657 +#define MAX_LIVE_TRANSACTIONS 65536
660 + * Forward declaration for journal_t so that we don't get circular dependency
661 + * between jbd2.h and jmap.h
662 + */
663 +struct journal_s;
664 +typedef struct journal_s journal_t;
667 + * A mapping from file system block to log block.
668 + */
669 +struct blk_mapping {
670 +       sector_t fsblk;
671 +       sector_t logblk;
675 + * An RB-tree entry wrapper for blk_mapping with extra housekeeping information.
676 + */
677 +struct jmap_entry {
678 +       struct rb_node rb_node;
680 +       /* The actual mapping information. */
681 +       struct blk_mapping mapping;
683 +       /*
684 +        * If a block that is mapped gets deleted, the revoked bit is set.  A
685 +        * lookup for a deleted block fails.  If a deleted block gets
686 +        * re-allocated as a metadata block, the mapping is updated and revoked
687 +        * bit is cleared.
688 +        */
689 +       bool revoked;
691 +       /*
692 +        * All log blocks that are part of the same transaction in the log are
693 +        * chained with a linked list.  The root of the list is stored in the
694 +        * transaction_info structure described below.
695 +        */
696 +       struct list_head list;
698 +       /*
699 +        * The last time when fsblk was written again to the journal and
700 +        * therefore was remapped to a different log block.
701 +        */
702 +       unsigned long fsblk_last_modified;
704 +       /*
705 +        * Index of the transaction in the transaction_info_buffer (described
706 +        * below) of which the log block is part of.
707 +        */
708 +       int t_idx;
712 + * Housekeeping information about committed transaction.
713 + */
714 +struct transaction_info {
715 +       /* Id of the transaction */
716 +       tid_t tid;
718 +       /* Offset where the transaction starts in the log */
719 +       sector_t offset;
721 +       /*
722 +        * A list of live log blocks referenced in the RB-tree that belong to
723 +        * this transaction.  It is used during cleaning to locate live blocks
724 +        * and migrate them to appropriate location.  If this list is empty,
725 +        * then the transaction does not contain any live blocks and we can
726 +        * reuse its space.  If this list is not empty, then we can quickly
727 +        * locate all the live blocks in this transaction.
728 +        */
729 +       struct list_head live_logblks;
/*
 * Circular buffer of transaction_info records for the transactions in the
 * log.  buf holds MAX_LIVE_TRANSACTIONS slots; head is the index of the slot
 * the next committed transaction will use and wraps around at the end of the
 * array.
 */
struct transaction_infos {
	struct transaction_info *buf;
	int head;
	/* NOTE(review): tail is not touched by the code visible in this patch. */
	int tail;
};
743 +extern int jbd2_smr_journal_init(journal_t *journal);
744 +extern void jbd2_smr_journal_exit(journal_t *journal);
746 +extern int jbd2_journal_init_jmap_cache(void);
747 +extern void jbd2_journal_destroy_jmap_cache(void);
749 +extern int jbd2_init_transaction_infos(journal_t *journal);
750 +extern void jbd2_free_transaction_infos(journal_t *journal);
751 +extern int jbd2_transaction_infos_add(journal_t *journal,
752 +                               transaction_t *transaction,
753 +                               struct blk_mapping *mappings,
754 +                               int nr_mappings);
756 +extern struct jmap_entry *jbd2_jmap_lookup(journal_t *journal, sector_t fsblk,
757 +                                       const char *func);
758 +extern void jbd2_jmap_revoke(journal_t *journal, sector_t fsblk);
759 +extern void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk);
760 +extern void jbd2_submit_bh(journal_t *journal, int rw, int op_flags,
761 +                          struct buffer_head *bh, const char *func);
762 +extern int read_block_from_log(journal_t *journal, struct buffer_head *bh,
763 +                              int op_flags, sector_t blk);
764 +extern void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags, int nr,
765 +                            struct buffer_head *bhs[], const char *func);
766 +extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
767 +                              const char *func);
769 +#endif
770 diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
771 index c1d1f3eb242d..bc1511a425ec 100644
772 --- a/include/trace/events/jbd2.h
773 +++ b/include/trace/events/jbd2.h
774 @@ -379,6 +379,175 @@ TRACE_EVENT(jbd2_lock_buffer_stall,
775                 __entry->stall_ms)
776  );
778 +TRACE_EVENT(jbd2_jmap_replace,
780 +       TP_PROTO(struct jmap_entry *jentry, struct blk_mapping *mapping, \
781 +               int t_idx),
783 +       TP_ARGS(jentry, mapping, t_idx),
785 +       TP_STRUCT__entry(
786 +               __field(sector_t, fsblk         )
787 +               __field(sector_t, old_logblk    )
788 +               __field(sector_t, new_logblk    )
789 +               __field(int, old_t_idx          )
790 +               __field(int, new_t_idx          )
791 +       ),
793 +       TP_fast_assign(
794 +               __entry->fsblk          = mapping->fsblk;
795 +               __entry->old_logblk     = jentry->mapping.logblk;
796 +               __entry->new_logblk     = mapping->logblk;
797 +               __entry->old_t_idx       = jentry->t_idx;
798 +               __entry->new_t_idx       = t_idx;
799 +       ),
801 +       TP_printk("remap %llu from %llu to %llu, move from transaction at index %d to transaction at index %d",
802 +                 (unsigned long long) __entry->fsblk,
803 +                 (unsigned long long) __entry->old_logblk,
804 +                 (unsigned long long) __entry->new_logblk,
805 +                 __entry->old_t_idx,
806 +                 __entry->new_t_idx)
809 +TRACE_EVENT(jbd2_jmap_insert,
811 +       TP_PROTO(struct blk_mapping *mapping, int t_idx),
813 +       TP_ARGS(mapping, t_idx),
815 +       TP_STRUCT__entry(
816 +               __field(sector_t, fsblk )
817 +               __field(sector_t, logblk)
818 +               __field(int, t_idx)
819 +       ),
821 +       TP_fast_assign(
822 +               __entry->fsblk  = mapping->fsblk;
823 +               __entry->logblk = mapping->logblk;
824 +               __entry->t_idx = t_idx;
825 +       ),
827 +       TP_printk("map %llu to %llu, insert to transaction %d",
828 +                 (unsigned long long) __entry->fsblk,
829 +                 (unsigned long long) __entry->logblk,
830 +                 __entry->t_idx)
833 +TRACE_EVENT(jbd2_jmap_lookup,
835 +       TP_PROTO(sector_t fsblk, sector_t logblk, const char *func),
837 +       TP_ARGS(fsblk, logblk, func),
839 +       TP_STRUCT__entry(
840 +               __field(sector_t, fsblk )
841 +               __field(sector_t, logblk)
842 +               __string(func, func)
843 +       ),
845 +       TP_fast_assign(
846 +               __entry->fsblk  = fsblk;
847 +               __entry->logblk = logblk;
848 +               __assign_str(func, func);
849 +       ),
851 +       TP_printk("%s: lookup %llu -> %llu",
852 +                 __get_str(func),
853 +                 (unsigned long long) __entry->fsblk,
854 +                 (unsigned long long) __entry->logblk)
857 +TRACE_EVENT(jbd2_jmap_printf,
859 +       TP_PROTO(const char *s),
861 +       TP_ARGS(s),
863 +       TP_STRUCT__entry(
864 +               __string(s, s)
865 +       ),
867 +       TP_fast_assign(
868 +               __assign_str(s, s);
869 +       ),
871 +       TP_printk("%s",
872 +               __get_str(s))
875 +TRACE_EVENT(jbd2_jmap_printf1,
877 +       TP_PROTO(const char *s, sector_t fsblk),
879 +       TP_ARGS(s, fsblk),
881 +       TP_STRUCT__entry(
882 +               __string(s, s)
883 +               __field(sector_t, fsblk )
884 +       ),
886 +       TP_fast_assign(
887 +               __assign_str(s, s);
888 +               __entry->fsblk  = fsblk;
889 +       ),
891 +       TP_printk("%s: %llu",
892 +                 __get_str(s),
893 +                 (unsigned long long) __entry->fsblk)
896 +TRACE_EVENT(jbd2_jmap_printf2,
898 +       TP_PROTO(const char *s, sector_t fsblk, sector_t logblk),
900 +       TP_ARGS(s, fsblk, logblk),
902 +       TP_STRUCT__entry(
903 +               __string(s, s)
904 +               __field(sector_t, fsblk )
905 +               __field(sector_t, logblk)
906 +       ),
908 +       TP_fast_assign(
909 +               __assign_str(s, s);
910 +               __entry->fsblk  = fsblk;
911 +               __entry->logblk = logblk;
912 +       ),
914 +       TP_printk("%s: %llu:%llu",
915 +                 __get_str(s),
916 +                 (unsigned long long) __entry->fsblk,
917 +                 (unsigned long long) __entry->logblk)
920 +TRACE_EVENT(jbd2_transaction_infos_add,
922 +       TP_PROTO(int t_idx, struct transaction_info *ti, int nr_mappings),
924 +       TP_ARGS(t_idx, ti, nr_mappings),
926 +       TP_STRUCT__entry(
927 +               __field(int, t_idx      )
928 +               __field(tid_t, tid      )
929 +               __field(sector_t, offset)
930 +               __field(int, nr_mappings)
931 +       ),
933 +       TP_fast_assign(
934 +               __entry->t_idx  = t_idx;
935 +               __entry->tid    = ti->tid;
936 +               __entry->offset = ti->offset;
937 +               __entry->nr_mappings = nr_mappings;
938 +       ),
940 +       TP_printk("inserted transaction %u (offset %llu) at index %d with %d mappings",
941 +                 __entry->tid,
942 +                 (unsigned long long) __entry->offset,
943 +                 __entry->t_idx,
944 +                 __entry->nr_mappings)
947  #endif /* _TRACE_JBD2_H */
949  /* This part must be outside protection */