Cleanup tracepoints in jmap.c
[ext4-patch-queue.git] / add-support-for-log-metadata-block-tracking-in-log
blobbb4c1e240d1740a0b982c916b717d440f13bd506
1 Add support for tracking metadata blocks in the log.
3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 This patch adds two important data structures, jmap and transaction_infos,
6 and supporting functions.  Jmap is a map from a metadata block number to
7 the log block number.  When a transaction commits, jmap is updated with new
8 mappings; when a block is revoked, the mapping for the block is removed
9 from the jmap.  Transaction_infos is an array of transaction_info
10 structures that contain information about transactions currently present in
11 the log.  It contains a linked list of live blocks in a transaction, and it
12 is updated after every commit to keep the list up-to-date.
13 Transaction_infos array will be used by the cleaner for identifying live
14 blocks and migrating them to appropriate location.
16 [ Modified by tytso to conditionalize changes on the JBD2_LAZY journal flag ]
18 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
19 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
20 ---
21  fs/jbd2/Makefile            |   3 +-
22  fs/jbd2/commit.c            |  23 ++++
23  fs/jbd2/jmap.c              | 456 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
24  fs/jbd2/journal.c           |  17 ++-
25  include/linux/jbd2.h        |  14 +++
26  include/linux/jmap.h        | 131 +++++++++++++++++++++
27  include/trace/events/jbd2.h | 193 +++++++++++++++++++++++++++++++
28  7 files changed, 832 insertions(+), 5 deletions(-)
30 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
31 index 802a3413872a..a54f50b3a06e 100644
32 --- a/fs/jbd2/Makefile
33 +++ b/fs/jbd2/Makefile
34 @@ -4,4 +4,5 @@
36  obj-$(CONFIG_JBD2) += jbd2.o
38 -jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
39 +jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
40 +               jmap.o
41 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
42 index 8c514367ba5a..50e1a0b375c5 100644
43 --- a/fs/jbd2/commit.c
44 +++ b/fs/jbd2/commit.c
45 @@ -362,6 +362,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
46         int flags;
47         int err;
48         unsigned long long blocknr;
49 +       struct blk_mapping *mappings = NULL;
50 +       struct blk_mapping *map_ptr = NULL;
51         ktime_t start_time;
52         u64 commit_time;
53         char *tagp = NULL;
54 @@ -563,6 +565,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
55         J_ASSERT(commit_transaction->t_nr_buffers <=
56                  atomic_read(&commit_transaction->t_outstanding_credits));
58 +       if (journal->j_flags & JBD2_LAZY) {
59 +               int nr_mappings = commit_transaction->t_nr_buffers;
61 +               map_ptr = mappings = kmalloc(sizeof(*mappings) * nr_mappings, GFP_NOFS);
62 +               if (!mappings)
63 +                       jbd2_journal_abort(journal, -ENOMEM);
64 +       }
66         err = 0;
67         bufs = 0;
68         descriptor = NULL;
69 @@ -661,6 +671,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
70                         continue;
71                 }
72                 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
73 +               if (map_ptr) {
74 +                       map_ptr->fsblk = jh2bh(jh)->b_blocknr;
75 +                       map_ptr->logblk = blocknr;
76 +                       map_ptr++;
77 +               }
79                 /* Record the new block's tag in the current descriptor
80                     buffer */
81 @@ -895,6 +910,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
82             transaction can be removed from any checkpoint list it was on
83             before. */
85 +       if (mappings) {
86 +               err = jbd2_transaction_infos_add(journal, commit_transaction,
87 +                                                mappings, map_ptr - mappings);
88 +               if (err)
89 +                       jbd2_journal_abort(journal, -ENOMEM);
90 +               kfree(mappings);
91 +       }
93         jbd_debug(3, "JBD2: commit phase 6\n");
95         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
96 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
97 new file mode 100644
98 index 000000000000..31d143ea28ee
99 --- /dev/null
100 +++ b/fs/jbd2/jmap.c
101 @@ -0,0 +1,456 @@
102 +#include <linux/blk_types.h>
103 +#include <linux/jbd2.h>
104 +#include <linux/jmap.h>
105 +#include <trace/events/jbd2.h>
107 +static struct kmem_cache *jbd2_jmap_cache;
109 +int jbd2_journal_init_jmap_cache(void)
111 +       jbd2_jmap_cache = KMEM_CACHE(jmap_entry, SLAB_RECLAIM_ACCOUNT);
112 +       if (!jbd2_jmap_cache)
113 +               return -ENOMEM;
114 +       return 0;
117 +void jbd2_journal_destroy_jmap_cache(void)
119 +       kmem_cache_destroy(jbd2_jmap_cache);
120 +       jbd2_jmap_cache = NULL;
124 + * Allocate an array of transaction_info structures and initialize the list
125 + * heads inside them.
126 + */
127 +int jbd2_init_transaction_infos(journal_t *journal)
129 +       int i;
130 +       struct transaction_infos *tis = kzalloc(sizeof(*tis), GFP_KERNEL);
131 +       if (!tis)
132 +               return -ENOMEM;
134 +       tis->buf = kzalloc(sizeof(*tis->buf) * MAX_LIVE_TRANSACTIONS,
135 +                       GFP_KERNEL);
136 +       if (!tis->buf) {
137 +               kfree(tis);
138 +               return -ENOMEM;
139 +       }
141 +       for (i = 0; i < MAX_LIVE_TRANSACTIONS; ++i)
142 +               INIT_LIST_HEAD(&tis->buf[i].live_logblks);
144 +       journal->j_transaction_infos = tis;
145 +       return 0;
149 + * Free the array of transaction_info structures.
150 + */
151 +void jbd2_free_transaction_infos(journal_t *journal)
153 +       struct transaction_infos *tis = journal->j_transaction_infos;
154 +       if (!tis)
155 +               return;
156 +       kfree(tis->buf);
157 +       kfree(tis);
161 + * Fill an entry to be stored in jmap.
162 + */
163 +static void fill_entry(struct jmap_entry *entry, struct blk_mapping *mapping,
164 +                       int t_idx, struct list_head *list)
166 +       entry->mapping = *mapping;
167 +       entry->fsblk_last_modified = jiffies;
168 +       entry->t_idx = t_idx;
169 +       list_add(&entry->list, list);
173 + * A helper function for jbd2_transaction_infos_add.  Scans through the mappings
174 + * array, dropping revoked entries from jmap and updating existing entries.
175 + * Moves the new mappings to the beginning of the mappings array and returns the
176 + * number of new mappings.  Should be called with a write lock on j_jmap_lock.
177 + */
178 +static int process_existing_mappings(journal_t *journal,
179 +                               struct transaction_info *ti, int t_idx,
180 +                               struct blk_mapping *mappings, int nr_mappings)
182 +       struct jmap_entry *je;
183 +       int i, nr_new = 0;
185 +       for (i = 0; i < nr_mappings; ++i) {
186 +               je = jbd2_jmap_lookup(journal, mappings[i].fsblk, __func__);
187 +               if (!je) {
188 +                       mappings[nr_new++] = mappings[i];
189 +                       continue;
190 +               }
191 +               /*
192 +                * We are either deleting the entry because it was revoked, or
193 +                * we are moving it to the live blocks list of this transaction.
194 +                * In either case, we remove it from its existing list.
195 +                */
196 +               list_del(&je->list);
198 +               if (je->revoked) {
199 +                       rb_erase(&je->rb_node, &journal->j_jmap);
200 +                       kmem_cache_free(jbd2_jmap_cache, je);
201 +               } else {
202 +                       trace_jbd2_jmap_replace(je, &mappings[i], t_idx);
203 +                       fill_entry(je, &mappings[i], t_idx, &ti->live_logblks);
204 +               }
205 +       }
206 +       return nr_new;
210 + * A helper function for jbd2_transaction_infos_add.  Allocates an array of
211 + * jmap_entry structures and returns the pointer to array if successful.
212 + * Otherwise, returns NULL.
213 + */
214 +static struct jmap_entry **alloc_jmap_entries(int nr_entries)
216 +       struct jmap_entry **jmap_entries;
217 +       int i;
219 +       jmap_entries = kmalloc(sizeof(struct jmap_entry *) * nr_entries,
220 +                       GFP_NOFS);
221 +       if (!jmap_entries)
222 +               return NULL;
224 +       for (i = 0; i < nr_entries; i++) {
225 +               jmap_entries[i] = kmem_cache_zalloc(jbd2_jmap_cache, GFP_NOFS);
226 +               if (!jmap_entries[i])
227 +                       goto out_err;
228 +       }
229 +       return jmap_entries;
231 +out_err:
232 +       for (i = 0; i < nr_entries && jmap_entries[i]; ++i)
233 +               kmem_cache_free(jbd2_jmap_cache, jmap_entries[i]);
234 +       kfree(jmap_entries);
235 +       return NULL;
239 + * A helper function for jbd2_transaction_infos_add.  Adds new mappings to jmap
240 + * and updates the linked list of live logblks of the new transaction.  Should
241 + * be called with write lock on j_jmap_lock.
242 + */
243 +static void add_new_mappings(journal_t *journal, struct transaction_info *ti,
244 +                       int t_idx, struct blk_mapping *mappings,
245 +                       struct jmap_entry **new_entries, int nr_new)
247 +       struct rb_node **p;
248 +       struct rb_node *parent = NULL;
249 +       struct jmap_entry *je;
250 +       int i;
252 +       for (i = 0; i < nr_new; ++i) {
253 +               p = &journal->j_jmap.rb_node;
254 +               while (*p) {
255 +                       parent = *p;
256 +                       je = rb_entry(parent, struct jmap_entry, rb_node);
258 +                       if (mappings[i].fsblk < je->mapping.fsblk)
259 +                               p = &(*p)->rb_left;
260 +                       else if (mappings[i].fsblk > je->mapping.fsblk)
261 +                               p = &(*p)->rb_right;
262 +                       else
263 +                               BUG_ON(1);
264 +               }
265 +               fill_entry(new_entries[i], &mappings[i], t_idx,
266 +                       &ti->live_logblks);
267 +               rb_link_node(&new_entries[i]->rb_node, parent, p);
268 +               rb_insert_color(&new_entries[i]->rb_node, &journal->j_jmap);
269 +               trace_jbd2_jmap_insert(&mappings[i], t_idx);
270 +       }
274 + * This function is called after a transaction commits.  It adds new
275 + * transaction_info structure to transaction_infos and populates jmap map with
276 + * the new mappings that are part of the committed transaction.  It also adds
277 + * all the mappings to the linked list that is part of the transaction_info
278 + * structure.
279 + */
280 +int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
281 +                       struct blk_mapping *mappings, int nr_mappings)
283 +       struct transaction_infos *tis = journal->j_transaction_infos;
284 +       int t_idx = tis->head;
285 +       struct transaction_info *ti = &tis->buf[t_idx];
286 +       struct jmap_entry **new_entries = NULL;
287 +       int nr_new = 0;
289 +       /*
290 +        * We are possibly reusing space of an old transaction_info.  The old
291 +        * transaction should not have any live blocks in it.
292 +        */
293 +       BUG_ON(!list_empty(&ti->live_logblks));
295 +       write_lock(&journal->j_jmap_lock);
296 +       nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
297 +                                       nr_mappings);
298 +       write_unlock(&journal->j_jmap_lock);
300 +       if (nr_new == 0)
301 +               goto move_head;
303 +       new_entries = alloc_jmap_entries(nr_new);
304 +       if (!new_entries)
305 +               return -ENOMEM;
307 +       write_lock(&journal->j_jmap_lock);
308 +       add_new_mappings(journal, ti, t_idx, mappings, new_entries, nr_new);
309 +       write_unlock(&journal->j_jmap_lock);
311 +       kfree(new_entries);
313 +move_head:
314 +       write_lock(&journal->j_jmap_lock);
315 +       ti->tid = transaction->t_tid;
316 +       ti->offset = transaction->t_log_start;
317 +       tis->head = (tis->head + 1) & (MAX_LIVE_TRANSACTIONS - 1);
318 +       write_unlock(&journal->j_jmap_lock);
320 +       trace_jbd2_transaction_infos_add(t_idx, ti, nr_mappings);
321 +       return 0;
325 + * Look up fsblk in the jmap and return the corresponding jmap entry if found.
326 + * Should be called with a read lock on j_jmap_lock.
327 + */
328 +struct jmap_entry *jbd2_jmap_lookup(journal_t *journal, sector_t fsblk,
329 +                               const char *func)
331 +       struct rb_node *p;
333 +       BUG_ON(!journal);
335 +       for (p = journal->j_jmap.rb_node; p; ) {
336 +               struct jmap_entry *je = rb_entry(p, struct jmap_entry, rb_node);
337 +               if (je->mapping.fsblk > fsblk)
338 +                       p = p->rb_left;
339 +               else if (je->mapping.fsblk < fsblk)
340 +                       p = p->rb_right;
341 +               else {
342 +                       trace_jbd2_jmap_lookup(fsblk, je->mapping.logblk, func);
343 +                       return je;
344 +               }
345 +       }
346 +       trace_jbd2_jmap_lookup(fsblk, 0, func);
347 +       return NULL;
351 + * Revoke a mapping for the fsblk in the jmap.  A lookup for fsblk will return
352 + * NULL and the mapping will be removed from the jmap during commit, unless
353 + * fsblk is reallocated as a metadata block.
354 + */
355 +void jbd2_jmap_revoke(journal_t *journal, sector_t fsblk)
357 +       struct jmap_entry *je;
359 +       write_lock(&journal->j_jmap_lock);
360 +       je = jbd2_jmap_lookup(journal, fsblk, __func__);
361 +       /*
362 +        * For now, since we do not construct jmap from the journal, it is
363 +        * possible that a metadata block that was revoked is not in the jmap.
364 +        * Eventually, this should not be the case and we should have a
365 +        * BUG_ON(!je) here.
366 +        */
367 +       if (je) {
368 +               BUG_ON(je->revoked);
369 +               je->revoked = true;
370 +       }
371 +       write_unlock(&journal->j_jmap_lock);
375 + * Cancel a revoke for the fsblk in the jmap.
376 + */
377 +void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk)
379 +       struct jmap_entry *je;
381 +       write_lock(&journal->j_jmap_lock);
382 +       je = jbd2_jmap_lookup(journal, fsblk, __func__);
383 +       BUG_ON(!je);
384 +       BUG_ON(!je->revoked);
385 +       je->revoked = false;
386 +       write_unlock(&journal->j_jmap_lock);
390 + * Read bh from its most up-to-date location, either from the file system or
391 + * from the log.
392 + *
393 + * If there is no mapping for the bh in jmap, this function acts like submit_bh.
394 + * Otherwise, it submits a read for the block pointed by the mapping located in
395 + * the log.  Upon completion, bh will be filled with the contents of the block
396 + * read from the log.
397 + */
398 +void jbd2_submit_bh(journal_t *journal, int rw, int op_flags,
399 +                   struct buffer_head *bh, const char *func)
401 +       sector_t fsblk = bh->b_blocknr;
402 +       sector_t logblk;
403 +       struct jmap_entry *je;
405 +       BUG_ON(!buffer_locked(bh));
407 +       if (!journal || !(journal->j_flags & JBD2_LAZY)) {
408 +               submit_bh(rw, op_flags, bh);
409 +               return;
410 +       }
412 +       read_lock(&journal->j_jmap_lock);
413 +       je = jbd2_jmap_lookup(journal, fsblk, func);
414 +       if (!je) {
415 +               read_unlock(&journal->j_jmap_lock);
416 +               submit_bh(rw, op_flags, bh);
417 +               return;
418 +       }
419 +       logblk = je->mapping.logblk;
420 +       read_unlock(&journal->j_jmap_lock);
422 +       BUG_ON(rw == WRITE);
423 +       read_block_from_log(journal, bh, op_flags, logblk);
425 +EXPORT_SYMBOL(jbd2_submit_bh);
428 + * End_io handler for read_block_from_log that copies the contents of
429 + * log_bh read from log to the embedded bh.
430 + */
431 +static void jbd2_end_log_read(struct buffer_head *log_bh, int uptodate)
433 +       struct buffer_head *bh = log_bh->b_private;
435 +       trace_jbd2_jmap_read_from_log(bh->b_blocknr, log_bh->b_blocknr,
436 +                                     uptodate);
437 +       if (uptodate)
438 +               memcpy(bh->b_data, log_bh->b_data, log_bh->b_size);
440 +       unlock_buffer(log_bh);
441 +       put_bh(log_bh);
442 +       brelse(log_bh);
444 +       bh->b_end_io(bh, uptodate);
448 + * This function fills |bh| with the contents of the |blk|.  Assume
449 + * jmap maps metadata block 123 to log block 100123.  To read the
450 + * metadata block 123, we obtain a buffer head for it and call
451 + * read_block_from_log passing the obtained buffer head as |bh| and
452 + * 100123 as |blk|.  If block 100123 is cached, then we copy the
453 + * contents to |bh| and return.  Otherwise, we submit a request and
454 + * end_io handler copies the contents of block 100123 to |bh|.
455 + * Returns -ENOMEM if getblk fails, 1 if block is not cached, 0 if
456 + * block is cached.
457 + */
458 +int read_block_from_log(journal_t *journal, struct buffer_head *bh,
459 +                       int op_flags, sector_t blk)
461 +       struct buffer_head *log_bh;
463 +       BUG_ON(!buffer_locked(bh));
465 +       log_bh = __getblk(journal->j_fs_dev, blk, bh->b_size);
466 +       if (unlikely(!log_bh)) {
467 +               bh->b_end_io(bh, 0);
468 +               return -ENOMEM;
469 +       }
471 +       lock_buffer(log_bh);
472 +       if (buffer_uptodate(log_bh)) {
473 +               memcpy(bh->b_data, log_bh->b_data, bh->b_size);
474 +               unlock_buffer(log_bh);
475 +               brelse(log_bh);
476 +               bh->b_end_io(bh, 1);
477 +               return 0;
478 +       }
480 +       log_bh->b_end_io = jbd2_end_log_read;
481 +       log_bh->b_private = bh;
482 +       get_bh(log_bh);
483 +       submit_bh(READ, op_flags, log_bh);
484 +       return 1;
488 + * Copy of ll_rw_block that uses jbd2_submit_bh instead of submit_bh.
489 + */
490 +void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags,
491 +                     int nr, struct buffer_head *bhs[], const char *func)
493 +       int i;
495 +       for (i = 0; i < nr; i++) {
496 +               struct buffer_head *bh = bhs[i];
498 +               if (!trylock_buffer(bh))
499 +                       continue;
500 +               BUG_ON(rw == WRITE);
501 +               if (!buffer_uptodate(bh)) {
502 +                       bh->b_end_io = end_buffer_read_sync;
503 +                       get_bh(bh);
504 +                       jbd2_submit_bh(journal, rw, op_flags, bh, func);
505 +                       continue;
506 +               }
507 +               unlock_buffer(bh);
508 +       }
510 +EXPORT_SYMBOL(jbd2_ll_rw_block);
513 + * Copy of bh_submit_read that uses jbd2_submit_bh instead of submit_bh.
514 + */
515 +int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
516 +                       const char *func)
518 +       BUG_ON(!buffer_locked(bh));
520 +       if (buffer_uptodate(bh)) {
521 +               unlock_buffer(bh);
522 +               return 0;
523 +       }
525 +       get_bh(bh);
526 +       bh->b_end_io = end_buffer_read_sync;
527 +       jbd2_submit_bh(journal, READ, 0, bh, func);
528 +       wait_on_buffer(bh);
529 +       if (buffer_uptodate(bh))
530 +               return 0;
531 +       return -EIO;
533 +EXPORT_SYMBOL(jbd2_bh_submit_read);
535 +int jbd2_smr_journal_init(journal_t *journal)
537 +       journal->j_jmap = RB_ROOT;
538 +       rwlock_init(&journal->j_jmap_lock);
539 +       return jbd2_init_transaction_infos(journal);
542 +void jbd2_smr_journal_exit(journal_t *journal)
544 +       jbd2_free_transaction_infos(journal);
547 +void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,
548 +                       sector_t block)
550 +       struct buffer_head *bh = __getblk(sb->s_bdev, block, sb->s_blocksize);
551 +       if (likely(bh)) {
552 +               jbd2_ll_rw_block(journal, REQ_OP_READ, REQ_RAHEAD, 1,
553 +                                &bh, __func__);
554 +               brelse(bh);
555 +       }
557 +EXPORT_SYMBOL(jbd2_sb_breadahead);
558 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
559 index eae93cdfbaa7..77a44c86ccc2 100644
560 --- a/fs/jbd2/journal.c
561 +++ b/fs/jbd2/journal.c
562 @@ -1120,15 +1120,17 @@ static journal_t *journal_init_common(struct block_device *bdev,
563         journal->j_max_batch_time = 15000; /* 15ms */
564         atomic_set(&journal->j_reserved_credits, 0);
566 +       err = jbd2_smr_journal_init(journal);
567 +       if (err)
568 +               goto out_err;
570         /* The journal is marked for error until we succeed with recovery! */
571         journal->j_flags = JBD2_ABORT;
573         /* Set up a default-sized revoke table for the new mount. */
574         err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
575 -       if (err) {
576 -               kfree(journal);
577 -               return NULL;
578 -       }
579 +       if (err)
580 +               goto out_err;
582         spin_lock_init(&journal->j_history_lock);
584 @@ -1162,6 +1164,9 @@ static journal_t *journal_init_common(struct block_device *bdev,
585         journal->j_superblock = (journal_superblock_t *)bh->b_data;
587         return journal;
588 +out_err:
589 +       kfree(journal);
590 +       return NULL;
593  /* jbd2_journal_init_dev and jbd2_journal_init_inode:
594 @@ -1741,6 +1746,7 @@ int jbd2_journal_destroy(journal_t *journal)
595                 jbd2_journal_destroy_revoke(journal);
596         if (journal->j_chksum_driver)
597                 crypto_free_shash(journal->j_chksum_driver);
598 +       jbd2_smr_journal_exit(journal);
599         kfree(journal->j_wbuf);
600         kfree(journal);
602 @@ -2641,6 +2647,8 @@ static int __init journal_init_caches(void)
603                 ret = jbd2_journal_init_handle_cache();
604         if (ret == 0)
605                 ret = jbd2_journal_init_transaction_cache();
606 +       if (ret == 0)
607 +               ret = jbd2_journal_init_jmap_cache();
608         return ret;
611 @@ -2650,6 +2658,7 @@ static void jbd2_journal_destroy_caches(void)
612         jbd2_journal_destroy_journal_head_cache();
613         jbd2_journal_destroy_handle_cache();
614         jbd2_journal_destroy_transaction_cache();
615 +       jbd2_journal_destroy_jmap_cache();
616         jbd2_journal_destroy_slabs();
619 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
620 index 9a07b0485784..771588026353 100644
621 --- a/include/linux/jbd2.h
622 +++ b/include/linux/jbd2.h
623 @@ -25,6 +25,7 @@
624  #include <linux/types.h>
625  #include <linux/buffer_head.h>
626  #include <linux/journal-head.h>
627 +#include <linux/jmap.h>
628  #include <linux/stddef.h>
629  #include <linux/mutex.h>
630  #include <linux/timer.h>
631 @@ -732,6 +733,9 @@ jbd2_time_diff(unsigned long start, unsigned long end)
632   *     prior abort)?
633   * @j_sb_buffer: First part of superblock buffer
634   * @j_superblock: Second part of superblock buffer
635 + * @j_map: A map from file system blocks to log blocks
636 + * @j_transaction_infos: An array of information structures per live transaction
637 + * @j_map_lock: Protect j_jmap and j_transaction_infos
638   * @j_format_version: Version of the superblock format
639   * @j_state_lock: Protect the various scalars in the journal
640   * @j_barrier_count:  Number of processes waiting to create a barrier lock
641 @@ -807,6 +811,15 @@ struct journal_s
642         struct buffer_head      *j_sb_buffer;
643         journal_superblock_t    *j_superblock;
645 +       /* A map from file system blocks to journal blocks */
646 +       struct rb_root          j_jmap;
648 +       /* An array of housekeeping information about live transactions */
649 +       struct transaction_infos *j_transaction_infos;
651 +       /* Protect j_jmap and j_transaction_infos */
652 +       rwlock_t                j_jmap_lock;
654         /* Version of the superblock format */
655         int                     j_format_version;
657 @@ -1129,6 +1142,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3,                CSUM_V3)
658                                                  * mode */
659  #define JBD2_REC_ERR   0x080   /* The errno in the sb has been recorded */
660  #define JBD2_NO_CLEANUP        0x100   /* Don't flush empty the journal on shutdown  */
661 +#define JBD2_LAZY      0x200   /* Do lazy journalling  */
663  /*
664   * Function declarations for the journaling transaction and buffer
665 diff --git a/include/linux/jmap.h b/include/linux/jmap.h
666 new file mode 100644
667 index 000000000000..a602ece7cc89
668 --- /dev/null
669 +++ b/include/linux/jmap.h
670 @@ -0,0 +1,131 @@
671 +#ifndef _LINUX_JMAP_H
672 +#define _LINUX_JMAP_H
674 +#include <linux/buffer_head.h>
675 +#include <linux/journal-head.h>
676 +#include <linux/list.h>
677 +#include <linux/circ_buf.h>
680 + * Maximum number of transactions.  This guides the size of the circular buffer
681 + * in which we store housekeeping information per transaction.  We start
682 + * cleaning either when the circular buffer is full or when we hit the free
683 + * space threshold, whichever happens first.  For starters, we make this
684 + * constant large to make sure that we start cleaning only when we hit the free
685 + * space threshold.  Later we can empirically determine a sensible value.
686 + */
687 +#define MAX_LIVE_TRANSACTIONS 65536
690 + * Forward declaration for journal_t so that we don't get circular dependency
691 + * between jbd2.h and jmap.h
692 + */
693 +struct journal_s;
694 +typedef struct journal_s journal_t;
697 + * A mapping from file system block to log block.
698 + */
699 +struct blk_mapping {
700 +       sector_t fsblk;
701 +       sector_t logblk;
705 + * An RB-tree entry wrapper for blk_mapping with extra housekeeping information.
706 + */
707 +struct jmap_entry {
708 +       struct rb_node rb_node;
710 +       /* The actual mapping information. */
711 +       struct blk_mapping mapping;
713 +       /*
714 +        * If a block that is mapped gets deleted, the revoked bit is set.  A
715 +        * lookup for a deleted block fails.  If a deleted block gets
716 +        * re-allocated as a metadata block, the mapping is updated and revoked
717 +        * bit is cleared.
718 +        */
719 +       bool revoked;
721 +       /*
722 +        * All log blocks that are part of the same transaction in the log are
723 +        * chained with a linked list.  The root of the list is stored in the
724 +        * transaction_info structure described below.
725 +        */
726 +       struct list_head list;
728 +       /*
729 +        * The last time when fsblk was written again to the journal and
730 +        * therefore was remapped to a different log block.
731 +        */
732 +       unsigned long fsblk_last_modified;
734 +       /*
735 +        * Index of the transaction in the transaction_info_buffer (described
736 +        * below) of which the log block is part of.
737 +        */
738 +       int t_idx;
742 + * Housekeeping information about committed transaction.
743 + */
744 +struct transaction_info {
745 +       /* Id of the transaction */
746 +       tid_t tid;
748 +       /* Offset where the transaction starts in the log */
749 +       sector_t offset;
751 +       /*
752 +        * A list of live log blocks referenced in the RB-tree that belong to
753 +        * this transaction.  It is used during cleaning to locate live blocks
754 +        * and migrate them to appropriate location.  If this list is empty,
755 +        * then the transaction does not contain any live blocks and we can
756 +        * reuse its space.  If this list is not empty, then we can quickly
757 +        * locate all the live blocks in this transaction.
758 +        */
759 +       struct list_head live_logblks;
763 + * An array of transaction_info structures about all the transactions in the
764 + * log.  Since there can only be a limited number of transactions in the log, we
765 + * use a circular buffer to store housekeeping information about transactions.
766 + */
767 +struct transaction_infos {
768 +       struct transaction_info *buf;
769 +       int head;
770 +       int tail;
773 +extern int jbd2_smr_journal_init(journal_t *journal);
774 +extern void jbd2_smr_journal_exit(journal_t *journal);
776 +extern int jbd2_journal_init_jmap_cache(void);
777 +extern void jbd2_journal_destroy_jmap_cache(void);
779 +extern int jbd2_init_transaction_infos(journal_t *journal);
780 +extern void jbd2_free_transaction_infos(journal_t *journal);
781 +extern int jbd2_transaction_infos_add(journal_t *journal,
782 +                               transaction_t *transaction,
783 +                               struct blk_mapping *mappings,
784 +                               int nr_mappings);
786 +extern struct jmap_entry *jbd2_jmap_lookup(journal_t *journal, sector_t fsblk,
787 +                                       const char *func);
788 +extern void jbd2_jmap_revoke(journal_t *journal, sector_t fsblk);
789 +extern void jbd2_jmap_cancel_revoke(journal_t *journal, sector_t fsblk);
790 +extern void jbd2_submit_bh(journal_t *journal, int rw, int op_flags,
791 +                          struct buffer_head *bh, const char *func);
792 +extern int read_block_from_log(journal_t *journal, struct buffer_head *bh,
793 +                              int op_flags, sector_t blk);
794 +extern void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags, int nr,
795 +                            struct buffer_head *bhs[], const char *func);
796 +extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
797 +                              const char *func);
798 +extern void jbd2_sb_breadahead(journal_t *journal, struct super_block *sb,
799 +                              sector_t block);
801 +#endif
802 diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
803 index c1d1f3eb242d..6d0619bc99af 100644
804 --- a/include/trace/events/jbd2.h
805 +++ b/include/trace/events/jbd2.h
806 @@ -379,6 +379,199 @@ TRACE_EVENT(jbd2_lock_buffer_stall,
807                 __entry->stall_ms)
808  );
810 +TRACE_EVENT(jbd2_jmap_replace,
812 +       TP_PROTO(struct jmap_entry *jentry, struct blk_mapping *mapping, \
813 +               int t_idx),
815 +       TP_ARGS(jentry, mapping, t_idx),
817 +       TP_STRUCT__entry(
818 +               __field(sector_t, fsblk         )
819 +               __field(sector_t, old_logblk    )
820 +               __field(sector_t, new_logblk    )
821 +               __field(int, old_t_idx          )
822 +               __field(int, new_t_idx          )
823 +       ),
825 +       TP_fast_assign(
826 +               __entry->fsblk          = mapping->fsblk;
827 +               __entry->old_logblk     = jentry->mapping.logblk;
828 +               __entry->new_logblk     = mapping->logblk;
829 +               __entry->old_t_idx       = jentry->t_idx;
830 +               __entry->new_t_idx       = t_idx;
831 +       ),
833 +       TP_printk("remap %llu from %llu to %llu, move from transaction at index %d to transaction at index %d",
834 +                 (unsigned long long) __entry->fsblk,
835 +                 (unsigned long long) __entry->old_logblk,
836 +                 (unsigned long long) __entry->new_logblk,
837 +                 __entry->old_t_idx,
838 +                 __entry->new_t_idx)
841 +TRACE_EVENT(jbd2_jmap_insert,
843 +       TP_PROTO(struct blk_mapping *mapping, int t_idx),
845 +       TP_ARGS(mapping, t_idx),
847 +       TP_STRUCT__entry(
848 +               __field(sector_t, fsblk )
849 +               __field(sector_t, logblk)
850 +               __field(int, t_idx)
851 +       ),
853 +       TP_fast_assign(
854 +               __entry->fsblk  = mapping->fsblk;
855 +               __entry->logblk = mapping->logblk;
856 +               __entry->t_idx = t_idx;
857 +       ),
859 +       TP_printk("map %llu to %llu, insert to transaction %d",
860 +                 (unsigned long long) __entry->fsblk,
861 +                 (unsigned long long) __entry->logblk,
862 +                 __entry->t_idx)
865 +TRACE_EVENT(jbd2_jmap_lookup,
867 +       TP_PROTO(sector_t fsblk, sector_t logblk, const char *func),
869 +       TP_ARGS(fsblk, logblk, func),
871 +       TP_STRUCT__entry(
872 +               __field(sector_t, fsblk )
873 +               __field(sector_t, logblk)
874 +               __string(func, func)
875 +       ),
877 +       TP_fast_assign(
878 +               __entry->fsblk  = fsblk;
879 +               __entry->logblk = logblk;
880 +               __assign_str(func, func);
881 +       ),
883 +       TP_printk("%s: lookup %llu -> %llu",
884 +                 __get_str(func),
885 +                 (unsigned long long) __entry->fsblk,
886 +                 (unsigned long long) __entry->logblk)
889 +TRACE_EVENT(jbd2_jmap_read_from_log,
891 +       TP_PROTO(sector_t fsblk, sector_t logblk, int uptodate),
893 +       TP_ARGS(fsblk, logblk, uptodate),
895 +       TP_STRUCT__entry(
896 +               __field(sector_t, fsblk )
897 +               __field(sector_t, logblk)
898 +               __field(int, uptodate)
899 +       ),
901 +       TP_fast_assign(
902 +               __entry->fsblk  = fsblk;
903 +               __entry->logblk = logblk;
904 +               __entry->uptodate = uptodate;
905 +       ),
907 +       TP_printk("fsblk %llu logblk %llu uptodate %d",
908 +                 (unsigned long long) __entry->fsblk,
909 +                 (unsigned long long) __entry->logblk,
910 +                 __entry->uptodate)
913 +TRACE_EVENT(jbd2_jmap_printf,
915 +       TP_PROTO(const char *s),
917 +       TP_ARGS(s),
919 +       TP_STRUCT__entry(
920 +               __string(s, s)
921 +       ),
923 +       TP_fast_assign(
924 +               __assign_str(s, s);
925 +       ),
927 +       TP_printk("%s",
928 +               __get_str(s))
931 +TRACE_EVENT(jbd2_jmap_printf1,
933 +       TP_PROTO(const char *s, sector_t fsblk),
935 +       TP_ARGS(s, fsblk),
937 +       TP_STRUCT__entry(
938 +               __string(s, s)
939 +               __field(sector_t, fsblk )
940 +       ),
942 +       TP_fast_assign(
943 +               __assign_str(s, s);
944 +               __entry->fsblk  = fsblk;
945 +       ),
947 +       TP_printk("%s: %llu",
948 +                 __get_str(s),
949 +                 (unsigned long long) __entry->fsblk)
952 +TRACE_EVENT(jbd2_jmap_printf2,
954 +       TP_PROTO(const char *s, sector_t fsblk, sector_t logblk),
956 +       TP_ARGS(s, fsblk, logblk),
958 +       TP_STRUCT__entry(
959 +               __string(s, s)
960 +               __field(sector_t, fsblk )
961 +               __field(sector_t, logblk)
962 +       ),
964 +       TP_fast_assign(
965 +               __assign_str(s, s);
966 +               __entry->fsblk  = fsblk;
967 +               __entry->logblk = logblk;
968 +       ),
970 +       TP_printk("%s: %llu:%llu",
971 +                 __get_str(s),
972 +                 (unsigned long long) __entry->fsblk,
973 +                 (unsigned long long) __entry->logblk)
976 +TRACE_EVENT(jbd2_transaction_infos_add,
978 +       TP_PROTO(int t_idx, struct transaction_info *ti, int nr_mappings),
980 +       TP_ARGS(t_idx, ti, nr_mappings),
982 +       TP_STRUCT__entry(
983 +               __field(int, t_idx      )
984 +               __field(tid_t, tid      )
985 +               __field(sector_t, offset)
986 +               __field(int, nr_mappings)
987 +       ),
989 +       TP_fast_assign(
990 +               __entry->t_idx  = t_idx;
991 +               __entry->tid    = ti->tid;
992 +               __entry->offset = ti->offset;
993 +               __entry->nr_mappings = nr_mappings;
994 +       ),
996 +       TP_printk("inserted transaction %u (offset %llu) at index %d with %d mappings",
997 +                 __entry->tid,
998 +                 (unsigned long long) __entry->offset,
999 +                 __entry->t_idx,
1000 +                 __entry->nr_mappings)
1003  #endif /* _TRACE_JBD2_H */
1005  /* This part must be outside protection */