add patch prezero-allocated-blocks-for-DAX-IO
[ext4-patch-queue.git] / add-support-for-avoiding-data-writes-during-transaction-commits
blob 5a26cd3984d0b0fdffa53388d1b1b299e642c690
1 jbd2: add support for avoiding data writes during transaction commits
3 From: Jan Kara <jack@suse.cz>
5 Currently, when a filesystem needs to make sure data is on permanent
6 storage before committing a transaction, it adds the inode to the
7 transaction's inode list. During transaction commit, jbd2 writes back
8 all dirty buffers that have allocated underlying blocks and waits for
9 the IO to finish. However, when doing writeback for delayed-allocated
10 data, we allocate blocks and immediately submit the data. Thus asking
11 jbd2 to write dirty pages just adds unnecessary work to jbd2, possibly
12 writing back other redirtied blocks.
14 Add support to jbd2 that allows a filesystem to ask jbd2 to only wait
15 for outstanding data writes before committing a transaction, and thus
16 avoid unnecessary writes.
18 Signed-off-by: Jan Kara <jack@suse.cz>
19 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
20 ---
21  fs/ext4/ext4_jbd2.h   |  3 ++-
22  fs/jbd2/commit.c      |  4 ++++
23  fs/jbd2/journal.c     |  3 ++-
24  fs/jbd2/transaction.c | 22 ++++++++++++++++++----
25  fs/ocfs2/journal.h    |  2 +-
26  include/linux/jbd2.h  | 13 +++++++++++--
27  6 files changed, 38 insertions(+), 9 deletions(-)
29 diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
30 index 5f5846211095..f1c940b38b30 100644
31 --- a/fs/ext4/ext4_jbd2.h
32 +++ b/fs/ext4/ext4_jbd2.h
33 @@ -362,7 +362,8 @@ static inline int ext4_journal_force_commit(journal_t *journal)
34  static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
35  {
36         if (ext4_handle_valid(handle))
37 -               return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
38 +               return jbd2_journal_inode_add_write(handle,
39 +                                                   EXT4_I(inode)->jinode);
40         return 0;
41  }
43 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
44 index 517f2de784cf..ad6efdabca2c 100644
45 --- a/fs/jbd2/commit.c
46 +++ b/fs/jbd2/commit.c
47 @@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal,
49         spin_lock(&journal->j_list_lock);
50         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
51 +               if (!(jinode->i_flags & JI_WRITE_DATA))
52 +                       continue;
53                 mapping = jinode->i_vfs_inode->i_mapping;
54                 jinode->i_flags |= JI_COMMIT_RUNNING;
55                 spin_unlock(&journal->j_list_lock);
56 @@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
57         /* For locking, see the comment in journal_submit_data_buffers() */
58         spin_lock(&journal->j_list_lock);
59         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
60 +               if (!(jinode->i_flags & JI_WAIT_DATA))
61 +                       continue;
62                 jinode->i_flags |= JI_COMMIT_RUNNING;
63                 spin_unlock(&journal->j_list_lock);
64                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
65 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
66 index de73a9516a54..ad7de5f9aa69 100644
67 --- a/fs/jbd2/journal.c
68 +++ b/fs/jbd2/journal.c
69 @@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
70  EXPORT_SYMBOL(jbd2_journal_invalidatepage);
71  EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
72  EXPORT_SYMBOL(jbd2_journal_force_commit);
73 -EXPORT_SYMBOL(jbd2_journal_file_inode);
74 +EXPORT_SYMBOL(jbd2_journal_inode_add_write);
75 +EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
76  EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
77  EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
78  EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
79 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
80 index 01e4652d88f6..b2b25d65e994 100644
81 --- a/fs/jbd2/transaction.c
82 +++ b/fs/jbd2/transaction.c
83 @@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
84  /*
85   * File inode in the inode list of the handle's transaction
86   */
87 -int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
88 +static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
89 +                                  unsigned long flags)
90  {
91         transaction_t *transaction = handle->h_transaction;
92         journal_t *journal;
93 @@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
94          * and if jinode->i_next_transaction == transaction, commit code
95          * will only file the inode where we want it.
96          */
97 -       if (jinode->i_transaction == transaction ||
98 -           jinode->i_next_transaction == transaction)
99 +       if ((jinode->i_transaction == transaction ||
100 +           jinode->i_next_transaction == transaction) &&
101 +           (jinode->i_flags & flags) == flags)
102                 return 0;
104         spin_lock(&journal->j_list_lock);
106 +       jinode->i_flags |= flags;
107 +       /* Is inode already attached where we need it? */
108         if (jinode->i_transaction == transaction ||
109             jinode->i_next_transaction == transaction)
110                 goto done;
111 @@ -2523,6 +2526,17 @@ done:
112         return 0;
115 +int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
117 +       return jbd2_journal_file_inode(handle, jinode,
118 +                                      JI_WRITE_DATA | JI_WAIT_DATA);
121 +int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
123 +       return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
126  /*
127   * File truncate and transaction commit interact with each other in a
128   * non-trivial way.  If a transaction writing data block A is
129 diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
130 index f4cd3c3e9fb7..497a4171ef61 100644
131 --- a/fs/ocfs2/journal.h
132 +++ b/fs/ocfs2/journal.h
133 @@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
135  static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
137 -       return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
138 +       return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
141  static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
142 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
143 index fd1083c46c61..39511484ad10 100644
144 --- a/include/linux/jbd2.h
145 +++ b/include/linux/jbd2.h
146 @@ -403,11 +403,19 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
148  /* Flags in jbd_inode->i_flags */
149  #define __JI_COMMIT_RUNNING 0
150 -/* Commit of the inode data in progress. We use this flag to protect us from
151 +#define __JI_WRITE_DATA 1
152 +#define __JI_WAIT_DATA 2
155 + * Commit of the inode data in progress. We use this flag to protect us from
156   * concurrent deletion of inode. We cannot use reference to inode for this
157   * since we cannot afford doing last iput() on behalf of kjournald
158   */
159  #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
160 +/* Write allocated dirty buffers in this inode before commit */
161 +#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
162 +/* Wait for outstanding data writes for this inode before commit */
163 +#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)
165  /**
166   * struct jbd_inode is the structure linking inodes in ordered mode
167 @@ -1270,7 +1278,8 @@ extern int           jbd2_journal_clear_err  (journal_t *);
168  extern int        jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
169  extern int        jbd2_journal_force_commit(journal_t *);
170  extern int        jbd2_journal_force_commit_nested(journal_t *);
171 -extern int        jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
172 +extern int        jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
173 +extern int        jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
174  extern int        jbd2_journal_begin_ordered_truncate(journal_t *journal,
175                                 struct jbd2_inode *inode, loff_t new_size);
176  extern void       jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
177 -- 
178 2.6.2