archive/avoid-unnecessarily-writing-back-dirty-pages-before-hole-punching

   1 ext4: avoid unnecessarily writing back dirty pages before hole punching
   2
   3 From: Li Wang <liwang@ubuntukylin.com>
   4
   5 For hole punching, ext4 currently writes back synchronously the dirty
   6 pages that fall into the hole.  Since the on-disk data corresponding
   7 to those pages is about to be deleted, it is beneficial to release
   8 those pages directly, whether dirty or not, except in the ordered case.
   9
  10 [ Fixed error return to unlock i_mutex if ext4_begin_ordered_punch_hole()
  11   fails.  Thanks to Wei Yongjun <yongjun_wei@trendmicro.com.cn> for
  12   pointing this out.]
  13
  14 Signed-off-by: Li Wang <liwang@ubuntukylin.com>
  15 Signed-off-by: Yunchuan Wen <yunchuanwen@ubuntukylin.com>
  16 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
  17 Cc: Dmitry Monakhov <dmonakhov@openvz.org>
  18 Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
  19 Reviewed-by: Jan Kara <jack@suse.cz>
  20 ---
  21  fs/ext4/inode.c       | 28 ++++++++++++++++------------
  22  fs/jbd2/journal.c     |  2 +-
  23  fs/jbd2/transaction.c | 29 ++++++-----------------------
  24  include/linux/jbd2.h  | 33 +++++++++++++++++++++++++++++++--
  25  4 files changed, 54 insertions(+), 38 deletions(-)
  26
  27 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
  28 index 0db830d..06136b5 100644
  29 --- a/fs/ext4/inode.c
  30 +++ b/fs/ext4/inode.c
  31 @@ -3466,6 +3466,16 @@ int ext4_can_truncate(struct inode *inode)
  32         return 0;
  33  }
  34
  35 +static inline int ext4_begin_ordered_punch_hole(struct inode *inode,
  36 +                                              loff_t start, loff_t length)
  37 +{
  38 +       if (!EXT4_I(inode)->jinode)
  39 +               return 0;
  40 +       return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode),
  41 +                                                   EXT4_I(inode)->jinode,
  42 +                                                   start, start+length-1);
  43 +}
  44 +
  45  /*
  46   * ext4_punch_hole: punches a hole in a file by releaseing the blocks
  47   * associated with the given offset and length
  48 @@ -3482,7 +3492,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  49         struct inode *inode = file_inode(file);
  50         struct super_block *sb = inode->i_sb;
  51         ext4_lblk_t first_block, stop_block;
  52 -       struct address_space *mapping = inode->i_mapping;
  53         loff_t first_block_offset, last_block_offset;
  54         handle_t *handle;
  55         unsigned int credits;
  56 @@ -3498,17 +3507,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  57
  58         trace_ext4_punch_hole(inode, offset, length);
  59
  60 -       /*
  61 -        * Write out all dirty pages to avoid race conditions
  62 -        * Then release them.
  63 -        */
  64 -       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
  65 -               ret = filemap_write_and_wait_range(mapping, offset,
  66 -                                                  offset + length - 1);
  67 -               if (ret)
  68 -                       return ret;
  69 -       }
  70 -
  71         mutex_lock(&inode->i_mutex);
  72         /* It's not possible punch hole on append only file */
  73         if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
  74 @@ -3537,6 +3535,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  75         first_block_offset = round_up(offset, sb->s_blocksize);
  76         last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
  77
  78 +       if (ext4_should_order_data(inode)) {
  79 +               ret = ext4_begin_ordered_punch_hole(inode, offset, length);
  80 +               if (ret)
  81 +                       goto out_mutex;
  82 +       }
  83 +
  84         /* Now release the pages and zero block aligned part of pages*/
  85         if (last_block_offset > first_block_offset)
  86                 truncate_pagecache_range(inode, first_block_offset,
  87 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
  88 index 915dd57..4c8b8d4 100644
  89 --- a/fs/jbd2/journal.c
  90 +++ b/fs/jbd2/journal.c
  91 @@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit);
  92  EXPORT_SYMBOL(jbd2_journal_file_inode);
  93  EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
  94  EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
  95 -EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
  96 +EXPORT_SYMBOL(jbd2_journal_begin_ordered_punch_hole);
  97  EXPORT_SYMBOL(jbd2_inode_cache);
  98
  99  static void __journal_abort_soft (journal_t *journal, int errno);
 100 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
 101 index dd422e6..91d62e1 100644
 102 --- a/fs/jbd2/transaction.c
 103 +++ b/fs/jbd2/transaction.c
 104 @@ -2419,29 +2419,10 @@ done:
 105         return 0;
 106  }
 107
 108 -/*
 109 - * File truncate and transaction commit interact with each other in a
 110 - * non-trivial way.  If a transaction writing data block A is
 111 - * committing, we cannot discard the data by truncate until we have
 112 - * written them.  Otherwise if we crashed after the transaction with
 113 - * write has committed but before the transaction with truncate has
 114 - * committed, we could see stale data in block A.  This function is a
 115 - * helper to solve this problem.  It starts writeout of the truncated
 116 - * part in case it is in the committing transaction.
 117 - *
 118 - * Filesystem code must call this function when inode is journaled in
 119 - * ordered mode before truncation happens and after the inode has been
 120 - * placed on orphan list with the new inode size. The second condition
 121 - * avoids the race that someone writes new data and we start
 122 - * committing the transaction after this function has been called but
 123 - * before a transaction for truncate is started (and furthermore it
 124 - * allows us to optimize the case where the addition to orphan list
 125 - * happens in the same transaction as write --- we don't have to write
 126 - * any data in such case).
 127 - */
 128 -int jbd2_journal_begin_ordered_truncate(journal_t *journal,
 129 +
 130 +int jbd2_journal_begin_ordered_punch_hole(journal_t *journal,
 131                                         struct jbd2_inode *jinode,
 132 -                                       loff_t new_size)
 133 +                                       loff_t start, loff_t end)
 134  {
 135         transaction_t *inode_trans, *commit_trans;
 136         int ret = 0;
 137 @@ -2460,10 +2441,12 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
 138         spin_unlock(&journal->j_list_lock);
 139         if (inode_trans == commit_trans) {
 140                 ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
 141 -                       new_size, LLONG_MAX);
 142 +                       start, end);
 143                 if (ret)
 144                         jbd2_journal_abort(journal, ret);
 145         }
 146  out:
 147         return ret;
 148  }
 149 +
 150 +
 151 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
 152 index 0302f3f..5f3c094 100644
 153 --- a/include/linux/jbd2.h
 154 +++ b/include/linux/jbd2.h
 155 @@ -1157,12 +1157,41 @@ extern int         jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *
 156  extern int        jbd2_journal_force_commit(journal_t *);
 157  extern int        jbd2_journal_force_commit_nested(journal_t *);
 158  extern int        jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
 159 -extern int        jbd2_journal_begin_ordered_truncate(journal_t *journal,
 160 -                               struct jbd2_inode *inode, loff_t new_size);
 161 +extern int        jbd2_journal_begin_ordered_punch_hole(journal_t *,
 162 +                                       struct jbd2_inode *,
 163 +                                       loff_t, loff_t);
 164  extern void       jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
 165  extern void       jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
 166
 167  /*
 168 + * File truncate and transaction commit interact with each other in a
 169 + * non-trivial way.  If a transaction writing data block A is
 170 + * committing, we cannot discard the data by truncate until we have
 171 + * written them.  Otherwise if we crashed after the transaction with
 172 + * write has committed but before the transaction with truncate has
 173 + * committed, we could see stale data in block A.  This function is a
 174 + * helper to solve this problem.  It starts writeout of the truncated
 175 + * part in case it is in the committing transaction.
 176 + *
 177 + * Filesystem code must call this function when inode is journaled in
 178 + * ordered mode before truncation happens and after the inode has been
 179 + * placed on orphan list with the new inode size. The second condition
 180 + * avoids the race that someone writes new data and we start
 181 + * committing the transaction after this function has been called but
 182 + * before a transaction for truncate is started (and furthermore it
 183 + * allows us to optimize the case where the addition to orphan list
 184 + * happens in the same transaction as write --- we don't have to write
 185 + * any data in such case).
 186 + */
 187 +static inline int jbd2_journal_begin_ordered_truncate(journal_t *journal,
 188 +                                       struct jbd2_inode *jinode,
 189 +                                       loff_t new_size)
 190 +{
 191 +       return jbd2_journal_begin_ordered_punch_hole(journal, jinode,
 192 +                                                 new_size, LLONG_MAX);
 193 +}
 194 +
 195 +/*
 196   * journal_head management
 197   */
 198  struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh);