1 dax: rip out get_block based IO support
3 From: Jan Kara <jack@suse.cz>
No one uses the DAX functions that take a get_block callback anymore. Rip
them out and update the documentation.
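
As an example of what the conversion looks like on the filesystem side, a
DAX-aware ->read_iter now reduces to a call to dax_iomap_rw(). A sketch
modeled on the in-tree ext2 conversion (ext2_iomap_ops is the iomap ops
table ext2 already provides; other filesystems pass their own):

	static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;
		ssize_t ret;

		if (!iov_iter_count(to))
			return 0; /* skip atime */

		inode_lock_shared(inode);
		/* dax_iomap_rw() replaces the old dax_do_io()/get_block path */
		ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops);
		inode_unlock_shared(inode);

		file_accessed(iocb->ki_filp);
		return ret;
	}
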
8 Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
9 Signed-off-by: Jan Kara <jack@suse.cz>
10 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
12 Documentation/filesystems/dax.txt | 22 +--
13 fs/dax.c | 315 --------------------------------------
14 include/linux/dax.h | 12 --
15 3 files changed, 11 insertions(+), 338 deletions(-)
17 diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
18 index 23d18b8a49d5..a7e6e14aeb08 100644
19 --- a/Documentation/filesystems/dax.txt
20 +++ b/Documentation/filesystems/dax.txt
21 @@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
22 Filesystem support consists of
- adding support to mark inodes as being DAX by setting the S_DAX flag in
  i_flags
25 -- implementing the direct_IO address space operation, and calling
26 - dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing ->read_iter and ->write_iter operations that use dax_iomap_rw()
+  when the inode has the S_DAX flag set
29 - implementing an mmap file operation for DAX files which sets the
30 VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
31 - include handlers for fault, pmd_fault and page_mkwrite (which should
32 - probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
33 - appropriate get_block() callback)
34 -- calling dax_truncate_page() instead of block_truncate_page() for DAX files
35 -- calling dax_zero_page_range() instead of zero_user() for DAX files
+  include handlers for fault, pmd_fault, page_mkwrite and pfn_mkwrite. These
+  handlers should probably call dax_iomap_fault() (for the fault and
+  page_mkwrite handlers), dax_iomap_pmd_fault() and dax_pfn_mkwrite(),
+  passing the appropriate iomap operations where needed
40 +- calling iomap_zero_range() passing appropriate iomap operations instead of
41 + block_truncate_page() for DAX files
42 - ensuring that there is sufficient locking between reads, writes,
43 truncates and page faults
45 -The get_block() callback passed to the DAX functions may return
46 -uninitialised extents. If it does, it must ensure that simultaneous
47 -calls to get_block() (for example by a page-fault racing with a read()
48 -or a write()) work correctly.
+The iomap handlers used for block allocation must ensure that newly allocated
+blocks are zeroed out and converted to written extents before being returned,
+to avoid exposing uninitialized data through mmap.
53 These filesystems may be used for inspiration:
54 - ext2: see Documentation/filesystems/ext2.txt
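
For the fault-handler side described in the documentation above, the converted
handlers become thin wrappers around dax_iomap_fault(). A sketch modeled on
ext2's conversion (ext2_iomap_ops and the dax_sem fault-vs-truncate lock are
ext2 specifics; other filesystems substitute their own iomap ops and locking):

	static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vma->vm_file);
		struct ext2_inode_info *ei = EXT2_I(inode);
		int ret;

		if (vmf->flags & FAULT_FLAG_WRITE) {
			sb_start_pagefault(inode->i_sb);
			file_update_time(vma->vm_file);
		}
		/* serialize page faults against truncate / hole punch */
		down_read(&ei->dax_sem);

		ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);

		up_read(&ei->dax_sem);
		if (vmf->flags & FAULT_FLAG_WRITE)
			sb_end_pagefault(inode->i_sb);
		return ret;
	}
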
55 diff --git a/fs/dax.c b/fs/dax.c
index 28af41b9da3a..ad131cd2605d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
59 @@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-static bool buffer_written(struct buffer_head *bh)
-{
-	return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-static sector_t to_sector(const struct buffer_head *bh,
-		const struct inode *inode)
-{
-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-	return sector;
-}
-
76 -static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
77 - loff_t start, loff_t end, get_block_t get_block,
78 - struct buffer_head *bh)
80 - loff_t pos = start, max = start, bh_max = start;
82 - struct block_device *bdev = NULL;
83 - int rw = iov_iter_rw(iter), rc;
85 - struct blk_dax_ctl dax = {
86 - .addr = ERR_PTR(-EIO),
88 - unsigned blkbits = inode->i_blkbits;
89 - sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
93 - end = min(end, i_size_read(inode));
98 - long page = pos >> PAGE_SHIFT;
99 - sector_t block = page << (PAGE_SHIFT - blkbits);
100 - unsigned first = pos - (block << blkbits);
103 - if (pos == bh_max) {
104 - bh->b_size = PAGE_ALIGN(end - pos);
106 - rc = get_block(inode, block, bh, rw == WRITE);
109 - bh_max = pos - first + bh->b_size;
112 - * We allow uninitialized buffers for writes
113 - * beyond EOF as those cannot race with faults
116 - (buffer_new(bh) && block < file_blks) ||
117 - (rw == WRITE && buffer_unwritten(bh)));
119 - unsigned done = bh->b_size -
120 - (bh_max - (pos - first));
121 - bh->b_blocknr += done >> blkbits;
122 - bh->b_size -= done;
125 - hole = rw == READ && !buffer_written(bh);
127 - size = bh->b_size - first;
129 - dax_unmap_atomic(bdev, &dax);
130 - dax.sector = to_sector(bh, inode);
131 - dax.size = bh->b_size;
132 - map_len = dax_map_atomic(bdev, &dax);
138 - size = map_len - first;
141 - * pos + size is one past the last offset for IO,
142 - * so pos + size can overflow loff_t at extreme offsets.
143 - * Cast to u64 to catch this and get the true minimum.
145 - max = min_t(u64, pos + size, end);
148 - if (iov_iter_rw(iter) == WRITE) {
149 - len = copy_from_iter_pmem(dax.addr, max - pos, iter);
151 - len = copy_to_iter((void __force *) dax.addr, max - pos,
154 - len = iov_iter_zero(max - pos, iter);
162 - if (!IS_ERR(dax.addr))
166 - dax_unmap_atomic(bdev, &dax);
168 - return (pos == start) ? rc : pos - start;
172 - * dax_do_io - Perform I/O to a DAX file
173 - * @iocb: The control block for this I/O
174 - * @inode: The file which the I/O is directed at
175 - * @iter: The addresses to do I/O from or to
176 - * @get_block: The filesystem method used to translate file offsets to blocks
177 - * @end_io: A filesystem callback for I/O completion
178 - * @flags: See below
180 - * This function uses the same locking scheme as do_blockdev_direct_IO:
181 - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
182 - * caller for writes. For reads, we take and release the i_mutex ourselves.
183 - * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		struct iov_iter *iter, get_block_t get_block,
-		dio_iodone_t end_io, int flags)
-{
191 - struct buffer_head bh;
192 - ssize_t retval = -EINVAL;
193 - loff_t pos = iocb->ki_pos;
194 - loff_t end = pos + iov_iter_count(iter);
196 - memset(&bh, 0, sizeof(bh));
197 - bh.b_bdev = inode->i_sb->s_bdev;
199 - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
202 - /* Protects against truncate */
203 - if (!(flags & DIO_SKIP_DIO_COUNT))
204 - inode_dio_begin(inode);
206 - retval = dax_io(inode, iter, pos, end, get_block, &bh);
208 - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
209 - inode_unlock(inode);
214 - err = end_io(iocb, pos, retval, bh.b_private);
219 - if (!(flags & DIO_SKIP_DIO_COUNT))
220 - inode_dio_end(inode);
223 -EXPORT_SYMBOL_GPL(dax_do_io);
226 * DAX radix tree locking
228 @@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping,
232 - * dax_fault - handle a page fault on a DAX file
233 - * @vma: The virtual memory area where the fault occurred
234 - * @vmf: The description of the fault
235 - * @get_block: The filesystem method used to translate file offsets to blocks
237 - * When a page fault occurs, filesystems may call this helper in their
238 - * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-		get_block_t get_block)
-{
244 - struct file *file = vma->vm_file;
245 - struct address_space *mapping = file->f_mapping;
246 - struct inode *inode = mapping->host;
248 - struct buffer_head bh;
249 - unsigned long vaddr = (unsigned long)vmf->virtual_address;
250 - unsigned blkbits = inode->i_blkbits;
257 - * Check whether offset isn't beyond end of file now. Caller is supposed
258 - * to hold locks serializing us with truncate / punch hole so this is
261 - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
262 - if (vmf->pgoff >= size)
263 - return VM_FAULT_SIGBUS;
265 - memset(&bh, 0, sizeof(bh));
266 - block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
267 - bh.b_bdev = inode->i_sb->s_bdev;
268 - bh.b_size = PAGE_SIZE;
270 - entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
271 - if (IS_ERR(entry)) {
272 - error = PTR_ERR(entry);
276 - error = get_block(inode, block, &bh, 0);
277 - if (!error && (bh.b_size < PAGE_SIZE))
278 - error = -EIO; /* fs corruption? */
282 - if (vmf->cow_page) {
283 - struct page *new_page = vmf->cow_page;
284 - if (buffer_written(&bh))
285 - error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
286 - bh.b_size, new_page, vaddr);
288 - clear_user_highpage(new_page, vaddr);
291 - if (!radix_tree_exceptional_entry(entry)) {
293 - return VM_FAULT_LOCKED;
295 - vmf->entry = entry;
296 - return VM_FAULT_DAX_LOCKED;
299 - if (!buffer_mapped(&bh)) {
300 - if (vmf->flags & FAULT_FLAG_WRITE) {
301 - error = get_block(inode, block, &bh, 1);
302 - count_vm_event(PGMAJFAULT);
303 - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
304 - major = VM_FAULT_MAJOR;
305 - if (!error && (bh.b_size < PAGE_SIZE))
310 - return dax_load_hole(mapping, entry, vmf);
314 - /* Filesystem should not return unwritten buffers to us! */
315 - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
316 - error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
317 - bh.b_size, &entry, vma, vmf);
319 - put_locked_mapping_entry(mapping, vmf->pgoff, entry);
321 - if (error == -ENOMEM)
322 - return VM_FAULT_OOM | major;
323 - /* -EBUSY is fine, somebody else faulted on the same PTE */
324 - if ((error < 0) && (error != -EBUSY))
325 - return VM_FAULT_SIGBUS | major;
326 - return VM_FAULT_NOPAGE | major;
328 -EXPORT_SYMBOL_GPL(dax_fault);
331 * dax_pfn_mkwrite - handle first write to DAX page
332 * @vma: The virtual memory area where the fault occurred
333 * @vmf: The description of the fault
334 @@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
336 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
339 - * dax_zero_page_range - zero a range within a page of a DAX file
340 - * @inode: The file being truncated
341 - * @from: The file offset that is being truncated to
342 - * @length: The number of bytes to zero
343 - * @get_block: The filesystem method used to translate file offsets to blocks
345 - * This function can be called by a filesystem when it is zeroing part of a
346 - * page in a DAX file. This is intended for hole-punch operations. If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
-		get_block_t get_block)
-{
353 - struct buffer_head bh;
354 - pgoff_t index = from >> PAGE_SHIFT;
355 - unsigned offset = from & (PAGE_SIZE-1);
358 - /* Block boundary? Nothing to do */
361 - if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
364 - memset(&bh, 0, sizeof(bh));
365 - bh.b_bdev = inode->i_sb->s_bdev;
366 - bh.b_size = PAGE_SIZE;
367 - err = get_block(inode, index, &bh, 0);
368 - if (err < 0 || !buffer_written(&bh))
371 - return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
374 -EXPORT_SYMBOL_GPL(dax_zero_page_range);
377 - * dax_truncate_page - handle a partial page being truncated in a DAX file
378 - * @inode: The file being truncated
379 - * @from: The file offset that is being truncated to
380 - * @get_block: The filesystem method used to translate file offsets to blocks
382 - * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
-{
-	unsigned length = PAGE_ALIGN(from) - from;
-	return dax_zero_page_range(inode, from, length, get_block);
-}
-EXPORT_SYMBOL_GPL(dax_truncate_page);
392 #ifdef CONFIG_FS_IOMAP
393 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
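
With dax_truncate_page() and dax_zero_page_range() gone, filesystems zero the
partial page at truncate time via iomap_zero_range() instead. Roughly what
ext2_setsize() does after its conversion (a sketch; newsize is the new file
size and ext2_iomap_ops is as above):

	if (IS_DAX(inode))
		error = iomap_zero_range(inode, newsize,
					 PAGE_ALIGN(newsize) - newsize,
					 NULL, &ext2_iomap_ops);
	else
		error = block_truncate_page(inode->i_mapping,
					    newsize, ext2_get_block);
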
395 diff --git a/include/linux/dax.h b/include/linux/dax.h
396 index 8d1a5c47945f..0afade8bd3d7 100644
397 --- a/include/linux/dax.h
398 +++ b/include/linux/dax.h
399 @@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
401 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
402 struct iomap_ops *ops);
403 -ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
404 - get_block_t, dio_iodone_t, int flags);
405 -int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
406 -int dax_truncate_page(struct inode *, loff_t from, get_block_t);
407 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
408 struct iomap_ops *ops);
409 -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
410 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
411 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
412 pgoff_t index, void *entry, bool wake_all);
413 @@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags, get_block_t gb)
-{
-	return VM_FAULT_FALLBACK;
-}
-
423 #ifdef CONFIG_FS_DAX_PMD
424 static inline unsigned int dax_radix_order(void *entry)
426 @@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
429 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
430 -#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
432 static inline bool vma_is_dax(struct vm_area_struct *vma)