fs/xfs/xfs_file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_inode.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_inode_item.h"
  16 #include "xfs_bmap.h"
  17 #include "xfs_bmap_util.h"
  18 #include "xfs_dir2.h"
  19 #include "xfs_dir2_priv.h"
  20 #include "xfs_ioctl.h"
  21 #include "xfs_trace.h"
  22 #include "xfs_log.h"
  23 #include "xfs_icache.h"
  24 #include "xfs_pnfs.h"
  25 #include "xfs_iomap.h"
  26 #include "xfs_reflink.h"
  27
  28 #include <linux/falloc.h>
  29 #include <linux/backing-dev.h>
  30 #include <linux/mman.h>
  31 #include <linux/fadvise.h>
  32
  33 static const struct vm_operations_struct xfs_file_vm_ops;
  34
  35 int
  36 xfs_update_prealloc_flags(
  37         struct xfs_inode        *ip,
  38         enum xfs_prealloc_flags flags)
  39 {
  40         struct xfs_trans        *tp;
  41         int                     error;
  42
  43         error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
  44                         0, 0, 0, &tp);
  45         if (error)
  46                 return error;
  47
  48         xfs_ilock(ip, XFS_ILOCK_EXCL);
  49         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
  50
  51         if (!(flags & XFS_PREALLOC_INVISIBLE)) {
  52                 VFS_I(ip)->i_mode &= ~S_ISUID;
  53                 if (VFS_I(ip)->i_mode & S_IXGRP)
  54                         VFS_I(ip)->i_mode &= ~S_ISGID;
  55                 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
  56         }
  57
  58         if (flags & XFS_PREALLOC_SET)
  59                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
  60         if (flags & XFS_PREALLOC_CLEAR)
  61                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
  62
  63         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  64         if (flags & XFS_PREALLOC_SYNC)
  65                 xfs_trans_set_sync(tp);
  66         return xfs_trans_commit(tp);
  67 }
  68
  69 /*
  70  * Fsync operations on directories are much simpler than on regular files,
  71  * as there is no file data to flush, and thus also no need for explicit
  72  * cache flush operations, and there are no non-transaction metadata updates
  73  * on directories either.
  74  */
  75 STATIC int
  76 xfs_dir_fsync(
  77         struct file             *file,
  78         loff_t                  start,
  79         loff_t                  end,
  80         int                     datasync)
  81 {
  82         struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
  83         struct xfs_mount        *mp = ip->i_mount;
  84         xfs_lsn_t               lsn = 0;
  85
  86         trace_xfs_dir_fsync(ip);
  87
  88         xfs_ilock(ip, XFS_ILOCK_SHARED);
  89         if (xfs_ipincount(ip))
  90                 lsn = ip->i_itemp->ili_last_lsn;
  91         xfs_iunlock(ip, XFS_ILOCK_SHARED);
  92
  93         if (!lsn)
  94                 return 0;
  95         return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
  96 }
  97
  98 STATIC int
  99 xfs_file_fsync(
 100         struct file             *file,
 101         loff_t                  start,
 102         loff_t                  end,
 103         int                     datasync)
 104 {
 105         struct inode            *inode = file->f_mapping->host;
 106         struct xfs_inode        *ip = XFS_I(inode);
 107         struct xfs_mount        *mp = ip->i_mount;
 108         int                     error = 0;
 109         int                     log_flushed = 0;
 110         xfs_lsn_t               lsn = 0;
 111
 112         trace_xfs_file_fsync(ip);
 113
 114         error = file_write_and_wait_range(file, start, end);
 115         if (error)
 116                 return error;
 117
 118         if (XFS_FORCED_SHUTDOWN(mp))
 119                 return -EIO;
 120
 121         xfs_iflags_clear(ip, XFS_ITRUNCATED);
 122
 123         /*
 124          * If we have an RT and/or log subvolume we need to make sure to flush
 125          * the write cache the device used for file data first.  This is to
 126          * ensure newly written file data make it to disk before logging the new
 127          * inode size in case of an extending write.
 128          */
 129         if (XFS_IS_REALTIME_INODE(ip))
 130                 xfs_blkdev_issue_flush(mp->m_rtdev_targp);
 131         else if (mp->m_logdev_targp != mp->m_ddev_targp)
 132                 xfs_blkdev_issue_flush(mp->m_ddev_targp);
 133
 134         /*
 135          * All metadata updates are logged, which means that we just have to
 136          * flush the log up to the latest LSN that touched the inode. If we have
 137          * concurrent fsync/fdatasync() calls, we need them to all block on the
 138          * log force before we clear the ili_fsync_fields field. This ensures
 139          * that we don't get a racing sync operation that does not wait for the
 140          * metadata to hit the journal before returning. If we race with
 141          * clearing the ili_fsync_fields, then all that will happen is the log
 142          * force will do nothing as the lsn will already be on disk. We can't
 143          * race with setting ili_fsync_fields because that is done under
 144          * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
 145          * until after the ili_fsync_fields is cleared.
 146          */
 147         xfs_ilock(ip, XFS_ILOCK_SHARED);
 148         if (xfs_ipincount(ip)) {
 149                 if (!datasync ||
 150                     (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
 151                         lsn = ip->i_itemp->ili_last_lsn;
 152         }
 153
 154         if (lsn) {
 155                 error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
 156                 ip->i_itemp->ili_fsync_fields = 0;
 157         }
 158         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 159
 160         /*
 161          * If we only have a single device, and the log force about was
 162          * a no-op we might have to flush the data device cache here.
 163          * This can only happen for fdatasync/O_DSYNC if we were overwriting
 164          * an already allocated file and thus do not have any metadata to
 165          * commit.
 166          */
 167         if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
 168             mp->m_logdev_targp == mp->m_ddev_targp)
 169                 xfs_blkdev_issue_flush(mp->m_ddev_targp);
 170
 171         return error;
 172 }
 173
 174 STATIC ssize_t
 175 xfs_file_dio_aio_read(
 176         struct kiocb            *iocb,
 177         struct iov_iter         *to)
 178 {
 179         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
 180         size_t                  count = iov_iter_count(to);
 181         ssize_t                 ret;
 182
 183         trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 184
 185         if (!count)
 186                 return 0; /* skip atime */
 187
 188         file_accessed(iocb->ki_filp);
 189
 190         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 191         ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
 192                         is_sync_kiocb(iocb));
 193         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 194
 195         return ret;
 196 }
 197
 198 static noinline ssize_t
 199 xfs_file_dax_read(
 200         struct kiocb            *iocb,
 201         struct iov_iter         *to)
 202 {
 203         struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
 204         size_t                  count = iov_iter_count(to);
 205         ssize_t                 ret = 0;
 206
 207         trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
 208
 209         if (!count)
 210                 return 0; /* skip atime */
 211
 212         if (iocb->ki_flags & IOCB_NOWAIT) {
 213                 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
 214                         return -EAGAIN;
 215         } else {
 216                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
 217         }
 218
 219         ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
 220         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 221
 222         file_accessed(iocb->ki_filp);
 223         return ret;
 224 }
 225
 226 STATIC ssize_t
 227 xfs_file_buffered_aio_read(
 228         struct kiocb            *iocb,
 229         struct iov_iter         *to)
 230 {
 231         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
 232         ssize_t                 ret;
 233
 234         trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 235
 236         if (iocb->ki_flags & IOCB_NOWAIT) {
 237                 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
 238                         return -EAGAIN;
 239         } else {
 240                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
 241         }
 242         ret = generic_file_read_iter(iocb, to);
 243         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 244
 245         return ret;
 246 }
 247
 248 STATIC ssize_t
 249 xfs_file_read_iter(
 250         struct kiocb            *iocb,
 251         struct iov_iter         *to)
 252 {
 253         struct inode            *inode = file_inode(iocb->ki_filp);
 254         struct xfs_mount        *mp = XFS_I(inode)->i_mount;
 255         ssize_t                 ret = 0;
 256
 257         XFS_STATS_INC(mp, xs_read_calls);
 258
 259         if (XFS_FORCED_SHUTDOWN(mp))
 260                 return -EIO;
 261
 262         if (IS_DAX(inode))
 263                 ret = xfs_file_dax_read(iocb, to);
 264         else if (iocb->ki_flags & IOCB_DIRECT)
 265                 ret = xfs_file_dio_aio_read(iocb, to);
 266         else
 267                 ret = xfs_file_buffered_aio_read(iocb, to);
 268
 269         if (ret > 0)
 270                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
 271         return ret;
 272 }
 273
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * (updating *@iolock to match) when security information has to be removed
 * or when the write starts beyond i_size and EOF zeroing may be needed.
 *
 * Returns the result of generic_write_checks() when that is <= 0, an error
 * from layout breaking or EOF zeroing, or otherwise the result of
 * file_modified().
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	/* Length of @from on entry; used to re-expand it before a restart. */
	size_t			count = iov_iter_count(from);
	/* Set once inode_dio_wait() has been run, so we only drain once. */
	bool			drained_dio = false;
	loff_t			isize;

	/*
	 * Every lock upgrade below drops the iolock, so all checks have to be
	 * redone from here once the lock has been retaken.
	 */
restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(inode);
	if (iocb->ki_pos > isize) {
		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				/*
				 * Restore the iterator to its length on entry
				 * before the checks are rerun (presumably
				 * because generic_write_checks() may have
				 * shortened it - confirm).
				 */
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}

		/* Zero the range from the current EOF up to the write offset. */
		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
		error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
				NULL, &xfs_buffered_write_iomap_ops);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	return file_modified(file);
}
 369
 370 static int
 371 xfs_dio_write_end_io(
 372         struct kiocb            *iocb,
 373         ssize_t                 size,
 374         int                     error,
 375         unsigned                flags)
 376 {
 377         struct inode            *inode = file_inode(iocb->ki_filp);
 378         struct xfs_inode        *ip = XFS_I(inode);
 379         loff_t                  offset = iocb->ki_pos;
 380         unsigned int            nofs_flag;
 381
 382         trace_xfs_end_io_direct_write(ip, offset, size);
 383
 384         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 385                 return -EIO;
 386
 387         if (error)
 388                 return error;
 389         if (!size)
 390                 return 0;
 391
 392         /*
 393          * Capture amount written on completion as we can't reliably account
 394          * for it on submission.
 395          */
 396         XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
 397
 398         /*
 399          * We can allocate memory here while doing writeback on behalf of
 400          * memory reclaim.  To avoid memory allocation deadlocks set the
 401          * task-wide nofs context for the following operations.
 402          */
 403         nofs_flag = memalloc_nofs_save();
 404
 405         if (flags & IOMAP_DIO_COW) {
 406                 error = xfs_reflink_end_cow(ip, offset, size);
 407                 if (error)
 408                         goto out;
 409         }
 410
 411         /*
 412          * Unwritten conversion updates the in-core isize after extent
 413          * conversion but before updating the on-disk size. Updating isize any
 414          * earlier allows a racing dio read to find unwritten extents before
 415          * they are converted.
 416          */
 417         if (flags & IOMAP_DIO_UNWRITTEN) {
 418                 error = xfs_iomap_write_unwritten(ip, offset, size, true);
 419                 goto out;
 420         }
 421
 422         /*
 423          * We need to update the in-core inode size here so that we don't end up
 424          * with the on-disk inode size being outside the in-core inode size. We
 425          * have no other method of updating EOF for AIO, so always do it here
 426          * if necessary.
 427          *
 428          * We need to lock the test/set EOF update as we can be racing with
 429          * other IO completions here to update the EOF. Failing to serialise
 430          * here can result in EOF moving backwards and Bad Things Happen when
 431          * that occurs.
 432          */
 433         spin_lock(&ip->i_flags_lock);
 434         if (offset + size > i_size_read(inode)) {
 435                 i_size_write(inode, offset + size);
 436                 spin_unlock(&ip->i_flags_lock);
 437                 error = xfs_setfilesize(ip, offset, size);
 438         } else {
 439                 spin_unlock(&ip->i_flags_lock);
 440         }
 441
 442 out:
 443         memalloc_nofs_restore(nofs_flag);
 444         return error;
 445 }
 446
/*
 * Direct I/O write callbacks, passed to iomap_dio_rw() by
 * xfs_file_dio_aio_write().  Only the completion hook is provided.
 */
static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};
 450
/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the tricky to
 * follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * The iolock is taken and released inside this function; no lock is held on
 * return.  Errors are indicated by negative return values:
 *   -EINVAL   the position or length is not aligned to the device logical
 *             sector size
 *   -EREMCHG  block-unaligned write to a CoW inode; xfs_file_write_iter()
 *             treats this as a request to fall back to a buffered write
 *   -EAGAIN   IOCB_NOWAIT was set and the write is block-unaligned or the
 *             iolock is contended
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	/* Non-zero when either end of the write is not fs-block aligned. */
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * Don't take the exclusive iolock here unless the I/O is unaligned to
	 * the file system block size.  We don't need to consider the EOF
	 * extension case here because xfs_file_aio_write_checks() will relock
	 * the inode as necessary for EOF zeroing cases and fill out the new
	 * inode size as appropriate.
	 */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
		unaligned_io = 1;

		/*
		 * We can't properly handle unaligned direct I/O to reflink
		 * files yet, as we can't unshare a partial block.
		 */
		if (xfs_is_cow_inode(ip)) {
			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
			return -EREMCHG;
		}
		iolock = XFS_IOLOCK_EXCL;
	} else {
		iolock = XFS_IOLOCK_SHARED;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* unaligned dio always waits, bail */
		if (unaligned_io)
			return -EAGAIN;
		if (!xfs_ilock_nowait(ip, iolock))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, iolock);
	}

	/* May upgrade the lock to exclusive; iolock is updated to match. */
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	/* Re-read the length now that the write checks have run. */
	count = iov_iter_count(from);

	/*
	 * If we are doing unaligned IO, we can't allow any other overlapping IO
	 * in-flight at the same time or we risk data corruption. Wait for all
	 * other IO to drain before we submit. If the IO is aligned, demote the
	 * iolock if we had to take the exclusive lock in
	 * xfs_file_aio_write_checks() for other reasons.
	 */
	if (unaligned_io) {
		inode_dio_wait(inode);
	} else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
	/*
	 * If unaligned, this is the only IO in-flight. Wait on it before we
	 * release the iolock to prevent subsequent overlapping IO.
	 */
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops,
			   is_sync_kiocb(iocb) || unaligned_io);
out:
	xfs_iunlock(ip, iolock);

	/*
	 * No fallback to buffered IO on errors for XFS, direct IO will either
	 * complete fully or fail.
	 */
	ASSERT(ret < 0 || ret == count);
	return ret;
}
 567
 568 static noinline ssize_t
 569 xfs_file_dax_write(
 570         struct kiocb            *iocb,
 571         struct iov_iter         *from)
 572 {
 573         struct inode            *inode = iocb->ki_filp->f_mapping->host;
 574         struct xfs_inode        *ip = XFS_I(inode);
 575         int                     iolock = XFS_IOLOCK_EXCL;
 576         ssize_t                 ret, error = 0;
 577         size_t                  count;
 578         loff_t                  pos;
 579
 580         if (iocb->ki_flags & IOCB_NOWAIT) {
 581                 if (!xfs_ilock_nowait(ip, iolock))
 582                         return -EAGAIN;
 583         } else {
 584                 xfs_ilock(ip, iolock);
 585         }
 586
 587         ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 588         if (ret)
 589                 goto out;
 590
 591         pos = iocb->ki_pos;
 592         count = iov_iter_count(from);
 593
 594         trace_xfs_file_dax_write(ip, count, pos);
 595         ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
 596         if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 597                 i_size_write(inode, iocb->ki_pos);
 598                 error = xfs_setfilesize(ip, pos, ret);
 599         }
 600 out:
 601         xfs_iunlock(ip, iolock);
 602         if (error)
 603                 return error;
 604
 605         if (ret > 0) {
 606                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 607
 608                 /* Handle various SYNC-type writes */
 609                 ret = generic_write_sync(iocb, ret);
 610         }
 611         return ret;
 612 }
 613
 614 STATIC ssize_t
 615 xfs_file_buffered_aio_write(
 616         struct kiocb            *iocb,
 617         struct iov_iter         *from)
 618 {
 619         struct file             *file = iocb->ki_filp;
 620         struct address_space    *mapping = file->f_mapping;
 621         struct inode            *inode = mapping->host;
 622         struct xfs_inode        *ip = XFS_I(inode);
 623         ssize_t                 ret;
 624         int                     enospc = 0;
 625         int                     iolock;
 626
 627         if (iocb->ki_flags & IOCB_NOWAIT)
 628                 return -EOPNOTSUPP;
 629
 630 write_retry:
 631         iolock = XFS_IOLOCK_EXCL;
 632         xfs_ilock(ip, iolock);
 633
 634         ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 635         if (ret)
 636                 goto out;
 637
 638         /* We can write back this queue in page reclaim */
 639         current->backing_dev_info = inode_to_bdi(inode);
 640
 641         trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
 642         ret = iomap_file_buffered_write(iocb, from,
 643                         &xfs_buffered_write_iomap_ops);
 644         if (likely(ret >= 0))
 645                 iocb->ki_pos += ret;
 646
 647         /*
 648          * If we hit a space limit, try to free up some lingering preallocated
 649          * space before returning an error. In the case of ENOSPC, first try to
 650          * write back all dirty inodes to free up some of the excess reserved
 651          * metadata space. This reduces the chances that the eofblocks scan
 652          * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
 653          * also behaves as a filter to prevent too many eofblocks scans from
 654          * running at the same time.
 655          */
 656         if (ret == -EDQUOT && !enospc) {
 657                 xfs_iunlock(ip, iolock);
 658                 enospc = xfs_inode_free_quota_eofblocks(ip);
 659                 if (enospc)
 660                         goto write_retry;
 661                 enospc = xfs_inode_free_quota_cowblocks(ip);
 662                 if (enospc)
 663                         goto write_retry;
 664                 iolock = 0;
 665         } else if (ret == -ENOSPC && !enospc) {
 666                 struct xfs_eofblocks eofb = {0};
 667
 668                 enospc = 1;
 669                 xfs_flush_inodes(ip->i_mount);
 670
 671                 xfs_iunlock(ip, iolock);
 672                 eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
 673                 xfs_icache_free_eofblocks(ip->i_mount, &eofb);
 674                 xfs_icache_free_cowblocks(ip->i_mount, &eofb);
 675                 goto write_retry;
 676         }
 677
 678         current->backing_dev_info = NULL;
 679 out:
 680         if (iolock)
 681                 xfs_iunlock(ip, iolock);
 682
 683         if (ret > 0) {
 684                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 685                 /* Handle various SYNC-type writes */
 686                 ret = generic_write_sync(iocb, ret);
 687         }
 688         return ret;
 689 }
 690
 691 STATIC ssize_t
 692 xfs_file_write_iter(
 693         struct kiocb            *iocb,
 694         struct iov_iter         *from)
 695 {
 696         struct file             *file = iocb->ki_filp;
 697         struct address_space    *mapping = file->f_mapping;
 698         struct inode            *inode = mapping->host;
 699         struct xfs_inode        *ip = XFS_I(inode);
 700         ssize_t                 ret;
 701         size_t                  ocount = iov_iter_count(from);
 702
 703         XFS_STATS_INC(ip->i_mount, xs_write_calls);
 704
 705         if (ocount == 0)
 706                 return 0;
 707
 708         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 709                 return -EIO;
 710
 711         if (IS_DAX(inode))
 712                 return xfs_file_dax_write(iocb, from);
 713
 714         if (iocb->ki_flags & IOCB_DIRECT) {
 715                 /*
 716                  * Allow a directio write to fall back to a buffered
 717                  * write *only* in the case that we're doing a reflink
 718                  * CoW.  In all other directio scenarios we do not
 719                  * allow an operation to fall back to buffered mode.
 720                  */
 721                 ret = xfs_file_dio_aio_write(iocb, from);
 722                 if (ret != -EREMCHG)
 723                         return ret;
 724         }
 725
 726         return xfs_file_buffered_aio_write(iocb, from);
 727 }
 728
 729 static void
 730 xfs_wait_dax_page(
 731         struct inode            *inode)
 732 {
 733         struct xfs_inode        *ip = XFS_I(inode);
 734
 735         xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 736         schedule();
 737         xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 738 }
 739
 740 static int
 741 xfs_break_dax_layouts(
 742         struct inode            *inode,
 743         bool                    *retry)
 744 {
 745         struct page             *page;
 746
 747         ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
 748
 749         page = dax_layout_busy_page(inode->i_mapping);
 750         if (!page)
 751                 return 0;
 752
 753         *retry = true;
 754         return ___wait_var_event(&page->_refcount,
 755                         atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
 756                         0, 0, xfs_wait_dax_page(inode));
 757 }
 758
 759 int
 760 xfs_break_layouts(
 761         struct inode            *inode,
 762         uint                    *iolock,
 763         enum layout_break_reason reason)
 764 {
 765         bool                    retry;
 766         int                     error;
 767
 768         ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
 769
 770         do {
 771                 retry = false;
 772                 switch (reason) {
 773                 case BREAK_UNMAP:
 774                         error = xfs_break_dax_layouts(inode, &retry);
 775                         if (error || retry)
 776                                 break;
 777                         /* fall through */
 778                 case BREAK_WRITE:
 779                         error = xfs_break_leased_layouts(inode, iolock, &retry);
 780                         break;
 781                 default:
 782                         WARN_ON_ONCE(1);
 783                         error = -EINVAL;
 784                 }
 785         } while (error == 0 && retry);
 786
 787         return error;
 788 }
 789
 790 #define XFS_FALLOC_FL_SUPPORTED                                         \
 791                 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
 792                  FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
 793                  FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
 794
 795 STATIC long
 796 xfs_file_fallocate(
 797         struct file             *file,
 798         int                     mode,
 799         loff_t                  offset,
 800         loff_t                  len)
 801 {
 802         struct inode            *inode = file_inode(file);
 803         struct xfs_inode        *ip = XFS_I(inode);
 804         long                    error;
 805         enum xfs_prealloc_flags flags = 0;
 806         uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 807         loff_t                  new_size = 0;
 808         bool                    do_file_insert = false;
 809
 810         if (!S_ISREG(inode->i_mode))
 811                 return -EINVAL;
 812         if (mode & ~XFS_FALLOC_FL_SUPPORTED)
 813                 return -EOPNOTSUPP;
 814
 815         xfs_ilock(ip, iolock);
 816         error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
 817         if (error)
 818                 goto out_unlock;
 819
 820         /*
 821          * Must wait for all AIO to complete before we continue as AIO can
 822          * change the file size on completion without holding any locks we
 823          * currently hold. We must do this first because AIO can update both
 824          * the on disk and in memory inode sizes, and the operations that follow
 825          * require the in-memory size to be fully up-to-date.
 826          */
 827         inode_dio_wait(inode);
 828
 829         /*
 830          * Now AIO and DIO has drained we flush and (if necessary) invalidate
 831          * the cached range over the first operation we are about to run.
 832          *
 833          * We care about zero and collapse here because they both run a hole
 834          * punch over the range first. Because that can zero data, and the range
 835          * of invalidation for the shift operations is much larger, we still do
 836          * the required flush for collapse in xfs_prepare_shift().
 837          *
 838          * Insert has the same range requirements as collapse, and we extend the
 839          * file first which can zero data. Hence insert has the same
 840          * flush/invalidate requirements as collapse and so they are both
 841          * handled at the right time by xfs_prepare_shift().
 842          */
 843         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
 844                     FALLOC_FL_COLLAPSE_RANGE)) {
 845                 error = xfs_flush_unmap_range(ip, offset, len);
 846                 if (error)
 847                         goto out_unlock;
 848         }
 849
 850         if (mode & FALLOC_FL_PUNCH_HOLE) {
 851                 error = xfs_free_file_space(ip, offset, len);
 852                 if (error)
 853                         goto out_unlock;
 854         } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
 855                 unsigned int blksize_mask = i_blocksize(inode) - 1;
 856
 857                 if (offset & blksize_mask || len & blksize_mask) {
 858                         error = -EINVAL;
 859                         goto out_unlock;
 860                 }
 861
 862                 /*
 863                  * There is no need to overlap collapse range with EOF,
 864                  * in which case it is effectively a truncate operation
 865                  */
 866                 if (offset + len >= i_size_read(inode)) {
 867                         error = -EINVAL;
 868                         goto out_unlock;
 869                 }
 870
 871                 new_size = i_size_read(inode) - len;
 872
 873                 error = xfs_collapse_file_space(ip, offset, len);
 874                 if (error)
 875                         goto out_unlock;
 876         } else if (mode & FALLOC_FL_INSERT_RANGE) {
 877                 unsigned int    blksize_mask = i_blocksize(inode) - 1;
 878                 loff_t          isize = i_size_read(inode);
 879
 880                 if (offset & blksize_mask || len & blksize_mask) {
 881                         error = -EINVAL;
 882                         goto out_unlock;
 883                 }
 884
 885                 /*
 886                  * New inode size must not exceed ->s_maxbytes, accounting for
 887                  * possible signed overflow.
 888                  */
 889                 if (inode->i_sb->s_maxbytes - isize < len) {
 890                         error = -EFBIG;
 891                         goto out_unlock;
 892                 }
 893                 new_size = isize + len;
 894
 895                 /* Offset should be less than i_size */
 896                 if (offset >= isize) {
 897                         error = -EINVAL;
 898                         goto out_unlock;
 899                 }
 900                 do_file_insert = true;
 901         } else {
 902                 flags |= XFS_PREALLOC_SET;
 903
 904                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 905                     offset + len > i_size_read(inode)) {
 906                         new_size = offset + len;
 907                         error = inode_newsize_ok(inode, new_size);
 908                         if (error)
 909                                 goto out_unlock;
 910                 }
 911
 912                 if (mode & FALLOC_FL_ZERO_RANGE) {
 913                         /*
 914                          * Punch a hole and prealloc the range.  We use a hole
 915                          * punch rather than unwritten extent conversion for two
 916                          * reasons:
 917                          *
 918                          *   1.) Hole punch handles partial block zeroing for us.
 919                          *   2.) If prealloc returns ENOSPC, the file range is
 920                          *       still zero-valued by virtue of the hole punch.
 921                          */
 922                         unsigned int blksize = i_blocksize(inode);
 923
 924                         trace_xfs_zero_file_space(ip);
 925
 926                         error = xfs_free_file_space(ip, offset, len);
 927                         if (error)
 928                                 goto out_unlock;
 929
 930                         len = round_up(offset + len, blksize) -
 931                               round_down(offset, blksize);
 932                         offset = round_down(offset, blksize);
 933                 } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
 934                         error = xfs_reflink_unshare(ip, offset, len);
 935                         if (error)
 936                                 goto out_unlock;
 937                 } else {
 938                         /*
 939                          * If always_cow mode we can't use preallocations and
 940                          * thus should not create them.
 941                          */
 942                         if (xfs_is_always_cow_inode(ip)) {
 943                                 error = -EOPNOTSUPP;
 944                                 goto out_unlock;
 945                         }
 946                 }
 947
 948                 if (!xfs_is_always_cow_inode(ip)) {
 949                         error = xfs_alloc_file_space(ip, offset, len,
 950                                                      XFS_BMAPI_PREALLOC);
 951                         if (error)
 952                                 goto out_unlock;
 953                 }
 954         }
 955
 956         if (file->f_flags & O_DSYNC)
 957                 flags |= XFS_PREALLOC_SYNC;
 958
 959         error = xfs_update_prealloc_flags(ip, flags);
 960         if (error)
 961                 goto out_unlock;
 962
 963         /* Change file size if needed */
 964         if (new_size) {
 965                 struct iattr iattr;
 966
 967                 iattr.ia_valid = ATTR_SIZE;
 968                 iattr.ia_size = new_size;
 969                 error = xfs_vn_setattr_size(file_dentry(file), &iattr);
 970                 if (error)
 971                         goto out_unlock;
 972         }
 973
 974         /*
 975          * Perform hole insertion now that the file size has been
 976          * updated so that if we crash during the operation we don't
 977          * leave shifted extents past EOF and hence losing access to
 978          * the data that is contained within them.
 979          */
 980         if (do_file_insert)
 981                 error = xfs_insert_file_space(ip, offset, len);
 982
 983 out_unlock:
 984         xfs_iunlock(ip, iolock);
 985         return error;
 986 }
 987
 988 STATIC int
 989 xfs_file_fadvise(
 990         struct file     *file,
 991         loff_t          start,
 992         loff_t          end,
 993         int             advice)
 994 {
 995         struct xfs_inode *ip = XFS_I(file_inode(file));
 996         int ret;
 997         int lockflags = 0;
 998
 999         /*
1000          * Operations creating pages in page cache need protection from hole
1001          * punching and similar ops
1002          */
1003         if (advice == POSIX_FADV_WILLNEED) {
1004                 lockflags = XFS_IOLOCK_SHARED;
1005                 xfs_ilock(ip, lockflags);
1006         }
1007         ret = generic_fadvise(file, start, end, advice);
1008         if (lockflags)
1009                 xfs_iunlock(ip, lockflags);
1010         return ret;
1011 }
1012
1013 STATIC loff_t
1014 xfs_file_remap_range(
1015         struct file             *file_in,
1016         loff_t                  pos_in,
1017         struct file             *file_out,
1018         loff_t                  pos_out,
1019         loff_t                  len,
1020         unsigned int            remap_flags)
1021 {
1022         struct inode            *inode_in = file_inode(file_in);
1023         struct xfs_inode        *src = XFS_I(inode_in);
1024         struct inode            *inode_out = file_inode(file_out);
1025         struct xfs_inode        *dest = XFS_I(inode_out);
1026         struct xfs_mount        *mp = src->i_mount;
1027         loff_t                  remapped = 0;
1028         xfs_extlen_t            cowextsize;
1029         int                     ret;
1030
1031         if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1032                 return -EINVAL;
1033
1034         if (!xfs_sb_version_hasreflink(&mp->m_sb))
1035                 return -EOPNOTSUPP;
1036
1037         if (XFS_FORCED_SHUTDOWN(mp))
1038                 return -EIO;
1039
1040         /* Prepare and then clone file data. */
1041         ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1042                         &len, remap_flags);
1043         if (ret < 0 || len == 0)
1044                 return ret;
1045
1046         trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1047
1048         ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1049                         &remapped);
1050         if (ret)
1051                 goto out_unlock;
1052
1053         /*
1054          * Carry the cowextsize hint from src to dest if we're sharing the
1055          * entire source file to the entire destination file, the source file
1056          * has a cowextsize hint, and the destination file does not.
1057          */
1058         cowextsize = 0;
1059         if (pos_in == 0 && len == i_size_read(inode_in) &&
1060             (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1061             pos_out == 0 && len >= i_size_read(inode_out) &&
1062             !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1063                 cowextsize = src->i_d.di_cowextsize;
1064
1065         ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1066                         remap_flags);
1067
1068 out_unlock:
1069         xfs_reflink_remap_unlock(file_in, file_out);
1070         if (ret)
1071                 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1072         return remapped > 0 ? remapped : ret;
1073 }
1074
1075 STATIC int
1076 xfs_file_open(
1077         struct inode    *inode,
1078         struct file     *file)
1079 {
1080         if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1081                 return -EFBIG;
1082         if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
1083                 return -EIO;
1084         file->f_mode |= FMODE_NOWAIT;
1085         return 0;
1086 }
1087
1088 STATIC int
1089 xfs_dir_open(
1090         struct inode    *inode,
1091         struct file     *file)
1092 {
1093         struct xfs_inode *ip = XFS_I(inode);
1094         int             mode;
1095         int             error;
1096
1097         error = xfs_file_open(inode, file);
1098         if (error)
1099                 return error;
1100
1101         /*
1102          * If there are any blocks, read-ahead block 0 as we're almost
1103          * certain to have the next operation be a read there.
1104          */
1105         mode = xfs_ilock_data_map_shared(ip);
1106         if (ip->i_d.di_nextents > 0)
1107                 error = xfs_dir3_data_readahead(ip, 0, 0);
1108         xfs_iunlock(ip, mode);
1109         return error;
1110 }
1111
1112 STATIC int
1113 xfs_file_release(
1114         struct inode    *inode,
1115         struct file     *filp)
1116 {
1117         return xfs_release(XFS_I(inode));
1118 }
1119
1120 STATIC int
1121 xfs_file_readdir(
1122         struct file     *file,
1123         struct dir_context *ctx)
1124 {
1125         struct inode    *inode = file_inode(file);
1126         xfs_inode_t     *ip = XFS_I(inode);
1127         size_t          bufsize;
1128
1129         /*
1130          * The Linux API doesn't pass down the total size of the buffer
1131          * we read into down to the filesystem.  With the filldir concept
1132          * it's not needed for correct information, but the XFS dir2 leaf
1133          * code wants an estimate of the buffer size to calculate it's
1134          * readahead window and size the buffers used for mapping to
1135          * physical blocks.
1136          *
1137          * Try to give it an estimate that's good enough, maybe at some
1138          * point we can change the ->readdir prototype to include the
1139          * buffer size.  For now we use the current glibc buffer size.
1140          */
1141         bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
1142
1143         return xfs_readdir(NULL, ip, ctx, bufsize);
1144 }
1145
1146 STATIC loff_t
1147 xfs_file_llseek(
1148         struct file     *file,
1149         loff_t          offset,
1150         int             whence)
1151 {
1152         struct inode            *inode = file->f_mapping->host;
1153
1154         if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
1155                 return -EIO;
1156
1157         switch (whence) {
1158         default:
1159                 return generic_file_llseek(file, offset, whence);
1160         case SEEK_HOLE:
1161                 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1162                 break;
1163         case SEEK_DATA:
1164                 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1165                 break;
1166         }
1167
1168         if (offset < 0)
1169                 return offset;
1170         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1171 }
1172
1173 /*
1174  * Locking for serialisation of IO during page faults. This results in a lock
1175  * ordering of:
1176  *
1177  * mmap_sem (MM)
1178  *   sb_start_pagefault(vfs, freeze)
1179  *     i_mmaplock (XFS - truncate serialisation)
1180  *       page_lock (MM)
1181  *         i_lock (XFS - extent map serialisation)
1182  */
1183 static vm_fault_t
1184 __xfs_filemap_fault(
1185         struct vm_fault         *vmf,
1186         enum page_entry_size    pe_size,
1187         bool                    write_fault)
1188 {
1189         struct inode            *inode = file_inode(vmf->vma->vm_file);
1190         struct xfs_inode        *ip = XFS_I(inode);
1191         vm_fault_t              ret;
1192
1193         trace_xfs_filemap_fault(ip, pe_size, write_fault);
1194
1195         if (write_fault) {
1196                 sb_start_pagefault(inode->i_sb);
1197                 file_update_time(vmf->vma->vm_file);
1198         }
1199
1200         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1201         if (IS_DAX(inode)) {
1202                 pfn_t pfn;
1203
1204                 ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
1205                                 (write_fault && !vmf->cow_page) ?
1206                                  &xfs_direct_write_iomap_ops :
1207                                  &xfs_read_iomap_ops);
1208                 if (ret & VM_FAULT_NEEDDSYNC)
1209                         ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1210         } else {
1211                 if (write_fault)
1212                         ret = iomap_page_mkwrite(vmf,
1213                                         &xfs_buffered_write_iomap_ops);
1214                 else
1215                         ret = filemap_fault(vmf);
1216         }
1217         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1218
1219         if (write_fault)
1220                 sb_end_pagefault(inode->i_sb);
1221         return ret;
1222 }
1223
1224 static vm_fault_t
1225 xfs_filemap_fault(
1226         struct vm_fault         *vmf)
1227 {
1228         /* DAX can shortcut the normal fault path on write faults! */
1229         return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1230                         IS_DAX(file_inode(vmf->vma->vm_file)) &&
1231                         (vmf->flags & FAULT_FLAG_WRITE));
1232 }
1233
1234 static vm_fault_t
1235 xfs_filemap_huge_fault(
1236         struct vm_fault         *vmf,
1237         enum page_entry_size    pe_size)
1238 {
1239         if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1240                 return VM_FAULT_FALLBACK;
1241
1242         /* DAX can shortcut the normal fault path on write faults! */
1243         return __xfs_filemap_fault(vmf, pe_size,
1244                         (vmf->flags & FAULT_FLAG_WRITE));
1245 }
1246
1247 static vm_fault_t
1248 xfs_filemap_page_mkwrite(
1249         struct vm_fault         *vmf)
1250 {
1251         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1252 }
1253
1254 /*
1255  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1256  * on write faults. In reality, it needs to serialise against truncate and
1257  * prepare memory for writing so handle is as standard write fault.
1258  */
1259 static vm_fault_t
1260 xfs_filemap_pfn_mkwrite(
1261         struct vm_fault         *vmf)
1262 {
1263
1264         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1265 }
1266
1267 static const struct vm_operations_struct xfs_file_vm_ops = {
1268         .fault          = xfs_filemap_fault,
1269         .huge_fault     = xfs_filemap_huge_fault,
1270         .map_pages      = filemap_map_pages,
1271         .page_mkwrite   = xfs_filemap_page_mkwrite,
1272         .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
1273 };
1274
1275 STATIC int
1276 xfs_file_mmap(
1277         struct file             *file,
1278         struct vm_area_struct   *vma)
1279 {
1280         struct inode            *inode = file_inode(file);
1281         struct xfs_buftarg      *target = xfs_inode_buftarg(XFS_I(inode));
1282
1283         /*
1284          * We don't support synchronous mappings for non-DAX files and
1285          * for DAX files if underneath dax_device is not synchronous.
1286          */
1287         if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1288                 return -EOPNOTSUPP;
1289
1290         file_accessed(file);
1291         vma->vm_ops = &xfs_file_vm_ops;
1292         if (IS_DAX(inode))
1293                 vma->vm_flags |= VM_HUGEPAGE;
1294         return 0;
1295 }
1296
1297 const struct file_operations xfs_file_operations = {
1298         .llseek         = xfs_file_llseek,
1299         .read_iter      = xfs_file_read_iter,
1300         .write_iter     = xfs_file_write_iter,
1301         .splice_read    = generic_file_splice_read,
1302         .splice_write   = iter_file_splice_write,
1303         .iopoll         = iomap_dio_iopoll,
1304         .unlocked_ioctl = xfs_file_ioctl,
1305 #ifdef CONFIG_COMPAT
1306         .compat_ioctl   = xfs_file_compat_ioctl,
1307 #endif
1308         .mmap           = xfs_file_mmap,
1309         .mmap_supported_flags = MAP_SYNC,
1310         .open           = xfs_file_open,
1311         .release        = xfs_file_release,
1312         .fsync          = xfs_file_fsync,
1313         .get_unmapped_area = thp_get_unmapped_area,
1314         .fallocate      = xfs_file_fallocate,
1315         .fadvise        = xfs_file_fadvise,
1316         .remap_file_range = xfs_file_remap_range,
1317 };
1318
1319 const struct file_operations xfs_dir_file_operations = {
1320         .open           = xfs_dir_open,
1321         .read           = generic_read_dir,
1322         .iterate_shared = xfs_file_readdir,
1323         .llseek         = generic_file_llseek,
1324         .unlocked_ioctl = xfs_file_ioctl,
1325 #ifdef CONFIG_COMPAT
1326         .compat_ioctl   = xfs_file_compat_ioctl,
1327 #endif
1328         .fsync          = xfs_dir_fsync,
1329 };