fs/xfs/linux/xfs_lrw.c

   1 /*
   2  * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms of version 2 of the GNU General Public License as
   6  * published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it would be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11  *
  12  * Further, this software is distributed without any warranty that it is
  13  * free of the rightful claim of any third person regarding infringement
  14  * or the like.  Any license provided herein, whether implied or
  15  * otherwise, applies only to this software file.  Patent licenses, if
  16  * any, provided herein do not apply to combinations of this program with
  17  * other software, or any other product whatsoever.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write the Free Software Foundation, Inc., 59
  21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  22  *
  23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  24  * Mountain View, CA  94043, or:
  25  *
  26  * http://www.sgi.com
  27  *
  28  * For further information regarding this notice, see:
  29  *
  30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  31  */
  32 /*
  33  *  fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
  34  *
  35  */
  36
  37 #include <xfs.h>
  38 #include <linux/pagemap.h>
  39 #include <linux/capability.h>
  40
  41
  42 /*
  43  *      xfs_iozero
  44  *
  45  *      xfs_iozero clears the specified range of buffer supplied,
  46  *      and marks all the affected blocks as valid and modified.  If
  47  *      an affected block is not allocated, it will be allocated.  If
  48  *      an affected block is not completely overwritten, and is not
  49  *      valid before the operation, it will be read from disk before
  50  *      being partially zeroed.
  51  */
  52 STATIC int
  53 xfs_iozero(
  54         struct inode            *ip,    /* inode                        */
  55         loff_t                  pos,    /* offset in file               */
  56         size_t                  count,  /* size of data to zero         */
  57         loff_t                  end_size)       /* max file size to set */
  58 {
  59         unsigned                bytes;
  60         struct page             *page;
  61         struct address_space    *mapping;
  62         char                    *kaddr;
  63         int                     status;
  64
  65         mapping = ip->i_mapping;
  66         do {
  67                 unsigned long index, offset;
  68
  69                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
  70                 index = pos >> PAGE_CACHE_SHIFT;
  71                 bytes = PAGE_CACHE_SIZE - offset;
  72                 if (bytes > count)
  73                         bytes = count;
  74
  75                 status = -ENOMEM;
  76                 page = grab_cache_page(mapping, index);
  77                 if (!page)
  78                         break;
  79
  80                 kaddr = kmap(page);
  81                 status = mapping->a_ops->prepare_write(NULL, page, offset,
  82                                                         offset + bytes);
  83                 if (status) {
  84                         goto unlock;
  85                 }
  86
  87                 memset((void *) (kaddr + offset), 0, bytes);
  88                 flush_dcache_page(page);
  89                 status = mapping->a_ops->commit_write(NULL, page, offset,
  90                                                         offset + bytes);
  91                 if (!status) {
  92                         pos += bytes;
  93                         count -= bytes;
  94                         if (pos > ip->i_size)
  95                                 ip->i_size = pos < end_size ? pos : end_size;
  96                 }
  97
  98 unlock:
  99                 kunmap(page);
 100                 unlock_page(page);
 101                 page_cache_release(page);
 102                 if (status)
 103                         break;
 104         } while (count);
 105
 106         return (-status);
 107 }
 108
 109 ssize_t                 /* bytes read, or (-)  error */
 110 xfs_read(
 111         bhv_desc_t              *bdp,
 112         struct file             *filp,
 113         const struct iovec      *iovp,
 114         unsigned long           segs,
 115         loff_t                  *offp,
 116         cred_t                  *credp)
 117 {
 118         size_t                  size = 0;
 119         ssize_t                 ret;
 120         xfs_fsize_t             n;
 121         xfs_inode_t             *ip;
 122         xfs_mount_t             *mp;
 123         vnode_t                 *vp;
 124         unsigned long           seg;
 125         int                     direct = filp->f_flags & O_DIRECT;
 126
 127         ip = XFS_BHVTOI(bdp);
 128         vp = BHV_TO_VNODE(bdp);
 129         mp = ip->i_mount;
 130         vn_trace_entry(vp, "xfs_read", (inst_t *)__return_address);
 131
 132         XFS_STATS_INC(xfsstats.xs_read_calls);
 133
 134         /* START copy & waste from filemap.c */
 135         for (seg = 0; seg < segs; seg++) {
 136                 const struct iovec *iv = &iovp[seg];
 137
 138                 /*
 139                  * If any segment has a negative length, or the cumulative
 140                  * length ever wraps negative then return -EINVAL.
 141                  */
 142                 size += iv->iov_len;
 143                 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 144                         return XFS_ERROR(-EINVAL);
 145                 if (direct) {   /* XFS specific check */
 146                         if ((__psint_t)iv->iov_base & BBMASK) {
 147                                 if (*offp == ip->i_d.di_size)
 148                                         return 0;
 149                                 return XFS_ERROR(-EINVAL);
 150                         }
 151                 }
 152                 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
 153                         continue;
 154                 if (seg == 0)
 155                         return XFS_ERROR(-EFAULT);
 156                 segs = seg;
 157                 break;
 158         }
 159         /* END copy & waste from filemap.c */
 160
 161         if (direct) {
 162                 if ((*offp & mp->m_blockmask) ||
 163                     (size & mp->m_blockmask)) {
 164                         if (*offp == ip->i_d.di_size) {
 165                                 return (0);
 166                         }
 167                         return -XFS_ERROR(EINVAL);
 168                 }
 169         }
 170
 171         n = XFS_MAX_FILE_OFFSET - *offp;
 172         if ((n <= 0) || (size == 0))
 173                 return 0;
 174
 175         if (n < size)
 176                 size = n;
 177
 178         if (XFS_FORCED_SHUTDOWN(mp)) {
 179                 return -EIO;
 180         }
 181
 182         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 183
 184         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
 185             !(filp->f_mode & FINVIS)) {
 186                 int error;
 187                 vrwlock_t locktype = VRWLOCK_READ;
 188
 189                 error = xfs_dm_send_data_event(DM_EVENT_READ, bdp, *offp,
 190                                 size, FILP_DELAY_FLAG(filp), &locktype);
 191                 if (error) {
 192                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 193                         return -error;
 194                 }
 195         }
 196
 197         ret = generic_file_readv(filp, iovp, segs, offp);
 198         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 199
 200         XFS_STATS_ADD(xfsstats.xs_read_bytes, ret);
 201
 202         if (!(filp->f_mode & FINVIS))
 203                 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 204
 205         return ret;
 206 }
 207
 208 ssize_t
 209 xfs_sendfile(
 210         bhv_desc_t              *bdp,
 211         struct file             *filp,
 212         loff_t                  *offp,
 213         size_t                  count,
 214         read_actor_t            actor,
 215         void                    *target,
 216         cred_t                  *credp)
 217 {
 218         ssize_t                 ret;
 219         xfs_fsize_t             n;
 220         xfs_inode_t             *ip;
 221         vnode_t                 *vp;
 222         int                     invisible = (filp->f_mode & FINVIS);
 223
 224         ip = XFS_BHVTOI(bdp);
 225         vp = BHV_TO_VNODE(bdp);
 226         vn_trace_entry(vp, "xfs_sendfile", (inst_t *)__return_address);
 227
 228         XFS_STATS_INC(xfsstats.xs_read_calls);
 229
 230         n = XFS_MAX_FILE_OFFSET - *offp;
 231         if ((n <= 0) || (count == 0))
 232                 return 0;
 233
 234         if (n < count)
 235                 count = n;
 236
 237         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 238                 return -EIO;
 239
 240         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 241         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && !invisible) {
 242                 vrwlock_t locktype = VRWLOCK_READ;
 243                 int error;
 244
 245                 error = xfs_dm_send_data_event(DM_EVENT_READ, bdp, *offp,
 246                                 count, FILP_DELAY_FLAG(filp), &locktype);
 247                 if (error) {
 248                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 249                         return -error;
 250                 }
 251         }
 252         ret = generic_file_sendfile(filp, offp, count, actor, target);
 253         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 254
 255         XFS_STATS_ADD(xfsstats.xs_read_bytes, ret);
 256         if (!invisible)
 257                 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 258         return ret;
 259 }
 260
 261 /*
 262  * This routine is called to handle zeroing any space in the last
 263  * block of the file that is beyond the EOF.  We do this since the
 264  * size is being increased without writing anything to that block
 265  * and we don't want anyone to read the garbage on the disk.
 266  */
 267 STATIC int                              /* error (positive) */
 268 xfs_zero_last_block(
 269         struct inode    *ip,
 270         xfs_iocore_t    *io,
 271         xfs_off_t       offset,
 272         xfs_fsize_t     isize,
 273         xfs_fsize_t     end_size)
 274 {
 275         xfs_fileoff_t   last_fsb;
 276         xfs_mount_t     *mp;
 277         int             nimaps;
 278         int             zero_offset;
 279         int             zero_len;
 280         int             isize_fsb_offset;
 281         int             error = 0;
 282         xfs_bmbt_irec_t imap;
 283         loff_t          loff;
 284         size_t          lsize;
 285
 286         ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
 287         ASSERT(offset > isize);
 288
 289         mp = io->io_mount;
 290
 291         isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
 292         if (isize_fsb_offset == 0) {
 293                 /*
 294                  * There are no extra bytes in the last block on disk to
 295                  * zero, so return.
 296                  */
 297                 return 0;
 298         }
 299
 300         last_fsb = XFS_B_TO_FSBT(mp, isize);
 301         nimaps = 1;
 302         error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
 303                           &nimaps, NULL);
 304         if (error) {
 305                 return error;
 306         }
 307         ASSERT(nimaps > 0);
 308         /*
 309          * If the block underlying isize is just a hole, then there
 310          * is nothing to zero.
 311          */
 312         if (imap.br_startblock == HOLESTARTBLOCK) {
 313                 return 0;
 314         }
 315         /*
 316          * Zero the part of the last block beyond the EOF, and write it
 317          * out sync.  We need to drop the ilock while we do this so we
 318          * don't deadlock when the buffer cache calls back to us.
 319          */
 320         XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
 321         loff = XFS_FSB_TO_B(mp, last_fsb);
 322         lsize = XFS_FSB_TO_B(mp, 1);
 323
 324         zero_offset = isize_fsb_offset;
 325         zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
 326
 327         error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
 328
 329         XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 330         ASSERT(error >= 0);
 331         return error;
 332 }
 333
 334 /*
 335  * Zero any on disk space between the current EOF and the new,
 336  * larger EOF.  This handles the normal case of zeroing the remainder
 337  * of the last block in the file and the unusual case of zeroing blocks
 338  * out beyond the size of the file.  This second case only happens
 339  * with fixed size extents and when the system crashes before the inode
 340  * size was updated but after blocks were allocated.  If fill is set,
 341  * then any holes in the range are filled and zeroed.  If not, the holes
 342  * are left alone as holes.
 343  */
 344
 345 int                                     /* error (positive) */
 346 xfs_zero_eof(
 347         vnode_t         *vp,
 348         xfs_iocore_t    *io,
 349         xfs_off_t       offset,         /* starting I/O offset */
 350         xfs_fsize_t     isize,          /* current inode size */
 351         xfs_fsize_t     end_size)       /* terminal inode size */
 352 {
 353         struct inode    *ip = LINVFS_GET_IP(vp);
 354         xfs_fileoff_t   start_zero_fsb;
 355         xfs_fileoff_t   end_zero_fsb;
 356         xfs_fileoff_t   prev_zero_fsb;
 357         xfs_fileoff_t   zero_count_fsb;
 358         xfs_fileoff_t   last_fsb;
 359         xfs_extlen_t    buf_len_fsb;
 360         xfs_extlen_t    prev_zero_count;
 361         xfs_mount_t     *mp;
 362         int             nimaps;
 363         int             error = 0;
 364         xfs_bmbt_irec_t imap;
 365         loff_t          loff;
 366         size_t          lsize;
 367
 368         ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 369         ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 370
 371         mp = io->io_mount;
 372
 373         /*
 374          * First handle zeroing the block on which isize resides.
 375          * We only zero a part of that block so it is handled specially.
 376          */
 377         error = xfs_zero_last_block(ip, io, offset, isize, end_size);
 378         if (error) {
 379                 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 380                 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 381                 return error;
 382         }
 383
 384         /*
 385          * Calculate the range between the new size and the old
 386          * where blocks needing to be zeroed may exist.  To get the
 387          * block where the last byte in the file currently resides,
 388          * we need to subtract one from the size and truncate back
 389          * to a block boundary.  We subtract 1 in case the size is
 390          * exactly on a block boundary.
 391          */
 392         last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
 393         start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
 394         end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
 395         ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
 396         if (last_fsb == end_zero_fsb) {
 397                 /*
 398                  * The size was only incremented on its last block.
 399                  * We took care of that above, so just return.
 400                  */
 401                 return 0;
 402         }
 403
 404         ASSERT(start_zero_fsb <= end_zero_fsb);
 405         prev_zero_fsb = NULLFILEOFF;
 406         prev_zero_count = 0;
 407         while (start_zero_fsb <= end_zero_fsb) {
 408                 nimaps = 1;
 409                 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 410                 error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
 411                                   0, NULL, 0, &imap, &nimaps, NULL);
 412                 if (error) {
 413                         ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 414                         ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 415                         return error;
 416                 }
 417                 ASSERT(nimaps > 0);
 418
 419                 if (imap.br_startblock == HOLESTARTBLOCK) {
 420                         /*
 421                          * This loop handles initializing pages that were
 422                          * partially initialized by the code below this
 423                          * loop. It basically zeroes the part of the page
 424                          * that sits on a hole and sets the page as P_HOLE
 425                          * and calls remapf if it is a mapped file.
 426                          */
 427                         prev_zero_fsb = NULLFILEOFF;
 428                         prev_zero_count = 0;
 429                         start_zero_fsb = imap.br_startoff +
 430                                          imap.br_blockcount;
 431                         ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 432                         continue;
 433                 }
 434
 435                 /*
 436                  * There are blocks in the range requested.
 437                  * Zero them a single write at a time.  We actually
 438                  * don't zero the entire range returned if it is
 439                  * too big and simply loop around to get the rest.
 440                  * That is not the most efficient thing to do, but it
 441                  * is simple and this path should not be exercised often.
 442                  */
 443                 buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
 444                                               mp->m_writeio_blocks << 8);
 445                 /*
 446                  * Drop the inode lock while we're doing the I/O.
 447                  * We'll still have the iolock to protect us.
 448                  */
 449                 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 450
 451                 loff = XFS_FSB_TO_B(mp, start_zero_fsb);
 452                 lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
 453
 454                 error = xfs_iozero(ip, loff, lsize, end_size);
 455
 456                 if (error) {
 457                         goto out_lock;
 458                 }
 459
 460                 prev_zero_fsb = start_zero_fsb;
 461                 prev_zero_count = buf_len_fsb;
 462                 start_zero_fsb = imap.br_startoff + buf_len_fsb;
 463                 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 464
 465                 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 466         }
 467
 468         return 0;
 469
 470 out_lock:
 471
 472         XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 473         ASSERT(error >= 0);
 474         return error;
 475 }
 476
 477 ssize_t                         /* bytes written, or (-) error */
 478 xfs_write(
 479         bhv_desc_t              *bdp,
 480         struct file             *file,
 481         const struct iovec      *iovp,
 482         unsigned long           segs,
 483         loff_t                  *offset,
 484         cred_t                  *credp)
 485 {
 486         size_t                  size = 0;
 487         xfs_inode_t             *xip;
 488         xfs_mount_t             *mp;
 489         ssize_t                 ret;
 490         int                     error = 0;
 491         xfs_fsize_t             isize, new_size;
 492         xfs_fsize_t             n, limit = XFS_MAX_FILE_OFFSET;
 493         xfs_iocore_t            *io;
 494         vnode_t                 *vp;
 495         unsigned long           seg;
 496         int                     iolock;
 497         int                     direct = file->f_flags & O_DIRECT;
 498         int                     eventsent = 0;
 499         vrwlock_t               locktype;
 500
 501         XFS_STATS_INC(xfsstats.xs_write_calls);
 502
 503         vp = BHV_TO_VNODE(bdp);
 504         vn_trace_entry(vp, "xfs_write", (inst_t *)__return_address);
 505         xip = XFS_BHVTOI(bdp);
 506
 507         /* START copy & waste from filemap.c */
 508         for (seg = 0; seg < segs; seg++) {
 509                 const struct iovec *iv = &iovp[seg];
 510
 511                 /*
 512                  * If any segment has a negative length, or the cumulative
 513                  * length ever wraps negative then return -EINVAL.
 514                  */
 515                 size += iv->iov_len;
 516                 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 517                         return XFS_ERROR(-EINVAL);
 518                 if (direct) {   /* XFS specific check */
 519                         if ((__psint_t)iv->iov_base & BBMASK)
 520                                 return XFS_ERROR(-EINVAL);
 521                 }
 522                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
 523                         continue;
 524                 if (seg == 0)
 525                         return XFS_ERROR(-EFAULT);
 526                 segs = seg;
 527                 break;
 528         }
 529         /* END copy & waste from filemap.c */
 530
 531         if (size == 0)
 532                 return 0;
 533
 534         io = &(xip->i_iocore);
 535         mp = io->io_mount;
 536
 537         xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);
 538
 539         if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
 540                 return -EIO;
 541         }
 542
 543         if (direct) {
 544                 if ((*offset & mp->m_blockmask) ||
 545                     (size & mp->m_blockmask)) {
 546                         return XFS_ERROR(-EINVAL);
 547                 }
 548                 iolock = XFS_IOLOCK_SHARED;
 549                 locktype = VRWLOCK_WRITE_DIRECT;
 550         } else {
 551                 iolock = XFS_IOLOCK_EXCL;
 552                 locktype = VRWLOCK_WRITE;
 553         }
 554
 555         xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
 556         isize = xip->i_d.di_size;
 557
 558         if (file->f_flags & O_APPEND)
 559                 *offset = isize;
 560
 561 start:
 562         n = limit - *offset;
 563         if (n <= 0) {
 564                 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 565                 return -EFBIG;
 566         }
 567
 568         if (n < size)
 569                 size = n;
 570
 571         new_size = *offset + size;
 572         if (new_size > isize) {
 573                 io->io_new_size = new_size;
 574         }
 575
 576         if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
 577             !(file->f_mode & FINVIS) && !eventsent)) {
 578                 loff_t          savedsize = *offset;
 579
 580                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 581                 error = xfs_dm_send_data_event(DM_EVENT_WRITE, bdp,
 582                                 *offset, size,
 583                                 FILP_DELAY_FLAG(file), &locktype);
 584                 if (error) {
 585                         xfs_iunlock(xip, iolock);
 586                         return -error;
 587                 }
 588                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 589                 eventsent = 1;
 590
 591                 /*
 592                  * The iolock was dropped and reaquired in
 593                  * xfs_dm_send_data_event so we have to recheck the size
 594                  *  when appending.  We will only "goto start;" once,
 595                  *  since having sent the event prevents another call
 596                  *  to xfs_dm_send_data_event, which is what
 597                  *  allows the size to change in the first place.
 598                  */
 599                 if ((file->f_flags & O_APPEND) &&
 600                     savedsize != xip->i_d.di_size) {
 601                         *offset = isize = xip->i_d.di_size;
 602                         goto start;
 603                 }
 604         }
 605
 606         /*
 607          * On Linux, generic_file_write updates the times even if
 608          * no data is copied in so long as the write had a size.
 609          *
 610          * We must update xfs' times since revalidate will overcopy xfs.
 611          */
 612         if (size) {
 613                 if (!(file->f_mode & FINVIS))
 614                         xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 615         }
 616
 617         /*
 618          * If the offset is beyond the size of the file, we have a couple
 619          * of things to do. First, if there is already space allocated
 620          * we need to either create holes or zero the disk or ...
 621          *
 622          * If there is a page where the previous size lands, we need
 623          * to zero it out up to the new size.
 624          */
 625
 626         if (!direct && (*offset > isize && isize)) {
 627                 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
 628                         isize, *offset + size);
 629                 if (error) {
 630                         xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 631                         return(-error);
 632                 }
 633         }
 634         xfs_iunlock(xip, XFS_ILOCK_EXCL);
 635
 636         /*
 637          * If we're writing the file then make sure to clear the
 638          * setuid and setgid bits if the process is not being run
 639          * by root.  This keeps people from modifying setuid and
 640          * setgid binaries.
 641          */
 642
 643         if (((xip->i_d.di_mode & ISUID) ||
 644             ((xip->i_d.di_mode & (ISGID | (IEXEC >> 3))) ==
 645                 (ISGID | (IEXEC >> 3)))) &&
 646              !capable(CAP_FSETID)) {
 647                 error = xfs_write_clear_setuid(xip);
 648                 if (error) {
 649                         xfs_iunlock(xip, iolock);
 650                         return -error;
 651                 }
 652         }
 653
 654 retry:
 655         if (direct) {
 656                 xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1);
 657         }
 658
 659         ret = generic_file_write_nolock(file, iovp, segs, offset);
 660
 661         if ((ret == -ENOSPC) &&
 662             DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
 663             !(file->f_mode & FINVIS)) {
 664
 665                 xfs_rwunlock(bdp, locktype);
 666                 error = dm_send_namesp_event(DM_EVENT_NOSPACE, bdp,
 667                                 DM_RIGHT_NULL, bdp, DM_RIGHT_NULL, NULL, NULL,
 668                                 0, 0, 0); /* Delay flag intentionally  unused */
 669                 if (error)
 670                         return -error;
 671                 xfs_rwlock(bdp, locktype);
 672                 *offset = xip->i_d.di_size;
 673                 goto retry;
 674
 675         }
 676
 677         if (ret <= 0) {
 678                 xfs_rwunlock(bdp, locktype);
 679                 return ret;
 680         }
 681
 682         XFS_STATS_ADD(xfsstats.xs_write_bytes, ret);
 683
 684         if (*offset > xip->i_d.di_size) {
 685                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 686                 if (*offset > xip->i_d.di_size) {
 687                         struct inode    *inode = LINVFS_GET_IP(vp);
 688
 689                         inode->i_size = xip->i_d.di_size = *offset;
 690                         xip->i_update_core = 1;
 691                         xip->i_update_size = 1;
 692                 }
 693                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 694         }
 695
 696         /* Handle various SYNC-type writes */
 697         if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {
 698
 699                 /*
 700                  * If we're treating this as O_DSYNC and we have not updated the
 701                  * size, force the log.
 702                  */
 703
 704                 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
 705                         && !(xip->i_update_size)) {
 706                         /*
 707                          * If an allocation transaction occurred
 708                          * without extending the size, then we have to force
 709                          * the log up the proper point to ensure that the
 710                          * allocation is permanent.  We can't count on
 711                          * the fact that buffered writes lock out direct I/O
 712                          * writes - the direct I/O write could have extended
 713                          * the size nontransactionally, then finished before
 714                          * we started.  xfs_write_file will think that the file
 715                          * didn't grow but the update isn't safe unless the
 716                          * size change is logged.
 717                          *
 718                          * Force the log if we've committed a transaction
 719                          * against the inode or if someone else has and
 720                          * the commit record hasn't gone to disk (e.g.
 721                          * the inode is pinned).  This guarantees that
 722                          * all changes affecting the inode are permanent
 723                          * when we return.
 724                          */
 725
 726                         xfs_inode_log_item_t *iip;
 727                         xfs_lsn_t lsn;
 728
 729                         iip = xip->i_itemp;
 730                         if (iip && iip->ili_last_lsn) {
 731                                 lsn = iip->ili_last_lsn;
 732                                 xfs_log_force(mp, lsn,
 733                                                 XFS_LOG_FORCE | XFS_LOG_SYNC);
 734                         } else if (xfs_ipincount(xip) > 0) {
 735                                 xfs_log_force(mp, (xfs_lsn_t)0,
 736                                                 XFS_LOG_FORCE | XFS_LOG_SYNC);
 737                         }
 738
 739                 } else {
 740                         xfs_trans_t     *tp;
 741
 742                         /*
 743                          * O_SYNC or O_DSYNC _with_ a size update are handled
 744                          * the same way.
 745                          *
 746                          * If the write was synchronous then we need to make
 747                          * sure that the inode modification time is permanent.
 748                          * We'll have updated the timestamp above, so here
 749                          * we use a synchronous transaction to log the inode.
 750                          * It's not fast, but it's necessary.
 751                          *
 752                          * If this a dsync write and the size got changed
 753                          * non-transactionally, then we need to ensure that
 754                          * the size change gets logged in a synchronous
 755                          * transaction.
 756                          */
 757
 758                         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
 759                         if ((error = xfs_trans_reserve(tp, 0,
 760                                                       XFS_SWRITE_LOG_RES(mp),
 761                                                       0, 0, 0))) {
 762                                 /* Transaction reserve failed */
 763                                 xfs_trans_cancel(tp, 0);
 764                         } else {
 765                                 /* Transaction reserve successful */
 766                                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 767                                 xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
 768                                 xfs_trans_ihold(tp, xip);
 769                                 xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
 770                                 xfs_trans_set_sync(tp);
 771                                 error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
 772                                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 773                         }
 774                 }
 775         } /* (ioflags & O_SYNC) */
 776
 777         xfs_rwunlock(bdp, locktype);
 778         return(ret);
 779 }
 780
 781 /*
 782  * All xfs metadata buffers except log state machine buffers
 783  * get this attached as their b_bdstrat callback function.
 784  * This is so that we can catch a buffer
 785  * after prematurely unpinning it to forcibly shutdown the filesystem.
 786  */
 787 int
 788 xfs_bdstrat_cb(struct xfs_buf *bp)
 789 {
 790         xfs_mount_t     *mp;
 791
 792         mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
 793         if (!XFS_FORCED_SHUTDOWN(mp)) {
 794                 pagebuf_iorequest(bp);
 795                 return 0;
 796         } else {
 797                 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
 798                 /*
 799                  * Metadata write that didn't get logged but
 800                  * written delayed anyway. These aren't associated
 801                  * with a transaction, and can be ignored.
 802                  */
 803                 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
 804                     (XFS_BUF_ISREAD(bp)) == 0)
 805                         return (xfs_bioerror_relse(bp));
 806                 else
 807                         return (xfs_bioerror(bp));
 808         }
 809 }
 810
 811
 812 int
 813 xfs_bmap(bhv_desc_t     *bdp,
 814         xfs_off_t       offset,
 815         ssize_t         count,
 816         int             flags,
 817         page_buf_bmap_t *pbmapp,
 818         int             *npbmaps)
 819 {
 820         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
 821         xfs_iocore_t    *io = &ip->i_iocore;
 822
 823         ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
 824         ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
 825                ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
 826
 827         return xfs_iomap(io, offset, count, flags, pbmapp, npbmaps);
 828 }
 829
 830 /*
 831  * Wrapper around bdstrat so that we can stop data
 832  * from going to disk in case we are shutting down the filesystem.
 833  * Typically user data goes thru this path; one of the exceptions
 834  * is the superblock.
 835  */
 836 int
 837 xfsbdstrat(
 838         struct xfs_mount        *mp,
 839         struct xfs_buf          *bp)
 840 {
 841         ASSERT(mp);
 842         if (!XFS_FORCED_SHUTDOWN(mp)) {
 843                 /* Grio redirection would go here
 844                  * if (XFS_BUF_IS_GRIO(bp)) {
 845                  */
 846
 847                 pagebuf_iorequest(bp);
 848                 return 0;
 849         }
 850
 851         xfs_buftrace("XFSBDSTRAT IOERROR", bp);
 852         return (xfs_bioerror_relse(bp));
 853 }
 854
 855
 856 void
 857 XFS_bflush(xfs_buftarg_t *target)
 858 {
 859         pagebuf_delwri_flush(target, PBDF_WAIT, NULL);
 860 }
 861
 862
 863 /* Push all fs state out to disk
 864  */
 865
 866 void
 867 XFS_log_write_unmount_ro(bhv_desc_t     *bdp)
 868 {
 869         xfs_mount_t     *mp;
 870         int pincount = 0;
 871         int count = 0;
 872         int error;
 873
 874         mp = XFS_BHVTOM(bdp);
 875         pagebuf_delwri_flush(mp->m_ddev_targp, PBDF_WAIT, &pincount);
 876         xfs_finish_reclaim_all(mp);
 877
 878         do {
 879                 VFS_SYNC(XFS_MTOVFS(mp), SYNC_ATTR|SYNC_WAIT, NULL, error);
 880                 pagebuf_delwri_flush(mp->m_ddev_targp, PBDF_WAIT, &pincount);
 881                 if (pincount == 0) {delay(50); count++;}
 882         }  while (count < 2);
 883
 884         /* Ok now write out an unmount record */
 885         xfs_log_unmount_write(mp);
 886         xfs_unmountfs_writesb(mp);
 887 }
 888
 889 /*
 890  * If the underlying (log or data) device is readonly, there are some
 891  * operations that cannot proceed.
 892  */
 893 int
 894 xfs_dev_is_read_only(xfs_mount_t *mp, char *message)
 895 {
 896         if (bdev_read_only(mp->m_ddev_targp->pbr_bdev) ||
 897             bdev_read_only(mp->m_logdev_targp->pbr_bdev) ||
 898            (mp->m_rtdev_targp && bdev_read_only(mp->m_rtdev_targp->pbr_bdev))) {
 899                 cmn_err(CE_NOTE,
 900                         "XFS: %s required on read-only device.", message);
 901                 cmn_err(CE_NOTE,
 902                         "XFS: write access unavailable, cannot proceed.");
 903                 return EROFS;
 904         }
 905
 906         return 0;
 907 }
 908