fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_dmapi.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_bmap_btree.h"
  33 #include "xfs_alloc_btree.h"
  34 #include "xfs_ialloc_btree.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_attr_sf.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_inode.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_itable.h"
  41 #include "xfs_btree.h"
  42 #include "xfs_ialloc.h"
  43 #include "xfs_alloc.h"
  44 #include "xfs_bmap.h"
  45 #include "xfs_attr.h"
  46 #include "xfs_rw.h"
  47 #include "xfs_error.h"
  48 #include "xfs_quota.h"
  49 #include "xfs_utils.h"
  50 #include "xfs_rtalloc.h"
  51 #include "xfs_trans_space.h"
  52 #include "xfs_log_priv.h"
  53 #include "xfs_filestream.h"
  54 #include "xfs_vnodeops.h"
  55
  56 int
  57 xfs_open(
  58         xfs_inode_t     *ip)
  59 {
  60         int             mode;
  61
  62         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  63                 return XFS_ERROR(EIO);
  64
  65         /*
  66          * If it's a directory with any blocks, read-ahead block 0
  67          * as we're almost certain to have the next operation be a read there.
  68          */
  69         if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
  70                 mode = xfs_ilock_map_shared(ip);
  71                 if (ip->i_d.di_nextents > 0)
  72                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  73                 xfs_iunlock(ip, mode);
  74         }
  75         return 0;
  76 }
  77
  78 /*
  79  * xfs_getattr
  80  */
  81 int
  82 xfs_getattr(
  83         xfs_inode_t     *ip,
  84         bhv_vattr_t     *vap,
  85         int             flags)
  86 {
  87         bhv_vnode_t     *vp = XFS_ITOV(ip);
  88         xfs_mount_t     *mp = ip->i_mount;
  89
  90         xfs_itrace_entry(ip);
  91
  92         if (XFS_FORCED_SHUTDOWN(mp))
  93                 return XFS_ERROR(EIO);
  94
  95         if (!(flags & ATTR_LAZY))
  96                 xfs_ilock(ip, XFS_ILOCK_SHARED);
  97
  98         vap->va_size = XFS_ISIZE(ip);
  99         if (vap->va_mask == XFS_AT_SIZE)
 100                 goto all_done;
 101
 102         vap->va_nblocks =
 103                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 104         vap->va_nodeid = ip->i_ino;
 105 #if XFS_BIG_INUMS
 106         vap->va_nodeid += mp->m_inoadd;
 107 #endif
 108         vap->va_nlink = ip->i_d.di_nlink;
 109
 110         /*
 111          * Quick exit for non-stat callers
 112          */
 113         if ((vap->va_mask &
 114             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 115               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 116                 goto all_done;
 117
 118         /*
 119          * Copy from in-core inode.
 120          */
 121         vap->va_mode = ip->i_d.di_mode;
 122         vap->va_uid = ip->i_d.di_uid;
 123         vap->va_gid = ip->i_d.di_gid;
 124         vap->va_projid = ip->i_d.di_projid;
 125
 126         /*
 127          * Check vnode type block/char vs. everything else.
 128          */
 129         switch (ip->i_d.di_mode & S_IFMT) {
 130         case S_IFBLK:
 131         case S_IFCHR:
 132                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
 133                 vap->va_blocksize = BLKDEV_IOSIZE;
 134                 break;
 135         default:
 136                 vap->va_rdev = 0;
 137
 138                 if (!(XFS_IS_REALTIME_INODE(ip))) {
 139                         vap->va_blocksize = xfs_preferred_iosize(mp);
 140                 } else {
 141
 142                         /*
 143                          * If the file blocks are being allocated from a
 144                          * realtime partition, then return the inode's
 145                          * realtime extent size or the realtime volume's
 146                          * extent size.
 147                          */
 148                         vap->va_blocksize =
 149                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
 150                 }
 151                 break;
 152         }
 153
 154         vn_atime_to_timespec(vp, &vap->va_atime);
 155         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 156         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 157         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 158         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 159
 160         /*
 161          * Exit for stat callers.  See if any of the rest of the fields
 162          * to be filled in are needed.
 163          */
 164         if ((vap->va_mask &
 165              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 166               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 167                 goto all_done;
 168
 169         /*
 170          * Convert di_flags to xflags.
 171          */
 172         vap->va_xflags = xfs_ip2xflags(ip);
 173
 174         /*
 175          * Exit for inode revalidate.  See if any of the rest of
 176          * the fields to be filled in are needed.
 177          */
 178         if ((vap->va_mask &
 179              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 180               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 181                 goto all_done;
 182
 183         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
 184         vap->va_nextents =
 185                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
 186                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 187                         ip->i_d.di_nextents;
 188         if (ip->i_afp)
 189                 vap->va_anextents =
 190                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 191                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 192                                  ip->i_d.di_anextents;
 193         else
 194                 vap->va_anextents = 0;
 195         vap->va_gen = ip->i_d.di_gen;
 196
 197  all_done:
 198         if (!(flags & ATTR_LAZY))
 199                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 200         return 0;
 201 }
 202
 203
 204 /*
 205  * xfs_setattr
 206  */
 207 int
 208 xfs_setattr(
 209         xfs_inode_t             *ip,
 210         bhv_vattr_t             *vap,
 211         int                     flags,
 212         cred_t                  *credp)
 213 {
 214         bhv_vnode_t             *vp = XFS_ITOV(ip);
 215         xfs_mount_t             *mp = ip->i_mount;
 216         xfs_trans_t             *tp;
 217         int                     mask;
 218         int                     code;
 219         uint                    lock_flags;
 220         uint                    commit_flags=0;
 221         uid_t                   uid=0, iuid=0;
 222         gid_t                   gid=0, igid=0;
 223         int                     timeflags = 0;
 224         xfs_prid_t              projid=0, iprojid=0;
 225         int                     mandlock_before, mandlock_after;
 226         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 227         int                     file_owner;
 228         int                     need_iolock = 1;
 229
 230         xfs_itrace_entry(ip);
 231
 232         if (mp->m_flags & XFS_MOUNT_RDONLY)
 233                 return XFS_ERROR(EROFS);
 234
 235         /*
 236          * Cannot set certain attributes.
 237          */
 238         mask = vap->va_mask;
 239         if (mask & XFS_AT_NOSET) {
 240                 return XFS_ERROR(EINVAL);
 241         }
 242
 243         if (XFS_FORCED_SHUTDOWN(mp))
 244                 return XFS_ERROR(EIO);
 245
 246         /*
 247          * Timestamps do not need to be logged and hence do not
 248          * need to be done within a transaction.
 249          */
 250         if (mask & XFS_AT_UPDTIMES) {
 251                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 252                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 253                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 254                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 255                 xfs_ichgtime(ip, timeflags);
 256                 return 0;
 257         }
 258
 259         olddquot1 = olddquot2 = NULL;
 260         udqp = gdqp = NULL;
 261
 262         /*
 263          * If disk quotas is on, we make sure that the dquots do exist on disk,
 264          * before we start any other transactions. Trying to do this later
 265          * is messy. We don't care to take a readlock to look at the ids
 266          * in inode here, because we can't hold it across the trans_reserve.
 267          * If the IDs do change before we take the ilock, we're covered
 268          * because the i_*dquot fields will get updated anyway.
 269          */
 270         if (XFS_IS_QUOTA_ON(mp) &&
 271             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 272                 uint    qflags = 0;
 273
 274                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 275                         uid = vap->va_uid;
 276                         qflags |= XFS_QMOPT_UQUOTA;
 277                 } else {
 278                         uid = ip->i_d.di_uid;
 279                 }
 280                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 281                         gid = vap->va_gid;
 282                         qflags |= XFS_QMOPT_GQUOTA;
 283                 }  else {
 284                         gid = ip->i_d.di_gid;
 285                 }
 286                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 287                         projid = vap->va_projid;
 288                         qflags |= XFS_QMOPT_PQUOTA;
 289                 }  else {
 290                         projid = ip->i_d.di_projid;
 291                 }
 292                 /*
 293                  * We take a reference when we initialize udqp and gdqp,
 294                  * so it is important that we never blindly double trip on
 295                  * the same variable. See xfs_create() for an example.
 296                  */
 297                 ASSERT(udqp == NULL);
 298                 ASSERT(gdqp == NULL);
 299                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 300                                          &udqp, &gdqp);
 301                 if (code)
 302                         return code;
 303         }
 304
 305         /*
 306          * For the other attributes, we acquire the inode lock and
 307          * first do an error checking pass.
 308          */
 309         tp = NULL;
 310         lock_flags = XFS_ILOCK_EXCL;
 311         if (flags & ATTR_NOLOCK)
 312                 need_iolock = 0;
 313         if (!(mask & XFS_AT_SIZE)) {
 314                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 315                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 316                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 317                         commit_flags = 0;
 318                         if ((code = xfs_trans_reserve(tp, 0,
 319                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 320                                                      0, 0))) {
 321                                 lock_flags = 0;
 322                                 goto error_return;
 323                         }
 324                 }
 325         } else {
 326                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 327                     !(flags & ATTR_DMI)) {
 328                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 329                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
 330                                 vap->va_size, 0, dmflags, NULL);
 331                         if (code) {
 332                                 lock_flags = 0;
 333                                 goto error_return;
 334                         }
 335                 }
 336                 if (need_iolock)
 337                         lock_flags |= XFS_IOLOCK_EXCL;
 338         }
 339
 340         xfs_ilock(ip, lock_flags);
 341
 342         /* boolean: are we the file owner? */
 343         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 344
 345         /*
 346          * Change various properties of a file.
 347          * Only the owner or users with CAP_FOWNER
 348          * capability may do these things.
 349          */
 350         if (mask &
 351             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 352              XFS_AT_GID|XFS_AT_PROJID)) {
 353                 /*
 354                  * CAP_FOWNER overrides the following restrictions:
 355                  *
 356                  * The user ID of the calling process must be equal
 357                  * to the file owner ID, except in cases where the
 358                  * CAP_FSETID capability is applicable.
 359                  */
 360                 if (!file_owner && !capable(CAP_FOWNER)) {
 361                         code = XFS_ERROR(EPERM);
 362                         goto error_return;
 363                 }
 364
 365                 /*
 366                  * CAP_FSETID overrides the following restrictions:
 367                  *
 368                  * The effective user ID of the calling process shall match
 369                  * the file owner when setting the set-user-ID and
 370                  * set-group-ID bits on that file.
 371                  *
 372                  * The effective group ID or one of the supplementary group
 373                  * IDs of the calling process shall match the group owner of
 374                  * the file when setting the set-group-ID bit on that file
 375                  */
 376                 if (mask & XFS_AT_MODE) {
 377                         mode_t m = 0;
 378
 379                         if ((vap->va_mode & S_ISUID) && !file_owner)
 380                                 m |= S_ISUID;
 381                         if ((vap->va_mode & S_ISGID) &&
 382                             !in_group_p((gid_t)ip->i_d.di_gid))
 383                                 m |= S_ISGID;
 384 #if 0
 385                         /* Linux allows this, Irix doesn't. */
 386                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 387                                 m |= S_ISVTX;
 388 #endif
 389                         if (m && !capable(CAP_FSETID))
 390                                 vap->va_mode &= ~m;
 391                 }
 392         }
 393
 394         /*
 395          * Change file ownership.  Must be the owner or privileged.
 396          * If the system was configured with the "restricted_chown"
 397          * option, the owner is not permitted to give away the file,
 398          * and can change the group id only to a group of which he
 399          * or she is a member.
 400          */
 401         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 402                 /*
 403                  * These IDs could have changed since we last looked at them.
 404                  * But, we're assured that if the ownership did change
 405                  * while we didn't have the inode locked, inode's dquot(s)
 406                  * would have changed also.
 407                  */
 408                 iuid = ip->i_d.di_uid;
 409                 iprojid = ip->i_d.di_projid;
 410                 igid = ip->i_d.di_gid;
 411                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 412                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 413                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 414                          iprojid;
 415
 416                 /*
 417                  * CAP_CHOWN overrides the following restrictions:
 418                  *
 419                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 420                  * shall override the restriction that a process cannot
 421                  * change the user ID of a file it owns and the restriction
 422                  * that the group ID supplied to the chown() function
 423                  * shall be equal to either the group ID or one of the
 424                  * supplementary group IDs of the calling process.
 425                  */
 426                 if (restricted_chown &&
 427                     (iuid != uid || (igid != gid &&
 428                                      !in_group_p((gid_t)gid))) &&
 429                     !capable(CAP_CHOWN)) {
 430                         code = XFS_ERROR(EPERM);
 431                         goto error_return;
 432                 }
 433                 /*
 434                  * Do a quota reservation only if uid/projid/gid is actually
 435                  * going to change.
 436                  */
 437                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 438                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 439                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 440                         ASSERT(tp);
 441                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 442                                                 capable(CAP_FOWNER) ?
 443                                                 XFS_QMOPT_FORCE_RES : 0);
 444                         if (code)       /* out of quota */
 445                                 goto error_return;
 446                 }
 447         }
 448
 449         /*
 450          * Truncate file.  Must have write permission and not be a directory.
 451          */
 452         if (mask & XFS_AT_SIZE) {
 453                 /* Short circuit the truncate case for zero length files */
 454                 if ((vap->va_size == 0) &&
 455                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 456                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 457                         lock_flags &= ~XFS_ILOCK_EXCL;
 458                         if (mask & XFS_AT_CTIME)
 459                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 460                         code = 0;
 461                         goto error_return;
 462                 }
 463
 464                 if (VN_ISDIR(vp)) {
 465                         code = XFS_ERROR(EISDIR);
 466                         goto error_return;
 467                 } else if (!VN_ISREG(vp)) {
 468                         code = XFS_ERROR(EINVAL);
 469                         goto error_return;
 470                 }
 471                 /*
 472                  * Make sure that the dquots are attached to the inode.
 473                  */
 474                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 475                         goto error_return;
 476         }
 477
 478         /*
 479          * Change file access or modified times.
 480          */
 481         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 482                 if (!file_owner) {
 483                         if ((flags & ATTR_UTIME) &&
 484                             !capable(CAP_FOWNER)) {
 485                                 code = XFS_ERROR(EPERM);
 486                                 goto error_return;
 487                         }
 488                 }
 489         }
 490
 491         /*
 492          * Change extent size or realtime flag.
 493          */
 494         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 495                 /*
 496                  * Can't change extent size if any extents are allocated.
 497                  */
 498                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 499                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 500                      vap->va_extsize) ) {
 501                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 502                         goto error_return;
 503                 }
 504
 505                 /*
 506                  * Can't change realtime flag if any extents are allocated.
 507                  */
 508                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 509                     (mask & XFS_AT_XFLAGS) &&
 510                     (XFS_IS_REALTIME_INODE(ip)) !=
 511                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 512                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 513                         goto error_return;
 514                 }
 515                 /*
 516                  * Extent size must be a multiple of the appropriate block
 517                  * size, if set at all.
 518                  */
 519                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 520                         xfs_extlen_t    size;
 521
 522                         if (XFS_IS_REALTIME_INODE(ip) ||
 523                             ((mask & XFS_AT_XFLAGS) &&
 524                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 525                                 size = mp->m_sb.sb_rextsize <<
 526                                        mp->m_sb.sb_blocklog;
 527                         } else {
 528                                 size = mp->m_sb.sb_blocksize;
 529                         }
 530                         if (vap->va_extsize % size) {
 531                                 code = XFS_ERROR(EINVAL);
 532                                 goto error_return;
 533                         }
 534                 }
 535                 /*
 536                  * If realtime flag is set then must have realtime data.
 537                  */
 538                 if ((mask & XFS_AT_XFLAGS) &&
 539                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 540                         if ((mp->m_sb.sb_rblocks == 0) ||
 541                             (mp->m_sb.sb_rextsize == 0) ||
 542                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 543                                 code = XFS_ERROR(EINVAL);
 544                                 goto error_return;
 545                         }
 546                 }
 547
 548                 /*
 549                  * Can't modify an immutable/append-only file unless
 550                  * we have appropriate permission.
 551                  */
 552                 if ((mask & XFS_AT_XFLAGS) &&
 553                     (ip->i_d.di_flags &
 554                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 555                      (vap->va_xflags &
 556                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 557                     !capable(CAP_LINUX_IMMUTABLE)) {
 558                         code = XFS_ERROR(EPERM);
 559                         goto error_return;
 560                 }
 561         }
 562
 563         /*
 564          * Now we can make the changes.  Before we join the inode
 565          * to the transaction, if XFS_AT_SIZE is set then take care of
 566          * the part of the truncation that must be done without the
 567          * inode lock.  This needs to be done before joining the inode
 568          * to the transaction, because the inode cannot be unlocked
 569          * once it is a part of the transaction.
 570          */
 571         if (mask & XFS_AT_SIZE) {
 572                 code = 0;
 573                 if ((vap->va_size > ip->i_size) &&
 574                     (flags & ATTR_NOSIZETOK) == 0) {
 575                         code = xfs_igrow_start(ip, vap->va_size, credp);
 576                 }
 577                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 578
 579                 /*
 580                  * We are going to log the inode size change in this
 581                  * transaction so any previous writes that are beyond the on
 582                  * disk EOF and the new EOF that have not been written out need
 583                  * to be written here. If we do not write the data out, we
 584                  * expose ourselves to the null files problem.
 585                  *
 586                  * Only flush from the on disk size to the smaller of the in
 587                  * memory file size or the new size as that's the range we
 588                  * really care about here and prevents waiting for other data
 589                  * not within the range we care about here.
 590                  */
 591                 if (!code &&
 592                     (ip->i_size != ip->i_d.di_size) &&
 593                     (vap->va_size > ip->i_d.di_size)) {
 594                         code = xfs_flush_pages(ip,
 595                                         ip->i_d.di_size, vap->va_size,
 596                                         XFS_B_ASYNC, FI_NONE);
 597                 }
 598
 599                 /* wait for all I/O to complete */
 600                 vn_iowait(ip);
 601
 602                 if (!code)
 603                         code = xfs_itruncate_data(ip, vap->va_size);
 604                 if (code) {
 605                         ASSERT(tp == NULL);
 606                         lock_flags &= ~XFS_ILOCK_EXCL;
 607                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 608                         goto error_return;
 609                 }
 610                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 611                 if ((code = xfs_trans_reserve(tp, 0,
 612                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 613                                              XFS_TRANS_PERM_LOG_RES,
 614                                              XFS_ITRUNCATE_LOG_COUNT))) {
 615                         xfs_trans_cancel(tp, 0);
 616                         if (need_iolock)
 617                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 618                         return code;
 619                 }
 620                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 621                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 622         }
 623
 624         if (tp) {
 625                 xfs_trans_ijoin(tp, ip, lock_flags);
 626                 xfs_trans_ihold(tp, ip);
 627         }
 628
 629         /* determine whether mandatory locking mode changes */
 630         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 631
 632         /*
 633          * Truncate file.  Must have write permission and not be a directory.
 634          */
 635         if (mask & XFS_AT_SIZE) {
 636                 if (vap->va_size > ip->i_size) {
 637                         xfs_igrow_finish(tp, ip, vap->va_size,
 638                             !(flags & ATTR_DMI));
 639                 } else if ((vap->va_size <= ip->i_size) ||
 640                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 641                         /*
 642                          * signal a sync transaction unless
 643                          * we're truncating an already unlinked
 644                          * file on a wsync filesystem
 645                          */
 646                         code = xfs_itruncate_finish(&tp, ip,
 647                                             (xfs_fsize_t)vap->va_size,
 648                                             XFS_DATA_FORK,
 649                                             ((ip->i_d.di_nlink != 0 ||
 650                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 651                                              ? 1 : 0));
 652                         if (code)
 653                                 goto abort_return;
 654                         /*
 655                          * Truncated "down", so we're removing references
 656                          * to old data here - if we now delay flushing for
 657                          * a long time, we expose ourselves unduly to the
 658                          * notorious NULL files problem.  So, we mark this
 659                          * vnode and flush it when the file is closed, and
 660                          * do not wait the usual (long) time for writeout.
 661                          */
 662                         xfs_iflags_set(ip, XFS_ITRUNCATED);
 663                 }
 664                 /*
 665                  * Have to do this even if the file's size doesn't change.
 666                  */
 667                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 668         }
 669
 670         /*
 671          * Change file access modes.
 672          */
 673         if (mask & XFS_AT_MODE) {
 674                 ip->i_d.di_mode &= S_IFMT;
 675                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 676
 677                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 678                 timeflags |= XFS_ICHGTIME_CHG;
 679         }
 680
 681         /*
 682          * Change file ownership.  Must be the owner or privileged.
 683          * If the system was configured with the "restricted_chown"
 684          * option, the owner is not permitted to give away the file,
 685          * and can change the group id only to a group of which he
 686          * or she is a member.
 687          */
 688         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 689                 /*
 690                  * CAP_FSETID overrides the following restrictions:
 691                  *
 692                  * The set-user-ID and set-group-ID bits of a file will be
 693                  * cleared upon successful return from chown()
 694                  */
 695                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 696                     !capable(CAP_FSETID)) {
 697                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 698                 }
 699
 700                 /*
 701                  * Change the ownerships and register quota modifications
 702                  * in the transaction.
 703                  */
 704                 if (iuid != uid) {
 705                         if (XFS_IS_UQUOTA_ON(mp)) {
 706                                 ASSERT(mask & XFS_AT_UID);
 707                                 ASSERT(udqp);
 708                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 709                                                         &ip->i_udquot, udqp);
 710                         }
 711                         ip->i_d.di_uid = uid;
 712                 }
 713                 if (igid != gid) {
 714                         if (XFS_IS_GQUOTA_ON(mp)) {
 715                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 716                                 ASSERT(mask & XFS_AT_GID);
 717                                 ASSERT(gdqp);
 718                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 719                                                         &ip->i_gdquot, gdqp);
 720                         }
 721                         ip->i_d.di_gid = gid;
 722                 }
 723                 if (iprojid != projid) {
 724                         if (XFS_IS_PQUOTA_ON(mp)) {
 725                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 726                                 ASSERT(mask & XFS_AT_PROJID);
 727                                 ASSERT(gdqp);
 728                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 729                                                         &ip->i_gdquot, gdqp);
 730                         }
 731                         ip->i_d.di_projid = projid;
 732                         /*
 733                          * We may have to rev the inode as well as
 734                          * the superblock version number since projids didn't
 735                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 736                          */
 737                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 738                                 xfs_bump_ino_vers2(tp, ip);
 739                 }
 740
 741                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 742                 timeflags |= XFS_ICHGTIME_CHG;
 743         }
 744
 745
 746         /*
 747          * Change file access or modified times.
 748          */
 749         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 750                 if (mask & XFS_AT_ATIME) {
 751                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 752                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 753                         ip->i_update_core = 1;
 754                         timeflags &= ~XFS_ICHGTIME_ACC;
 755                 }
 756                 if (mask & XFS_AT_MTIME) {
 757                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 758                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 759                         timeflags &= ~XFS_ICHGTIME_MOD;
 760                         timeflags |= XFS_ICHGTIME_CHG;
 761                 }
 762                 if (tp && (flags & ATTR_UTIME))
 763                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 764         }
 765
 766         /*
 767          * Change XFS-added attributes.
 768          */
 769         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 770                 if (mask & XFS_AT_EXTSIZE) {
 771                         /*
 772                          * Converting bytes to fs blocks.
 773                          */
 774                         ip->i_d.di_extsize = vap->va_extsize >>
 775                                 mp->m_sb.sb_blocklog;
 776                 }
 777                 if (mask & XFS_AT_XFLAGS) {
 778                         uint    di_flags;
 779
 780                         /* can't set PREALLOC this way, just preserve it */
 781                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 782                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 783                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 784                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 785                                 di_flags |= XFS_DIFLAG_APPEND;
 786                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 787                                 di_flags |= XFS_DIFLAG_SYNC;
 788                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 789                                 di_flags |= XFS_DIFLAG_NOATIME;
 790                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 791                                 di_flags |= XFS_DIFLAG_NODUMP;
 792                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 793                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 794                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
 795                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 796                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
 797                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 798                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 799                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 800                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 801                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 802                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 803                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 804                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 805                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 806                                 if (vap->va_xflags & XFS_XFLAG_REALTIME)
 807                                         di_flags |= XFS_DIFLAG_REALTIME;
 808                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 809                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 810                         }
 811                         ip->i_d.di_flags = di_flags;
 812                 }
 813                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 814                 timeflags |= XFS_ICHGTIME_CHG;
 815         }
 816
 817         /*
 818          * Change file inode change time only if XFS_AT_CTIME set
 819          * AND we have been called by a DMI function.
 820          */
 821
 822         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 823                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 824                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 825                 ip->i_update_core = 1;
 826                 timeflags &= ~XFS_ICHGTIME_CHG;
 827         }
 828
 829         /*
 830          * Send out timestamp changes that need to be set to the
 831          * current time.  Not done when called by a DMI function.
 832          */
 833         if (timeflags && !(flags & ATTR_DMI))
 834                 xfs_ichgtime(ip, timeflags);
 835
 836         XFS_STATS_INC(xs_ig_attrchg);
 837
 838         /*
 839          * If this is a synchronous mount, make sure that the
 840          * transaction goes to disk before returning to the user.
 841          * This is slightly sub-optimal in that truncates require
 842          * two sync transactions instead of one for wsync filesystems.
 843          * One for the truncate and one for the timestamps since we
 844          * don't want to change the timestamps unless we're sure the
 845          * truncate worked.  Truncates are less than 1% of the laddis
 846          * mix so this probably isn't worth the trouble to optimize.
 847          */
 848         code = 0;
 849         if (tp) {
 850                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 851                         xfs_trans_set_sync(tp);
 852
 853                 code = xfs_trans_commit(tp, commit_flags);
 854         }
 855
 856         /*
 857          * If the (regular) file's mandatory locking mode changed, then
 858          * notify the vnode.  We do this under the inode lock to prevent
 859          * racing calls to vop_vnode_change.
 860          */
 861         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 862
 863         xfs_iunlock(ip, lock_flags);
 864
 865         /*
 866          * Release any dquot(s) the inode had kept before chown.
 867          */
 868         XFS_QM_DQRELE(mp, olddquot1);
 869         XFS_QM_DQRELE(mp, olddquot2);
 870         XFS_QM_DQRELE(mp, udqp);
 871         XFS_QM_DQRELE(mp, gdqp);
 872
 873         if (code) {
 874                 return code;
 875         }
 876
 877         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 878             !(flags & ATTR_DMI)) {
 879                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
 880                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 881                                         0, 0, AT_DELAY_FLAG(flags));
 882         }
 883         return 0;
 884
 885  abort_return:
 886         commit_flags |= XFS_TRANS_ABORT;
 887         /* FALLTHROUGH */
 888  error_return:
 889         XFS_QM_DQRELE(mp, udqp);
 890         XFS_QM_DQRELE(mp, gdqp);
 891         if (tp) {
 892                 xfs_trans_cancel(tp, commit_flags);
 893         }
 894         if (lock_flags != 0) {
 895                 xfs_iunlock(ip, lock_flags);
 896         }
 897         return code;
 898 }
 899
 900 /*
 901  * The maximum pathlen is 1024 bytes. Since the minimum file system
 902  * blocksize is 512 bytes, we can get a max of 2 extents back from
 903  * bmapi.
 904  */
 905 #define SYMLINK_MAPS 2
 906
 907 STATIC int
 908 xfs_readlink_bmap(
 909         xfs_inode_t     *ip,
 910         char            *link)
 911 {
 912         xfs_mount_t     *mp = ip->i_mount;
 913         int             pathlen = ip->i_d.di_size;
 914         int             nmaps = SYMLINK_MAPS;
 915         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 916         xfs_daddr_t     d;
 917         int             byte_cnt;
 918         int             n;
 919         xfs_buf_t       *bp;
 920         int             error = 0;
 921
 922         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
 923                         mval, &nmaps, NULL, NULL);
 924         if (error)
 925                 goto out;
 926
 927         for (n = 0; n < nmaps; n++) {
 928                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 929                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 930
 931                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
 932                 error = XFS_BUF_GETERROR(bp);
 933                 if (error) {
 934                         xfs_ioerror_alert("xfs_readlink",
 935                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
 936                         xfs_buf_relse(bp);
 937                         goto out;
 938                 }
 939                 if (pathlen < byte_cnt)
 940                         byte_cnt = pathlen;
 941                 pathlen -= byte_cnt;
 942
 943                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
 944                 xfs_buf_relse(bp);
 945         }
 946
 947         link[ip->i_d.di_size] = '\0';
 948         error = 0;
 949
 950  out:
 951         return error;
 952 }
 953
 954 int
 955 xfs_readlink(
 956         xfs_inode_t     *ip,
 957         char            *link)
 958 {
 959         xfs_mount_t     *mp = ip->i_mount;
 960         int             pathlen;
 961         int             error = 0;
 962
 963         xfs_itrace_entry(ip);
 964
 965         if (XFS_FORCED_SHUTDOWN(mp))
 966                 return XFS_ERROR(EIO);
 967
 968         xfs_ilock(ip, XFS_ILOCK_SHARED);
 969
 970         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 971         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
 972
 973         pathlen = ip->i_d.di_size;
 974         if (!pathlen)
 975                 goto out;
 976
 977         if (ip->i_df.if_flags & XFS_IFINLINE) {
 978                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 979                 link[pathlen] = '\0';
 980         } else {
 981                 error = xfs_readlink_bmap(ip, link);
 982         }
 983
 984  out:
 985         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 986         return error;
 987 }
 988
 989 /*
 990  * xfs_fsync
 991  *
 992  * This is called to sync the inode and its data out to disk.
 993  * We need to hold the I/O lock while flushing the data, and
 994  * the inode lock while flushing the inode.  The inode lock CANNOT
 995  * be held while flushing the data, so acquire after we're done
 996  * with that.
 997  */
 998 int
 999 xfs_fsync(
1000         xfs_inode_t     *ip,
1001         int             flag,
1002         xfs_off_t       start,
1003         xfs_off_t       stop)
1004 {
1005         xfs_trans_t     *tp;
1006         int             error;
1007         int             log_flushed = 0, changed = 1;
1008
1009         xfs_itrace_entry(ip);
1010
1011         ASSERT(start >= 0 && stop >= -1);
1012
1013         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1014                 return XFS_ERROR(EIO);
1015
1016         if (flag & FSYNC_DATA)
1017                 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
1018
1019         /*
1020          * We always need to make sure that the required inode state
1021          * is safe on disk.  The vnode might be clean but because
1022          * of committed transactions that haven't hit the disk yet.
1023          * Likewise, there could be unflushed non-transactional
1024          * changes to the inode core that have to go to disk.
1025          *
1026          * The following code depends on one assumption:  that
1027          * any transaction that changes an inode logs the core
1028          * because it has to change some field in the inode core
1029          * (typically nextents or nblocks).  That assumption
1030          * implies that any transactions against an inode will
1031          * catch any non-transactional updates.  If inode-altering
1032          * transactions exist that violate this assumption, the
1033          * code breaks.  Right now, it figures that if the involved
1034          * update_* field is clear and the inode is unpinned, the
1035          * inode is clean.  Either it's been flushed or it's been
1036          * committed and the commit has hit the disk unpinning the inode.
1037          * (Note that xfs_inode_item_format() called at commit clears
1038          * the update_* fields.)
1039          */
1040         xfs_ilock(ip, XFS_ILOCK_SHARED);
1041
1042         /* If we are flushing data then we care about update_size
1043          * being set, otherwise we care about update_core
1044          */
1045         if ((flag & FSYNC_DATA) ?
1046                         (ip->i_update_size == 0) :
1047                         (ip->i_update_core == 0)) {
1048                 /*
1049                  * Timestamps/size haven't changed since last inode
1050                  * flush or inode transaction commit.  That means
1051                  * either nothing got written or a transaction
1052                  * committed which caught the updates.  If the
1053                  * latter happened and the transaction hasn't
1054                  * hit the disk yet, the inode will be still
1055                  * be pinned.  If it is, force the log.
1056                  */
1057
1058                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1059
1060                 if (xfs_ipincount(ip)) {
1061                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1062                                       XFS_LOG_FORCE |
1063                                       ((flag & FSYNC_WAIT)
1064                                        ? XFS_LOG_SYNC : 0),
1065                                       &log_flushed);
1066                 } else {
1067                         /*
1068                          * If the inode is not pinned and nothing
1069                          * has changed we don't need to flush the
1070                          * cache.
1071                          */
1072                         changed = 0;
1073                 }
1074                 error = 0;
1075         } else  {
1076                 /*
1077                  * Kick off a transaction to log the inode
1078                  * core to get the updates.  Make it
1079                  * sync if FSYNC_WAIT is passed in (which
1080                  * is done by everybody but specfs).  The
1081                  * sync transaction will also force the log.
1082                  */
1083                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1084                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1085                 if ((error = xfs_trans_reserve(tp, 0,
1086                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1087                                 0, 0, 0)))  {
1088                         xfs_trans_cancel(tp, 0);
1089                         return error;
1090                 }
1091                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1092
1093                 /*
1094                  * Note - it's possible that we might have pushed
1095                  * ourselves out of the way during trans_reserve
1096                  * which would flush the inode.  But there's no
1097                  * guarantee that the inode buffer has actually
1098                  * gone out yet (it's delwri).  Plus the buffer
1099                  * could be pinned anyway if it's part of an
1100                  * inode in another recent transaction.  So we
1101                  * play it safe and fire off the transaction anyway.
1102                  */
1103                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1104                 xfs_trans_ihold(tp, ip);
1105                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1106                 if (flag & FSYNC_WAIT)
1107                         xfs_trans_set_sync(tp);
1108                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1109
1110                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1111         }
1112
1113         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1114                 /*
1115                  * If the log write didn't issue an ordered tag we need
1116                  * to flush the disk cache for the data device now.
1117                  */
1118                 if (!log_flushed)
1119                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1120
1121                 /*
1122                  * If this inode is on the RT dev we need to flush that
1123                  * cache as well.
1124                  */
1125                 if (XFS_IS_REALTIME_INODE(ip))
1126                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1127         }
1128
1129         return error;
1130 }
1131
1132 /*
1133  * This is called by xfs_inactive to free any blocks beyond eof
1134  * when the link count isn't zero and by xfs_dm_punch_hole() when
1135  * punching a hole to EOF.
1136  */
1137 int
1138 xfs_free_eofblocks(
1139         xfs_mount_t     *mp,
1140         xfs_inode_t     *ip,
1141         int             flags)
1142 {
1143         xfs_trans_t     *tp;
1144         int             error;
1145         xfs_fileoff_t   end_fsb;
1146         xfs_fileoff_t   last_fsb;
1147         xfs_filblks_t   map_len;
1148         int             nimaps;
1149         xfs_bmbt_irec_t imap;
1150         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1151
1152         /*
1153          * Figure out if there are any blocks beyond the end
1154          * of the file.  If not, then there is nothing to do.
1155          */
1156         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1157         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1158         map_len = last_fsb - end_fsb;
1159         if (map_len <= 0)
1160                 return 0;
1161
1162         nimaps = 1;
1163         xfs_ilock(ip, XFS_ILOCK_SHARED);
1164         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1165                           NULL, 0, &imap, &nimaps, NULL, NULL);
1166         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1167
1168         if (!error && (nimaps != 0) &&
1169             (imap.br_startblock != HOLESTARTBLOCK ||
1170              ip->i_delayed_blks)) {
1171                 /*
1172                  * Attach the dquots to the inode up front.
1173                  */
1174                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1175                         return error;
1176
1177                 /*
1178                  * There are blocks after the end of file.
1179                  * Free them up now by truncating the file to
1180                  * its current size.
1181                  */
1182                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1183
1184                 /*
1185                  * Do the xfs_itruncate_start() call before
1186                  * reserving any log space because
1187                  * itruncate_start will call into the buffer
1188                  * cache and we can't
1189                  * do that within a transaction.
1190                  */
1191                 if (use_iolock)
1192                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1193                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1194                                     ip->i_size);
1195                 if (error) {
1196                         xfs_trans_cancel(tp, 0);
1197                         if (use_iolock)
1198                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1199                         return error;
1200                 }
1201
1202                 error = xfs_trans_reserve(tp, 0,
1203                                           XFS_ITRUNCATE_LOG_RES(mp),
1204                                           0, XFS_TRANS_PERM_LOG_RES,
1205                                           XFS_ITRUNCATE_LOG_COUNT);
1206                 if (error) {
1207                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1208                         xfs_trans_cancel(tp, 0);
1209                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1210                         return error;
1211                 }
1212
1213                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1214                 xfs_trans_ijoin(tp, ip,
1215                                 XFS_IOLOCK_EXCL |
1216                                 XFS_ILOCK_EXCL);
1217                 xfs_trans_ihold(tp, ip);
1218
1219                 error = xfs_itruncate_finish(&tp, ip,
1220                                              ip->i_size,
1221                                              XFS_DATA_FORK,
1222                                              0);
1223                 /*
1224                  * If we get an error at this point we
1225                  * simply don't bother truncating the file.
1226                  */
1227                 if (error) {
1228                         xfs_trans_cancel(tp,
1229                                          (XFS_TRANS_RELEASE_LOG_RES |
1230                                           XFS_TRANS_ABORT));
1231                 } else {
1232                         error = xfs_trans_commit(tp,
1233                                                 XFS_TRANS_RELEASE_LOG_RES);
1234                 }
1235                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1236                                             : XFS_ILOCK_EXCL));
1237         }
1238         return error;
1239 }
1240
1241 /*
1242  * Free a symlink that has blocks associated with it.
1243  */
1244 STATIC int
1245 xfs_inactive_symlink_rmt(
1246         xfs_inode_t     *ip,
1247         xfs_trans_t     **tpp)
1248 {
1249         xfs_buf_t       *bp;
1250         int             committed;
1251         int             done;
1252         int             error;
1253         xfs_fsblock_t   first_block;
1254         xfs_bmap_free_t free_list;
1255         int             i;
1256         xfs_mount_t     *mp;
1257         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1258         int             nmaps;
1259         xfs_trans_t     *ntp;
1260         int             size;
1261         xfs_trans_t     *tp;
1262
1263         tp = *tpp;
1264         mp = ip->i_mount;
1265         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1266         /*
1267          * We're freeing a symlink that has some
1268          * blocks allocated to it.  Free the
1269          * blocks here.  We know that we've got
1270          * either 1 or 2 extents and that we can
1271          * free them all in one bunmapi call.
1272          */
1273         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1274         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1275                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1276                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1277                 xfs_trans_cancel(tp, 0);
1278                 *tpp = NULL;
1279                 return error;
1280         }
1281         /*
1282          * Lock the inode, fix the size, and join it to the transaction.
1283          * Hold it so in the normal path, we still have it locked for
1284          * the second transaction.  In the error paths we need it
1285          * held so the cancel won't rele it, see below.
1286          */
1287         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1288         size = (int)ip->i_d.di_size;
1289         ip->i_d.di_size = 0;
1290         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1291         xfs_trans_ihold(tp, ip);
1292         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1293         /*
1294          * Find the block(s) so we can inval and unmap them.
1295          */
1296         done = 0;
1297         XFS_BMAP_INIT(&free_list, &first_block);
1298         nmaps = ARRAY_SIZE(mval);
1299         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1300                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1301                         &free_list, NULL)))
1302                 goto error0;
1303         /*
1304          * Invalidate the block(s).
1305          */
1306         for (i = 0; i < nmaps; i++) {
1307                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1308                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1309                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1310                 xfs_trans_binval(tp, bp);
1311         }
1312         /*
1313          * Unmap the dead block(s) to the free_list.
1314          */
1315         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1316                         &first_block, &free_list, NULL, &done)))
1317                 goto error1;
1318         ASSERT(done);
1319         /*
1320          * Commit the first transaction.  This logs the EFI and the inode.
1321          */
1322         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1323                 goto error1;
1324         /*
1325          * The transaction must have been committed, since there were
1326          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1327          * The new tp has the extent freeing and EFDs.
1328          */
1329         ASSERT(committed);
1330         /*
1331          * The first xact was committed, so add the inode to the new one.
1332          * Mark it dirty so it will be logged and moved forward in the log as
1333          * part of every commit.
1334          */
1335         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1336         xfs_trans_ihold(tp, ip);
1337         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1338         /*
1339          * Get a new, empty transaction to return to our caller.
1340          */
1341         ntp = xfs_trans_dup(tp);
1342         /*
1343          * Commit the transaction containing extent freeing and EFDs.
1344          * If we get an error on the commit here or on the reserve below,
1345          * we need to unlock the inode since the new transaction doesn't
1346          * have the inode attached.
1347          */
1348         error = xfs_trans_commit(tp, 0);
1349         tp = ntp;
1350         if (error) {
1351                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1352                 goto error0;
1353         }
1354         /*
1355          * Remove the memory for extent descriptions (just bookkeeping).
1356          */
1357         if (ip->i_df.if_bytes)
1358                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1359         ASSERT(ip->i_df.if_bytes == 0);
1360         /*
1361          * Put an itruncate log reservation in the new transaction
1362          * for our caller.
1363          */
1364         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1365                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1366                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1367                 goto error0;
1368         }
1369         /*
1370          * Return with the inode locked but not joined to the transaction.
1371          */
1372         *tpp = tp;
1373         return 0;
1374
1375  error1:
1376         xfs_bmap_cancel(&free_list);
1377  error0:
1378         /*
1379          * Have to come here with the inode locked and either
1380          * (held and in the transaction) or (not in the transaction).
1381          * If the inode isn't held then cancel would iput it, but
1382          * that's wrong since this is inactive and the vnode ref
1383          * count is 0 already.
1384          * Cancel won't do anything to the inode if held, but it still
1385          * needs to be locked until the cancel is done, if it was
1386          * joined to the transaction.
1387          */
1388         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1389         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1390         *tpp = NULL;
1391         return error;
1392
1393 }
1394
1395 STATIC int
1396 xfs_inactive_symlink_local(
1397         xfs_inode_t     *ip,
1398         xfs_trans_t     **tpp)
1399 {
1400         int             error;
1401
1402         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1403         /*
1404          * We're freeing a symlink which fit into
1405          * the inode.  Just free the memory used
1406          * to hold the old symlink.
1407          */
1408         error = xfs_trans_reserve(*tpp, 0,
1409                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1410                                   0, XFS_TRANS_PERM_LOG_RES,
1411                                   XFS_ITRUNCATE_LOG_COUNT);
1412
1413         if (error) {
1414                 xfs_trans_cancel(*tpp, 0);
1415                 *tpp = NULL;
1416                 return error;
1417         }
1418         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1419
1420         /*
1421          * Zero length symlinks _can_ exist.
1422          */
1423         if (ip->i_df.if_bytes > 0) {
1424                 xfs_idata_realloc(ip,
1425                                   -(ip->i_df.if_bytes),
1426                                   XFS_DATA_FORK);
1427                 ASSERT(ip->i_df.if_bytes == 0);
1428         }
1429         return 0;
1430 }
1431
1432 STATIC int
1433 xfs_inactive_attrs(
1434         xfs_inode_t     *ip,
1435         xfs_trans_t     **tpp)
1436 {
1437         xfs_trans_t     *tp;
1438         int             error;
1439         xfs_mount_t     *mp;
1440
1441         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1442         tp = *tpp;
1443         mp = ip->i_mount;
1444         ASSERT(ip->i_d.di_forkoff != 0);
1445         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1446         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1447
1448         error = xfs_attr_inactive(ip);
1449         if (error) {
1450                 *tpp = NULL;
1451                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1452                 return error; /* goto out */
1453         }
1454
1455         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1456         error = xfs_trans_reserve(tp, 0,
1457                                   XFS_IFREE_LOG_RES(mp),
1458                                   0, XFS_TRANS_PERM_LOG_RES,
1459                                   XFS_INACTIVE_LOG_COUNT);
1460         if (error) {
1461                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1462                 xfs_trans_cancel(tp, 0);
1463                 *tpp = NULL;
1464                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1465                 return error;
1466         }
1467
1468         xfs_ilock(ip, XFS_ILOCK_EXCL);
1469         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1470         xfs_trans_ihold(tp, ip);
1471         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1472
1473         ASSERT(ip->i_d.di_anextents == 0);
1474
1475         *tpp = tp;
1476         return 0;
1477 }
1478
1479 int
1480 xfs_release(
1481         xfs_inode_t     *ip)
1482 {
1483         bhv_vnode_t     *vp = XFS_ITOV(ip);
1484         xfs_mount_t     *mp = ip->i_mount;
1485         int             error;
1486
1487         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1488                 return 0;
1489
1490         /* If this is a read-only mount, don't do this (would generate I/O) */
1491         if (mp->m_flags & XFS_MOUNT_RDONLY)
1492                 return 0;
1493
1494         if (!XFS_FORCED_SHUTDOWN(mp)) {
1495                 int truncated;
1496
1497                 /*
1498                  * If we are using filestreams, and we have an unlinked
1499                  * file that we are processing the last close on, then nothing
1500                  * will be able to reopen and write to this file. Purge this
1501                  * inode from the filestreams cache so that it doesn't delay
1502                  * teardown of the inode.
1503                  */
1504                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1505                         xfs_filestream_deassociate(ip);
1506
1507                 /*
1508                  * If we previously truncated this file and removed old data
1509                  * in the process, we want to initiate "early" writeout on
1510                  * the last close.  This is an attempt to combat the notorious
1511                  * NULL files problem which is particularly noticable from a
1512                  * truncate down, buffered (re-)write (delalloc), followed by
1513                  * a crash.  What we are effectively doing here is
1514                  * significantly reducing the time window where we'd otherwise
1515                  * be exposed to that problem.
1516                  */
1517                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1518                 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1519                         xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1520         }
1521
1522         if (ip->i_d.di_nlink != 0) {
1523                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1524                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1525                        ip->i_delayed_blks > 0)) &&
1526                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1527                     (!(ip->i_d.di_flags &
1528                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1529                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1530                         if (error)
1531                                 return error;
1532                 }
1533         }
1534
1535         return 0;
1536 }
1537
1538 /*
1539  * xfs_inactive
1540  *
1541  * This is called when the vnode reference count for the vnode
1542  * goes to zero.  If the file has been unlinked, then it must
1543  * now be truncated.  Also, we clear all of the read-ahead state
1544  * kept for the inode here since the file is now closed.
1545  */
1546 int
1547 xfs_inactive(
1548         xfs_inode_t     *ip)
1549 {
1550         bhv_vnode_t     *vp = XFS_ITOV(ip);
1551         xfs_bmap_free_t free_list;
1552         xfs_fsblock_t   first_block;
1553         int             committed;
1554         xfs_trans_t     *tp;
1555         xfs_mount_t     *mp;
1556         int             error;
1557         int             truncate;
1558
1559         xfs_itrace_entry(ip);
1560
1561         /*
1562          * If the inode is already free, then there can be nothing
1563          * to clean up here.
1564          */
1565         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1566                 ASSERT(ip->i_df.if_real_bytes == 0);
1567                 ASSERT(ip->i_df.if_broot_bytes == 0);
1568                 return VN_INACTIVE_CACHE;
1569         }
1570
1571         /*
1572          * Only do a truncate if it's a regular file with
1573          * some actual space in it.  It's OK to look at the
1574          * inode's fields without the lock because we're the
1575          * only one with a reference to the inode.
1576          */
1577         truncate = ((ip->i_d.di_nlink == 0) &&
1578             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1579              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1580             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1581
1582         mp = ip->i_mount;
1583
1584         if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
1585                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1586         }
1587
1588         error = 0;
1589
1590         /* If this is a read-only mount, don't do this (would generate I/O) */
1591         if (mp->m_flags & XFS_MOUNT_RDONLY)
1592                 goto out;
1593
1594         if (ip->i_d.di_nlink != 0) {
1595                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1596                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1597                        ip->i_delayed_blks > 0)) &&
1598                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1599                      (!(ip->i_d.di_flags &
1600                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1601                       (ip->i_delayed_blks != 0)))) {
1602                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1603                         if (error)
1604                                 return VN_INACTIVE_CACHE;
1605                 }
1606                 goto out;
1607         }
1608
1609         ASSERT(ip->i_d.di_nlink == 0);
1610
1611         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1612                 return VN_INACTIVE_CACHE;
1613
1614         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1615         if (truncate) {
1616                 /*
1617                  * Do the xfs_itruncate_start() call before
1618                  * reserving any log space because itruncate_start
1619                  * will call into the buffer cache and we can't
1620                  * do that within a transaction.
1621                  */
1622                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1623
1624                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1625                 if (error) {
1626                         xfs_trans_cancel(tp, 0);
1627                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1628                         return VN_INACTIVE_CACHE;
1629                 }
1630
1631                 error = xfs_trans_reserve(tp, 0,
1632                                           XFS_ITRUNCATE_LOG_RES(mp),
1633                                           0, XFS_TRANS_PERM_LOG_RES,
1634                                           XFS_ITRUNCATE_LOG_COUNT);
1635                 if (error) {
1636                         /* Don't call itruncate_cleanup */
1637                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1638                         xfs_trans_cancel(tp, 0);
1639                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1640                         return VN_INACTIVE_CACHE;
1641                 }
1642
1643                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1644                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1645                 xfs_trans_ihold(tp, ip);
1646
1647                 /*
1648                  * normally, we have to run xfs_itruncate_finish sync.
1649                  * But if filesystem is wsync and we're in the inactive
1650                  * path, then we know that nlink == 0, and that the
1651                  * xaction that made nlink == 0 is permanently committed
1652                  * since xfs_remove runs as a synchronous transaction.
1653                  */
1654                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1655                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1656
1657                 if (error) {
1658                         xfs_trans_cancel(tp,
1659                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1660                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1661                         return VN_INACTIVE_CACHE;
1662                 }
1663         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1664
1665                 /*
1666                  * If we get an error while cleaning up a
1667                  * symlink we bail out.
1668                  */
1669                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1670                         xfs_inactive_symlink_rmt(ip, &tp) :
1671                         xfs_inactive_symlink_local(ip, &tp);
1672
1673                 if (error) {
1674                         ASSERT(tp == NULL);
1675                         return VN_INACTIVE_CACHE;
1676                 }
1677
1678                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1679                 xfs_trans_ihold(tp, ip);
1680         } else {
1681                 error = xfs_trans_reserve(tp, 0,
1682                                           XFS_IFREE_LOG_RES(mp),
1683                                           0, XFS_TRANS_PERM_LOG_RES,
1684                                           XFS_INACTIVE_LOG_COUNT);
1685                 if (error) {
1686                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1687                         xfs_trans_cancel(tp, 0);
1688                         return VN_INACTIVE_CACHE;
1689                 }
1690
1691                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1692                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1693                 xfs_trans_ihold(tp, ip);
1694         }
1695
1696         /*
1697          * If there are attributes associated with the file
1698          * then blow them away now.  The code calls a routine
1699          * that recursively deconstructs the attribute fork.
1700          * We need to just commit the current transaction
1701          * because we can't use it for xfs_attr_inactive().
1702          */
1703         if (ip->i_d.di_anextents > 0) {
1704                 error = xfs_inactive_attrs(ip, &tp);
1705                 /*
1706                  * If we got an error, the transaction is already
1707                  * cancelled, and the inode is unlocked. Just get out.
1708                  */
1709                  if (error)
1710                          return VN_INACTIVE_CACHE;
1711         } else if (ip->i_afp) {
1712                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1713         }
1714
1715         /*
1716          * Free the inode.
1717          */
1718         XFS_BMAP_INIT(&free_list, &first_block);
1719         error = xfs_ifree(tp, ip, &free_list);
1720         if (error) {
1721                 /*
1722                  * If we fail to free the inode, shut down.  The cancel
1723                  * might do that, we need to make sure.  Otherwise the
1724                  * inode might be lost for a long time or forever.
1725                  */
1726                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1727                         cmn_err(CE_NOTE,
1728                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1729                                 error, mp->m_fsname);
1730                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1731                 }
1732                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1733         } else {
1734                 /*
1735                  * Credit the quota account(s). The inode is gone.
1736                  */
1737                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1738
1739                 /*
1740                  * Just ignore errors at this point.  There is
1741                  * nothing we can do except to try to keep going.
1742                  */
1743                 (void) xfs_bmap_finish(&tp,  &free_list, &committed);
1744                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1745         }
1746         /*
1747          * Release the dquots held by inode, if any.
1748          */
1749         XFS_QM_DQDETACH(mp, ip);
1750
1751         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1752
1753  out:
1754         return VN_INACTIVE_CACHE;
1755 }
1756
1757
1758 int
1759 xfs_lookup(
1760         xfs_inode_t             *dp,
1761         bhv_vname_t             *dentry,
1762         bhv_vnode_t             **vpp)
1763 {
1764         xfs_inode_t             *ip;
1765         xfs_ino_t               e_inum;
1766         int                     error;
1767         uint                    lock_mode;
1768
1769         xfs_itrace_entry(dp);
1770
1771         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1772                 return XFS_ERROR(EIO);
1773
1774         lock_mode = xfs_ilock_map_shared(dp);
1775         error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
1776         if (!error) {
1777                 *vpp = XFS_ITOV(ip);
1778                 xfs_itrace_ref(ip);
1779         }
1780         xfs_iunlock_map_shared(dp, lock_mode);
1781         return error;
1782 }
1783
1784 int
1785 xfs_create(
1786         xfs_inode_t             *dp,
1787         bhv_vname_t             *dentry,
1788         mode_t                  mode,
1789         xfs_dev_t               rdev,
1790         bhv_vnode_t             **vpp,
1791         cred_t                  *credp)
1792 {
1793         char                    *name = VNAME(dentry);
1794         xfs_mount_t             *mp = dp->i_mount;
1795         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
1796         xfs_inode_t             *ip;
1797         bhv_vnode_t             *vp = NULL;
1798         xfs_trans_t             *tp;
1799         int                     error;
1800         xfs_bmap_free_t         free_list;
1801         xfs_fsblock_t           first_block;
1802         boolean_t               unlock_dp_on_error = B_FALSE;
1803         int                     dm_event_sent = 0;
1804         uint                    cancel_flags;
1805         int                     committed;
1806         xfs_prid_t              prid;
1807         struct xfs_dquot        *udqp, *gdqp;
1808         uint                    resblks;
1809         int                     namelen;
1810
1811         ASSERT(!*vpp);
1812         xfs_itrace_entry(dp);
1813
1814         namelen = VNAMELEN(dentry);
1815
1816         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1817                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1818                                 dir_vp, DM_RIGHT_NULL, NULL,
1819                                 DM_RIGHT_NULL, name, NULL,
1820                                 mode, 0, 0);
1821
1822                 if (error)
1823                         return error;
1824                 dm_event_sent = 1;
1825         }
1826
1827         if (XFS_FORCED_SHUTDOWN(mp))
1828                 return XFS_ERROR(EIO);
1829
1830         /* Return through std_return after this point. */
1831
1832         udqp = gdqp = NULL;
1833         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1834                 prid = dp->i_d.di_projid;
1835         else
1836                 prid = (xfs_prid_t)dfltprid;
1837
1838         /*
1839          * Make sure that we have allocated dquot(s) on disk.
1840          */
1841         error = XFS_QM_DQVOPALLOC(mp, dp,
1842                         current_fsuid(credp), current_fsgid(credp), prid,
1843                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1844         if (error)
1845                 goto std_return;
1846
1847         ip = NULL;
1848
1849         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1850         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1851         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1852         /*
1853          * Initially assume that the file does not exist and
1854          * reserve the resources for that case.  If that is not
1855          * the case we'll drop the one we have and get a more
1856          * appropriate transaction later.
1857          */
1858         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1859                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1860         if (error == ENOSPC) {
1861                 resblks = 0;
1862                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1863                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1864         }
1865         if (error) {
1866                 cancel_flags = 0;
1867                 goto error_return;
1868         }
1869
1870         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1871         unlock_dp_on_error = B_TRUE;
1872
1873         XFS_BMAP_INIT(&free_list, &first_block);
1874
1875         ASSERT(ip == NULL);
1876
1877         /*
1878          * Reserve disk quota and the inode.
1879          */
1880         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1881         if (error)
1882                 goto error_return;
1883
1884         if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
1885                 goto error_return;
1886         error = xfs_dir_ialloc(&tp, dp, mode, 1,
1887                         rdev, credp, prid, resblks > 0,
1888                         &ip, &committed);
1889         if (error) {
1890                 if (error == ENOSPC)
1891                         goto error_return;
1892                 goto abort_return;
1893         }
1894         xfs_itrace_ref(ip);
1895
1896         /*
1897          * At this point, we've gotten a newly allocated inode.
1898          * It is locked (and joined to the transaction).
1899          */
1900
1901         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1902
1903         /*
1904          * Now we join the directory inode to the transaction.  We do not do it
1905          * earlier because xfs_dir_ialloc might commit the previous transaction
1906          * (and release all the locks).  An error from here on will result in
1907          * the transaction cancel unlocking dp so don't do it explicitly in the
1908          * error path.
1909          */
1910         VN_HOLD(dir_vp);
1911         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1912         unlock_dp_on_error = B_FALSE;
1913
1914         error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
1915                                         &first_block, &free_list, resblks ?
1916                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1917         if (error) {
1918                 ASSERT(error != ENOSPC);
1919                 goto abort_return;
1920         }
1921         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1922         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1923
1924         /*
1925          * If this is a synchronous mount, make sure that the
1926          * create transaction goes to disk before returning to
1927          * the user.
1928          */
1929         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1930                 xfs_trans_set_sync(tp);
1931         }
1932
1933         dp->i_gen++;
1934
1935         /*
1936          * Attach the dquot(s) to the inodes and modify them incore.
1937          * These ids of the inode couldn't have changed since the new
1938          * inode has been locked ever since it was created.
1939          */
1940         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
1941
1942         /*
1943          * xfs_trans_commit normally decrements the vnode ref count
1944          * when it unlocks the inode. Since we want to return the
1945          * vnode to the caller, we bump the vnode ref count now.
1946          */
1947         IHOLD(ip);
1948         vp = XFS_ITOV(ip);
1949
1950         error = xfs_bmap_finish(&tp, &free_list, &committed);
1951         if (error) {
1952                 xfs_bmap_cancel(&free_list);
1953                 goto abort_rele;
1954         }
1955
1956         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1957         if (error) {
1958                 IRELE(ip);
1959                 tp = NULL;
1960                 goto error_return;
1961         }
1962
1963         XFS_QM_DQRELE(mp, udqp);
1964         XFS_QM_DQRELE(mp, gdqp);
1965
1966         *vpp = vp;
1967
1968         /* Fallthrough to std_return with error = 0  */
1969
1970 std_return:
1971         if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
1972             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
1973                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
1974                         dir_vp, DM_RIGHT_NULL,
1975                         *vpp ? vp:NULL,
1976                         DM_RIGHT_NULL, name, NULL,
1977                         mode, error, 0);
1978         }
1979         return error;
1980
1981  abort_return:
1982         cancel_flags |= XFS_TRANS_ABORT;
1983         /* FALLTHROUGH */
1984
1985  error_return:
1986         if (tp != NULL)
1987                 xfs_trans_cancel(tp, cancel_flags);
1988
1989         XFS_QM_DQRELE(mp, udqp);
1990         XFS_QM_DQRELE(mp, gdqp);
1991
1992         if (unlock_dp_on_error)
1993                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1994
1995         goto std_return;
1996
1997  abort_rele:
1998         /*
1999          * Wait until after the current transaction is aborted to
2000          * release the inode.  This prevents recursive transactions
2001          * and deadlocks from xfs_inactive.
2002          */
2003         cancel_flags |= XFS_TRANS_ABORT;
2004         xfs_trans_cancel(tp, cancel_flags);
2005         IRELE(ip);
2006
2007         XFS_QM_DQRELE(mp, udqp);
2008         XFS_QM_DQRELE(mp, gdqp);
2009
2010         goto std_return;
2011 }
2012
#ifdef DEBUG
/*
 * Debug-only counters showing whether (and how often) the deadlock
 * prevention paths in xfs_lock_dir_and_entry() are being hit.
 */
int xfs_rm_locks;               /* calls to xfs_lock_dir_and_entry() */
int xfs_rm_lock_delays;         /* delay(1) back-offs taken while retrying */
int xfs_rm_attempts;            /* failed trylocks on the entry inode */
#endif /* DEBUG */
2023
2024 /*
2025  * The following routine will lock the inodes associated with the
2026  * directory and the named entry in the directory. The locks are
2027  * acquired in increasing inode number.
2028  *
2029  * If the entry is "..", then only the directory is locked. The
2030  * vnode ref count will still include that from the .. entry in
2031  * this case.
2032  *
2033  * There is a deadlock we need to worry about. If the locked directory is
2034  * in the AIL, it might be blocking up the log. The next inode we lock
2035  * could be already locked by another thread waiting for log space (e.g
2036  * a permanent log reservation with a long running transaction (see
2037  * xfs_itruncate_finish)). To solve this, we must check if the directory
2038  * is in the ail and use lock_nowait. If we can't lock, we need to
2039  * drop the inode lock on the directory and try again. xfs_iunlock will
2040  * potentially push the tail if we were holding up the log.
2041  */
2042 STATIC int
2043 xfs_lock_dir_and_entry(
2044         xfs_inode_t     *dp,
2045         xfs_inode_t     *ip)    /* inode of entry 'name' */
2046 {
2047         int             attempts;
2048         xfs_ino_t       e_inum;
2049         xfs_inode_t     *ips[2];
2050         xfs_log_item_t  *lp;
2051
2052 #ifdef DEBUG
2053         xfs_rm_locks++;
2054 #endif
2055         attempts = 0;
2056
2057 again:
2058         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2059
2060         e_inum = ip->i_ino;
2061
2062         xfs_itrace_ref(ip);
2063
2064         /*
2065          * We want to lock in increasing inum. Since we've already
2066          * acquired the lock on the directory, we may need to release
2067          * if if the inum of the entry turns out to be less.
2068          */
2069         if (e_inum > dp->i_ino) {
2070                 /*
2071                  * We are already in the right order, so just
2072                  * lock on the inode of the entry.
2073                  * We need to use nowait if dp is in the AIL.
2074                  */
2075
2076                 lp = (xfs_log_item_t *)dp->i_itemp;
2077                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2078                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2079                                 attempts++;
2080 #ifdef DEBUG
2081                                 xfs_rm_attempts++;
2082 #endif
2083
2084                                 /*
2085                                  * Unlock dp and try again.
2086                                  * xfs_iunlock will try to push the tail
2087                                  * if the inode is in the AIL.
2088                                  */
2089
2090                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2091
2092                                 if ((attempts % 5) == 0) {
2093                                         delay(1); /* Don't just spin the CPU */
2094 #ifdef DEBUG
2095                                         xfs_rm_lock_delays++;
2096 #endif
2097                                 }
2098                                 goto again;
2099                         }
2100                 } else {
2101                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2102                 }
2103         } else if (e_inum < dp->i_ino) {
2104                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2105
2106                 ips[0] = ip;
2107                 ips[1] = dp;
2108                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2109         }
2110         /* else  e_inum == dp->i_ino */
2111         /*     This can happen if we're asked to lock /x/..
2112          *     the entry is "..", which is also the parent directory.
2113          */
2114
2115         return 0;
2116 }
2117
#ifdef DEBUG
/* Debug-only statistics maintained by xfs_lock_inodes(). */
int xfs_locked_n;               /* calls that completed without any retry */
int xfs_small_retries;          /* calls that needed fewer than 5 retries */
int xfs_middle_retries;         /* calls that needed 5 to 99 retries */
int xfs_lots_retries;           /* calls that needed 100 or more retries */
int xfs_lock_delays;            /* delay(1) back-offs taken while retrying */
#endif /* DEBUG */
2125
2126 /*
2127  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2128  * a different value
2129  */
2130 static inline int
2131 xfs_lock_inumorder(int lock_mode, int subclass)
2132 {
2133         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2134                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2135         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2136                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2137
2138         return lock_mode;
2139 }
2140
2141 /*
2142  * The following routine will lock n inodes in exclusive mode.
2143  * We assume the caller calls us with the inodes in i_ino order.
2144  *
2145  * We need to detect deadlock where an inode that we lock
2146  * is in the AIL and we start waiting for another inode that is locked
2147  * by a thread in a long running transaction (such as truncate). This can
2148  * result in deadlock since the long running trans might need to wait
2149  * for the inode we just locked in order to push the tail and free space
2150  * in the log.
2151  */
2152 void
2153 xfs_lock_inodes(
2154         xfs_inode_t     **ips,
2155         int             inodes,
2156         int             first_locked,
2157         uint            lock_mode)
2158 {
2159         int             attempts = 0, i, j, try_lock;
2160         xfs_log_item_t  *lp;
2161
2162         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2163
2164         if (first_locked) {
2165                 try_lock = 1;
2166                 i = 1;
2167         } else {
2168                 try_lock = 0;
2169                 i = 0;
2170         }
2171
2172 again:
2173         for (; i < inodes; i++) {
2174                 ASSERT(ips[i]);
2175
2176                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2177                         continue;
2178
2179                 /*
2180                  * If try_lock is not set yet, make sure all locked inodes
2181                  * are not in the AIL.
2182                  * If any are, set try_lock to be used later.
2183                  */
2184
2185                 if (!try_lock) {
2186                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2187                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2188                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2189                                         try_lock++;
2190                                 }
2191                         }
2192                 }
2193
2194                 /*
2195                  * If any of the previous locks we have locked is in the AIL,
2196                  * we must TRY to get the second and subsequent locks. If
2197                  * we can't get any, we must release all we have
2198                  * and try again.
2199                  */
2200
2201                 if (try_lock) {
2202                         /* try_lock must be 0 if i is 0. */
2203                         /*
2204                          * try_lock means we have an inode locked
2205                          * that is in the AIL.
2206                          */
2207                         ASSERT(i != 0);
2208                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2209                                 attempts++;
2210
2211                                 /*
2212                                  * Unlock all previous guys and try again.
2213                                  * xfs_iunlock will try to push the tail
2214                                  * if the inode is in the AIL.
2215                                  */
2216
2217                                 for(j = i - 1; j >= 0; j--) {
2218
2219                                         /*
2220                                          * Check to see if we've already
2221                                          * unlocked this one.
2222                                          * Not the first one going back,
2223                                          * and the inode ptr is the same.
2224                                          */
2225                                         if ((j != (i - 1)) && ips[j] ==
2226                                                                 ips[j+1])
2227                                                 continue;
2228
2229                                         xfs_iunlock(ips[j], lock_mode);
2230                                 }
2231
2232                                 if ((attempts % 5) == 0) {
2233                                         delay(1); /* Don't just spin the CPU */
2234 #ifdef DEBUG
2235                                         xfs_lock_delays++;
2236 #endif
2237                                 }
2238                                 i = 0;
2239                                 try_lock = 0;
2240                                 goto again;
2241                         }
2242                 } else {
2243                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2244                 }
2245         }
2246
2247 #ifdef DEBUG
2248         if (attempts) {
2249                 if (attempts < 5) xfs_small_retries++;
2250                 else if (attempts < 100) xfs_middle_retries++;
2251                 else xfs_lots_retries++;
2252         } else {
2253                 xfs_locked_n++;
2254         }
2255 #endif
2256 }
2257
/*
 * REMOVE_DEBUG_TRACE(x) records, in DEBUG builds only, the value x
 * (callers pass __LINE__) in remove_which_error_return so the error
 * exit that was taken can be identified afterwards.  Both variants
 * expand to a single statement that requires the trailing semicolon,
 * which keeps the macro safe inside an unbraced if/else.
 */
#ifdef  DEBUG
int remove_which_error_return = 0;
#define REMOVE_DEBUG_TRACE(x)   \
        do { remove_which_error_return = (x); } while (0)
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)   do { } while (0)
#endif  /* ! DEBUG */
2264
/*
 * xfs_remove - remove the name described by dentry from directory dp
 * and drop one link on the inode (ip) that the name refers to.
 *
 * dp:      inode of the directory containing the entry.
 * dentry:  supplies the entry name (VNAME/VNAMELEN) and the target
 *          inode (VNAME_TO_INODE).
 *
 * Returns 0 on success or an error code.  EIO is returned immediately
 * if the filesystem has been forcibly shut down.
 *
 * DMAPI: if DM_EVENT_REMOVE is enabled on dp the pre-event is sent first
 * and its failure aborts the operation.  Every exit after that point
 * goes through std_return, so DM_EVENT_POSTREMOVE (when enabled on dp)
 * is sent with the final error code.
 */
2265 int
2266 xfs_remove(
2267         xfs_inode_t             *dp,
2268         bhv_vname_t             *dentry)
2269 {
2270         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2271         char                    *name = VNAME(dentry);
2272         xfs_mount_t             *mp = dp->i_mount;
2273         xfs_inode_t             *ip = VNAME_TO_INODE(dentry);
2274         int                     namelen = VNAMELEN(dentry);
2275         xfs_trans_t             *tp = NULL;
2276         int                     error = 0;
2277         xfs_bmap_free_t         free_list;
2278         xfs_fsblock_t           first_block;
2279         int                     cancel_flags;
2280         int                     committed;
2281         int                     link_zero;
2282         uint                    resblks;
2283
2284         xfs_itrace_entry(dp);
2285
2286         if (XFS_FORCED_SHUTDOWN(mp))
2287                 return XFS_ERROR(EIO);
2288
             /* Pre-event: a failure here ends the remove before any work. */
2289         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2290                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2291                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2292                                         name, NULL, ip->i_d.di_mode, 0, 0);
2293                 if (error)
2294                         return error;
2295         }
2296
2297         /*
2298          * We need to get a reference to ip before we get our log
2299          * reservation. The reason for this is that we cannot call
2300          * xfs_iget for an inode for which we do not have a reference
2301          * once we've acquired a log reservation. This is because the
2302          * inode we are trying to get might be in xfs_inactive going
2303          * for a log reservation. Since we'll have to wait for the
2304          * inactive code to complete before returning from xfs_iget,
2305          * we need to make sure that we don't have log space reserved
2306          * when we call xfs_iget.  Instead we get an unlocked reference
2307          * to the inode before getting our log reservation.
2308          */
2309         IHOLD(ip);
2310
2311         xfs_itrace_entry(ip);
2312         xfs_itrace_ref(ip);
2313
             /* Attach dquots to the directory and, if distinct, the entry. */
2314         error = XFS_QM_DQATTACH(mp, dp, 0);
2315         if (!error && dp != ip)
2316                 error = XFS_QM_DQATTACH(mp, ip, 0);
2317         if (error) {
2318                 REMOVE_DEBUG_TRACE(__LINE__);
2319                 IRELE(ip);
2320                 goto std_return;
2321         }
2322
2323         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2324         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2325         /*
2326          * We try to get the real space reservation first,
2327          * allowing for directory btree deletion(s) implying
2328          * possible bmap insert(s).  If we can't get the space
2329          * reservation then we use 0 instead, and avoid the bmap
2330          * btree insert(s) in the directory code by, if the bmap
2331          * insert tries to happen, instead trimming the LAST
2332          * block from the directory.
2333          */
2334         resblks = XFS_REMOVE_SPACE_RES(mp);
2335         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2336                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2337         if (error == ENOSPC) {
2338                 resblks = 0;
2339                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2340                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2341         }
2342         if (error) {
2343                 ASSERT(error != ENOSPC);
2344                 REMOVE_DEBUG_TRACE(__LINE__);
                     /*
                      * The reservation failed, so the cancel is issued
                      * without XFS_TRANS_RELEASE_LOG_RES.
                      */
2345                 xfs_trans_cancel(tp, 0);
2346                 IRELE(ip);
                     /*
                      * Leave through std_return like every other failure
                      * that follows the DM_EVENT_REMOVE pre-event, so the
                      * post-remove event reports this error as well.
                      */
2347                 goto std_return;
2348         }
2349
2350         error = xfs_lock_dir_and_entry(dp, ip);
2351         if (error) {
2352                 REMOVE_DEBUG_TRACE(__LINE__);
2353                 xfs_trans_cancel(tp, cancel_flags);
2354                 IRELE(ip);
2355                 goto std_return;
2356         }
2357
2358         /*
2359          * At this point, we've gotten both the directory and the entry
2360          * inodes locked.
2361          */
2362         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2363         if (dp != ip) {
2364                 /*
2365                  * Increment vnode ref count only in this case since
2366                  * there's an extra vnode reference in the case where
2367                  * dp == ip.
2368                  */
2369                 IHOLD(dp);
2370                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2371         }
2372
2373         /*
2374          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2375          */
2376         XFS_BMAP_INIT(&free_list, &first_block);
2377         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2378                                         &first_block, &free_list, 0);
2379         if (error) {
2380                 ASSERT(error != ENOENT);
2381                 REMOVE_DEBUG_TRACE(__LINE__);
2382                 goto error1;
2383         }
2384         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2385
             /* Bump the directory's generation count and log its core. */
2386         dp->i_gen++;
2387         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2388
2389         error = xfs_droplink(tp, ip);
2390         if (error) {
2391                 REMOVE_DEBUG_TRACE(__LINE__);
2392                 goto error1;
2393         }
2394
2395         /* Determine if this is the last link while
2396          * we are in the transaction.
2397          */
2398         link_zero = (ip)->i_d.di_nlink==0;
2399
2400         /*
2401          * Take an extra ref on the inode so that it doesn't
2402          * go to xfs_inactive() from within the commit.
2403          */
2404         IHOLD(ip);
2405
2406         /*
2407          * If this is a synchronous mount, make sure that the
2408          * remove transaction goes to disk before returning to
2409          * the user.
2410          */
2411         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2412                 xfs_trans_set_sync(tp);
2413         }
2414
2415         error = xfs_bmap_finish(&tp, &free_list, &committed);
2416         if (error) {
2417                 REMOVE_DEBUG_TRACE(__LINE__);
2418                 goto error_rele;
2419         }
2420
2421         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2422         if (error) {
                     /* Drop the extra reference taken just before commit. */
2423                 IRELE(ip);
2424                 goto std_return;
2425         }
2426
2427         /*
2428          * If we are using filestreams, kill the stream association.
2429          * If the file is still open it may get a new one but that
2430          * will get killed on last close in xfs_close() so we don't
2431          * have to worry about that.
2432          */
2433         if (link_zero && xfs_inode_is_filestream(ip))
2434                 xfs_filestream_deassociate(ip);
2435
2436         xfs_itrace_exit(ip);
2437         IRELE(ip);
2438
2439 /*      Fall through to std_return with error = 0 */
2440  std_return:
             /* Common exit: report the outcome through the post-remove event. */
2441         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2442                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2443                                 dir_vp, DM_RIGHT_NULL,
2444                                 NULL, DM_RIGHT_NULL,
2445                                 name, NULL, ip->i_d.di_mode, error, 0);
2446         }
2447         return error;
2448
2449  error1:
             /*
              * Failure after the inodes were joined but before the extra
              * IHOLD(ip): discard the free list and abort the transaction.
              */
2450         xfs_bmap_cancel(&free_list);
2451         cancel_flags |= XFS_TRANS_ABORT;
2452         xfs_trans_cancel(tp, cancel_flags);
2453         goto std_return;
2454
2455  error_rele:
2456         /*
2457          * In this case make sure to not release the inode until after
2458          * the current transaction is aborted.  Releasing it beforehand
2459          * can cause us to go to xfs_inactive and start a recursive
2460          * transaction which can easily deadlock with the current one.
2461          */
2462         xfs_bmap_cancel(&free_list);
2463         cancel_flags |= XFS_TRANS_ABORT;
2464         xfs_trans_cancel(tp, cancel_flags);
2465
2466         IRELE(ip);
2467
2468         goto std_return;
2469 }
2470
/*
 * xfs_link - create a new hard link to an existing inode.
 *
 * tdp:     inode of the target directory that receives the new entry.
 * src_vp:  vnode of the existing inode being linked to; asserted not to
 *          be a directory.
 * dentry:  supplies the name of the new entry (VNAME/VNAMELEN).
 *
 * Returns 0 on success or an error code: EIO on forced shutdown, EMLINK
 * when the source already has XFS_MAXLINK links, EXDEV when the target
 * directory has XFS_DIFLAG_PROJINHERIT set and its project ID differs
 * from the source's, or whatever the quota, transaction or directory
 * routines report.
 *
 * Once the DM_EVENT_LINK pre-event (if enabled on tdp) has been accepted,
 * all exits go through std_return, which sends DM_EVENT_POSTLINK when
 * that event is enabled on the source inode.
 */
2471 int
2472 xfs_link(
2473         xfs_inode_t             *tdp,
2474         bhv_vnode_t             *src_vp,
2475         bhv_vname_t             *dentry)
2476 {
2477         bhv_vnode_t             *target_dir_vp = XFS_ITOV(tdp);
2478         xfs_mount_t             *mp = tdp->i_mount;
2479         xfs_inode_t             *sip = xfs_vtoi(src_vp);
2480         xfs_trans_t             *tp;
2481         xfs_inode_t             *ips[2];
2482         int                     error;
2483         xfs_bmap_free_t         free_list;
2484         xfs_fsblock_t           first_block;
2485         int                     cancel_flags;
2486         int                     committed;
2487         int                     resblks;
2488         char                    *target_name = VNAME(dentry);
2489         int                     target_namelen;
2490
2491         xfs_itrace_entry(tdp);
2492         xfs_itrace_entry(xfs_vtoi(src_vp));
2493
2494         target_namelen = VNAMELEN(dentry);
2495         ASSERT(!VN_ISDIR(src_vp));
2496
2497         if (XFS_FORCED_SHUTDOWN(mp))
2498                 return XFS_ERROR(EIO);
2499
2500         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2501                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2502                                         target_dir_vp, DM_RIGHT_NULL,
2503                                         src_vp, DM_RIGHT_NULL,
2504                                         target_name, NULL, 0, 0, 0);
2505                 if (error)
2506                         return error;
2507         }
2508
2509         /* Return through std_return after this point. */
2510
2511         error = XFS_QM_DQATTACH(mp, sip, 0);
2512         if (!error && sip != tdp)
2513                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2514         if (error)
2515                 goto std_return;
2516
             /*
              * Try for a real block reservation first; on ENOSPC retry
              * with zero blocks and remember that in resblks.
              */
2517         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2518         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2519         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2520         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2521                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2522         if (error == ENOSPC) {
2523                 resblks = 0;
2524                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2525                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2526         }
2527         if (error) {
                     /* No reservation was obtained: cancel with no flags. */
2528                 cancel_flags = 0;
2529                 goto error_return;
2530         }
2531
             /* Hand the two inodes to xfs_lock_inodes() in inode-number order. */
2532         if (sip->i_ino < tdp->i_ino) {
2533                 ips[0] = sip;
2534                 ips[1] = tdp;
2535         } else {
2536                 ips[0] = tdp;
2537                 ips[1] = sip;
2538         }
2539
2540         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2541
2542         /*
2543          * Increment vnode ref counts since xfs_trans_commit &
2544          * xfs_trans_cancel will both unlock the inodes and
2545          * decrement the associated ref counts.
2546          */
2547         VN_HOLD(src_vp);
2548         VN_HOLD(target_dir_vp);
2549         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2550         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2551
2552         /*
2553          * If the source has too many links, we can't make any more to it.
2554          */
2555         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2556                 error = XFS_ERROR(EMLINK);
2557                 goto error_return;
2558         }
2559
2560         /*
2561          * If we are using project inheritance, we only allow hard link
2562          * creation in our tree when the project IDs are the same; else
2563          * the tree quota mechanism could be circumvented.
2564          */
2565         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2566                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2567                 error = XFS_ERROR(EXDEV);
2568                 goto error_return;
2569         }
2570
             /*
              * Without a block reservation, ask xfs_dir_canenter() first
              * and give up on its error before modifying anything.
              */
2571         if (resblks == 0 &&
2572             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2573                 goto error_return;
2574
2575         XFS_BMAP_INIT(&free_list, &first_block);
2576
             /*
              * From here on the free list is initialised, so failures leave
              * through error1, which cancels it before aborting.
              */
2577         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2578                                    sip->i_ino, &first_block, &free_list,
2579                                    resblks);
2580         if (error)
2581                 goto error1;
2582         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2583         tdp->i_gen++;
2584         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2585
2586         error = xfs_bumplink(tp, sip);
2587         if (error)
2588                 goto error1;
2589
2590         /*
2591          * If this is a synchronous mount, make sure that the
2592          * link transaction goes to disk before returning to
2593          * the user.
2594          */
2595         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2596                 xfs_trans_set_sync(tp);
2597         }
2598
2599         error = xfs_bmap_finish (&tp, &free_list, &committed);
2600         if (error)
2602                 goto error1;
2604
2605         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2606         if (error)
2607                 goto std_return;
2608
2609         /* Fall through to std_return with error = 0. */
2610 std_return:
2611         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2612                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2613                                 target_dir_vp, DM_RIGHT_NULL,
2614                                 src_vp, DM_RIGHT_NULL,
2615                                 target_name, NULL, 0, error, 0);
2616         }
2617         return error;
2618
      error1:
             /*
              * Failure after XFS_BMAP_INIT(): discard the free list, as the
              * error paths of xfs_remove/xfs_mkdir/xfs_rmdir do, then abort.
              */
             xfs_bmap_cancel(&free_list);
             /* FALLTHROUGH */

2619  abort_return:
2620         cancel_flags |= XFS_TRANS_ABORT;
2621         /* FALLTHROUGH */
2622
2623  error_return:
2624         xfs_trans_cancel(tp, cancel_flags);
2625         goto std_return;
2626 }
2627
2628
/*
 * xfs_mkdir - create a new directory named by dentry inside directory dp.
 *
 * dp:      parent directory inode.
 * dentry:  supplies the new directory's name (VNAME/VNAMELEN).
 * mode:    mode handed to xfs_dir_ialloc() for the new inode and
 *          reported in the DMAPI events.
 * vpp:     set to the vnode of the new directory once it has been
 *          created; an IHOLD() on the new inode is taken alongside.
 * credp:   credentials used for the quota lookup and inode allocation.
 *
 * Returns 0 on success or an error code: EIO on forced shutdown, EMLINK
 * if the parent already has XFS_MAXLINK links, or whatever the quota,
 * transaction, allocation or directory routines report.
 *
 * If DM_EVENT_CREATE is enabled on dp the pre-event is sent first; after
 * that all exits go through std_return, which sends DM_EVENT_POSTCREATE
 * (when enabled) if the directory was created or if the pre-event was
 * sent and the operation failed.
 */
2629 int
2630 xfs_mkdir(
2631         xfs_inode_t             *dp,
2632         bhv_vname_t             *dentry,
2633         mode_t                  mode,
2634         bhv_vnode_t             **vpp,
2635         cred_t                  *credp)
2636 {
2637         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2638         char                    *dir_name = VNAME(dentry);
2639         int                     dir_namelen = VNAMELEN(dentry);
2640         xfs_mount_t             *mp = dp->i_mount;
2641         xfs_inode_t             *cdp;   /* inode of created dir */
2642         bhv_vnode_t             *cvp;   /* vnode of created dir */
2643         xfs_trans_t             *tp;
2644         int                     cancel_flags;
2645         int                     error;
2646         int                     committed;
2647         xfs_bmap_free_t         free_list;
2648         xfs_fsblock_t           first_block;
2649         boolean_t               unlock_dp_on_error = B_FALSE;
2650         boolean_t               created = B_FALSE;
2651         int                     dm_event_sent = 0;
2652         xfs_prid_t              prid;
2653         struct xfs_dquot        *udqp, *gdqp;
2654         uint                    resblks;
2655
2656         if (XFS_FORCED_SHUTDOWN(mp))
2657                 return XFS_ERROR(EIO);
2658
2659         tp = NULL;
2660
2661         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2662                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2663                                         dir_vp, DM_RIGHT_NULL, NULL,
2664                                         DM_RIGHT_NULL, dir_name, NULL,
2665                                         mode, 0, 0);
2666                 if (error)
2667                         return error;
2668                 dm_event_sent = 1;
2669         }
2670
2671         /* Return through std_return after this point. */
2672
2673         xfs_itrace_entry(dp);
2674
             /* mp was already initialised from dp->i_mount above. */
2676         udqp = gdqp = NULL;
             /* Inherit the parent's project ID when it asks for that. */
2677         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2678                 prid = dp->i_d.di_projid;
2679         else
2680                 prid = (xfs_prid_t)dfltprid;
2681
2682         /*
2683          * Make sure that we have allocated dquot(s) on disk.
2684          */
2685         error = XFS_QM_DQVOPALLOC(mp, dp,
2686                         current_fsuid(credp), current_fsgid(credp), prid,
2687                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2688         if (error)
2689                 goto std_return;
2690
             /*
              * Try for a real block reservation first; on ENOSPC retry
              * with zero blocks and remember that in resblks.
              */
2691         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2692         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2693         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2694         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2695                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2696         if (error == ENOSPC) {
2697                 resblks = 0;
2698                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2699                                           XFS_TRANS_PERM_LOG_RES,
2700                                           XFS_MKDIR_LOG_COUNT);
2701         }
2702         if (error) {
                     /* No reservation was obtained: cancel with no flags. */
2703                 cancel_flags = 0;
2704                 goto error_return;
2705         }
2706
             /*
              * dp is locked here but not yet joined to the transaction, so
              * until the ijoin below the error path must unlock it itself.
              */
2707         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2708         unlock_dp_on_error = B_TRUE;
2709
2710         /*
2711          * Check for directory link count overflow.
2712          */
2713         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2714                 error = XFS_ERROR(EMLINK);
2715                 goto error_return;
2716         }
2717
2718         /*
2719          * Reserve disk quota and the inode.
2720          */
2721         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2722         if (error)
2723                 goto error_return;
2724
             /*
              * Without a block reservation, ask xfs_dir_canenter() first
              * and give up on its error before allocating anything.
              */
2725         if (resblks == 0 &&
2726             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2727                 goto error_return;
2728         /*
2729          * create the directory inode.
2730          */
2731         error = xfs_dir_ialloc(&tp, dp, mode, 2,
2732                         0, credp, prid, resblks > 0,
2733                 &cdp, NULL);
2734         if (error) {
                     /* ENOSPC cancels without XFS_TRANS_ABORT; others abort. */
2735                 if (error == ENOSPC)
2736                         goto error_return;
2737                 goto abort_return;
2738         }
2739         xfs_itrace_ref(cdp);
2740
2741         /*
2742          * Now we add the directory inode to the transaction.
2743          * We waited until now since xfs_dir_ialloc might start
2744          * a new transaction.  Had we joined the transaction
2745          * earlier, the locks might have gotten released. An error
2746          * from here on will result in the transaction cancel
2747          * unlocking dp so don't do it explicitly in the error path.
2748          */
2749         VN_HOLD(dir_vp);
2750         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2751         unlock_dp_on_error = B_FALSE;
2752
2753         XFS_BMAP_INIT(&free_list, &first_block);
2754
             /*
              * Add the entry for the new directory to the parent.  When a
              * reservation exists, the XFS_IALLOC_SPACE_RES() share of it
              * is subtracted from what is offered to the directory code.
              */
2755         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2756                                    &first_block, &free_list, resblks ?
2757                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2758         if (error) {
2759                 ASSERT(error != ENOSPC);
2760                 goto error1;
2761         }
2762         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2763
2764         /*
2765          * Bump the in memory version number of the parent directory
2766          * so that other processes accessing it will recognize that
2767          * the directory has changed.
2768          */
2769         dp->i_gen++;
2770
2771         error = xfs_dir_init(tp, cdp, dp);
2772         if (error)
2773                 goto error2;
2774
2775         cdp->i_gen = 1;
             /* The parent gains a link as part of creating the child. */
2776         error = xfs_bumplink(tp, dp);
2777         if (error)
2778                 goto error2;
2779
2780         cvp = XFS_ITOV(cdp);
2781
2782         created = B_TRUE;
2783
2784         *vpp = cvp;
2785         IHOLD(cdp);
2786
2787         /*
2788          * Attach the dquots to the new inode and modify the icount incore.
2789          */
2790         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2791
2792         /*
2793          * If this is a synchronous mount, make sure that the
2794          * mkdir transaction goes to disk before returning to
2795          * the user.
2796          */
2797         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2798                 xfs_trans_set_sync(tp);
2799         }
2800
2801         error = xfs_bmap_finish(&tp, &free_list, &committed);
2802         if (error) {
2803                 IRELE(cdp);
2804                 goto error2;
2805         }
2806
2807         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2808         XFS_QM_DQRELE(mp, udqp);
2809         XFS_QM_DQRELE(mp, gdqp);
2810         if (error) {
2811                 IRELE(cdp);
2812         }
2813
2814         /* Fall through to std_return with error = 0 or errno from
2815          * xfs_trans_commit. */
2816
2817 std_return:
             /*
              * NOTE(review): when xfs_bmap_finish() or xfs_trans_commit()
              * fails, cdp has already been IRELE()d but 'created' is still
              * B_TRUE, so XFS_ITOV(cdp) is passed below and *vpp stays set.
              * Confirm the inode cannot have been torn down by then.
              */
2818         if ((created || (error != 0 && dm_event_sent != 0)) &&
2819             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2820                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2821                                         dir_vp, DM_RIGHT_NULL,
2822                                         created ? XFS_ITOV(cdp):NULL,
2823                                         DM_RIGHT_NULL,
2824                                         dir_name, NULL,
2825                                         mode, error, 0);
2826         }
2827         return error;
2828
             /*
              * Error ladder: error1/error2 discard the free list, then
              * abort_return adds XFS_TRANS_ABORT, then error_return cancels
              * the transaction, releases the dquots and, if dp was locked
              * but never joined, unlocks it.
              */
2829  error2:
2830  error1:
2831         xfs_bmap_cancel(&free_list);
2832  abort_return:
2833         cancel_flags |= XFS_TRANS_ABORT;
2834  error_return:
2835         xfs_trans_cancel(tp, cancel_flags);
2836         XFS_QM_DQRELE(mp, udqp);
2837         XFS_QM_DQRELE(mp, gdqp);
2838
2839         if (unlock_dp_on_error)
2840                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2841
2842         goto std_return;
2843 }
2844
/*
 * xfs_rmdir - remove the directory named by dentry from directory dp.
 *
 * dp:      parent directory inode.
 * dentry:  supplies the entry name (VNAME/VNAMELEN) and the child
 *          directory inode (VNAME_TO_INODE), called cdp below.
 *
 * Returns 0 on success or an error code: EIO on forced shutdown,
 * ENOTEMPTY if the child's link count is not 2 or xfs_dir_isempty()
 * says it still has entries, or whatever the quota, transaction or
 * directory routines report.
 *
 * If DM_EVENT_REMOVE is enabled on dp the pre-event is sent first and
 * its failure ends the operation; every later exit goes through
 * std_return, which sends DM_EVENT_POSTREMOVE when enabled on dp.
 */
2845 int
2846 xfs_rmdir(
2847         xfs_inode_t             *dp,
2848         bhv_vname_t             *dentry)
2849 {
2850         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2851         char                    *name = VNAME(dentry);
2852         int                     namelen = VNAMELEN(dentry);
2853         xfs_mount_t             *mp = dp->i_mount;
2854         xfs_inode_t             *cdp = VNAME_TO_INODE(dentry);
2855         xfs_trans_t             *tp;
2856         int                     error;
2857         xfs_bmap_free_t         free_list;
2858         xfs_fsblock_t           first_block;
2859         int                     cancel_flags;
2860         int                     committed;
2862         uint                    resblks;
2863
2864         xfs_itrace_entry(dp);
2865
2866         if (XFS_FORCED_SHUTDOWN(mp))
2867                 return XFS_ERROR(EIO);
2868
2869         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2870                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
2871                                         dir_vp, DM_RIGHT_NULL,
2872                                         NULL, DM_RIGHT_NULL,
2873                                         name, NULL, cdp->i_d.di_mode, 0, 0);
2874                 if (error)
2875                         return XFS_ERROR(error);
2876         }
2877
2878         /*
2879          * We need to get a reference to cdp before we get our log
2880          * reservation.  The reason for this is that we cannot call
2881          * xfs_iget for an inode for which we do not have a reference
2882          * once we've acquired a log reservation.  This is because the
2883          * inode we are trying to get might be in xfs_inactive going
2884          * for a log reservation.  Since we'll have to wait for the
2885          * inactive code to complete before returning from xfs_iget,
2886          * we need to make sure that we don't have log space reserved
2887          * when we call xfs_iget.  Instead we get an unlocked reference
2888          * to the inode before getting our log reservation.
2889          */
2890         IHOLD(cdp);
2891
2892         /*
2893          * Get the dquots for the inodes.
2894          */
2895         error = XFS_QM_DQATTACH(mp, dp, 0);
2896         if (!error && dp != cdp)
2897                 error = XFS_QM_DQATTACH(mp, cdp, 0);
2898         if (error) {
2899                 IRELE(cdp);
2900                 REMOVE_DEBUG_TRACE(__LINE__);
2901                 goto std_return;
2902         }
2903
2904         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2905         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2906         /*
2907          * We try to get the real space reservation first,
2908          * allowing for directory btree deletion(s) implying
2909          * possible bmap insert(s).  If we can't get the space
2910          * reservation then we use 0 instead, and avoid the bmap
2911          * btree insert(s) in the directory code by, if the bmap
2912          * insert tries to happen, instead trimming the LAST
2913          * block from the directory.
2914          */
2915         resblks = XFS_REMOVE_SPACE_RES(mp);
2916         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2917                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2918         if (error == ENOSPC) {
2919                 resblks = 0;
2920                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2921                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2922         }
2923         if (error) {
2924                 ASSERT(error != ENOSPC);
                     /* No reservation was obtained: cancel with no flags. */
2925                 cancel_flags = 0;
2926                 IRELE(cdp);
2927                 goto error_return;
2928         }
2929         XFS_BMAP_INIT(&free_list, &first_block);
2930
2931         /*
2932          * Now lock the child directory inode and the parent directory
2933          * inode in the proper order.  This will take care of validating
2934          * that the directory entry for the child directory inode has
2935          * not changed while we were obtaining a log reservation.
2936          */
2937         error = xfs_lock_dir_and_entry(dp, cdp);
2938         if (error) {
2939                 xfs_trans_cancel(tp, cancel_flags);
2940                 IRELE(cdp);
2941                 goto std_return;
2942         }
2943
2944         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2945         if (dp != cdp) {
2946                 /*
2947                  * Only increment the parent directory vnode count if
2948                  * we didn't bump it in looking up cdp.  The only time
2949                  * we don't bump it is when we're looking up ".".
2950                  */
2951                 VN_HOLD(dir_vp);
2952         }
2953
2954         xfs_itrace_ref(cdp);
2955         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
2956
             /*
              * The child must be empty: exactly two links and no entries
              * according to xfs_dir_isempty().  Nothing has been modified
              * yet, so these exits cancel without XFS_TRANS_ABORT.
              */
2957         ASSERT(cdp->i_d.di_nlink >= 2);
2958         if (cdp->i_d.di_nlink != 2) {
2959                 error = XFS_ERROR(ENOTEMPTY);
2960                 goto error_return;
2961         }
2962         if (!xfs_dir_isempty(cdp)) {
2963                 error = XFS_ERROR(ENOTEMPTY);
2964                 goto error_return;
2965         }
2966
2967         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
2968                                         &first_block, &free_list, resblks);
2969         if (error)
2970                 goto error1;
2971
2972         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2973
2974         /*
2975          * Bump the in memory generation count on the parent
2976          * directory so that other can know that it has changed.
2977          */
2978         dp->i_gen++;
2979
2980         /*
2981          * Drop the link from cdp's "..".
2982          */
2983         error = xfs_droplink(tp, dp);
2984         if (error) {
2985                 goto error1;
2986         }
2987
2988         /*
2989          * Drop the link from dp to cdp.
2990          */
2991         error = xfs_droplink(tp, cdp);
2992         if (error) {
2993                 goto error1;
2994         }
2995
2996         /*
2997          * Drop the "." link from cdp to self.
2998          */
2999         error = xfs_droplink(tp, cdp);
3000         if (error) {
3001                 goto error1;
3002         }
3003
3006         /*
3007          * Take an extra ref on the child vnode so that it
3008          * does not go to xfs_inactive() from within the commit.
3009          */
3010         IHOLD(cdp);
3011
3012         /*
3013          * If this is a synchronous mount, make sure that the
3014          * rmdir transaction goes to disk before returning to
3015          * the user.
3016          */
3017         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3018                 xfs_trans_set_sync(tp);
3019         }
3020
3021         error = xfs_bmap_finish (&tp, &free_list, &committed);
3022         if (error) {
                     /*
                      * Abort first and release the extra reference only
                      * afterwards.
                      */
3023                 xfs_bmap_cancel(&free_list);
3024                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3025                                  XFS_TRANS_ABORT));
3026                 IRELE(cdp);
3027                 goto std_return;
3028         }
3029
3030         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3031         if (error) {
3032                 IRELE(cdp);
3033                 goto std_return;
3034         }
3035
3036
3037         IRELE(cdp);
3038
3039         /* Fall through to std_return with error = 0 or the errno
3040          * from xfs_trans_commit. */
3041  std_return:
3042         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
3043                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3044                                         dir_vp, DM_RIGHT_NULL,
3045                                         NULL, DM_RIGHT_NULL,
3046                                         name, NULL, cdp->i_d.di_mode,
3047                                         error, 0);
3048         }
3049         return error;
3050
3051  error1:
             /* Failure after modifications began: drop the free list, abort. */
3052         xfs_bmap_cancel(&free_list);
3053         cancel_flags |= XFS_TRANS_ABORT;
3054         /* FALLTHROUGH */
3055
3056  error_return:
3057         xfs_trans_cancel(tp, cancel_flags);
3058         goto std_return;
3059 }
3061
/*
 * xfs_symlink()
 *      Create a symbolic link named by 'dentry' in directory 'dp' whose
 *      contents are 'target_path'.
 *
 *      The target is stored inline in the new inode's data fork when it
 *      fits; otherwise blocks are mapped with xfs_bmapi() and the target
 *      is copied into the buffers backing those blocks.  The whole
 *      operation runs in a single XFS_TRANS_SYMLINK transaction.
 *
 *      DMAPI: a DM_EVENT_SYMLINK event is sent before any work is done
 *      and, once the quota allocation step has been reached, a
 *      DM_EVENT_POSTSYMLINK event is sent on every exit (carrying the
 *      error, if any).
 *
 * RETURNS:
 *       0 on success, with *vpp set to the vnode of the new symlink
 *      EIO if the filesystem has been shut down
 *      ENAMETOOLONG if the target is >= MAXPATHLEN or one of its
 *              components is >= MAXNAMELEN
 *      EPERM if the directory has XFS_DIFLAG_NOSYMLINKS set
 *      otherwise the errno reported by the failing helper
 *      On any error *vpp is left NULL.
 */
3062 int
3063 xfs_symlink(
3064         xfs_inode_t             *dp,
3065         bhv_vname_t             *dentry,
3066         char                    *target_path,
3067         mode_t                  mode,
3068         bhv_vnode_t             **vpp,
3069         cred_t                  *credp)
3070 {
3071         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
3072         xfs_mount_t             *mp = dp->i_mount;
3073         xfs_trans_t             *tp;
3074         xfs_inode_t             *ip;
3075         int                     error;
3076         int                     pathlen;
3077         xfs_bmap_free_t         free_list;
3078         xfs_fsblock_t           first_block;
3079         boolean_t               unlock_dp_on_error = B_FALSE;
3080         uint                    cancel_flags;
3081         int                     committed;
3082         xfs_fileoff_t           first_fsb;
3083         xfs_filblks_t           fs_blocks;
3084         int                     nmaps;
3085         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3086         xfs_daddr_t             d;
3087         char                    *cur_chunk;
3088         int                     byte_cnt;
3089         int                     n;
3090         xfs_buf_t               *bp;
3091         xfs_prid_t              prid;
3092         struct xfs_dquot        *udqp, *gdqp;
3093         uint                    resblks;
3094         char                    *link_name = VNAME(dentry);
3095         int                     link_namelen;
3096
3097         *vpp = NULL;
3098         error = 0;
3099         ip = NULL;
3100         tp = NULL;
3101
3102         xfs_itrace_entry(dp);
3103
3104         if (XFS_FORCED_SHUTDOWN(mp))
3105                 return XFS_ERROR(EIO);
3106
3107         link_namelen = VNAMELEN(dentry);
3108
3109         /*
3110          * Check component lengths of the target path name.
3111          */
3112         pathlen = strlen(target_path);
3113         if (pathlen >= MAXPATHLEN)      /* total string too long */
3114                 return XFS_ERROR(ENAMETOOLONG);
             /*
              * A path shorter than MAXNAMELEN cannot contain an over-long
              * component, so the per-component scan is only needed beyond
              * that length.
              */
3115         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3116                 int len, total;
3117                 char *path;
3118
3119                 for (total = 0, path = target_path; total < pathlen;) {
3120                         /*
3121                          * Skip any slashes.
3122                          */
3123                         while(*path == '/') {
3124                                 total++;
3125                                 path++;
3126                         }
3127
3128                         /*
3129                          * Count up to the next slash or end of path.
3130                          * Error out if the component is bigger than MAXNAMELEN.
3131                          */
3132                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3133                                 if (++len >= MAXNAMELEN) {
3134                                         error = ENAMETOOLONG;
3135                                         return error;
3136                                 }
3137                         }
3138                 }
3139         }
3140
             /*
              * Pre-operation DMAPI event; a non-zero result is handed back
              * to the caller unchanged and nothing else is done.
              */
3141         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
3142                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3143                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3144                                         link_name, target_path, 0, 0, 0);
3145                 if (error)
3146                         return error;
3147         }
3148
3149         /* Return through std_return after this point. */
3150
             /*
              * The project id is inherited from the directory when it has
              * XFS_DIFLAG_PROJINHERIT set, otherwise dfltprid is used.
              */
3151         udqp = gdqp = NULL;
3152         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3153                 prid = dp->i_d.di_projid;
3154         else
3155                 prid = (xfs_prid_t)dfltprid;
3156
3157         /*
3158          * Make sure that we have allocated dquot(s) on disk.
3159          */
3160         error = XFS_QM_DQVOPALLOC(mp, dp,
3161                         current_fsuid(credp), current_fsgid(credp), prid,
3162                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3163         if (error)
3164                 goto std_return;
3165
3166         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3167         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3168         /*
3169          * The symlink will fit into the inode data fork?
3170          * There can't be any attributes so we get the whole variable part.
3171          */
3172         if (pathlen <= XFS_LITINO(mp))
3173                 fs_blocks = 0;
3174         else
3175                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3176         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3177         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3178                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
             /*
              * An inline target needs no data blocks, so on ENOSPC retry
              * with a zero block reservation.  resblks == 0 is what later
              * triggers the xfs_dir_canenter() check.
              */
3179         if (error == ENOSPC && fs_blocks == 0) {
3180                 resblks = 0;
3181                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3182                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3183         }
             /*
              * The reservation failed, so the cancel is issued without
              * XFS_TRANS_RELEASE_LOG_RES.
              */
3184         if (error) {
3185                 cancel_flags = 0;
3186                 goto error_return;
3187         }
3188
             /*
              * From here until dp is joined to the transaction the error
              * path has to drop the directory lock itself.
              */
3189         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3190         unlock_dp_on_error = B_TRUE;
3191
3192         /*
3193          * Check whether the directory allows new symlinks or not.
3194          */
3195         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3196                 error = XFS_ERROR(EPERM);
3197                 goto error_return;
3198         }
3199
3200         /*
3201          * Reserve disk quota : blocks and inode.
3202          */
3203         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3204         if (error)
3205                 goto error_return;
3206
3207         /*
3208          * Check for ability to enter directory entry, if no space reserved.
3209          */
3210         if (resblks == 0 &&
3211             (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3212                 goto error_return;
3213         /*
3214          * Initialize the bmap freelist prior to calling either
3215          * bmapi or the directory create code.
3216          */
3217         XFS_BMAP_INIT(&free_list, &first_block);
3218
3219         /*
3220          * Allocate an inode for the symlink.
3221          */
3222         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
3223                                1, 0, credp, prid, resblks > 0, &ip, NULL);
             /*
              * ENOSPC takes the plain cancel path; any other failure also
              * cancels the bmap free list and aborts the transaction.
              */
3224         if (error) {
3225                 if (error == ENOSPC)
3226                         goto error_return;
3227                 goto error1;
3228         }
3229         xfs_itrace_ref(ip);
3230
3231         /*
3232          * An error after we've joined dp to the transaction will result in the
3233          * transaction cancel unlocking dp so don't do it explicitly in the
3234          * error path.
3235          */
3236         VN_HOLD(dir_vp);
3237         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3238         unlock_dp_on_error = B_FALSE;
3239
3240         /*
3241          * Also attach the dquot(s) to it, if applicable.
3242          */
3243         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3244
             /*
              * Take the inode allocation's share out of the remaining
              * block reservation.
              */
3245         if (resblks)
3246                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3247         /*
3248          * If the symlink will fit into the inode, write it inline.
3249          */
3250         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3251                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3252                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3253                 ip->i_d.di_size = pathlen;
3254
3255                 /*
3256                  * The inode was initially created in extent format.
3257                  */
3258                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3259                 ip->i_df.if_flags |= XFS_IFINLINE;
3260
3261                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3262                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3263
3264         } else {
                     /*
                      * Remote target: map fs_blocks blocks starting at file
                      * offset 0, using at most SYMLINK_MAPS extents.
                      */
3265                 first_fsb = 0;
3266                 nmaps = SYMLINK_MAPS;
3267
3268                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3269                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3270                                   &first_block, resblks, mval, &nmaps,
3271                                   &free_list, NULL);
3272                 if (error) {
3273                         goto error1;
3274                 }
3275
3276                 if (resblks)
3277                         resblks -= fs_blocks;
3278                 ip->i_d.di_size = pathlen;
3279                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3280
                     /*
                      * Copy the target into the buffer of each mapped
                      * extent in turn.  pathlen is consumed here and counts
                      * the bytes still to be written; di_size was already
                      * recorded above.
                      */
3281                 cur_chunk = target_path;
3282                 for (n = 0; n < nmaps; n++) {
3283                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3284                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3285                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3286                                                BTOBB(byte_cnt), 0);
3287                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3288                         if (pathlen < byte_cnt) {
3289                                 byte_cnt = pathlen;
3290                         }
3291                         pathlen -= byte_cnt;
3292
3293                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3294                         cur_chunk += byte_cnt;
3295
3296                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3297                 }
3298         }
3299
3300         /*
3301          * Create the directory entry for the symlink.
3302          */
3303         error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3304                                    &first_block, &free_list, resblks);
3305         if (error)
3306                 goto error1;
3307         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3308         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3309
3310         /*
3311          * Bump the in memory version number of the parent directory
3312          * so that other processes accessing it will recognize that
3313          * the directory has changed.
3314          */
3315         dp->i_gen++;
3316
3317         /*
3318          * If this is a synchronous mount, make sure that the
3319          * symlink transaction goes to disk before returning to
3320          * the user.
3321          */
3322         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3323                 xfs_trans_set_sync(tp);
3324         }
3325
3326         /*
3327          * xfs_trans_commit normally decrements the vnode ref count
3328          * when it unlocks the inode. Since we want to return the
3329          * vnode to the caller, we bump the vnode ref count now.
3330          */
3331         IHOLD(ip);
3332
3333         error = xfs_bmap_finish(&tp, &free_list, &committed);
3334         if (error) {
3335                 goto error2;
3336         }
3337         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3338         XFS_QM_DQRELE(mp, udqp);
3339         XFS_QM_DQRELE(mp, gdqp);
3340
3341         /* Fall through to std_return with error = 0 or errno from
3342          * xfs_trans_commit     */
3343 std_return:
             /*
              * Common exit: send the post-operation DMAPI event (the new
              * vnode is only passed when there was no error) and hand the
              * vnode back through *vpp on success.
              */
3344         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
3345                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3346                                         dir_vp, DM_RIGHT_NULL,
3347                                         error ? NULL : XFS_ITOV(ip),
3348                                         DM_RIGHT_NULL, link_name, target_path,
3349                                         0, error, 0);
3350         }
3351
3352         if (!error) {
3353                 bhv_vnode_t *vp;
3354
3355                 ASSERT(ip);
3356                 vp = XFS_ITOV(ip);
3357                 *vpp = vp;
3358         }
3359         return error;
3360
             /*
              * Error unwinding, entered at the label matching how far the
              * operation got:
              *   error2       - xfs_bmap_finish() failed after IHOLD(ip);
              *                  drop that extra reference.
              *   error1       - cancel the bmap free list and mark the
              *                  transaction cancel as an abort.
              *   error_return - cancel the transaction, release the dquots
              *                  and unlock dp if it was locked but not yet
              *                  joined to the transaction.
              * All of them finish through std_return.
              */
3361  error2:
3362         IRELE(ip);
3363  error1:
3364         xfs_bmap_cancel(&free_list);
3365         cancel_flags |= XFS_TRANS_ABORT;
3366  error_return:
3367         xfs_trans_cancel(tp, cancel_flags);
3368         XFS_QM_DQRELE(mp, udqp);
3369         XFS_QM_DQRELE(mp, gdqp);
3370
3371         if (unlock_dp_on_error)
3372                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3373
3374         goto std_return;
3375 }
3376
3377 int
3378 xfs_rwlock(
3379         xfs_inode_t     *ip,
3380         bhv_vrwlock_t   locktype)
3381 {
3382         if (S_ISDIR(ip->i_d.di_mode))
3383                 return 1;
3384         if (locktype == VRWLOCK_WRITE) {
3385                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3386         } else if (locktype == VRWLOCK_TRY_READ) {
3387                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3388         } else if (locktype == VRWLOCK_TRY_WRITE) {
3389                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3390         } else {
3391                 ASSERT((locktype == VRWLOCK_READ) ||
3392                        (locktype == VRWLOCK_WRITE_DIRECT));
3393                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3394         }
3395
3396         return 1;
3397 }
3398
3399
3400 void
3401 xfs_rwunlock(
3402         xfs_inode_t     *ip,
3403         bhv_vrwlock_t   locktype)
3404 {
3405         if (S_ISDIR(ip->i_d.di_mode))
3406                 return;
3407         if (locktype == VRWLOCK_WRITE) {
3408                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3409         } else {
3410                 ASSERT((locktype == VRWLOCK_READ) ||
3411                        (locktype == VRWLOCK_WRITE_DIRECT));
3412                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3413         }
3414         return;
3415 }
3416
3417
3418 int
3419 xfs_inode_flush(
3420         xfs_inode_t     *ip,
3421         int             flags)
3422 {
3423         xfs_mount_t     *mp = ip->i_mount;
3424         int             error = 0;
3425
3426         if (XFS_FORCED_SHUTDOWN(mp))
3427                 return XFS_ERROR(EIO);
3428
3429         /*
3430          * Bypass inodes which have already been cleaned by
3431          * the inode flush clustering code inside xfs_iflush
3432          */
3433         if (xfs_inode_clean(ip))
3434                 return 0;
3435
3436         /*
3437          * We make this non-blocking if the inode is contended,
3438          * return EAGAIN to indicate to the caller that they
3439          * did not succeed. This prevents the flush path from
3440          * blocking on inodes inside another operation right
3441          * now, they get caught later by xfs_sync.
3442          */
3443         if (flags & FLUSH_SYNC) {
3444                 xfs_ilock(ip, XFS_ILOCK_SHARED);
3445                 xfs_iflock(ip);
3446         } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3447                 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3448                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3449                         return EAGAIN;
3450                 }
3451         } else {
3452                 return EAGAIN;
3453         }
3454
3455         error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
3456                                                     : XFS_IFLUSH_ASYNC_NOBLOCK);
3457         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3458
3459         return error;
3460 }
3461
3462
3463 int
3464 xfs_set_dmattrs(
3465         xfs_inode_t     *ip,
3466         u_int           evmask,
3467         u_int16_t       state)
3468 {
3469         xfs_mount_t     *mp = ip->i_mount;
3470         xfs_trans_t     *tp;
3471         int             error;
3472
3473         if (!capable(CAP_SYS_ADMIN))
3474                 return XFS_ERROR(EPERM);
3475
3476         if (XFS_FORCED_SHUTDOWN(mp))
3477                 return XFS_ERROR(EIO);
3478
3479         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3480         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3481         if (error) {
3482                 xfs_trans_cancel(tp, 0);
3483                 return error;
3484         }
3485         xfs_ilock(ip, XFS_ILOCK_EXCL);
3486         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3487
3488         ip->i_d.di_dmevmask = evmask;
3489         ip->i_d.di_dmstate  = state;
3490
3491         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3492         IHOLD(ip);
3493         error = xfs_trans_commit(tp, 0);
3494
3495         return error;
3496 }
3497
3498 int
3499 xfs_reclaim(
3500         xfs_inode_t     *ip)
3501 {
3502         bhv_vnode_t     *vp = XFS_ITOV(ip);
3503
3504         xfs_itrace_entry(ip);
3505
3506         ASSERT(!VN_MAPPED(vp));
3507
3508         /* bad inode, get out here ASAP */
3509         if (VN_BAD(vp)) {
3510                 xfs_ireclaim(ip);
3511                 return 0;
3512         }
3513
3514         vn_iowait(ip);
3515
3516         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3517
3518         /*
3519          * Make sure the atime in the XFS inode is correct before freeing the
3520          * Linux inode.
3521          */
3522         xfs_synchronize_atime(ip);
3523
3524         /*
3525          * If we have nothing to flush with this inode then complete the
3526          * teardown now, otherwise break the link between the xfs inode and the
3527          * linux inode and clean up the xfs inode later. This avoids flushing
3528          * the inode to disk during the delete operation itself.
3529          *
3530          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3531          * first to ensure that xfs_iunpin() will never see an xfs inode
3532          * that has a linux inode being reclaimed. Synchronisation is provided
3533          * by the i_flags_lock.
3534          */
3535         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3536                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3537                 xfs_iflock(ip);
3538                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3539         } else {
3540                 xfs_mount_t     *mp = ip->i_mount;
3541
3542                 /* Protect sync and unpin from us */
3543                 XFS_MOUNT_ILOCK(mp);
3544                 spin_lock(&ip->i_flags_lock);
3545                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3546                 vn_to_inode(vp)->i_private = NULL;
3547                 ip->i_vnode = NULL;
3548                 spin_unlock(&ip->i_flags_lock);
3549                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3550                 XFS_MOUNT_IUNLOCK(mp);
3551         }
3552         return 0;
3553 }
3554
3555 int
3556 xfs_finish_reclaim(
3557         xfs_inode_t     *ip,
3558         int             locked,
3559         int             sync_mode)
3560 {
3561         xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
3562         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3563         int             error;
3564
3565         if (vp && VN_BAD(vp))
3566                 goto reclaim;
3567
3568         /* The hash lock here protects a thread in xfs_iget_core from
3569          * racing with us on linking the inode back with a vnode.
3570          * Once we have the XFS_IRECLAIM flag set it will not touch
3571          * us.
3572          */
3573         write_lock(&pag->pag_ici_lock);
3574         spin_lock(&ip->i_flags_lock);
3575         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3576             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3577                 spin_unlock(&ip->i_flags_lock);
3578                 write_unlock(&pag->pag_ici_lock);
3579                 if (locked) {
3580                         xfs_ifunlock(ip);
3581                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3582                 }
3583                 return 1;
3584         }
3585         __xfs_iflags_set(ip, XFS_IRECLAIM);
3586         spin_unlock(&ip->i_flags_lock);
3587         write_unlock(&pag->pag_ici_lock);
3588         xfs_put_perag(ip->i_mount, pag);
3589
3590         /*
3591          * If the inode is still dirty, then flush it out.  If the inode
3592          * is not in the AIL, then it will be OK to flush it delwri as
3593          * long as xfs_iflush() does not keep any references to the inode.
3594          * We leave that decision up to xfs_iflush() since it has the
3595          * knowledge of whether it's OK to simply do a delwri flush of
3596          * the inode or whether we need to wait until the inode is
3597          * pulled from the AIL.
3598          * We get the flush lock regardless, though, just to make sure
3599          * we don't free it while it is being flushed.
3600          */
3601         if (!locked) {
3602                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3603                 xfs_iflock(ip);
3604         }
3605
3606         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3607                 if (ip->i_update_core ||
3608                     ((ip->i_itemp != NULL) &&
3609                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3610                         error = xfs_iflush(ip, sync_mode);
3611                         /*
3612                          * If we hit an error, typically because of filesystem
3613                          * shutdown, we don't need to let vn_reclaim to know
3614                          * because we're gonna reclaim the inode anyway.
3615                          */
3616                         if (error) {
3617                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3618                                 goto reclaim;
3619                         }
3620                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3621                 }
3622
3623                 ASSERT(ip->i_update_core == 0);
3624                 ASSERT(ip->i_itemp == NULL ||
3625                        ip->i_itemp->ili_format.ilf_fields == 0);
3626         }
3627
3628         xfs_ifunlock(ip);
3629         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3630
3631  reclaim:
3632         xfs_ireclaim(ip);
3633         return 0;
3634 }
3635
3636 int
3637 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3638 {
3639         int             purged;
3640         xfs_inode_t     *ip, *n;
3641         int             done = 0;
3642
3643         while (!done) {
3644                 purged = 0;
3645                 XFS_MOUNT_ILOCK(mp);
3646                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3647                         if (noblock) {
3648                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3649                                         continue;
3650                                 if (xfs_ipincount(ip) ||
3651                                     !xfs_iflock_nowait(ip)) {
3652                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3653                                         continue;
3654                                 }
3655                         }
3656                         XFS_MOUNT_IUNLOCK(mp);
3657                         if (xfs_finish_reclaim(ip, noblock,
3658                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3659                                 delay(1);
3660                         purged = 1;
3661                         break;
3662                 }
3663
3664                 done = !purged;
3665         }
3666
3667         XFS_MOUNT_IUNLOCK(mp);
3668         return 0;
3669 }
3670
3671 /*
3672  * xfs_alloc_file_space()
3673  *      This routine allocates disk space for the given file.
3674  *
3675  *      If alloc_type == 0, this request is for an ALLOCSP type
3676  *      request which will change the file size.  In this case, no
3677  *      DMAPI event will be generated by the call.  A TRUNCATE event
3678  *      will be generated later by xfs_setattr.
3679  *
3680  *      If alloc_type != 0, this request is for a RESVSP type
3681  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3682  *      lower block boundary byte address is less than the file's
3683  *      length.
3684  *
3685  * RETURNS:
3686  *       0 on success
3687  *      errno on error
3688  *
3689  */
3690 STATIC int
3691 xfs_alloc_file_space(
3692         xfs_inode_t             *ip,
3693         xfs_off_t               offset,
3694         xfs_off_t               len,
3695         int                     alloc_type,
3696         int                     attr_flags)
3697 {
3698         xfs_mount_t             *mp = ip->i_mount;
3699         xfs_off_t               count;
3700         xfs_filblks_t           allocated_fsb;
3701         xfs_filblks_t           allocatesize_fsb;
3702         xfs_extlen_t            extsz, temp;
3703         xfs_fileoff_t           startoffset_fsb;
3704         xfs_fsblock_t           firstfsb;
3705         int                     nimaps;
3706         int                     bmapi_flag;
3707         int                     quota_flag;
3708         int                     rt;
3709         xfs_trans_t             *tp;
3710         xfs_bmbt_irec_t         imaps[1], *imapp;
3711         xfs_bmap_free_t         free_list;
3712         uint                    qblocks, resblks, resrtextents;
3713         int                     committed;
3714         int                     error;
3715
3716         xfs_itrace_entry(ip);
3717
3718         if (XFS_FORCED_SHUTDOWN(mp))
3719                 return XFS_ERROR(EIO);
3720
3721         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3722                 return error;
3723
3724         if (len <= 0)
3725                 return XFS_ERROR(EINVAL);
3726
3727         rt = XFS_IS_REALTIME_INODE(ip);
3728         extsz = xfs_get_extsz_hint(ip);
3729
3730         count = len;
3731         imapp = &imaps[0];
3732         nimaps = 1;
3733         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
3734         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
3735         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
3736
3737         /*      Generate a DMAPI event if needed.       */
3738         if (alloc_type != 0 && offset < ip->i_size &&
3739                         (attr_flags&ATTR_DMI) == 0  &&
3740                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3741                 xfs_off_t           end_dmi_offset;
3742
3743                 end_dmi_offset = offset+len;
3744                 if (end_dmi_offset > ip->i_size)
3745                         end_dmi_offset = ip->i_size;
3746                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
3747                         offset, end_dmi_offset - offset,
3748                         0, NULL);
3749                 if (error)
3750                         return error;
3751         }
3752
3753         /*
3754          * Allocate file space until done or until there is an error
3755          */
3756 retry:
3757         while (allocatesize_fsb && !error) {
3758                 xfs_fileoff_t   s, e;
3759
3760                 /*
3761                  * Determine space reservations for data/realtime.
3762                  */
3763                 if (unlikely(extsz)) {
3764                         s = startoffset_fsb;
3765                         do_div(s, extsz);
3766                         s *= extsz;
3767                         e = startoffset_fsb + allocatesize_fsb;
3768                         if ((temp = do_mod(startoffset_fsb, extsz)))
3769                                 e += temp;
3770                         if ((temp = do_mod(e, extsz)))
3771                                 e += extsz - temp;
3772                 } else {
3773                         s = 0;
3774                         e = allocatesize_fsb;
3775                 }
3776
3777                 if (unlikely(rt)) {
3778                         resrtextents = qblocks = (uint)(e - s);
3779                         resrtextents /= mp->m_sb.sb_rextsize;
3780                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3781                         quota_flag = XFS_QMOPT_RES_RTBLKS;
3782                 } else {
3783                         resrtextents = 0;
3784                         resblks = qblocks = \
3785                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
3786                         quota_flag = XFS_QMOPT_RES_REGBLKS;
3787                 }
3788
3789                 /*
3790                  * Allocate and setup the transaction.
3791                  */
3792                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3793                 error = xfs_trans_reserve(tp, resblks,
3794                                           XFS_WRITE_LOG_RES(mp), resrtextents,
3795                                           XFS_TRANS_PERM_LOG_RES,
3796                                           XFS_WRITE_LOG_COUNT);
3797                 /*
3798                  * Check for running out of space
3799                  */
3800                 if (error) {
3801                         /*
3802                          * Free the transaction structure.
3803                          */
3804                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3805                         xfs_trans_cancel(tp, 0);
3806                         break;
3807                 }
3808                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3809                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
3810                                                       qblocks, 0, quota_flag);
3811                 if (error)
3812                         goto error1;
3813
3814                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3815                 xfs_trans_ihold(tp, ip);
3816
3817                 /*
3818                  * Issue the xfs_bmapi() call to allocate the blocks
3819                  */
3820                 XFS_BMAP_INIT(&free_list, &firstfsb);
3821                 error = xfs_bmapi(tp, ip, startoffset_fsb,
3822                                   allocatesize_fsb, bmapi_flag,
3823                                   &firstfsb, 0, imapp, &nimaps,
3824                                   &free_list, NULL);
3825                 if (error) {
3826                         goto error0;
3827                 }
3828
3829                 /*
3830                  * Complete the transaction
3831                  */
3832                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3833                 if (error) {
3834                         goto error0;
3835                 }
3836
3837                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3838                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3839                 if (error) {
3840                         break;
3841                 }
3842
3843                 allocated_fsb = imapp->br_blockcount;
3844
3845                 if (nimaps == 0) {
3846                         error = XFS_ERROR(ENOSPC);
3847                         break;
3848                 }
3849
3850                 startoffset_fsb += allocated_fsb;
3851                 allocatesize_fsb -= allocated_fsb;
3852         }
3853 dmapi_enospc_check:
3854         if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
3855             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
3856                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
3857                                 XFS_ITOV(ip), DM_RIGHT_NULL,
3858                                 XFS_ITOV(ip), DM_RIGHT_NULL,
3859                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
3860                 if (error == 0)
3861                         goto retry;     /* Maybe DMAPI app. has made space */
3862                 /* else fall through with error from XFS_SEND_DATA */
3863         }
3864
3865         return error;
3866
3867 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
3868         xfs_bmap_cancel(&free_list);
3869         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
3870
3871 error1: /* Just cancel transaction */
3872         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3873         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3874         goto dmapi_enospc_check;
3875 }
3876
3877 /*
3878  * Zero file bytes between startoff and endoff inclusive.
3879  * The iolock is held exclusive and no blocks are buffered.
3880  */
3881 STATIC int
3882 xfs_zero_remaining_bytes(
3883         xfs_inode_t             *ip,
3884         xfs_off_t               startoff,
3885         xfs_off_t               endoff)
3886 {
3887         xfs_bmbt_irec_t         imap;
3888         xfs_fileoff_t           offset_fsb;
3889         xfs_off_t               lastoffset;
3890         xfs_off_t               offset;
3891         xfs_buf_t               *bp;
3892         xfs_mount_t             *mp = ip->i_mount;
3893         int                     nimap;
3894         int                     error = 0;
3895
3896         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3897                                 XFS_IS_REALTIME_INODE(ip) ?
3898                                 mp->m_rtdev_targp : mp->m_ddev_targp);
3899
3900         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3901                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
3902                 nimap = 1;
3903                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
3904                         NULL, 0, &imap, &nimap, NULL, NULL);
3905                 if (error || nimap < 1)
3906                         break;
3907                 ASSERT(imap.br_blockcount >= 1);
3908                 ASSERT(imap.br_startoff == offset_fsb);
3909                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
3910                 if (lastoffset > endoff)
3911                         lastoffset = endoff;
3912                 if (imap.br_startblock == HOLESTARTBLOCK)
3913                         continue;
3914                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3915                 if (imap.br_state == XFS_EXT_UNWRITTEN)
3916                         continue;
3917                 XFS_BUF_UNDONE(bp);
3918                 XFS_BUF_UNWRITE(bp);
3919                 XFS_BUF_READ(bp);
3920                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
3921                 xfsbdstrat(mp, bp);
3922                 if ((error = xfs_iowait(bp))) {
3923                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
3924                                           mp, bp, XFS_BUF_ADDR(bp));
3925                         break;
3926                 }
3927                 memset(XFS_BUF_PTR(bp) +
3928                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
3929                       0, lastoffset - offset + 1);
3930                 XFS_BUF_UNDONE(bp);
3931                 XFS_BUF_UNREAD(bp);
3932                 XFS_BUF_WRITE(bp);
3933                 xfsbdstrat(mp, bp);
3934                 if ((error = xfs_iowait(bp))) {
3935                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
3936                                           mp, bp, XFS_BUF_ADDR(bp));
3937                         break;
3938                 }
3939         }
3940         xfs_buf_free(bp);
3941         return error;
3942 }
3943
3944 /*
3945  * xfs_free_file_space()
3946  *      This routine frees disk space for the given file.
3947  *
3948  *      This routine is only called by xfs_change_file_space
3949  *      for an UNRESVSP type call.
3950  *
3951  * RETURNS:
3952  *       0 on success
3953  *      errno on error
3954  *
3955  */
3956 STATIC int
3957 xfs_free_file_space(
3958         xfs_inode_t             *ip,
3959         xfs_off_t               offset,
3960         xfs_off_t               len,
3961         int                     attr_flags)
3962 {
3963         bhv_vnode_t             *vp;
3964         int                     committed;
3965         int                     done;
3966         xfs_off_t               end_dmi_offset;
3967         xfs_fileoff_t           endoffset_fsb;
3968         int                     error;
3969         xfs_fsblock_t           firstfsb;
3970         xfs_bmap_free_t         free_list;
3971         xfs_bmbt_irec_t         imap;
3972         xfs_off_t               ioffset;
3973         xfs_extlen_t            mod=0;
3974         xfs_mount_t             *mp;
3975         int                     nimap;
3976         uint                    resblks;
3977         uint                    rounding;
3978         int                     rt;
3979         xfs_fileoff_t           startoffset_fsb;
3980         xfs_trans_t             *tp;
3981         int                     need_iolock = 1;
3982
3983         vp = XFS_ITOV(ip);
3984         mp = ip->i_mount;
3985
3986         xfs_itrace_entry(ip);
3987
3988         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3989                 return error;
3990
3991         error = 0;
3992         if (len <= 0)   /* if nothing being freed */
3993                 return error;
3994         rt = XFS_IS_REALTIME_INODE(ip);
3995         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
3996         end_dmi_offset = offset + len;
3997         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
3998
3999         if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
4000             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4001                 if (end_dmi_offset > ip->i_size)
4002                         end_dmi_offset = ip->i_size;
4003                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4004                                 offset, end_dmi_offset - offset,
4005                                 AT_DELAY_FLAG(attr_flags), NULL);
4006                 if (error)
4007                         return error;
4008         }
4009
4010         if (attr_flags & ATTR_NOLOCK)
4011                 need_iolock = 0;
4012         if (need_iolock) {
4013                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4014                 vn_iowait(ip);  /* wait for the completion of any pending DIOs */
4015         }
4016
4017         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
4018         ioffset = offset & ~(rounding - 1);
4019
4020         if (VN_CACHED(vp) != 0) {
4021                 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
4022                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
4023                 if (error)
4024                         goto out_unlock_iolock;
4025         }
4026
4027         /*
4028          * Need to zero the stuff we're not freeing, on disk.
4029          * If its a realtime file & can't use unwritten extents then we
4030          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4031          * will take care of it for us.
4032          */
4033         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
4034                 nimap = 1;
4035                 error = xfs_bmapi(NULL, ip, startoffset_fsb,
4036                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4037                 if (error)
4038                         goto out_unlock_iolock;
4039                 ASSERT(nimap == 0 || nimap == 1);
4040                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4041                         xfs_daddr_t     block;
4042
4043                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4044                         block = imap.br_startblock;
4045                         mod = do_div(block, mp->m_sb.sb_rextsize);
4046                         if (mod)
4047                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4048                 }
4049                 nimap = 1;
4050                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
4051                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4052                 if (error)
4053                         goto out_unlock_iolock;
4054                 ASSERT(nimap == 0 || nimap == 1);
4055                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4056                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4057                         mod++;
4058                         if (mod && (mod != mp->m_sb.sb_rextsize))
4059                                 endoffset_fsb -= mod;
4060                 }
4061         }
4062         if ((done = (endoffset_fsb <= startoffset_fsb)))
4063                 /*
4064                  * One contiguous piece to clear
4065                  */
4066                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4067         else {
4068                 /*
4069                  * Some full blocks, possibly two pieces to clear
4070                  */
4071                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4072                         error = xfs_zero_remaining_bytes(ip, offset,
4073                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4074                 if (!error &&
4075                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4076                         error = xfs_zero_remaining_bytes(ip,
4077                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4078                                 offset + len - 1);
4079         }
4080
4081         /*
4082          * free file space until done or until there is an error
4083          */
4084         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4085         while (!error && !done) {
4086
4087                 /*
4088                  * allocate and setup the transaction. Allow this
4089                  * transaction to dip into the reserve blocks to ensure
4090                  * the freeing of the space succeeds at ENOSPC.
4091                  */
4092                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4093                 tp->t_flags |= XFS_TRANS_RESERVE;
4094                 error = xfs_trans_reserve(tp,
4095                                           resblks,
4096                                           XFS_WRITE_LOG_RES(mp),
4097                                           0,
4098                                           XFS_TRANS_PERM_LOG_RES,
4099                                           XFS_WRITE_LOG_COUNT);
4100
4101                 /*
4102                  * check for running out of space
4103                  */
4104                 if (error) {
4105                         /*
4106                          * Free the transaction structure.
4107                          */
4108                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4109                         xfs_trans_cancel(tp, 0);
4110                         break;
4111                 }
4112                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4113                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4114                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4115                                 XFS_QMOPT_RES_REGBLKS);
4116                 if (error)
4117                         goto error1;
4118
4119                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4120                 xfs_trans_ihold(tp, ip);
4121
4122                 /*
4123                  * issue the bunmapi() call to free the blocks
4124                  */
4125                 XFS_BMAP_INIT(&free_list, &firstfsb);
4126                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
4127                                   endoffset_fsb - startoffset_fsb,
4128                                   0, 2, &firstfsb, &free_list, NULL, &done);
4129                 if (error) {
4130                         goto error0;
4131                 }
4132
4133                 /*
4134                  * complete the transaction
4135                  */
4136                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4137                 if (error) {
4138                         goto error0;
4139                 }
4140
4141                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4142                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4143         }
4144
4145  out_unlock_iolock:
4146         if (need_iolock)
4147                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4148         return error;
4149
4150  error0:
4151         xfs_bmap_cancel(&free_list);
4152  error1:
4153         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4154         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4155                     XFS_ILOCK_EXCL);
4156         return error;
4157 }
4158
4159 /*
4160  * xfs_change_file_space()
4161  *      This routine allocates or frees disk space for the given file.
4162  *      The user specified parameters are checked for alignment and size
4163  *      limitations.
4164  *
4165  * RETURNS:
4166  *       0 on success
4167  *      errno on error
4168  *
4169  */
4170 int
4171 xfs_change_file_space(
4172         xfs_inode_t     *ip,
4173         int             cmd,
4174         xfs_flock64_t   *bf,
4175         xfs_off_t       offset,
4176         cred_t          *credp,
4177         int             attr_flags)
4178 {
4179         xfs_mount_t     *mp = ip->i_mount;
4180         int             clrprealloc;
4181         int             error;
4182         xfs_fsize_t     fsize;
4183         int             setprealloc;
4184         xfs_off_t       startoffset;
4185         xfs_off_t       llen;
4186         xfs_trans_t     *tp;
4187         bhv_vattr_t     va;
4188
4189         xfs_itrace_entry(ip);
4190
4191         if (!S_ISREG(ip->i_d.di_mode))
4192                 return XFS_ERROR(EINVAL);
4193
4194         switch (bf->l_whence) {
4195         case 0: /*SEEK_SET*/
4196                 break;
4197         case 1: /*SEEK_CUR*/
4198                 bf->l_start += offset;
4199                 break;
4200         case 2: /*SEEK_END*/
4201                 bf->l_start += ip->i_size;
4202                 break;
4203         default:
4204                 return XFS_ERROR(EINVAL);
4205         }
4206
4207         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4208
4209         if (   (bf->l_start < 0)
4210             || (bf->l_start > XFS_MAXIOFFSET(mp))
4211             || (bf->l_start + llen < 0)
4212             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4213                 return XFS_ERROR(EINVAL);
4214
4215         bf->l_whence = 0;
4216
4217         startoffset = bf->l_start;
4218         fsize = ip->i_size;
4219
4220         /*
4221          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4222          * file space.
4223          * These calls do NOT zero the data space allocated to the file,
4224          * nor do they change the file size.
4225          *
4226          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4227          * space.
4228          * These calls cause the new file data to be zeroed and the file
4229          * size to be changed.
4230          */
4231         setprealloc = clrprealloc = 0;
4232
4233         switch (cmd) {
4234         case XFS_IOC_RESVSP:
4235         case XFS_IOC_RESVSP64:
4236                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4237                                                                 1, attr_flags);
4238                 if (error)
4239                         return error;
4240                 setprealloc = 1;
4241                 break;
4242
4243         case XFS_IOC_UNRESVSP:
4244         case XFS_IOC_UNRESVSP64:
4245                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4246                                                                 attr_flags)))
4247                         return error;
4248                 break;
4249
4250         case XFS_IOC_ALLOCSP:
4251         case XFS_IOC_ALLOCSP64:
4252         case XFS_IOC_FREESP:
4253         case XFS_IOC_FREESP64:
4254                 if (startoffset > fsize) {
4255                         error = xfs_alloc_file_space(ip, fsize,
4256                                         startoffset - fsize, 0, attr_flags);
4257                         if (error)
4258                                 break;
4259                 }
4260
4261                 va.va_mask = XFS_AT_SIZE;
4262                 va.va_size = startoffset;
4263
4264                 error = xfs_setattr(ip, &va, attr_flags, credp);
4265
4266                 if (error)
4267                         return error;
4268
4269                 clrprealloc = 1;
4270                 break;
4271
4272         default:
4273                 ASSERT(0);
4274                 return XFS_ERROR(EINVAL);
4275         }
4276
4277         /*
4278          * update the inode timestamp, mode, and prealloc flag bits
4279          */
4280         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4281
4282         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4283                                       0, 0, 0))) {
4284                 /* ASSERT(0); */
4285                 xfs_trans_cancel(tp, 0);
4286                 return error;
4287         }
4288
4289         xfs_ilock(ip, XFS_ILOCK_EXCL);
4290
4291         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4292         xfs_trans_ihold(tp, ip);
4293
4294         if ((attr_flags & ATTR_DMI) == 0) {
4295                 ip->i_d.di_mode &= ~S_ISUID;
4296
4297                 /*
4298                  * Note that we don't have to worry about mandatory
4299                  * file locking being disabled here because we only
4300                  * clear the S_ISGID bit if the Group execute bit is
4301                  * on, but if it was on then mandatory locking wouldn't
4302                  * have been enabled.
4303                  */
4304                 if (ip->i_d.di_mode & S_IXGRP)
4305                         ip->i_d.di_mode &= ~S_ISGID;
4306
4307                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4308         }
4309         if (setprealloc)
4310                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4311         else if (clrprealloc)
4312                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4313
4314         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4315         xfs_trans_set_sync(tp);
4316
4317         error = xfs_trans_commit(tp, 0);
4318
4319         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4320
4321         return error;
4322 }