fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_dmapi.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_bmap_btree.h"
  33 #include "xfs_alloc_btree.h"
  34 #include "xfs_ialloc_btree.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_attr_sf.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_inode.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_itable.h"
  41 #include "xfs_btree.h"
  42 #include "xfs_ialloc.h"
  43 #include "xfs_alloc.h"
  44 #include "xfs_bmap.h"
  45 #include "xfs_attr.h"
  46 #include "xfs_rw.h"
  47 #include "xfs_error.h"
  48 #include "xfs_quota.h"
  49 #include "xfs_utils.h"
  50 #include "xfs_rtalloc.h"
  51 #include "xfs_trans_space.h"
  52 #include "xfs_log_priv.h"
  53 #include "xfs_filestream.h"
  54 #include "xfs_vnodeops.h"
  55
  56 int
  57 xfs_open(
  58         xfs_inode_t     *ip)
  59 {
  60         int             mode;
  61
  62         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  63                 return XFS_ERROR(EIO);
  64
  65         /*
  66          * If it's a directory with any blocks, read-ahead block 0
  67          * as we're almost certain to have the next operation be a read there.
  68          */
  69         if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
  70                 mode = xfs_ilock_map_shared(ip);
  71                 if (ip->i_d.di_nextents > 0)
  72                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  73                 xfs_iunlock(ip, mode);
  74         }
  75         return 0;
  76 }
  77
  78 /*
  79  * xfs_setattr
  80  */
  81 int
  82 xfs_setattr(
  83         xfs_inode_t             *ip,
  84         bhv_vattr_t             *vap,
  85         int                     flags,
  86         cred_t                  *credp)
  87 {
  88         xfs_mount_t             *mp = ip->i_mount;
  89         xfs_trans_t             *tp;
  90         int                     mask;
  91         int                     code;
  92         uint                    lock_flags;
  93         uint                    commit_flags=0;
  94         uid_t                   uid=0, iuid=0;
  95         gid_t                   gid=0, igid=0;
  96         int                     timeflags = 0;
  97         xfs_prid_t              projid=0, iprojid=0;
  98         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
  99         int                     file_owner;
 100         int                     need_iolock = 1;
 101
 102         xfs_itrace_entry(ip);
 103
 104         if (mp->m_flags & XFS_MOUNT_RDONLY)
 105                 return XFS_ERROR(EROFS);
 106
 107         /*
 108          * Cannot set certain attributes.
 109          */
 110         mask = vap->va_mask;
 111         if (mask & XFS_AT_NOSET) {
 112                 return XFS_ERROR(EINVAL);
 113         }
 114
 115         if (XFS_FORCED_SHUTDOWN(mp))
 116                 return XFS_ERROR(EIO);
 117
 118         /*
 119          * Timestamps do not need to be logged and hence do not
 120          * need to be done within a transaction.
 121          */
 122         if (mask & XFS_AT_UPDTIMES) {
 123                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 124                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 125                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 126                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 127                 xfs_ichgtime(ip, timeflags);
 128                 return 0;
 129         }
 130
 131         olddquot1 = olddquot2 = NULL;
 132         udqp = gdqp = NULL;
 133
 134         /*
 135          * If disk quotas is on, we make sure that the dquots do exist on disk,
 136          * before we start any other transactions. Trying to do this later
 137          * is messy. We don't care to take a readlock to look at the ids
 138          * in inode here, because we can't hold it across the trans_reserve.
 139          * If the IDs do change before we take the ilock, we're covered
 140          * because the i_*dquot fields will get updated anyway.
 141          */
 142         if (XFS_IS_QUOTA_ON(mp) &&
 143             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 144                 uint    qflags = 0;
 145
 146                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 147                         uid = vap->va_uid;
 148                         qflags |= XFS_QMOPT_UQUOTA;
 149                 } else {
 150                         uid = ip->i_d.di_uid;
 151                 }
 152                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 153                         gid = vap->va_gid;
 154                         qflags |= XFS_QMOPT_GQUOTA;
 155                 }  else {
 156                         gid = ip->i_d.di_gid;
 157                 }
 158                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 159                         projid = vap->va_projid;
 160                         qflags |= XFS_QMOPT_PQUOTA;
 161                 }  else {
 162                         projid = ip->i_d.di_projid;
 163                 }
 164                 /*
 165                  * We take a reference when we initialize udqp and gdqp,
 166                  * so it is important that we never blindly double trip on
 167                  * the same variable. See xfs_create() for an example.
 168                  */
 169                 ASSERT(udqp == NULL);
 170                 ASSERT(gdqp == NULL);
 171                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 172                                          &udqp, &gdqp);
 173                 if (code)
 174                         return code;
 175         }
 176
 177         /*
 178          * For the other attributes, we acquire the inode lock and
 179          * first do an error checking pass.
 180          */
 181         tp = NULL;
 182         lock_flags = XFS_ILOCK_EXCL;
 183         if (flags & ATTR_NOLOCK)
 184                 need_iolock = 0;
 185         if (!(mask & XFS_AT_SIZE)) {
 186                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 187                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 188                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 189                         commit_flags = 0;
 190                         if ((code = xfs_trans_reserve(tp, 0,
 191                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 192                                                      0, 0))) {
 193                                 lock_flags = 0;
 194                                 goto error_return;
 195                         }
 196                 }
 197         } else {
 198                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 199                     !(flags & ATTR_DMI)) {
 200                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 201                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
 202                                 vap->va_size, 0, dmflags, NULL);
 203                         if (code) {
 204                                 lock_flags = 0;
 205                                 goto error_return;
 206                         }
 207                 }
 208                 if (need_iolock)
 209                         lock_flags |= XFS_IOLOCK_EXCL;
 210         }
 211
 212         xfs_ilock(ip, lock_flags);
 213
 214         /* boolean: are we the file owner? */
 215         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 216
 217         /*
 218          * Change various properties of a file.
 219          * Only the owner or users with CAP_FOWNER
 220          * capability may do these things.
 221          */
 222         if (mask &
 223             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 224              XFS_AT_GID|XFS_AT_PROJID)) {
 225                 /*
 226                  * CAP_FOWNER overrides the following restrictions:
 227                  *
 228                  * The user ID of the calling process must be equal
 229                  * to the file owner ID, except in cases where the
 230                  * CAP_FSETID capability is applicable.
 231                  */
 232                 if (!file_owner && !capable(CAP_FOWNER)) {
 233                         code = XFS_ERROR(EPERM);
 234                         goto error_return;
 235                 }
 236
 237                 /*
 238                  * CAP_FSETID overrides the following restrictions:
 239                  *
 240                  * The effective user ID of the calling process shall match
 241                  * the file owner when setting the set-user-ID and
 242                  * set-group-ID bits on that file.
 243                  *
 244                  * The effective group ID or one of the supplementary group
 245                  * IDs of the calling process shall match the group owner of
 246                  * the file when setting the set-group-ID bit on that file
 247                  */
 248                 if (mask & XFS_AT_MODE) {
 249                         mode_t m = 0;
 250
 251                         if ((vap->va_mode & S_ISUID) && !file_owner)
 252                                 m |= S_ISUID;
 253                         if ((vap->va_mode & S_ISGID) &&
 254                             !in_group_p((gid_t)ip->i_d.di_gid))
 255                                 m |= S_ISGID;
 256 #if 0
 257                         /* Linux allows this, Irix doesn't. */
 258                         if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
 259                                 m |= S_ISVTX;
 260 #endif
 261                         if (m && !capable(CAP_FSETID))
 262                                 vap->va_mode &= ~m;
 263                 }
 264         }
 265
 266         /*
 267          * Change file ownership.  Must be the owner or privileged.
 268          * If the system was configured with the "restricted_chown"
 269          * option, the owner is not permitted to give away the file,
 270          * and can change the group id only to a group of which he
 271          * or she is a member.
 272          */
 273         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 274                 /*
 275                  * These IDs could have changed since we last looked at them.
 276                  * But, we're assured that if the ownership did change
 277                  * while we didn't have the inode locked, inode's dquot(s)
 278                  * would have changed also.
 279                  */
 280                 iuid = ip->i_d.di_uid;
 281                 iprojid = ip->i_d.di_projid;
 282                 igid = ip->i_d.di_gid;
 283                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 284                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 285                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 286                          iprojid;
 287
 288                 /*
 289                  * CAP_CHOWN overrides the following restrictions:
 290                  *
 291                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 292                  * shall override the restriction that a process cannot
 293                  * change the user ID of a file it owns and the restriction
 294                  * that the group ID supplied to the chown() function
 295                  * shall be equal to either the group ID or one of the
 296                  * supplementary group IDs of the calling process.
 297                  */
 298                 if (restricted_chown &&
 299                     (iuid != uid || (igid != gid &&
 300                                      !in_group_p((gid_t)gid))) &&
 301                     !capable(CAP_CHOWN)) {
 302                         code = XFS_ERROR(EPERM);
 303                         goto error_return;
 304                 }
 305                 /*
 306                  * Do a quota reservation only if uid/projid/gid is actually
 307                  * going to change.
 308                  */
 309                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 310                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 311                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 312                         ASSERT(tp);
 313                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 314                                                 capable(CAP_FOWNER) ?
 315                                                 XFS_QMOPT_FORCE_RES : 0);
 316                         if (code)       /* out of quota */
 317                                 goto error_return;
 318                 }
 319         }
 320
 321         /*
 322          * Truncate file.  Must have write permission and not be a directory.
 323          */
 324         if (mask & XFS_AT_SIZE) {
 325                 /* Short circuit the truncate case for zero length files */
 326                 if ((vap->va_size == 0) &&
 327                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 328                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 329                         lock_flags &= ~XFS_ILOCK_EXCL;
 330                         if (mask & XFS_AT_CTIME)
 331                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 332                         code = 0;
 333                         goto error_return;
 334                 }
 335
 336                 if (S_ISDIR(ip->i_d.di_mode)) {
 337                         code = XFS_ERROR(EISDIR);
 338                         goto error_return;
 339                 } else if (!S_ISREG(ip->i_d.di_mode)) {
 340                         code = XFS_ERROR(EINVAL);
 341                         goto error_return;
 342                 }
 343                 /*
 344                  * Make sure that the dquots are attached to the inode.
 345                  */
 346                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 347                         goto error_return;
 348         }
 349
 350         /*
 351          * Change file access or modified times.
 352          */
 353         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 354                 if (!file_owner) {
 355                         if ((flags & ATTR_UTIME) &&
 356                             !capable(CAP_FOWNER)) {
 357                                 code = XFS_ERROR(EPERM);
 358                                 goto error_return;
 359                         }
 360                 }
 361         }
 362
 363         /*
 364          * Change extent size or realtime flag.
 365          */
 366         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 367                 /*
 368                  * Can't change extent size if any extents are allocated.
 369                  */
 370                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 371                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 372                      vap->va_extsize) ) {
 373                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 374                         goto error_return;
 375                 }
 376
 377                 /*
 378                  * Can't change realtime flag if any extents are allocated.
 379                  */
 380                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 381                     (mask & XFS_AT_XFLAGS) &&
 382                     (XFS_IS_REALTIME_INODE(ip)) !=
 383                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 384                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 385                         goto error_return;
 386                 }
 387                 /*
 388                  * Extent size must be a multiple of the appropriate block
 389                  * size, if set at all.
 390                  */
 391                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 392                         xfs_extlen_t    size;
 393
 394                         if (XFS_IS_REALTIME_INODE(ip) ||
 395                             ((mask & XFS_AT_XFLAGS) &&
 396                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 397                                 size = mp->m_sb.sb_rextsize <<
 398                                        mp->m_sb.sb_blocklog;
 399                         } else {
 400                                 size = mp->m_sb.sb_blocksize;
 401                         }
 402                         if (vap->va_extsize % size) {
 403                                 code = XFS_ERROR(EINVAL);
 404                                 goto error_return;
 405                         }
 406                 }
 407                 /*
 408                  * If realtime flag is set then must have realtime data.
 409                  */
 410                 if ((mask & XFS_AT_XFLAGS) &&
 411                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 412                         if ((mp->m_sb.sb_rblocks == 0) ||
 413                             (mp->m_sb.sb_rextsize == 0) ||
 414                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 415                                 code = XFS_ERROR(EINVAL);
 416                                 goto error_return;
 417                         }
 418                 }
 419
 420                 /*
 421                  * Can't modify an immutable/append-only file unless
 422                  * we have appropriate permission.
 423                  */
 424                 if ((mask & XFS_AT_XFLAGS) &&
 425                     (ip->i_d.di_flags &
 426                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 427                      (vap->va_xflags &
 428                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 429                     !capable(CAP_LINUX_IMMUTABLE)) {
 430                         code = XFS_ERROR(EPERM);
 431                         goto error_return;
 432                 }
 433         }
 434
 435         /*
 436          * Now we can make the changes.  Before we join the inode
 437          * to the transaction, if XFS_AT_SIZE is set then take care of
 438          * the part of the truncation that must be done without the
 439          * inode lock.  This needs to be done before joining the inode
 440          * to the transaction, because the inode cannot be unlocked
 441          * once it is a part of the transaction.
 442          */
 443         if (mask & XFS_AT_SIZE) {
 444                 code = 0;
 445                 if ((vap->va_size > ip->i_size) &&
 446                     (flags & ATTR_NOSIZETOK) == 0) {
 447                         code = xfs_igrow_start(ip, vap->va_size, credp);
 448                 }
 449                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 450
 451                 /*
 452                  * We are going to log the inode size change in this
 453                  * transaction so any previous writes that are beyond the on
 454                  * disk EOF and the new EOF that have not been written out need
 455                  * to be written here. If we do not write the data out, we
 456                  * expose ourselves to the null files problem.
 457                  *
 458                  * Only flush from the on disk size to the smaller of the in
 459                  * memory file size or the new size as that's the range we
 460                  * really care about here and prevents waiting for other data
 461                  * not within the range we care about here.
 462                  */
 463                 if (!code &&
 464                     (ip->i_size != ip->i_d.di_size) &&
 465                     (vap->va_size > ip->i_d.di_size)) {
 466                         code = xfs_flush_pages(ip,
 467                                         ip->i_d.di_size, vap->va_size,
 468                                         XFS_B_ASYNC, FI_NONE);
 469                 }
 470
 471                 /* wait for all I/O to complete */
 472                 vn_iowait(ip);
 473
 474                 if (!code)
 475                         code = xfs_itruncate_data(ip, vap->va_size);
 476                 if (code) {
 477                         ASSERT(tp == NULL);
 478                         lock_flags &= ~XFS_ILOCK_EXCL;
 479                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 480                         goto error_return;
 481                 }
 482                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 483                 if ((code = xfs_trans_reserve(tp, 0,
 484                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 485                                              XFS_TRANS_PERM_LOG_RES,
 486                                              XFS_ITRUNCATE_LOG_COUNT))) {
 487                         xfs_trans_cancel(tp, 0);
 488                         if (need_iolock)
 489                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 490                         return code;
 491                 }
 492                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 493                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 494         }
 495
 496         if (tp) {
 497                 xfs_trans_ijoin(tp, ip, lock_flags);
 498                 xfs_trans_ihold(tp, ip);
 499         }
 500
 501         /*
 502          * Truncate file.  Must have write permission and not be a directory.
 503          */
 504         if (mask & XFS_AT_SIZE) {
 505                 /*
 506                  * Only change the c/mtime if we are changing the size
 507                  * or we are explicitly asked to change it. This handles
 508                  * the semantic difference between truncate() and ftruncate()
 509                  * as implemented in the VFS.
 510                  */
 511                 if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
 512                         timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 513
 514                 if (vap->va_size > ip->i_size) {
 515                         xfs_igrow_finish(tp, ip, vap->va_size,
 516                             !(flags & ATTR_DMI));
 517                 } else if ((vap->va_size <= ip->i_size) ||
 518                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 519                         /*
 520                          * signal a sync transaction unless
 521                          * we're truncating an already unlinked
 522                          * file on a wsync filesystem
 523                          */
 524                         code = xfs_itruncate_finish(&tp, ip,
 525                                             (xfs_fsize_t)vap->va_size,
 526                                             XFS_DATA_FORK,
 527                                             ((ip->i_d.di_nlink != 0 ||
 528                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 529                                              ? 1 : 0));
 530                         if (code)
 531                                 goto abort_return;
 532                         /*
 533                          * Truncated "down", so we're removing references
 534                          * to old data here - if we now delay flushing for
 535                          * a long time, we expose ourselves unduly to the
 536                          * notorious NULL files problem.  So, we mark this
 537                          * vnode and flush it when the file is closed, and
 538                          * do not wait the usual (long) time for writeout.
 539                          */
 540                         xfs_iflags_set(ip, XFS_ITRUNCATED);
 541                 }
 542         }
 543
 544         /*
 545          * Change file access modes.
 546          */
 547         if (mask & XFS_AT_MODE) {
 548                 ip->i_d.di_mode &= S_IFMT;
 549                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 550
 551                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 552                 timeflags |= XFS_ICHGTIME_CHG;
 553         }
 554
 555         /*
 556          * Change file ownership.  Must be the owner or privileged.
 557          * If the system was configured with the "restricted_chown"
 558          * option, the owner is not permitted to give away the file,
 559          * and can change the group id only to a group of which he
 560          * or she is a member.
 561          */
 562         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 563                 /*
 564                  * CAP_FSETID overrides the following restrictions:
 565                  *
 566                  * The set-user-ID and set-group-ID bits of a file will be
 567                  * cleared upon successful return from chown()
 568                  */
 569                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 570                     !capable(CAP_FSETID)) {
 571                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 572                 }
 573
 574                 /*
 575                  * Change the ownerships and register quota modifications
 576                  * in the transaction.
 577                  */
 578                 if (iuid != uid) {
 579                         if (XFS_IS_UQUOTA_ON(mp)) {
 580                                 ASSERT(mask & XFS_AT_UID);
 581                                 ASSERT(udqp);
 582                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 583                                                         &ip->i_udquot, udqp);
 584                         }
 585                         ip->i_d.di_uid = uid;
 586                 }
 587                 if (igid != gid) {
 588                         if (XFS_IS_GQUOTA_ON(mp)) {
 589                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 590                                 ASSERT(mask & XFS_AT_GID);
 591                                 ASSERT(gdqp);
 592                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 593                                                         &ip->i_gdquot, gdqp);
 594                         }
 595                         ip->i_d.di_gid = gid;
 596                 }
 597                 if (iprojid != projid) {
 598                         if (XFS_IS_PQUOTA_ON(mp)) {
 599                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 600                                 ASSERT(mask & XFS_AT_PROJID);
 601                                 ASSERT(gdqp);
 602                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 603                                                         &ip->i_gdquot, gdqp);
 604                         }
 605                         ip->i_d.di_projid = projid;
 606                         /*
 607                          * We may have to rev the inode as well as
 608                          * the superblock version number since projids didn't
 609                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 610                          */
 611                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 612                                 xfs_bump_ino_vers2(tp, ip);
 613                 }
 614
 615                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 616                 timeflags |= XFS_ICHGTIME_CHG;
 617         }
 618
 619
 620         /*
 621          * Change file access or modified times.
 622          */
 623         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 624                 if (mask & XFS_AT_ATIME) {
 625                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 626                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 627                         ip->i_update_core = 1;
 628                         timeflags &= ~XFS_ICHGTIME_ACC;
 629                 }
 630                 if (mask & XFS_AT_MTIME) {
 631                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 632                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 633                         timeflags &= ~XFS_ICHGTIME_MOD;
 634                         timeflags |= XFS_ICHGTIME_CHG;
 635                 }
 636                 if (tp && (flags & ATTR_UTIME))
 637                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 638         }
 639
 640         /*
 641          * Change XFS-added attributes.
 642          */
 643         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 644                 if (mask & XFS_AT_EXTSIZE) {
 645                         /*
 646                          * Converting bytes to fs blocks.
 647                          */
 648                         ip->i_d.di_extsize = vap->va_extsize >>
 649                                 mp->m_sb.sb_blocklog;
 650                 }
 651                 if (mask & XFS_AT_XFLAGS) {
 652                         uint    di_flags;
 653
 654                         /* can't set PREALLOC this way, just preserve it */
 655                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 656                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 657                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 658                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 659                                 di_flags |= XFS_DIFLAG_APPEND;
 660                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 661                                 di_flags |= XFS_DIFLAG_SYNC;
 662                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 663                                 di_flags |= XFS_DIFLAG_NOATIME;
 664                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 665                                 di_flags |= XFS_DIFLAG_NODUMP;
 666                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 667                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 668                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
 669                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 670                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
 671                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 672                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 673                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 674                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 675                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 676                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 677                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 678                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 679                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 680                                 if (vap->va_xflags & XFS_XFLAG_REALTIME)
 681                                         di_flags |= XFS_DIFLAG_REALTIME;
 682                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 683                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 684                         }
 685                         ip->i_d.di_flags = di_flags;
 686                 }
 687                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 688                 timeflags |= XFS_ICHGTIME_CHG;
 689         }
 690
 691         /*
 692          * Change file inode change time only if XFS_AT_CTIME set
 693          * AND we have been called by a DMI function.
 694          */
 695
 696         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 697                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 698                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 699                 ip->i_update_core = 1;
 700                 timeflags &= ~XFS_ICHGTIME_CHG;
 701         }
 702
 703         /*
 704          * Send out timestamp changes that need to be set to the
 705          * current time.  Not done when called by a DMI function.
 706          */
 707         if (timeflags && !(flags & ATTR_DMI))
 708                 xfs_ichgtime(ip, timeflags);
 709
 710         XFS_STATS_INC(xs_ig_attrchg);
 711
 712         /*
 713          * If this is a synchronous mount, make sure that the
 714          * transaction goes to disk before returning to the user.
 715          * This is slightly sub-optimal in that truncates require
 716          * two sync transactions instead of one for wsync filesystems.
 717          * One for the truncate and one for the timestamps since we
 718          * don't want to change the timestamps unless we're sure the
 719          * truncate worked.  Truncates are less than 1% of the laddis
 720          * mix so this probably isn't worth the trouble to optimize.
 721          */
 722         code = 0;
 723         if (tp) {
 724                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 725                         xfs_trans_set_sync(tp);
 726
 727                 code = xfs_trans_commit(tp, commit_flags);
 728         }
 729
 730         xfs_iunlock(ip, lock_flags);
 731
 732         /*
 733          * Release any dquot(s) the inode had kept before chown.
 734          */
 735         XFS_QM_DQRELE(mp, olddquot1);
 736         XFS_QM_DQRELE(mp, olddquot2);
 737         XFS_QM_DQRELE(mp, udqp);
 738         XFS_QM_DQRELE(mp, gdqp);
 739
 740         if (code) {
 741                 return code;
 742         }
 743
 744         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 745             !(flags & ATTR_DMI)) {
 746                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
 747                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 748                                         0, 0, AT_DELAY_FLAG(flags));
 749         }
 750         return 0;
 751
 752  abort_return:
 753         commit_flags |= XFS_TRANS_ABORT;
 754         /* FALLTHROUGH */
 755  error_return:
 756         XFS_QM_DQRELE(mp, udqp);
 757         XFS_QM_DQRELE(mp, gdqp);
 758         if (tp) {
 759                 xfs_trans_cancel(tp, commit_flags);
 760         }
 761         if (lock_flags != 0) {
 762                 xfs_iunlock(ip, lock_flags);
 763         }
 764         return code;
 765 }
 766
/*
 * Upper bound on the number of extent mappings needed to describe a
 * symlink target that is stored outside the inode.
 *
 * The maximum pathlen is 1024 bytes. Since the minimum file system
 * blocksize is 512 bytes, the target occupies at most two blocks, so
 * we can get a max of 2 extents back from bmapi.
 */
#define SYMLINK_MAPS 2
 773
 774 STATIC int
 775 xfs_readlink_bmap(
 776         xfs_inode_t     *ip,
 777         char            *link)
 778 {
 779         xfs_mount_t     *mp = ip->i_mount;
 780         int             pathlen = ip->i_d.di_size;
 781         int             nmaps = SYMLINK_MAPS;
 782         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 783         xfs_daddr_t     d;
 784         int             byte_cnt;
 785         int             n;
 786         xfs_buf_t       *bp;
 787         int             error = 0;
 788
 789         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
 790                         mval, &nmaps, NULL, NULL);
 791         if (error)
 792                 goto out;
 793
 794         for (n = 0; n < nmaps; n++) {
 795                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 796                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 797
 798                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
 799                 error = XFS_BUF_GETERROR(bp);
 800                 if (error) {
 801                         xfs_ioerror_alert("xfs_readlink",
 802                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
 803                         xfs_buf_relse(bp);
 804                         goto out;
 805                 }
 806                 if (pathlen < byte_cnt)
 807                         byte_cnt = pathlen;
 808                 pathlen -= byte_cnt;
 809
 810                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
 811                 xfs_buf_relse(bp);
 812         }
 813
 814         link[ip->i_d.di_size] = '\0';
 815         error = 0;
 816
 817  out:
 818         return error;
 819 }
 820
 821 int
 822 xfs_readlink(
 823         xfs_inode_t     *ip,
 824         char            *link)
 825 {
 826         xfs_mount_t     *mp = ip->i_mount;
 827         int             pathlen;
 828         int             error = 0;
 829
 830         xfs_itrace_entry(ip);
 831
 832         if (XFS_FORCED_SHUTDOWN(mp))
 833                 return XFS_ERROR(EIO);
 834
 835         xfs_ilock(ip, XFS_ILOCK_SHARED);
 836
 837         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 838         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
 839
 840         pathlen = ip->i_d.di_size;
 841         if (!pathlen)
 842                 goto out;
 843
 844         if (ip->i_df.if_flags & XFS_IFINLINE) {
 845                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 846                 link[pathlen] = '\0';
 847         } else {
 848                 error = xfs_readlink_bmap(ip, link);
 849         }
 850
 851  out:
 852         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 853         return error;
 854 }
 855
 856 /*
 857  * xfs_fsync
 858  *
 859  * This is called to sync the inode and its data out to disk.
 860  * We need to hold the I/O lock while flushing the data, and
 861  * the inode lock while flushing the inode.  The inode lock CANNOT
 862  * be held while flushing the data, so acquire after we're done
 863  * with that.
 864  */
 865 int
 866 xfs_fsync(
 867         xfs_inode_t     *ip,
 868         int             flag,
 869         xfs_off_t       start,
 870         xfs_off_t       stop)
 871 {
 872         xfs_trans_t     *tp;
 873         int             error;
 874         int             log_flushed = 0, changed = 1;
 875
 876         xfs_itrace_entry(ip);
 877
 878         ASSERT(start >= 0 && stop >= -1);
 879
 880         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 881                 return XFS_ERROR(EIO);
 882
 883         if (flag & FSYNC_DATA)
 884                 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
 885
 886         /*
 887          * We always need to make sure that the required inode state
 888          * is safe on disk.  The vnode might be clean but because
 889          * of committed transactions that haven't hit the disk yet.
 890          * Likewise, there could be unflushed non-transactional
 891          * changes to the inode core that have to go to disk.
 892          *
 893          * The following code depends on one assumption:  that
 894          * any transaction that changes an inode logs the core
 895          * because it has to change some field in the inode core
 896          * (typically nextents or nblocks).  That assumption
 897          * implies that any transactions against an inode will
 898          * catch any non-transactional updates.  If inode-altering
 899          * transactions exist that violate this assumption, the
 900          * code breaks.  Right now, it figures that if the involved
 901          * update_* field is clear and the inode is unpinned, the
 902          * inode is clean.  Either it's been flushed or it's been
 903          * committed and the commit has hit the disk unpinning the inode.
 904          * (Note that xfs_inode_item_format() called at commit clears
 905          * the update_* fields.)
 906          */
 907         xfs_ilock(ip, XFS_ILOCK_SHARED);
 908
 909         /* If we are flushing data then we care about update_size
 910          * being set, otherwise we care about update_core
 911          */
 912         if ((flag & FSYNC_DATA) ?
 913                         (ip->i_update_size == 0) :
 914                         (ip->i_update_core == 0)) {
 915                 /*
 916                  * Timestamps/size haven't changed since last inode
 917                  * flush or inode transaction commit.  That means
 918                  * either nothing got written or a transaction
 919                  * committed which caught the updates.  If the
 920                  * latter happened and the transaction hasn't
 921                  * hit the disk yet, the inode will be still
 922                  * be pinned.  If it is, force the log.
 923                  */
 924
 925                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 926
 927                 if (xfs_ipincount(ip)) {
 928                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
 929                                       XFS_LOG_FORCE |
 930                                       ((flag & FSYNC_WAIT)
 931                                        ? XFS_LOG_SYNC : 0),
 932                                       &log_flushed);
 933                 } else {
 934                         /*
 935                          * If the inode is not pinned and nothing
 936                          * has changed we don't need to flush the
 937                          * cache.
 938                          */
 939                         changed = 0;
 940                 }
 941                 error = 0;
 942         } else  {
 943                 /*
 944                  * Kick off a transaction to log the inode
 945                  * core to get the updates.  Make it
 946                  * sync if FSYNC_WAIT is passed in (which
 947                  * is done by everybody but specfs).  The
 948                  * sync transaction will also force the log.
 949                  */
 950                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 951                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
 952                 if ((error = xfs_trans_reserve(tp, 0,
 953                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
 954                                 0, 0, 0)))  {
 955                         xfs_trans_cancel(tp, 0);
 956                         return error;
 957                 }
 958                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 959
 960                 /*
 961                  * Note - it's possible that we might have pushed
 962                  * ourselves out of the way during trans_reserve
 963                  * which would flush the inode.  But there's no
 964                  * guarantee that the inode buffer has actually
 965                  * gone out yet (it's delwri).  Plus the buffer
 966                  * could be pinned anyway if it's part of an
 967                  * inode in another recent transaction.  So we
 968                  * play it safe and fire off the transaction anyway.
 969                  */
 970                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 971                 xfs_trans_ihold(tp, ip);
 972                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 973                 if (flag & FSYNC_WAIT)
 974                         xfs_trans_set_sync(tp);
 975                 error = _xfs_trans_commit(tp, 0, &log_flushed);
 976
 977                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 978         }
 979
 980         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
 981                 /*
 982                  * If the log write didn't issue an ordered tag we need
 983                  * to flush the disk cache for the data device now.
 984                  */
 985                 if (!log_flushed)
 986                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
 987
 988                 /*
 989                  * If this inode is on the RT dev we need to flush that
 990                  * cache as well.
 991                  */
 992                 if (XFS_IS_REALTIME_INODE(ip))
 993                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
 994         }
 995
 996         return error;
 997 }
 998
 999 /*
1000  * This is called by xfs_inactive to free any blocks beyond eof
1001  * when the link count isn't zero and by xfs_dm_punch_hole() when
1002  * punching a hole to EOF.
1003  */
1004 int
1005 xfs_free_eofblocks(
1006         xfs_mount_t     *mp,
1007         xfs_inode_t     *ip,
1008         int             flags)
1009 {
1010         xfs_trans_t     *tp;
1011         int             error;
1012         xfs_fileoff_t   end_fsb;
1013         xfs_fileoff_t   last_fsb;
1014         xfs_filblks_t   map_len;
1015         int             nimaps;
1016         xfs_bmbt_irec_t imap;
1017         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1018
1019         /*
1020          * Figure out if there are any blocks beyond the end
1021          * of the file.  If not, then there is nothing to do.
1022          */
1023         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1024         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1025         map_len = last_fsb - end_fsb;
1026         if (map_len <= 0)
1027                 return 0;
1028
1029         nimaps = 1;
1030         xfs_ilock(ip, XFS_ILOCK_SHARED);
1031         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1032                           NULL, 0, &imap, &nimaps, NULL, NULL);
1033         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1034
1035         if (!error && (nimaps != 0) &&
1036             (imap.br_startblock != HOLESTARTBLOCK ||
1037              ip->i_delayed_blks)) {
1038                 /*
1039                  * Attach the dquots to the inode up front.
1040                  */
1041                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1042                         return error;
1043
1044                 /*
1045                  * There are blocks after the end of file.
1046                  * Free them up now by truncating the file to
1047                  * its current size.
1048                  */
1049                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1050
1051                 /*
1052                  * Do the xfs_itruncate_start() call before
1053                  * reserving any log space because
1054                  * itruncate_start will call into the buffer
1055                  * cache and we can't
1056                  * do that within a transaction.
1057                  */
1058                 if (use_iolock)
1059                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1060                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1061                                     ip->i_size);
1062                 if (error) {
1063                         xfs_trans_cancel(tp, 0);
1064                         if (use_iolock)
1065                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1066                         return error;
1067                 }
1068
1069                 error = xfs_trans_reserve(tp, 0,
1070                                           XFS_ITRUNCATE_LOG_RES(mp),
1071                                           0, XFS_TRANS_PERM_LOG_RES,
1072                                           XFS_ITRUNCATE_LOG_COUNT);
1073                 if (error) {
1074                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1075                         xfs_trans_cancel(tp, 0);
1076                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1077                         return error;
1078                 }
1079
1080                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1081                 xfs_trans_ijoin(tp, ip,
1082                                 XFS_IOLOCK_EXCL |
1083                                 XFS_ILOCK_EXCL);
1084                 xfs_trans_ihold(tp, ip);
1085
1086                 error = xfs_itruncate_finish(&tp, ip,
1087                                              ip->i_size,
1088                                              XFS_DATA_FORK,
1089                                              0);
1090                 /*
1091                  * If we get an error at this point we
1092                  * simply don't bother truncating the file.
1093                  */
1094                 if (error) {
1095                         xfs_trans_cancel(tp,
1096                                          (XFS_TRANS_RELEASE_LOG_RES |
1097                                           XFS_TRANS_ABORT));
1098                 } else {
1099                         error = xfs_trans_commit(tp,
1100                                                 XFS_TRANS_RELEASE_LOG_RES);
1101                 }
1102                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1103                                             : XFS_ILOCK_EXCL));
1104         }
1105         return error;
1106 }
1107
1108 /*
1109  * Free a symlink that has blocks associated with it.
1110  */
1111 STATIC int
1112 xfs_inactive_symlink_rmt(
1113         xfs_inode_t     *ip,
1114         xfs_trans_t     **tpp)
1115 {
1116         xfs_buf_t       *bp;
1117         int             committed;
1118         int             done;
1119         int             error;
1120         xfs_fsblock_t   first_block;
1121         xfs_bmap_free_t free_list;
1122         int             i;
1123         xfs_mount_t     *mp;
1124         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1125         int             nmaps;
1126         xfs_trans_t     *ntp;
1127         int             size;
1128         xfs_trans_t     *tp;
1129
1130         tp = *tpp;
1131         mp = ip->i_mount;
1132         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1133         /*
1134          * We're freeing a symlink that has some
1135          * blocks allocated to it.  Free the
1136          * blocks here.  We know that we've got
1137          * either 1 or 2 extents and that we can
1138          * free them all in one bunmapi call.
1139          */
1140         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1141         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1142                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1143                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1144                 xfs_trans_cancel(tp, 0);
1145                 *tpp = NULL;
1146                 return error;
1147         }
1148         /*
1149          * Lock the inode, fix the size, and join it to the transaction.
1150          * Hold it so in the normal path, we still have it locked for
1151          * the second transaction.  In the error paths we need it
1152          * held so the cancel won't rele it, see below.
1153          */
1154         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1155         size = (int)ip->i_d.di_size;
1156         ip->i_d.di_size = 0;
1157         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1158         xfs_trans_ihold(tp, ip);
1159         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1160         /*
1161          * Find the block(s) so we can inval and unmap them.
1162          */
1163         done = 0;
1164         XFS_BMAP_INIT(&free_list, &first_block);
1165         nmaps = ARRAY_SIZE(mval);
1166         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1167                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1168                         &free_list, NULL)))
1169                 goto error0;
1170         /*
1171          * Invalidate the block(s).
1172          */
1173         for (i = 0; i < nmaps; i++) {
1174                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1175                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1176                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1177                 xfs_trans_binval(tp, bp);
1178         }
1179         /*
1180          * Unmap the dead block(s) to the free_list.
1181          */
1182         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1183                         &first_block, &free_list, NULL, &done)))
1184                 goto error1;
1185         ASSERT(done);
1186         /*
1187          * Commit the first transaction.  This logs the EFI and the inode.
1188          */
1189         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1190                 goto error1;
1191         /*
1192          * The transaction must have been committed, since there were
1193          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1194          * The new tp has the extent freeing and EFDs.
1195          */
1196         ASSERT(committed);
1197         /*
1198          * The first xact was committed, so add the inode to the new one.
1199          * Mark it dirty so it will be logged and moved forward in the log as
1200          * part of every commit.
1201          */
1202         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1203         xfs_trans_ihold(tp, ip);
1204         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1205         /*
1206          * Get a new, empty transaction to return to our caller.
1207          */
1208         ntp = xfs_trans_dup(tp);
1209         /*
1210          * Commit the transaction containing extent freeing and EFDs.
1211          * If we get an error on the commit here or on the reserve below,
1212          * we need to unlock the inode since the new transaction doesn't
1213          * have the inode attached.
1214          */
1215         error = xfs_trans_commit(tp, 0);
1216         tp = ntp;
1217         if (error) {
1218                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1219                 goto error0;
1220         }
1221         /*
1222          * Remove the memory for extent descriptions (just bookkeeping).
1223          */
1224         if (ip->i_df.if_bytes)
1225                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1226         ASSERT(ip->i_df.if_bytes == 0);
1227         /*
1228          * Put an itruncate log reservation in the new transaction
1229          * for our caller.
1230          */
1231         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1232                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1233                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1234                 goto error0;
1235         }
1236         /*
1237          * Return with the inode locked but not joined to the transaction.
1238          */
1239         *tpp = tp;
1240         return 0;
1241
1242  error1:
1243         xfs_bmap_cancel(&free_list);
1244  error0:
1245         /*
1246          * Have to come here with the inode locked and either
1247          * (held and in the transaction) or (not in the transaction).
1248          * If the inode isn't held then cancel would iput it, but
1249          * that's wrong since this is inactive and the vnode ref
1250          * count is 0 already.
1251          * Cancel won't do anything to the inode if held, but it still
1252          * needs to be locked until the cancel is done, if it was
1253          * joined to the transaction.
1254          */
1255         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1256         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1257         *tpp = NULL;
1258         return error;
1259
1260 }
1261
1262 STATIC int
1263 xfs_inactive_symlink_local(
1264         xfs_inode_t     *ip,
1265         xfs_trans_t     **tpp)
1266 {
1267         int             error;
1268
1269         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1270         /*
1271          * We're freeing a symlink which fit into
1272          * the inode.  Just free the memory used
1273          * to hold the old symlink.
1274          */
1275         error = xfs_trans_reserve(*tpp, 0,
1276                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1277                                   0, XFS_TRANS_PERM_LOG_RES,
1278                                   XFS_ITRUNCATE_LOG_COUNT);
1279
1280         if (error) {
1281                 xfs_trans_cancel(*tpp, 0);
1282                 *tpp = NULL;
1283                 return error;
1284         }
1285         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1286
1287         /*
1288          * Zero length symlinks _can_ exist.
1289          */
1290         if (ip->i_df.if_bytes > 0) {
1291                 xfs_idata_realloc(ip,
1292                                   -(ip->i_df.if_bytes),
1293                                   XFS_DATA_FORK);
1294                 ASSERT(ip->i_df.if_bytes == 0);
1295         }
1296         return 0;
1297 }
1298
1299 STATIC int
1300 xfs_inactive_attrs(
1301         xfs_inode_t     *ip,
1302         xfs_trans_t     **tpp)
1303 {
1304         xfs_trans_t     *tp;
1305         int             error;
1306         xfs_mount_t     *mp;
1307
1308         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1309         tp = *tpp;
1310         mp = ip->i_mount;
1311         ASSERT(ip->i_d.di_forkoff != 0);
1312         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1313         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1314         if (error)
1315                 goto error_unlock;
1316
1317         error = xfs_attr_inactive(ip);
1318         if (error)
1319                 goto error_unlock;
1320
1321         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1322         error = xfs_trans_reserve(tp, 0,
1323                                   XFS_IFREE_LOG_RES(mp),
1324                                   0, XFS_TRANS_PERM_LOG_RES,
1325                                   XFS_INACTIVE_LOG_COUNT);
1326         if (error)
1327                 goto error_cancel;
1328
1329         xfs_ilock(ip, XFS_ILOCK_EXCL);
1330         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1331         xfs_trans_ihold(tp, ip);
1332         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1333
1334         ASSERT(ip->i_d.di_anextents == 0);
1335
1336         *tpp = tp;
1337         return 0;
1338
1339 error_cancel:
1340         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1341         xfs_trans_cancel(tp, 0);
1342 error_unlock:
1343         *tpp = NULL;
1344         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1345         return error;
1346 }
1347
1348 int
1349 xfs_release(
1350         xfs_inode_t     *ip)
1351 {
1352         bhv_vnode_t     *vp = XFS_ITOV(ip);
1353         xfs_mount_t     *mp = ip->i_mount;
1354         int             error;
1355
1356         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
1357                 return 0;
1358
1359         /* If this is a read-only mount, don't do this (would generate I/O) */
1360         if (mp->m_flags & XFS_MOUNT_RDONLY)
1361                 return 0;
1362
1363         if (!XFS_FORCED_SHUTDOWN(mp)) {
1364                 int truncated;
1365
1366                 /*
1367                  * If we are using filestreams, and we have an unlinked
1368                  * file that we are processing the last close on, then nothing
1369                  * will be able to reopen and write to this file. Purge this
1370                  * inode from the filestreams cache so that it doesn't delay
1371                  * teardown of the inode.
1372                  */
1373                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1374                         xfs_filestream_deassociate(ip);
1375
1376                 /*
1377                  * If we previously truncated this file and removed old data
1378                  * in the process, we want to initiate "early" writeout on
1379                  * the last close.  This is an attempt to combat the notorious
1380                  * NULL files problem which is particularly noticable from a
1381                  * truncate down, buffered (re-)write (delalloc), followed by
1382                  * a crash.  What we are effectively doing here is
1383                  * significantly reducing the time window where we'd otherwise
1384                  * be exposed to that problem.
1385                  */
1386                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1387                 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1388                         xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1389         }
1390
1391         if (ip->i_d.di_nlink != 0) {
1392                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1393                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1394                        ip->i_delayed_blks > 0)) &&
1395                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1396                     (!(ip->i_d.di_flags &
1397                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1398                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1399                         if (error)
1400                                 return error;
1401                 }
1402         }
1403
1404         return 0;
1405 }
1406
1407 /*
1408  * xfs_inactive
1409  *
1410  * This is called when the vnode reference count for the vnode
1411  * goes to zero.  If the file has been unlinked, then it must
1412  * now be truncated.  Also, we clear all of the read-ahead state
1413  * kept for the inode here since the file is now closed.
1414  */
int
xfs_inactive(
        xfs_inode_t     *ip)
{
        bhv_vnode_t     *vp = XFS_ITOV(ip);
        xfs_bmap_free_t free_list;
        xfs_fsblock_t   first_block;
        int             committed;
        xfs_trans_t     *tp;
        xfs_mount_t     *mp;
        int             error;
        int             truncate;

        /*
         * Overview of the flow below:
         *  - already-free or bad inode: nothing to do.
         *  - read-only mount: nothing to do.
         *  - link count != 0: possibly trim blocks past EOF, then done.
         *  - link count == 0: truncate (regular file) or tear down the
         *    symlink, remove attributes, free the inode and commit.
         * Every exit path returns VN_INACTIVE_CACHE; errors are not
         * propagated to the caller.
         */
        xfs_itrace_entry(ip);

        /*
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
        if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
                ASSERT(ip->i_df.if_real_bytes == 0);
                ASSERT(ip->i_df.if_broot_bytes == 0);
                return VN_INACTIVE_CACHE;
        }

        /*
         * Only do a truncate if it's a regular file with
         * some actual space in it.  It's OK to look at the
         * inode's fields without the lock because we're the
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

        mp = ip->i_mount;

        /* Send the DMAPI destroy event for unlinked inodes, if enabled. */
        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
                XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);

        error = 0;

        /* If this is a read-only mount, don't do this (would generate I/O) */
        if (mp->m_flags & XFS_MOUNT_RDONLY)
                goto out;

        /*
         * The inode is still linked: the only cleanup considered is
         * freeing blocks beyond EOF.  That is attempted for a regular
         * file that has a non-zero in-core size, cached pages or delayed
         * allocation blocks, has its extents read in, and either has
         * delayed allocation blocks or has neither the PREALLOC nor the
         * APPEND flag set.
         */
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
                       ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                     (!(ip->i_d.di_flags &
                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
                      (ip->i_delayed_blks != 0)))) {
                        error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
                        if (error)
                                return VN_INACTIVE_CACHE;
                }
                goto out;
        }

        /* From here on the inode is unlinked and will be freed. */
        ASSERT(ip->i_d.di_nlink == 0);

        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
                return VN_INACTIVE_CACHE;

        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        if (truncate) {
                /*
                 * Do the xfs_itruncate_start() call before
                 * reserving any log space because itruncate_start
                 * will call into the buffer cache and we can't
                 * do that within a transaction.
                 */
                xfs_ilock(ip, XFS_IOLOCK_EXCL);

                error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
                if (error) {
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                error = xfs_trans_reserve(tp, 0,
                                          XFS_ITRUNCATE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                if (error) {
                        /* Don't call itruncate_cleanup */
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                /*
                 * Take the inode lock as well and join the inode to the
                 * transaction.  NOTE(review): xfs_trans_ihold() presumably
                 * keeps the inode locked across commit -- it is unlocked
                 * explicitly at the end of this function; confirm.
                 */
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * normally, we have to run xfs_itruncate_finish sync.
                 * But if filesystem is wsync and we're in the inactive
                 * path, then we know that nlink == 0, and that the
                 * xaction that made nlink == 0 is permanently committed
                 * since xfs_remove runs as a synchronous transaction.
                 */
                error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
                                (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

                if (error) {
                        xfs_trans_cancel(tp,
                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }
        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

                /*
                 * If we get an error while cleaning up a
                 * symlink we bail out.
                 *
                 * A symlink whose size exceeds the inline data fork size
                 * goes through the remote helper, otherwise the local one.
                 * Both receive &tp and may replace the transaction.
                 */
                error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
                        xfs_inactive_symlink_rmt(ip, &tp) :
                        xfs_inactive_symlink_local(ip, &tp);

                if (error) {
                        ASSERT(tp == NULL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        } else {
                /* Nothing to truncate: just reserve log space for the free. */
                error = xfs_trans_reserve(tp, 0,
                                          XFS_IFREE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_INACTIVE_LOG_COUNT);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        }

        /*
         * If there are attributes associated with the file
         * then blow them away now.  The code calls a routine
         * that recursively deconstructs the attribute fork.
         * We need to just commit the current transaction
         * because we can't use it for xfs_attr_inactive().
         */
        if (ip->i_d.di_anextents > 0) {
                error = xfs_inactive_attrs(ip, &tp);
                /*
                 * If we got an error, the transaction is already
                 * cancelled, and the inode is unlocked. Just get out.
                 */
                 if (error)
                         return VN_INACTIVE_CACHE;
        } else if (ip->i_afp) {
                /* No attribute extents, but an in-core attr fork exists. */
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        }

        /*
         * Free the inode.
         */
        XFS_BMAP_INIT(&free_list, &first_block);
        error = xfs_ifree(tp, ip, &free_list);
        if (error) {
                /*
                 * If we fail to free the inode, shut down.  The cancel
                 * might do that, we need to make sure.  Otherwise the
                 * inode might be lost for a long time or forever.
                 */
                if (!XFS_FORCED_SHUTDOWN(mp)) {
                        cmn_err(CE_NOTE,
                "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
                                error, mp->m_fsname);
                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        } else {
                /*
                 * Credit the quota account(s). The inode is gone.
                 */
                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

                /*
                 * Just ignore errors at this point.  There is nothing we can
                 * do except to try to keep going. Make sure it's not a silent
                 * error.
                 */
                error = xfs_bmap_finish(&tp,  &free_list, &committed);
                if (error)
                        xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
                                "xfs_bmap_finish() returned error %d", error);
                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                if (error)
                        xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
                                "xfs_trans_commit() returned error %d", error);
        }
        /*
         * Release the dquots held by inode, if any.
         */
        XFS_QM_DQDETACH(mp, ip);

        /* Both success and xfs_ifree() failure reach here with ip locked. */
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
        return VN_INACTIVE_CACHE;
}
1631
1632
1633 int
1634 xfs_lookup(
1635         xfs_inode_t             *dp,
1636         struct xfs_name         *name,
1637         xfs_inode_t             **ipp)
1638 {
1639         xfs_ino_t               inum;
1640         int                     error;
1641         uint                    lock_mode;
1642
1643         xfs_itrace_entry(dp);
1644
1645         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1646                 return XFS_ERROR(EIO);
1647
1648         lock_mode = xfs_ilock_map_shared(dp);
1649         error = xfs_dir_lookup(NULL, dp, name, &inum);
1650         xfs_iunlock_map_shared(dp, lock_mode);
1651
1652         if (error)
1653                 goto out;
1654
1655         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
1656         if (error)
1657                 goto out;
1658
1659         xfs_itrace_ref(*ipp);
1660         return 0;
1661
1662  out:
1663         *ipp = NULL;
1664         return error;
1665 }
1666
/*
 * Create a new inode named 'name' in directory 'dp'.
 *
 *   dp     - parent directory inode
 *   name   - name of the new directory entry
 *   mode   - mode passed to xfs_dir_ialloc() for the new inode
 *   rdev   - device number passed to xfs_dir_ialloc()
 *   ipp    - out: set to the new inode on success; must point to NULL
 *            on entry (asserted)
 *   credp  - credentials used for the quota lookup and the allocation
 *
 * Returns 0 on success or an error code.  On failure *ipp is not written.
 * When DMAPI events are enabled a CREATE event is sent before any work is
 * done and a POSTCREATE event is sent on the way out.
 */
int
xfs_create(
        xfs_inode_t             *dp,
        struct xfs_name         *name,
        mode_t                  mode,
        xfs_dev_t               rdev,
        xfs_inode_t             **ipp,
        cred_t                  *credp)
{
        xfs_mount_t             *mp = dp->i_mount;
        xfs_inode_t             *ip;
        xfs_trans_t             *tp;
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               unlock_dp_on_error = B_FALSE;
        int                     dm_event_sent = 0;
        uint                    cancel_flags;
        int                     committed;
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;

        ASSERT(!*ipp);
        xfs_itrace_entry(dp);

        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
                                dp, DM_RIGHT_NULL, NULL,
                                DM_RIGHT_NULL, name->name, NULL,
                                mode, 0, 0);

                if (error)
                        return error;
                dm_event_sent = 1;
        }

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /* Return through std_return after this point. */

        /* The new inode takes the directory's project id if it is inheritable. */
        udqp = gdqp = NULL;
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else
                prid = (xfs_prid_t)dfltprid;

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
                        current_fsuid(credp), current_fsgid(credp), prid,
                        XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;

        ip = NULL;

        tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        resblks = XFS_CREATE_SPACE_RES(mp, name->len);
        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
        error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        if (error == ENOSPC) {
                /* Retry without a block reservation. */
                resblks = 0;
                error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        }
        if (error) {
                /* No log reservation was obtained, so none to release. */
                cancel_flags = 0;
                goto error_return;
        }

        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = B_TRUE;

        XFS_BMAP_INIT(&free_list, &first_block);

        ASSERT(ip == NULL);

        /*
         * Reserve disk quota and the inode.
         */
        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto error_return;

        error = xfs_dir_canenter(tp, dp, name, resblks);
        if (error)
                goto error_return;
        error = xfs_dir_ialloc(&tp, dp, mode, 1,
                        rdev, credp, prid, resblks > 0,
                        &ip, &committed);
        if (error) {
                /* ENOSPC cancels without XFS_TRANS_ABORT; other errors abort. */
                if (error == ENOSPC)
                        goto error_return;
                goto abort_return;
        }
        xfs_itrace_ref(ip);

        /*
         * At this point, we've gotten a newly allocated inode.
         * It is locked (and joined to the transaction).
         */

        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

        /*
         * Now we join the directory inode to the transaction.  We do not do it
         * earlier because xfs_dir_ialloc might commit the previous transaction
         * (and release all the locks).  An error from here on will result in
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
        IHOLD(dp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;

        /*
         * Add the directory entry.  When blocks were reserved, the
         * inode-allocation share is subtracted from what is passed on.
         */
        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
                                        &first_block, &free_list, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto abort_return;
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
                xfs_trans_set_sync(tp);
        }

        dp->i_gen++;

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * These ids of the inode couldn't have changed since the new
         * inode has been locked ever since it was created.
         */
        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);

        /*
         * xfs_trans_commit normally decrements the vnode ref count
         * when it unlocks the inode. Since we want to return the
         * vnode to the caller, we bump the vnode ref count now.
         */
        IHOLD(ip);

        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error) {
                xfs_bmap_cancel(&free_list);
                goto abort_rele;
        }

        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        if (error) {
                /*
                 * Drop the extra reference taken above and clear tp so
                 * that error_return does not cancel the transaction.
                 */
                IRELE(ip);
                tp = NULL;
                goto error_return;
        }

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        *ipp = ip;

        /* Fallthrough to std_return with error = 0  */

std_return:
        /*
         * Common exit.  POSTCREATE is sent when an inode is being returned,
         * or when the call failed after the CREATE event had been sent.
         */
        if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
            DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
                        dp, DM_RIGHT_NULL,
                        *ipp ? ip : NULL,
                        DM_RIGHT_NULL, name->name, NULL,
                        mode, error, 0);
        }
        return error;

 abort_return:
        cancel_flags |= XFS_TRANS_ABORT;
        /* FALLTHROUGH */

 error_return:
        /* Cancel the transaction if there still is one, drop dquot refs. */
        if (tp != NULL)
                xfs_trans_cancel(tp, cancel_flags);

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        /* dp is unlocked here only if it was never joined to the transaction. */
        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);

        goto std_return;

 abort_rele:
        /*
         * Wait until after the current transaction is aborted to
         * release the inode.  This prevents recursive transactions
         * and deadlocks from xfs_inactive.
         */
        cancel_flags |= XFS_TRANS_ABORT;
        xfs_trans_cancel(tp, cancel_flags);
        IRELE(ip);

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        goto std_return;
}
1889
#ifdef DEBUG
/*
 * Some counters to see if (and how often) we are hitting some deadlock
 * prevention code paths.  All three are updated by
 * xfs_lock_dir_and_entry() and exist only on DEBUG builds.
 */

int xfs_rm_locks;               /* calls to xfs_lock_dir_and_entry() */
int xfs_rm_lock_delays;         /* times the retry loop called delay(1) */
int xfs_rm_attempts;            /* failed xfs_ilock_nowait() attempts */
#endif
1900
1901 /*
1902  * The following routine will lock the inodes associated with the
1903  * directory and the named entry in the directory. The locks are
1904  * acquired in increasing inode number.
1905  *
1906  * If the entry is "..", then only the directory is locked. The
1907  * vnode ref count will still include that from the .. entry in
1908  * this case.
1909  *
1910  * There is a deadlock we need to worry about. If the locked directory is
1911  * in the AIL, it might be blocking up the log. The next inode we lock
1912  * could be already locked by another thread waiting for log space (e.g
1913  * a permanent log reservation with a long running transaction (see
1914  * xfs_itruncate_finish)). To solve this, we must check if the directory
1915  * is in the ail and use lock_nowait. If we can't lock, we need to
1916  * drop the inode lock on the directory and try again. xfs_iunlock will
1917  * potentially push the tail if we were holding up the log.
1918  */
1919 STATIC int
1920 xfs_lock_dir_and_entry(
1921         xfs_inode_t     *dp,
1922         xfs_inode_t     *ip)    /* inode of entry 'name' */
1923 {
1924         int             attempts;
1925         xfs_ino_t       e_inum;
1926         xfs_inode_t     *ips[2];
1927         xfs_log_item_t  *lp;
1928
1929 #ifdef DEBUG
1930         xfs_rm_locks++;
1931 #endif
1932         attempts = 0;
1933
1934 again:
1935         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1936
1937         e_inum = ip->i_ino;
1938
1939         xfs_itrace_ref(ip);
1940
1941         /*
1942          * We want to lock in increasing inum. Since we've already
1943          * acquired the lock on the directory, we may need to release
1944          * if if the inum of the entry turns out to be less.
1945          */
1946         if (e_inum > dp->i_ino) {
1947                 /*
1948                  * We are already in the right order, so just
1949                  * lock on the inode of the entry.
1950                  * We need to use nowait if dp is in the AIL.
1951                  */
1952
1953                 lp = (xfs_log_item_t *)dp->i_itemp;
1954                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1955                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1956                                 attempts++;
1957 #ifdef DEBUG
1958                                 xfs_rm_attempts++;
1959 #endif
1960
1961                                 /*
1962                                  * Unlock dp and try again.
1963                                  * xfs_iunlock will try to push the tail
1964                                  * if the inode is in the AIL.
1965                                  */
1966
1967                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1968
1969                                 if ((attempts % 5) == 0) {
1970                                         delay(1); /* Don't just spin the CPU */
1971 #ifdef DEBUG
1972                                         xfs_rm_lock_delays++;
1973 #endif
1974                                 }
1975                                 goto again;
1976                         }
1977                 } else {
1978                         xfs_ilock(ip, XFS_ILOCK_EXCL);
1979                 }
1980         } else if (e_inum < dp->i_ino) {
1981                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1982
1983                 ips[0] = ip;
1984                 ips[1] = dp;
1985                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
1986         }
1987         /* else  e_inum == dp->i_ino */
1988         /*     This can happen if we're asked to lock /x/..
1989          *     the entry is "..", which is also the parent directory.
1990          */
1991
1992         return 0;
1993 }
1994
#ifdef DEBUG
/*
 * DEBUG-only statistics updated by xfs_lock_inodes().
 */
int xfs_locked_n;               /* calls that completed without any retry */
int xfs_small_retries;          /* calls that needed fewer than 5 retries */
int xfs_middle_retries;         /* calls that needed 5 to 99 retries */
int xfs_lots_retries;           /* calls that needed 100 or more retries */
int xfs_lock_delays;            /* times the retry path called delay(1) */
#endif
2002
2003 /*
2004  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2005  * a different value
2006  */
2007 static inline int
2008 xfs_lock_inumorder(int lock_mode, int subclass)
2009 {
2010         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2011                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2012         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2013                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2014
2015         return lock_mode;
2016 }
2017
/*
 * The following routine will lock n inodes in exclusive mode.
 * We assume the caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock
 * is in the AIL and we start waiting for another inode that is locked
 * by a thread in a long running transaction (such as truncate). This can
 * result in deadlock since the long running trans might need to wait
 * for the inode we just locked in order to push the tail and free space
 * in the log.
 *
 *   ips          - array of inodes to lock; adjacent duplicate pointers
 *                  are locked only once
 *   inodes       - number of entries in ips, at least 2 (asserted)
 *   first_locked - non-zero: locking starts at ips[1] and uses nowait
 *                  attempts from the start (NOTE(review): presumably the
 *                  caller already holds ips[0] -- confirm)
 *   lock_mode    - lock flags handed to the lock/unlock primitives; a
 *                  per-index subclass is added via xfs_lock_inumorder()
 *                  when locking
 *
 * Returns with every inode locked; a failed nowait attempt drops all
 * locks taken so far and restarts from ips[0].
 */
void
xfs_lock_inodes(
        xfs_inode_t     **ips,
        int             inodes,
        int             first_locked,
        uint            lock_mode)
{
        int             attempts = 0, i, j, try_lock;
        xfs_log_item_t  *lp;

        ASSERT(ips && (inodes >= 2)); /* we need at least two */

        if (first_locked) {
                try_lock = 1;
                i = 1;
        } else {
                try_lock = 0;
                i = 0;
        }

again:
        for (; i < inodes; i++) {
                ASSERT(ips[i]);

                if (i && (ips[i] == ips[i-1]))  /* Already locked */
                        continue;

                /*
                 * If try_lock is not set yet, make sure all locked inodes
                 * are not in the AIL.
                 * If any are, set try_lock to be used later.
                 */

                if (!try_lock) {
                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
                                lp = (xfs_log_item_t *)ips[j]->i_itemp;
                                if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
                                        try_lock++;
                                }
                        }
                }

                /*
                 * If any of the previous locks we have locked is in the AIL,
                 * we must TRY to get the second and subsequent locks. If
                 * we can't get any, we must release all we have
                 * and try again.
                 */

                if (try_lock) {
                        /* try_lock must be 0 if i is 0. */
                        /*
                         * try_lock means we have an inode locked
                         * that is in the AIL.
                         */
                        ASSERT(i != 0);
                        if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
                                attempts++;

                                /*
                                 * Unlock all previous guys and try again.
                                 * xfs_iunlock will try to push the tail
                                 * if the inode is in the AIL.
                                 */

                                for(j = i - 1; j >= 0; j--) {

                                        /*
                                         * Check to see if we've already
                                         * unlocked this one.
                                         * Not the first one going back,
                                         * and the inode ptr is the same.
                                         */
                                        if ((j != (i - 1)) && ips[j] ==
                                                                ips[j+1])
                                                continue;

                                        xfs_iunlock(ips[j], lock_mode);
                                }

                                if ((attempts % 5) == 0) {
                                        delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
                                        xfs_lock_delays++;
#endif
                                }
                                /*
                                 * Everything has been dropped, so start
                                 * over from the first inode with blocking
                                 * locks until an AIL inode is seen again.
                                 */
                                i = 0;
                                try_lock = 0;
                                goto again;
                        }
                } else {
                        xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
                }
        }

#ifdef DEBUG
        /* Bucket this call by how many retries it needed. */
        if (attempts) {
                if (attempts < 5) xfs_small_retries++;
                else if (attempts < 100) xfs_middle_retries++;
                else xfs_lots_retries++;
        } else {
                xfs_locked_n++;
        }
#endif
}
2134
/*
 * REMOVE_DEBUG_TRACE(x) records x in remove_which_error_return on DEBUG
 * builds (xfs_remove() passes __LINE__ so the last error exit taken can be
 * identified) and expands to nothing otherwise.
 *
 * The DEBUG form is wrapped in do { } while (0) so that the macro followed
 * by a semicolon is a single statement and can be used safely as the body
 * of an unbraced if/else.
 */
#ifdef  DEBUG
#define REMOVE_DEBUG_TRACE(x)   \
        do { remove_which_error_return = (x); } while (0)
int remove_which_error_return = 0;
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)
#endif  /* ! DEBUG */
2141
2142 int
2143 xfs_remove(
2144         xfs_inode_t             *dp,
2145         struct xfs_name         *name,
2146         xfs_inode_t             *ip)
2147 {
2148         xfs_mount_t             *mp = dp->i_mount;
2149         xfs_trans_t             *tp = NULL;
2150         int                     error = 0;
2151         xfs_bmap_free_t         free_list;
2152         xfs_fsblock_t           first_block;
2153         int                     cancel_flags;
2154         int                     committed;
2155         int                     link_zero;
2156         uint                    resblks;
2157
2158         xfs_itrace_entry(dp);
2159
2160         if (XFS_FORCED_SHUTDOWN(mp))
2161                 return XFS_ERROR(EIO);
2162
2163         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2164                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
2165                                         NULL, DM_RIGHT_NULL, name->name, NULL,
2166                                         ip->i_d.di_mode, 0, 0);
2167                 if (error)
2168                         return error;
2169         }
2170
2171         /*
2172          * We need to get a reference to ip before we get our log
2173          * reservation. The reason for this is that we cannot call
2174          * xfs_iget for an inode for which we do not have a reference
2175          * once we've acquired a log reservation. This is because the
2176          * inode we are trying to get might be in xfs_inactive going
2177          * for a log reservation. Since we'll have to wait for the
2178          * inactive code to complete before returning from xfs_iget,
2179          * we need to make sure that we don't have log space reserved
2180          * when we call xfs_iget.  Instead we get an unlocked reference
2181          * to the inode before getting our log reservation.
2182          */
2183         IHOLD(ip);
2184
2185         xfs_itrace_entry(ip);
2186         xfs_itrace_ref(ip);
2187
2188         error = XFS_QM_DQATTACH(mp, dp, 0);
2189         if (!error && dp != ip)
2190                 error = XFS_QM_DQATTACH(mp, ip, 0);
2191         if (error) {
2192                 REMOVE_DEBUG_TRACE(__LINE__);
2193                 IRELE(ip);
2194                 goto std_return;
2195         }
2196
2197         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2198         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2199         /*
2200          * We try to get the real space reservation first,
2201          * allowing for directory btree deletion(s) implying
2202          * possible bmap insert(s).  If we can't get the space
2203          * reservation then we use 0 instead, and avoid the bmap
2204          * btree insert(s) in the directory code by, if the bmap
2205          * insert tries to happen, instead trimming the LAST
2206          * block from the directory.
2207          */
2208         resblks = XFS_REMOVE_SPACE_RES(mp);
2209         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2210                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2211         if (error == ENOSPC) {
2212                 resblks = 0;
2213                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2214                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2215         }
2216         if (error) {
2217                 ASSERT(error != ENOSPC);
2218                 REMOVE_DEBUG_TRACE(__LINE__);
2219                 xfs_trans_cancel(tp, 0);
2220                 IRELE(ip);
2221                 return error;
2222         }
2223
2224         error = xfs_lock_dir_and_entry(dp, ip);
2225         if (error) {
2226                 REMOVE_DEBUG_TRACE(__LINE__);
2227                 xfs_trans_cancel(tp, cancel_flags);
2228                 IRELE(ip);
2229                 goto std_return;
2230         }
2231
2232         /*
2233          * At this point, we've gotten both the directory and the entry
2234          * inodes locked.
2235          */
2236         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2237         if (dp != ip) {
2238                 /*
2239                  * Increment vnode ref count only in this case since
2240                  * there's an extra vnode reference in the case where
2241                  * dp == ip.
2242                  */
2243                 IHOLD(dp);
2244                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2245         }
2246
2247         /*
2248          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2249          */
2250         XFS_BMAP_INIT(&free_list, &first_block);
2251         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2252                                         &first_block, &free_list, resblks);
2253         if (error) {
2254                 ASSERT(error != ENOENT);
2255                 REMOVE_DEBUG_TRACE(__LINE__);
2256                 goto error1;
2257         }
2258         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2259
2260         dp->i_gen++;
2261         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2262
2263         error = xfs_droplink(tp, ip);
2264         if (error) {
2265                 REMOVE_DEBUG_TRACE(__LINE__);
2266                 goto error1;
2267         }
2268
2269         /* Determine if this is the last link while
2270          * we are in the transaction.
2271          */
2272         link_zero = (ip)->i_d.di_nlink==0;
2273
2274         /*
2275          * Take an extra ref on the inode so that it doesn't
2276          * go to xfs_inactive() from within the commit.
2277          */
2278         IHOLD(ip);
2279
2280         /*
2281          * If this is a synchronous mount, make sure that the
2282          * remove transaction goes to disk before returning to
2283          * the user.
2284          */
2285         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2286                 xfs_trans_set_sync(tp);
2287         }
2288
2289         error = xfs_bmap_finish(&tp, &free_list, &committed);
2290         if (error) {
2291                 REMOVE_DEBUG_TRACE(__LINE__);
2292                 goto error_rele;
2293         }
2294
2295         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2296         if (error) {
2297                 IRELE(ip);
2298                 goto std_return;
2299         }
2300
2301         /*
2302          * If we are using filestreams, kill the stream association.
2303          * If the file is still open it may get a new one but that
2304          * will get killed on last close in xfs_close() so we don't
2305          * have to worry about that.
2306          */
2307         if (link_zero && xfs_inode_is_filestream(ip))
2308                 xfs_filestream_deassociate(ip);
2309
2310         xfs_itrace_exit(ip);
2311         IRELE(ip);
2312
2313 /*      Fall through to std_return with error = 0 */
2314  std_return:
2315         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2316                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2317                                 dp, DM_RIGHT_NULL,
2318                                 NULL, DM_RIGHT_NULL,
2319                                 name->name, NULL, ip->i_d.di_mode, error, 0);
2320         }
2321         return error;
2322
2323  error1:
2324         xfs_bmap_cancel(&free_list);
2325         cancel_flags |= XFS_TRANS_ABORT;
2326         xfs_trans_cancel(tp, cancel_flags);
2327         goto std_return;
2328
2329  error_rele:
2330         /*
2331          * In this case make sure to not release the inode until after
2332          * the current transaction is aborted.  Releasing it beforehand
2333          * can cause us to go to xfs_inactive and start a recursive
2334          * transaction which can easily deadlock with the current one.
2335          */
2336         xfs_bmap_cancel(&free_list);
2337         cancel_flags |= XFS_TRANS_ABORT;
2338         xfs_trans_cancel(tp, cancel_flags);
2339
2340         IRELE(ip);
2341
2342         goto std_return;
2343 }
2344
/*
 * xfs_link
 *
 * Add an entry called target_name to directory tdp that refers to the
 * existing inode sip, and bump sip's link count, all within a single
 * XFS_TRANS_LINK transaction.
 *
 *      tdp             directory that receives the new entry
 *      sip             inode being linked to; asserted not to be a directory
 *      target_name     name of the new directory entry
 *
 * Returns 0 on success, otherwise an errno value: EIO when the filesystem
 * has been shut down, EMLINK when sip already has XFS_MAXLINK links, EXDEV
 * when tdp has XFS_DIFLAG_PROJINHERIT set and the project IDs differ, or
 * whatever error one of the called helpers reported.
 *
 * Once the DM_EVENT_LINK pre-event has been dealt with, every exit goes
 * through std_return so that the DM_EVENT_POSTLINK event (when enabled on
 * sip) is sent with the final error code.
 */
2345 int
2346 xfs_link(
2347         xfs_inode_t             *tdp,
2348         xfs_inode_t             *sip,
2349         struct xfs_name         *target_name)
2350 {
2351         xfs_mount_t             *mp = tdp->i_mount;
2352         xfs_trans_t             *tp;
2353         xfs_inode_t             *ips[2];
2354         int                     error;
2355         xfs_bmap_free_t         free_list;
2356         xfs_fsblock_t           first_block;
2357         int                     cancel_flags;
2358         int                     committed;
2359         int                     resblks;
2360
2361         xfs_itrace_entry(tdp);
2362         xfs_itrace_entry(sip);
2363
             /* The link source is never expected to be a directory. */
2364         ASSERT(!S_ISDIR(sip->i_d.di_mode));
2365
2366         if (XFS_FORCED_SHUTDOWN(mp))
2367                 return XFS_ERROR(EIO);
2368
             /*
              * If the DM_EVENT_LINK event is enabled on the target directory,
              * send it first.  Any error it reports is returned as-is and
              * nothing else is done (no post-event either).
              */
2369         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2370                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2371                                         tdp, DM_RIGHT_NULL,
2372                                         sip, DM_RIGHT_NULL,
2373                                         target_name->name, NULL, 0, 0, 0);
2374                 if (error)
2375                         return error;
2376         }
2377
2378         /* Return through std_return after this point. */
2379
             /*
              * Get the dquots for both inodes; tdp is skipped when it is the
              * same inode as sip.
              */
2380         error = XFS_QM_DQATTACH(mp, sip, 0);
2381         if (!error && sip != tdp)
2382                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2383         if (error)
2384                 goto std_return;
2385
             /*
              * Ask for a block reservation sized for the new name.  If that
              * fails with ENOSPC, retry with no blocks reserved; resblks == 0
              * is then what gets passed on to the directory code below.
              */
2386         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2387         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2388         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
2389         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2390                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2391         if (error == ENOSPC) {
2392                 resblks = 0;
2393                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2394                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2395         }
2396         if (error) {
                     /*
                      * The reservation itself failed, so the transaction is
                      * cancelled without XFS_TRANS_RELEASE_LOG_RES.
                      */
2397                 cancel_flags = 0;
2398                 goto error_return;
2399         }
2400
             /*
              * Put the inode with the lower inode number first before handing
              * both to xfs_lock_inodes() (presumably so that all callers take
              * the two locks in the same order -- confirm against
              * xfs_lock_inodes()).
              */
2401         if (sip->i_ino < tdp->i_ino) {
2402                 ips[0] = sip;
2403                 ips[1] = tdp;
2404         } else {
2405                 ips[0] = tdp;
2406                 ips[1] = sip;
2407         }
2408
2409         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2410
2411         /*
2412          * Increment vnode ref counts since xfs_trans_commit &
2413          * xfs_trans_cancel will both unlock the inodes and
2414          * decrement the associated ref counts.
2415          */
2416         IHOLD(sip);
2417         IHOLD(tdp);
2418         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2419         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2420
2421         /*
2422          * If the source has too many links, we can't make any more to it.
2423          */
2424         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2425                 error = XFS_ERROR(EMLINK);
2426                 goto error_return;
2427         }
2428
2429         /*
2430          * If we are using project inheritance, we only allow hard link
2431          * creation in our tree when the project IDs are the same; else
2432          * the tree quota mechanism could be circumvented.
2433          */
2434         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2435                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2436                 error = XFS_ERROR(EXDEV);
2437                 goto error_return;
2438         }
2439
             /*
              * Failures up to and including this check are cancelled with the
              * plain cancel_flags (error_return); failures from here on add
              * XFS_TRANS_ABORT (abort_return).
              */
2440         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
2441         if (error)
2442                 goto error_return;
2443
2444         XFS_BMAP_INIT(&free_list, &first_block);
2445
             /* Insert the new name, pointing at sip's inode number. */
2446         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
2447                                         &first_block, &free_list, resblks);
2448         if (error)
2449                 goto abort_return;
             /*
              * The directory changed: update its timestamps, bump its
              * generation count and log the inode core.
              */
2450         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2451         tdp->i_gen++;
2452         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2453
2454         error = xfs_bumplink(tp, sip);
2455         if (error)
2456                 goto abort_return;
2457
2458         /*
2459          * If this is a synchronous mount, make sure that the
2460          * link transaction goes to disk before returning to
2461          * the user.
2462          */
2463         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2464                 xfs_trans_set_sync(tp);
2465         }
2466
2467         error = xfs_bmap_finish (&tp, &free_list, &committed);
2468         if (error) {
2469                 xfs_bmap_cancel(&free_list);
2470                 goto abort_return;
2471         }
2472
             /*
              * A failed commit is not followed by xfs_trans_cancel(); it goes
              * straight to the post-event with the commit's error code.
              */
2473         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2474         if (error)
2475                 goto std_return;
2476
2477         /* Fall through to std_return with error = 0. */
2478 std_return:
             /*
              * Report the outcome to DMAPI.  The return value of the
              * post-event is deliberately ignored.
              */
2479         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2480                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2481                                 tdp, DM_RIGHT_NULL,
2482                                 sip, DM_RIGHT_NULL,
2483                                 target_name->name, NULL, 0, error, 0);
2484         }
2485         return error;
2486
2487  abort_return:
2488         cancel_flags |= XFS_TRANS_ABORT;
2489         /* FALLTHROUGH */
2490
2491  error_return:
2492         xfs_trans_cancel(tp, cancel_flags);
2493         goto std_return;
2494 }
2495
2496
/*
 * xfs_mkdir
 *
 * Create a new directory called dir_name inside directory dp, within a
 * single XFS_TRANS_MKDIR transaction.
 *
 *      dp              parent directory
 *      dir_name        name of the new directory entry
 *      mode            mode handed to xfs_dir_ialloc() for the new inode and
 *                      reported in the DMAPI events
 *      ipp             set to the new directory inode once it has been fully
 *                      set up; an extra reference is taken on it with IHOLD()
 *      credp           credentials used for the dquot lookup and for the
 *                      inode allocation
 *
 * Returns 0 on success, otherwise an errno value: EIO when the filesystem
 * has been shut down, EMLINK when dp already has XFS_MAXLINK links, or
 * whatever error one of the called helpers reported.
 *
 * Note that *ipp is assigned before xfs_bmap_finish() and xfs_trans_commit()
 * run.  If either of them fails, the IHOLD() reference is dropped again with
 * IRELE() but *ipp is left as it is.
 *
 * Once the DM_EVENT_CREATE pre-event has been dealt with, every exit goes
 * through std_return, which sends DM_EVENT_POSTCREATE when appropriate.
 */
2497 int
2498 xfs_mkdir(
2499         xfs_inode_t             *dp,
2500         struct xfs_name         *dir_name,
2501         mode_t                  mode,
2502         xfs_inode_t             **ipp,
2503         cred_t                  *credp)
2504 {
2505         xfs_mount_t             *mp = dp->i_mount;
2506         xfs_inode_t             *cdp;   /* inode of created dir */
2507         xfs_trans_t             *tp;
2508         int                     cancel_flags;
2509         int                     error;
2510         int                     committed;
2511         xfs_bmap_free_t         free_list;
2512         xfs_fsblock_t           first_block;
2513         boolean_t               unlock_dp_on_error = B_FALSE;
2514         boolean_t               created = B_FALSE;
2515         int                     dm_event_sent = 0;
2516         xfs_prid_t              prid;
2517         struct xfs_dquot        *udqp, *gdqp;
2518         uint                    resblks;
2519
2520         if (XFS_FORCED_SHUTDOWN(mp))
2521                 return XFS_ERROR(EIO);
2522
2523         tp = NULL;
2524
             /*
              * If the DM_EVENT_CREATE event is enabled on the parent, send it
              * first.  An error is returned as-is; otherwise remember that the
              * pre-event went out so a later failure can be reported too.
              */
2525         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2526                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2527                                         dp, DM_RIGHT_NULL, NULL,
2528                                         DM_RIGHT_NULL, dir_name->name, NULL,
2529                                         mode, 0, 0);
2530                 if (error)
2531                         return error;
2532                 dm_event_sent = 1;
2533         }
2534
2535         /* Return through std_return after this point. */
2536
2537         xfs_itrace_entry(dp);
2538
2540         udqp = gdqp = NULL;
             /*
              * Use the parent's project ID when it has XFS_DIFLAG_PROJINHERIT
              * set, otherwise the default project ID.
              */
2541         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2542                 prid = dp->i_d.di_projid;
2543         else
2544                 prid = (xfs_prid_t)dfltprid;
2545
2546         /*
2547          * Make sure that we have allocated dquot(s) on disk.
2548          */
2549         error = XFS_QM_DQVOPALLOC(mp, dp,
2550                         current_fsuid(credp), current_fsgid(credp), prid,
2551                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2552         if (error)
2553                 goto std_return;
2554
             /*
              * Ask for the full block reservation first.  If that fails with
              * ENOSPC, retry with no blocks reserved; resblks == 0 is then
              * what the allocation and directory code below are told.
              */
2555         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2556         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2557         resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
2558         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2559                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2560         if (error == ENOSPC) {
2561                 resblks = 0;
2562                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2563                                           XFS_TRANS_PERM_LOG_RES,
2564                                           XFS_MKDIR_LOG_COUNT);
2565         }
2566         if (error) {
                     /*
                      * The reservation itself failed, so the transaction is
                      * cancelled without XFS_TRANS_RELEASE_LOG_RES.
                      */
2567                 cancel_flags = 0;
2568                 goto error_return;
2569         }
2570
             /*
              * dp is locked here but only joined to the transaction further
              * down, so until then the error path has to unlock it by hand.
              */
2571         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2572         unlock_dp_on_error = B_TRUE;
2573
2574         /*
2575          * Check for directory link count overflow.
2576          */
2577         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2578                 error = XFS_ERROR(EMLINK);
2579                 goto error_return;
2580         }
2581
2582         /*
2583          * Reserve disk quota and the inode.
2584          */
2585         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2586         if (error)
2587                 goto error_return;
2588
2589         error = xfs_dir_canenter(tp, dp, dir_name, resblks);
2590         if (error)
2591                 goto error_return;
2592         /*
2593          * create the directory inode.
2594          */
2595         error = xfs_dir_ialloc(&tp, dp, mode, 2,
2596                         0, credp, prid, resblks > 0,
2597                         &cdp, NULL);
2598         if (error) {
                     /*
                      * ENOSPC is cancelled with the plain cancel_flags; any
                      * other failure additionally sets XFS_TRANS_ABORT.
                      */
2599                 if (error == ENOSPC)
2600                         goto error_return;
2601                 goto abort_return;
2602         }
2603         xfs_itrace_ref(cdp);
2604
2605         /*
2606          * Now we add the directory inode to the transaction.
2607          * We waited until now since xfs_dir_ialloc might start
2608          * a new transaction.  Had we joined the transaction
2609          * earlier, the locks might have gotten released. An error
2610          * from here on will result in the transaction cancel
2611          * unlocking dp so don't do it explicitly in the error path.
2612          */
2613         IHOLD(dp);
2614         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2615         unlock_dp_on_error = B_FALSE;
2616
2617         XFS_BMAP_INIT(&free_list, &first_block);
2618
             /*
              * Enter the new name in the parent.  The directory code is given
              * what remains of the block reservation after subtracting
              * XFS_IALLOC_SPACE_RES(), or 0 when nothing was reserved.
              */
2619         error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2620                                         &first_block, &free_list, resblks ?
2621                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2622         if (error) {
2623                 ASSERT(error != ENOSPC);
2624                 goto error1;
2625         }
2626         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2627
2628         /*
2629          * Bump the in memory version number of the parent directory
2630          * so that other processes accessing it will recognize that
2631          * the directory has changed.
2632          */
2633         dp->i_gen++;
2634
             /* Initialise the new directory cdp with dp as its parent. */
2635         error = xfs_dir_init(tp, cdp, dp);
2636         if (error)
2637                 goto error2;
2638
2639         cdp->i_gen = 1;
             /* The parent gains a link as well. */
2640         error = xfs_bumplink(tp, dp);
2641         if (error)
2642                 goto error2;
2643
2644         created = B_TRUE;
2645
             /*
              * Hand the inode back with its own reference.  That reference is
              * dropped again below if the bmap finish or the commit fails.
              */
2646         *ipp = cdp;
2647         IHOLD(cdp);
2648
2649         /*
2650          * Attach the dquots to the new inode and modify the icount incore.
2651          */
2652         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2653
2654         /*
2655          * If this is a synchronous mount, make sure that the
2656          * mkdir transaction goes to disk before returning to
2657          * the user.
2658          */
2659         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2660                 xfs_trans_set_sync(tp);
2661         }
2662
2663         error = xfs_bmap_finish(&tp, &free_list, &committed);
2664         if (error) {
2665                 IRELE(cdp);
2666                 goto error2;
2667         }
2668
2669         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2670         XFS_QM_DQRELE(mp, udqp);
2671         XFS_QM_DQRELE(mp, gdqp);
2672         if (error) {
2673                 IRELE(cdp);
2674         }
2675
2676         /* Fall through to std_return with error = 0 or errno from
2677          * xfs_trans_commit. */
2678
2679 std_return:
             /*
              * Send DM_EVENT_POSTCREATE (if enabled on dp) when the directory
              * was set up, or when something failed after the pre-event had
              * been sent.  The new inode is only passed along when "created"
              * is set.  The return value of the post-event is ignored.
              */
2680         if ((created || (error != 0 && dm_event_sent != 0)) &&
2681             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2682                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2683                                         dp, DM_RIGHT_NULL,
2684                                         created ? cdp : NULL,
2685                                         DM_RIGHT_NULL,
2686                                         dir_name->name, NULL,
2687                                         mode, error, 0);
2688         }
2689         return error;
2690
             /*
              * Error unwinding, from most to least work to undo:
              *   error1/error2  cancel the bmap free list, then abort
              *   abort_return   add XFS_TRANS_ABORT to the cancel flags
              *   error_return   cancel the transaction and release the dquots
              */
2691  error2:
2692  error1:
2693         xfs_bmap_cancel(&free_list);
2694  abort_return:
2695         cancel_flags |= XFS_TRANS_ABORT;
2696  error_return:
2697         xfs_trans_cancel(tp, cancel_flags);
2698         XFS_QM_DQRELE(mp, udqp);
2699         XFS_QM_DQRELE(mp, gdqp);
2700
             /* Only set while dp is locked but not yet joined to tp. */
2701         if (unlock_dp_on_error)
2702                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2703
2704         goto std_return;
2705 }
2706
/*
 * xfs_rmdir
 *
 * Remove the directory cdp, whose entry in the parent directory dp is
 * called name, within a single XFS_TRANS_RMDIR transaction.
 *
 *      dp              parent directory
 *      name            name of cdp's entry in dp
 *      cdp             directory being removed
 *
 * Returns 0 on success, otherwise an errno value: EIO when the filesystem
 * has been shut down, ENOTEMPTY when cdp's link count is not 2 or
 * xfs_dir_isempty() says it still has entries, or whatever error one of
 * the called helpers reported.
 *
 * Once the DM_EVENT_REMOVE pre-event has been dealt with, every exit goes
 * through std_return so that the DM_EVENT_POSTREMOVE event (when enabled on
 * dp) is sent with the final error code.
 */
2707 int
2708 xfs_rmdir(
2709         xfs_inode_t             *dp,
2710         struct xfs_name         *name,
2711         xfs_inode_t             *cdp)
2712 {
2713         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2714         xfs_mount_t             *mp = dp->i_mount;
2715         xfs_trans_t             *tp;
2716         int                     error;
2717         xfs_bmap_free_t         free_list;
2718         xfs_fsblock_t           first_block;
2719         int                     cancel_flags;
2720         int                     committed;
2722         uint                    resblks;
2723
2724         xfs_itrace_entry(dp);
2725
2726         if (XFS_FORCED_SHUTDOWN(mp))
2727                 return XFS_ERROR(EIO);
2728
             /*
              * If the DM_EVENT_REMOVE event is enabled on the parent, send it
              * first.  Any error it reports is returned and nothing else is
              * done (no post-event either).
              */
2729         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2730                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
2731                                         dp, DM_RIGHT_NULL,
2732                                         NULL, DM_RIGHT_NULL, name->name,
2733                                         NULL, cdp->i_d.di_mode, 0, 0);
2734                 if (error)
2735                         return XFS_ERROR(error);
2736         }
2737
2738         /*
2739          * We need to get a reference to cdp before we get our log
2740          * reservation.  The reason for this is that we cannot call
2741          * xfs_iget for an inode for which we do not have a reference
2742          * once we've acquired a log reservation.  This is because the
2743          * inode we are trying to get might be in xfs_inactive going
2744          * for a log reservation.  Since we'll have to wait for the
2745          * inactive code to complete before returning from xfs_iget,
2746          * we need to make sure that we don't have log space reserved
2747          * when we call xfs_iget.  Instead we get an unlocked reference
2748          * to the inode before getting our log reservation.
2749          */
2750         IHOLD(cdp);
2751
2752         /*
2753          * Get the dquots for the inodes.
2754          */
2755         error = XFS_QM_DQATTACH(mp, dp, 0);
2756         if (!error && dp != cdp)
2757                 error = XFS_QM_DQATTACH(mp, cdp, 0);
2758         if (error) {
                     /* Drop the reference taken just above. */
2759                 IRELE(cdp);
2760                 REMOVE_DEBUG_TRACE(__LINE__);
2761                 goto std_return;
2762         }
2763
2764         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2765         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2766         /*
2767          * We try to get the real space reservation first,
2768          * allowing for directory btree deletion(s) implying
2769          * possible bmap insert(s).  If we can't get the space
2770          * reservation then we use 0 instead, and avoid the bmap
2771          * btree insert(s) in the directory code by, if the bmap
2772          * insert tries to happen, instead trimming the LAST
2773          * block from the directory.
2774          */
2775         resblks = XFS_REMOVE_SPACE_RES(mp);
2776         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2777                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2778         if (error == ENOSPC) {
2779                 resblks = 0;
2780                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2781                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2782         }
2783         if (error) {
                     /*
                      * The reservation itself failed, so the transaction is
                      * cancelled without XFS_TRANS_RELEASE_LOG_RES.  cdp has
                      * not been joined to the transaction yet, so its
                      * reference is dropped by hand.
                      */
2784                 ASSERT(error != ENOSPC);
2785                 cancel_flags = 0;
2786                 IRELE(cdp);
2787                 goto error_return;
2788         }
2789         XFS_BMAP_INIT(&free_list, &first_block);
2790
2791         /*
2792          * Now lock the child directory inode and the parent directory
2793          * inode in the proper order.  This will take care of validating
2794          * that the directory entry for the child directory inode has
2795          * not changed while we were obtaining a log reservation.
2796          */
2797         error = xfs_lock_dir_and_entry(dp, cdp);
2798         if (error) {
2799                 xfs_trans_cancel(tp, cancel_flags);
2800                 IRELE(cdp);
2801                 goto std_return;
2802         }
2803
2804         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2805         if (dp != cdp) {
2806                 /*
2807                  * Only increment the parent directory vnode count if
2808                  * we didn't bump it in looking up cdp.  The only time
2809                  * we don't bump it is when we're looking up ".".
2810                  */
2811                 VN_HOLD(dir_vp);
2812         }
2813
2814         xfs_itrace_ref(cdp);
2815         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
2816
             /*
              * Both inodes are now joined to the transaction, so from here on
              * the error paths only cancel the transaction and do not IRELE()
              * the reference taken at the top.
              * NOTE(review): this relies on the cancel/commit dropping the
              * references of joined inodes -- confirm.
              *
              * A removable directory has exactly two links, the ones dropped
              * from cdp further down (the entry in dp and its own ".").
              * Anything else is reported as ENOTEMPTY.
              */
2817         ASSERT(cdp->i_d.di_nlink >= 2);
2818         if (cdp->i_d.di_nlink != 2) {
2819                 error = XFS_ERROR(ENOTEMPTY);
2820                 goto error_return;
2821         }
2822         if (!xfs_dir_isempty(cdp)) {
2823                 error = XFS_ERROR(ENOTEMPTY);
2824                 goto error_return;
2825         }
2826
             /*
              * Failures from here on go through error1, which cancels the
              * bmap free list and adds XFS_TRANS_ABORT to the cancel flags.
              */
2827         error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
2828                                         &first_block, &free_list, resblks);
2829         if (error)
2830                 goto error1;
2831
2832         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2833
2834         /*
2835          * Bump the in memory generation count on the parent
2836          * directory so that others can know that it has changed.
2837          */
2838         dp->i_gen++;
2839
2840         /*
2841          * Drop the link from cdp's "..".
2842          */
2843         error = xfs_droplink(tp, dp);
2844         if (error) {
2845                 goto error1;
2846         }
2847
2848         /*
2849          * Drop the link from dp to cdp.
2850          */
2851         error = xfs_droplink(tp, cdp);
2852         if (error) {
2853                 goto error1;
2854         }
2855
2856         /*
2857          * Drop the "." link from cdp to self.
2858          */
2859         error = xfs_droplink(tp, cdp);
2860         if (error) {
2861                 goto error1;
2862         }
2863
2867         /*
2868          * Take an extra ref on the child vnode so that it
2869          * does not go to xfs_inactive() from within the commit.
2870          */
2871         IHOLD(cdp);
2872
2873         /*
2874          * If this is a synchronous mount, make sure that the
2875          * rmdir transaction goes to disk before returning to
2876          * the user.
2877          */
2878         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2879                 xfs_trans_set_sync(tp);
2880         }
2881
             /*
              * This failure is unwound inline rather than through error1 so
              * that the extra reference taken just above is released only
              * after the transaction has been cancelled.
              */
2882         error = xfs_bmap_finish (&tp, &free_list, &committed);
2883         if (error) {
2884                 xfs_bmap_cancel(&free_list);
2885                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
2886                                  XFS_TRANS_ABORT));
2887                 IRELE(cdp);
2888                 goto std_return;
2889         }
2890
2891         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2892         if (error) {
2893                 IRELE(cdp);
2894                 goto std_return;
2895         }
2896
2897
             /* Release the extra reference taken before the commit. */
2898         IRELE(cdp);
2899
2900         /* Fall through to std_return with error = 0. */
2902  std_return:
             /*
              * Report the outcome to DMAPI.  The return value of the
              * post-event is deliberately ignored.
              */
2903         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2904                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2905                                         dp, DM_RIGHT_NULL,
2906                                         NULL, DM_RIGHT_NULL,
2907                                         name->name, NULL, cdp->i_d.di_mode,
2908                                         error, 0);
2909         }
2910         return error;
2911
2912  error1:
2913         xfs_bmap_cancel(&free_list);
2914         cancel_flags |= XFS_TRANS_ABORT;
2915         /* FALLTHROUGH */
2916
2917  error_return:
2918         xfs_trans_cancel(tp, cancel_flags);
2919         goto std_return;
2920 }
2921
2922 int
2923 xfs_symlink(
2924         xfs_inode_t             *dp,
2925         struct xfs_name         *link_name,
2926         const char              *target_path,
2927         mode_t                  mode,
2928         xfs_inode_t             **ipp,
2929         cred_t                  *credp)
2930 {
2931         xfs_mount_t             *mp = dp->i_mount;
2932         xfs_trans_t             *tp;
2933         xfs_inode_t             *ip;
2934         int                     error;
2935         int                     pathlen;
2936         xfs_bmap_free_t         free_list;
2937         xfs_fsblock_t           first_block;
2938         boolean_t               unlock_dp_on_error = B_FALSE;
2939         uint                    cancel_flags;
2940         int                     committed;
2941         xfs_fileoff_t           first_fsb;
2942         xfs_filblks_t           fs_blocks;
2943         int                     nmaps;
2944         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
2945         xfs_daddr_t             d;
2946         const char              *cur_chunk;
2947         int                     byte_cnt;
2948         int                     n;
2949         xfs_buf_t               *bp;
2950         xfs_prid_t              prid;
2951         struct xfs_dquot        *udqp, *gdqp;
2952         uint                    resblks;
2953
2954         *ipp = NULL;
2955         error = 0;
2956         ip = NULL;
2957         tp = NULL;
2958
2959         xfs_itrace_entry(dp);
2960
2961         if (XFS_FORCED_SHUTDOWN(mp))
2962                 return XFS_ERROR(EIO);
2963
2964         /*
2965          * Check component lengths of the target path name.
2966          */
2967         pathlen = strlen(target_path);
2968         if (pathlen >= MAXPATHLEN)      /* total string too long */
2969                 return XFS_ERROR(ENAMETOOLONG);
2970
2971         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2972                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2973                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2974                                         link_name->name, target_path, 0, 0, 0);
2975                 if (error)
2976                         return error;
2977         }
2978
2979         /* Return through std_return after this point. */
2980
2981         udqp = gdqp = NULL;
2982         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2983                 prid = dp->i_d.di_projid;
2984         else
2985                 prid = (xfs_prid_t)dfltprid;
2986
2987         /*
2988          * Make sure that we have allocated dquot(s) on disk.
2989          */
2990         error = XFS_QM_DQVOPALLOC(mp, dp,
2991                         current_fsuid(credp), current_fsgid(credp), prid,
2992                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2993         if (error)
2994                 goto std_return;
2995
2996         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
2997         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2998         /*
2999          * The symlink will fit into the inode data fork?
3000          * There can't be any attributes so we get the whole variable part.
3001          */
3002         if (pathlen <= XFS_LITINO(mp))
3003                 fs_blocks = 0;
3004         else
3005                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3006         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
3007         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3008                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3009         if (error == ENOSPC && fs_blocks == 0) {
3010                 resblks = 0;
3011                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3012                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3013         }
3014         if (error) {
3015                 cancel_flags = 0;
3016                 goto error_return;
3017         }
3018
3019         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3020         unlock_dp_on_error = B_TRUE;
3021
3022         /*
3023          * Check whether the directory allows new symlinks or not.
3024          */
3025         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3026                 error = XFS_ERROR(EPERM);
3027                 goto error_return;
3028         }
3029
3030         /*
3031          * Reserve disk quota : blocks and inode.
3032          */
3033         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3034         if (error)
3035                 goto error_return;
3036
3037         /*
3038          * Check for ability to enter directory entry, if no space reserved.
3039          */
3040         error = xfs_dir_canenter(tp, dp, link_name, resblks);
3041         if (error)
3042                 goto error_return;
3043         /*
3044          * Initialize the bmap freelist prior to calling either
3045          * bmapi or the directory create code.
3046          */
3047         XFS_BMAP_INIT(&free_list, &first_block);
3048
3049         /*
3050          * Allocate an inode for the symlink.
3051          */
3052         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
3053                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3054         if (error) {
3055                 if (error == ENOSPC)
3056                         goto error_return;
3057                 goto error1;
3058         }
3059         xfs_itrace_ref(ip);
3060
3061         /*
3062          * An error after we've joined dp to the transaction will result in the
3063          * transaction cancel unlocking dp so don't do it explicitly in the
3064          * error path.
3065          */
3066         IHOLD(dp);
3067         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3068         unlock_dp_on_error = B_FALSE;
3069
3070         /*
3071          * Also attach the dquot(s) to it, if applicable.
3072          */
3073         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3074
3075         if (resblks)
3076                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3077         /*
3078          * If the symlink will fit into the inode, write it inline.
3079          */
3080         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3081                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3082                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3083                 ip->i_d.di_size = pathlen;
3084
3085                 /*
3086                  * The inode was initially created in extent format.
3087                  */
3088                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3089                 ip->i_df.if_flags |= XFS_IFINLINE;
3090
3091                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3092                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3093
3094         } else {
3095                 first_fsb = 0;
3096                 nmaps = SYMLINK_MAPS;
3097
3098                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3099                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3100                                   &first_block, resblks, mval, &nmaps,
3101                                   &free_list, NULL);
3102                 if (error) {
3103                         goto error1;
3104                 }
3105
3106                 if (resblks)
3107                         resblks -= fs_blocks;
3108                 ip->i_d.di_size = pathlen;
3109                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3110
3111                 cur_chunk = target_path;
3112                 for (n = 0; n < nmaps; n++) {
3113                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3114                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3115                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3116                                                BTOBB(byte_cnt), 0);
3117                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3118                         if (pathlen < byte_cnt) {
3119                                 byte_cnt = pathlen;
3120                         }
3121                         pathlen -= byte_cnt;
3122
3123                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3124                         cur_chunk += byte_cnt;
3125
3126                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3127                 }
3128         }
3129
3130         /*
3131          * Create the directory entry for the symlink.
3132          */
3133         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
3134                                         &first_block, &free_list, resblks);
3135         if (error)
3136                 goto error1;
3137         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3138         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3139
3140         /*
3141          * Bump the in memory version number of the parent directory
3142          * so that other processes accessing it will recognize that
3143          * the directory has changed.
3144          */
3145         dp->i_gen++;
3146
3147         /*
3148          * If this is a synchronous mount, make sure that the
3149          * symlink transaction goes to disk before returning to
3150          * the user.
3151          */
3152         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3153                 xfs_trans_set_sync(tp);
3154         }
3155
3156         /*
3157          * xfs_trans_commit normally decrements the vnode ref count
3158          * when it unlocks the inode. Since we want to return the
3159          * vnode to the caller, we bump the vnode ref count now.
3160          */
3161         IHOLD(ip);
3162
3163         error = xfs_bmap_finish(&tp, &free_list, &committed);
3164         if (error) {
3165                 goto error2;
3166         }
3167         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3168         XFS_QM_DQRELE(mp, udqp);
3169         XFS_QM_DQRELE(mp, gdqp);
3170
3171         /* Fall through to std_return with error = 0 or errno from
3172          * xfs_trans_commit     */
3173 std_return:
3174         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
3175                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3176                                         dp, DM_RIGHT_NULL,
3177                                         error ? NULL : ip,
3178                                         DM_RIGHT_NULL, link_name->name,
3179                                         target_path, 0, error, 0);
3180         }
3181
3182         if (!error)
3183                 *ipp = ip;
3184         return error;
3185
3186  error2:
3187         IRELE(ip);
3188  error1:
3189         xfs_bmap_cancel(&free_list);
3190         cancel_flags |= XFS_TRANS_ABORT;
3191  error_return:
3192         xfs_trans_cancel(tp, cancel_flags);
3193         XFS_QM_DQRELE(mp, udqp);
3194         XFS_QM_DQRELE(mp, gdqp);
3195
3196         if (unlock_dp_on_error)
3197                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3198
3199         goto std_return;
3200 }
3201
3202 int
3203 xfs_inode_flush(
3204         xfs_inode_t     *ip,
3205         int             flags)
3206 {
3207         xfs_mount_t     *mp = ip->i_mount;
3208         int             error = 0;
3209
3210         if (XFS_FORCED_SHUTDOWN(mp))
3211                 return XFS_ERROR(EIO);
3212
3213         /*
3214          * Bypass inodes which have already been cleaned by
3215          * the inode flush clustering code inside xfs_iflush
3216          */
3217         if (xfs_inode_clean(ip))
3218                 return 0;
3219
3220         /*
3221          * We make this non-blocking if the inode is contended,
3222          * return EAGAIN to indicate to the caller that they
3223          * did not succeed. This prevents the flush path from
3224          * blocking on inodes inside another operation right
3225          * now, they get caught later by xfs_sync.
3226          */
3227         if (flags & FLUSH_SYNC) {
3228                 xfs_ilock(ip, XFS_ILOCK_SHARED);
3229                 xfs_iflock(ip);
3230         } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3231                 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3232                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3233                         return EAGAIN;
3234                 }
3235         } else {
3236                 return EAGAIN;
3237         }
3238
3239         error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
3240                                                     : XFS_IFLUSH_ASYNC_NOBLOCK);
3241         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3242
3243         return error;
3244 }
3245
3246
3247 int
3248 xfs_set_dmattrs(
3249         xfs_inode_t     *ip,
3250         u_int           evmask,
3251         u_int16_t       state)
3252 {
3253         xfs_mount_t     *mp = ip->i_mount;
3254         xfs_trans_t     *tp;
3255         int             error;
3256
3257         if (!capable(CAP_SYS_ADMIN))
3258                 return XFS_ERROR(EPERM);
3259
3260         if (XFS_FORCED_SHUTDOWN(mp))
3261                 return XFS_ERROR(EIO);
3262
3263         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3264         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3265         if (error) {
3266                 xfs_trans_cancel(tp, 0);
3267                 return error;
3268         }
3269         xfs_ilock(ip, XFS_ILOCK_EXCL);
3270         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3271
3272         ip->i_d.di_dmevmask = evmask;
3273         ip->i_d.di_dmstate  = state;
3274
3275         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3276         IHOLD(ip);
3277         error = xfs_trans_commit(tp, 0);
3278
3279         return error;
3280 }
3281
3282 int
3283 xfs_reclaim(
3284         xfs_inode_t     *ip)
3285 {
3286         bhv_vnode_t     *vp = XFS_ITOV(ip);
3287
3288         xfs_itrace_entry(ip);
3289
3290         ASSERT(!VN_MAPPED(vp));
3291
3292         /* bad inode, get out here ASAP */
3293         if (VN_BAD(vp)) {
3294                 xfs_ireclaim(ip);
3295                 return 0;
3296         }
3297
3298         vn_iowait(ip);
3299
3300         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3301
3302         /*
3303          * Make sure the atime in the XFS inode is correct before freeing the
3304          * Linux inode.
3305          */
3306         xfs_synchronize_atime(ip);
3307
3308         /*
3309          * If we have nothing to flush with this inode then complete the
3310          * teardown now, otherwise break the link between the xfs inode and the
3311          * linux inode and clean up the xfs inode later. This avoids flushing
3312          * the inode to disk during the delete operation itself.
3313          *
3314          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3315          * first to ensure that xfs_iunpin() will never see an xfs inode
3316          * that has a linux inode being reclaimed. Synchronisation is provided
3317          * by the i_flags_lock.
3318          */
3319         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3320                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3321                 xfs_iflock(ip);
3322                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3323         } else {
3324                 xfs_mount_t     *mp = ip->i_mount;
3325
3326                 /* Protect sync and unpin from us */
3327                 XFS_MOUNT_ILOCK(mp);
3328                 spin_lock(&ip->i_flags_lock);
3329                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3330                 vn_to_inode(vp)->i_private = NULL;
3331                 ip->i_vnode = NULL;
3332                 spin_unlock(&ip->i_flags_lock);
3333                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3334                 XFS_MOUNT_IUNLOCK(mp);
3335         }
3336         return 0;
3337 }
3338
3339 int
3340 xfs_finish_reclaim(
3341         xfs_inode_t     *ip,
3342         int             locked,
3343         int             sync_mode)
3344 {
3345         xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
3346         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3347         int             error;
3348
3349         if (vp && VN_BAD(vp))
3350                 goto reclaim;
3351
3352         /* The hash lock here protects a thread in xfs_iget_core from
3353          * racing with us on linking the inode back with a vnode.
3354          * Once we have the XFS_IRECLAIM flag set it will not touch
3355          * us.
3356          */
3357         write_lock(&pag->pag_ici_lock);
3358         spin_lock(&ip->i_flags_lock);
3359         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3360             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3361                 spin_unlock(&ip->i_flags_lock);
3362                 write_unlock(&pag->pag_ici_lock);
3363                 if (locked) {
3364                         xfs_ifunlock(ip);
3365                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3366                 }
3367                 return 1;
3368         }
3369         __xfs_iflags_set(ip, XFS_IRECLAIM);
3370         spin_unlock(&ip->i_flags_lock);
3371         write_unlock(&pag->pag_ici_lock);
3372         xfs_put_perag(ip->i_mount, pag);
3373
3374         /*
3375          * If the inode is still dirty, then flush it out.  If the inode
3376          * is not in the AIL, then it will be OK to flush it delwri as
3377          * long as xfs_iflush() does not keep any references to the inode.
3378          * We leave that decision up to xfs_iflush() since it has the
3379          * knowledge of whether it's OK to simply do a delwri flush of
3380          * the inode or whether we need to wait until the inode is
3381          * pulled from the AIL.
3382          * We get the flush lock regardless, though, just to make sure
3383          * we don't free it while it is being flushed.
3384          */
3385         if (!locked) {
3386                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3387                 xfs_iflock(ip);
3388         }
3389
3390         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3391                 if (ip->i_update_core ||
3392                     ((ip->i_itemp != NULL) &&
3393                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3394                         error = xfs_iflush(ip, sync_mode);
3395                         /*
3396                          * If we hit an error, typically because of filesystem
3397                          * shutdown, we don't need to let vn_reclaim to know
3398                          * because we're gonna reclaim the inode anyway.
3399                          */
3400                         if (error) {
3401                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3402                                 goto reclaim;
3403                         }
3404                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3405                 }
3406
3407                 ASSERT(ip->i_update_core == 0);
3408                 ASSERT(ip->i_itemp == NULL ||
3409                        ip->i_itemp->ili_format.ilf_fields == 0);
3410         }
3411
3412         xfs_ifunlock(ip);
3413         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3414
3415  reclaim:
3416         xfs_ireclaim(ip);
3417         return 0;
3418 }
3419
3420 int
3421 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3422 {
3423         int             purged;
3424         xfs_inode_t     *ip, *n;
3425         int             done = 0;
3426
3427         while (!done) {
3428                 purged = 0;
3429                 XFS_MOUNT_ILOCK(mp);
3430                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3431                         if (noblock) {
3432                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3433                                         continue;
3434                                 if (xfs_ipincount(ip) ||
3435                                     !xfs_iflock_nowait(ip)) {
3436                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3437                                         continue;
3438                                 }
3439                         }
3440                         XFS_MOUNT_IUNLOCK(mp);
3441                         if (xfs_finish_reclaim(ip, noblock,
3442                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3443                                 delay(1);
3444                         purged = 1;
3445                         break;
3446                 }
3447
3448                 done = !purged;
3449         }
3450
3451         XFS_MOUNT_IUNLOCK(mp);
3452         return 0;
3453 }
3454
3455 /*
3456  * xfs_alloc_file_space()
3457  *      This routine allocates disk space for the given file.
3458  *
3459  *      If alloc_type == 0, this request is for an ALLOCSP type
3460  *      request which will change the file size.  In this case, no
3461  *      DMAPI event will be generated by the call.  A TRUNCATE event
3462  *      will be generated later by xfs_setattr.
3463  *
3464  *      If alloc_type != 0, this request is for a RESVSP type
3465  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3466  *      lower block boundary byte address is less than the file's
3467  *      length.
3468  *
3469  * RETURNS:
3470  *       0 on success
3471  *      errno on error
3472  *
3473  */
3474 STATIC int
3475 xfs_alloc_file_space(
3476         xfs_inode_t             *ip,
3477         xfs_off_t               offset,
3478         xfs_off_t               len,
3479         int                     alloc_type,
3480         int                     attr_flags)
3481 {
3482         xfs_mount_t             *mp = ip->i_mount;
3483         xfs_off_t               count;
3484         xfs_filblks_t           allocated_fsb;
3485         xfs_filblks_t           allocatesize_fsb;
3486         xfs_extlen_t            extsz, temp;
3487         xfs_fileoff_t           startoffset_fsb;
3488         xfs_fsblock_t           firstfsb;
3489         int                     nimaps;
3490         int                     bmapi_flag;
3491         int                     quota_flag;
3492         int                     rt;
3493         xfs_trans_t             *tp;
3494         xfs_bmbt_irec_t         imaps[1], *imapp;
3495         xfs_bmap_free_t         free_list;
3496         uint                    qblocks, resblks, resrtextents;
3497         int                     committed;
3498         int                     error;
3499
3500         xfs_itrace_entry(ip);
3501
3502         if (XFS_FORCED_SHUTDOWN(mp))
3503                 return XFS_ERROR(EIO);
3504
3505         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3506                 return error;
3507
3508         if (len <= 0)
3509                 return XFS_ERROR(EINVAL);
3510
3511         rt = XFS_IS_REALTIME_INODE(ip);
3512         extsz = xfs_get_extsz_hint(ip);
3513
3514         count = len;
3515         imapp = &imaps[0];
3516         nimaps = 1;
3517         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
3518         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
3519         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
3520
3521         /*      Generate a DMAPI event if needed.       */
3522         if (alloc_type != 0 && offset < ip->i_size &&
3523                         (attr_flags&ATTR_DMI) == 0  &&
3524                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3525                 xfs_off_t           end_dmi_offset;
3526
3527                 end_dmi_offset = offset+len;
3528                 if (end_dmi_offset > ip->i_size)
3529                         end_dmi_offset = ip->i_size;
3530                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
3531                                       end_dmi_offset - offset, 0, NULL);
3532                 if (error)
3533                         return error;
3534         }
3535
3536         /*
3537          * Allocate file space until done or until there is an error
3538          */
3539 retry:
3540         while (allocatesize_fsb && !error) {
3541                 xfs_fileoff_t   s, e;
3542
3543                 /*
3544                  * Determine space reservations for data/realtime.
3545                  */
3546                 if (unlikely(extsz)) {
3547                         s = startoffset_fsb;
3548                         do_div(s, extsz);
3549                         s *= extsz;
3550                         e = startoffset_fsb + allocatesize_fsb;
3551                         if ((temp = do_mod(startoffset_fsb, extsz)))
3552                                 e += temp;
3553                         if ((temp = do_mod(e, extsz)))
3554                                 e += extsz - temp;
3555                 } else {
3556                         s = 0;
3557                         e = allocatesize_fsb;
3558                 }
3559
3560                 if (unlikely(rt)) {
3561                         resrtextents = qblocks = (uint)(e - s);
3562                         resrtextents /= mp->m_sb.sb_rextsize;
3563                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3564                         quota_flag = XFS_QMOPT_RES_RTBLKS;
3565                 } else {
3566                         resrtextents = 0;
3567                         resblks = qblocks = \
3568                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
3569                         quota_flag = XFS_QMOPT_RES_REGBLKS;
3570                 }
3571
3572                 /*
3573                  * Allocate and setup the transaction.
3574                  */
3575                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3576                 error = xfs_trans_reserve(tp, resblks,
3577                                           XFS_WRITE_LOG_RES(mp), resrtextents,
3578                                           XFS_TRANS_PERM_LOG_RES,
3579                                           XFS_WRITE_LOG_COUNT);
3580                 /*
3581                  * Check for running out of space
3582                  */
3583                 if (error) {
3584                         /*
3585                          * Free the transaction structure.
3586                          */
3587                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3588                         xfs_trans_cancel(tp, 0);
3589                         break;
3590                 }
3591                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3592                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
3593                                                       qblocks, 0, quota_flag);
3594                 if (error)
3595                         goto error1;
3596
3597                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3598                 xfs_trans_ihold(tp, ip);
3599
3600                 /*
3601                  * Issue the xfs_bmapi() call to allocate the blocks
3602                  */
3603                 XFS_BMAP_INIT(&free_list, &firstfsb);
3604                 error = xfs_bmapi(tp, ip, startoffset_fsb,
3605                                   allocatesize_fsb, bmapi_flag,
3606                                   &firstfsb, 0, imapp, &nimaps,
3607                                   &free_list, NULL);
3608                 if (error) {
3609                         goto error0;
3610                 }
3611
3612                 /*
3613                  * Complete the transaction
3614                  */
3615                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3616                 if (error) {
3617                         goto error0;
3618                 }
3619
3620                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3621                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3622                 if (error) {
3623                         break;
3624                 }
3625
3626                 allocated_fsb = imapp->br_blockcount;
3627
3628                 if (nimaps == 0) {
3629                         error = XFS_ERROR(ENOSPC);
3630                         break;
3631                 }
3632
3633                 startoffset_fsb += allocated_fsb;
3634                 allocatesize_fsb -= allocated_fsb;
3635         }
3636 dmapi_enospc_check:
3637         if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
3638             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
3639                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
3640                                 ip, DM_RIGHT_NULL,
3641                                 ip, DM_RIGHT_NULL,
3642                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
3643                 if (error == 0)
3644                         goto retry;     /* Maybe DMAPI app. has made space */
3645                 /* else fall through with error from XFS_SEND_DATA */
3646         }
3647
3648         return error;
3649
3650 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
3651         xfs_bmap_cancel(&free_list);
3652         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
3653
3654 error1: /* Just cancel transaction */
3655         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3656         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3657         goto dmapi_enospc_check;
3658 }
3659
3660 /*
3661  * Zero file bytes between startoff and endoff inclusive.
3662  * The iolock is held exclusive and no blocks are buffered.
3663  */
3664 STATIC int
3665 xfs_zero_remaining_bytes(
3666         xfs_inode_t             *ip,
3667         xfs_off_t               startoff,
3668         xfs_off_t               endoff)
3669 {
3670         xfs_bmbt_irec_t         imap;
3671         xfs_fileoff_t           offset_fsb;
3672         xfs_off_t               lastoffset;
3673         xfs_off_t               offset;
3674         xfs_buf_t               *bp;
3675         xfs_mount_t             *mp = ip->i_mount;
3676         int                     nimap;
3677         int                     error = 0;
3678
3679         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3680                                 XFS_IS_REALTIME_INODE(ip) ?
3681                                 mp->m_rtdev_targp : mp->m_ddev_targp);
3682
3683         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3684                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
3685                 nimap = 1;
3686                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
3687                         NULL, 0, &imap, &nimap, NULL, NULL);
3688                 if (error || nimap < 1)
3689                         break;
3690                 ASSERT(imap.br_blockcount >= 1);
3691                 ASSERT(imap.br_startoff == offset_fsb);
3692                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
3693                 if (lastoffset > endoff)
3694                         lastoffset = endoff;
3695                 if (imap.br_startblock == HOLESTARTBLOCK)
3696                         continue;
3697                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3698                 if (imap.br_state == XFS_EXT_UNWRITTEN)
3699                         continue;
3700                 XFS_BUF_UNDONE(bp);
3701                 XFS_BUF_UNWRITE(bp);
3702                 XFS_BUF_READ(bp);
3703                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
3704                 xfsbdstrat(mp, bp);
3705                 error = xfs_iowait(bp);
3706                 if (error) {
3707                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
3708                                           mp, bp, XFS_BUF_ADDR(bp));
3709                         break;
3710                 }
3711                 memset(XFS_BUF_PTR(bp) +
3712                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
3713                       0, lastoffset - offset + 1);
3714                 XFS_BUF_UNDONE(bp);
3715                 XFS_BUF_UNREAD(bp);
3716                 XFS_BUF_WRITE(bp);
3717                 xfsbdstrat(mp, bp);
3718                 error = xfs_iowait(bp);
3719                 if (error) {
3720                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
3721                                           mp, bp, XFS_BUF_ADDR(bp));
3722                         break;
3723                 }
3724         }
3725         xfs_buf_free(bp);
3726         return error;
3727 }
3728
3729 /*
3730  * xfs_free_file_space()
3731  *      This routine frees disk space for the given file.
3732  *
3733  *      This routine is only called by xfs_change_file_space
3734  *      for an UNRESVSP type call.
3735  *
3736  * RETURNS:
3737  *       0 on success
3738  *      errno on error
3739  *
3740  */
3741 STATIC int
3742 xfs_free_file_space(
3743         xfs_inode_t             *ip,
3744         xfs_off_t               offset,
3745         xfs_off_t               len,
3746         int                     attr_flags)
3747 {
3748         bhv_vnode_t             *vp;
3749         int                     committed;
3750         int                     done;
3751         xfs_off_t               end_dmi_offset;
3752         xfs_fileoff_t           endoffset_fsb;
3753         int                     error;
3754         xfs_fsblock_t           firstfsb;
3755         xfs_bmap_free_t         free_list;
3756         xfs_bmbt_irec_t         imap;
3757         xfs_off_t               ioffset;
3758         xfs_extlen_t            mod=0;
3759         xfs_mount_t             *mp;
3760         int                     nimap;
3761         uint                    resblks;
3762         uint                    rounding;
3763         int                     rt;
3764         xfs_fileoff_t           startoffset_fsb;
3765         xfs_trans_t             *tp;
3766         int                     need_iolock = 1;
3767
3768         vp = XFS_ITOV(ip);
3769         mp = ip->i_mount;
3770
3771         xfs_itrace_entry(ip);
3772
3773         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3774                 return error;
3775
3776         error = 0;
3777         if (len <= 0)   /* if nothing being freed */
3778                 return error;
3779         rt = XFS_IS_REALTIME_INODE(ip);
3780         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
3781         end_dmi_offset = offset + len;
3782         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
3783
3784         if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
3785             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3786                 if (end_dmi_offset > ip->i_size)
3787                         end_dmi_offset = ip->i_size;
3788                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
3789                                 offset, end_dmi_offset - offset,
3790                                 AT_DELAY_FLAG(attr_flags), NULL);
3791                 if (error)
3792                         return error;
3793         }
3794
3795         if (attr_flags & ATTR_NOLOCK)
3796                 need_iolock = 0;
3797         if (need_iolock) {
3798                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3799                 vn_iowait(ip);  /* wait for the completion of any pending DIOs */
3800         }
3801
3802         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
3803         ioffset = offset & ~(rounding - 1);
3804
3805         if (VN_CACHED(vp) != 0) {
3806                 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
3807                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
3808                 if (error)
3809                         goto out_unlock_iolock;
3810         }
3811
3812         /*
3813          * Need to zero the stuff we're not freeing, on disk.
3814          * If its a realtime file & can't use unwritten extents then we
3815          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
3816          * will take care of it for us.
3817          */
3818         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
3819                 nimap = 1;
3820                 error = xfs_bmapi(NULL, ip, startoffset_fsb,
3821                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3822                 if (error)
3823                         goto out_unlock_iolock;
3824                 ASSERT(nimap == 0 || nimap == 1);
3825                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3826                         xfs_daddr_t     block;
3827
3828                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3829                         block = imap.br_startblock;
3830                         mod = do_div(block, mp->m_sb.sb_rextsize);
3831                         if (mod)
3832                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
3833                 }
3834                 nimap = 1;
3835                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
3836                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3837                 if (error)
3838                         goto out_unlock_iolock;
3839                 ASSERT(nimap == 0 || nimap == 1);
3840                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3841                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3842                         mod++;
3843                         if (mod && (mod != mp->m_sb.sb_rextsize))
3844                                 endoffset_fsb -= mod;
3845                 }
3846         }
3847         if ((done = (endoffset_fsb <= startoffset_fsb)))
3848                 /*
3849                  * One contiguous piece to clear
3850                  */
3851                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
3852         else {
3853                 /*
3854                  * Some full blocks, possibly two pieces to clear
3855                  */
3856                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
3857                         error = xfs_zero_remaining_bytes(ip, offset,
3858                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
3859                 if (!error &&
3860                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
3861                         error = xfs_zero_remaining_bytes(ip,
3862                                 XFS_FSB_TO_B(mp, endoffset_fsb),
3863                                 offset + len - 1);
3864         }
3865
3866         /*
3867          * free file space until done or until there is an error
3868          */
3869         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3870         while (!error && !done) {
3871
3872                 /*
3873                  * allocate and setup the transaction. Allow this
3874                  * transaction to dip into the reserve blocks to ensure
3875                  * the freeing of the space succeeds at ENOSPC.
3876                  */
3877                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3878                 tp->t_flags |= XFS_TRANS_RESERVE;
3879                 error = xfs_trans_reserve(tp,
3880                                           resblks,
3881                                           XFS_WRITE_LOG_RES(mp),
3882                                           0,
3883                                           XFS_TRANS_PERM_LOG_RES,
3884                                           XFS_WRITE_LOG_COUNT);
3885
3886                 /*
3887                  * check for running out of space
3888                  */
3889                 if (error) {
3890                         /*
3891                          * Free the transaction structure.
3892                          */
3893                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3894                         xfs_trans_cancel(tp, 0);
3895                         break;
3896                 }
3897                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3898                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
3899                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
3900                                 XFS_QMOPT_RES_REGBLKS);
3901                 if (error)
3902                         goto error1;
3903
3904                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3905                 xfs_trans_ihold(tp, ip);
3906
3907                 /*
3908                  * issue the bunmapi() call to free the blocks
3909                  */
3910                 XFS_BMAP_INIT(&free_list, &firstfsb);
3911                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
3912                                   endoffset_fsb - startoffset_fsb,
3913                                   0, 2, &firstfsb, &free_list, NULL, &done);
3914                 if (error) {
3915                         goto error0;
3916                 }
3917
3918                 /*
3919                  * complete the transaction
3920                  */
3921                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3922                 if (error) {
3923                         goto error0;
3924                 }
3925
3926                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3927                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3928         }
3929
3930  out_unlock_iolock:
3931         if (need_iolock)
3932                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3933         return error;
3934
3935  error0:
3936         xfs_bmap_cancel(&free_list);
3937  error1:
3938         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3939         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
3940                     XFS_ILOCK_EXCL);
3941         return error;
3942 }
3943
3944 /*
3945  * xfs_change_file_space()
3946  *      This routine allocates or frees disk space for the given file.
3947  *      The user specified parameters are checked for alignment and size
3948  *      limitations.
3949  *
3950  * RETURNS:
3951  *       0 on success
3952  *      errno on error
3953  *
3954  */
3955 int
3956 xfs_change_file_space(
3957         xfs_inode_t     *ip,
3958         int             cmd,
3959         xfs_flock64_t   *bf,
3960         xfs_off_t       offset,
3961         cred_t          *credp,
3962         int             attr_flags)
3963 {
3964         xfs_mount_t     *mp = ip->i_mount;
3965         int             clrprealloc;
3966         int             error;
3967         xfs_fsize_t     fsize;
3968         int             setprealloc;
3969         xfs_off_t       startoffset;
3970         xfs_off_t       llen;
3971         xfs_trans_t     *tp;
3972         bhv_vattr_t     va;
3973
3974         xfs_itrace_entry(ip);
3975
3976         if (!S_ISREG(ip->i_d.di_mode))
3977                 return XFS_ERROR(EINVAL);
3978
3979         switch (bf->l_whence) {
3980         case 0: /*SEEK_SET*/
3981                 break;
3982         case 1: /*SEEK_CUR*/
3983                 bf->l_start += offset;
3984                 break;
3985         case 2: /*SEEK_END*/
3986                 bf->l_start += ip->i_size;
3987                 break;
3988         default:
3989                 return XFS_ERROR(EINVAL);
3990         }
3991
3992         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
3993
3994         if (   (bf->l_start < 0)
3995             || (bf->l_start > XFS_MAXIOFFSET(mp))
3996             || (bf->l_start + llen < 0)
3997             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
3998                 return XFS_ERROR(EINVAL);
3999
4000         bf->l_whence = 0;
4001
4002         startoffset = bf->l_start;
4003         fsize = ip->i_size;
4004
4005         /*
4006          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4007          * file space.
4008          * These calls do NOT zero the data space allocated to the file,
4009          * nor do they change the file size.
4010          *
4011          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4012          * space.
4013          * These calls cause the new file data to be zeroed and the file
4014          * size to be changed.
4015          */
4016         setprealloc = clrprealloc = 0;
4017
4018         switch (cmd) {
4019         case XFS_IOC_RESVSP:
4020         case XFS_IOC_RESVSP64:
4021                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4022                                                                 1, attr_flags);
4023                 if (error)
4024                         return error;
4025                 setprealloc = 1;
4026                 break;
4027
4028         case XFS_IOC_UNRESVSP:
4029         case XFS_IOC_UNRESVSP64:
4030                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4031                                                                 attr_flags)))
4032                         return error;
4033                 break;
4034
4035         case XFS_IOC_ALLOCSP:
4036         case XFS_IOC_ALLOCSP64:
4037         case XFS_IOC_FREESP:
4038         case XFS_IOC_FREESP64:
4039                 if (startoffset > fsize) {
4040                         error = xfs_alloc_file_space(ip, fsize,
4041                                         startoffset - fsize, 0, attr_flags);
4042                         if (error)
4043                                 break;
4044                 }
4045
4046                 va.va_mask = XFS_AT_SIZE;
4047                 va.va_size = startoffset;
4048
4049                 error = xfs_setattr(ip, &va, attr_flags, credp);
4050
4051                 if (error)
4052                         return error;
4053
4054                 clrprealloc = 1;
4055                 break;
4056
4057         default:
4058                 ASSERT(0);
4059                 return XFS_ERROR(EINVAL);
4060         }
4061
4062         /*
4063          * update the inode timestamp, mode, and prealloc flag bits
4064          */
4065         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4066
4067         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4068                                       0, 0, 0))) {
4069                 /* ASSERT(0); */
4070                 xfs_trans_cancel(tp, 0);
4071                 return error;
4072         }
4073
4074         xfs_ilock(ip, XFS_ILOCK_EXCL);
4075
4076         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4077         xfs_trans_ihold(tp, ip);
4078
4079         if ((attr_flags & ATTR_DMI) == 0) {
4080                 ip->i_d.di_mode &= ~S_ISUID;
4081
4082                 /*
4083                  * Note that we don't have to worry about mandatory
4084                  * file locking being disabled here because we only
4085                  * clear the S_ISGID bit if the Group execute bit is
4086                  * on, but if it was on then mandatory locking wouldn't
4087                  * have been enabled.
4088                  */
4089                 if (ip->i_d.di_mode & S_IXGRP)
4090                         ip->i_d.di_mode &= ~S_ISGID;
4091
4092                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4093         }
4094         if (setprealloc)
4095                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4096         else if (clrprealloc)
4097                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4098
4099         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4100         xfs_trans_set_sync(tp);
4101
4102         error = xfs_trans_commit(tp, 0);
4103
4104         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4105
4106         return error;
4107 }