fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms of version 2 of the GNU General Public License as
   6  * published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it would be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11  *
  12  * Further, this software is distributed without any warranty that it is
  13  * free of the rightful claim of any third person regarding infringement
  14  * or the like.  Any license provided herein, whether implied or
  15  * otherwise, applies only to this software file.  Patent licenses, if
  16  * any, provided herein do not apply to combinations of this program with
  17  * other software, or any other product whatsoever.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write the Free Software Foundation, Inc., 59
  21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  22  *
  23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  24  * Mountain View, CA  94043, or:
  25  *
  26  * http://www.sgi.com
  27  *
  28  * For further information regarding this notice, see:
  29  *
  30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  31  */
  32
  33 #include "xfs.h"
  34 #include "xfs_macros.h"
  35 #include "xfs_types.h"
  36 #include "xfs_inum.h"
  37 #include "xfs_log.h"
  38 #include "xfs_trans.h"
  39 #include "xfs_sb.h"
  40 #include "xfs_ag.h"
  41 #include "xfs_dir.h"
  42 #include "xfs_dir2.h"
  43 #include "xfs_dmapi.h"
  44 #include "xfs_mount.h"
  45 #include "xfs_alloc_btree.h"
  46 #include "xfs_bmap_btree.h"
  47 #include "xfs_ialloc_btree.h"
  48 #include "xfs_itable.h"
  49 #include "xfs_btree.h"
  50 #include "xfs_ialloc.h"
  51 #include "xfs_alloc.h"
  52 #include "xfs_attr_sf.h"
  53 #include "xfs_dir_sf.h"
  54 #include "xfs_dir2_sf.h"
  55 #include "xfs_dinode.h"
  56 #include "xfs_inode_item.h"
  57 #include "xfs_inode.h"
  58 #include "xfs_bmap.h"
  59 #include "xfs_da_btree.h"
  60 #include "xfs_attr.h"
  61 #include "xfs_rw.h"
  62 #include "xfs_error.h"
  63 #include "xfs_bit.h"
  64 #include "xfs_rtalloc.h"
  65 #include "xfs_quota.h"
  66 #include "xfs_utils.h"
  67 #include "xfs_trans_space.h"
  68 #include "xfs_dir_leaf.h"
  69 #include "xfs_dmapi.h"
  70 #include "xfs_mac.h"
  71 #include "xfs_log_priv.h"
  72
  73
   74 /*
   75  * The maximum pathlen is 1024 bytes. Since the minimum file system
   76  * blocksize is 512 bytes, we can get a max of 2 extents back from
   77  * bmapi.
       *
       * SYMLINK_MAPS is that upper bound (1024 / 512 = 2).
       * NOTE(review): the name suggests it sizes the mapping array used
       * when reading or writing symlink targets; no use of it is visible
       * in this part of the file -- confirm against the callers.
   78  */
   79 #define SYMLINK_MAPS 2
   80
      /*
       * Prototype only: xfs_ioctl is declared here and is not defined in
       * the part of the file reviewed.  NOTE(review): presumably it lives
       * in another source file and is referenced further down -- confirm.
       */
   81 extern int xfs_ioctl(bhv_desc_t *, struct inode *, struct file *,
   82                         unsigned int, unsigned long);
  83
  84
   85 /*
   86  * For xfs, we check that the file isn't too big to be opened by this kernel.
   87  * No other open action is required for regular files.  Devices are handled
   88  * through the specfs file system, pipes through fifofs.  Device and
   89  * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
   90  * when a new vnode is first looked up or created.
       *
       * NOTE(review): the body below contains no file-size check, so the
       * first sentence above may be stale -- confirm.
       *
       * What the body does:
       *   - fails with EIO when XFS_FORCED_SHUTDOWN() is true for the
       *     inode's mount;
       *   - for a directory vnode (VDIR) whose inode has di_nextents > 0,
       *     issues a read-ahead of block 0 of the data fork.
       *
       * Parameters:
       *   bdp   - behavior descriptor; the vnode and the xfs inode are both
       *           derived from it (BHV_TO_VNODE / XFS_BHVTOI).
       *   credp - caller credentials; not referenced in this function.
       *
       * Returns 0, or XFS_ERROR(EIO) in the shutdown case.
   91  */
   92 STATIC int
   93 xfs_open(
   94         bhv_desc_t      *bdp,
   95         cred_t          *credp)
   96 {
   97         int             mode;
   98         vnode_t         *vp;
   99         xfs_inode_t     *ip;
  100
  101         vp = BHV_TO_VNODE(bdp);
  102         ip = XFS_BHVTOI(bdp);
  103
              /* Nothing may be opened once the mount reports a forced shutdown. */
  104         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  105                 return XFS_ERROR(EIO);
  106
  107         /*
  108          * If it's a directory with any blocks, read-ahead block 0
  109          * as we're almost certain to have the next operation be a read there.
  110          */
  111         if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
                      /*
                       * xfs_ilock_map_shared() returns the lock mode it took;
                       * the same value is handed back to xfs_iunlock() below.
                       */
  112                 mode = xfs_ilock_map_shared(ip);
                      /*
                       * di_nextents is tested a second time now that the lock
                       * is held -- presumably because the first test above
                       * was made without it.
                       */
  113                 if (ip->i_d.di_nextents > 0)
                              /* Return value is discarded on purpose (void cast). */
  114                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  115                 xfs_iunlock(ip, mode);
  116         }
  117         return 0;
  118 }
 119
 120
  121 /*
  122  * xfs_getattr
       *
       * Fill *vap with attributes taken from the in-core inode behind bdp.
       * The fields are written in stages and the function returns as soon
       * as vap->va_mask asks for nothing beyond the stage just completed:
       *
       *   1. va_size                      (mask is exactly XFS_AT_SIZE)
       *   2. va_nblocks, va_fsid, va_nodeid, va_nlink
       *                                   (mask has no bit outside SIZE,
       *                                    FSID, NODEID, NLINK, BLKSIZE)
       *   3. va_type, va_mode, va_uid, va_gid, va_projid, va_rdev,
       *      va_blksize, va_atime, va_mtime, va_ctime
       *                                   (mask has none of XFLAGS,
       *                                    EXTSIZE, NEXTENTS, ANEXTENTS,
       *                                    GENCOUNT, VCODE)
       *   4. va_xflags, va_extsize, va_nextents, va_anextents,
       *      va_gencount, va_vcode
       *
       * Parameters:
       *   bdp   - behavior descriptor; the vnode and the xfs inode are both
       *           derived from it.
       *   vap   - in: va_mask selects how far to go; out: the fields above.
       *   flags - when ATTR_LAZY is set the inode is read without taking
       *           XFS_ILOCK_SHARED; otherwise the lock is held for the
       *           duration and dropped on every exit path.
       *   credp - caller credentials; not referenced in this function.
       *
       * Returns 0, or XFS_ERROR(EIO) when XFS_FORCED_SHUTDOWN() is true for
       * the mount (checked before any lock is taken).
  123  */
  124 STATIC int
  125 xfs_getattr(
  126         bhv_desc_t      *bdp,
  127         vattr_t         *vap,
  128         int             flags,
  129         cred_t          *credp)
  130 {
  131         xfs_inode_t     *ip;
  132         xfs_mount_t     *mp;
  133         vnode_t         *vp;
  134
  135         vp  = BHV_TO_VNODE(bdp);
  136         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
  137
  138         ip = XFS_BHVTOI(bdp);
  139         mp = ip->i_mount;
  140
  141         if (XFS_FORCED_SHUTDOWN(mp))
  142                 return XFS_ERROR(EIO);
  143
              /*
               * Every return below repeats this ATTR_LAZY test so that the
               * shared inode lock is released only if it was taken here.
               */
  144         if (!(flags & ATTR_LAZY))
  145                 xfs_ilock(ip, XFS_ILOCK_SHARED);
  146
              /* Stage 1: the size is always filled in. */
  147         vap->va_size = ip->i_d.di_size;
  148         if (vap->va_mask == XFS_AT_SIZE) {
  149                 if (!(flags & ATTR_LAZY))
  150                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
  151                 return 0;
  152         }
              /*
               * Stage 2.  The block count is the on-disk count plus the
               * inode's delayed blocks (i_delayed_blks), converted with
               * XFS_FSB_TO_BB().
               */
  153         vap->va_nblocks =
  154                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
  155         vap->va_fsid = mp->m_dev;
              /* With XFS_BIG_FILESYSTEMS the mount's m_inoadd is added to the inode number. */
  156 #if XFS_BIG_FILESYSTEMS
  157         vap->va_nodeid = ip->i_ino + mp->m_inoadd;
  158 #else
  159         vap->va_nodeid = ip->i_ino;
  160 #endif
  161         vap->va_nlink = ip->i_d.di_nlink;
  162
  163         /*
  164          * Quick exit for non-stat callers
               *
               * NOTE(review): XFS_AT_BLKSIZE is part of the quick-exit set,
               * yet va_blksize is only assigned further down, after this
               * return.  A caller asking for BLKSIZE alone leaves here with
               * va_blksize untouched -- confirm this is intended.
  165          */
  166         if ((vap->va_mask &
  167             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
  168               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0) {
  169                 if (!(flags & ATTR_LAZY))
  170                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
  171                 return 0;
  172         }
  173
  174         /*
  175          * Copy from in-core inode.
               * (Stage 3; the type comes from the vnode, the rest from the
               * inode core, with the mode masked by MODEMASK.)
  176          */
  177         vap->va_type = vp->v_type;
  178         vap->va_mode = ip->i_d.di_mode & MODEMASK;
  179         vap->va_uid = ip->i_d.di_uid;
  180         vap->va_gid = ip->i_d.di_gid;
  181         vap->va_projid = ip->i_d.di_projid;
  182
  183         /*
  184          * Check vnode type block/char vs. everything else.
  185          * Do it with bitmask because that's faster than looking
  186          * for multiple values individually.
               *
               * The test is true for every type other than VBLK and VCHR:
               * those get va_rdev = 0 and a computed va_blksize; VBLK/VCHR
               * take the else branch at the bottom.
  187          */
  188         if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
  189                 vap->va_rdev = 0;
  190
  191                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
  192
                              /*
                               * The "#if 0" arm is compiled out; only the
                               * "#else" arm below is live.
                               */
  193 #if 0
  194                         /* Large block sizes confuse various
  195                          * user space programs, so letting the
  196                          * stripe size through is not a good
  197                          * idea for now.
  198                          */
  199                         vap->va_blksize = mp->m_swidth ?
  200                                 /*
  201                                  * If the underlying volume is a stripe, then
  202                                  * return the stripe width in bytes as the
  203                                  * recommended I/O size.
  204                                  */
  205                                 (mp->m_swidth << mp->m_sb.sb_blocklog) :
  206                                 /*
  207                                  * Return the largest of the preferred buffer
  208                                  * sizes since doing small I/Os into larger
  209                                  * buffers causes buffers to be decommissioned.
  210                                  * The value returned is in bytes.
  211                                  */
  212                                 (1 << (int)MAX(mp->m_readio_log,
  213                                                mp->m_writeio_log));
  214
  215 #else
  216                         vap->va_blksize =
  217                                 /*
  218                                  * Return the largest of the preferred buffer
  219                                  * sizes since doing small I/Os into larger
  220                                  * buffers causes buffers to be decommissioned.
  221                                  * The value returned is in bytes.
  222                                  */
  223                                 1 << (int)MAX(mp->m_readio_log,
  224                                                mp->m_writeio_log);
  225 #endif
  226                 } else {
  227
  228                         /*
  229                          * If the file blocks are being allocated from a
  230                          * realtime partition, then return the inode's
  231                          * realtime extent size or the realtime volume's
  232                          * extent size.
                               *
                               * The inode's di_extsize is used when it is
                               * non-zero, otherwise the superblock's
                               * sb_rextsize; either one is shifted left by
                               * sb_blocklog.
  233                          */
  234                         vap->va_blksize = ip->i_d.di_extsize ?
  235                                 (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
  236                                 (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
  237                 }
  238         } else {
                      /* VBLK / VCHR: device number from the data fork, fixed I/O size. */
  239                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
  240                 vap->va_blksize = BLKDEV_IOSIZE;
  241         }
  242
              /* Timestamps are copied field by field (seconds and nanoseconds). */
  243         vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
  244         vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
  245         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
  246         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
  247         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
  248         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
  249
  250         /*
  251          * Exit for stat callers.  See if any of the rest of the fields
  252          * to be filled in are needed.
  253          */
  254         if ((vap->va_mask &
  255              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
  256               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) {
  257                 if (!(flags & ATTR_LAZY))
  258                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
  259                 return 0;
  260         }
  261         /*
  262          * convert di_flags to xflags
               *
               * REALTIME and PREALLOC map one-to-one from di_flags; HASATTR
               * is derived from XFS_IFORK_Q(ip) rather than from di_flags.
  263          */
  264         vap->va_xflags =
  265                 ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
  266                         XFS_XFLAG_REALTIME : 0) |
  267                 ((ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) ?
  268                         XFS_XFLAG_PREALLOC : 0) |
  269                 (XFS_IFORK_Q(ip) ?
  270                         XFS_XFLAG_HASATTR : 0);
  271         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
              /*
               * Extent counts: when the fork has XFS_IFEXTENTS set, the
               * count is computed from the fork's if_bytes divided by the
               * record size; otherwise the count stored in the inode core
               * is used.
               */
  272         vap->va_nextents =
  273                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
  274                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
  275                         ip->i_d.di_nextents;
              /* No attribute fork (i_afp == NULL) means zero attribute extents. */
  276         if (ip->i_afp != NULL)
  277                 vap->va_anextents =
  278                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
  279                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
  280                                  ip->i_d.di_anextents;
  281         else
  282                 vap->va_anextents = 0;
  283         vap->va_gencount = ip->i_d.di_gen;
              /* va_vcode is always reported as zero. */
  284         vap->va_vcode = 0L;
  285
  286         if (!(flags & ATTR_LAZY))
  287                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
  288         return 0;
  289 }
 290
 291
 292 /*
 293  * xfs_setattr
 294  */
 295 STATIC int
 296 xfs_setattr(
 297         bhv_desc_t              *bdp,
 298         vattr_t                 *vap,
 299         int                     flags,
 300         cred_t                  *credp)
 301 {
 302         xfs_inode_t             *ip;
 303         xfs_trans_t             *tp;
 304         xfs_mount_t             *mp;
 305         int                     mask;
 306         int                     code;
 307         uint                    lock_flags;
 308         uint                    commit_flags=0;
 309         uid_t                   uid=0, iuid=0;
 310         gid_t                   gid=0, igid=0;
 311         int                     timeflags = 0;
 312         vnode_t                 *vp;
 313         xfs_prid_t              projid=0, iprojid=0;
 314         int                     mandlock_before, mandlock_after;
 315         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 316         int                     file_owner;
 317
 318         vp = BHV_TO_VNODE(bdp);
 319         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 320
 321         /*
 322          * Cannot set certain attributes.
 323          */
 324         mask = vap->va_mask;
 325         if (mask & XFS_AT_NOSET) {
 326                 return XFS_ERROR(EINVAL);
 327         }
 328
 329         ip = XFS_BHVTOI(bdp);
 330         mp = ip->i_mount;
 331
 332         if (XFS_FORCED_SHUTDOWN(mp))
 333                 return XFS_ERROR(EIO);
 334
 335         /*
 336          * Timestamps do not need to be logged and hence do not
 337          * need to be done within a transaction.
 338          */
 339         if (mask & XFS_AT_UPDTIMES) {
 340                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 341                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 342                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 343                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 344                 xfs_ichgtime(ip, timeflags);
 345                 return 0;
 346         }
 347
 348         olddquot1 = olddquot2 = NULL;
 349         udqp = gdqp = NULL;
 350
 351         /*
 352          * If disk quotas is on, we make sure that the dquots do exist on disk,
 353          * before we start any other transactions. Trying to do this later
 354          * is messy. We don't care to take a readlock to look at the ids
 355          * in inode here, because we can't hold it across the trans_reserve.
 356          * If the IDs do change before we take the ilock, we're covered
 357          * because the i_*dquot fields will get updated anyway.
 358          */
 359         if (XFS_IS_QUOTA_ON(mp) && (mask & (XFS_AT_UID|XFS_AT_GID))) {
 360                 uint    qflags = 0;
 361
 362                 if (mask & XFS_AT_UID) {
 363                         uid = vap->va_uid;
 364                         qflags |= XFS_QMOPT_UQUOTA;
 365                 } else {
 366                         uid = ip->i_d.di_uid;
 367                 }
 368                 if (mask & XFS_AT_GID) {
 369                         gid = vap->va_gid;
 370                         qflags |= XFS_QMOPT_GQUOTA;
 371                 }  else {
 372                         gid = ip->i_d.di_gid;
 373                 }
 374                 /*
 375                  * We take a reference when we initialize udqp and gdqp,
 376                  * so it is important that we never blindly double trip on
 377                  * the same variable. See xfs_create() for an example.
 378                  */
 379                 ASSERT(udqp == NULL);
 380                 ASSERT(gdqp == NULL);
 381                 code = XFS_QM_DQVOPALLOC(mp, ip, uid,gid, qflags, &udqp, &gdqp);
 382                 if (code)
 383                         return (code);
 384         }
 385
 386         /*
 387          * For the other attributes, we acquire the inode lock and
 388          * first do an error checking pass.
 389          */
 390         tp = NULL;
 391         lock_flags = XFS_ILOCK_EXCL;
 392         if (!(mask & XFS_AT_SIZE)) {
 393                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 394                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 395                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 396                         commit_flags = 0;
 397                         if ((code = xfs_trans_reserve(tp, 0,
 398                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 399                                                      0, 0))) {
 400                                 lock_flags = 0;
 401                                 goto error_return;
 402                         }
 403                 }
 404         } else {
 405                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
 406                     !(flags & ATTR_DMI)) {
 407                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, bdp,
 408                                 vap->va_size, 0, AT_DELAY_FLAG(flags), NULL);
 409                         if (code) {
 410                                 lock_flags = 0;
 411                                 goto error_return;
 412                         }
 413                 }
 414                 lock_flags |= XFS_IOLOCK_EXCL;
 415         }
 416
 417         xfs_ilock(ip, lock_flags);
 418
 419         if (_MAC_XFS_IACCESS(ip, MACWRITE, credp)) {
 420                 code = XFS_ERROR(EACCES);
 421                 goto error_return;
 422         }
 423
 424         /* boolean: are we the file owner? */
 425         file_owner = (current->fsuid == ip->i_d.di_uid);
 426
 427         /*
 428          * Change various properties of a file.
 429          * Only the owner or users with CAP_FOWNER
 430          * capability may do these things.
 431          */
 432         if (mask &
 433             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 434              XFS_AT_GID|XFS_AT_PROJID)) {
 435                 /*
 436                  * CAP_FOWNER overrides the following restrictions:
 437                  *
 438                  * The user ID of the calling process must be equal
 439                  * to the file owner ID, except in cases where the
 440                  * CAP_FSETID capability is applicable.
 441                  */
 442                 if (!file_owner && !capable(CAP_FOWNER)) {
 443                         code = XFS_ERROR(EPERM);
 444                         goto error_return;
 445                 }
 446
 447                 /*
 448                  * CAP_FSETID overrides the following restrictions:
 449                  *
 450                  * The effective user ID of the calling process shall match
 451                  * the file owner when setting the set-user-ID and
 452                  * set-group-ID bits on that file.
 453                  *
 454                  * The effective group ID or one of the supplementary group
 455                  * IDs of the calling process shall match the group owner of
 456                  * the file when setting the set-group-ID bit on that file
 457                  */
 458                 if (mask & XFS_AT_MODE) {
 459                         mode_t m = 0;
 460
 461                         if ((vap->va_mode & ISUID) && !file_owner)
 462                                 m |= ISUID;
 463                         if ((vap->va_mode & ISGID) &&
 464                             !in_group_p((gid_t)ip->i_d.di_gid))
 465                                 m |= ISGID;
 466 #if 0
 467                         /* Linux allows this, Irix doesn't. */
 468                         if ((vap->va_mode & ISVTX) && vp->v_type != VDIR)
 469                                 m |= ISVTX;
 470 #endif
 471                         if (m && !capable(CAP_FSETID))
 472                                 vap->va_mode &= ~m;
 473                 }
 474         }
 475
 476         /*
 477          * Change file ownership.  Must be the owner or privileged.
 478          * If the system was configured with the "restricted_chown"
 479          * option, the owner is not permitted to give away the file,
 480          * and can change the group id only to a group of which he
 481          * or she is a member.
 482          */
 483         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 484                 /*
 485                  * These IDs could have changed since we last looked at them.
 486                  * But, we're assured that if the ownership did change
 487                  * while we didn't have the inode locked, inode's dquot(s)
 488                  * would have changed also.
 489                  */
 490                 iuid = ip->i_d.di_uid;
 491                 iprojid = ip->i_d.di_projid;
 492                 igid = ip->i_d.di_gid;
 493                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 494                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 495                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 496                          iprojid;
 497
 498                 /*
 499                  * CAP_CHOWN overrides the following restrictions:
 500                  *
 501                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 502                  * shall override the restriction that a process cannot
 503                  * change the user ID of a file it owns and the restriction
 504                  * that the group ID supplied to the chown() function
 505                  * shall be equal to either the group ID or one of the
 506                  * supplementary group IDs of the calling process.
 507                  *
 508                  * XXX: How does restricted_chown affect projid?
 509                  */
 510                 if (restricted_chown &&
 511                     (iuid != uid || (igid != gid &&
 512                                      !in_group_p((gid_t)gid))) &&
 513                     !capable(CAP_CHOWN)) {
 514                         code = XFS_ERROR(EPERM);
 515                         goto error_return;
 516                 }
 517                 /*
 518                  * Do a quota reservation only if uid or gid is actually
 519                  * going to change.
 520                  */
 521                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 522                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 523                         ASSERT(tp);
 524                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 525                                                 capable(CAP_FOWNER) ?
 526                                                 XFS_QMOPT_FORCE_RES : 0);
 527                         if (code)       /* out of quota */
 528                                 goto error_return;
 529                 }
 530         }
 531
 532         /*
 533          * Truncate file.  Must have write permission and not be a directory.
 534          */
 535         if (mask & XFS_AT_SIZE) {
 536                 /* Short circuit the truncate case for zero length files */
 537                 if ((vap->va_size == 0) &&
 538                    (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
 539                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 540                         lock_flags &= ~XFS_ILOCK_EXCL;
 541                         if (mask & XFS_AT_CTIME)
 542                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 543                         code = 0;
 544                         goto error_return;
 545                 }
 546
 547                 if (vp->v_type == VDIR) {
 548                         code = XFS_ERROR(EISDIR);
 549                         goto error_return;
 550                 } else if (vp->v_type != VREG) {
 551                         code = XFS_ERROR(EINVAL);
 552                         goto error_return;
 553                 }
 554                 /*
 555                  * Make sure that the dquots are attached to the inode.
 556                  */
 557                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 558                         goto error_return;
 559         }
 560
 561         /*
 562          * Change file access or modified times.
 563          */
 564         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 565                 if (!file_owner) {
 566                         if ((flags & ATTR_UTIME) &&
 567                             !capable(CAP_FOWNER)) {
 568                                 code = XFS_ERROR(EPERM);
 569                                 goto error_return;
 570                         }
 571                 }
 572         }
 573
 574         /*
 575          * Change extent size or realtime flag.
 576          */
 577         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 578                 /*
 579                  * Can't change extent size if any extents are allocated.
 580                  */
 581                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 582                     (mask & XFS_AT_EXTSIZE) &&
 583                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 584                      vap->va_extsize) ) {
 585                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 586                         goto error_return;
 587                 }
 588
 589                 /*
 590                  * Can't set extent size unless the file is marked, or
 591                  * about to be marked as a realtime file.
 592                  *
 593                  * This check will be removed when fixed size extents
 594                  * with buffered data writes is implemented.
 595                  *
 596                  */
 597                 if ((mask & XFS_AT_EXTSIZE)                     &&
 598                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 599                      vap->va_extsize) &&
 600                     (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 601                        ((mask & XFS_AT_XFLAGS) &&
 602                         (vap->va_xflags & XFS_XFLAG_REALTIME))))) {
 603                         code = XFS_ERROR(EINVAL);
 604                         goto error_return;
 605                 }
 606
 607                 /*
 608                  * Can't change realtime flag if any extents are allocated.
 609                  */
 610                 if (ip->i_d.di_nextents && (mask & XFS_AT_XFLAGS) &&
 611                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
 612                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 613                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 614                         goto error_return;
 615                 }
 616                 /*
 617                  * Extent size must be a multiple of the appropriate block
 618                  * size, if set at all.
 619                  */
 620                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 621                         xfs_extlen_t    size;
 622
 623                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 624                             ((mask & XFS_AT_XFLAGS) &&
 625                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 626                                 size = mp->m_sb.sb_rextsize <<
 627                                        mp->m_sb.sb_blocklog;
 628                         } else {
 629                                 size = mp->m_sb.sb_blocksize;
 630                         }
 631                         if (vap->va_extsize % size) {
 632                                 code = XFS_ERROR(EINVAL);
 633                                 goto error_return;
 634                         }
 635                 }
 636                 /*
 637                  * If realtime flag is set then must have realtime data.
 638                  */
 639                 if ((mask & XFS_AT_XFLAGS) &&
 640                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 641                         if ((mp->m_sb.sb_rblocks == 0) ||
 642                             (mp->m_sb.sb_rextsize == 0) ||
 643                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 644                                 code = XFS_ERROR(EINVAL);
 645                                 goto error_return;
 646                         }
 647                 }
 648         }
 649
 650         /*
 651          * Now we can make the changes.  Before we join the inode
 652          * to the transaction, if XFS_AT_SIZE is set then take care of
 653          * the part of the truncation that must be done without the
 654          * inode lock.  This needs to be done before joining the inode
 655          * to the transaction, because the inode cannot be unlocked
 656          * once it is a part of the transaction.
 657          */
 658         if (mask & XFS_AT_SIZE) {
 659                 if (vap->va_size > ip->i_d.di_size) {
 660                         code = xfs_igrow_start(ip, vap->va_size, credp);
 661                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 662                 } else if (vap->va_size <= ip->i_d.di_size) {
 663                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 664                         xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
 665                                             (xfs_fsize_t)vap->va_size);
 666                         code = 0;
 667                 } else {
 668                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 669                         code = 0;
 670                 }
 671                 if (code) {
 672                         ASSERT(tp == NULL);
 673                         lock_flags &= ~XFS_ILOCK_EXCL;
 674                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 675                         goto error_return;
 676                 }
 677                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 678                 if ((code = xfs_trans_reserve(tp, 0,
 679                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 680                                              XFS_TRANS_PERM_LOG_RES,
 681                                              XFS_ITRUNCATE_LOG_COUNT))) {
 682                         xfs_trans_cancel(tp, 0);
 683                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 684                         return code;
 685                 }
 686                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 687                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 688         }
 689
 690         if (tp) {
 691                 xfs_trans_ijoin(tp, ip, lock_flags);
 692                 xfs_trans_ihold(tp, ip);
 693         }
 694
 695         /* determine whether mandatory locking mode changes */
 696         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 697
 698         /*
 699          * Truncate file.  Must have write permission and not be a directory.
 700          */
 701         if (mask & XFS_AT_SIZE) {
 702                 if (vap->va_size > ip->i_d.di_size) {
 703                         xfs_igrow_finish(tp, ip, vap->va_size,
 704                             !(flags & ATTR_DMI));
 705                 } else if ((vap->va_size <= ip->i_d.di_size) ||
 706                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 707                         /*
 708                          * signal a sync transaction unless
 709                          * we're truncating an already unlinked
 710                          * file on a wsync filesystem
 711                          */
 712                         code = xfs_itruncate_finish(&tp, ip,
 713                                             (xfs_fsize_t)vap->va_size,
 714                                             XFS_DATA_FORK,
 715                                             ((ip->i_d.di_nlink != 0 ||
 716                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 717                                              ? 1 : 0));
 718                         if (code) {
 719                                 goto abort_return;
 720                         }
 721                 }
 722                 /*
 723                  * Have to do this even if the file's size doesn't change.
 724                  */
 725                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 726         }
 727
 728         /*
 729          * Change file access modes.
 730          */
 731         if (mask & XFS_AT_MODE) {
 732                 ip->i_d.di_mode &= IFMT;
 733                 ip->i_d.di_mode |= vap->va_mode & ~IFMT;
 734
 735                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 736                 timeflags |= XFS_ICHGTIME_CHG;
 737         }
 738
 739         /*
 740          * Change file ownership.  Must be the owner or privileged.
 741          * If the system was configured with the "restricted_chown"
 742          * option, the owner is not permitted to give away the file,
 743          * and can change the group id only to a group of which he
 744          * or she is a member.
 745          */
 746         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 747                 /*
 748                  * CAP_FSETID overrides the following restrictions:
 749                  *
 750                  * The set-user-ID and set-group-ID bits of a file will be
 751                  * cleared upon successful return from chown()
 752                  */
 753                 if ((ip->i_d.di_mode & (ISUID|ISGID)) &&
 754                     !capable(CAP_FSETID)) {
 755                         ip->i_d.di_mode &= ~(ISUID|ISGID);
 756                 }
 757
 758                 /*
 759                  * Change the ownerships and register quota modifications
 760                  * in the transaction.
 761                  */
 762                 if (iuid != uid) {
 763                         if (XFS_IS_UQUOTA_ON(mp)) {
 764                                 ASSERT(mask & XFS_AT_UID);
 765                                 ASSERT(udqp);
 766                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 767                                                         &ip->i_udquot, udqp);
 768                         }
 769                         ip->i_d.di_uid = uid;
 770                 }
 771                 if (igid != gid) {
 772                         if (XFS_IS_GQUOTA_ON(mp)) {
 773                                 ASSERT(mask & XFS_AT_GID);
 774                                 ASSERT(gdqp);
 775                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 776                                                         &ip->i_gdquot, gdqp);
 777                         }
 778                         ip->i_d.di_gid = gid;
 779                 }
 780                 if (iprojid != projid) {
 781                         ip->i_d.di_projid = projid;
 782                         /*
 783                          * We may have to rev the inode as well as
 784                          * the superblock version number since projids didn't
 785                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 786                          */
 787                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 788                                 xfs_bump_ino_vers2(tp, ip);
 789                 }
 790
 791                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 792                 timeflags |= XFS_ICHGTIME_CHG;
 793         }
 794
 795
 796         /*
 797          * Change file access or modified times.
 798          */
 799         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 800                 if (mask & XFS_AT_ATIME) {
 801                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 802                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 803                         ip->i_update_core = 1;
 804                         timeflags &= ~XFS_ICHGTIME_ACC;
 805                 }
 806                 if (mask & XFS_AT_MTIME) {
 807                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 808                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 809                         timeflags &= ~XFS_ICHGTIME_MOD;
 810                         timeflags |= XFS_ICHGTIME_CHG;
 811                 }
 812                 if (tp && (flags & ATTR_UTIME))
 813                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 814         }
 815
 816         /*
 817          * Change XFS-added attributes.
 818          */
 819         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 820                 if (mask & XFS_AT_EXTSIZE) {
 821                         /*
 822                          * Converting bytes to fs blocks.
 823                          */
 824                         ip->i_d.di_extsize = vap->va_extsize >>
 825                                 mp->m_sb.sb_blocklog;
 826                 }
 827                 if (mask & XFS_AT_XFLAGS) {
 828                         ip->i_d.di_flags = 0;
 829                         if (vap->va_xflags & XFS_XFLAG_REALTIME) {
 830                                 ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
 831                                 ip->i_iocore.io_flags |= XFS_IOCORE_RT;
 832                         }
 833                         /* can't set PREALLOC this way, just ignore it */
 834                 }
 835                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 836                 timeflags |= XFS_ICHGTIME_CHG;
 837         }
 838
 839         /*
 840          * Change file inode change time only if XFS_AT_CTIME set
 841          * AND we have been called by a DMI function.
 842          */
 843
 844         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 845                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 846                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 847                 ip->i_update_core = 1;
 848                 timeflags &= ~XFS_ICHGTIME_CHG;
 849         }
 850
 851         /*
 852          * Send out timestamp changes that need to be set to the
 853          * current time.  Not done when called by a DMI function.
 854          */
 855         if (timeflags && !(flags & ATTR_DMI))
 856                 xfs_ichgtime(ip, timeflags);
 857
 858         XFS_STATS_INC(xfsstats.xs_ig_attrchg);
 859
 860         /*
 861          * If this is a synchronous mount, make sure that the
 862          * transaction goes to disk before returning to the user.
 863          * This is slightly sub-optimal in that truncates require
 864          * two sync transactions instead of one for wsync filesystems.
 865          * One for the truncate and one for the timestamps since we
 866          * don't want to change the timestamps unless we're sure the
 867          * truncate worked.  Truncates are less than 1% of the laddis
 868          * mix so this probably isn't worth the trouble to optimize.
 869          */
 870         code = 0;
 871         if (tp) {
 872                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 873                         xfs_trans_set_sync(tp);
 874
 875                 code = xfs_trans_commit(tp, commit_flags, NULL);
 876         }
 877
 878         /*
 879          * If the (regular) file's mandatory locking mode changed, then
 880          * notify the vnode.  We do this under the inode lock to prevent
 881          * racing calls to vop_vnode_change.
 882          */
 883         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 884         if (mandlock_before != mandlock_after) {
 885                 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
 886                                  mandlock_after);
 887         }
 888
 889         xfs_iunlock(ip, lock_flags);
 890
 891         /*
 892          * Release any dquot(s) the inode had kept before chown.
 893          */
 894         XFS_QM_DQRELE(mp, olddquot1);
 895         XFS_QM_DQRELE(mp, olddquot2);
 896         XFS_QM_DQRELE(mp, udqp);
 897         XFS_QM_DQRELE(mp, gdqp);
 898
 899         if (code) {
 900                 return code;
 901         }
 902
 903         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
 904             !(flags & ATTR_DMI)) {
 905                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, bdp, DM_RIGHT_NULL,
 906                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 907                                         0, 0, AT_DELAY_FLAG(flags));
 908         }
 909         return 0;
 910
 911  abort_return:
 912         commit_flags |= XFS_TRANS_ABORT;
 913         /* FALLTHROUGH */
 914  error_return:
 915         XFS_QM_DQRELE(mp, udqp);
 916         XFS_QM_DQRELE(mp, gdqp);
 917         if (tp) {
 918                 xfs_trans_cancel(tp, commit_flags);
 919         }
 920         if (lock_flags != 0) {
 921                 xfs_iunlock(ip, lock_flags);
 922         }
 923         return code;
 924 }
 925
 926
 927 /*
 928  * xfs_access
 929  * Null conversion from vnode mode bits to inode mode bits, as in efs.
 930  */
 931 STATIC int
 932 xfs_access(
 933         bhv_desc_t      *bdp,
 934         int             mode,
 935         cred_t          *credp)
 936 {
 937         xfs_inode_t     *ip;
 938         int             error;
 939
 940         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
 941                                                (inst_t *)__return_address);
 942
 943         ip = XFS_BHVTOI(bdp);
 944         xfs_ilock(ip, XFS_ILOCK_SHARED);
 945         error = xfs_iaccess(ip, mode, credp);
 946         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 947         return error;
 948 }
 949
 950
 951 /*
 952  * xfs_readlink
 953  *
 954  */
 955 STATIC int
 956 xfs_readlink(
 957         bhv_desc_t      *bdp,
 958         uio_t           *uiop,
 959         cred_t          *credp)
 960 {
 961         xfs_inode_t     *ip;
 962         int             count;
 963         xfs_off_t       offset;
 964         int             pathlen;
 965         vnode_t         *vp;
 966         int             error = 0;
 967         xfs_mount_t     *mp;
 968         int             nmaps;
 969         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 970         xfs_daddr_t     d;
 971         int             byte_cnt;
 972         int             n;
 973         xfs_buf_t       *bp;
 974
 975         vp = BHV_TO_VNODE(bdp);
 976         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 977
 978         ip = XFS_BHVTOI(bdp);
 979         mp = ip->i_mount;
 980
 981         if (XFS_FORCED_SHUTDOWN(mp))
 982                 return XFS_ERROR(EIO);
 983
 984         xfs_ilock(ip, XFS_ILOCK_SHARED);
 985
 986         ASSERT((ip->i_d.di_mode & IFMT) == IFLNK);
 987
 988         offset = uiop->uio_offset;
 989         count = uiop->uio_resid;
 990
 991         if (offset < 0) {
 992                 error = XFS_ERROR(EINVAL);
 993                 goto error_return;
 994         }
 995         if (count <= 0) {
 996                 error = 0;
 997                 goto error_return;
 998         }
 999
1000         if (!(uiop->uio_fmode & FINVIS)) {
1001                 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
1002         }
1003
1004         /*
1005          * See if the symlink is stored inline.
1006          */
1007         pathlen = (int)ip->i_d.di_size;
1008
1009         if (ip->i_df.if_flags & XFS_IFINLINE) {
1010                 error = uiomove(ip->i_df.if_u1.if_data, pathlen, UIO_READ, uiop);
1011         }
1012         else {
1013                 /*
1014                  * Symlink not inline.  Call bmap to get it in.
1015                  */
1016                 nmaps = SYMLINK_MAPS;
1017
1018                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1019                                   0, NULL, 0, mval, &nmaps, NULL);
1020
1021                 if (error) {
1022                         goto error_return;
1023                 }
1024
1025                 for (n = 0; n < nmaps; n++) {
1026                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1027                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1028                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1029                                       BTOBB(byte_cnt), 0);
1030                         error = XFS_BUF_GETERROR(bp);
1031                         if (error) {
1032                                 xfs_ioerror_alert("xfs_readlink",
1033                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1034                                 xfs_buf_relse(bp);
1035                                 goto error_return;
1036                         }
1037                         if (pathlen < byte_cnt)
1038                                 byte_cnt = pathlen;
1039                         pathlen -= byte_cnt;
1040
1041                         error = uiomove(XFS_BUF_PTR(bp), byte_cnt,
1042                                          UIO_READ, uiop);
1043                         xfs_buf_relse (bp);
1044                 }
1045
1046         }
1047
1048
1049 error_return:
1050
1051         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1052
1053         return error;
1054 }
1055
1056
1057 /*
1058  * xfs_fsync
1059  *
1060  * This is called to sync the inode and its data out to disk.
1061  * We need to hold the I/O lock while flushing the data, and
1062  * the inode lock while flushing the inode.  The inode lock CANNOT
1063  * be held while flushing the data, so acquire after we're done
1064  * with that.
1065  */
1066 STATIC int
1067 xfs_fsync(
1068         bhv_desc_t      *bdp,
1069         int             flag,
1070         cred_t          *credp,
1071         xfs_off_t       start,
1072         xfs_off_t       stop)
1073 {
1074         xfs_inode_t     *ip;
1075         int             error;
1076         int             error2;
1077         int             syncall;
1078         vnode_t         *vp;
1079         xfs_trans_t     *tp;
1080
1081         vp = BHV_TO_VNODE(bdp);
1082         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1083
1084         ip = XFS_BHVTOI(bdp);
1085
1086         ASSERT(start >= 0 && stop >= -1);
1087
1088         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1089                 return XFS_ERROR(EIO);
1090
1091         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1092
1093         syncall = error = error2 = 0;
1094
1095         if (stop == -1)  {
1096                 ASSERT(start >= 0);
1097                 if (start == 0)
1098                         syncall = 1;
1099                 stop = xfs_file_last_byte(ip);
1100         }
1101
1102         /*
1103          * If we're invalidating, always flush since we want to
1104          * tear things down.  Otherwise, don't flush anything if
1105          * we're not dirty.
1106          */
1107         if (flag & FSYNC_INVAL) {
1108                 if (ip->i_df.if_flags & XFS_IFEXTENTS &&
1109                     ip->i_df.if_bytes > 0) {
1110                         VOP_FLUSHINVAL_PAGES(vp, start, -1, FI_REMAPF_LOCKED);
1111                 }
1112                 ASSERT(syncall == 0 || (VN_CACHED(vp) == 0));
1113         } else {
1114                 /*
1115                  * In the non-invalidating case, calls to fsync() do not
1116                  * flush all the dirty mmap'd pages.  That requires a
1117                  * call to msync().
1118                  */
1119                 VOP_FLUSH_PAGES(vp, start, -1,
1120                                 (flag & FSYNC_WAIT) ? 0 : XFS_B_ASYNC,
1121                                 FI_NONE, error2);
1122         }
1123
1124         if (error2) {
1125                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1126                 return XFS_ERROR(error2);
1127         }
1128
1129         /*
1130          * We always need to make sure that the required inode state
1131          * is safe on disk.  The vnode might be clean but because
1132          * of committed transactions that haven't hit the disk yet.
1133          * Likewise, there could be unflushed non-transactional
1134          * changes to the inode core that have to go to disk.
1135          *
1136          * The following code depends on one assumption:  that
1137          * any transaction that changes an inode logs the core
1138          * because it has to change some field in the inode core
1139          * (typically nextents or nblocks).  That assumption
1140          * implies that any transactions against an inode will
1141          * catch any non-transactional updates.  If inode-altering
1142          * transactions exist that violate this assumption, the
1143          * code breaks.  Right now, it figures that if the involved
1144          * update_* field is clear and the inode is unpinned, the
1145          * inode is clean.  Either it's been flushed or it's been
1146          * committed and the commit has hit the disk unpinning the inode.
1147          * (Note that xfs_inode_item_format() called at commit clears
1148          * the update_* fields.)
1149          */
1150         xfs_ilock(ip, XFS_ILOCK_SHARED);
1151
1152         /* If we are flushing data then we care about update_size
1153          * being set, otherwise we care about update_core
1154          */
1155         if ((flag & FSYNC_DATA) ?
1156                         (ip->i_update_size == 0) :
1157                         (ip->i_update_core == 0)) {
1158                 /*
1159                  * Timestamps/size haven't changed since last inode
1160                  * flush or inode transaction commit.  That means
1161                  * either nothing got written or a transaction
1162                  * committed which caught the updates.  If the
1163                  * latter happened and the transaction hasn't
1164                  * hit the disk yet, the inode will be still
1165                  * be pinned.  If it is, force the log.
1166                  */
1167
1168                 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED);
1169
1170                 if (xfs_ipincount(ip)) {
1171                         xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1172                                       XFS_LOG_FORCE |
1173                                       ((flag & FSYNC_WAIT)
1174                                        ? XFS_LOG_SYNC : 0));
1175                 }
1176                 error = 0;
1177         } else  {
1178                 /*
1179                  * Kick off a transaction to log the inode
1180                  * core to get the updates.  Make it
1181                  * sync if FSYNC_WAIT is passed in (which
1182                  * is done by everybody but specfs).  The
1183                  * sync transaction will also force the log.
1184                  */
1185                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1186                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1187                 if ((error = xfs_trans_reserve(tp, 0,
1188                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1189                                 0, 0, 0)))  {
1190                         xfs_trans_cancel(tp, 0);
1191                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1192                         return error;
1193                 }
1194                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1195
1196                 /*
1197                  * Note - it's possible that we might have pushed
1198                  * ourselves out of the way during trans_reserve
1199                  * which would flush the inode.  But there's no
1200                  * guarantee that the inode buffer has actually
1201                  * gone out yet (it's delwri).  Plus the buffer
1202                  * could be pinned anyway if it's part of an
1203                  * inode in another recent transaction.  So we
1204                  * play it safe and fire off the transaction anyway.
1205                  */
1206                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
1207                 xfs_trans_ihold(tp, ip);
1208                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1209                 if (flag & FSYNC_WAIT)
1210                         xfs_trans_set_sync(tp);
1211                 error = xfs_trans_commit(tp, 0, NULL);
1212
1213                 xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
1214         }
1215         return error;
1216 }
1217
1218
#if 0
/*
 * Helper for xfs_inactive (currently compiled out).
 *
 * Used when a transaction that was freeing a file's disk space hits an
 * error.  The old transaction, if any, is cancelled with an abort and
 * a fresh one is allocated and stored in *tpp so the inode can still
 * be freed.  The inode is then relocked, joined to the new transaction
 * and held, the in-core fork is destroyed, and the matching counters
 * in the inode core are zeroed (blocks, extents and size for the data
 * fork; attribute extents otherwise) before the core is logged.
 *
 * If the log reservation for the new transaction fails, the routine
 * returns early with the inode unlocked and *tpp pointing at the
 * unreserved transaction.
 */
STATIC void
xfs_itruncate_cleanup(
        xfs_trans_t     **tpp,
        xfs_inode_t     *ip,
        int             commit_flags,
        int             fork)
{
        xfs_mount_t     *mp = ip->i_mount;
        xfs_trans_t     *new_tp;
        /* REFERENCED */
        int             error;

        if (*tpp)
                xfs_trans_cancel(*tpp, commit_flags | XFS_TRANS_ABORT);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

        new_tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        *tpp = new_tp;
        error = xfs_trans_reserve(new_tp, 0, XFS_IFREE_LOG_RES(mp), 0, 0,
                                  XFS_DEFAULT_LOG_COUNT);
        if (error)
                return;

        xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_ijoin(new_tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        xfs_trans_ihold(new_tp, ip);

        xfs_idestroy_fork(ip, fork);

        if (fork != XFS_DATA_FORK) {
                ip->i_d.di_anextents = 0;
        } else {
                ip->i_d.di_nblocks = 0;
                ip->i_d.di_nextents = 0;
                ip->i_d.di_size = 0;
        }
        xfs_trans_log_inode(new_tp, ip, XFS_ILOG_CORE);
}
#endif
1267
1268 /*
1269  * This is called by xfs_inactive to free any blocks beyond eof,
1270  * when the link count isn't zero.
1271  */
1272 STATIC int
1273 xfs_inactive_free_eofblocks(
1274         xfs_mount_t     *mp,
1275         xfs_inode_t     *ip)
1276 {
1277         xfs_trans_t     *tp;
1278         int             error;
1279         xfs_fileoff_t   end_fsb;
1280         xfs_fileoff_t   last_fsb;
1281         xfs_filblks_t   map_len;
1282         int             nimaps;
1283         xfs_bmbt_irec_t imap;
1284
1285         /*
1286          * Figure out if there are any blocks beyond the end
1287          * of the file.  If not, then there is nothing to do.
1288          */
1289         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
1290         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
1291         map_len = last_fsb - end_fsb;
1292         if (map_len <= 0)
1293                 return (0);
1294
1295         nimaps = 1;
1296         xfs_ilock(ip, XFS_ILOCK_SHARED);
1297         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1298                           NULL, 0, &imap, &nimaps, NULL);
1299         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1300
1301         if (!error && (nimaps != 0) &&
1302             (imap.br_startblock != HOLESTARTBLOCK)) {
1303                 /*
1304                  * Attach the dquots to the inode up front.
1305                  */
1306                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1307                         return (error);
1308
1309                 /*
1310                  * There are blocks after the end of file.
1311                  * Free them up now by truncating the file to
1312                  * its current size.
1313                  */
1314                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1315
1316                 /*
1317                  * Do the xfs_itruncate_start() call before
1318                  * reserving any log space because
1319                  * itruncate_start will call into the buffer
1320                  * cache and we can't
1321                  * do that within a transaction.
1322                  */
1323                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1324                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1325                                     ip->i_d.di_size);
1326
1327                 error = xfs_trans_reserve(tp, 0,
1328                                           XFS_ITRUNCATE_LOG_RES(mp),
1329                                           0, XFS_TRANS_PERM_LOG_RES,
1330                                           XFS_ITRUNCATE_LOG_COUNT);
1331                 if (error) {
1332                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1333                         xfs_trans_cancel(tp, 0);
1334                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1335                         return (error);
1336                 }
1337
1338                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1339                 xfs_trans_ijoin(tp, ip,
1340                                 XFS_IOLOCK_EXCL |
1341                                 XFS_ILOCK_EXCL);
1342                 xfs_trans_ihold(tp, ip);
1343
1344                 error = xfs_itruncate_finish(&tp, ip,
1345                                              ip->i_d.di_size,
1346                                              XFS_DATA_FORK,
1347                                              0);
1348                 /*
1349                  * If we get an error at this point we
1350                  * simply don't bother truncating the file.
1351                  */
1352                 if (error) {
1353                         xfs_trans_cancel(tp,
1354                                          (XFS_TRANS_RELEASE_LOG_RES |
1355                                           XFS_TRANS_ABORT));
1356                 } else {
1357                         error = xfs_trans_commit(tp,
1358                                                 XFS_TRANS_RELEASE_LOG_RES,
1359                                                 NULL);
1360                 }
1361                 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1362         }
1363         return (error);
1364 }
1365
1366 /*
1367  * Free a symlink that has blocks associated with it.
1368  */
1369 STATIC int
1370 xfs_inactive_symlink_rmt(
1371         xfs_inode_t     *ip,
1372         xfs_trans_t     **tpp)
1373 {
1374         xfs_buf_t       *bp;
1375         int             committed;
1376         int             done;
1377         int             error;
1378         xfs_fsblock_t   first_block;
1379         xfs_bmap_free_t free_list;
1380         int             i;
1381         xfs_mount_t     *mp;
1382         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1383         int             nmaps;
1384         xfs_trans_t     *ntp;
1385         int             size;
1386         xfs_trans_t     *tp;
1387
1388         tp = *tpp;
1389         mp = ip->i_mount;
1390         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1391         /*
1392          * We're freeing a symlink that has some
1393          * blocks allocated to it.  Free the
1394          * blocks here.  We know that we've got
1395          * either 1 or 2 extents and that we can
1396          * free them all in one bunmapi call.
1397          */
1398         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1399         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1400                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1401                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1402                 xfs_trans_cancel(tp, 0);
1403                 *tpp = NULL;
1404                 return error;
1405         }
1406         /*
1407          * Lock the inode, fix the size, and join it to the transaction.
1408          * Hold it so in the normal path, we still have it locked for
1409          * the second transaction.  In the error paths we need it
1410          * held so the cancel won't rele it, see below.
1411          */
1412         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1413         size = (int)ip->i_d.di_size;
1414         ip->i_d.di_size = 0;
1415         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1416         xfs_trans_ihold(tp, ip);
1417         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1418         /*
1419          * Find the block(s) so we can inval and unmap them.
1420          */
1421         done = 0;
1422         XFS_BMAP_INIT(&free_list, &first_block);
1423         nmaps = sizeof(mval) / sizeof(mval[0]);
1424         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1425                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1426                         &free_list)))
1427                 goto error0;
1428         /*
1429          * Invalidate the block(s).
1430          */
1431         for (i = 0; i < nmaps; i++) {
1432                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1433                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1434                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1435                 xfs_trans_binval(tp, bp);
1436         }
1437         /*
1438          * Unmap the dead block(s) to the free_list.
1439          */
1440         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1441                         &first_block, &free_list, &done)))
1442                 goto error1;
1443         ASSERT(done);
1444         /*
1445          * Commit the first transaction.  This logs the EFI and the inode.
1446          */
1447         if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
1448                 goto error1;
1449         /*
1450          * The transaction must have been committed, since there were
1451          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1452          * The new tp has the extent freeing and EFDs.
1453          */
1454         ASSERT(committed);
1455         /*
1456          * The first xact was committed, so add the inode to the new one.
1457          * Mark it dirty so it will be logged and moved forward in the log as
1458          * part of every commit.
1459          */
1460         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1461         xfs_trans_ihold(tp, ip);
1462         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1463         /*
1464          * Get a new, empty transaction to return to our caller.
1465          */
1466         ntp = xfs_trans_dup(tp);
1467         /*
1468          * Commit the transaction containing extent freeing and EFD's.
1469          * If we get an error on the commit here or on the reserve below,
1470          * we need to unlock the inode since the new transaction doesn't
1471          * have the inode attached.
1472          */
1473         error = xfs_trans_commit(tp, 0, NULL);
1474         tp = ntp;
1475         if (error) {
1476                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1477                 goto error0;
1478         }
1479         /*
1480          * Remove the memory for extent descriptions (just bookkeeping).
1481          */
1482         if (ip->i_df.if_bytes)
1483                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1484         ASSERT(ip->i_df.if_bytes == 0);
1485         /*
1486          * Put an itruncate log reservation in the new transaction
1487          * for our caller.
1488          */
1489         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1490                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1491                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1492                 goto error0;
1493         }
1494         /*
1495          * Return with the inode locked but not joined to the transaction.
1496          */
1497         *tpp = tp;
1498         return 0;
1499
1500  error1:
1501         xfs_bmap_cancel(&free_list);
1502  error0:
1503         /*
1504          * Have to come here with the inode locked and either
1505          * (held and in the transaction) or (not in the transaction).
1506          * If the inode isn't held then cancel would iput it, but
1507          * that's wrong since this is inactive and the vnode ref
1508          * count is 0 already.
1509          * Cancel won't do anything to the inode if held, but it still
1510          * needs to be locked until the cancel is done, if it was
1511          * joined to the transaction.
1512          */
1513         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1514         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1515         *tpp = NULL;
1516         return error;
1517
1518 }
1519
1520 STATIC int
1521 xfs_inactive_symlink_local(
1522         xfs_inode_t     *ip,
1523         xfs_trans_t     **tpp)
1524 {
1525         int             error;
1526
1527         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1528         /*
1529          * We're freeing a symlink which fit into
1530          * the inode.  Just free the memory used
1531          * to hold the old symlink.
1532          */
1533         error = xfs_trans_reserve(*tpp, 0,
1534                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1535                                   0, XFS_TRANS_PERM_LOG_RES,
1536                                   XFS_ITRUNCATE_LOG_COUNT);
1537
1538         if (error) {
1539                 xfs_trans_cancel(*tpp, 0);
1540                 *tpp = NULL;
1541                 return (error);
1542         }
1543         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1544
1545         /*
1546          * Zero length symlinks _can_ exist.
1547          */
1548         if (ip->i_df.if_bytes > 0) {
1549                 xfs_idata_realloc(ip,
1550                                   -(ip->i_df.if_bytes),
1551                                   XFS_DATA_FORK);
1552                 ASSERT(ip->i_df.if_bytes == 0);
1553         }
1554         return (0);
1555 }
1556
1557 /*
1558  *
1559  */
1560 STATIC int
1561 xfs_inactive_attrs(
1562         xfs_inode_t     *ip,
1563         xfs_trans_t     **tpp,
1564         int             *commitflags)
1565 {
1566         xfs_trans_t     *tp;
1567         int             error;
1568         xfs_mount_t     *mp;
1569
1570         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1571         tp = *tpp;
1572         mp = ip->i_mount;
1573         ASSERT(ip->i_d.di_forkoff != 0);
1574         xfs_trans_commit(tp, *commitflags, NULL);
1575         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1576         *commitflags = 0;
1577
1578         error = xfs_attr_inactive(ip);
1579         if (error) {
1580                 *tpp = NULL;
1581                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1582                 return (error); /* goto out*/
1583         }
1584
1585         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1586         error = xfs_trans_reserve(tp, 0,
1587                                   XFS_IFREE_LOG_RES(mp),
1588                                   0, 0,
1589                                   XFS_DEFAULT_LOG_COUNT);
1590         if (error) {
1591                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1592                 xfs_trans_cancel(tp, 0);
1593                 *tpp = NULL;
1594                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1595                 return (error);
1596         }
1597
1598         xfs_ilock(ip, XFS_ILOCK_EXCL);
1599         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1600         xfs_trans_ihold(tp, ip);
1601         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1602
1603         ASSERT(ip->i_d.di_anextents == 0);
1604
1605         *tpp = tp;
1606         return (0);
1607 }
1608
1609 STATIC int
1610 xfs_release(
1611         bhv_desc_t      *bdp)
1612 {
1613         xfs_inode_t     *ip;
1614         vnode_t         *vp;
1615         xfs_mount_t     *mp;
1616         int             error;
1617
1618         vp = BHV_TO_VNODE(bdp);
1619         ip = XFS_BHVTOI(bdp);
1620
1621         if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
1622                 return 0;
1623         }
1624
1625         /* If this is a read-only mount, don't do this (would generate I/O) */
1626         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1627                 return 0;
1628
1629         mp = ip->i_mount;
1630
1631         if (ip->i_d.di_nlink != 0) {
1632                 if ((((ip->i_d.di_mode & IFMT) == IFREG) &&
1633                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1634                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1635                     (!(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC))) {
1636                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1637                                 return (error);
1638                         /* Update linux inode block count after free above */
1639                         LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1640                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1641                 }
1642         }
1643
1644         return 0;
1645 }
1646
1647 /*
1648  * xfs_inactive
1649  *
1650  * This is called when the vnode reference count for the vnode
1651  * goes to zero.  If the file has been unlinked, then it must
1652  * now be truncated.  Also, we clear all of the read-ahead state
1653  * kept for the inode here since the file is now closed.
1654  */
1655 STATIC int
1656 xfs_inactive(
1657         bhv_desc_t      *bdp,
1658         cred_t          *credp)
1659 {
1660         xfs_inode_t     *ip;
1661         vnode_t         *vp;
1662         xfs_trans_t     *tp;
1663         xfs_mount_t     *mp;
1664         int             error;
1665         int             commit_flags;
1666         int             truncate;
1667
1668         vp = BHV_TO_VNODE(bdp);
1669         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1670
1671         ip = XFS_BHVTOI(bdp);
1672
1673         /*
1674          * If the inode is already free, then there can be nothing
1675          * to clean up here.
1676          */
1677         if (ip->i_d.di_mode == 0) {
1678                 ASSERT(ip->i_df.if_real_bytes == 0);
1679                 ASSERT(ip->i_df.if_broot_bytes == 0);
1680                 return VN_INACTIVE_CACHE;
1681         }
1682
1683         /*
1684          * Only do a truncate if it's a regular file with
1685          * some actual space in it.  It's OK to look at the
1686          * inode's fields without the lock because we're the
1687          * only one with a reference to the inode.
1688          */
1689         truncate = ((ip->i_d.di_nlink == 0) &&
1690             ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
1691             ((ip->i_d.di_mode & IFMT) == IFREG));
1692
1693         mp = ip->i_mount;
1694
1695         if (ip->i_d.di_nlink == 0 &&
1696             DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1697                 (void) XFS_SEND_DESTROY(mp, bdp, DM_RIGHT_NULL);
1698         }
1699
1700         error = 0;
1701
1702         /* If this is a read-only mount, don't do this (would generate I/O) */
1703         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1704                 goto out;
1705
1706         if (ip->i_d.di_nlink != 0) {
1707                 if ((((ip->i_d.di_mode & IFMT) == IFREG) &&
1708                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1709                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1710                     (!(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) ||
1711                      (ip->i_delayed_blks != 0))) {
1712                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1713                                 return (VN_INACTIVE_CACHE);
1714                         /* Update linux inode block count after free above */
1715                         LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1716                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1717                 }
1718                 goto out;
1719         }
1720
1721         ASSERT(ip->i_d.di_nlink == 0);
1722
1723         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1724                 return (VN_INACTIVE_CACHE);
1725
1726         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1727         if (truncate) {
1728                 /*
1729                  * Do the xfs_itruncate_start() call before
1730                  * reserving any log space because itruncate_start
1731                  * will call into the buffer cache and we can't
1732                  * do that within a transaction.
1733                  */
1734                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1735
1736                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1737
1738                 error = xfs_trans_reserve(tp, 0,
1739                                           XFS_ITRUNCATE_LOG_RES(mp),
1740                                           0, XFS_TRANS_PERM_LOG_RES,
1741                                           XFS_ITRUNCATE_LOG_COUNT);
1742                 if (error) {
1743                         /* Don't call itruncate_cleanup */
1744                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1745                         xfs_trans_cancel(tp, 0);
1746                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1747                         return (VN_INACTIVE_CACHE);
1748                 }
1749
1750                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1751                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1752                 xfs_trans_ihold(tp, ip);
1753
1754                 /*
1755                  * normally, we have to run xfs_itruncate_finish sync.
1756                  * But if filesystem is wsync and we're in the inactive
1757                  * path, then we know that nlink == 0, and that the
1758                  * xaction that made nlink == 0 is permanently committed
1759                  * since xfs_remove runs as a synchronous transaction.
1760                  */
1761                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1762                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1763                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
1764
1765                 if (error) {
1766                         xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT);
1767                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1768                         return (VN_INACTIVE_CACHE);
1769                 }
1770         } else if ((ip->i_d.di_mode & IFMT) == IFLNK) {
1771
1772                 /*
1773                  * If we get an error while cleaning up a
1774                  * symlink we bail out.
1775                  */
1776                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1777                         xfs_inactive_symlink_rmt(ip, &tp) :
1778                         xfs_inactive_symlink_local(ip, &tp);
1779
1780                 if (error) {
1781                         ASSERT(tp == NULL);
1782                         return (VN_INACTIVE_CACHE);
1783                 }
1784
1785                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1786                 xfs_trans_ihold(tp, ip);
1787                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
1788
1789         } else {
1790                 error = xfs_trans_reserve(tp, 0,
1791                                           XFS_IFREE_LOG_RES(mp),
1792                                           0, 0,
1793                                           XFS_DEFAULT_LOG_COUNT);
1794                 if (error) {
1795                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1796                         xfs_trans_cancel(tp, 0);
1797                         return (VN_INACTIVE_CACHE);
1798                 }
1799
1800                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1801                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1802                 xfs_trans_ihold(tp, ip);
1803                 commit_flags = 0;
1804         }
1805
1806         /*
1807          * If there are attributes associated with the file
1808          * then blow them away now.  The code calls a routine
1809          * that recursively deconstructs the attribute fork.
1810          * We need to just commit the current transaction
1811          * because we can't use it for xfs_attr_inactive().
1812          */
1813         if (ip->i_d.di_anextents > 0) {
1814                 error = xfs_inactive_attrs(ip, &tp, &commit_flags);
1815                 /*
1816                  * If we got an error, the transaction is already
1817                  * cancelled, and the inode is unlocked. Just get out.
1818                  */
1819                  if (error)
1820                          return (VN_INACTIVE_CACHE);
1821         } else if (ip->i_afp) {
1822                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1823         }
1824
1825         /*
1826          * Free the inode.
1827          */
1828         error = xfs_ifree(tp, ip);
1829         if (error) {
1830                 /*
1831                  * If we fail to free the inode, shut down.  The cancel
1832                  * might do that, we need to make sure.  Otherwise the
1833                  * inode might be lost for a long time or forever.
1834                  */
1835                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1836                         cmn_err(CE_NOTE,
1837                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1838                                 error, mp->m_fsname);
1839                         xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1840                 }
1841                 xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT);
1842         } else {
1843                 /*
1844                  * Credit the quota account(s). The inode is gone.
1845                  */
1846                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1847
1848                 /*
1849                  * Just ignore errors at this point.  There is
1850                  * nothing we can do except to try to keep going.
1851                  */
1852                 (void) xfs_trans_commit(tp, commit_flags, NULL);
1853         }
1854         /*
1855          * Release the dquots held by inode, if any.
1856          */
1857         XFS_QM_DQDETACH(mp, ip);
1858
1859         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1860
1861  out:
1862         return VN_INACTIVE_CACHE;
1863 }
1864
1865
1866 /*
1867  * xfs_lookup
1868  */
1869 STATIC int
1870 xfs_lookup(
1871         bhv_desc_t              *dir_bdp,
1872         vname_t                 *dentry,
1873         vnode_t                 **vpp,
1874         int                     flags,
1875         vnode_t                 *rdir,
1876         cred_t                  *credp)
1877 {
1878         xfs_inode_t             *dp, *ip;
1879         xfs_ino_t               e_inum;
1880         int                     error;
1881         uint                    lock_mode;
1882         vnode_t                 *dir_vp;
1883
1884         dir_vp = BHV_TO_VNODE(dir_bdp);
1885         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1886
1887         dp = XFS_BHVTOI(dir_bdp);
1888
1889         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1890                 return XFS_ERROR(EIO);
1891
1892         lock_mode = xfs_ilock_map_shared(dp);
1893         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1894         if (!error) {
1895                 *vpp = XFS_ITOV(ip);
1896                 ITRACE(ip);
1897         }
1898         xfs_iunlock_map_shared(dp, lock_mode);
1899         return error;
1900 }
1901
1902
1903 #define XFS_CREATE_NEW_MAXTRIES 10000
1904
1905 /*
1906  * xfs_create (create a new file).
1907  */
1908 STATIC int
1909 xfs_create(
1910         bhv_desc_t              *dir_bdp,
1911         vname_t                 *dentry,
1912         vattr_t                 *vap,
1913         vnode_t                 **vpp,
1914         cred_t                  *credp)
1915 {
1916         char                    *name = VNAME(dentry);
1917         vnode_t                 *dir_vp;
1918         xfs_inode_t             *dp, *ip;
1919         vnode_t                 *vp=NULL;
1920         xfs_trans_t             *tp;
1921         xfs_mount_t             *mp;
1922         xfs_dev_t               rdev;
1923         int                     error;
1924         xfs_bmap_free_t         free_list;
1925         xfs_fsblock_t           first_block;
1926         boolean_t               dp_joined_to_trans;
1927         int                     dm_event_sent = 0;
1928         uint                    cancel_flags;
1929         int                     committed;
1930         xfs_prid_t              prid;
1931         struct xfs_dquot        *udqp, *gdqp;
1932         uint                    resblks;
1933         int                     dm_di_mode;
1934         int                     namelen;
1935
1936         ASSERT(!*vpp);
1937         dir_vp = BHV_TO_VNODE(dir_bdp);
1938         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1939
1940         dp = XFS_BHVTOI(dir_bdp);
1941         mp = dp->i_mount;
1942
1943         dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
1944         namelen = VNAMELEN(dentry);
1945
1946         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1947                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1948                                 dir_bdp, DM_RIGHT_NULL, NULL,
1949                                 DM_RIGHT_NULL, name, NULL,
1950                                 dm_di_mode, 0, 0);
1951
1952                 if (error)
1953                         return error;
1954                 dm_event_sent = 1;
1955         }
1956
1957         if (XFS_FORCED_SHUTDOWN(mp))
1958                 return XFS_ERROR(EIO);
1959
1960         /* Return through std_return after this point. */
1961
1962         udqp = gdqp = NULL;
1963         if (vap->va_mask & XFS_AT_PROJID)
1964                 prid = (xfs_prid_t)vap->va_projid;
1965         else
1966                 prid = (xfs_prid_t)dfltprid;
1967
1968         /*
1969          * Make sure that we have allocated dquot(s) on disk.
1970          */
1971         error = XFS_QM_DQVOPALLOC(mp, dp, current->fsuid, current->fsgid,
1972                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1973         if (error)
1974                 goto std_return;
1975
1976         ip = NULL;
1977         dp_joined_to_trans = B_FALSE;
1978
1979         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1980         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1981         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1982         /*
1983          * Initially assume that the file does not exist and
1984          * reserve the resources for that case.  If that is not
1985          * the case we'll drop the one we have and get a more
1986          * appropriate transaction later.
1987          */
1988         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1989                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1990         if (error == ENOSPC) {
1991                 resblks = 0;
1992                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1993                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1994         }
1995         if (error) {
1996                 cancel_flags = 0;
1997                 dp = NULL;
1998                 goto error_return;
1999         }
2000
2001         xfs_ilock(dp, XFS_ILOCK_EXCL);
2002
2003         XFS_BMAP_INIT(&free_list, &first_block);
2004
2005         ASSERT(ip == NULL);
2006
2007         /*
2008          * Reserve disk quota and the inode.
2009          */
2010         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2011         if (error)
2012                 goto error_return;
2013
2014         if (resblks == 0 &&
2015             (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
2016                 goto error_return;
2017         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
2018         error = xfs_dir_ialloc(&tp, dp,
2019                         MAKEIMODE(vap->va_type,vap->va_mode), 1,
2020                         rdev, credp, prid, resblks > 0,
2021                         &ip, &committed);
2022         if (error) {
2023                 if (error == ENOSPC)
2024                         goto error_return;
2025                 goto abort_return;
2026         }
2027         ITRACE(ip);
2028
2029         /*
2030          * At this point, we've gotten a newly allocated inode.
2031          * It is locked (and joined to the transaction).
2032          */
2033
2034         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
2035
2036         /*
2037          * Now we join the directory inode to the transaction.
2038          * We do not do it earlier because xfs_dir_ialloc
2039          * might commit the previous transaction (and release
2040          * all the locks).
2041          */
2042
2043         VN_HOLD(dir_vp);
2044         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2045         dp_joined_to_trans = B_TRUE;
2046
2047         error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
2048                 &first_block, &free_list,
2049                 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2050         if (error) {
2051                 ASSERT(error != ENOSPC);
2052                 goto abort_return;
2053         }
2054         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2055         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2056
2057         /*
2058          * If this is a synchronous mount, make sure that the
2059          * create transaction goes to disk before returning to
2060          * the user.
2061          */
2062         if (mp->m_flags & XFS_MOUNT_WSYNC) {
2063                 xfs_trans_set_sync(tp);
2064         }
2065
2066         dp->i_gen++;
2067
2068         /*
2069          * Attach the dquot(s) to the inodes and modify them incore.
2070          * These ids of the inode couldn't have changed since the new
2071          * inode has been locked ever since it was created.
2072          */
2073         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2074
2075         /*
2076          * xfs_trans_commit normally decrements the vnode ref count
2077          * when it unlocks the inode. Since we want to return the
2078          * vnode to the caller, we bump the vnode ref count now.
2079          */
2080         IHOLD(ip);
2081         vp = XFS_ITOV(ip);
2082
2083         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2084         if (error) {
2085                 xfs_bmap_cancel(&free_list);
2086                 goto abort_rele;
2087         }
2088
2089         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2090         if (error) {
2091                 IRELE(ip);
2092                 tp = NULL;
2093                 goto error_return;
2094         }
2095
2096         XFS_QM_DQRELE(mp, udqp);
2097         XFS_QM_DQRELE(mp, gdqp);
2098
2099         /*
2100          * Propogate the fact that the vnode changed after the
2101          * xfs_inode locks have been released.
2102          */
2103         VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2104
2105         *vpp = vp;
2106
2107         /* Fallthrough to std_return with error = 0  */
2108
2109 std_return:
2110         if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2111                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2112                                                         DM_EVENT_POSTCREATE)) {
2113                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2114                         dir_bdp, DM_RIGHT_NULL,
2115                         *vpp ? vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops):NULL,
2116                         DM_RIGHT_NULL, name, NULL,
2117                         dm_di_mode, error, 0);
2118         }
2119         return error;
2120
2121  abort_return:
2122         cancel_flags |= XFS_TRANS_ABORT;
2123         /* FALLTHROUGH */
2124  error_return:
2125
2126         if (tp != NULL)
2127                 xfs_trans_cancel(tp, cancel_flags);
2128
2129         if (!dp_joined_to_trans && (dp != NULL))
2130                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2131         XFS_QM_DQRELE(mp, udqp);
2132         XFS_QM_DQRELE(mp, gdqp);
2133
2134         goto std_return;
2135
2136  abort_rele:
2137         /*
2138          * Wait until after the current transaction is aborted to
2139          * release the inode.  This prevents recursive transactions
2140          * and deadlocks from xfs_inactive.
2141          */
2142         cancel_flags |= XFS_TRANS_ABORT;
2143         xfs_trans_cancel(tp, cancel_flags);
2144         IRELE(ip);
2145
2146         XFS_QM_DQRELE(mp, udqp);
2147         XFS_QM_DQRELE(mp, gdqp);
2148
2149         goto std_return;
2150 }
2151
2152 #ifdef DEBUG
2153 /*
2154  * Some counters to see if (and how often) we are hitting some deadlock
2155  * prevention code paths.
2156  */
2157
2158 int xfs_rm_locks;
2159 int xfs_rm_lock_delays;
2160 int xfs_rm_attempts;
2161 #endif
2162
2163 /*
2164  * The following routine will lock the inodes associated with the
2165  * directory and the named entry in the directory. The locks are
2166  * acquired in increasing inode number.
2167  *
2168  * If the entry is "..", then only the directory is locked. The
2169  * vnode ref count will still include that from the .. entry in
2170  * this case.
2171  *
2172  * There is a deadlock we need to worry about. If the locked directory is
2173  * in the AIL, it might be blocking up the log. The next inode we lock
2174  * could be already locked by another thread waiting for log space (e.g
2175  * a permanent log reservation with a long running transaction (see
2176  * xfs_itruncate_finish)). To solve this, we must check if the directory
2177  * is in the ail and use lock_nowait. If we can't lock, we need to
2178  * drop the inode lock on the directory and try again. xfs_iunlock will
2179  * potentially push the tail if we were holding up the log.
2180  */
2181 STATIC int
2182 xfs_lock_dir_and_entry(
2183         xfs_inode_t     *dp,
2184         vname_t         *dentry,
2185         xfs_inode_t     *ip)    /* inode of entry 'name' */
2186 {
2187         int             attempts;
2188         xfs_ino_t       e_inum;
2189         xfs_inode_t     *ips[2];
2190         xfs_log_item_t  *lp;
2191
2192 #ifdef DEBUG
2193         xfs_rm_locks++;
2194 #endif
2195         attempts = 0;
2196
2197 again:
2198         xfs_ilock(dp, XFS_ILOCK_EXCL);
2199
2200         e_inum = ip->i_ino;
2201
2202         ITRACE(ip);
2203
2204         /*
2205          * We want to lock in increasing inum. Since we've already
2206          * acquired the lock on the directory, we may need to release
2207          * if if the inum of the entry turns out to be less.
2208          */
2209         if (e_inum > dp->i_ino) {
2210                 /*
2211                  * We are already in the right order, so just
2212                  * lock on the inode of the entry.
2213                  * We need to use nowait if dp is in the AIL.
2214                  */
2215
2216                 lp = (xfs_log_item_t *)dp->i_itemp;
2217                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2218                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2219                                 attempts++;
2220 #ifdef DEBUG
2221                                 xfs_rm_attempts++;
2222 #endif
2223
2224                                 /*
2225                                  * Unlock dp and try again.
2226                                  * xfs_iunlock will try to push the tail
2227                                  * if the inode is in the AIL.
2228                                  */
2229
2230                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2231
2232                                 if ((attempts % 5) == 0) {
2233                                         delay(1); /* Don't just spin the CPU */
2234 #ifdef DEBUG
2235                                         xfs_rm_lock_delays++;
2236 #endif
2237                                 }
2238                                 goto again;
2239                         }
2240                 } else {
2241                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2242                 }
2243         } else if (e_inum < dp->i_ino) {
2244                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2245
2246                 ips[0] = ip;
2247                 ips[1] = dp;
2248                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2249         }
2250         /* else  e_inum == dp->i_ino */
2251         /*     This can happen if we're asked to lock /x/..
2252          *     the entry is "..", which is also the parent directory.
2253          */
2254
2255         return 0;
2256 }
2257
2258 #ifdef DEBUG
2259 int xfs_locked_n;
2260 int xfs_small_retries;
2261 int xfs_middle_retries;
2262 int xfs_lots_retries;
2263 int xfs_lock_delays;
2264 #endif
2265
2266 /*
2267  * The following routine will lock n inodes in exclusive mode.
2268  * We assume the caller calls us with the inodes in i_ino order.
2269  *
2270  * We need to detect deadlock where an inode that we lock
2271  * is in the AIL and we start waiting for another inode that is locked
2272  * by a thread in a long running transaction (such as truncate). This can
2273  * result in deadlock since the long running trans might need to wait
2274  * for the inode we just locked in order to push the tail and free space
2275  * in the log.
2276  */
2277 void
2278 xfs_lock_inodes(
2279         xfs_inode_t     **ips,
2280         int             inodes,
2281         int             first_locked,
2282         uint            lock_mode)
2283 {
2284         int             attempts = 0, i, j, try_lock;
2285         xfs_log_item_t  *lp;
2286
2287         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2288
2289         if (first_locked) {
2290                 try_lock = 1;
2291                 i = 1;
2292         } else {
2293                 try_lock = 0;
2294                 i = 0;
2295         }
2296
2297 again:
2298         for (; i < inodes; i++) {
2299                 ASSERT(ips[i]);
2300
2301                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2302                         continue;
2303
2304                 /*
2305                  * If try_lock is not set yet, make sure all locked inodes
2306                  * are not in the AIL.
2307                  * If any are, set try_lock to be used later.
2308                  */
2309
2310                 if (!try_lock) {
2311                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2312                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2313                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2314                                         try_lock++;
2315                                 }
2316                         }
2317                 }
2318
2319                 /*
2320                  * If any of the previous locks we have locked is in the AIL,
2321                  * we must TRY to get the second and subsequent locks. If
2322                  * we can't get any, we must release all we have
2323                  * and try again.
2324                  */
2325
2326                 if (try_lock) {
2327                         /* try_lock must be 0 if i is 0. */
2328                         /*
2329                          * try_lock means we have an inode locked
2330                          * that is in the AIL.
2331                          */
2332                         ASSERT(i != 0);
2333                         if (!xfs_ilock_nowait(ips[i], lock_mode)) {
2334                                 attempts++;
2335
2336                                 /*
2337                                  * Unlock all previous guys and try again.
2338                                  * xfs_iunlock will try to push the tail
2339                                  * if the inode is in the AIL.
2340                                  */
2341
2342                                 for(j = i - 1; j >= 0; j--) {
2343
2344                                         /*
2345                                          * Check to see if we've already
2346                                          * unlocked this one.
2347                                          * Not the first one going back,
2348                                          * and the inode ptr is the same.
2349                                          */
2350                                         if ((j != (i - 1)) && ips[j] ==
2351                                                                 ips[j+1])
2352                                                 continue;
2353
2354                                         xfs_iunlock(ips[j], lock_mode);
2355                                 }
2356
2357                                 if ((attempts % 5) == 0) {
2358                                         delay(1); /* Don't just spin the CPU */
2359 #ifdef DEBUG
2360                                         xfs_lock_delays++;
2361 #endif
2362                                 }
2363                                 i = 0;
2364                                 try_lock = 0;
2365                                 goto again;
2366                         }
2367                 } else {
2368                         xfs_ilock(ips[i], lock_mode);
2369                 }
2370         }
2371
2372 #ifdef DEBUG
2373         if (attempts) {
2374                 if (attempts < 5) xfs_small_retries++;
2375                 else if (attempts < 100) xfs_middle_retries++;
2376                 else xfs_lots_retries++;
2377         } else {
2378                 xfs_locked_n++;
2379         }
2380 #endif
2381 }
2382
/*
 * REMOVE_DEBUG_TRACE(x)
 *
 * In DEBUG builds, store x in remove_which_error_return.  The callers in
 * this file pass __LINE__, so the variable shows which error exit was
 * taken most recently.  In non-DEBUG builds the macro does nothing and
 * does not evaluate x.
 *
 * Both variants expand to exactly one statement that needs a trailing
 * semicolon, so the macro is safe as the body of an unbraced if/else.
 */
#ifdef	DEBUG
int remove_which_error_return = 0;
#define REMOVE_DEBUG_TRACE(x)	\
	do { remove_which_error_return = (x); } while (0)
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)	do { } while (0)
#endif	/* ! DEBUG */
2389
2390
2391 /*
2392  * xfs_remove
2393  *
2394  */
2395 STATIC int
2396 xfs_remove(
2397         bhv_desc_t              *dir_bdp,
2398         vname_t                 *dentry,
2399         cred_t                  *credp)
2400 {
2401         vnode_t                 *dir_vp;
2402         char                    *name = VNAME(dentry);
2403         xfs_inode_t             *dp, *ip;
2404         xfs_trans_t             *tp = NULL;
2405         xfs_mount_t             *mp;
2406         int                     error = 0;
2407         xfs_bmap_free_t         free_list;
2408         xfs_fsblock_t           first_block;
2409         int                     cancel_flags;
2410         int                     committed;
2411         int                     dm_di_mode = 0;
2412         int                     link_zero;
2413         uint                    resblks;
2414         int                     namelen;
2415
2416         dir_vp = BHV_TO_VNODE(dir_bdp);
2417         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2418
2419         dp = XFS_BHVTOI(dir_bdp);
2420         mp = dp->i_mount;
2421
2422         if (XFS_FORCED_SHUTDOWN(mp))
2423                 return XFS_ERROR(EIO);
2424
2425         namelen = VNAMELEN(dentry);
2426
2427         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2428                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_bdp,
2429                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2430                                         name, NULL, 0, 0, 0);
2431                 if (error)
2432                         return error;
2433         }
2434
2435         /* From this point on, return through std_return */
2436         ip = NULL;
2437
2438         /*
2439          * We need to get a reference to ip before we get our log
2440          * reservation. The reason for this is that we cannot call
2441          * xfs_iget for an inode for which we do not have a reference
2442          * once we've acquired a log reservation. This is because the
2443          * inode we are trying to get might be in xfs_inactive going
2444          * for a log reservation. Since we'll have to wait for the
2445          * inactive code to complete before returning from xfs_iget,
2446          * we need to make sure that we don't have log space reserved
2447          * when we call xfs_iget.  Instead we get an unlocked referece
2448          * to the inode before getting our log reservation.
2449          */
2450         error = xfs_get_dir_entry(dentry, &ip);
2451         if (error) {
2452                 REMOVE_DEBUG_TRACE(__LINE__);
2453                 goto std_return;
2454         }
2455
2456         dm_di_mode = ip->i_d.di_mode;
2457
2458         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2459
2460         ITRACE(ip);
2461
2462         error = XFS_QM_DQATTACH(mp, dp, 0);
2463         if (!error && dp != ip)
2464                 error = XFS_QM_DQATTACH(mp, ip, 0);
2465         if (error) {
2466                 REMOVE_DEBUG_TRACE(__LINE__);
2467                 IRELE(ip);
2468                 goto std_return;
2469         }
2470
2471         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2472         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2473         /*
2474          * We try to get the real space reservation first,
2475          * allowing for directory btree deletion(s) implying
2476          * possible bmap insert(s).  If we can't get the space
2477          * reservation then we use 0 instead, and avoid the bmap
2478          * btree insert(s) in the directory code by, if the bmap
2479          * insert tries to happen, instead trimming the LAST
2480          * block from the directory.
2481          */
2482         resblks = XFS_REMOVE_SPACE_RES(mp);
2483         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2484                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2485         if (error == ENOSPC) {
2486                 resblks = 0;
2487                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2488                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2489         }
2490         if (error) {
2491                 ASSERT(error != ENOSPC);
2492                 REMOVE_DEBUG_TRACE(__LINE__);
2493                 xfs_trans_cancel(tp, 0);
2494                 IRELE(ip);
2495                 return error;
2496         }
2497
2498         error = xfs_lock_dir_and_entry(dp, dentry, ip);
2499         if (error) {
2500                 REMOVE_DEBUG_TRACE(__LINE__);
2501                 xfs_trans_cancel(tp, cancel_flags);
2502                 IRELE(ip);
2503                 goto std_return;
2504         }
2505
2506         /*
2507          * At this point, we've gotten both the directory and the entry
2508          * inodes locked.
2509          */
2510         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2511         if (dp != ip) {
2512                 /*
2513                  * Increment vnode ref count only in this case since
2514                  * there's an extra vnode reference in the case where
2515                  * dp == ip.
2516                  */
2517                 IHOLD(dp);
2518                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2519         }
2520
2521         if ((error = _MAC_XFS_IACCESS(ip, MACWRITE, credp))) {
2522                 REMOVE_DEBUG_TRACE(__LINE__);
2523                 goto error_return;
2524         }
2525
2526         /*
2527          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2528          */
2529         XFS_BMAP_INIT(&free_list, &first_block);
2530         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2531                 &first_block, &free_list, 0);
2532         if (error) {
2533                 ASSERT(error != ENOENT);
2534                 REMOVE_DEBUG_TRACE(__LINE__);
2535                 goto error1;
2536         }
2537         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2538
2539         dp->i_gen++;
2540         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2541
2542         error = xfs_droplink(tp, ip);
2543         if (error) {
2544                 REMOVE_DEBUG_TRACE(__LINE__);
2545                 goto error1;
2546         }
2547
2548         /* Determine if this is the last link while
2549          * we are in the transaction.
2550          */
2551         link_zero = (ip)->i_d.di_nlink==0;
2552
2553         /*
2554          * Take an extra ref on the inode so that it doesn't
2555          * go to xfs_inactive() from within the commit.
2556          */
2557         IHOLD(ip);
2558
2559         /*
2560          * If this is a synchronous mount, make sure that the
2561          * remove transaction goes to disk before returning to
2562          * the user.
2563          */
2564         if (mp->m_flags & XFS_MOUNT_WSYNC) {
2565                 xfs_trans_set_sync(tp);
2566         }
2567
2568         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2569         if (error) {
2570                 REMOVE_DEBUG_TRACE(__LINE__);
2571                 goto error_rele;
2572         }
2573
2574         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2575         if (error) {
2576                 IRELE(ip);
2577                 goto std_return;
2578         }
2579
2580         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2581
2582         /*
2583          * Let interposed file systems know about removed links.
2584          */
2585         VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2586
2587         IRELE(ip);
2588
2589 /*      Fall through to std_return with error = 0 */
2590  std_return:
2591         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2592                                                 DM_EVENT_POSTREMOVE)) {
2593                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2594                                 dir_bdp, DM_RIGHT_NULL,
2595                                 NULL, DM_RIGHT_NULL,
2596                                 name, NULL, dm_di_mode, error, 0);
2597         }
2598         return error;
2599
2600  error1:
2601         xfs_bmap_cancel(&free_list);
2602         cancel_flags |= XFS_TRANS_ABORT;
2603
2604  error_return:
2605         xfs_trans_cancel(tp, cancel_flags);
2606         goto std_return;
2607
2608  error_rele:
2609         /*
2610          * In this case make sure to not release the inode until after
2611          * the current transaction is aborted.  Releasing it beforehand
2612          * can cause us to go to xfs_inactive and start a recursive
2613          * transaction which can easily deadlock with the current one.
2614          */
2615         xfs_bmap_cancel(&free_list);
2616         cancel_flags |= XFS_TRANS_ABORT;
2617         xfs_trans_cancel(tp, cancel_flags);
2618
2619         IRELE(ip);
2620
2621         goto std_return;
2622 }
2623
2624
2625 /*
2626  * xfs_link
2627  *
2628  */
2629 STATIC int
2630 xfs_link(
2631         bhv_desc_t              *target_dir_bdp,
2632         vnode_t                 *src_vp,
2633         vname_t                 *dentry,
2634         cred_t                  *credp)
2635 {
2636         xfs_inode_t             *tdp, *sip;
2637         xfs_trans_t             *tp;
2638         xfs_mount_t             *mp;
2639         xfs_inode_t             *ips[2];
2640         int                     error;
2641         xfs_bmap_free_t         free_list;
2642         xfs_fsblock_t           first_block;
2643         int                     cancel_flags;
2644         int                     committed;
2645         vnode_t                 *target_dir_vp;
2646         bhv_desc_t              *src_bdp;
2647         int                     resblks;
2648         char                    *target_name = VNAME(dentry);
2649         int                     target_namelen;
2650
2651         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2652         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2653         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2654
2655         target_namelen = VNAMELEN(dentry);
2656         if (src_vp->v_type == VDIR)
2657                 return XFS_ERROR(EPERM);
2658
2659         /*
2660          * For now, manually find the XFS behavior descriptor for
2661          * the source vnode.  If it doesn't exist then something
2662          * is wrong and we should just return an error.
2663          * Eventually we need to figure out how link is going to
2664          * work in the face of stacked vnodes.
2665          */
2666         src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
2667         if (src_bdp == NULL) {
2668                 return XFS_ERROR(EXDEV);
2669         }
2670         sip = XFS_BHVTOI(src_bdp);
2671         tdp = XFS_BHVTOI(target_dir_bdp);
2672         mp = tdp->i_mount;
2673         if (XFS_FORCED_SHUTDOWN(mp))
2674                 return XFS_ERROR(EIO);
2675
2676         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2677                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2678                                         target_dir_bdp, DM_RIGHT_NULL,
2679                                         src_bdp, DM_RIGHT_NULL,
2680                                         target_name, NULL, 0, 0, 0);
2681                 if (error)
2682                         return error;
2683         }
2684
2685         /* Return through std_return after this point. */
2686
2687         error = XFS_QM_DQATTACH(mp, sip, 0);
2688         if (!error && sip != tdp)
2689                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2690         if (error)
2691                 goto std_return;
2692
2693         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2694         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2695         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2696         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2697                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2698         if (error == ENOSPC) {
2699                 resblks = 0;
2700                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2701                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2702         }
2703         if (error) {
2704                 cancel_flags = 0;
2705                 goto error_return;
2706         }
2707
2708         if (sip->i_ino < tdp->i_ino) {
2709                 ips[0] = sip;
2710                 ips[1] = tdp;
2711         } else {
2712                 ips[0] = tdp;
2713                 ips[1] = sip;
2714         }
2715
2716         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2717
2718         /*
2719          * Increment vnode ref counts since xfs_trans_commit &
2720          * xfs_trans_cancel will both unlock the inodes and
2721          * decrement the associated ref counts.
2722          */
2723         VN_HOLD(src_vp);
2724         VN_HOLD(target_dir_vp);
2725         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2726         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2727
2728         /*
2729          * If the source has too many links, we can't make any more to it.
2730          */
2731         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2732                 error = XFS_ERROR(EMLINK);
2733                 goto error_return;
2734         }
2735
2736         if (resblks == 0 &&
2737             (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
2738                         target_namelen)))
2739                 goto error_return;
2740
2741         XFS_BMAP_INIT(&free_list, &first_block);
2742
2743         error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
2744                                    sip->i_ino, &first_block, &free_list,
2745                                    resblks);
2746         if (error)
2747                 goto abort_return;
2748         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2749         tdp->i_gen++;
2750         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2751
2752         error = xfs_bumplink(tp, sip);
2753         if (error) {
2754                 goto abort_return;
2755         }
2756
2757         /*
2758          * If this is a synchronous mount, make sure that the
2759          * link transaction goes to disk before returning to
2760          * the user.
2761          */
2762         if (mp->m_flags & XFS_MOUNT_WSYNC) {
2763                 xfs_trans_set_sync(tp);
2764         }
2765
2766         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
2767         if (error) {
2768                 xfs_bmap_cancel(&free_list);
2769                 goto abort_return;
2770         }
2771
2772         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2773         if (error) {
2774                 goto std_return;
2775         }
2776
2777         /* Fall through to std_return with error = 0. */
2778 std_return:
2779         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2780                                                 DM_EVENT_POSTLINK)) {
2781                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2782                                 target_dir_bdp, DM_RIGHT_NULL,
2783                                 src_bdp, DM_RIGHT_NULL,
2784                                 target_name, NULL, 0, error, 0);
2785         }
2786         return error;
2787
2788  abort_return:
2789         cancel_flags |= XFS_TRANS_ABORT;
2790         /* FALLTHROUGH */
2791  error_return:
2792         xfs_trans_cancel(tp, cancel_flags);
2793
2794         goto std_return;
2795 }
2796 /*
2797  * xfs_mkdir
2798  *
2799  */
2800 STATIC int
2801 xfs_mkdir(
2802         bhv_desc_t              *dir_bdp,
2803         vname_t                 *dentry,
2804         vattr_t                 *vap,
2805         vnode_t                 **vpp,
2806         cred_t                  *credp)
2807 {
2808         char                    *dir_name = VNAME(dentry);
2809         xfs_inode_t             *dp;
2810         xfs_inode_t             *cdp;   /* inode of created dir */
2811         vnode_t                 *cvp;   /* vnode of created dir */
2812         xfs_trans_t             *tp;
2813         xfs_dev_t               rdev;
2814         xfs_mount_t             *mp;
2815         int                     cancel_flags;
2816         int                     error;
2817         int                     committed;
2818         xfs_bmap_free_t         free_list;
2819         xfs_fsblock_t           first_block;
2820         vnode_t                 *dir_vp;
2821         boolean_t               dp_joined_to_trans;
2822         boolean_t               created = B_FALSE;
2823         int                     dm_event_sent = 0;
2824         xfs_prid_t              prid;
2825         struct xfs_dquot        *udqp, *gdqp;
2826         uint                    resblks;
2827         int                     dm_di_mode;
2828         int                     dir_namelen;
2829
2830         dir_vp = BHV_TO_VNODE(dir_bdp);
2831         dp = XFS_BHVTOI(dir_bdp);
2832         mp = dp->i_mount;
2833
2834         if (XFS_FORCED_SHUTDOWN(mp))
2835                 return XFS_ERROR(EIO);
2836
2837         dir_namelen = VNAMELEN(dentry);
2838
2839         tp = NULL;
2840         dp_joined_to_trans = B_FALSE;
2841         dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
2842
2843         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2844                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2845                                         dir_bdp, DM_RIGHT_NULL, NULL,
2846                                         DM_RIGHT_NULL, dir_name, NULL,
2847                                         dm_di_mode, 0, 0);
2848                 if (error)
2849                         return error;
2850                 dm_event_sent = 1;
2851         }
2852
2853         /* Return through std_return after this point. */
2854
2855         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2856
2857         mp = dp->i_mount;
2858         udqp = gdqp = NULL;
2859         if (vap->va_mask & XFS_AT_PROJID)
2860                 prid = (xfs_prid_t)vap->va_projid;
2861         else
2862                 prid = (xfs_prid_t)dfltprid;
2863
2864         /*
2865          * Make sure that we have allocated dquot(s) on disk.
2866          */
2867         error = XFS_QM_DQVOPALLOC(mp, dp, current->fsuid, current->fsgid,
2868                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2869         if (error)
2870                 goto std_return;
2871
2872         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2873         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2874         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2875         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2876                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2877         if (error == ENOSPC) {
2878                 resblks = 0;
2879                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2880                                           XFS_TRANS_PERM_LOG_RES,
2881                                           XFS_MKDIR_LOG_COUNT);
2882         }
2883         if (error) {
2884                 cancel_flags = 0;
2885                 dp = NULL;
2886                 goto error_return;
2887         }
2888
2889         xfs_ilock(dp, XFS_ILOCK_EXCL);
2890
2891         /*
2892          * Check for directory link count overflow.
2893          */
2894         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2895                 error = XFS_ERROR(EMLINK);
2896                 goto error_return;
2897         }
2898
2899         /*
2900          * Reserve disk quota and the inode.
2901          */
2902         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2903         if (error)
2904                 goto error_return;
2905
2906         if (resblks == 0 &&
2907             (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
2908                 goto error_return;
2909         /*
2910          * create the directory inode.
2911          */
2912         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
2913         error = xfs_dir_ialloc(&tp, dp,
2914                         MAKEIMODE(vap->va_type,vap->va_mode), 2,
2915                         rdev, credp, prid, resblks > 0,
2916                 &cdp, NULL);
2917         if (error) {
2918                 if (error == ENOSPC)
2919                         goto error_return;
2920                 goto abort_return;
2921         }
2922         ITRACE(cdp);
2923
2924         /*
2925          * Now we add the directory inode to the transaction.
2926          * We waited until now since xfs_dir_ialloc might start
2927          * a new transaction.  Had we joined the transaction
2928          * earlier, the locks might have gotten released.
2929          */
2930         VN_HOLD(dir_vp);
2931         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2932         dp_joined_to_trans = B_TRUE;
2933
2934         XFS_BMAP_INIT(&free_list, &first_block);
2935
2936         error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
2937                         cdp->i_ino, &first_block, &free_list,
2938                         resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2939         if (error) {
2940                 ASSERT(error != ENOSPC);
2941                 goto error1;
2942         }
2943         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2944
2945         /*
2946          * Bump the in memory version number of the parent directory
2947          * so that other processes accessing it will recognize that
2948          * the directory has changed.
2949          */
2950         dp->i_gen++;
2951
2952         error = XFS_DIR_INIT(mp, tp, cdp, dp);
2953         if (error) {
2954                 goto error2;
2955         }
2956
2957         cdp->i_gen = 1;
2958         error = xfs_bumplink(tp, dp);
2959         if (error) {
2960                 goto error2;
2961         }
2962
2963         cvp = XFS_ITOV(cdp);
2964
2965         created = B_TRUE;
2966
2967         *vpp = cvp;
2968         IHOLD(cdp);
2969
2970         /*
2971          * Attach the dquots to the new inode and modify the icount incore.
2972          */
2973         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2974
2975         /*
2976          * If this is a synchronous mount, make sure that the
2977          * mkdir transaction goes to disk before returning to
2978          * the user.
2979          */
2980         if (mp->m_flags & XFS_MOUNT_WSYNC) {
2981                 xfs_trans_set_sync(tp);
2982         }
2983
2984         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2985         if (error) {
2986                 IRELE(cdp);
2987                 goto error2;
2988         }
2989
2990         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2991         XFS_QM_DQRELE(mp, udqp);
2992         XFS_QM_DQRELE(mp, gdqp);
2993         if (error) {
2994                 IRELE(cdp);
2995         }
2996
2997         /* Fall through to std_return with error = 0 or errno from
2998          * xfs_trans_commit. */
2999
3000 std_return:
3001         if ( (created || (error != 0 && dm_event_sent != 0)) &&
3002                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3003                                                 DM_EVENT_POSTCREATE)) {
3004                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
3005                                         dir_bdp, DM_RIGHT_NULL,
3006                                         created ? XFS_ITOBHV(cdp):NULL,
3007                                         DM_RIGHT_NULL,
3008                                         dir_name, NULL,
3009                                         dm_di_mode, error, 0);
3010         }
3011         return error;
3012
3013  error2:
3014  error1:
3015         xfs_bmap_cancel(&free_list);
3016  abort_return:
3017         cancel_flags |= XFS_TRANS_ABORT;
3018  error_return:
3019         xfs_trans_cancel(tp, cancel_flags);
3020         XFS_QM_DQRELE(mp, udqp);
3021         XFS_QM_DQRELE(mp, gdqp);
3022
3023         if (!dp_joined_to_trans && (dp != NULL)) {
3024                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3025         }
3026
3027         goto std_return;
3028 }
3029
3030
3031 /*
3032  * xfs_rmdir
3033  *
3034  */
3035 STATIC int
3036 xfs_rmdir(
3037         bhv_desc_t              *dir_bdp,
3038         vname_t                 *dentry,
3039         cred_t                  *credp)
3040 {
3041         char                    *name = VNAME(dentry);
3042         xfs_inode_t             *dp;
3043         xfs_inode_t             *cdp;   /* child directory */
3044         xfs_trans_t             *tp;
3045         xfs_mount_t             *mp;
3046         int                     error;
3047         xfs_bmap_free_t         free_list;
3048         xfs_fsblock_t           first_block;
3049         int                     cancel_flags;
3050         int                     committed;
3051         vnode_t                 *dir_vp;
3052         int                     dm_di_mode = 0;
3053         int                     last_cdp_link;
3054         int                     namelen;
3055         uint                    resblks;
3056
3057         dir_vp = BHV_TO_VNODE(dir_bdp);
3058         dp = XFS_BHVTOI(dir_bdp);
3059         mp = dp->i_mount;
3060
3061         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3062
3063         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3064                 return XFS_ERROR(EIO);
3065         namelen = VNAMELEN(dentry);
3066
3067         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3068                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3069                                         dir_bdp, DM_RIGHT_NULL,
3070                                         NULL, DM_RIGHT_NULL,
3071                                         name, NULL, 0, 0, 0);
3072                 if (error)
3073                         return XFS_ERROR(error);
3074         }
3075
3076         /* Return through std_return after this point. */
3077
3078         cdp = NULL;
3079
3080         /*
3081          * We need to get a reference to cdp before we get our log
3082          * reservation.  The reason for this is that we cannot call
3083          * xfs_iget for an inode for which we do not have a reference
3084          * once we've acquired a log reservation.  This is because the
3085          * inode we are trying to get might be in xfs_inactive going
3086          * for a log reservation.  Since we'll have to wait for the
3087          * inactive code to complete before returning from xfs_iget,
3088          * we need to make sure that we don't have log space reserved
3089          * when we call xfs_iget.  Instead we get an unlocked referece
3090          * to the inode before getting our log reservation.
3091          */
3092         error = xfs_get_dir_entry(dentry, &cdp);
3093         if (error) {
3094                 REMOVE_DEBUG_TRACE(__LINE__);
3095                 goto std_return;
3096         }
3097         mp = dp->i_mount;
3098         dm_di_mode = cdp->i_d.di_mode;
3099
3100         /*
3101          * Get the dquots for the inodes.
3102          */
3103         error = XFS_QM_DQATTACH(mp, dp, 0);
3104         if (!error && dp != cdp)
3105                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3106         if (error) {
3107                 IRELE(cdp);
3108                 REMOVE_DEBUG_TRACE(__LINE__);
3109                 goto std_return;
3110         }
3111
3112         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3113         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3114         /*
3115          * We try to get the real space reservation first,
3116          * allowing for directory btree deletion(s) implying
3117          * possible bmap insert(s).  If we can't get the space
3118          * reservation then we use 0 instead, and avoid the bmap
3119          * btree insert(s) in the directory code by, if the bmap
3120          * insert tries to happen, instead trimming the LAST
3121          * block from the directory.
3122          */
3123         resblks = XFS_REMOVE_SPACE_RES(mp);
3124         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3125                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3126         if (error == ENOSPC) {
3127                 resblks = 0;
3128                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3129                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3130         }
3131         if (error) {
3132                 ASSERT(error != ENOSPC);
3133                 cancel_flags = 0;
3134                 IRELE(cdp);
3135                 goto error_return;
3136         }
3137         XFS_BMAP_INIT(&free_list, &first_block);
3138
3139         /*
3140          * Now lock the child directory inode and the parent directory
3141          * inode in the proper order.  This will take care of validating
3142          * that the directory entry for the child directory inode has
3143          * not changed while we were obtaining a log reservation.
3144          */
3145         error = xfs_lock_dir_and_entry(dp, dentry, cdp);
3146         if (error) {
3147                 xfs_trans_cancel(tp, cancel_flags);
3148                 IRELE(cdp);
3149                 goto std_return;
3150         }
3151
3152         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3153         if (dp != cdp) {
3154                 /*
3155                  * Only increment the parent directory vnode count if
3156                  * we didn't bump it in looking up cdp.  The only time
3157                  * we don't bump it is when we're looking up ".".
3158                  */
3159                 VN_HOLD(dir_vp);
3160         }
3161
3162         ITRACE(cdp);
3163         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3164
3165         if ((error = _MAC_XFS_IACCESS(cdp, MACWRITE, credp))) {
3166                 goto error_return;
3167         }
3168
3169         ASSERT(cdp->i_d.di_nlink >= 2);
3170         if (cdp->i_d.di_nlink != 2) {
3171                 error = XFS_ERROR(ENOTEMPTY);
3172                 goto error_return;
3173         }
3174         if (!XFS_DIR_ISEMPTY(mp, cdp)) {
3175                 error = XFS_ERROR(ENOTEMPTY);
3176                 goto error_return;
3177         }
3178
3179         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
3180                 &first_block, &free_list, resblks);
3181         if (error) {
3182                 goto error1;
3183         }
3184
3185         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3186
3187         /*
3188          * Bump the in memory generation count on the parent
3189          * directory so that other can know that it has changed.
3190          */
3191         dp->i_gen++;
3192
3193         /*
3194          * Drop the link from cdp's "..".
3195          */
3196         error = xfs_droplink(tp, dp);
3197         if (error) {
3198                 goto error1;
3199         }
3200
3201         /*
3202          * Drop the link from dp to cdp.
3203          */
3204         error = xfs_droplink(tp, cdp);
3205         if (error) {
3206                 goto error1;
3207         }
3208
3209         /*
3210          * Drop the "." link from cdp to self.
3211          */
3212         error = xfs_droplink(tp, cdp);
3213         if (error) {
3214                 goto error1;
3215         }
3216
3217         /* Determine these before committing transaction */
3218         last_cdp_link = (cdp)->i_d.di_nlink==0;
3219
3220         /*
3221          * Take an extra ref on the child vnode so that it
3222          * does not go to xfs_inactive() from within the commit.
3223          */
3224         IHOLD(cdp);
3225
3226         /*
3227          * If this is a synchronous mount, make sure that the
3228          * rmdir transaction goes to disk before returning to
3229          * the user.
3230          */
3231         if (mp->m_flags & XFS_MOUNT_WSYNC) {
3232                 xfs_trans_set_sync(tp);
3233         }
3234
3235         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
3236         if (error) {
3237                 xfs_bmap_cancel(&free_list);
3238                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3239                                  XFS_TRANS_ABORT));
3240                 IRELE(cdp);
3241                 goto std_return;
3242         }
3243
3244         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3245         if (error) {
3246                 IRELE(cdp);
3247                 goto std_return;
3248         }
3249
3250
3251         /*
3252          * Let interposed file systems know about removed links.
3253          */
3254         VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3255
3256         IRELE(cdp);
3257
3258         /* Fall through to std_return with error = 0 or the errno
3259          * from xfs_trans_commit. */
3260 std_return:
3261         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3262                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3263                                         dir_bdp, DM_RIGHT_NULL,
3264                                         NULL, DM_RIGHT_NULL,
3265                                         name, NULL, dm_di_mode,
3266                                         error, 0);
3267         }
3268         return error;
3269
3270  error1:
3271         xfs_bmap_cancel(&free_list);
3272         cancel_flags |= XFS_TRANS_ABORT;
3273  error_return:
3274         xfs_trans_cancel(tp, cancel_flags);
3275         goto std_return;
3276 }
3277
3278
3279 /*
3280  * xfs_readdir
3281  *
3282  * Read dp's entries starting at uiop->uio_offset and translate them into
3283  * bufsize bytes worth of struct dirents starting at bufbase.
3284  */
3285 STATIC int
3286 xfs_readdir(
3287         bhv_desc_t      *dir_bdp,
3288         uio_t           *uiop,
3289         cred_t          *credp,
3290         int             *eofp)
3291 {
3292         xfs_inode_t     *dp;
3293         xfs_trans_t     *tp = NULL;
3294         int             error = 0;
3295         uint            lock_mode;
3296         xfs_off_t       start_offset;
3297
3298         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3299                                                (inst_t *)__return_address);
3300         dp = XFS_BHVTOI(dir_bdp);
3301
3302         if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3303                 return XFS_ERROR(EIO);
3304         }
3305
3306         lock_mode = xfs_ilock_map_shared(dp);
3307         start_offset = uiop->uio_offset;
3308         error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
3309         if (start_offset != uiop->uio_offset) {
3310                 xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
3311         }
3312         xfs_iunlock_map_shared(dp, lock_mode);
3313         return error;
3314 }
3315
3316
3317 /*
3318  * xfs_symlink
3319  *
3320  */
3321 STATIC int
3322 xfs_symlink(
3323         bhv_desc_t              *dir_bdp,
3324         vname_t                 *dentry,
3325         vattr_t                 *vap,
3326         char                    *target_path,
3327         vnode_t                 **vpp,
3328         cred_t                  *credp)
3329 {
3330         xfs_trans_t             *tp;
3331         xfs_mount_t             *mp;
3332         xfs_inode_t             *dp;
3333         xfs_inode_t             *ip;
3334         int                     error;
3335         int                     pathlen;
3336         xfs_dev_t               rdev;
3337         xfs_bmap_free_t         free_list;
3338         xfs_fsblock_t           first_block;
3339         boolean_t               dp_joined_to_trans;
3340         vnode_t                 *dir_vp;
3341         uint                    cancel_flags;
3342         int                     committed;
3343         xfs_fileoff_t           first_fsb;
3344         xfs_filblks_t           fs_blocks;
3345         int                     nmaps;
3346         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3347         xfs_daddr_t             d;
3348         char                    *cur_chunk;
3349         int                     byte_cnt;
3350         int                     n;
3351         xfs_buf_t               *bp;
3352         xfs_prid_t              prid;
3353         struct xfs_dquot        *udqp, *gdqp;
3354         uint                    resblks;
3355         char                    *link_name = VNAME(dentry);
3356         int                     link_namelen;
3357
3358         *vpp = NULL;
3359         dir_vp = BHV_TO_VNODE(dir_bdp);
3360         dp = XFS_BHVTOI(dir_bdp);
3361         dp_joined_to_trans = B_FALSE;
3362         error = 0;
3363         ip = NULL;
3364         tp = NULL;
3365
3366         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3367
3368         mp = dp->i_mount;
3369
3370         if (XFS_FORCED_SHUTDOWN(mp))
3371                 return XFS_ERROR(EIO);
3372
3373         link_namelen = VNAMELEN(dentry);
3374
3375         /*
3376          * Check component lengths of the target path name.
3377          */
3378         pathlen = strlen(target_path);
3379         if (pathlen >= MAXPATHLEN)      /* total string too long */
3380                 return XFS_ERROR(ENAMETOOLONG);
3381         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3382                 int len, total;
3383                 char *path;
3384
3385                 for(total = 0, path = target_path; total < pathlen;) {
3386                         /*
3387                          * Skip any slashes.
3388                          */
3389                         while(*path == '/') {
3390                                 total++;
3391                                 path++;
3392                         }
3393
3394                         /*
3395                          * Count up to the next slash or end of path.
3396                          * Error out if the component is bigger than MAXNAMELEN.
3397                          */
3398                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3399                                 if (++len >= MAXNAMELEN) {
3400                                         error = ENAMETOOLONG;
3401                                         return error;
3402                                 }
3403                         }
3404                 }
3405         }
3406
3407         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3408                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_bdp,
3409                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3410                                         link_name, target_path, 0, 0, 0);
3411                 if (error)
3412                         return error;
3413         }
3414
3415         /* Return through std_return after this point. */
3416
3417         udqp = gdqp = NULL;
3418         if (vap->va_mask & XFS_AT_PROJID)
3419                 prid = (xfs_prid_t)vap->va_projid;
3420         else
3421                 prid = (xfs_prid_t)dfltprid;
3422
3423         /*
3424          * Make sure that we have allocated dquot(s) on disk.
3425          */
3426         error = XFS_QM_DQVOPALLOC(mp, dp, current->fsuid, current->fsgid,
3427                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3428         if (error)
3429                 goto std_return;
3430
3431         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3432         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3433         /*
3434          * The symlink will fit into the inode data fork?
3435          * There can't be any attributes so we get the whole variable part.
3436          */
3437         if (pathlen <= XFS_LITINO(mp))
3438                 fs_blocks = 0;
3439         else
3440                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3441         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3442         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3443                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3444         if (error == ENOSPC && fs_blocks == 0) {
3445                 resblks = 0;
3446                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3447                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3448         }
3449         if (error) {
3450                 cancel_flags = 0;
3451                 dp = NULL;
3452                 goto error_return;
3453         }
3454
3455         xfs_ilock(dp, XFS_ILOCK_EXCL);
3456
3457         /*
3458          * Reserve disk quota : blocks and inode.
3459          */
3460         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3461         if (error)
3462                 goto error_return;
3463
3464         /*
3465          * Check for ability to enter directory entry, if no space reserved.
3466          */
3467         if (resblks == 0 &&
3468             (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3469                 goto error_return;
3470         /*
3471          * Initialize the bmap freelist prior to calling either
3472          * bmapi or the directory create code.
3473          */
3474         XFS_BMAP_INIT(&free_list, &first_block);
3475
3476         /*
3477          * Allocate an inode for the symlink.
3478          */
3479         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
3480
3481         error = xfs_dir_ialloc(&tp, dp, IFLNK | (vap->va_mode&~IFMT),
3482                                1, rdev, credp, prid, resblks > 0, &ip, NULL);
3483         if (error) {
3484                 if (error == ENOSPC)
3485                         goto error_return;
3486                 goto error1;
3487         }
3488         ITRACE(ip);
3489
3490         VN_HOLD(dir_vp);
3491         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3492         dp_joined_to_trans = B_TRUE;
3493
3494         /*
3495          * Also attach the dquot(s) to it, if applicable.
3496          */
3497         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3498
3499         if (resblks)
3500                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3501         /*
3502          * If the symlink will fit into the inode, write it inline.
3503          */
3504         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3505                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3506                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3507                 ip->i_d.di_size = pathlen;
3508
3509                 /*
3510                  * The inode was initially created in extent format.
3511                  */
3512                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3513                 ip->i_df.if_flags |= XFS_IFINLINE;
3514
3515                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3516                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3517
3518         } else {
3519                 first_fsb = 0;
3520                 nmaps = SYMLINK_MAPS;
3521
3522                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3523                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3524                                   &first_block, resblks, mval, &nmaps,
3525                                   &free_list);
3526                 if (error) {
3527                         goto error1;
3528                 }
3529
3530                 if (resblks)
3531                         resblks -= fs_blocks;
3532                 ip->i_d.di_size = pathlen;
3533                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3534
3535                 cur_chunk = target_path;
3536                 for (n = 0; n < nmaps; n++) {
3537                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3538                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3539                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3540                                                BTOBB(byte_cnt), 0);
3541                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3542                         if (pathlen < byte_cnt) {
3543                                 byte_cnt = pathlen;
3544                         }
3545                         pathlen -= byte_cnt;
3546
3547                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3548                         cur_chunk += byte_cnt;
3549
3550                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3551                 }
3552         }
3553
3554         /*
3555          * Create the directory entry for the symlink.
3556          */
3557         error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3558                         ip->i_ino, &first_block, &free_list, resblks);
3559         if (error) {
3560                 goto error1;
3561         }
3562         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3563         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3564
3565         /*
3566          * Bump the in memory version number of the parent directory
3567          * so that other processes accessing it will recognize that
3568          * the directory has changed.
3569          */
3570         dp->i_gen++;
3571
3572         /*
3573          * If this is a synchronous mount, make sure that the
3574          * symlink transaction goes to disk before returning to
3575          * the user.
3576          */
3577         if (mp->m_flags & XFS_MOUNT_WSYNC) {
3578                 xfs_trans_set_sync(tp);
3579         }
3580
3581         /*
3582          * xfs_trans_commit normally decrements the vnode ref count
3583          * when it unlocks the inode. Since we want to return the
3584          * vnode to the caller, we bump the vnode ref count now.
3585          */
3586         IHOLD(ip);
3587
3588         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3589         if (error) {
3590                 goto error2;
3591         }
3592         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3593         XFS_QM_DQRELE(mp, udqp);
3594         XFS_QM_DQRELE(mp, gdqp);
3595
3596         /* Fall through to std_return with error = 0 or errno from
3597          * xfs_trans_commit     */
3598 std_return:
3599         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3600                              DM_EVENT_POSTSYMLINK)) {
3601                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3602                                         dir_bdp, DM_RIGHT_NULL,
3603                                         error ? NULL : XFS_ITOBHV(ip),
3604                                         DM_RIGHT_NULL, link_name, target_path,
3605                                         0, error, 0);
3606         }
3607
3608         if (!error) {
3609                 vnode_t *vp;
3610
3611                 ASSERT(ip);
3612                 vp = XFS_ITOV(ip);
3613                 *vpp = vp;
3614         }
3615         return error;
3616
3617  error2:
3618         IRELE(ip);
3619  error1:
3620         xfs_bmap_cancel(&free_list);
3621         cancel_flags |= XFS_TRANS_ABORT;
3622  error_return:
3623         xfs_trans_cancel(tp, cancel_flags);
3624         XFS_QM_DQRELE(mp, udqp);
3625         XFS_QM_DQRELE(mp, gdqp);
3626
3627         if (!dp_joined_to_trans && (dp != NULL)) {
3628                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3629         }
3630
3631         goto std_return;
3632 }
3633
3634
3635 /*
3636  * xfs_fid2
3637  *
3638  * A fid routine that takes a pointer to a previously allocated
3639  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3640  */
3641 STATIC int
3642 xfs_fid2(
3643         bhv_desc_t      *bdp,
3644         fid_t           *fidp)
3645 {
3646         xfs_inode_t     *ip;
3647         xfs_fid2_t      *xfid;
3648
3649         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3650                                        (inst_t *)__return_address);
3651         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3652
3653         xfid = (xfs_fid2_t *)fidp;
3654         ip = XFS_BHVTOI(bdp);
3655         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3656         xfid->fid_pad = 0;
3657         /*
3658          * use memcpy because the inode is a long long and there's no
3659          * assurance that xfid->fid_ino is properly aligned.
3660          */
3661         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3662         xfid->fid_gen = ip->i_d.di_gen;
3663
3664         return 0;
3665 }
3666
3667
3668 /*
3669  * xfs_rwlock
3670  */
3671 int
3672 xfs_rwlock(
3673         bhv_desc_t      *bdp,
3674         vrwlock_t       locktype)
3675 {
3676         xfs_inode_t     *ip;
3677         vnode_t         *vp;
3678
3679         vp = BHV_TO_VNODE(bdp);
3680         if (vp->v_type == VDIR)
3681                 return 1;
3682         ip = XFS_BHVTOI(bdp);
3683         if (locktype == VRWLOCK_WRITE) {
3684                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3685         } else if (locktype == VRWLOCK_TRY_READ) {
3686                 return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
3687         } else if (locktype == VRWLOCK_TRY_WRITE) {
3688                 return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
3689         } else {
3690                 ASSERT((locktype == VRWLOCK_READ) ||
3691                        (locktype == VRWLOCK_WRITE_DIRECT));
3692                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3693         }
3694
3695         return 1;
3696 }
3697
3698
3699 /*
3700  * xfs_rwunlock
3701  */
3702 void
3703 xfs_rwunlock(
3704         bhv_desc_t      *bdp,
3705         vrwlock_t       locktype)
3706 {
3707         xfs_inode_t     *ip;
3708         vnode_t         *vp;
3709
3710         vp = BHV_TO_VNODE(bdp);
3711         if (vp->v_type == VDIR)
3712                 return;
3713         ip = XFS_BHVTOI(bdp);
3714         if (locktype == VRWLOCK_WRITE) {
3715                 xfs_iunlock (ip, XFS_IOLOCK_EXCL);
3716         } else {
3717                 ASSERT((locktype == VRWLOCK_READ) ||
3718                        (locktype == VRWLOCK_WRITE_DIRECT));
3719                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3720         }
3721         return;
3722 }
3723
3724 STATIC int
3725 xfs_inode_flush(
3726         bhv_desc_t      *bdp,
3727         int             flags)
3728 {
3729         xfs_inode_t     *ip;
3730         xfs_dinode_t    *dip;
3731         xfs_mount_t     *mp;
3732         xfs_buf_t       *bp;
3733         int             error = 0;
3734
3735         ip = XFS_BHVTOI(bdp);
3736         mp = ip->i_mount;
3737
3738         if (XFS_FORCED_SHUTDOWN(mp))
3739                 return XFS_ERROR(EIO);
3740
3741         /* Bypass inodes which have already been cleaned by
3742          * the inode flush clustering code inside xfs_iflush
3743          */
3744         if ((ip->i_update_core == 0) &&
3745             ((ip->i_itemp == NULL) ||
3746              !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)))
3747                 return 0;
3748
3749         if (flags & FLUSH_LOG) {
3750                 xfs_inode_log_item_t *iip = ip->i_itemp;
3751
3752                 if (iip && iip->ili_last_lsn) {
3753                         xlog_t  *log = mp->m_log;
3754                         xfs_lsn_t       sync_lsn;
3755                         int             s, log_flags = XFS_LOG_FORCE;
3756
3757                         s = GRANT_LOCK(log);
3758                         sync_lsn = log->l_last_sync_lsn;
3759                         GRANT_UNLOCK(log, s);
3760
3761                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3762                                 return 0;
3763
3764                         if (flags & FLUSH_SYNC)
3765                                 log_flags |= XFS_LOG_SYNC;
3766                         return xfs_log_force(mp, iip->ili_last_lsn,
3767                                                 log_flags);
3768                 }
3769         }
3770
3771         /* We make this non-blocking if the inode is contended,
3772          * return EAGAIN to indicate to the caller that they
3773          * did not succeed. This prevents the flush path from
3774          * blocking on inodes inside another operation right
3775          * now, they get caught later by xfs_sync.
3776          */
3777         if (flags & FLUSH_INODE) {
3778                 if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3779                         if ((xfs_ipincount(ip) == 0) && xfs_iflock_nowait(ip)) {
3780                                 int     flush_flags;
3781
3782 #if 0
3783                                 /* not turning this on until some
3784                                  * performance analysis is done
3785                                  */
3786                                 if (flags & FLUSH_SYNC)
3787                                         flush_flags = XFS_IFLUSH_SYNC;
3788                                 else
3789 #endif
3790                                 flush_flags = XFS_IFLUSH_DELWRI_ELSE_ASYNC;
3791
3792                                 xfs_ifunlock(ip);
3793                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3794                                 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
3795                                 if (error)
3796                                         return error;
3797                                 xfs_buf_relse(bp);
3798
3799                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED) == 0)
3800                                         return EAGAIN;
3801
3802                                 if (xfs_ipincount(ip) ||
3803                                     !xfs_iflock_nowait(ip)) {
3804                                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3805                                         return EAGAIN;
3806                                 }
3807
3808                                 error = xfs_iflush(ip, flush_flags);
3809                         } else {
3810                                 error = EAGAIN;
3811                         }
3812                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3813                 } else {
3814                         error = EAGAIN;
3815                 }
3816         }
3817
3818         return error;
3819 }
3820
3821
3822 int
3823 xfs_set_dmattrs (
3824         bhv_desc_t      *bdp,
3825         u_int           evmask,
3826         u_int16_t       state,
3827         cred_t          *credp)
3828 {
3829         xfs_inode_t     *ip;
3830         xfs_trans_t     *tp;
3831         xfs_mount_t     *mp;
3832         int             error;
3833
3834         if (!capable(CAP_SYS_ADMIN))
3835                 return XFS_ERROR(EPERM);
3836
3837         ip = XFS_BHVTOI(bdp);
3838         mp = ip->i_mount;
3839
3840         if (XFS_FORCED_SHUTDOWN(mp))
3841                 return XFS_ERROR(EIO);
3842
3843         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3844         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3845         if (error) {
3846                 xfs_trans_cancel(tp, 0);
3847                 return error;
3848         }
3849         xfs_ilock(ip, XFS_ILOCK_EXCL);
3850         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3851
3852         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3853         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3854
3855         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3856         IHOLD(ip);
3857         error = xfs_trans_commit(tp, 0, NULL);
3858
3859         return error;
3860 }
3861
3862
3863 /*
3864  * xfs_reclaim
3865  */
3866 STATIC int
3867 xfs_reclaim(
3868         bhv_desc_t      *bdp)
3869 {
3870         xfs_inode_t     *ip;
3871         vnode_t         *vp;
3872
3873         vp = BHV_TO_VNODE(bdp);
3874
3875         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3876
3877         ASSERT(!VN_MAPPED(vp));
3878         ip = XFS_BHVTOI(bdp);
3879
3880         if ((ip->i_d.di_mode & IFMT) == IFREG) {
3881                 if (ip->i_d.di_size > 0) {
3882                         /*
3883                          * Flush and invalidate any data left around that is
3884                          * a part of this file.
3885                          *
3886                          * Get the inode's i/o lock so that buffers are pushed
3887                          * out while holding the proper lock.  We can't hold
3888                          * the inode lock here since flushing out buffers may
3889                          * cause us to try to get the lock in xfs_strategy().
3890                          *
3891                          * We don't have to call remapf() here, because there
3892                          * cannot be any mapped file references to this vnode
3893                          * since it is being reclaimed.
3894                          */
3895                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
3896
3897                         /*
3898                          * If we hit an IO error, we need to make sure that the
3899                          * buffer and page caches of file data for
3900                          * the file are tossed away. We don't want to use
3901                          * VOP_FLUSHINVAL_PAGES here because we don't want dirty
3902                          * pages to stay attached to the vnode, but be
3903                          * marked P_BAD. pdflush/vnode_pagebad
3904                          * hates that.
3905                          */
3906                         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3907                                 VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
3908                         } else {
3909                                 VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3910                         }
3911
3912                         ASSERT(VN_CACHED(vp) == 0);
3913                         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
3914                                ip->i_delayed_blks == 0);
3915                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3916                 } else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3917                         /*
3918                          * di_size field may not be quite accurate if we're
3919                          * shutting down.
3920                          */
3921                         VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3922                         ASSERT(VN_CACHED(vp) == 0);
3923                 }
3924         }
3925
3926         /* If we have nothing to flush with this inode then complete the
3927          * teardown now, otherwise break the link between the xfs inode
3928          * and the linux inode and clean up the xfs inode later. This
3929          * avoids flushing the inode to disk during the delete operation
3930          * itself.
3931          */
3932         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3933                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3934                 xfs_iflock(ip);
3935                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3936         } else {
3937                 xfs_mount_t     *mp = ip->i_mount;
3938
3939                 /* Protect sync from us */
3940                 XFS_MOUNT_ILOCK(mp);
3941                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3942                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3943                 ip->i_flags |= XFS_IRECLAIMABLE;
3944                 XFS_MOUNT_IUNLOCK(mp);
3945         }
3946         return 0;
3947 }
3948
3949 int
3950 xfs_finish_reclaim(
3951         xfs_inode_t     *ip,
3952         int             locked,
3953         int             sync_mode)
3954 {
3955         xfs_ihash_t     *ih = ip->i_hash;
3956         int             error;
3957
3958         /* The hash lock here protects a thread in xfs_iget_core from
3959          * racing with us on linking the inode back with a vnode.
3960          * Once we have the XFS_IRECLAIM flag set it will not touch
3961          * us.
3962          */
3963         write_lock(&ih->ih_lock);
3964         if ((ip->i_flags & XFS_IRECLAIM) ||
3965             (!(ip->i_flags & XFS_IRECLAIMABLE) &&
3966               (XFS_ITOV_NULL(ip) == NULL))) {
3967                 write_unlock(&ih->ih_lock);
3968                 if (locked) {
3969                         xfs_ifunlock(ip);
3970                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3971                 }
3972                 return(1);
3973         }
3974         ip->i_flags |= XFS_IRECLAIM;
3975         write_unlock(&ih->ih_lock);
3976
3977         /*
3978          * If the inode is still dirty, then flush it out.  If the inode
3979          * is not in the AIL, then it will be OK to flush it delwri as
3980          * long as xfs_iflush() does not keep any references to the inode.
3981          * We leave that decision up to xfs_iflush() since it has the
3982          * knowledge of whether it's OK to simply do a delwri flush of
3983          * the inode or whether we need to wait until the inode is
3984          * pulled from the AIL.
3985          * We get the flush lock regardless, though, just to make sure
3986          * we don't free it while it is being flushed.
3987          */
3988         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3989                 if (!locked) {
3990                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3991                         xfs_iflock(ip);
3992                 }
3993
3994                 if (ip->i_update_core ||
3995                     ((ip->i_itemp != NULL) &&
3996                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3997                         error = xfs_iflush(ip, sync_mode);
3998                         /*
3999                          * If we hit an error, typically because of filesystem
4000                          * shutdown, we don't need to let vn_reclaim to know
4001                          * because we're gonna reclaim the inode anyway.
4002                          */
4003                         if (error) {
4004                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4005                                 xfs_ireclaim(ip);
4006                                 return (0);
4007                         }
4008                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
4009                 }
4010
4011                 ASSERT(ip->i_update_core == 0);
4012                 ASSERT(ip->i_itemp == NULL ||
4013                        ip->i_itemp->ili_format.ilf_fields == 0);
4014                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4015         } else if (locked) {
4016                 /*
4017                  * We are not interested in doing an iflush if we're
4018                  * in the process of shutting down the filesystem forcibly.
4019                  * So, just reclaim the inode.
4020                  */
4021                 xfs_ifunlock(ip);
4022                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4023         }
4024
4025         xfs_ireclaim(ip);
4026         return 0;
4027 }
4028
4029 int
4030 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
4031 {
4032         int             purged;
4033         struct list_head        *curr, *next;
4034         xfs_inode_t     *ip;
4035         int             done = 0;
4036
4037         while (!done) {
4038                 purged = 0;
4039                 XFS_MOUNT_ILOCK(mp);
4040                 list_for_each_safe(curr, next, &mp->m_del_inodes) {
4041                         ip = list_entry(curr, xfs_inode_t, i_reclaim);
4042                         if (noblock) {
4043                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
4044                                         continue;
4045                                 if (xfs_ipincount(ip) ||
4046                                     !xfs_iflock_nowait(ip)) {
4047                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4048                                         continue;
4049                                 }
4050                         }
4051                         XFS_MOUNT_IUNLOCK(mp);
4052                         xfs_finish_reclaim(ip, noblock,
4053                                 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
4054                         purged = 1;
4055                         break;
4056                 }
4057
4058                 done = !purged;
4059         }
4060
4061         XFS_MOUNT_IUNLOCK(mp);
4062         return 0;
4063 }
4064
4065 /*
4066  * xfs_alloc_file_space()
4067  *      This routine allocates disk space for the given file.
4068  *
4069  *      If alloc_type == 0, this request is for an ALLOCSP type
4070  *      request which will change the file size.  In this case, no
4071  *      DMAPI event will be generated by the call.  A TRUNCATE event
4072  *      will be generated later by xfs_setattr.
4073  *
4074  *      If alloc_type != 0, this request is for a RESVSP type
4075  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
4076  *      lower block boundary byte address is less than the file's
4077  *      length.
4078  *
4079  * RETURNS:
4080  *       0 on success
4081  *      errno on error
4082  *
4083  */
4084 int
4085 xfs_alloc_file_space(
4086         xfs_inode_t             *ip,
4087         xfs_off_t               offset,
4088         xfs_off_t               len,
4089         int                     alloc_type,
4090         int                     attr_flags)
4091 {
4092         xfs_filblks_t           allocated_fsb;
4093         xfs_filblks_t           allocatesize_fsb;
4094         int                     committed;
4095         xfs_off_t               count;
4096         xfs_filblks_t           datablocks;
4097         int                     error;
4098         xfs_fsblock_t           firstfsb;
4099         xfs_bmap_free_t         free_list;
4100         xfs_bmbt_irec_t         *imapp;
4101         xfs_bmbt_irec_t         imaps[1];
4102         xfs_mount_t             *mp;
4103         int                     numrtextents;
4104         int                     reccount;
4105         uint                    resblks;
4106         int                     rt;
4107         int                     rtextsize;
4108         xfs_fileoff_t           startoffset_fsb;
4109         xfs_trans_t             *tp;
4110         int                     xfs_bmapi_flags;
4111
4112         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4113         mp = ip->i_mount;
4114
4115         if (XFS_FORCED_SHUTDOWN(mp))
4116                 return XFS_ERROR(EIO);
4117
4118         /*
4119          * determine if this is a realtime file
4120          */
4121         if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) {
4122                 if (ip->i_d.di_extsize)
4123                         rtextsize = ip->i_d.di_extsize;
4124                 else
4125                         rtextsize = mp->m_sb.sb_rextsize;
4126         } else
4127                 rtextsize = 0;
4128
4129         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4130                 return error;
4131
4132         if (len <= 0)
4133                 return XFS_ERROR(EINVAL);
4134
4135         count = len;
4136         error = 0;
4137         imapp = &imaps[0];
4138         reccount = 1;
4139         xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4140         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4141         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4142
4143         /*      Generate a DMAPI event if needed.       */
4144         if (alloc_type != 0 && offset < ip->i_d.di_size &&
4145                         (attr_flags&ATTR_DMI) == 0  &&
4146                         DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4147                 xfs_off_t           end_dmi_offset;
4148
4149                 end_dmi_offset = offset+len;
4150                 if (end_dmi_offset > ip->i_d.di_size)
4151                         end_dmi_offset = ip->i_d.di_size;
4152                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOBHV(ip),
4153                         offset, end_dmi_offset - offset,
4154                         0, NULL);
4155                 if (error)
4156                         return(error);
4157         }
4158
4159         /*
4160          * allocate file space until done or until there is an error
4161          */
4162 retry:
4163         while (allocatesize_fsb && !error) {
4164                 /*
4165                  * determine if reserving space on
4166                  * the data or realtime partition.
4167                  */
4168                 if (rt) {
4169                         xfs_fileoff_t s, e;
4170
4171                         s = startoffset_fsb;
4172                         do_div(s, rtextsize);
4173                         s *= rtextsize;
4174                         e = roundup_64(startoffset_fsb + allocatesize_fsb,
4175                                 rtextsize);
4176                         numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
4177                         datablocks = 0;
4178                 } else {
4179                         datablocks = allocatesize_fsb;
4180                         numrtextents = 0;
4181                 }
4182
4183                 /*
4184                  * allocate and setup the transaction
4185                  */
4186                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4187                 resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
4188                 error = xfs_trans_reserve(tp,
4189                                           resblks,
4190                                           XFS_WRITE_LOG_RES(mp),
4191                                           numrtextents,
4192                                           XFS_TRANS_PERM_LOG_RES,
4193                                           XFS_WRITE_LOG_COUNT);
4194
4195                 /*
4196                  * check for running out of space
4197                  */
4198                 if (error) {
4199                         /*
4200                          * Free the transaction structure.
4201                          */
4202                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4203                         xfs_trans_cancel(tp, 0);
4204                         break;
4205                 }
4206                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4207                 error = XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp,
4208                                 ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
4209                                 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4210                 if (error)
4211                         goto error1;
4212
4213                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4214                 xfs_trans_ihold(tp, ip);
4215
4216                 /*
4217                  * issue the bmapi() call to allocate the blocks
4218                  */
4219                 XFS_BMAP_INIT(&free_list, &firstfsb);
4220                 error = xfs_bmapi(tp, ip, startoffset_fsb,
4221                                   allocatesize_fsb, xfs_bmapi_flags,
4222                                   &firstfsb, 0, imapp, &reccount,
4223                                   &free_list);
4224                 if (error) {
4225                         goto error0;
4226                 }
4227
4228                 /*
4229                  * complete the transaction
4230                  */
4231                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4232                 if (error) {
4233                         goto error0;
4234                 }
4235
4236                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4237                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4238                 if (error) {
4239                         break;
4240                 }
4241
4242                 allocated_fsb = imapp->br_blockcount;
4243
4244                 if (reccount == 0) {
4245                         error = XFS_ERROR(ENOSPC);
4246                         break;
4247                 }
4248
4249                 startoffset_fsb += allocated_fsb;
4250                 allocatesize_fsb -= allocated_fsb;
4251         }
4252 dmapi_enospc_check:
4253         if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4254             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4255
4256                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4257                                 XFS_ITOBHV(ip), DM_RIGHT_NULL,
4258                                 XFS_ITOBHV(ip), DM_RIGHT_NULL,
4259                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4260                 if (error == 0)
4261                         goto retry;     /* Maybe DMAPI app. has made space */
4262                 /* else fall through with error from XFS_SEND_DATA */
4263         }
4264
4265         return error;
4266
4267  error0:
4268         xfs_bmap_cancel(&free_list);
4269  error1:
4270         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4271         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4272         goto dmapi_enospc_check;
4273 }
4274
4275 /*
4276  * Zero file bytes between startoff and endoff inclusive.
4277  * The iolock is held exclusive and no blocks are buffered.
4278  */
4279 STATIC int
4280 xfs_zero_remaining_bytes(
4281         xfs_inode_t             *ip,
4282         xfs_off_t               startoff,
4283         xfs_off_t               endoff)
4284 {
4285         xfs_buf_t               *bp;
4286         int                     error=0;
4287         xfs_bmbt_irec_t         imap;
4288         xfs_off_t               lastoffset;
4289         xfs_mount_t             *mp;
4290         int                     nimap;
4291         xfs_off_t               offset;
4292         xfs_fileoff_t           offset_fsb;
4293
4294         mp = ip->i_mount;
4295         bp = XFS_ngetrbuf(mp->m_sb.sb_blocksize,mp);
4296         ASSERT(!XFS_BUF_GETERROR(bp));
4297
4298         if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
4299                 XFS_BUF_SET_TARGET(bp, mp->m_rtdev_targp);
4300         } else {
4301                 XFS_BUF_SET_TARGET(bp, mp->m_ddev_targp);
4302         }
4303
4304         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4305                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4306                 nimap = 1;
4307                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
4308                         &nimap, NULL);
4309                 if (error || nimap < 1)
4310                         break;
4311                 ASSERT(imap.br_blockcount >= 1);
4312                 ASSERT(imap.br_startoff == offset_fsb);
4313                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4314                 if (lastoffset > endoff)
4315                         lastoffset = endoff;
4316                 if (imap.br_startblock == HOLESTARTBLOCK)
4317                         continue;
4318                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4319                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4320                         continue;
4321                 XFS_BUF_UNDONE(bp);
4322                 XFS_BUF_UNWRITE(bp);
4323                 XFS_BUF_READ(bp);
4324                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4325                 xfsbdstrat(mp, bp);
4326                 if ((error = xfs_iowait(bp))) {
4327                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4328                                           mp, bp, XFS_BUF_ADDR(bp));
4329                         break;
4330                 }
4331                 memset(XFS_BUF_PTR(bp) +
4332                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4333                       0, lastoffset - offset + 1);
4334                 XFS_BUF_UNDONE(bp);
4335                 XFS_BUF_UNREAD(bp);
4336                 XFS_BUF_WRITE(bp);
4337                 xfsbdstrat(mp, bp);
4338                 if ((error = xfs_iowait(bp))) {
4339                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4340                                           mp, bp, XFS_BUF_ADDR(bp));
4341                         break;
4342                 }
4343         }
4344         XFS_nfreerbuf(bp);
4345         return error;
4346 }
4347
4348 /*
4349  * xfs_free_file_space()
4350  *      This routine frees disk space for the given file.
4351  *
4352  *      This routine is only called by xfs_change_file_space
4353  *      for an UNRESVSP type call.
4354  *
4355  * RETURNS:
4356  *       0 on success
4357  *      errno on error
4358  *
4359  */
4360 STATIC int
4361 xfs_free_file_space(
4362         xfs_inode_t             *ip,
4363         xfs_off_t               offset,
4364         xfs_off_t               len,
4365         int                     attr_flags)
4366 {
4367         int                     committed;
4368         int                     done;
4369         xfs_off_t               end_dmi_offset;
4370         xfs_fileoff_t           endoffset_fsb;
4371         int                     error;
4372         xfs_fsblock_t           firstfsb;
4373         xfs_bmap_free_t         free_list;
4374         xfs_off_t               ilen;
4375         xfs_bmbt_irec_t         imap;
4376         xfs_off_t               ioffset;
4377         xfs_extlen_t            mod=0;
4378         xfs_mount_t             *mp;
4379         int                     nimap;
4380         uint                    resblks;
4381         int                     rounding;
4382         int                     rt;
4383         xfs_fileoff_t           startoffset_fsb;
4384         xfs_trans_t             *tp;
4385
4386         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4387         mp = ip->i_mount;
4388
4389         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4390                 return error;
4391
4392         error = 0;
4393         if (len <= 0)   /* if nothing being freed */
4394                 return error;
4395         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4396         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4397         end_dmi_offset = offset + len;
4398         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4399
4400         if (offset < ip->i_d.di_size &&
4401             (attr_flags & ATTR_DMI) == 0 &&
4402             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4403                 if (end_dmi_offset > ip->i_d.di_size)
4404                         end_dmi_offset = ip->i_d.di_size;
4405                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOBHV(ip),
4406                                 offset, end_dmi_offset - offset,
4407                                 AT_DELAY_FLAG(attr_flags), NULL);
4408                 if (error)
4409                         return(error);
4410         }
4411
4412         xfs_ilock(ip, XFS_IOLOCK_EXCL);
4413         rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog),
4414                         (__uint8_t)NBPP);
4415         ilen = len + (offset & (rounding - 1));
4416         ioffset = offset & ~(rounding - 1);
4417         if (ilen & (rounding - 1))
4418                 ilen = (ilen + rounding) & ~(rounding - 1);
4419         xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0);
4420         /*
4421          * Need to zero the stuff we're not freeing, on disk.
4422          * If its a realtime file & can't use unwritten extents then we
4423          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4424          * will take care of it for us.
4425          */
4426         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4427                 nimap = 1;
4428                 error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
4429                         &imap, &nimap, NULL);
4430                 if (error)
4431                         return error;
4432                 ASSERT(nimap == 0 || nimap == 1);
4433                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4434                         xfs_daddr_t     block;
4435
4436                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4437                         block = imap.br_startblock;
4438                         mod = do_div(block, mp->m_sb.sb_rextsize);
4439                         if (mod)
4440                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4441                 }
4442                 nimap = 1;
4443                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
4444                         &imap, &nimap, NULL);
4445                 if (error)
4446                         return error;
4447                 ASSERT(nimap == 0 || nimap == 1);
4448                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4449                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4450                         mod++;
4451                         if (mod && (mod != mp->m_sb.sb_rextsize))
4452                                 endoffset_fsb -= mod;
4453                 }
4454         }
4455         if ((done = (endoffset_fsb <= startoffset_fsb)))
4456                 /*
4457                  * One contiguous piece to clear
4458                  */
4459                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4460         else {
4461                 /*
4462                  * Some full blocks, possibly two pieces to clear
4463                  */
4464                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4465                         error = xfs_zero_remaining_bytes(ip, offset,
4466                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4467                 if (!error &&
4468                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4469                         error = xfs_zero_remaining_bytes(ip,
4470                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4471                                 offset + len - 1);
4472         }
4473
4474         /*
4475          * free file space until done or until there is an error
4476          */
4477         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4478         while (!error && !done) {
4479
4480                 /*
4481                  * allocate and setup the transaction
4482                  */
4483                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4484                 error = xfs_trans_reserve(tp,
4485                                           resblks,
4486                                           XFS_WRITE_LOG_RES(mp),
4487                                           0,
4488                                           XFS_TRANS_PERM_LOG_RES,
4489                                           XFS_WRITE_LOG_COUNT);
4490
4491                 /*
4492                  * check for running out of space
4493                  */
4494                 if (error) {
4495                         /*
4496                          * Free the transaction structure.
4497                          */
4498                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4499                         xfs_trans_cancel(tp, 0);
4500                         break;
4501                 }
4502                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4503                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4504                                 ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
4505                                 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4506                 if (error)
4507                         goto error1;
4508
4509                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4510                 xfs_trans_ihold(tp, ip);
4511
4512                 /*
4513                  * issue the bunmapi() call to free the blocks
4514                  */
4515                 XFS_BMAP_INIT(&free_list, &firstfsb);
4516                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
4517                                   endoffset_fsb - startoffset_fsb,
4518                                   0, 2, &firstfsb, &free_list, &done);
4519                 if (error) {
4520                         goto error0;
4521                 }
4522
4523                 /*
4524                  * complete the transaction
4525                  */
4526                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4527                 if (error) {
4528                         goto error0;
4529                 }
4530
4531                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4532                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4533         }
4534
4535         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4536         return error;
4537
4538  error0:
4539         xfs_bmap_cancel(&free_list);
4540  error1:
4541         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4542         xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
4543         return error;
4544 }
4545
4546 /*
4547  * xfs_change_file_space()
4548  *      This routine allocates or frees disk space for the given file.
4549  *      The user specified parameters are checked for alignment and size
4550  *      limitations.
4551  *
4552  * RETURNS:
4553  *       0 on success
4554  *      errno on error
4555  *
4556  */
4557 int
4558 xfs_change_file_space(
4559         bhv_desc_t      *bdp,
4560         int             cmd,
4561         xfs_flock64_t   *bf,
4562         xfs_off_t       offset,
4563         cred_t          *credp,
4564         int             attr_flags)
4565 {
4566         int             clrprealloc;
4567         int             error;
4568         xfs_fsize_t     fsize;
4569         xfs_inode_t     *ip;
4570         xfs_mount_t     *mp;
4571         int             setprealloc;
4572         xfs_off_t       startoffset;
4573         xfs_off_t       llen;
4574         xfs_trans_t     *tp;
4575         vattr_t         va;
4576         vnode_t         *vp;
4577
4578         vp = BHV_TO_VNODE(bdp);
4579         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4580
4581         ip = XFS_BHVTOI(bdp);
4582         mp = ip->i_mount;
4583
4584         /*
4585          * must be a regular file and have write permission
4586          */
4587         if (vp->v_type != VREG)
4588                 return XFS_ERROR(EINVAL);
4589
4590         xfs_ilock(ip, XFS_ILOCK_SHARED);
4591
4592         if ((error = xfs_iaccess(ip, IWRITE, credp))) {
4593                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4594                 return error;
4595         }
4596
4597         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4598
4599         switch (bf->l_whence) {
4600         case 0: /*SEEK_SET*/
4601                 break;
4602         case 1: /*SEEK_CUR*/
4603                 bf->l_start += offset;
4604                 break;
4605         case 2: /*SEEK_END*/
4606                 bf->l_start += ip->i_d.di_size;
4607                 break;
4608         default:
4609                 return XFS_ERROR(EINVAL);
4610         }
4611
4612         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4613
4614         if (   (bf->l_start < 0)
4615             || (bf->l_start > XFS_MAX_FILE_OFFSET)
4616             || (bf->l_start + llen < 0)
4617             || (bf->l_start + llen > XFS_MAX_FILE_OFFSET))
4618                 return XFS_ERROR(EINVAL);
4619
4620         bf->l_whence = 0;
4621
4622         startoffset = bf->l_start;
4623         fsize = ip->i_d.di_size;
4624
4625         /*
4626          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4627          * file space.
4628          * These calls do NOT zero the data space allocated to the file,
4629          * nor do they change the file size.
4630          *
4631          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4632          * space.
4633          * These calls cause the new file data to be zeroed and the file
4634          * size to be changed.
4635          */
4636         setprealloc = clrprealloc = 0;
4637
4638         switch (cmd) {
4639         case XFS_IOC_RESVSP:
4640         case XFS_IOC_RESVSP64:
4641                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4642                                                                 1, attr_flags);
4643                 if (error)
4644                         return error;
4645                 setprealloc = 1;
4646                 break;
4647
4648         case XFS_IOC_UNRESVSP:
4649         case XFS_IOC_UNRESVSP64:
4650                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4651                                                                 attr_flags)))
4652                         return error;
4653                 break;
4654
4655         case XFS_IOC_ALLOCSP:
4656         case XFS_IOC_ALLOCSP64:
4657         case XFS_IOC_FREESP:
4658         case XFS_IOC_FREESP64:
4659                 if (startoffset > fsize) {
4660                         error = xfs_alloc_file_space(ip, fsize,
4661                                         startoffset - fsize, 0, attr_flags);
4662                         if (error)
4663                                 break;
4664                 }
4665
4666                 va.va_mask = XFS_AT_SIZE;
4667                 va.va_size = startoffset;
4668
4669                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4670
4671                 if (error)
4672                         return error;
4673
4674                 clrprealloc = 1;
4675                 break;
4676
4677         default:
4678                 ASSERT(0);
4679                 return XFS_ERROR(EINVAL);
4680         }
4681
4682         /*
4683          * update the inode timestamp, mode, and prealloc flag bits
4684          */
4685         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4686
4687         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4688                                       0, 0, 0))) {
4689                 /* ASSERT(0); */
4690                 xfs_trans_cancel(tp, 0);
4691                 return error;
4692         }
4693
4694         xfs_ilock(ip, XFS_ILOCK_EXCL);
4695
4696         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4697         xfs_trans_ihold(tp, ip);
4698
4699         ip->i_d.di_mode &= ~ISUID;
4700
4701         /*
4702          * Note that we don't have to worry about mandatory
4703          * file locking being disabled here because we only
4704          * clear the ISGID bit if the Group execute bit is
4705          * on, but if it was on then mandatory locking wouldn't
4706          * have been enabled.
4707          */
4708         if (ip->i_d.di_mode & (IEXEC >> 3))
4709                 ip->i_d.di_mode &= ~ISGID;
4710
4711         xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4712
4713         if (setprealloc)
4714                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4715         else if (clrprealloc)
4716                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4717
4718         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4719         xfs_trans_set_sync(tp);
4720
4721         error = xfs_trans_commit(tp, 0, NULL);
4722
4723         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4724
4725         return error;
4726 }
4727
/*
 * Vnode operations table for XFS.
 *
 * The first entry is the behavior identity (VN_BHV_XFS at
 * VNODE_POSITION_XFS); every following entry binds one vop_* slot to its
 * handler.  Most slots point at xfs_* routines.  vop_link_removed and
 * vop_vnode_change are bound to fs_noval through a cast to the slot's
 * function type, and the page toss/flush slots are bound to fs_* routines.
 */
vnodeops_t xfs_vnodeops = {
	BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
	.vop_open		= xfs_open,
	.vop_read		= xfs_read,
	.vop_sendfile		= xfs_sendfile,
	.vop_write		= xfs_write,
	.vop_ioctl		= xfs_ioctl,
	.vop_getattr		= xfs_getattr,
	.vop_setattr		= xfs_setattr,
	.vop_access		= xfs_access,
	.vop_lookup		= xfs_lookup,
	.vop_create		= xfs_create,
	.vop_remove		= xfs_remove,
	.vop_link		= xfs_link,
	.vop_rename		= xfs_rename,
	.vop_mkdir		= xfs_mkdir,
	.vop_rmdir		= xfs_rmdir,
	.vop_readdir		= xfs_readdir,
	.vop_symlink		= xfs_symlink,
	.vop_readlink		= xfs_readlink,
	.vop_fsync		= xfs_fsync,
	.vop_inactive		= xfs_inactive,
	.vop_fid2		= xfs_fid2,
	.vop_rwlock		= xfs_rwlock,
	.vop_rwunlock		= xfs_rwunlock,
	.vop_bmap		= xfs_bmap,
	.vop_reclaim		= xfs_reclaim,
	.vop_attr_get		= xfs_attr_get,
	.vop_attr_set		= xfs_attr_set,
	.vop_attr_remove	= xfs_attr_remove,
	.vop_attr_list		= xfs_attr_list,
	.vop_link_removed	= (vop_link_removed_t)fs_noval,
	.vop_vnode_change	= (vop_vnode_change_t)fs_noval,
	.vop_tosspages		= fs_tosspages,
	.vop_flushinval_pages	= fs_flushinval_pages,
	.vop_flush_pages	= fs_flush_pages,
	.vop_release		= xfs_release,
	.vop_iflush		= xfs_inode_flush,
};