/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.54 2007/11/02 19:52:25 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
static int vn_closefile (struct file *fp);
static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
                struct ucred *cred);
static int vn_read (struct file *fp, struct uio *uio,
                struct ucred *cred, int flags);
static int svn_read (struct file *fp, struct uio *uio,
                struct ucred *cred, int flags);
static int vn_poll (struct file *fp, int events, struct ucred *cred);
static int vn_kqfilter (struct file *fp, struct knote *kn);
static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
static int vn_write (struct file *fp, struct uio *uio,
                struct ucred *cred, int flags);
static int svn_write (struct file *fp, struct uio *uio,
                struct ucred *cred, int flags);
struct fileops vnode_fileops = {
        .fo_read = vn_read,
        .fo_write = vn_write,
        .fo_ioctl = vn_ioctl,
        .fo_poll = vn_poll,
        .fo_kqfilter = vn_kqfilter,
        .fo_stat = vn_statfile,
        .fo_close = vn_closefile,
        .fo_shutdown = nofo_shutdown
};

struct fileops specvnode_fileops = {
        .fo_read = svn_read,
        .fo_write = svn_write,
        .fo_ioctl = vn_ioctl,
        .fo_poll = vn_poll,
        .fo_kqfilter = vn_kqfilter,
        .fo_stat = vn_statfile,
        .fo_close = vn_closefile,
        .fo_shutdown = nofo_shutdown
};
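
/*
 * Illustrative note (editor's addition, not from the original source):
 * these tables are dispatched indirectly through fp->f_ops, so a read
 * on a vnode-backed descriptor ends up as roughly
 * fp->f_ops->fo_read(fp, uio, cred, flags).  Swapping f_ops over to
 * specvnode_fileops, as vn_setspecops() below does, therefore reroutes
 * reads and writes for device vnodes without touching any caller.
 */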
/*
 * Shortcut the device read/write.  This avoids a lot of vnode junk.
 * Basically the specfs vnops for read and write take the locked vnode,
 * unlock it (because we can't hold the vnode locked while reading or writing
 * a device which may block indefinitely), issue the device operation, then
 * relock the vnode before returning, plus other junk.  This bypasses all
 * of that and just does the device operation.
 */
void
vn_setspecops(struct file *fp)
{
        if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
                fp->f_ops = &specvnode_fileops;
        }
}
/*
 * Common code for vnode open operations.  Check permissions, and call
 * the VOP_OPEN or VOP_NCREATE routine.
 *
 * The caller is responsible for setting up nd with nlookup_init() and
 * for cleaning it up with nlookup_done(), whether we return an error
 * or not.
 *
 * On success nd->nl_open_vp will hold a referenced and, if requested,
 * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
 * is non-NULL the vnode will be installed in the file pointer.
 *
 * NOTE: The vnode is referenced just once on return whether or not it
 * is also installed in the file pointer.
 */
int
vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
{
        struct vnode *vp;
        struct vnode *dvp;
        struct ucred *cred = nd->nl_cred;
        struct vattr vat;
        struct vattr *vap = &vat;
        int mode, error;

        /*
         * Lookup the path and create or obtain the vnode.  After a
         * successful lookup a locked nd->nl_nch will be returned.
         *
         * The result of this section should be a locked vnode.
         *
         * XXX with only a little work we should be able to avoid locking
         * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
         */
        if (fmode & O_CREAT) {
                /*
                 * CONDITIONAL CREATE FILE CASE
                 *
                 * Setting NLC_CREATE causes a negative hit to store
                 * the negative hit ncp and not return an error.  Then
                 * nc_error or nc_vp may be checked to see if the ncp
                 * represents a negative hit.  NLC_CREATE also requires
                 * write permission on the governing directory or EPERM
                 * is returned.
                 */
                if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
                        nd->nl_flags |= NLC_FOLLOW;
                nd->nl_flags |= NLC_CREATE;
                bwillwrite();
                error = nlookup(nd);
        } else {
                /*
                 * NORMAL OPEN FILE CASE
                 */
                error = nlookup(nd);
        }
        if (error)
                return (error);

        /*
         * split case to allow us to re-resolve and retry the ncp in case
         * we get ESTALE.
         */
again:
        if (fmode & O_CREAT) {
                if (nd->nl_nch.ncp->nc_vp == NULL) {
                        if ((error = ncp_writechk(&nd->nl_nch)) != 0)
                                return (error);
                        if ((dvp = nd->nl_nch.ncp->nc_parent->nc_vp) == NULL)
                                return (EPERM);
                        /* vhold(dvp); - dvp can't go away */
                        VATTR_NULL(vap);
                        vap->va_type = VREG;
                        vap->va_mode = cmode;
                        if (fmode & O_EXCL)
                                vap->va_vaflags |= VA_EXCLUSIVE;
                        error = VOP_NCREATE(&nd->nl_nch, dvp, &vp,
                                            nd->nl_cred, vap);
                        /* vdrop(dvp); */
                        if (error)
                                return (error);
                        fmode &= ~O_TRUNC;
                        /* locked vnode is returned */
                } else {
                        if (fmode & O_EXCL) {
                                error = EEXIST;
                        } else {
                                error = cache_vget(&nd->nl_nch, cred,
                                                   LK_EXCLUSIVE, &vp);
                        }
                        if (error)
                                return (error);
                        fmode &= ~O_CREAT;
                }
        } else {
                error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
                if (error)
                        return (error);
        }
        /*
         * We have a locked vnode and ncp now.  Note that the ncp will
         * be cleaned up by the caller if nd->nl_nch is left intact.
         */
        if (vp->v_type == VLNK) {
                error = EMLINK;
                goto bad;
        }
        if (vp->v_type == VSOCK) {
                error = EOPNOTSUPP;
                goto bad;
        }
        if ((fmode & O_CREAT) == 0) {
                mode = 0;
                if (fmode & (FWRITE | O_TRUNC)) {
                        if (vp->v_type == VDIR) {
                                error = EISDIR;
                                goto bad;
                        }
                        error = vn_writechk(vp, &nd->nl_nch);
                        if (error) {
                                /*
                                 * Special stale handling, re-resolve the
                                 * vnode.
                                 */
                                if (error == ESTALE) {
                                        vput(vp);
                                        vp = NULL;
                                        cache_setunresolved(&nd->nl_nch);
                                        error = cache_resolve(&nd->nl_nch, cred);
                                        if (error == 0)
                                                goto again;
                                }
                                goto bad;
                        }
                        mode |= VWRITE;
                }
                if (fmode & FREAD)
                        mode |= VREAD;
                if (mode) {
                        error = VOP_ACCESS(vp, mode, cred);
                        if (error) {
                                /*
                                 * Special stale handling, re-resolve the
                                 * vnode.
                                 */
                                if (error == ESTALE) {
                                        vput(vp);
                                        vp = NULL;
                                        cache_setunresolved(&nd->nl_nch);
                                        error = cache_resolve(&nd->nl_nch, cred);
                                        if (error == 0)
                                                goto again;
                                }
                                goto bad;
                        }
                }
        }
        if (fmode & O_TRUNC) {
                vn_unlock(vp);                          /* XXX */
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
                VATTR_NULL(vap);
                vap->va_size = 0;
                error = VOP_SETATTR(vp, vap, cred);
                if (error)
                        goto bad;
        }

        /*
         * Setup the fp so VOP_OPEN can override it.  No descriptor has been
         * associated with the fp yet so we own it clean.
         *
         * f_nchandle inherits nl_nch.  This used to be necessary only for
         * directories but now we do it unconditionally so f*() ops
         * such as fchmod() can access the actual namespace that was
         * used to open the file.
         */
        if (fp) {
                fp->f_nchandle = nd->nl_nch;
                cache_zero(&nd->nl_nch);
                cache_unlock(&fp->f_nchandle);
        }

        /*
         * Get rid of nl_nch.  vn_open does not return it (it returns the
         * vnode or the file pointer).  Note: we can't leave nl_nch locked
         * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
         * on /dev/ttyd0
         */
        if (nd->nl_nch.ncp)
                cache_put(&nd->nl_nch);

        error = VOP_OPEN(vp, fmode, cred, fp);
        if (error) {
                /*
                 * setting f_ops to &badfileops will prevent the descriptor
                 * code from trying to close and release the vnode, since
                 * the open failed we do not want to call close.
                 */
                if (fp) {
                        fp->f_data = NULL;
                        fp->f_ops = &badfileops;
                }
                goto bad;
        }

#if 0
        /*
         * Assert that VREG files have been setup for vmio.
         */
        KASSERT(vp->v_type != VREG || vp->v_object != NULL,
                ("vn_open: regular file was not VMIO enabled!"));
#endif

        /*
         * Return the vnode.  XXX needs some cleaning up.  The vnode is
         * only returned in the fp == NULL case.
         */
        if (fp == NULL) {
                nd->nl_open_vp = vp;
                nd->nl_vp_fmode = fmode;
                if ((nd->nl_flags & NLC_LOCKVP) == 0)
                        vn_unlock(vp);
        } else {
                vput(vp);
        }
        return (0);
bad:
        if (vp)
                vput(vp);
        return (error);
}
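
/*
 * Illustrative sketch (editor's addition, not part of the original
 * file): the typical calling sequence for vn_open().  The caller always
 * pairs nlookup_init() with nlookup_done(); on success with fp == NULL
 * the opened vnode is handed back in nd.nl_open_vp.  The helper name
 * and error handling here are hypothetical.
 */
#if 0
static int
example_open(const char *path, struct vnode **vpp)
{
        struct nlookupdata nd;
        int error;

        error = nlookup_init(&nd, path, UIO_SYSSPACE,
                             NLC_FOLLOW | NLC_LOCKVP);
        if (error == 0)
                error = vn_open(&nd, NULL, FREAD, 0);
        if (error == 0) {
                *vpp = nd.nl_open_vp;   /* referenced, locked (NLC_LOCKVP) */
                nd.nl_open_vp = NULL;
        }
        nlookup_done(&nd);
        return (error);
}
#endif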
int
vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
{
        struct vnode *vp;
        int error;

        if (strncmp(devname, "/dev/", 5) == 0)
                devname += 5;
        if ((vp = getsynthvnode(devname)) == NULL) {
                error = ENODEV;
        } else {
                error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
                vn_unlock(vp);
                if (error) {
                        vrele(vp);
                        vp = NULL;
                }
        }
        *vpp = vp;
        return (error);
}
/*
 * Check for write permissions on the specified vnode.  nch may be NULL.
 */
int
vn_writechk(struct vnode *vp, struct nchandle *nch)
{
        /*
         * If there's shared text associated with
         * the vnode, try to free it up once.  If
         * we fail, we can't allow writing.
         */
        if (vp->v_flag & VTEXT)
                return (ETXTBSY);

        /*
         * If the vnode represents a regular file, check the mount
         * point via the nch.  This may be a different mount point
         * than the one embedded in the vnode (e.g. nullfs).
         *
         * We can still write to non-regular files (e.g. devices)
         * via read-only mounts.
         */
        if (nch && nch->ncp && vp->v_type == VREG)
                return (ncp_writechk(nch));
        return (0);
}
/*
 * Check whether the underlying mount is read-only.  The mount point
 * referenced by the namecache may be different from the mount point
 * used by the underlying vnode in the case of NULLFS, so a separate
 * check is needed.
 */
int
ncp_writechk(struct nchandle *nch)
{
        if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
                return (EROFS);
        return(0);
}
/*
 * Vnode close call
 */
int
vn_close(struct vnode *vp, int flags)
{
        int error;

        if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) == 0) {
                error = VOP_CLOSE(vp, flags);
                vn_unlock(vp);
        }
        vrele(vp);
        return (error);
}
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{
        /*
         * Sequential heuristic - detect sequential operation
         */
        if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
            uio->uio_offset == fp->f_nextoff) {
                int tmpseq = fp->f_seqcount;
                /*
                 * XXX we assume that the filesystem block size is
                 * the default.  Not true, but still gives us a pretty
                 * good indicator of how sequential the read operations
                 * are.
                 */
                tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
                if (tmpseq > IO_SEQMAX)
                        tmpseq = IO_SEQMAX;
                fp->f_seqcount = tmpseq;
                return(fp->f_seqcount << IO_SEQSHIFT);
        }

        /*
         * Not sequential, quick draw-down of seqcount
         */
        if (fp->f_seqcount > 1)
                fp->f_seqcount = 1;
        else
                fp->f_seqcount = 0;
        return(0);
}
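
/*
 * Worked example (editor's addition; assumes the stock BKVASIZE of 16K
 * and IO_SEQMAX of 127): a caller issuing back-to-back 64K reads adds
 * 64K/16K = 4 to f_seqcount per call, so the hint saturates after
 * roughly 32 sequential reads, at which point the return value is
 * 127 << IO_SEQSHIFT.  Two non-sequential operations drain the state
 * back to zero (first to 1, then to 0).
 */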
/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
        off_t offset, enum uio_seg segflg, int ioflg,
        struct ucred *cred, int *aresid)
{
        struct uio auio;
        struct iovec aiov;
        struct ccms_lock ccms_lock;
        int error;

        if ((ioflg & IO_NODELOCKED) == 0)
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        aiov.iov_base = base;
        aiov.iov_len = len;
        auio.uio_resid = len;
        auio.uio_offset = offset;
        auio.uio_segflg = segflg;
        auio.uio_rw = rw;
        auio.uio_td = curthread;
        ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio);
        if (rw == UIO_READ) {
                error = VOP_READ(vp, &auio, ioflg, cred);
        } else {
                error = VOP_WRITE(vp, &auio, ioflg, cred);
        }
        ccms_lock_put(&vp->v_ccms, &ccms_lock);
        if (aresid)
                *aresid = auio.uio_resid;
        else
                if (auio.uio_resid && error == 0)
                        error = EIO;
        if ((ioflg & IO_NODELOCKED) == 0)
                vn_unlock(vp);
        return (error);
}
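
/*
 * Illustrative sketch (editor's addition, not part of the original
 * file): a one-shot synchronous read of the first 512 bytes of a
 * referenced, unlocked vnode into a kernel buffer.  Pass IO_NODELOCKED
 * instead of 0 if the vnode lock is already held.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_read_header(struct vnode *vp)
{
        char buf[512];
        int resid;
        int error;

        error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
                        UIO_SYSSPACE, 0, proc0.p_ucred, &resid);
        /* on success, sizeof(buf) - resid bytes of buf are valid */
        return (error);
}
#endif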
/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
                 off_t offset, enum uio_seg segflg, int ioflg,
                 struct ucred *cred, int *aresid)
{
        int error = 0;

        do {
                int chunk;

                /*
                 * Force `offset' to a multiple of MAXBSIZE except possibly
                 * for the first chunk, so that filesystems only need to
                 * write full blocks except possibly for the first and last
                 * chunks.
                 */
                chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

                if (chunk > len)
                        chunk = len;
                if (rw != UIO_READ && vp->v_type == VREG)
                        bwillwrite();
                error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
                                ioflg, cred, aresid);
                len -= chunk;   /* aresid calc already includes length */
                if (error)
                        break;
                offset += chunk;
                base += chunk;
                uio_yield();
        } while (len);
        if (aresid)
                *aresid += len;
        return (error);
}
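
/*
 * Illustrative note (editor's addition, not from the original source):
 * the chunked variant is meant for very large transfers such as
 * coredump writes, e.g.
 *
 *      error = vn_rdwr_inchunks(UIO_WRITE, vp, base, len, offset,
 *                               UIO_USERSPACE, IO_UNIT, cred, NULL);
 *
 * It behaves like vn_rdwr() (including the aresid accounting) but
 * issues at most one MAXBSIZE-aligned chunk per iteration and yields
 * between chunks so other threads can acquire the vnode lock.
 */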
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
        struct ccms_lock ccms_lock;
        struct vnode *vp;
        int error, ioflag;

        get_mplock();
        KASSERT(uio->uio_td == curthread,
                ("uio_td %p is not td %p", uio->uio_td, curthread));
        vp = (struct vnode *)fp->f_data;

        ioflag = 0;
        if (flags & O_FBLOCKING) {
                /* ioflag &= ~IO_NDELAY; */
        } else if (flags & O_FNONBLOCKING) {
                ioflag |= IO_NDELAY;
        } else if (fp->f_flag & FNONBLOCK) {
                ioflag |= IO_NDELAY;
        }
        if (flags & O_FBUFFERED) {
                /* ioflag &= ~IO_DIRECT; */
        } else if (flags & O_FUNBUFFERED) {
                ioflag |= IO_DIRECT;
        } else if (fp->f_flag & O_DIRECT) {
                ioflag |= IO_DIRECT;
        }
        vn_lock(vp, LK_SHARED | LK_RETRY);
        if ((flags & O_FOFFSET) == 0)
                uio->uio_offset = fp->f_offset;
        ioflag |= sequential_heuristic(uio, fp);

        ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
        error = VOP_READ(vp, uio, ioflag, cred);
        ccms_lock_put(&vp->v_ccms, &ccms_lock);
        if ((flags & O_FOFFSET) == 0)
                fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
        vn_unlock(vp);
        rel_mplock();
        return (error);
}
/*
 * Device-optimized file table vnode read routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 *
 * MPALMOSTSAFE - acquires mplock
 */
static int
svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
        struct vnode *vp;
        int ioflag;
        int error;
        cdev_t dev;

        get_mplock();
        KASSERT(uio->uio_td == curthread,
                ("uio_td %p is not td %p", uio->uio_td, curthread));

        vp = (struct vnode *)fp->f_data;
        if (vp == NULL || vp->v_type == VBAD) {
                error = EBADF;
                goto done;
        }

        if ((dev = vp->v_rdev) == NULL) {
                error = EBADF;
                goto done;
        }
        reference_dev(dev);

        if (uio->uio_resid == 0) {
                error = 0;
                goto done;
        }
        if ((flags & O_FOFFSET) == 0)
                uio->uio_offset = fp->f_offset;

        ioflag = 0;
        if (flags & O_FBLOCKING) {
                /* ioflag &= ~IO_NDELAY; */
        } else if (flags & O_FNONBLOCKING) {
                ioflag |= IO_NDELAY;
        } else if (fp->f_flag & FNONBLOCK) {
                ioflag |= IO_NDELAY;
        }
        if (flags & O_FBUFFERED) {
                /* ioflag &= ~IO_DIRECT; */
        } else if (flags & O_FUNBUFFERED) {
                ioflag |= IO_DIRECT;
        } else if (fp->f_flag & O_DIRECT) {
                ioflag |= IO_DIRECT;
        }
        ioflag |= sequential_heuristic(uio, fp);

        error = dev_dread(dev, uio, ioflag);

        release_dev(dev);
        if ((flags & O_FOFFSET) == 0)
                fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
done:
        rel_mplock();
        return (error);
}
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
        struct ccms_lock ccms_lock;
        struct vnode *vp;
        int error, ioflag;

        get_mplock();
        KASSERT(uio->uio_td == curthread,
                ("uio_td %p is not td %p", uio->uio_td, curthread));
        vp = (struct vnode *)fp->f_data;
        if (vp->v_type == VREG)
                bwillwrite();
        vp = (struct vnode *)fp->f_data;        /* XXX needed? */

        ioflag = IO_UNIT;
        if (vp->v_type == VREG &&
            ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
                ioflag |= IO_APPEND;
        }

        if (flags & O_FBLOCKING) {
                /* ioflag &= ~IO_NDELAY; */
        } else if (flags & O_FNONBLOCKING) {
                ioflag |= IO_NDELAY;
        } else if (fp->f_flag & FNONBLOCK) {
                ioflag |= IO_NDELAY;
        }
        if (flags & O_FBUFFERED) {
                /* ioflag &= ~IO_DIRECT; */
        } else if (flags & O_FUNBUFFERED) {
                ioflag |= IO_DIRECT;
        } else if (fp->f_flag & O_DIRECT) {
                ioflag |= IO_DIRECT;
        }
        if (flags & O_FASYNCWRITE) {
                /* ioflag &= ~IO_SYNC; */
        } else if (flags & O_FSYNCWRITE) {
                ioflag |= IO_SYNC;
        } else if (fp->f_flag & O_FSYNC) {
                ioflag |= IO_SYNC;
        }

        if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
                ioflag |= IO_SYNC;
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        if ((flags & O_FOFFSET) == 0)
                uio->uio_offset = fp->f_offset;
        ioflag |= sequential_heuristic(uio, fp);
        ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
        error = VOP_WRITE(vp, uio, ioflag, cred);
        ccms_lock_put(&vp->v_ccms, &ccms_lock);
        if ((flags & O_FOFFSET) == 0)
                fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
        vn_unlock(vp);
        rel_mplock();
        return (error);
}
/*
 * Device-optimized file table vnode write routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 *
 * MPALMOSTSAFE - acquires mplock
 */
static int
svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
        struct vnode *vp;
        int ioflag;
        int error;
        cdev_t dev;

        get_mplock();
        KASSERT(uio->uio_td == curthread,
                ("uio_td %p is not td %p", uio->uio_td, curthread));

        vp = (struct vnode *)fp->f_data;
        if (vp == NULL || vp->v_type == VBAD) {
                error = EBADF;
                goto done;
        }
        if (vp->v_type == VREG)
                bwillwrite();
        vp = (struct vnode *)fp->f_data;        /* XXX needed? */

        if ((dev = vp->v_rdev) == NULL) {
                error = EBADF;
                goto done;
        }
        reference_dev(dev);

        if ((flags & O_FOFFSET) == 0)
                uio->uio_offset = fp->f_offset;

        ioflag = IO_UNIT;
        if (vp->v_type == VREG &&
            ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
                ioflag |= IO_APPEND;
        }

        if (flags & O_FBLOCKING) {
                /* ioflag &= ~IO_NDELAY; */
        } else if (flags & O_FNONBLOCKING) {
                ioflag |= IO_NDELAY;
        } else if (fp->f_flag & FNONBLOCK) {
                ioflag |= IO_NDELAY;
        }
        if (flags & O_FBUFFERED) {
                /* ioflag &= ~IO_DIRECT; */
        } else if (flags & O_FUNBUFFERED) {
                ioflag |= IO_DIRECT;
        } else if (fp->f_flag & O_DIRECT) {
                ioflag |= IO_DIRECT;
        }
        if (flags & O_FASYNCWRITE) {
                /* ioflag &= ~IO_SYNC; */
        } else if (flags & O_FSYNCWRITE) {
                ioflag |= IO_SYNC;
        } else if (fp->f_flag & O_FSYNC) {
                ioflag |= IO_SYNC;
        }

        if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
                ioflag |= IO_SYNC;
        ioflag |= sequential_heuristic(uio, fp);

        error = dev_dwrite(dev, uio, ioflag);

        release_dev(dev);
        if ((flags & O_FOFFSET) == 0)
                fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
done:
        rel_mplock();
        return (error);
}
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
{
        struct vnode *vp;
        int error;

        get_mplock();
        vp = (struct vnode *)fp->f_data;
        error = vn_stat(vp, sb, cred);
        rel_mplock();
        return (error);
}
int
vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
{
        struct vattr vattr;
        struct vattr *vap;
        int error;
        u_short mode;
        cdev_t dev;

        vap = &vattr;
        error = VOP_GETATTR(vp, vap);
        if (error)
                return (error);

        /*
         * Zero the spare stat fields
         */
        sb->st_lspare = 0;
        sb->st_qspare = 0;

        /*
         * Copy from vattr table
         */
        if (vap->va_fsid != VNOVAL)
                sb->st_dev = vap->va_fsid;
        else
                sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
        sb->st_ino = vap->va_fileid;
        mode = vap->va_mode;
        switch (vap->va_type) {
        case VREG:
                mode |= S_IFREG;
                break;
        case VDATABASE:
                mode |= S_IFDB;
                break;
        case VDIR:
                mode |= S_IFDIR;
                break;
        case VBLK:
                mode |= S_IFBLK;
                break;
        case VCHR:
                mode |= S_IFCHR;
                break;
        case VLNK:
                mode |= S_IFLNK;
                /* This is a cosmetic change, symlinks do not have a mode. */
                if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
                        mode &= ~ACCESSPERMS;   /* 0000 */
                else
                        mode |= ACCESSPERMS;    /* 0777 */
                break;
        case VSOCK:
                mode |= S_IFSOCK;
                break;
        case VFIFO:
                mode |= S_IFIFO;
                break;
        default:
                return (EBADF);
        }
        sb->st_mode = mode;
        if (vap->va_nlink > (nlink_t)-1)
                sb->st_nlink = (nlink_t)-1;
        else
                sb->st_nlink = vap->va_nlink;
        sb->st_uid = vap->va_uid;
        sb->st_gid = vap->va_gid;
        sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor);
        sb->st_size = vap->va_size;
        sb->st_atimespec = vap->va_atime;
        sb->st_mtimespec = vap->va_mtime;
        sb->st_ctimespec = vap->va_ctime;

        /*
         * A VCHR and VBLK device may track the last access and last modified
         * time independently of the filesystem.  This is particularly true
         * because device read and write calls may bypass the filesystem.
         */
        if (vp->v_type == VCHR || vp->v_type == VBLK) {
                if ((dev = vp->v_rdev) != NULL) {
                        if (dev->si_lastread) {
                                sb->st_atimespec.tv_sec = dev->si_lastread;
                                sb->st_atimespec.tv_nsec = 0;
                        }
                        if (dev->si_lastwrite) {
                                sb->st_mtimespec.tv_sec = dev->si_lastwrite;
                                sb->st_mtimespec.tv_nsec = 0;
                        }
                }
        }

        /*
         * According to www.opengroup.org, the meaning of st_blksize is
         * "a filesystem-specific preferred I/O block size for this
         * object.  In some filesystem types, this may vary from file
         * to file"
         * Default to PAGE_SIZE after much discussion.
         */
        if (vap->va_type == VREG) {
                sb->st_blksize = vap->va_blocksize;
        } else if (vn_isdisk(vp, NULL)) {
                /*
                 * XXX this is broken.  If the device is not yet open (aka
                 * stat() call, aka v_rdev == NULL), how are we supposed
                 * to get a valid block size out of it?
                 */
                cdev_t dev;

                if ((dev = vp->v_rdev) == NULL) {
                        if (vp->v_type == VCHR)
                                dev = get_dev(vp->v_umajor, vp->v_uminor);
                }
                sb->st_blksize = dev->si_bsize_best;
                if (sb->st_blksize < dev->si_bsize_phys)
                        sb->st_blksize = dev->si_bsize_phys;
                if (sb->st_blksize < BLKDEV_IOSIZE)
                        sb->st_blksize = BLKDEV_IOSIZE;
        } else {
                sb->st_blksize = PAGE_SIZE;
        }

        sb->st_flags = vap->va_flags;
        if (suser_cred(cred, 0))
                sb->st_gen = 0;
        else
                sb->st_gen = (u_int32_t)vap->va_gen;

#if (S_BLKSIZE == 512)
        /* Optimize this case */
        sb->st_blocks = vap->va_bytes >> 9;
#else
        sb->st_blocks = vap->va_bytes / S_BLKSIZE;
#endif
        sb->st_fsmid = vap->va_fsmid;
        return (0);
}
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred)
{
        struct vnode *vp = ((struct vnode *)fp->f_data);
        struct vnode *ovp;
        struct vattr vattr;
        int error;

        get_mplock();

        switch (vp->v_type) {
        case VREG:
        case VDIR:
                if (com == FIONREAD) {
                        if ((error = VOP_GETATTR(vp, &vattr)) != 0)
                                break;
                        *(int *)data = vattr.va_size - fp->f_offset;
                        error = 0;
                        break;
                }
                if (com == FIOASYNC) {                  /* XXX */
                        error = 0;                      /* XXX */
                        break;
                }
                /* fall into ... */
        default:
#if 0
                return (ENOTTY);
#endif
        case VFIFO:
        case VCHR:
        case VBLK:
                if (com == FIODTYPE) {
                        if (vp->v_type != VCHR && vp->v_type != VBLK) {
                                error = ENOTTY;
                                break;
                        }
                        *(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
                        error = 0;
                        break;
                }
                error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred);
                if (error == 0 && com == TIOCSCTTY) {
                        struct proc *p = curthread->td_proc;
                        struct session *sess;

                        if (p == NULL) {
                                error = ENOTTY;
                                break;
                        }

                        sess = p->p_session;
                        /* Do nothing if reassigning same control tty */
                        if (sess->s_ttyvp == vp) {
                                error = 0;
                                break;
                        }

                        /* Get rid of reference to old control tty */
                        ovp = sess->s_ttyvp;
                        vref(vp);
                        sess->s_ttyvp = vp;
                        if (ovp)
                                vrele(ovp);
                }
                break;
        }
        rel_mplock();
        return (error);
}
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_poll(struct file *fp, int events, struct ucred *cred)
{
        int error;

        get_mplock();
        error = VOP_POLL(((struct vnode *)fp->f_data), events, cred);
        rel_mplock();
        return (error);
}
/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef DEBUG_LOCKS
vn_lock(struct vnode *vp, int flags)
#else
debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
#endif
{
        int error;

        do {
#ifdef  DEBUG_LOCKS
                vp->filename = filename;
                vp->line = line;
                error = debuglockmgr(&vp->v_lock, flags,
                                     "vn_lock", filename, line);
#else
                error = lockmgr(&vp->v_lock, flags);
#endif
                if (error == 0)
                        break;
        } while (flags & LK_RETRY);

        /*
         * Because we (had better!) have a ref on the vnode, once it
         * goes to VRECLAIMED state it will not be recycled until all
         * refs go away.  So we can just check the flag.
         */
        if (error == 0 && (vp->v_flag & VRECLAIMED)) {
                lockmgr(&vp->v_lock, LK_RELEASE);
                error = ENOENT;
        }
        return (error);
}
void
vn_unlock(struct vnode *vp)
{
        lockmgr(&vp->v_lock, LK_RELEASE);
}

int
vn_islocked(struct vnode *vp)
{
        return (lockstatus(&vp->v_lock, curthread));
}
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_closefile(struct file *fp)
{
        int error;

        get_mplock();
        fp->f_ops = &badfileops;
        error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
        rel_mplock();
        return(error);
}
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
        int error;

        get_mplock();
        error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
        rel_mplock();
        return (error);
}