sys/kern/vfs_syscalls.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)vfs_syscalls.c      8.13 (Berkeley) 4/15/94
  35  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
  36  */
  37
  38 #include <sys/param.h>
  39 #include <sys/systm.h>
  40 #include <sys/buf.h>
  41 #include <sys/conf.h>
  42 #include <sys/sysent.h>
  43 #include <sys/malloc.h>
  44 #include <sys/mount.h>
  45 #include <sys/mountctl.h>
  46 #include <sys/sysproto.h>
  47 #include <sys/filedesc.h>
  48 #include <sys/kernel.h>
  49 #include <sys/fcntl.h>
  50 #include <sys/file.h>
  51 #include <sys/linker.h>
  52 #include <sys/stat.h>
  53 #include <sys/unistd.h>
  54 #include <sys/vnode.h>
  55 #include <sys/proc.h>
  56 #include <sys/priv.h>
  57 #include <sys/jail.h>
  58 #include <sys/namei.h>
  59 #include <sys/nlookup.h>
  60 #include <sys/dirent.h>
  61 #include <sys/extattr.h>
  62 #include <sys/spinlock.h>
  63 #include <sys/kern_syscall.h>
  64 #include <sys/objcache.h>
  65 #include <sys/sysctl.h>
  66
  67 #include <sys/buf2.h>
  68 #include <sys/file2.h>
  69 #include <sys/spinlock2.h>
  70
  71 #include <vm/vm.h>
  72 #include <vm/vm_object.h>
  73 #include <vm/vm_page.h>
  74
  75 #include <machine/limits.h>
  76 #include <machine/stdarg.h>
  77
  78 static void mount_warning(struct mount *mp, const char *ctl, ...)
  79                 __printflike(2, 3);
  80 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
  81 static int checkvp_chdir (struct vnode *vn, struct thread *td);
  82 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
  83 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
  84 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
  85 static int getutimes (struct timeval *, struct timespec *);
  86 static int getutimens (const struct timespec *, struct timespec *, int *);
  87 static int setfown (struct mount *, struct vnode *, uid_t, gid_t);
  88 static int setfmode (struct vnode *, int);
  89 static int setfflags (struct vnode *, int);
  90 static int setutimes (struct vnode *, struct vattr *,
  91                         const struct timespec *, int);
  92 static int      usermount = 0;  /* if 1, non-root can mount fs. */
  93
  94 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
  95     "Allow non-root users to mount filesystems");
  96
  97 /*
  98  * Virtual File System System Calls
  99  */
 100
 101 /*
 102  * Mount a file system.
 103  *
 104  * mount_args(char *type, char *path, int flags, caddr_t data)
 105  *
 106  * MPALMOSTSAFE
 107  */
 108 int
 109 sys_mount(struct mount_args *uap)
 110 {
 111         struct thread *td = curthread;
 112         struct vnode *vp;
 113         struct nchandle nch;
 114         struct mount *mp, *nullmp;
 115         struct vfsconf *vfsp;
 116         int error, flag = 0, flag2 = 0;
 117         int hasmount;
 118         struct vattr va;
 119         struct nlookupdata nd;
 120         char fstypename[MFSNAMELEN];
 121         struct ucred *cred;
 122
 123         cred = td->td_ucred;
 124         if (jailed(cred)) {
 125                 error = EPERM;
 126                 goto done;
 127         }
 128         if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
 129                 goto done;
 130
 131         /*
 132          * Do not allow NFS export by non-root users.
 133          */
 134         if (uap->flags & MNT_EXPORTED) {
 135                 error = priv_check(td, PRIV_ROOT);
 136                 if (error)
 137                         goto done;
 138         }
 139         /*
 140          * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
 141          */
 142         if (priv_check(td, PRIV_ROOT))
 143                 uap->flags |= MNT_NOSUID | MNT_NODEV;
 144
 145         /*
 146          * Lookup the requested path and extract the nch and vnode.
 147          */
 148         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 149         if (error == 0) {
 150                 if ((error = nlookup(&nd)) == 0) {
 151                         if (nd.nl_nch.ncp->nc_vp == NULL)
 152                                 error = ENOENT;
 153                 }
 154         }
 155         if (error) {
 156                 nlookup_done(&nd);
 157                 goto done;
 158         }
 159
 160         /*
 161          * If the target filesystem is resolved via a nullfs mount, then
 162          * nd.nl_nch.mount will be pointing to the nullfs mount structure
 163          * instead of the target file system. We need it in case we are
 164          * doing an update.
 165          */
 166         nullmp = nd.nl_nch.mount;
 167
 168         /*
 169          * Extract the locked+refd ncp and cleanup the nd structure
 170          */
 171         nch = nd.nl_nch;
 172         cache_zero(&nd.nl_nch);
 173         nlookup_done(&nd);
 174
 175         if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
 176             (mp = cache_findmount(&nch)) != NULL) {
 177                 cache_dropmount(mp);
 178                 hasmount = 1;
 179         } else {
 180                 hasmount = 0;
 181         }
 182
 183
 184         /*
 185          * now we have the locked ref'd nch and unreferenced vnode.
 186          */
 187         vp = nch.ncp->nc_vp;
 188         if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
 189                 cache_put(&nch);
 190                 goto done;
 191         }
 192         cache_unlock(&nch);
 193
 194         /*
 195          * Extract the file system type. We need to know this early, to take
 196          * appropriate actions if we are dealing with a nullfs.
 197          */
 198         if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
 199                 cache_drop(&nch);
 200                 vput(vp);
 201                 goto done;
 202         }
 203
 204         /*
 205          * Now we have an unlocked ref'd nch and a locked ref'd vp
 206          */
 207         if (uap->flags & MNT_UPDATE) {
 208                 if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
 209                         cache_drop(&nch);
 210                         vput(vp);
 211                         error = EINVAL;
 212                         goto done;
 213                 }
 214
 215                 if (strncmp(fstypename, "null", 5) == 0) {
 216                         KKASSERT(nullmp);
 217                         mp = nullmp;
 218                 } else {
 219                         mp = vp->v_mount;
 220                 }
 221
 222                 flag = mp->mnt_flag;
 223                 flag2 = mp->mnt_kern_flag;
 224                 /*
 225                  * We only allow the filesystem to be reloaded if it
 226                  * is currently mounted read-only.
 227                  */
 228                 if ((uap->flags & MNT_RELOAD) &&
 229                     ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 230                         cache_drop(&nch);
 231                         vput(vp);
 232                         error = EOPNOTSUPP;     /* Needs translation */
 233                         goto done;
 234                 }
 235                 /*
 236                  * Only root, or the user that did the original mount is
 237                  * permitted to update it.
 238                  */
 239                 if (mp->mnt_stat.f_owner != cred->cr_uid &&
 240                     (error = priv_check(td, PRIV_ROOT))) {
 241                         cache_drop(&nch);
 242                         vput(vp);
 243                         goto done;
 244                 }
 245                 if (vfs_busy(mp, LK_NOWAIT)) {
 246                         cache_drop(&nch);
 247                         vput(vp);
 248                         error = EBUSY;
 249                         goto done;
 250                 }
 251                 if (hasmount) {
 252                         cache_drop(&nch);
 253                         vfs_unbusy(mp);
 254                         vput(vp);
 255                         error = EBUSY;
 256                         goto done;
 257                 }
 258                 mp->mnt_flag |=
 259                     uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
 260                 lwkt_gettoken(&mp->mnt_token);
 261                 vn_unlock(vp);
 262                 vfsp = mp->mnt_vfc;
 263                 goto update;
 264         }
 265
 266         /*
 267          * If the user is not root, ensure that they own the directory
 268          * onto which we are attempting to mount.
 269          */
 270         if ((error = VOP_GETATTR(vp, &va)) ||
 271             (va.va_uid != cred->cr_uid &&
 272              (error = priv_check(td, PRIV_ROOT)))) {
 273                 cache_drop(&nch);
 274                 vput(vp);
 275                 goto done;
 276         }
 277         if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
 278                 cache_drop(&nch);
 279                 vput(vp);
 280                 goto done;
 281         }
 282         if (vp->v_type != VDIR) {
 283                 cache_drop(&nch);
 284                 vput(vp);
 285                 error = ENOTDIR;
 286                 goto done;
 287         }
 288         if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
 289                 cache_drop(&nch);
 290                 vput(vp);
 291                 error = EPERM;
 292                 goto done;
 293         }
 294         vfsp = vfsconf_find_by_name(fstypename);
 295         if (vfsp == NULL) {
 296                 linker_file_t lf;
 297
 298                 /* Only load modules for root (very important!) */
 299                 if ((error = priv_check(td, PRIV_ROOT)) != 0) {
 300                         cache_drop(&nch);
 301                         vput(vp);
 302                         goto done;
 303                 }
 304                 error = linker_load_file(fstypename, &lf);
 305                 if (error || lf == NULL) {
 306                         cache_drop(&nch);
 307                         vput(vp);
 308                         if (lf == NULL)
 309                                 error = ENODEV;
 310                         goto done;
 311                 }
 312                 lf->userrefs++;
 313                 /* lookup again, see if the VFS was loaded */
 314                 vfsp = vfsconf_find_by_name(fstypename);
 315                 if (vfsp == NULL) {
 316                         lf->userrefs--;
 317                         linker_file_unload(lf);
 318                         cache_drop(&nch);
 319                         vput(vp);
 320                         error = ENODEV;
 321                         goto done;
 322                 }
 323         }
 324         if (hasmount) {
 325                 cache_drop(&nch);
 326                 vput(vp);
 327                 error = EBUSY;
 328                 goto done;
 329         }
 330
 331         /*
 332          * Allocate and initialize the filesystem.
 333          */
 334         mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
 335         mount_init(mp);
 336         vfs_busy(mp, LK_NOWAIT);
 337         mp->mnt_op = vfsp->vfc_vfsops;
 338         mp->mnt_vfc = vfsp;
 339         mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
 340         vfsp->vfc_refcount++;
 341         mp->mnt_stat.f_type = vfsp->vfc_typenum;
 342         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 343         strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 344         mp->mnt_stat.f_owner = cred->cr_uid;
 345         lwkt_gettoken(&mp->mnt_token);
 346         vn_unlock(vp);
 347 update:
 348         /*
 349          * (per-mount token acquired at this point)
 350          *
 351          * Set the mount level flags.
 352          */
 353         if (uap->flags & MNT_RDONLY)
 354                 mp->mnt_flag |= MNT_RDONLY;
 355         else if (mp->mnt_flag & MNT_RDONLY)
 356                 mp->mnt_kern_flag |= MNTK_WANTRDWR;
 357         mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 358             MNT_SYNCHRONOUS | MNT_ASYNC | MNT_NOATIME |
 359             MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
 360             MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
 361             MNT_AUTOMOUNTED);
 362         mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
 363             MNT_NODEV | MNT_SYNCHRONOUS | MNT_ASYNC | MNT_FORCE |
 364             MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
 365             MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
 366             MNT_AUTOMOUNTED);
 367
 368         /*
 369          * Pre-set the mount's ALL_MPSAFE flags if specified in the vfsconf.
 370          * This way the initial VFS_MOUNT() call will also be MPSAFE.
 371          */
 372         if (vfsp->vfc_flags & VFCF_MPSAFE)
 373                 mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;
 374
 375         /*
 376          * Mount the filesystem.
 377          * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 378          * get.
 379          */
 380         error = VFS_MOUNT(mp, uap->path, uap->data, cred);
 381         if (mp->mnt_flag & MNT_UPDATE) {
 382                 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
 383                         mp->mnt_flag &= ~MNT_RDONLY;
 384                 mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
 385                 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
 386                 if (error) {
 387                         mp->mnt_flag = flag;
 388                         mp->mnt_kern_flag = flag2;
 389                 }
 390                 lwkt_reltoken(&mp->mnt_token);
 391                 vfs_unbusy(mp);
 392                 vrele(vp);
 393                 cache_drop(&nch);
 394                 goto done;
 395         }
 396         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 397
 398         /*
 399          * Put the new filesystem on the mount list after root.  The mount
 400          * point gets its own mnt_ncmountpt (unless the VFS already set one
 401          * up) which represents the root of the mount.  The lookup code
 402          * detects the mount point going forward and checks the root of
 403          * the mount going backwards.
 404          *
 405          * It is not necessary to invalidate or purge the vnode underneath
 406          * because elements under the mount will be given their own glue
 407          * namecache record.
 408          */
 409         if (!error) {
 410                 if (mp->mnt_ncmountpt.ncp == NULL) {
 411                         /*
 412                          * Allocate, then unlock, but leave the ref intact.
 413                          * This is the mnt_refs (1) that we will retain
 414                          * through to the unmount.
 415                          */
 416                         cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
 417                         cache_unlock(&mp->mnt_ncmountpt);
 418                 }
 419                 vn_unlock(vp);
 420                 mp->mnt_ncmounton = nch;                /* inherits ref */
 421                 cache_lock(&nch);
 422                 nch.ncp->nc_flag |= NCF_ISMOUNTPT;
 423                 cache_unlock(&nch);
 424                 cache_ismounting(mp);
 425                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 426
 427                 mountlist_insert(mp, MNTINS_LAST);
 428                 vn_unlock(vp);
 429                 checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
 430                 error = vfs_allocate_syncvnode(mp);
 431                 lwkt_reltoken(&mp->mnt_token);
 432                 vfs_unbusy(mp);
 433                 error = VFS_START(mp, 0);
 434                 vrele(vp);
 435                 KNOTE(&fs_klist, VQ_MOUNT);
 436         } else {
 437                 vn_syncer_thr_stop(mp);
 438                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
 439                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
 440                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
 441                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
 442                 vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
 443                 mp->mnt_vfc->vfc_refcount--;
 444                 lwkt_reltoken(&mp->mnt_token);
 445                 vfs_unbusy(mp);
 446                 kfree(mp, M_MOUNT);
 447                 cache_drop(&nch);
 448                 vput(vp);
 449         }
 450 done:
 451         return (error);
 452 }
 453
 454 /*
 455  * Scan all active processes to see if any of them have a current
 456  * or root directory onto which the new filesystem has just been
 457  * mounted. If so, replace them with the new mount point.
 458  *
 459  * Both old_nch and new_nch are ref'd on call but not locked.
 460  * new_nch must be temporarily locked so it can be associated with the
 461  * vnode representing the root of the mount point.
 462  */
 463 struct checkdirs_info {
 464         struct nchandle old_nch;
 465         struct nchandle new_nch;
 466         struct vnode *old_vp;
 467         struct vnode *new_vp;
 468 };
 469
 470 static int checkdirs_callback(struct proc *p, void *data);
 471
 472 static void
 473 checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
 474 {
 475         struct checkdirs_info info;
 476         struct vnode *olddp;
 477         struct vnode *newdp;
 478         struct mount *mp;
 479
 480         /*
 481          * If the old mount point's vnode has a usecount of 1, it is not
 482          * being held as a descriptor anywhere.
 483          */
 484         olddp = old_nch->ncp->nc_vp;
 485         if (olddp == NULL || VREFCNT(olddp) == 1)
 486                 return;
 487
 488         /*
 489          * Force the root vnode of the new mount point to be resolved
 490          * so we can update any matching processes.
 491          */
 492         mp = new_nch->mount;
 493         if (VFS_ROOT(mp, &newdp))
 494                 panic("mount: lost mount");
 495         vn_unlock(newdp);
 496         cache_lock(new_nch);
 497         vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY);
 498         cache_setunresolved(new_nch);
 499         cache_setvp(new_nch, newdp);
 500         cache_unlock(new_nch);
 501
 502         /*
 503          * Special handling of the root node
 504          */
 505         if (rootvnode == olddp) {
 506                 vref(newdp);
 507                 vfs_cache_setroot(newdp, cache_hold(new_nch));
 508         }
 509
 510         /*
 511          * Pass newdp separately so the callback does not have to access
 512          * it via new_nch->ncp->nc_vp.
 513          */
 514         info.old_nch = *old_nch;
 515         info.new_nch = *new_nch;
 516         info.new_vp = newdp;
 517         allproc_scan(checkdirs_callback, &info, 0);
 518         vput(newdp);
 519 }
 520
 521 /*
 522  * NOTE: callback is not MP safe because the scanned process's filedesc
 523  * structure can be ripped out from under us, amoung other things.
 524  */
 525 static int
 526 checkdirs_callback(struct proc *p, void *data)
 527 {
 528         struct checkdirs_info *info = data;
 529         struct filedesc *fdp;
 530         struct nchandle ncdrop1;
 531         struct nchandle ncdrop2;
 532         struct vnode *vprele1;
 533         struct vnode *vprele2;
 534
 535         if ((fdp = p->p_fd) != NULL) {
 536                 cache_zero(&ncdrop1);
 537                 cache_zero(&ncdrop2);
 538                 vprele1 = NULL;
 539                 vprele2 = NULL;
 540
 541                 /*
 542                  * MPUNSAFE - XXX fdp can be pulled out from under a
 543                  * foreign process.
 544                  *
 545                  * A shared filedesc is ok, we don't have to copy it
 546                  * because we are making this change globally.
 547                  */
 548                 spin_lock(&fdp->fd_spin);
 549                 if (fdp->fd_ncdir.mount == info->old_nch.mount &&
 550                     fdp->fd_ncdir.ncp == info->old_nch.ncp) {
 551                         vprele1 = fdp->fd_cdir;
 552                         vref(info->new_vp);
 553                         fdp->fd_cdir = info->new_vp;
 554                         ncdrop1 = fdp->fd_ncdir;
 555                         cache_copy(&info->new_nch, &fdp->fd_ncdir);
 556                 }
 557                 if (fdp->fd_nrdir.mount == info->old_nch.mount &&
 558                     fdp->fd_nrdir.ncp == info->old_nch.ncp) {
 559                         vprele2 = fdp->fd_rdir;
 560                         vref(info->new_vp);
 561                         fdp->fd_rdir = info->new_vp;
 562                         ncdrop2 = fdp->fd_nrdir;
 563                         cache_copy(&info->new_nch, &fdp->fd_nrdir);
 564                 }
 565                 spin_unlock(&fdp->fd_spin);
 566                 if (ncdrop1.ncp)
 567                         cache_drop(&ncdrop1);
 568                 if (ncdrop2.ncp)
 569                         cache_drop(&ncdrop2);
 570                 if (vprele1)
 571                         vrele(vprele1);
 572                 if (vprele2)
 573                         vrele(vprele2);
 574         }
 575         return(0);
 576 }
 577
 578 /*
 579  * Unmount a file system.
 580  *
 581  * Note: unmount takes a path to the vnode mounted on as argument,
 582  * not special file (as before).
 583  *
 584  * umount_args(char *path, int flags)
 585  *
 586  * MPALMOSTSAFE
 587  */
 588 int
 589 sys_unmount(struct unmount_args *uap)
 590 {
 591         struct thread *td = curthread;
 592         struct proc *p __debugvar = td->td_proc;
 593         struct mount *mp = NULL;
 594         struct nlookupdata nd;
 595         int error;
 596
 597         KKASSERT(p);
 598         if (td->td_ucred->cr_prison != NULL) {
 599                 error = EPERM;
 600                 goto done;
 601         }
 602         if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
 603                 goto done;
 604
 605         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 606         if (error == 0)
 607                 error = nlookup(&nd);
 608         if (error)
 609                 goto out;
 610
 611         mp = nd.nl_nch.mount;
 612
 613         /*
 614          * Only root, or the user that did the original mount is
 615          * permitted to unmount this filesystem.
 616          */
 617         if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
 618             (error = priv_check(td, PRIV_ROOT)))
 619                 goto out;
 620
 621         /*
 622          * Don't allow unmounting the root file system.
 623          */
 624         if (mp->mnt_flag & MNT_ROOTFS) {
 625                 error = EINVAL;
 626                 goto out;
 627         }
 628
 629         /*
 630          * Must be the root of the filesystem
 631          */
 632         if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
 633                 error = EINVAL;
 634                 goto out;
 635         }
 636
 637         /*
 638          * If no error try to issue the unmount.  We lose our cache
 639          * ref when we call nlookup_done so we must hold the mount point
 640          * to prevent use-after-free races.
 641          */
 642 out:
 643         if (error == 0) {
 644                 mount_hold(mp);
 645                 nlookup_done(&nd);
 646                 error = dounmount(mp, uap->flags);
 647                 mount_drop(mp);
 648         } else {
 649                 nlookup_done(&nd);
 650         }
 651 done:
 652         return (error);
 653 }
 654
 655 /*
 656  * Do the actual file system unmount (interlocked against the mountlist
 657  * token and mp->mnt_token).
 658  */
 659 static int
 660 dounmount_interlock(struct mount *mp)
 661 {
 662         if (mp->mnt_kern_flag & MNTK_UNMOUNT)
 663                 return (EBUSY);
 664         mp->mnt_kern_flag |= MNTK_UNMOUNT;
 665         return(0);
 666 }
 667
 668 static int
 669 unmount_allproc_cb(struct proc *p, void *arg)
 670 {
 671         struct mount *mp;
 672
 673         if (p->p_textnch.ncp == NULL)
 674                 return 0;
 675
 676         mp = (struct mount *)arg;
 677         if (p->p_textnch.mount == mp)
 678                 cache_drop(&p->p_textnch);
 679
 680         return 0;
 681 }
 682
 683 /*
 684  * The guts of the unmount code.  The mount owns one ref and one hold
 685  * count.  If we successfully interlock the unmount, those refs are ours.
 686  * (The ref is from mnt_ncmountpt).
 687  */
 688 int
 689 dounmount(struct mount *mp, int flags)
 690 {
 691         struct namecache *ncp;
 692         struct nchandle nch;
 693         struct vnode *vp;
 694         int error;
 695         int async_flag;
 696         int lflags;
 697         int freeok = 1;
 698         int retry;
 699
 700         lwkt_gettoken(&mp->mnt_token);
 701
 702         /*
 703          * Exclusive access for unmounting purposes.
 704          */
 705         if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
 706                 goto out;
 707
 708         /*
 709          * We now 'own' the last mp->mnt_refs
 710          *
 711          * Allow filesystems to detect that a forced unmount is in progress.
 712          */
 713         if (flags & MNT_FORCE)
 714                 mp->mnt_kern_flag |= MNTK_UNMOUNTF;
 715         lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_TIMELOCK);
 716         error = lockmgr(&mp->mnt_lock, lflags);
 717         if (error) {
 718                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 719                 if (mp->mnt_kern_flag & MNTK_MWAIT) {
 720                         mp->mnt_kern_flag &= ~MNTK_MWAIT;
 721                         wakeup(mp);
 722                 }
 723                 goto out;
 724         }
 725
 726         if (mp->mnt_flag & MNT_EXPUBLIC)
 727                 vfs_setpublicfs(NULL, NULL, NULL);
 728
 729         vfs_msync(mp, MNT_WAIT);
 730         async_flag = mp->mnt_flag & MNT_ASYNC;
 731         mp->mnt_flag &=~ MNT_ASYNC;
 732
 733         /*
 734          * If this filesystem isn't aliasing other filesystems,
 735          * try to invalidate any remaining namecache entries and
 736          * check the count afterwords.
 737          *
 738          * We own the last mnt_refs by owning mnt_ncmountpt.
 739          */
 740         if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
 741                 cache_lock(&mp->mnt_ncmountpt);
 742                 cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
 743                 cache_unlock(&mp->mnt_ncmountpt);
 744
 745                 cache_clearmntcache();
 746                 if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
 747                     (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
 748                         allproc_scan(&unmount_allproc_cb, mp, 0);
 749                 }
 750
 751                 cache_clearmntcache();
 752                 if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
 753                     (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
 754
 755                         if ((flags & MNT_FORCE) == 0) {
 756                                 error = EBUSY;
 757                                 mount_warning(mp, "Cannot unmount: "
 758                                                   "%d namecache "
 759                                                   "references still "
 760                                                   "present",
 761                                                   ncp->nc_refs - 1);
 762                         } else {
 763                                 mount_warning(mp, "Forced unmount: "
 764                                                   "%d namecache "
 765                                                   "references still "
 766                                                   "present",
 767                                                   ncp->nc_refs - 1);
 768                                 freeok = 0;
 769                         }
 770                 }
 771         }
 772
 773         /*
 774          * Decomission our special mnt_syncer vnode.  This also stops
 775          * the vnlru code.  If we are unable to unmount we recommission
 776          * the vnode.
 777          *
 778          * Then sync the filesystem.
 779          */
 780         if ((vp = mp->mnt_syncer) != NULL) {
 781                 mp->mnt_syncer = NULL;
 782                 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
 783                 vrele(vp);
 784         }
 785         if ((mp->mnt_flag & MNT_RDONLY) == 0)
 786                 VFS_SYNC(mp, MNT_WAIT);
 787
 788         /*
 789          * nchandle records ref the mount structure.  Expect a count of 1
 790          * (our mount->mnt_ncmountpt).
 791          *
 792          * Scans can get temporary refs on a mountpoint (thought really
 793          * heavy duty stuff like cache_findmount() do not).
 794          */
 795         if (mp->mnt_refs != 1)
 796                 cache_clearmntcache();
 797         for (retry = 0; retry < 10 && mp->mnt_refs != 1; ++retry) {
 798                 cache_unmounting(mp);
 799                 tsleep(&mp->mnt_refs, 0, "mntbsy", hz / 10 + 1);
 800                 cache_clearmntcache();
 801         }
 802         if (mp->mnt_refs != 1) {
 803                 if ((flags & MNT_FORCE) == 0) {
 804                         mount_warning(mp, "Cannot unmount: "
 805                                           "%d mount refs still present",
 806                                           mp->mnt_refs - 1);
 807                         error = EBUSY;
 808                 } else {
 809                         mount_warning(mp, "Forced unmount: "
 810                                           "%d mount refs still present",
 811                                           mp->mnt_refs - 1);
 812                         freeok = 0;
 813                 }
 814         }
 815
 816         /*
 817          * So far so good, sync the filesystem once more and
 818          * call the VFS unmount code if the sync succeeds.
 819          */
 820         if (error == 0) {
 821                 if (mp->mnt_flag & MNT_RDONLY) {
 822                         error = VFS_UNMOUNT(mp, flags);
 823                 } else {
 824                         error = VFS_SYNC(mp, MNT_WAIT);
 825                         if ((error == 0) ||
 826                             (error == EOPNOTSUPP) || /* No sync */
 827                             (flags & MNT_FORCE)) {
 828                                 error = VFS_UNMOUNT(mp, flags);
 829                         }
 830                 }
 831         }
 832
 833         /*
 834          * If an error occurred we can still recover, restoring the
 835          * syncer vnode and misc flags.
 836          */
 837         if (error) {
 838                 if (mp->mnt_syncer == NULL)
 839                         vfs_allocate_syncvnode(mp);
 840                 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 841                 mp->mnt_flag |= async_flag;
 842                 lockmgr(&mp->mnt_lock, LK_RELEASE);
 843                 if (mp->mnt_kern_flag & MNTK_MWAIT) {
 844                         mp->mnt_kern_flag &= ~MNTK_MWAIT;
 845                         wakeup(mp);
 846                 }
 847                 goto out;
 848         }
 849         /*
 850          * Clean up any journals still associated with the mount after
 851          * filesystem activity has ceased.
 852          */
 853         journal_remove_all_journals(mp,
 854             ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
 855
 856         mountlist_remove(mp);
 857
 858         /*
 859          * Remove any installed vnode ops here so the individual VFSs don't
 860          * have to.
 861          *
 862          * mnt_refs should go to zero when we scrap mnt_ncmountpt.
 863          */
 864         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
 865         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
 866         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
 867         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
 868         vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
 869
 870         if (mp->mnt_ncmountpt.ncp != NULL) {
 871                 nch = mp->mnt_ncmountpt;
 872                 cache_zero(&mp->mnt_ncmountpt);
 873                 cache_clrmountpt(&nch);
 874                 cache_drop(&nch);
 875         }
 876         if (mp->mnt_ncmounton.ncp != NULL) {
 877                 cache_unmounting(mp);
 878                 nch = mp->mnt_ncmounton;
 879                 cache_zero(&mp->mnt_ncmounton);
 880                 cache_clrmountpt(&nch);
 881                 cache_drop(&nch);
 882         }
 883
 884         mp->mnt_vfc->vfc_refcount--;
 885         if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
 886                 panic("unmount: dangling vnode");
 887
 888         /*
 889          * Release the lock
 890          */
 891         lockmgr(&mp->mnt_lock, LK_RELEASE);
 892         if (mp->mnt_kern_flag & MNTK_MWAIT) {
 893                 mp->mnt_kern_flag &= ~MNTK_MWAIT;
 894                 wakeup(mp);
 895         }
 896
 897         /*
 898          * If we reach here and freeok != 0 we must free the mount.
 899          * mnt_refs should already have dropped to 0, so if it is not
 900          * zero we must cycle the caches and wait.
 901          *
 902          * When we are satisfied that the mount has disconnected we can
 903          * drop the hold on the mp that represented the mount (though the
 904          * caller might actually have another, so the caller's drop may
 905          * do the actual free).
 906          */
 907         if (freeok) {
 908                 if (mp->mnt_refs > 0)
 909                         cache_clearmntcache();
 910                 while (mp->mnt_refs > 0) {
 911                         cache_unmounting(mp);
 912                         wakeup(mp);
 913                         tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1);
 914                         cache_clearmntcache();
 915                 }
 916                 lwkt_reltoken(&mp->mnt_token);
 917                 mount_drop(mp);
 918                 mp = NULL;
 919         }
 920         error = 0;
 921         KNOTE(&fs_klist, VQ_UNMOUNT);
 922 out:
 923         if (mp)
 924                 lwkt_reltoken(&mp->mnt_token);
 925         return (error);
 926 }
 927
 928 static
 929 void
 930 mount_warning(struct mount *mp, const char *ctl, ...)
 931 {
 932         char *ptr;
 933         char *buf;
 934         __va_list va;
 935
 936         __va_start(va, ctl);
 937         if (cache_fullpath(NULL, &mp->mnt_ncmounton, NULL,
 938                            &ptr, &buf, 0) == 0) {
 939                 kprintf("unmount(%s): ", ptr);
 940                 kvprintf(ctl, va);
 941                 kprintf("\n");
 942                 kfree(buf, M_TEMP);
 943         } else {
 944                 kprintf("unmount(%p", mp);
 945                 if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
 946                         kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
 947                 kprintf("): ");
 948                 kvprintf(ctl, va);
 949                 kprintf("\n");
 950         }
 951         __va_end(va);
 952 }
 953
 954 /*
 955  * Shim cache_fullpath() to handle the case where a process is chrooted into
 956  * a subdirectory of a mount.  In this case if the root mount matches the
 957  * process root directory's mount we have to specify the process's root
 958  * directory instead of the mount point, because the mount point might
 959  * be above the root directory.
 960  */
 961 static
 962 int
 963 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
 964 {
 965         struct nchandle *nch;
 966
 967         if (p && p->p_fd->fd_nrdir.mount == mp)
 968                 nch = &p->p_fd->fd_nrdir;
 969         else
 970                 nch = &mp->mnt_ncmountpt;
 971         return(cache_fullpath(p, nch, NULL, rb, fb, 0));
 972 }
 973
 974 /*
 975  * Sync each mounted filesystem.
 976  */
 977
 978 #ifdef DEBUG
 979 static int syncprt = 0;
 980 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
 981 #endif /* DEBUG */
 982
 983 static int sync_callback(struct mount *mp, void *data);
 984
 985 int
 986 sys_sync(struct sync_args *uap)
 987 {
 988         mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
 989         return (0);
 990 }
 991
 992 static
 993 int
 994 sync_callback(struct mount *mp, void *data __unused)
 995 {
 996         int asyncflag;
 997
 998         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 999                 asyncflag = mp->mnt_flag & MNT_ASYNC;
1000                 mp->mnt_flag &= ~MNT_ASYNC;
1001                 vfs_msync(mp, MNT_NOWAIT);
1002                 VFS_SYNC(mp, MNT_NOWAIT);
1003                 mp->mnt_flag |= asyncflag;
1004         }
1005         return(0);
1006 }
1007
/*
 * Gate consulted by sys_quotactl(): while this is zero, callers whose
 * credential has cr_prison set are refused with EPERM.  The sysctl that
 * would make it tunable is compiled out below.
 *
 * XXX PRISON: could be per prison flag
 */
static int prison_quotas;
#if 0
SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
#endif
1013
1014 /*
1015  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
1016  *
1017  * Change filesystem quotas.
1018  *
1019  * MPALMOSTSAFE
1020  */
1021 int
1022 sys_quotactl(struct quotactl_args *uap)
1023 {
1024         struct nlookupdata nd;
1025         struct thread *td;
1026         struct mount *mp;
1027         int error;
1028
1029         td = curthread;
1030         if (td->td_ucred->cr_prison && !prison_quotas) {
1031                 error = EPERM;
1032                 goto done;
1033         }
1034
1035         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1036         if (error == 0)
1037                 error = nlookup(&nd);
1038         if (error == 0) {
1039                 mp = nd.nl_nch.mount;
1040                 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
1041                                     uap->arg, nd.nl_cred);
1042         }
1043         nlookup_done(&nd);
1044 done:
1045         return (error);
1046 }
1047
1048 /*
1049  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
1050  *              void *buf, int buflen)
1051  *
1052  * This function operates on a mount point and executes the specified
1053  * operation using the specified control data, and possibly returns data.
1054  *
1055  * The actual number of bytes stored in the result buffer is returned, 0
1056  * if none, otherwise an error is returned.
1057  *
1058  * MPALMOSTSAFE
1059  */
1060 int
1061 sys_mountctl(struct mountctl_args *uap)
1062 {
1063         struct thread *td = curthread;
1064         struct proc *p = td->td_proc;
1065         struct file *fp;
1066         void *ctl = NULL;
1067         void *buf = NULL;
1068         char *path = NULL;
1069         int error;
1070
1071         /*
1072          * Sanity and permissions checks.  We must be root.
1073          */
1074         KKASSERT(p);
1075         if (td->td_ucred->cr_prison != NULL)
1076                 return (EPERM);
1077         if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
1078             (error = priv_check(td, PRIV_ROOT)) != 0)
1079                 return (error);
1080
1081         /*
1082          * Argument length checks
1083          */
1084         if (uap->ctllen < 0 || uap->ctllen > 1024)
1085                 return (EINVAL);
1086         if (uap->buflen < 0 || uap->buflen > 16 * 1024)
1087                 return (EINVAL);
1088         if (uap->path == NULL)
1089                 return (EINVAL);
1090
1091         /*
1092          * Allocate the necessary buffers and copyin data
1093          */
1094         path = objcache_get(namei_oc, M_WAITOK);
1095         error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
1096         if (error)
1097                 goto done;
1098
1099         if (uap->ctllen) {
1100                 ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
1101                 error = copyin(uap->ctl, ctl, uap->ctllen);
1102                 if (error)
1103                         goto done;
1104         }
1105         if (uap->buflen)
1106                 buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
1107
1108         /*
1109          * Validate the descriptor
1110          */
1111         if (uap->fd >= 0) {
1112                 fp = holdfp(p->p_fd, uap->fd, -1);
1113                 if (fp == NULL) {
1114                         error = EBADF;
1115                         goto done;
1116                 }
1117         } else {
1118                 fp = NULL;
1119         }
1120
1121         /*
1122          * Execute the internal kernel function and clean up.
1123          */
1124         error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
1125         if (fp)
1126                 fdrop(fp);
1127         if (error == 0 && uap->sysmsg_result > 0)
1128                 error = copyout(buf, uap->buf, uap->sysmsg_result);
1129 done:
1130         if (path)
1131                 objcache_put(namei_oc, path);
1132         if (ctl)
1133                 kfree(ctl, M_TEMP);
1134         if (buf)
1135                 kfree(buf, M_TEMP);
1136         return (error);
1137 }
1138
1139 /*
1140  * Execute a mount control operation by resolving the path to a mount point
1141  * and calling vop_mountctl().
1142  *
1143  * Use the mount point from the nch instead of the vnode so nullfs mounts
1144  * can properly spike the VOP.
1145  */
1146 int
1147 kern_mountctl(const char *path, int op, struct file *fp,
1148                 const void *ctl, int ctllen,
1149                 void *buf, int buflen, int *res)
1150 {
1151         struct vnode *vp;
1152         struct nlookupdata nd;
1153         struct nchandle nch;
1154         struct mount *mp;
1155         int error;
1156
1157         *res = 0;
1158         vp = NULL;
1159         error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
1160         if (error)
1161                 return (error);
1162         error = nlookup(&nd);
1163         if (error) {
1164                 nlookup_done(&nd);
1165                 return (error);
1166         }
1167         error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
1168         if (error) {
1169                 nlookup_done(&nd);
1170                 return (error);
1171         }
1172
1173         /*
1174          * Yes, all this is needed to use the nch.mount below, because
1175          * we must maintain a ref on the mount to avoid ripouts (e.g.
1176          * due to heavy mount/unmount use by synth or poudriere).
1177          */
1178         nch = nd.nl_nch;
1179         cache_zero(&nd.nl_nch);
1180         cache_unlock(&nch);
1181         nlookup_done(&nd);
1182         vn_unlock(vp);
1183
1184         mp = nch.mount;
1185
1186         /*
1187          * Must be the root of the filesystem
1188          */
1189         if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
1190                 cache_drop(&nch);
1191                 vrele(vp);
1192                 return (EINVAL);
1193         }
1194         if (mp == NULL || mp->mnt_kern_flag & MNTK_UNMOUNT) {
1195                 kprintf("kern_mountctl: Warning, \"%s\" racing unmount\n",
1196                         path);
1197                 cache_drop(&nch);
1198                 vrele(vp);
1199                 return (EINVAL);
1200         }
1201         error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
1202                              buf, buflen, res);
1203         vrele(vp);
1204         cache_drop(&nch);
1205
1206         return (error);
1207 }
1208
1209 int
1210 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
1211 {
1212         struct thread *td = curthread;
1213         struct proc *p = td->td_proc;
1214         struct mount *mp;
1215         struct statfs *sp;
1216         char *fullpath, *freepath;
1217         int error;
1218
1219         if ((error = nlookup(nd)) != 0)
1220                 return (error);
1221         mp = nd->nl_nch.mount;
1222         sp = &mp->mnt_stat;
1223         if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
1224                 return (error);
1225
1226         error = mount_path(p, mp, &fullpath, &freepath);
1227         if (error)
1228                 return(error);
1229         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1230         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1231         kfree(freepath, M_TEMP);
1232
1233         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1234         bcopy(sp, buf, sizeof(*buf));
1235         /* Only root should have access to the fsid's. */
1236         if (priv_check(td, PRIV_ROOT))
1237                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1238         return (0);
1239 }
1240
1241 /*
1242  * statfs_args(char *path, struct statfs *buf)
1243  *
1244  * Get filesystem statistics.
1245  */
1246 int
1247 sys_statfs(struct statfs_args *uap)
1248 {
1249         struct nlookupdata nd;
1250         struct statfs buf;
1251         int error;
1252
1253         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1254         if (error == 0)
1255                 error = kern_statfs(&nd, &buf);
1256         nlookup_done(&nd);
1257         if (error == 0)
1258                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1259         return (error);
1260 }
1261
1262 int
1263 kern_fstatfs(int fd, struct statfs *buf)
1264 {
1265         struct thread *td = curthread;
1266         struct proc *p = td->td_proc;
1267         struct file *fp;
1268         struct mount *mp;
1269         struct statfs *sp;
1270         char *fullpath, *freepath;
1271         int error;
1272
1273         KKASSERT(p);
1274         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1275                 return (error);
1276
1277         /*
1278          * Try to use mount info from any overlays rather than the
1279          * mount info for the underlying vnode, otherwise we will
1280          * fail when operating on null-mounted paths inside a chroot.
1281          */
1282         if ((mp = fp->f_nchandle.mount) == NULL)
1283                 mp = ((struct vnode *)fp->f_data)->v_mount;
1284         if (mp == NULL) {
1285                 error = EBADF;
1286                 goto done;
1287         }
1288         if (fp->f_cred == NULL) {
1289                 error = EINVAL;
1290                 goto done;
1291         }
1292         sp = &mp->mnt_stat;
1293         if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
1294                 goto done;
1295
1296         if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1297                 goto done;
1298         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1299         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1300         kfree(freepath, M_TEMP);
1301
1302         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1303         bcopy(sp, buf, sizeof(*buf));
1304
1305         /* Only root should have access to the fsid's. */
1306         if (priv_check(td, PRIV_ROOT))
1307                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1308         error = 0;
1309 done:
1310         fdrop(fp);
1311         return (error);
1312 }
1313
1314 /*
1315  * fstatfs_args(int fd, struct statfs *buf)
1316  *
1317  * Get filesystem statistics.
1318  */
1319 int
1320 sys_fstatfs(struct fstatfs_args *uap)
1321 {
1322         struct statfs buf;
1323         int error;
1324
1325         error = kern_fstatfs(uap->fd, &buf);
1326
1327         if (error == 0)
1328                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1329         return (error);
1330 }
1331
1332 int
1333 kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
1334 {
1335         struct mount *mp;
1336         struct statvfs *sp;
1337         int error;
1338
1339         if ((error = nlookup(nd)) != 0)
1340                 return (error);
1341         mp = nd->nl_nch.mount;
1342         sp = &mp->mnt_vstat;
1343         if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
1344                 return (error);
1345
1346         sp->f_flag = 0;
1347         if (mp->mnt_flag & MNT_RDONLY)
1348                 sp->f_flag |= ST_RDONLY;
1349         if (mp->mnt_flag & MNT_NOSUID)
1350                 sp->f_flag |= ST_NOSUID;
1351         bcopy(sp, buf, sizeof(*buf));
1352         return (0);
1353 }
1354
1355 /*
1356  * statfs_args(char *path, struct statfs *buf)
1357  *
1358  * Get filesystem statistics.
1359  */
1360 int
1361 sys_statvfs(struct statvfs_args *uap)
1362 {
1363         struct nlookupdata nd;
1364         struct statvfs buf;
1365         int error;
1366
1367         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1368         if (error == 0)
1369                 error = kern_statvfs(&nd, &buf);
1370         nlookup_done(&nd);
1371         if (error == 0)
1372                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1373         return (error);
1374 }
1375
1376 int
1377 kern_fstatvfs(int fd, struct statvfs *buf)
1378 {
1379         struct thread *td = curthread;
1380         struct proc *p = td->td_proc;
1381         struct file *fp;
1382         struct mount *mp;
1383         struct statvfs *sp;
1384         int error;
1385
1386         KKASSERT(p);
1387         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1388                 return (error);
1389         if ((mp = fp->f_nchandle.mount) == NULL)
1390                 mp = ((struct vnode *)fp->f_data)->v_mount;
1391         if (mp == NULL) {
1392                 error = EBADF;
1393                 goto done;
1394         }
1395         if (fp->f_cred == NULL) {
1396                 error = EINVAL;
1397                 goto done;
1398         }
1399         sp = &mp->mnt_vstat;
1400         if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
1401                 goto done;
1402
1403         sp->f_flag = 0;
1404         if (mp->mnt_flag & MNT_RDONLY)
1405                 sp->f_flag |= ST_RDONLY;
1406         if (mp->mnt_flag & MNT_NOSUID)
1407                 sp->f_flag |= ST_NOSUID;
1408
1409         bcopy(sp, buf, sizeof(*buf));
1410         error = 0;
1411 done:
1412         fdrop(fp);
1413         return (error);
1414 }
1415
1416 /*
1417  * fstatfs_args(int fd, struct statfs *buf)
1418  *
1419  * Get filesystem statistics.
1420  */
1421 int
1422 sys_fstatvfs(struct fstatvfs_args *uap)
1423 {
1424         struct statvfs buf;
1425         int error;
1426
1427         error = kern_fstatvfs(uap->fd, &buf);
1428
1429         if (error == 0)
1430                 error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1431         return (error);
1432 }
1433
/*
 * getfsstat_args(struct statfs *buf, long bufsize, int flags)
 *
 * Get statistics on all filesystems.
 *
 * The structure below carries the scan state shared between
 * sys_getfsstat() and getfsstat_callback().
 */

struct getfsstat_info {
	struct statfs *sfsp;	/* next user slot to fill; NULL = count only */
	long count;		/* mounts tallied so far */
	long maxcount;		/* bufsize / sizeof(struct statfs) */
	int error;		/* error recorded by the callback, else 0 */
	int flags;		/* flags argument of the system call */
	struct thread *td;	/* calling thread */
};
1448
1449 static int getfsstat_callback(struct mount *, void *);
1450
1451 int
1452 sys_getfsstat(struct getfsstat_args *uap)
1453 {
1454         struct thread *td = curthread;
1455         struct getfsstat_info info;
1456
1457         bzero(&info, sizeof(info));
1458
1459         info.maxcount = uap->bufsize / sizeof(struct statfs);
1460         info.sfsp = uap->buf;
1461         info.count = 0;
1462         info.flags = uap->flags;
1463         info.td = td;
1464
1465         mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1466         if (info.sfsp && info.count > info.maxcount)
1467                 uap->sysmsg_result = info.maxcount;
1468         else
1469                 uap->sysmsg_result = info.count;
1470         return (info.error);
1471 }
1472
1473 static int
1474 getfsstat_callback(struct mount *mp, void *data)
1475 {
1476         struct getfsstat_info *info = data;
1477         struct statfs *sp;
1478         char *freepath;
1479         char *fullpath;
1480         int error;
1481
1482         if (info->sfsp && info->count < info->maxcount) {
1483                 if (info->td->td_proc &&
1484                     !chroot_visible_mnt(mp, info->td->td_proc)) {
1485                         return(0);
1486                 }
1487                 sp = &mp->mnt_stat;
1488
1489                 /*
1490                  * If MNT_NOWAIT or MNT_LAZY is specified, do not
1491                  * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
1492                  * overrides MNT_WAIT.
1493                  */
1494                 if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1495                     (info->flags & MNT_WAIT)) &&
1496                     (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1497                         return(0);
1498                 }
1499                 sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1500
1501                 error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1502                 if (error) {
1503                         info->error = error;
1504                         return(-1);
1505                 }
1506                 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1507                 strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1508                 kfree(freepath, M_TEMP);
1509
1510                 error = copyout(sp, info->sfsp, sizeof(*sp));
1511                 if (error) {
1512                         info->error = error;
1513                         return (-1);
1514                 }
1515                 ++info->sfsp;
1516         }
1517         info->count++;
1518         return(0);
1519 }
1520
/*
 * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
 *		   long vbufsize, int flags)
 *
 * Get statistics on all filesystems.
 *
 * The structure below carries the scan state shared between
 * sys_getvfsstat() and getvfsstat_callback().
 */

struct getvfsstat_info {
	struct statfs *sfsp;	/* next user statfs slot to fill */
	struct statvfs *vsfsp;	/* next user statvfs slot; NULL = count only */
	long count;		/* mounts tallied so far */
	long maxcount;		/* vbufsize / sizeof(struct statvfs) */
	int error;		/* error recorded by the callback, else 0 */
	int flags;		/* flags argument of the system call */
	struct thread *td;	/* calling thread */
};
1537
1538 static int getvfsstat_callback(struct mount *, void *);
1539
1540 int
1541 sys_getvfsstat(struct getvfsstat_args *uap)
1542 {
1543         struct thread *td = curthread;
1544         struct getvfsstat_info info;
1545
1546         bzero(&info, sizeof(info));
1547
1548         info.maxcount = uap->vbufsize / sizeof(struct statvfs);
1549         info.sfsp = uap->buf;
1550         info.vsfsp = uap->vbuf;
1551         info.count = 0;
1552         info.flags = uap->flags;
1553         info.td = td;
1554
1555         mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
1556         if (info.vsfsp && info.count > info.maxcount)
1557                 uap->sysmsg_result = info.maxcount;
1558         else
1559                 uap->sysmsg_result = info.count;
1560         return (info.error);
1561 }
1562
1563 static int
1564 getvfsstat_callback(struct mount *mp, void *data)
1565 {
1566         struct getvfsstat_info *info = data;
1567         struct statfs *sp;
1568         struct statvfs *vsp;
1569         char *freepath;
1570         char *fullpath;
1571         int error;
1572
1573         if (info->vsfsp && info->count < info->maxcount) {
1574                 if (info->td->td_proc &&
1575                     !chroot_visible_mnt(mp, info->td->td_proc)) {
1576                         return(0);
1577                 }
1578                 sp = &mp->mnt_stat;
1579                 vsp = &mp->mnt_vstat;
1580
1581                 /*
1582                  * If MNT_NOWAIT or MNT_LAZY is specified, do not
1583                  * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
1584                  * overrides MNT_WAIT.
1585                  */
1586                 if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1587                     (info->flags & MNT_WAIT)) &&
1588                     (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1589                         return(0);
1590                 }
1591                 sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1592
1593                 if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1594                     (info->flags & MNT_WAIT)) &&
1595                     (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
1596                         return(0);
1597                 }
1598                 vsp->f_flag = 0;
1599                 if (mp->mnt_flag & MNT_RDONLY)
1600                         vsp->f_flag |= ST_RDONLY;
1601                 if (mp->mnt_flag & MNT_NOSUID)
1602                         vsp->f_flag |= ST_NOSUID;
1603
1604                 error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1605                 if (error) {
1606                         info->error = error;
1607                         return(-1);
1608                 }
1609                 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1610                 strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1611                 kfree(freepath, M_TEMP);
1612
1613                 error = copyout(sp, info->sfsp, sizeof(*sp));
1614                 if (error == 0)
1615                         error = copyout(vsp, info->vsfsp, sizeof(*vsp));
1616                 if (error) {
1617                         info->error = error;
1618                         return (-1);
1619                 }
1620                 ++info->sfsp;
1621                 ++info->vsfsp;
1622         }
1623         info->count++;
1624         return(0);
1625 }
1626
1627
1628 /*
1629  * fchdir_args(int fd)
1630  *
1631  * Change current working directory to a given file descriptor.
1632  */
1633 int
1634 sys_fchdir(struct fchdir_args *uap)
1635 {
1636         struct thread *td = curthread;
1637         struct proc *p = td->td_proc;
1638         struct filedesc *fdp = p->p_fd;
1639         struct vnode *vp, *ovp;
1640         struct mount *mp;
1641         struct file *fp;
1642         struct nchandle nch, onch, tnch;
1643         int error;
1644
1645         if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
1646                 return (error);
1647         lwkt_gettoken(&p->p_token);
1648         vp = (struct vnode *)fp->f_data;
1649         vref(vp);
1650         vn_lock(vp, LK_SHARED | LK_RETRY);
1651         if (fp->f_nchandle.ncp == NULL)
1652                 error = ENOTDIR;
1653         else
1654                 error = checkvp_chdir(vp, td);
1655         if (error) {
1656                 vput(vp);
1657                 goto done;
1658         }
1659         cache_copy(&fp->f_nchandle, &nch);
1660
1661         /*
1662          * If the ncp has become a mount point, traverse through
1663          * the mount point.
1664          */
1665
1666         while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1667                (mp = cache_findmount(&nch)) != NULL
1668         ) {
1669                 error = nlookup_mp(mp, &tnch);
1670                 if (error == 0) {
1671                         cache_unlock(&tnch);    /* leave ref intact */
1672                         vput(vp);
1673                         vp = tnch.ncp->nc_vp;
1674                         error = vget(vp, LK_SHARED);
1675                         KKASSERT(error == 0);
1676                         cache_drop(&nch);
1677                         nch = tnch;
1678                 }
1679                 cache_dropmount(mp);
1680         }
1681         if (error == 0) {
1682                 spin_lock(&fdp->fd_spin);
1683                 ovp = fdp->fd_cdir;
1684                 onch = fdp->fd_ncdir;
1685                 fdp->fd_cdir = vp;
1686                 fdp->fd_ncdir = nch;
1687                 spin_unlock(&fdp->fd_spin);
1688                 vn_unlock(vp);          /* leave ref intact */
1689                 cache_drop(&onch);
1690                 vrele(ovp);
1691         } else {
1692                 cache_drop(&nch);
1693                 vput(vp);
1694         }
1695         fdrop(fp);
1696 done:
1697         lwkt_reltoken(&p->p_token);
1698         return (error);
1699 }
1700
1701 int
1702 kern_chdir(struct nlookupdata *nd)
1703 {
1704         struct thread *td = curthread;
1705         struct proc *p = td->td_proc;
1706         struct filedesc *fdp = p->p_fd;
1707         struct vnode *vp, *ovp;
1708         struct nchandle onch;
1709         int error;
1710
1711         nd->nl_flags |= NLC_SHAREDLOCK;
1712         if ((error = nlookup(nd)) != 0)
1713                 return (error);
1714         if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
1715                 return (ENOENT);
1716         if ((error = vget(vp, LK_SHARED)) != 0)
1717                 return (error);
1718
1719         lwkt_gettoken(&p->p_token);
1720         error = checkvp_chdir(vp, td);
1721         vn_unlock(vp);
1722         if (error == 0) {
1723                 spin_lock(&fdp->fd_spin);
1724                 ovp = fdp->fd_cdir;
1725                 onch = fdp->fd_ncdir;
1726                 fdp->fd_ncdir = nd->nl_nch;
1727                 fdp->fd_cdir = vp;
1728                 spin_unlock(&fdp->fd_spin);
1729                 cache_unlock(&nd->nl_nch);      /* leave reference intact */
1730                 cache_drop(&onch);
1731                 vrele(ovp);
1732                 cache_zero(&nd->nl_nch);
1733         } else {
1734                 vrele(vp);
1735         }
1736         lwkt_reltoken(&p->p_token);
1737         return (error);
1738 }
1739
1740 /*
1741  * chdir_args(char *path)
1742  *
1743  * Change current working directory (``.'').
1744  */
1745 int
1746 sys_chdir(struct chdir_args *uap)
1747 {
1748         struct nlookupdata nd;
1749         int error;
1750
1751         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1752         if (error == 0)
1753                 error = kern_chdir(&nd);
1754         nlookup_done(&nd);
1755         return (error);
1756 }
1757
1758 /*
1759  * Helper function for raised chroot(2) security function:  Refuse if
1760  * any filedescriptors are open directories.
1761  */
1762 static int
1763 chroot_refuse_vdir_fds(struct filedesc *fdp)
1764 {
1765         struct vnode *vp;
1766         struct file *fp;
1767         int error;
1768         int fd;
1769
1770         for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1771                 if ((error = holdvnode(fdp, fd, &fp)) != 0)
1772                         continue;
1773                 vp = (struct vnode *)fp->f_data;
1774                 if (vp->v_type != VDIR) {
1775                         fdrop(fp);
1776                         continue;
1777                 }
1778                 fdrop(fp);
1779                 return(EPERM);
1780         }
1781         return (0);
1782 }
1783
1784 /*
1785  * This sysctl determines if we will allow a process to chroot(2) if it
1786  * has a directory open:
1787  *      0: disallowed for all processes.
1788  *      1: allowed for processes that were not already chroot(2)'ed.
1789  *      2: allowed for all processes.
1790  */
1791
1792 static int chroot_allow_open_directories = 1;
1793
1794 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1795      &chroot_allow_open_directories, 0, "");
1796
1797 /*
1798  * chroot to the specified namecache entry.  We obtain the vp from the
1799  * namecache data.  The passed ncp must be locked and referenced and will
1800  * remain locked and referenced on return.
1801  */
1802 int
1803 kern_chroot(struct nchandle *nch)
1804 {
1805         struct thread *td = curthread;
1806         struct proc *p = td->td_proc;
1807         struct filedesc *fdp = p->p_fd;
1808         struct vnode *vp;
1809         int error;
1810
1811         /*
1812          * Only privileged user can chroot
1813          */
1814         error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
1815         if (error)
1816                 return (error);
1817
1818         /*
1819          * Disallow open directory descriptors (fchdir() breakouts).
1820          */
1821         if (chroot_allow_open_directories == 0 ||
1822            (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
1823                 if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
1824                         return (error);
1825         }
1826         if ((vp = nch->ncp->nc_vp) == NULL)
1827                 return (ENOENT);
1828
1829         if ((error = vget(vp, LK_SHARED)) != 0)
1830                 return (error);
1831
1832         /*
1833          * Check the validity of vp as a directory to change to and
1834          * associate it with rdir/jdir.
1835          */
1836         error = checkvp_chdir(vp, td);
1837         vn_unlock(vp);                  /* leave reference intact */
1838         if (error == 0) {
1839                 lwkt_gettoken(&p->p_token);
1840                 vrele(fdp->fd_rdir);
1841                 fdp->fd_rdir = vp;      /* reference inherited by fd_rdir */
1842                 cache_drop(&fdp->fd_nrdir);
1843                 cache_copy(nch, &fdp->fd_nrdir);
1844                 if (fdp->fd_jdir == NULL) {
1845                         fdp->fd_jdir = vp;
1846                         vref(fdp->fd_jdir);
1847                         cache_copy(nch, &fdp->fd_njdir);
1848                 }
1849                 if ((p->p_flags & P_DIDCHROOT) == 0) {
1850                         p->p_flags |= P_DIDCHROOT;
1851                         if (p->p_depth <= 65535 - 32)
1852                                 p->p_depth += 32;
1853                 }
1854                 lwkt_reltoken(&p->p_token);
1855         } else {
1856                 vrele(vp);
1857         }
1858         return (error);
1859 }
1860
1861 /*
1862  * chroot_args(char *path)
1863  *
1864  * Change notion of root (``/'') directory.
1865  */
1866 int
1867 sys_chroot(struct chroot_args *uap)
1868 {
1869         struct thread *td __debugvar = curthread;
1870         struct nlookupdata nd;
1871         int error;
1872
1873         KKASSERT(td->td_proc);
1874         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1875         if (error == 0) {
1876                 nd.nl_flags |= NLC_EXEC;
1877                 error = nlookup(&nd);
1878                 if (error == 0)
1879                         error = kern_chroot(&nd.nl_nch);
1880         }
1881         nlookup_done(&nd);
1882         return(error);
1883 }
1884
1885 int
1886 sys_chroot_kernel(struct chroot_kernel_args *uap)
1887 {
1888         struct thread *td = curthread;
1889         struct nlookupdata nd;
1890         struct nchandle *nch;
1891         struct vnode *vp;
1892         int error;
1893
1894         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1895         if (error)
1896                 goto error_nond;
1897
1898         error = nlookup(&nd);
1899         if (error)
1900                 goto error_out;
1901
1902         nch = &nd.nl_nch;
1903
1904         error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
1905         if (error)
1906                 goto error_out;
1907
1908         if ((vp = nch->ncp->nc_vp) == NULL) {
1909                 error = ENOENT;
1910                 goto error_out;
1911         }
1912
1913         if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
1914                 goto error_out;
1915
1916         kprintf("chroot_kernel: set new rootnch/rootvnode to %s\n", uap->path);
1917         vfs_cache_setroot(vp, cache_hold(nch));
1918
1919 error_out:
1920         nlookup_done(&nd);
1921 error_nond:
1922         return(error);
1923 }
1924
1925 /*
1926  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1927  * determine whether it is legal to chdir to the vnode.  The vnode's state
1928  * is not changed by this call.
1929  */
1930 static int
1931 checkvp_chdir(struct vnode *vp, struct thread *td)
1932 {
1933         int error;
1934
1935         if (vp->v_type != VDIR)
1936                 error = ENOTDIR;
1937         else
1938                 error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
1939         return (error);
1940 }
1941
1942 int
1943 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
1944 {
1945         struct thread *td = curthread;
1946         struct proc *p = td->td_proc;
1947         struct lwp *lp = td->td_lwp;
1948         struct filedesc *fdp = p->p_fd;
1949         int cmode, flags;
1950         struct file *nfp;
1951         struct file *fp;
1952         struct vnode *vp;
1953         int type, indx, error = 0;
1954         struct flock lf;
1955
1956         if ((oflags & O_ACCMODE) == O_ACCMODE)
1957                 return (EINVAL);
1958         flags = FFLAGS(oflags);
1959         error = falloc(lp, &nfp, NULL);
1960         if (error)
1961                 return (error);
1962         fp = nfp;
1963         cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1964
1965         /*
1966          * XXX p_dupfd is a real mess.  It allows a device to return a
1967          * file descriptor to be duplicated rather then doing the open
1968          * itself.
1969          */
1970         lp->lwp_dupfd = -1;
1971
1972         /*
1973          * Call vn_open() to do the lookup and assign the vnode to the
1974          * file pointer.  vn_open() does not change the ref count on fp
1975          * and the vnode, on success, will be inherited by the file pointer
1976          * and unlocked.
1977          *
1978          * Request a shared lock on the vnode if possible.
1979          *
1980          * Executable binaries can race VTEXT against O_RDWR opens, so
1981          * use an exclusive lock for O_RDWR opens as well.
1982          *
1983          * NOTE: We need a flag to separate terminal vnode locking from
1984          *       parent locking.  O_CREAT needs parent locking, but O_TRUNC
1985          *       and O_RDWR only need to lock the terminal vnode exclusively.
1986          */
1987         nd->nl_flags |= NLC_LOCKVP;
1988         if ((flags & (O_CREAT|O_TRUNC|O_RDWR)) == 0)
1989                 nd->nl_flags |= NLC_SHAREDLOCK;
1990
1991         error = vn_open(nd, fp, flags, cmode);
1992         nlookup_done(nd);
1993
1994         if (error) {
1995                 /*
1996                  * handle special fdopen() case.  bleh.  dupfdopen() is
1997                  * responsible for dropping the old contents of ofiles[indx]
1998                  * if it succeeds.
1999                  *
2000                  * Note that fsetfd() will add a ref to fp which represents
2001                  * the fd_files[] assignment.  We must still drop our
2002                  * reference.
2003                  */
2004                 if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
2005                         if (fdalloc(p, 0, &indx) == 0) {
2006                                 error = dupfdopen(fdp, indx, lp->lwp_dupfd, flags, error);
2007                                 if (error == 0) {
2008                                         *res = indx;
2009                                         fdrop(fp);      /* our ref */
2010                                         return (0);
2011                                 }
2012                                 fsetfd(fdp, NULL, indx);
2013                         }
2014                 }
2015                 fdrop(fp);      /* our ref */
2016                 if (error == ERESTART)
2017                         error = EINTR;
2018                 return (error);
2019         }
2020
2021         /*
2022          * ref the vnode for ourselves so it can't be ripped out from under
2023          * is.  XXX need an ND flag to request that the vnode be returned
2024          * anyway.
2025          *
2026          * Reserve a file descriptor but do not assign it until the open
2027          * succeeds.
2028          */
2029         vp = (struct vnode *)fp->f_data;
2030         vref(vp);
2031         if ((error = fdalloc(p, 0, &indx)) != 0) {
2032                 fdrop(fp);
2033                 vrele(vp);
2034                 return (error);
2035         }
2036
2037         /*
2038          * If no error occurs the vp will have been assigned to the file
2039          * pointer.
2040          */
2041         lp->lwp_dupfd = 0;
2042
2043         if (flags & (O_EXLOCK | O_SHLOCK)) {
2044                 lf.l_whence = SEEK_SET;
2045                 lf.l_start = 0;
2046                 lf.l_len = 0;
2047                 if (flags & O_EXLOCK)
2048                         lf.l_type = F_WRLCK;
2049                 else
2050                         lf.l_type = F_RDLCK;
2051                 if (flags & FNONBLOCK)
2052                         type = 0;
2053                 else
2054                         type = F_WAIT;
2055
2056                 if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
2057                         /*
2058                          * lock request failed.  Clean up the reserved
2059                          * descriptor.
2060                          */
2061                         vrele(vp);
2062                         fsetfd(fdp, NULL, indx);
2063                         fdrop(fp);
2064                         return (error);
2065                 }
2066                 atomic_set_int(&fp->f_flag, FHASLOCK); /* race ok */
2067         }
2068 #if 0
2069         /*
2070          * Assert that all regular file vnodes were created with a object.
2071          */
2072         KASSERT(vp->v_type != VREG || vp->v_object != NULL,
2073                 ("open: regular file has no backing object after vn_open"));
2074 #endif
2075
2076         vrele(vp);
2077
2078         /*
2079          * release our private reference, leaving the one associated with the
2080          * descriptor table intact.
2081          */
2082         if (oflags & O_CLOEXEC)
2083                 fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
2084         fsetfd(fdp, fp, indx);
2085         fdrop(fp);
2086         *res = indx;
2087         return (error);
2088 }
2089
2090 /*
2091  * open_args(char *path, int flags, int mode)
2092  *
2093  * Check permissions, allocate an open file structure,
2094  * and call the device open routine if any.
2095  */
2096 int
2097 sys_open(struct open_args *uap)
2098 {
2099         struct nlookupdata nd;
2100         int error;
2101
2102         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2103         if (error == 0) {
2104                 error = kern_open(&nd, uap->flags,
2105                                     uap->mode, &uap->sysmsg_result);
2106         }
2107         nlookup_done(&nd);
2108         return (error);
2109 }
2110
2111 /*
2112  * openat_args(int fd, char *path, int flags, int mode)
2113  */
2114 int
2115 sys_openat(struct openat_args *uap)
2116 {
2117         struct nlookupdata nd;
2118         int error;
2119         struct file *fp;
2120
2121         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2122         if (error == 0) {
2123                 error = kern_open(&nd, uap->flags, uap->mode,
2124                                         &uap->sysmsg_result);
2125         }
2126         nlookup_done_at(&nd, fp);
2127         return (error);
2128 }
2129
2130 int
2131 kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
2132 {
2133         struct thread *td = curthread;
2134         struct proc *p = td->td_proc;
2135         struct vnode *vp;
2136         struct vattr vattr;
2137         int error;
2138         int whiteout = 0;
2139
2140         KKASSERT(p);
2141
2142         VATTR_NULL(&vattr);
2143         vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2144         vattr.va_rmajor = rmajor;
2145         vattr.va_rminor = rminor;
2146
2147         switch (mode & S_IFMT) {
2148         case S_IFMT:    /* used by badsect to flag bad sectors */
2149                 error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_BAD, 0);
2150                 vattr.va_type = VBAD;
2151                 break;
2152         case S_IFCHR:
2153                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
2154                 vattr.va_type = VCHR;
2155                 break;
2156         case S_IFBLK:
2157                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
2158                 vattr.va_type = VBLK;
2159                 break;
2160         case S_IFWHT:
2161                 error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_WHT, 0);
2162                 whiteout = 1;
2163                 break;
2164         case S_IFDIR:   /* special directories support for HAMMER */
2165                 error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_DIR, 0);
2166                 vattr.va_type = VDIR;
2167                 break;
2168         default:
2169                 error = EINVAL;
2170                 break;
2171         }
2172
2173         if (error)
2174                 return (error);
2175
2176         bwillinode(1);
2177         nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2178         if ((error = nlookup(nd)) != 0)
2179                 return (error);
2180         if (nd->nl_nch.ncp->nc_vp)
2181                 return (EEXIST);
2182         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2183                 return (error);
2184
2185         if (whiteout) {
2186                 error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
2187                                       nd->nl_cred, NAMEI_CREATE);
2188         } else {
2189                 vp = NULL;
2190                 error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
2191                                    &vp, nd->nl_cred, &vattr);
2192                 if (error == 0)
2193                         vput(vp);
2194         }
2195         return (error);
2196 }
2197
2198 /*
2199  * mknod_args(char *path, int mode, int dev)
2200  *
2201  * Create a special file.
2202  */
2203 int
2204 sys_mknod(struct mknod_args *uap)
2205 {
2206         struct nlookupdata nd;
2207         int error;
2208
2209         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2210         if (error == 0) {
2211                 error = kern_mknod(&nd, uap->mode,
2212                                    umajor(uap->dev), uminor(uap->dev));
2213         }
2214         nlookup_done(&nd);
2215         return (error);
2216 }
2217
2218 /*
2219  * mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
2220  *
2221  * Create a special file.  The path is relative to the directory associated
2222  * with fd.
2223  */
2224 int
2225 sys_mknodat(struct mknodat_args *uap)
2226 {
2227         struct nlookupdata nd;
2228         struct file *fp;
2229         int error;
2230
2231         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2232         if (error == 0) {
2233                 error = kern_mknod(&nd, uap->mode,
2234                                    umajor(uap->dev), uminor(uap->dev));
2235         }
2236         nlookup_done_at(&nd, fp);
2237         return (error);
2238 }
2239
2240 int
2241 kern_mkfifo(struct nlookupdata *nd, int mode)
2242 {
2243         struct thread *td = curthread;
2244         struct proc *p = td->td_proc;
2245         struct vattr vattr;
2246         struct vnode *vp;
2247         int error;
2248
2249         bwillinode(1);
2250
2251         nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2252         if ((error = nlookup(nd)) != 0)
2253                 return (error);
2254         if (nd->nl_nch.ncp->nc_vp)
2255                 return (EEXIST);
2256         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2257                 return (error);
2258
2259         VATTR_NULL(&vattr);
2260         vattr.va_type = VFIFO;
2261         vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2262         vp = NULL;
2263         error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
2264         if (error == 0)
2265                 vput(vp);
2266         return (error);
2267 }
2268
2269 /*
2270  * mkfifo_args(char *path, int mode)
2271  *
2272  * Create a named pipe.
2273  */
2274 int
2275 sys_mkfifo(struct mkfifo_args *uap)
2276 {
2277         struct nlookupdata nd;
2278         int error;
2279
2280         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2281         if (error == 0)
2282                 error = kern_mkfifo(&nd, uap->mode);
2283         nlookup_done(&nd);
2284         return (error);
2285 }
2286
2287 /*
2288  * mkfifoat_args(int fd, char *path, mode_t mode)
2289  *
2290  * Create a named pipe.  The path is relative to the directory associated
2291  * with fd.
2292  */
2293 int
2294 sys_mkfifoat(struct mkfifoat_args *uap)
2295 {
2296         struct nlookupdata nd;
2297         struct file *fp;
2298         int error;
2299
2300         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2301         if (error == 0)
2302                 error = kern_mkfifo(&nd, uap->mode);
2303         nlookup_done_at(&nd, fp);
2304         return (error);
2305 }
2306
2307 static int hardlink_check_uid = 0;
2308 SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
2309     &hardlink_check_uid, 0,
2310     "Unprivileged processes cannot create hard links to files owned by other "
2311     "users");
2312 static int hardlink_check_gid = 0;
2313 SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
2314     &hardlink_check_gid, 0,
2315     "Unprivileged processes cannot create hard links to files owned by other "
2316     "groups");
2317
2318 static int
2319 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2320 {
2321         struct vattr va;
2322         int error;
2323
2324         /*
2325          * Shortcut if disabled
2326          */
2327         if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2328                 return (0);
2329
2330         /*
2331          * Privileged user can always hardlink
2332          */
2333         if (priv_check_cred(cred, PRIV_VFS_LINK, 0) == 0)
2334                 return (0);
2335
2336         /*
2337          * Otherwise only if the originating file is owned by the
2338          * same user or group.  Note that any group is allowed if
2339          * the file is owned by the caller.
2340          */
2341         error = VOP_GETATTR(vp, &va);
2342         if (error != 0)
2343                 return (error);
2344
2345         if (hardlink_check_uid) {
2346                 if (cred->cr_uid != va.va_uid)
2347                         return (EPERM);
2348         }
2349
2350         if (hardlink_check_gid) {
2351                 if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2352                         return (EPERM);
2353         }
2354
2355         return (0);
2356 }
2357
2358 int
2359 kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
2360 {
2361         struct thread *td = curthread;
2362         struct vnode *vp;
2363         int error;
2364
2365         /*
2366          * Lookup the source and obtained a locked vnode.
2367          *
2368          * You may only hardlink a file which you have write permission
2369          * on or which you own.
2370          *
2371          * XXX relookup on vget failure / race ?
2372          */
2373         bwillinode(1);
2374         nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
2375         if ((error = nlookup(nd)) != 0)
2376                 return (error);
2377         vp = nd->nl_nch.ncp->nc_vp;
2378         KKASSERT(vp != NULL);
2379         if (vp->v_type == VDIR)
2380                 return (EPERM);         /* POSIX */
2381         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2382                 return (error);
2383         if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
2384                 return (error);
2385
2386         /*
2387          * Unlock the source so we can lookup the target without deadlocking
2388          * (XXX vp is locked already, possible other deadlock?).  The target
2389          * must not exist.
2390          */
2391         KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
2392         nd->nl_flags &= ~NLC_NCPISLOCKED;
2393         cache_unlock(&nd->nl_nch);
2394         vn_unlock(vp);
2395
2396         linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2397         if ((error = nlookup(linknd)) != 0) {
2398                 vrele(vp);
2399                 return (error);
2400         }
2401         if (linknd->nl_nch.ncp->nc_vp) {
2402                 vrele(vp);
2403                 return (EEXIST);
2404         }
2405         error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
2406         if (error) {
2407                 vrele(vp);
2408                 return (error);
2409         }
2410
2411         /*
2412          * Finally run the new API VOP.
2413          */
2414         error = can_hardlink(vp, td, td->td_ucred);
2415         if (error == 0) {
2416                 error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
2417                                   vp, linknd->nl_cred);
2418         }
2419         vput(vp);
2420         return (error);
2421 }
2422
2423 /*
2424  * link_args(char *path, char *link)
2425  *
2426  * Make a hard file link.
2427  */
2428 int
2429 sys_link(struct link_args *uap)
2430 {
2431         struct nlookupdata nd, linknd;
2432         int error;
2433
2434         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2435         if (error == 0) {
2436                 error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2437                 if (error == 0)
2438                         error = kern_link(&nd, &linknd);
2439                 nlookup_done(&linknd);
2440         }
2441         nlookup_done(&nd);
2442         return (error);
2443 }
2444
2445 /*
2446  * linkat_args(int fd1, char *path1, int fd2, char *path2, int flags)
2447  *
2448  * Make a hard file link. The path1 argument is relative to the directory
2449  * associated with fd1, and similarly the path2 argument is relative to
2450  * the directory associated with fd2.
2451  */
2452 int
2453 sys_linkat(struct linkat_args *uap)
2454 {
2455         struct nlookupdata nd, linknd;
2456         struct file *fp1, *fp2;
2457         int error;
2458
2459         error = nlookup_init_at(&nd, &fp1, uap->fd1, uap->path1, UIO_USERSPACE,
2460             (uap->flags & AT_SYMLINK_FOLLOW) ? NLC_FOLLOW : 0);
2461         if (error == 0) {
2462                 error = nlookup_init_at(&linknd, &fp2, uap->fd2,
2463                     uap->path2, UIO_USERSPACE, 0);
2464                 if (error == 0)
2465                         error = kern_link(&nd, &linknd);
2466                 nlookup_done_at(&linknd, fp2);
2467         }
2468         nlookup_done_at(&nd, fp1);
2469         return (error);
2470 }
2471
2472 int
2473 kern_symlink(struct nlookupdata *nd, char *path, int mode)
2474 {
2475         struct vattr vattr;
2476         struct vnode *vp;
2477         struct vnode *dvp;
2478         int error;
2479
2480         bwillinode(1);
2481         nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2482         if ((error = nlookup(nd)) != 0)
2483                 return (error);
2484         if (nd->nl_nch.ncp->nc_vp)
2485                 return (EEXIST);
2486         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2487                 return (error);
2488         dvp = nd->nl_dvp;
2489         VATTR_NULL(&vattr);
2490         vattr.va_mode = mode;
2491         error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
2492         if (error == 0)
2493                 vput(vp);
2494         return (error);
2495 }
2496
2497 /*
2498  * symlink(char *path, char *link)
2499  *
2500  * Make a symbolic link.
2501  */
2502 int
2503 sys_symlink(struct symlink_args *uap)
2504 {
2505         struct thread *td = curthread;
2506         struct nlookupdata nd;
2507         char *path;
2508         int error;
2509         int mode;
2510
2511         path = objcache_get(namei_oc, M_WAITOK);
2512         error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2513         if (error == 0) {
2514                 error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2515                 if (error == 0) {
2516                         mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2517                         error = kern_symlink(&nd, path, mode);
2518                 }
2519                 nlookup_done(&nd);
2520         }
2521         objcache_put(namei_oc, path);
2522         return (error);
2523 }
2524
2525 /*
2526  * symlinkat_args(char *path1, int fd, char *path2)
2527  *
2528  * Make a symbolic link.  The path2 argument is relative to the directory
2529  * associated with fd.
2530  */
2531 int
2532 sys_symlinkat(struct symlinkat_args *uap)
2533 {
2534         struct thread *td = curthread;
2535         struct nlookupdata nd;
2536         struct file *fp;
2537         char *path1;
2538         int error;
2539         int mode;
2540
2541         path1 = objcache_get(namei_oc, M_WAITOK);
2542         error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
2543         if (error == 0) {
2544                 error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
2545                     UIO_USERSPACE, 0);
2546                 if (error == 0) {
2547                         mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2548                         error = kern_symlink(&nd, path1, mode);
2549                 }
2550                 nlookup_done_at(&nd, fp);
2551         }
2552         objcache_put(namei_oc, path1);
2553         return (error);
2554 }
2555
2556 /*
2557  * undelete_args(char *path)
2558  *
2559  * Delete a whiteout from the filesystem.
2560  */
2561 int
2562 sys_undelete(struct undelete_args *uap)
2563 {
2564         struct nlookupdata nd;
2565         int error;
2566
2567         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2568         bwillinode(1);
2569         nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2570         if (error == 0)
2571                 error = nlookup(&nd);
2572         if (error == 0)
2573                 error = ncp_writechk(&nd.nl_nch);
2574         if (error == 0) {
2575                 error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2576                                       NAMEI_DELETE);
2577         }
2578         nlookup_done(&nd);
2579         return (error);
2580 }
2581
2582 int
2583 kern_unlink(struct nlookupdata *nd)
2584 {
2585         int error;
2586
2587         bwillinode(1);
2588         nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2589         if ((error = nlookup(nd)) != 0)
2590                 return (error);
2591         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2592                 return (error);
2593         error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2594         return (error);
2595 }
2596
2597 /*
2598  * unlink_args(char *path)
2599  *
2600  * Delete a name from the filesystem.
2601  */
2602 int
2603 sys_unlink(struct unlink_args *uap)
2604 {
2605         struct nlookupdata nd;
2606         int error;
2607
2608         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2609         if (error == 0)
2610                 error = kern_unlink(&nd);
2611         nlookup_done(&nd);
2612         return (error);
2613 }
2614
2615
2616 /*
2617  * unlinkat_args(int fd, char *path, int flags)
2618  *
2619  * Delete the file or directory entry pointed to by fd/path.
2620  */
2621 int
2622 sys_unlinkat(struct unlinkat_args *uap)
2623 {
2624         struct nlookupdata nd;
2625         struct file *fp;
2626         int error;
2627
2628         if (uap->flags & ~AT_REMOVEDIR)
2629                 return (EINVAL);
2630
2631         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2632         if (error == 0) {
2633                 if (uap->flags & AT_REMOVEDIR)
2634                         error = kern_rmdir(&nd);
2635                 else
2636                         error = kern_unlink(&nd);
2637         }
2638         nlookup_done_at(&nd, fp);
2639         return (error);
2640 }
2641
2642 int
2643 kern_lseek(int fd, off_t offset, int whence, off_t *res)
2644 {
2645         struct thread *td = curthread;
2646         struct proc *p = td->td_proc;
2647         struct file *fp;
2648         struct vnode *vp;
2649         struct vattr vattr;
2650         off_t new_offset;
2651         int error;
2652
2653         fp = holdfp(p->p_fd, fd, -1);
2654         if (fp == NULL)
2655                 return (EBADF);
2656         if (fp->f_type != DTYPE_VNODE) {
2657                 error = ESPIPE;
2658                 goto done;
2659         }
2660         vp = (struct vnode *)fp->f_data;
2661
2662         switch (whence) {
2663         case L_INCR:
2664                 spin_lock(&fp->f_spin);
2665                 new_offset = fp->f_offset + offset;
2666                 error = 0;
2667                 break;
2668         case L_XTND:
2669                 error = VOP_GETATTR(vp, &vattr);
2670                 spin_lock(&fp->f_spin);
2671                 new_offset = offset + vattr.va_size;
2672                 break;
2673         case L_SET:
2674                 new_offset = offset;
2675                 error = 0;
2676                 spin_lock(&fp->f_spin);
2677                 break;
2678         default:
2679                 new_offset = 0;
2680                 error = EINVAL;
2681                 spin_lock(&fp->f_spin);
2682                 break;
2683         }
2684
2685         /*
2686          * Validate the seek position.  Negative offsets are not allowed
2687          * for regular files or directories.
2688          *
2689          * Normally we would also not want to allow negative offsets for
2690          * character and block-special devices.  However kvm addresses
2691          * on 64 bit architectures might appear to be negative and must
2692          * be allowed.
2693          */
2694         if (error == 0) {
2695                 if (new_offset < 0 &&
2696                     (vp->v_type == VREG || vp->v_type == VDIR)) {
2697                         error = EINVAL;
2698                 } else {
2699                         fp->f_offset = new_offset;
2700                 }
2701         }
2702         *res = fp->f_offset;
2703         spin_unlock(&fp->f_spin);
2704 done:
2705         fdrop(fp);
2706         return (error);
2707 }
2708
2709 /*
2710  * lseek_args(int fd, int pad, off_t offset, int whence)
2711  *
2712  * Reposition read/write file offset.
2713  */
2714 int
2715 sys_lseek(struct lseek_args *uap)
2716 {
2717         int error;
2718
2719         error = kern_lseek(uap->fd, uap->offset, uap->whence,
2720                            &uap->sysmsg_offset);
2721
2722         return (error);
2723 }
2724
2725 /*
2726  * Check if current process can access given file.  amode is a bitmask of *_OK
2727  * access bits.  flags is a bitmask of AT_* flags.
2728  */
2729 int
2730 kern_access(struct nlookupdata *nd, int amode, int flags)
2731 {
2732         struct vnode *vp;
2733         int error, mode;
2734
2735         if (flags & ~AT_EACCESS)
2736                 return (EINVAL);
2737         nd->nl_flags |= NLC_SHAREDLOCK;
2738         if ((error = nlookup(nd)) != 0)
2739                 return (error);
2740 retry:
2741         error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
2742         if (error)
2743                 return (error);
2744
2745         /* Flags == 0 means only check for existence. */
2746         if (amode) {
2747                 mode = 0;
2748                 if (amode & R_OK)
2749                         mode |= VREAD;
2750                 if (amode & W_OK)
2751                         mode |= VWRITE;
2752                 if (amode & X_OK)
2753                         mode |= VEXEC;
2754                 if ((mode & VWRITE) == 0 ||
2755                     (error = vn_writechk(vp, &nd->nl_nch)) == 0)
2756                         error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);
2757
2758                 /*
2759                  * If the file handle is stale we have to re-resolve the
2760                  * entry with the ncp held exclusively.  This is a hack
2761                  * at the moment.
2762                  */
2763                 if (error == ESTALE) {
2764                         vput(vp);
2765                         cache_unlock(&nd->nl_nch);
2766                         cache_lock(&nd->nl_nch);
2767                         cache_setunresolved(&nd->nl_nch);
2768                         error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2769                         if (error == 0) {
2770                                 vp = NULL;
2771                                 goto retry;
2772                         }
2773                         return(error);
2774                 }
2775         }
2776         vput(vp);
2777         return (error);
2778 }
2779
2780 /*
2781  * access_args(char *path, int flags)
2782  *
2783  * Check access permissions.
2784  */
2785 int
2786 sys_access(struct access_args *uap)
2787 {
2788         struct nlookupdata nd;
2789         int error;
2790
2791         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2792         if (error == 0)
2793                 error = kern_access(&nd, uap->flags, 0);
2794         nlookup_done(&nd);
2795         return (error);
2796 }
2797
2798
2799 /*
2800  * eaccess_args(char *path, int flags)
2801  *
2802  * Check access permissions.
2803  */
2804 int
2805 sys_eaccess(struct eaccess_args *uap)
2806 {
2807         struct nlookupdata nd;
2808         int error;
2809
2810         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2811         if (error == 0)
2812                 error = kern_access(&nd, uap->flags, AT_EACCESS);
2813         nlookup_done(&nd);
2814         return (error);
2815 }
2816
2817
2818 /*
2819  * faccessat_args(int fd, char *path, int amode, int flags)
2820  *
2821  * Check access permissions.
2822  */
2823 int
2824 sys_faccessat(struct faccessat_args *uap)
2825 {
2826         struct nlookupdata nd;
2827         struct file *fp;
2828         int error;
2829
2830         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2831                                 NLC_FOLLOW);
2832         if (error == 0)
2833                 error = kern_access(&nd, uap->amode, uap->flags);
2834         nlookup_done_at(&nd, fp);
2835         return (error);
2836 }
2837
2838 int
2839 kern_stat(struct nlookupdata *nd, struct stat *st)
2840 {
2841         int error;
2842         struct vnode *vp;
2843
2844         nd->nl_flags |= NLC_SHAREDLOCK;
2845         if ((error = nlookup(nd)) != 0)
2846                 return (error);
2847 again:
2848         if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2849                 return (ENOENT);
2850
2851         if ((error = vget(vp, LK_SHARED)) != 0)
2852                 return (error);
2853         error = vn_stat(vp, st, nd->nl_cred);
2854
2855         /*
2856          * If the file handle is stale we have to re-resolve the
2857          * entry with the ncp held exclusively.  This is a hack
2858          * at the moment.
2859          */
2860         if (error == ESTALE) {
2861                 vput(vp);
2862                 cache_unlock(&nd->nl_nch);
2863                 cache_lock(&nd->nl_nch);
2864                 cache_setunresolved(&nd->nl_nch);
2865                 error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2866                 if (error == 0)
2867                         goto again;
2868         } else {
2869                 vput(vp);
2870         }
2871         return (error);
2872 }
2873
2874 /*
2875  * stat_args(char *path, struct stat *ub)
2876  *
2877  * Get file status; this version follows links.
2878  */
2879 int
2880 sys_stat(struct stat_args *uap)
2881 {
2882         struct nlookupdata nd;
2883         struct stat st;
2884         int error;
2885
2886         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2887         if (error == 0) {
2888                 error = kern_stat(&nd, &st);
2889                 if (error == 0)
2890                         error = copyout(&st, uap->ub, sizeof(*uap->ub));
2891         }
2892         nlookup_done(&nd);
2893         return (error);
2894 }
2895
2896 /*
2897  * lstat_args(char *path, struct stat *ub)
2898  *
2899  * Get file status; this version does not follow links.
2900  */
2901 int
2902 sys_lstat(struct lstat_args *uap)
2903 {
2904         struct nlookupdata nd;
2905         struct stat st;
2906         int error;
2907
2908         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2909         if (error == 0) {
2910                 error = kern_stat(&nd, &st);
2911                 if (error == 0)
2912                         error = copyout(&st, uap->ub, sizeof(*uap->ub));
2913         }
2914         nlookup_done(&nd);
2915         return (error);
2916 }
2917
2918 /*
2919  * fstatat_args(int fd, char *path, struct stat *sb, int flags)
2920  *
2921  * Get status of file pointed to by fd/path.
2922  */
2923 int
2924 sys_fstatat(struct fstatat_args *uap)
2925 {
2926         struct nlookupdata nd;
2927         struct stat st;
2928         int error;
2929         int flags;
2930         struct file *fp;
2931
2932         if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
2933                 return (EINVAL);
2934
2935         flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
2936
2937         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
2938                                 UIO_USERSPACE, flags);
2939         if (error == 0) {
2940                 error = kern_stat(&nd, &st);
2941                 if (error == 0)
2942                         error = copyout(&st, uap->sb, sizeof(*uap->sb));
2943         }
2944         nlookup_done_at(&nd, fp);
2945         return (error);
2946 }
2947
2948 static int
2949 kern_pathconf(char *path, int name, int flags, register_t *sysmsg_regp)
2950 {
2951         struct nlookupdata nd;
2952         struct vnode *vp;
2953         int error;
2954
2955         vp = NULL;
2956         error = nlookup_init(&nd, path, UIO_USERSPACE, flags);
2957         if (error == 0)
2958                 error = nlookup(&nd);
2959         if (error == 0)
2960                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
2961         nlookup_done(&nd);
2962         if (error == 0) {
2963                 error = VOP_PATHCONF(vp, name, sysmsg_regp);
2964                 vput(vp);
2965         }
2966         return (error);
2967 }
2968
2969 /*
2970  * pathconf_Args(char *path, int name)
2971  *
2972  * Get configurable pathname variables.
2973  */
2974 int
2975 sys_pathconf(struct pathconf_args *uap)
2976 {
2977         return (kern_pathconf(uap->path, uap->name, NLC_FOLLOW,
2978                 &uap->sysmsg_reg));
2979 }
2980
2981 /*
2982  * lpathconf_Args(char *path, int name)
2983  *
2984  * Get configurable pathname variables, but don't follow symlinks.
2985  */
2986 int
2987 sys_lpathconf(struct lpathconf_args *uap)
2988 {
2989         return (kern_pathconf(uap->path, uap->name, 0, &uap->sysmsg_reg));
2990 }
2991
2992 /*
2993  * XXX: daver
2994  * kern_readlink isn't properly split yet.  There is a copyin burried
2995  * in VOP_READLINK().
2996  */
2997 int
2998 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2999 {
3000         struct thread *td = curthread;
3001         struct vnode *vp;
3002         struct iovec aiov;
3003         struct uio auio;
3004         int error;
3005
3006         nd->nl_flags |= NLC_SHAREDLOCK;
3007         if ((error = nlookup(nd)) != 0)
3008                 return (error);
3009         error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
3010         if (error)
3011                 return (error);
3012         if (vp->v_type != VLNK) {
3013                 error = EINVAL;
3014         } else {
3015                 aiov.iov_base = buf;
3016                 aiov.iov_len = count;
3017                 auio.uio_iov = &aiov;
3018                 auio.uio_iovcnt = 1;
3019                 auio.uio_offset = 0;
3020                 auio.uio_rw = UIO_READ;
3021                 auio.uio_segflg = UIO_USERSPACE;
3022                 auio.uio_td = td;
3023                 auio.uio_resid = count;
3024                 error = VOP_READLINK(vp, &auio, td->td_ucred);
3025         }
3026         vput(vp);
3027         *res = count - auio.uio_resid;
3028         return (error);
3029 }
3030
3031 /*
3032  * readlink_args(char *path, char *buf, int count)
3033  *
3034  * Return target name of a symbolic link.
3035  */
3036 int
3037 sys_readlink(struct readlink_args *uap)
3038 {
3039         struct nlookupdata nd;
3040         int error;
3041
3042         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3043         if (error == 0) {
3044                 error = kern_readlink(&nd, uap->buf, uap->count,
3045                                         &uap->sysmsg_result);
3046         }
3047         nlookup_done(&nd);
3048         return (error);
3049 }
3050
3051 /*
3052  * readlinkat_args(int fd, char *path, char *buf, size_t bufsize)
3053  *
3054  * Return target name of a symbolic link.  The path is relative to the
3055  * directory associated with fd.
3056  */
3057 int
3058 sys_readlinkat(struct readlinkat_args *uap)
3059 {
3060         struct nlookupdata nd;
3061         struct file *fp;
3062         int error;
3063
3064         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
3065         if (error == 0) {
3066                 error = kern_readlink(&nd, uap->buf, uap->bufsize,
3067                                         &uap->sysmsg_result);
3068         }
3069         nlookup_done_at(&nd, fp);
3070         return (error);
3071 }
3072
3073 static int
3074 setfflags(struct vnode *vp, int flags)
3075 {
3076         struct thread *td = curthread;
3077         int error;
3078         struct vattr vattr;
3079
3080         /*
3081          * Prevent non-root users from setting flags on devices.  When
3082          * a device is reused, users can retain ownership of the device
3083          * if they are allowed to set flags and programs assume that
3084          * chown can't fail when done as root.
3085          */
3086         if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
3087             ((error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV, 0)) != 0))
3088                 return (error);
3089
3090         /*
3091          * note: vget is required for any operation that might mod the vnode
3092          * so VINACTIVE is properly cleared.
3093          */
3094         if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3095                 VATTR_NULL(&vattr);
3096                 vattr.va_flags = flags;
3097                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3098                 vput(vp);
3099         }
3100         return (error);
3101 }
3102
3103 /*
3104  * chflags(char *path, int flags)
3105  *
3106  * Change flags of a file given a path name.
3107  */
3108 int
3109 sys_chflags(struct chflags_args *uap)
3110 {
3111         struct nlookupdata nd;
3112         struct vnode *vp;
3113         int error;
3114
3115         vp = NULL;
3116         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3117         if (error == 0)
3118                 error = nlookup(&nd);
3119         if (error == 0)
3120                 error = ncp_writechk(&nd.nl_nch);
3121         if (error == 0)
3122                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3123         nlookup_done(&nd);
3124         if (error == 0) {
3125                 error = setfflags(vp, uap->flags);
3126                 vrele(vp);
3127         }
3128         return (error);
3129 }
3130
3131 /*
3132  * lchflags(char *path, int flags)
3133  *
3134  * Change flags of a file given a path name, but don't follow symlinks.
3135  */
3136 int
3137 sys_lchflags(struct lchflags_args *uap)
3138 {
3139         struct nlookupdata nd;
3140         struct vnode *vp;
3141         int error;
3142
3143         vp = NULL;
3144         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3145         if (error == 0)
3146                 error = nlookup(&nd);
3147         if (error == 0)
3148                 error = ncp_writechk(&nd.nl_nch);
3149         if (error == 0)
3150                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3151         nlookup_done(&nd);
3152         if (error == 0) {
3153                 error = setfflags(vp, uap->flags);
3154                 vrele(vp);
3155         }
3156         return (error);
3157 }
3158
3159 /*
3160  * fchflags_args(int fd, int flags)
3161  *
3162  * Change flags of a file given a file descriptor.
3163  */
3164 int
3165 sys_fchflags(struct fchflags_args *uap)
3166 {
3167         struct thread *td = curthread;
3168         struct proc *p = td->td_proc;
3169         struct file *fp;
3170         int error;
3171
3172         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3173                 return (error);
3174         if (fp->f_nchandle.ncp)
3175                 error = ncp_writechk(&fp->f_nchandle);
3176         if (error == 0)
3177                 error = setfflags((struct vnode *) fp->f_data, uap->flags);
3178         fdrop(fp);
3179         return (error);
3180 }
3181
3182 /*
3183  * chflagsat_args(int fd, const char *path, int flags, int atflags)
3184  * change flags given a pathname relative to a filedescriptor
3185  */
3186 int sys_chflagsat(struct chflagsat_args *uap)
3187 {
3188         struct nlookupdata nd;
3189         struct vnode *vp;
3190         struct file *fp;
3191         int error;
3192         int lookupflags;
3193
3194         if (uap->atflags & ~AT_SYMLINK_NOFOLLOW)
3195                 return (EINVAL);
3196
3197         lookupflags = (uap->atflags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3198
3199         vp = NULL;
3200         error = nlookup_init_at(&nd, &fp, uap->fd,  uap->path, UIO_USERSPACE, lookupflags);
3201         if (error == 0)
3202                 error = nlookup(&nd);
3203         if (error == 0)
3204                 error = ncp_writechk(&nd.nl_nch);
3205         if (error == 0)
3206                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3207         nlookup_done_at(&nd, fp);
3208         if (error == 0) {
3209                 error = setfflags(vp, uap->flags);
3210                 vrele(vp);
3211         }
3212         return (error);
3213 }
3214
3215
3216 static int
3217 setfmode(struct vnode *vp, int mode)
3218 {
3219         struct thread *td = curthread;
3220         int error;
3221         struct vattr vattr;
3222
3223         /*
3224          * note: vget is required for any operation that might mod the vnode
3225          * so VINACTIVE is properly cleared.
3226          */
3227         if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3228                 VATTR_NULL(&vattr);
3229                 vattr.va_mode = mode & ALLPERMS;
3230                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3231                 cache_inval_wxok(vp);
3232                 vput(vp);
3233         }
3234         return error;
3235 }
3236
3237 int
3238 kern_chmod(struct nlookupdata *nd, int mode)
3239 {
3240         struct vnode *vp;
3241         int error;
3242
3243         if ((error = nlookup(nd)) != 0)
3244                 return (error);
3245         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3246                 return (error);
3247         if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3248                 error = setfmode(vp, mode);
3249         vrele(vp);
3250         return (error);
3251 }
3252
3253 /*
3254  * chmod_args(char *path, int mode)
3255  *
3256  * Change mode of a file given path name.
3257  */
3258 int
3259 sys_chmod(struct chmod_args *uap)
3260 {
3261         struct nlookupdata nd;
3262         int error;
3263
3264         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3265         if (error == 0)
3266                 error = kern_chmod(&nd, uap->mode);
3267         nlookup_done(&nd);
3268         return (error);
3269 }
3270
3271 /*
3272  * lchmod_args(char *path, int mode)
3273  *
3274  * Change mode of a file given path name (don't follow links.)
3275  */
3276 int
3277 sys_lchmod(struct lchmod_args *uap)
3278 {
3279         struct nlookupdata nd;
3280         int error;
3281
3282         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3283         if (error == 0)
3284                 error = kern_chmod(&nd, uap->mode);
3285         nlookup_done(&nd);
3286         return (error);
3287 }
3288
3289 /*
3290  * fchmod_args(int fd, int mode)
3291  *
3292  * Change mode of a file given a file descriptor.
3293  */
3294 int
3295 sys_fchmod(struct fchmod_args *uap)
3296 {
3297         struct thread *td = curthread;
3298         struct proc *p = td->td_proc;
3299         struct file *fp;
3300         int error;
3301
3302         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3303                 return (error);
3304         if (fp->f_nchandle.ncp)
3305                 error = ncp_writechk(&fp->f_nchandle);
3306         if (error == 0)
3307                 error = setfmode((struct vnode *)fp->f_data, uap->mode);
3308         fdrop(fp);
3309         return (error);
3310 }
3311
3312 /*
3313  * fchmodat_args(char *path, int mode)
3314  *
3315  * Change mode of a file pointed to by fd/path.
3316  */
3317 int
3318 sys_fchmodat(struct fchmodat_args *uap)
3319 {
3320         struct nlookupdata nd;
3321         struct file *fp;
3322         int error;
3323         int flags;
3324
3325         if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3326                 return (EINVAL);
3327         flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3328
3329         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3330                                 UIO_USERSPACE, flags);
3331         if (error == 0)
3332                 error = kern_chmod(&nd, uap->mode);
3333         nlookup_done_at(&nd, fp);
3334         return (error);
3335 }
3336
3337 static int
3338 setfown(struct mount *mp, struct vnode *vp, uid_t uid, gid_t gid)
3339 {
3340         struct thread *td = curthread;
3341         int error;
3342         struct vattr vattr;
3343         uid_t o_uid;
3344         gid_t o_gid;
3345         uint64_t size;
3346
3347         /*
3348          * note: vget is required for any operation that might mod the vnode
3349          * so VINACTIVE is properly cleared.
3350          */
3351         if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3352                 if ((error = VOP_GETATTR(vp, &vattr)) != 0)
3353                         return error;
3354                 o_uid = vattr.va_uid;
3355                 o_gid = vattr.va_gid;
3356                 size = vattr.va_size;
3357
3358                 VATTR_NULL(&vattr);
3359                 vattr.va_uid = uid;
3360                 vattr.va_gid = gid;
3361                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3362                 vput(vp);
3363         }
3364
3365         if (error == 0) {
3366                 if (uid == -1)
3367                         uid = o_uid;
3368                 if (gid == -1)
3369                         gid = o_gid;
3370                 VFS_ACCOUNT(mp, o_uid, o_gid, -size);
3371                 VFS_ACCOUNT(mp,   uid,   gid,  size);
3372         }
3373
3374         return error;
3375 }
3376
3377 int
3378 kern_chown(struct nlookupdata *nd, int uid, int gid)
3379 {
3380         struct vnode *vp;
3381         int error;
3382
3383         if ((error = nlookup(nd)) != 0)
3384                 return (error);
3385         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3386                 return (error);
3387         if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3388                 error = setfown(nd->nl_nch.mount, vp, uid, gid);
3389         vrele(vp);
3390         return (error);
3391 }
3392
3393 /*
3394  * chown(char *path, int uid, int gid)
3395  *
3396  * Set ownership given a path name.
3397  */
3398 int
3399 sys_chown(struct chown_args *uap)
3400 {
3401         struct nlookupdata nd;
3402         int error;
3403
3404         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3405         if (error == 0)
3406                 error = kern_chown(&nd, uap->uid, uap->gid);
3407         nlookup_done(&nd);
3408         return (error);
3409 }
3410
3411 /*
3412  * lchown_args(char *path, int uid, int gid)
3413  *
3414  * Set ownership given a path name, do not cross symlinks.
3415  */
3416 int
3417 sys_lchown(struct lchown_args *uap)
3418 {
3419         struct nlookupdata nd;
3420         int error;
3421
3422         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3423         if (error == 0)
3424                 error = kern_chown(&nd, uap->uid, uap->gid);
3425         nlookup_done(&nd);
3426         return (error);
3427 }
3428
3429 /*
3430  * fchown_args(int fd, int uid, int gid)
3431  *
3432  * Set ownership given a file descriptor.
3433  */
3434 int
3435 sys_fchown(struct fchown_args *uap)
3436 {
3437         struct thread *td = curthread;
3438         struct proc *p = td->td_proc;
3439         struct file *fp;
3440         int error;
3441
3442         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3443                 return (error);
3444         if (fp->f_nchandle.ncp)
3445                 error = ncp_writechk(&fp->f_nchandle);
3446         if (error == 0)
3447                 error = setfown(p->p_fd->fd_ncdir.mount,
3448                         (struct vnode *)fp->f_data, uap->uid, uap->gid);
3449         fdrop(fp);
3450         return (error);
3451 }
3452
3453 /*
3454  * fchownat(int fd, char *path, int uid, int gid, int flags)
3455  *
3456  * Set ownership of file pointed to by fd/path.
3457  */
3458 int
3459 sys_fchownat(struct fchownat_args *uap)
3460 {
3461         struct nlookupdata nd;
3462         struct file *fp;
3463         int error;
3464         int flags;
3465
3466         if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3467                 return (EINVAL);
3468         flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3469
3470         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3471                                 UIO_USERSPACE, flags);
3472         if (error == 0)
3473                 error = kern_chown(&nd, uap->uid, uap->gid);
3474         nlookup_done_at(&nd, fp);
3475         return (error);
3476 }
3477
3478
3479 static int
3480 getutimes(struct timeval *tvp, struct timespec *tsp)
3481 {
3482         struct timeval tv[2];
3483         int error;
3484
3485         if (tvp == NULL) {
3486                 microtime(&tv[0]);
3487                 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3488                 tsp[1] = tsp[0];
3489         } else {
3490                 if ((error = itimerfix(tvp)) != 0)
3491                         return (error);
3492                 TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3493                 TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3494         }
3495         return 0;
3496 }
3497
3498 static int
3499 getutimens(const struct timespec *ts, struct timespec *newts, int *nullflag)
3500 {
3501         struct timespec tsnow;
3502         int error;
3503
3504         *nullflag = 0;
3505         nanotime(&tsnow);
3506         if (ts == NULL) {
3507                 newts[0] = tsnow;
3508                 newts[1] = tsnow;
3509                 *nullflag = 1;
3510                 return (0);
3511         }
3512
3513         newts[0] = ts[0];
3514         newts[1] = ts[1];
3515         if (newts[0].tv_nsec == UTIME_OMIT && newts[1].tv_nsec == UTIME_OMIT)
3516                 return (0);
3517         if (newts[0].tv_nsec == UTIME_NOW && newts[1].tv_nsec == UTIME_NOW)
3518                 *nullflag = 1;
3519
3520         if (newts[0].tv_nsec == UTIME_OMIT)
3521                 newts[0].tv_sec = VNOVAL;
3522         else if (newts[0].tv_nsec == UTIME_NOW)
3523                 newts[0] = tsnow;
3524         else if ((error = itimespecfix(&newts[0])) != 0)
3525                 return (error);
3526
3527         if (newts[1].tv_nsec == UTIME_OMIT)
3528                 newts[1].tv_sec = VNOVAL;
3529         else if (newts[1].tv_nsec == UTIME_NOW)
3530                 newts[1] = tsnow;
3531         else if ((error = itimespecfix(&newts[1])) != 0)
3532                 return (error);
3533
3534         return (0);
3535 }
3536
3537 static int
3538 setutimes(struct vnode *vp, struct vattr *vattr,
3539           const struct timespec *ts, int nullflag)
3540 {
3541         struct thread *td = curthread;
3542         int error;
3543
3544         VATTR_NULL(vattr);
3545         vattr->va_atime = ts[0];
3546         vattr->va_mtime = ts[1];
3547         if (nullflag)
3548                 vattr->va_vaflags |= VA_UTIMES_NULL;
3549         error = VOP_SETATTR(vp, vattr, td->td_ucred);
3550
3551         return error;
3552 }
3553
3554 int
3555 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
3556 {
3557         struct timespec ts[2];
3558         int error;
3559
3560         if (tptr) {
3561                 if ((error = getutimes(tptr, ts)) != 0)
3562                         return (error);
3563         }
3564         error = kern_utimensat(nd, tptr ? ts : NULL, 0);
3565         return (error);
3566 }
3567
3568 /*
3569  * utimes_args(char *path, struct timeval *tptr)
3570  *
3571  * Set the access and modification times of a file.
3572  */
3573 int
3574 sys_utimes(struct utimes_args *uap)
3575 {
3576         struct timeval tv[2];
3577         struct nlookupdata nd;
3578         int error;
3579
3580         if (uap->tptr) {
3581                 error = copyin(uap->tptr, tv, sizeof(tv));
3582                 if (error)
3583                         return (error);
3584         }
3585         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3586         if (error == 0)
3587                 error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3588         nlookup_done(&nd);
3589         return (error);
3590 }
3591
3592 /*
3593  * lutimes_args(char *path, struct timeval *tptr)
3594  *
3595  * Set the access and modification times of a file.
3596  */
3597 int
3598 sys_lutimes(struct lutimes_args *uap)
3599 {
3600         struct timeval tv[2];
3601         struct nlookupdata nd;
3602         int error;
3603
3604         if (uap->tptr) {
3605                 error = copyin(uap->tptr, tv, sizeof(tv));
3606                 if (error)
3607                         return (error);
3608         }
3609         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3610         if (error == 0)
3611                 error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3612         nlookup_done(&nd);
3613         return (error);
3614 }
3615
3616 /*
3617  * Set utimes on a file descriptor.  The creds used to open the
3618  * file are used to determine whether the operation is allowed
3619  * or not.
3620  */
3621 int
3622 kern_futimens(int fd, struct timespec *ts)
3623 {
3624         struct thread *td = curthread;
3625         struct proc *p = td->td_proc;
3626         struct timespec newts[2];
3627         struct file *fp;
3628         struct vnode *vp;
3629         struct vattr vattr;
3630         int nullflag;
3631         int error;
3632
3633         error = getutimens(ts, newts, &nullflag);
3634         if (error)
3635                 return (error);
3636         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3637                 return (error);
3638         if (fp->f_nchandle.ncp)
3639                 error = ncp_writechk(&fp->f_nchandle);
3640         if (error == 0) {
3641                 vp = fp->f_data;
3642                 error = vget(vp, LK_EXCLUSIVE);
3643                 if (error == 0) {
3644                         error = VOP_GETATTR(vp, &vattr);
3645                         if (error == 0) {
3646                                 error = naccess_va(&vattr, NLC_OWN | NLC_WRITE,
3647                                                    fp->f_cred);
3648                         }
3649                         if (error == 0) {
3650                                 error = setutimes(vp, &vattr, newts, nullflag);
3651                         }
3652                         vput(vp);
3653                 }
3654         }
3655         fdrop(fp);
3656         return (error);
3657 }
3658
3659 /*
3660  * futimens_args(int fd, struct timespec *ts)
3661  *
3662  * Set the access and modification times of a file.
3663  */
3664 int
3665 sys_futimens(struct futimens_args *uap)
3666 {
3667         struct timespec ts[2];
3668         int error;
3669
3670         if (uap->ts) {
3671                 error = copyin(uap->ts, ts, sizeof(ts));
3672                 if (error)
3673                         return (error);
3674         }
3675         error = kern_futimens(uap->fd, uap->ts ? ts : NULL);
3676         return (error);
3677 }
3678
3679 int
3680 kern_futimes(int fd, struct timeval *tptr)
3681 {
3682         struct timespec ts[2];
3683         int error;
3684
3685         if (tptr) {
3686                 if ((error = getutimes(tptr, ts)) != 0)
3687                         return (error);
3688         }
3689         error = kern_futimens(fd, tptr ? ts : NULL);
3690         return (error);
3691 }
3692
3693 /*
3694  * futimes_args(int fd, struct timeval *tptr)
3695  *
3696  * Set the access and modification times of a file.
3697  */
3698 int
3699 sys_futimes(struct futimes_args *uap)
3700 {
3701         struct timeval tv[2];
3702         int error;
3703
3704         if (uap->tptr) {
3705                 error = copyin(uap->tptr, tv, sizeof(tv));
3706                 if (error)
3707                         return (error);
3708         }
3709         error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3710         return (error);
3711 }
3712
3713 int
3714 kern_utimensat(struct nlookupdata *nd, const struct timespec *ts, int flags)
3715 {
3716         struct timespec newts[2];
3717         struct vnode *vp;
3718         struct vattr vattr;
3719         int nullflag;
3720         int error;
3721
3722         if (flags & ~AT_SYMLINK_NOFOLLOW)
3723                 return (EINVAL);
3724
3725         error = getutimens(ts, newts, &nullflag);
3726         if (error)
3727                 return (error);
3728
3729         nd->nl_flags |= NLC_OWN | NLC_WRITE;
3730         if ((error = nlookup(nd)) != 0)
3731                 return (error);
3732         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3733                 return (error);
3734         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3735                 return (error);
3736         if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
3737                 error = vget(vp, LK_EXCLUSIVE);
3738                 if (error == 0) {
3739                         error = setutimes(vp, &vattr, newts, nullflag);
3740                         vput(vp);
3741                 }
3742         }
3743         vrele(vp);
3744         return (error);
3745 }
3746
3747 /*
3748  * utimensat_args(int fd, const char *path, const struct timespec *ts, int flags);
3749  *
3750  * Set file access and modification times of a file.
3751  */
3752 int
3753 sys_utimensat(struct utimensat_args *uap)
3754 {
3755         struct timespec ts[2];
3756         struct nlookupdata nd;
3757         struct file *fp;
3758         int error;
3759         int flags;
3760
3761         if (uap->ts) {
3762                 error = copyin(uap->ts, ts, sizeof(ts));
3763                 if (error)
3764                         return (error);
3765         }
3766
3767         flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3768         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3769                                 UIO_USERSPACE, flags);
3770         if (error == 0)
3771                 error = kern_utimensat(&nd, uap->ts ? ts : NULL, uap->flags);
3772         nlookup_done_at(&nd, fp);
3773         return (error);
3774 }
3775
3776 int
3777 kern_truncate(struct nlookupdata *nd, off_t length)
3778 {
3779         struct vnode *vp;
3780         struct vattr vattr;
3781         int error;
3782         uid_t uid = 0;
3783         gid_t gid = 0;
3784         uint64_t old_size = 0;
3785
3786         if (length < 0)
3787                 return(EINVAL);
3788         nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
3789         if ((error = nlookup(nd)) != 0)
3790                 return (error);
3791         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3792                 return (error);
3793         if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3794                 return (error);
3795         error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
3796         if (error) {
3797                 vrele(vp);
3798                 return (error);
3799         }
3800         if (vp->v_type == VDIR) {
3801                 error = EISDIR;
3802                 goto done;
3803         }
3804         if (vfs_quota_enabled) {
3805                 error = VOP_GETATTR(vp, &vattr);
3806                 KASSERT(error == 0, ("kern_truncate(): VOP_GETATTR didn't return 0"));
3807                 uid = vattr.va_uid;
3808                 gid = vattr.va_gid;
3809                 old_size = vattr.va_size;
3810         }
3811
3812         if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
3813                 VATTR_NULL(&vattr);
3814                 vattr.va_size = length;
3815                 error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
3816                 VFS_ACCOUNT(nd->nl_nch.mount, uid, gid, length - old_size);
3817         }
3818 done:
3819         vput(vp);
3820         return (error);
3821 }
3822
3823 /*
3824  * truncate(char *path, int pad, off_t length)
3825  *
3826  * Truncate a file given its path name.
3827  */
3828 int
3829 sys_truncate(struct truncate_args *uap)
3830 {
3831         struct nlookupdata nd;
3832         int error;
3833
3834         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3835         if (error == 0)
3836                 error = kern_truncate(&nd, uap->length);
3837         nlookup_done(&nd);
3838         return error;
3839 }
3840
3841 int
3842 kern_ftruncate(int fd, off_t length)
3843 {
3844         struct thread *td = curthread;
3845         struct proc *p = td->td_proc;
3846         struct vattr vattr;
3847         struct vnode *vp;
3848         struct file *fp;
3849         int error;
3850         uid_t uid = 0;
3851         gid_t gid = 0;
3852         uint64_t old_size = 0;
3853         struct mount *mp;
3854
3855         if (length < 0)
3856                 return(EINVAL);
3857         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
3858                 return (error);
3859         if (fp->f_nchandle.ncp) {
3860                 error = ncp_writechk(&fp->f_nchandle);
3861                 if (error)
3862                         goto done;
3863         }
3864         if ((fp->f_flag & FWRITE) == 0) {
3865                 error = EINVAL;
3866                 goto done;
3867         }
3868         if (fp->f_flag & FAPPENDONLY) { /* inode was set s/uapnd */
3869                 error = EINVAL;
3870                 goto done;
3871         }
3872         vp = (struct vnode *)fp->f_data;
3873         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3874         if (vp->v_type == VDIR) {
3875                 error = EISDIR;
3876                 vn_unlock(vp);
3877                 goto done;
3878         }
3879
3880         if (vfs_quota_enabled) {
3881                 error = VOP_GETATTR(vp, &vattr);
3882                 KASSERT(error == 0, ("kern_ftruncate(): VOP_GETATTR didn't return 0"));
3883                 uid = vattr.va_uid;
3884                 gid = vattr.va_gid;
3885                 old_size = vattr.va_size;
3886         }
3887
3888         if ((error = vn_writechk(vp, NULL)) == 0) {
3889                 VATTR_NULL(&vattr);
3890                 vattr.va_size = length;
3891                 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3892                 mp = vq_vptomp(vp);
3893                 VFS_ACCOUNT(mp, uid, gid, length - old_size);
3894         }
3895         vn_unlock(vp);
3896 done:
3897         fdrop(fp);
3898         return (error);
3899 }
3900
3901 /*
3902  * ftruncate_args(int fd, int pad, off_t length)
3903  *
3904  * Truncate a file given a file descriptor.
3905  */
3906 int
3907 sys_ftruncate(struct ftruncate_args *uap)
3908 {
3909         int error;
3910
3911         error = kern_ftruncate(uap->fd, uap->length);
3912
3913         return (error);
3914 }
3915
3916 /*
3917  * fsync(int fd)
3918  *
3919  * Sync an open file.
3920  */
3921 int
3922 sys_fsync(struct fsync_args *uap)
3923 {
3924         struct thread *td = curthread;
3925         struct proc *p = td->td_proc;
3926         struct vnode *vp;
3927         struct file *fp;
3928         vm_object_t obj;
3929         int error;
3930
3931         if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3932                 return (error);
3933         vp = (struct vnode *)fp->f_data;
3934         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3935         if ((obj = vp->v_object) != NULL) {
3936                 if (vp->v_mount == NULL ||
3937                     (vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC) == 0) {
3938                         vm_object_page_clean(obj, 0, 0, 0);
3939                 }
3940         }
3941         error = VOP_FSYNC(vp, MNT_WAIT, VOP_FSYNC_SYSCALL);
3942         if (error == 0 && vp->v_mount)
3943                 error = buf_fsync(vp);
3944         vn_unlock(vp);
3945         fdrop(fp);
3946
3947         return (error);
3948 }
3949
3950 int
3951 kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
3952 {
3953         struct nchandle fnchd;
3954         struct nchandle tnchd;
3955         struct namecache *ncp;
3956         struct vnode *fdvp;
3957         struct vnode *tdvp;
3958         struct mount *mp;
3959         int error;
3960         u_int fncp_gen;
3961         u_int tncp_gen;
3962
3963         bwillinode(1);
3964         fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
3965         if ((error = nlookup(fromnd)) != 0)
3966                 return (error);
3967         if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
3968                 return (ENOENT);
3969         fnchd.mount = fromnd->nl_nch.mount;
3970         cache_hold(&fnchd);
3971
3972         /*
3973          * unlock the source nch so we can lookup the target nch without
3974          * deadlocking.  The target may or may not exist so we do not check
3975          * for a target vp like kern_mkdir() and other creation functions do.
3976          *
3977          * The source and target directories are ref'd and rechecked after
3978          * everything is relocked to determine if the source or target file
3979          * has been renamed.
3980          */
3981         KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
3982         fromnd->nl_flags &= ~NLC_NCPISLOCKED;
3983
3984         fncp_gen = fromnd->nl_nch.ncp->nc_generation;
3985
3986         cache_unlock(&fromnd->nl_nch);
3987
3988         tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
3989         if ((error = nlookup(tond)) != 0) {
3990                 cache_drop(&fnchd);
3991                 return (error);
3992         }
3993         tncp_gen = tond->nl_nch.ncp->nc_generation;
3994
3995         if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
3996                 cache_drop(&fnchd);
3997                 return (ENOENT);
3998         }
3999         tnchd.mount = tond->nl_nch.mount;
4000         cache_hold(&tnchd);
4001
4002         /*
4003          * If the source and target are the same there is nothing to do
4004          */
4005         if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
4006                 cache_drop(&fnchd);
4007                 cache_drop(&tnchd);
4008                 return (0);
4009         }
4010
4011         /*
4012          * Mount points cannot be renamed or overwritten
4013          */
4014         if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
4015             NCF_ISMOUNTPT
4016         ) {
4017                 cache_drop(&fnchd);
4018                 cache_drop(&tnchd);
4019                 return (EINVAL);
4020         }
4021
4022         /*
4023          * Relock the source ncp.  cache_relock() will deal with any
4024          * deadlocks against the already-locked tond and will also
4025          * make sure both are resolved.
4026          *
4027          * NOTE AFTER RELOCKING: The source or target ncp may have become
4028          * invalid while they were unlocked, nc_vp and nc_mount could
4029          * be NULL.
4030          */
4031         cache_relock(&fromnd->nl_nch, fromnd->nl_cred,
4032                      &tond->nl_nch, tond->nl_cred);
4033         fromnd->nl_flags |= NLC_NCPISLOCKED;
4034
4035         /*
4036          * If the namecache generation changed for either fromnd or tond,
4037          * we must retry.
4038          */
4039         if (fromnd->nl_nch.ncp->nc_generation != fncp_gen ||
4040             tond->nl_nch.ncp->nc_generation != tncp_gen) {
4041                 kprintf("kern_rename: retry due to gen on: "
4042                         "\"%s\" -> \"%s\"\n",
4043                         fromnd->nl_nch.ncp->nc_name,
4044                         tond->nl_nch.ncp->nc_name);
4045                 cache_drop(&fnchd);
4046                 cache_drop(&tnchd);
4047                 return (EAGAIN);
4048         }
4049
4050         /*
4051          * If either fromnd or tond are marked destroyed a ripout occured
4052          * out from under us and we must retry.
4053          */
4054         if ((fromnd->nl_nch.ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED)) ||
4055             fromnd->nl_nch.ncp->nc_vp == NULL ||
4056             (tond->nl_nch.ncp->nc_flag & NCF_DESTROYED)) {
4057                 kprintf("kern_rename: retry due to ripout on: "
4058                         "\"%s\" -> \"%s\"\n",
4059                         fromnd->nl_nch.ncp->nc_name,
4060                         tond->nl_nch.ncp->nc_name);
4061                 cache_drop(&fnchd);
4062                 cache_drop(&tnchd);
4063                 return (EAGAIN);
4064         }
4065
4066         /*
4067          * Make sure the parent directories linkages are the same.
4068          * XXX shouldn't be needed any more w/ generation check above.
4069          */
4070         if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
4071             tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
4072                 cache_drop(&fnchd);
4073                 cache_drop(&tnchd);
4074                 return (ENOENT);
4075         }
4076
4077         /*
4078          * Both the source and target must be within the same filesystem and
4079          * in the same filesystem as their parent directories within the
4080          * namecache topology.
4081          *
4082          * NOTE: fromnd's nc_mount or nc_vp could be NULL.
4083          */
4084         mp = fnchd.mount;
4085         if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
4086             mp != tond->nl_nch.mount) {
4087                 cache_drop(&fnchd);
4088                 cache_drop(&tnchd);
4089                 return (EXDEV);
4090         }
4091
4092         /*
4093          * Make sure the mount point is writable
4094          */
4095         if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
4096                 cache_drop(&fnchd);
4097                 cache_drop(&tnchd);
4098                 return (error);
4099         }
4100
4101         /*
4102          * If the target exists and either the source or target is a directory,
4103          * then both must be directories.
4104          *
4105          * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might h
4106          * have become NULL.
4107          */
4108         if (tond->nl_nch.ncp->nc_vp) {
4109                 if (fromnd->nl_nch.ncp->nc_vp == NULL) {
4110                         error = ENOENT;
4111                 } else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
4112                         if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
4113                                 error = ENOTDIR;
4114                 } else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
4115                         error = EISDIR;
4116                 }
4117         }
4118
4119         /*
4120          * You cannot rename a source into itself or a subdirectory of itself.
4121          * We check this by travsersing the target directory upwards looking
4122          * for a match against the source.
4123          *
4124          * XXX MPSAFE
4125          */
4126         if (error == 0) {
4127                 for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
4128                         if (fromnd->nl_nch.ncp == ncp) {
4129                                 error = EINVAL;
4130                                 break;
4131                         }
4132                 }
4133         }
4134
4135         cache_drop(&fnchd);
4136         cache_drop(&tnchd);
4137
4138         /*
4139          * Even though the namespaces are different, they may still represent
4140          * hardlinks to the same file.  The filesystem might have a hard time
4141          * with this so we issue a NREMOVE of the source instead of a NRENAME
4142          * when we detect the situation.
4143          */
4144         if (error == 0) {
4145                 fdvp = fromnd->nl_dvp;
4146                 tdvp = tond->nl_dvp;
4147                 if (fdvp == NULL || tdvp == NULL) {
4148                         error = EPERM;
4149                 } else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
4150                         error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
4151                                             fromnd->nl_cred);
4152                 } else {
4153                         error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
4154                                             fdvp, tdvp, tond->nl_cred);
4155                 }
4156         }
4157         return (error);
4158 }
4159
4160 /*
4161  * rename_args(char *from, char *to)
4162  *
4163  * Rename files.  Source and destination must either both be directories,
4164  * or both not be directories.  If target is a directory, it must be empty.
4165  */
4166 int
4167 sys_rename(struct rename_args *uap)
4168 {
4169         struct nlookupdata fromnd, tond;
4170         int error;
4171
4172         do {
4173                 error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
4174                 if (error == 0) {
4175                         error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
4176                         if (error == 0)
4177                                 error = kern_rename(&fromnd, &tond);
4178                         nlookup_done(&tond);
4179                 }
4180                 nlookup_done(&fromnd);
4181         } while (error == EAGAIN);
4182         return (error);
4183 }
4184
4185 /*
4186  * renameat_args(int oldfd, char *old, int newfd, char *new)
4187  *
4188  * Rename files using paths relative to the directories associated with
4189  * oldfd and newfd.  Source and destination must either both be directories,
4190  * or both not be directories.  If target is a directory, it must be empty.
4191  */
4192 int
4193 sys_renameat(struct renameat_args *uap)
4194 {
4195         struct nlookupdata oldnd, newnd;
4196         struct file *oldfp, *newfp;
4197         int error;
4198
4199         do {
4200                 error = nlookup_init_at(&oldnd, &oldfp,
4201                                         uap->oldfd, uap->old,
4202                                         UIO_USERSPACE, 0);
4203                 if (error == 0) {
4204                         error = nlookup_init_at(&newnd, &newfp,
4205                                                 uap->newfd, uap->new,
4206                                                 UIO_USERSPACE, 0);
4207                         if (error == 0)
4208                                 error = kern_rename(&oldnd, &newnd);
4209                         nlookup_done_at(&newnd, newfp);
4210                 }
4211                 nlookup_done_at(&oldnd, oldfp);
4212         } while (error == EAGAIN);
4213         return (error);
4214 }
4215
4216 int
4217 kern_mkdir(struct nlookupdata *nd, int mode)
4218 {
4219         struct thread *td = curthread;
4220         struct proc *p = td->td_proc;
4221         struct vnode *vp;
4222         struct vattr vattr;
4223         int error;
4224
4225         bwillinode(1);
4226         nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
4227         if ((error = nlookup(nd)) != 0)
4228                 return (error);
4229
4230         if (nd->nl_nch.ncp->nc_vp)
4231                 return (EEXIST);
4232         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
4233                 return (error);
4234         VATTR_NULL(&vattr);
4235         vattr.va_type = VDIR;
4236         vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
4237
4238         vp = NULL;
4239         error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
4240         if (error == 0)
4241                 vput(vp);
4242         return (error);
4243 }
4244
4245 /*
4246  * mkdir_args(char *path, int mode)
4247  *
4248  * Make a directory file.
4249  */
4250 int
4251 sys_mkdir(struct mkdir_args *uap)
4252 {
4253         struct nlookupdata nd;
4254         int error;
4255
4256         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4257         if (error == 0)
4258                 error = kern_mkdir(&nd, uap->mode);
4259         nlookup_done(&nd);
4260         return (error);
4261 }
4262
4263 /*
4264  * mkdirat_args(int fd, char *path, mode_t mode)
4265  *
4266  * Make a directory file.  The path is relative to the directory associated
4267  * with fd.
4268  */
4269 int
4270 sys_mkdirat(struct mkdirat_args *uap)
4271 {
4272         struct nlookupdata nd;
4273         struct file *fp;
4274         int error;
4275
4276         error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
4277         if (error == 0)
4278                 error = kern_mkdir(&nd, uap->mode);
4279         nlookup_done_at(&nd, fp);
4280         return (error);
4281 }
4282
4283 int
4284 kern_rmdir(struct nlookupdata *nd)
4285 {
4286         int error;
4287
4288         bwillinode(1);
4289         nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
4290         if ((error = nlookup(nd)) != 0)
4291                 return (error);
4292
4293         /*
4294          * Do not allow directories representing mount points to be
4295          * deleted, even if empty.  Check write perms on mount point
4296          * in case the vnode is aliased (aka nullfs).
4297          */
4298         if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
4299                 return (EBUSY);
4300         if ((error = ncp_writechk(&nd->nl_nch)) != 0)
4301                 return (error);
4302         error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
4303         return (error);
4304 }
4305
4306 /*
4307  * rmdir_args(char *path)
4308  *
4309  * Remove a directory file.
4310  */
4311 int
4312 sys_rmdir(struct rmdir_args *uap)
4313 {
4314         struct nlookupdata nd;
4315         int error;
4316
4317         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4318         if (error == 0)
4319                 error = kern_rmdir(&nd);
4320         nlookup_done(&nd);
4321         return (error);
4322 }
4323
4324 int
4325 kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
4326                    enum uio_seg direction)
4327 {
4328         struct thread *td = curthread;
4329         struct proc *p = td->td_proc;
4330         struct vnode *vp;
4331         struct file *fp;
4332         struct uio auio;
4333         struct iovec aiov;
4334         off_t loff;
4335         int error, eofflag;
4336
4337         if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
4338                 return (error);
4339         if ((fp->f_flag & FREAD) == 0) {
4340                 error = EBADF;
4341                 goto done;
4342         }
4343         vp = (struct vnode *)fp->f_data;
4344         if (vp->v_type != VDIR) {
4345                 error = EINVAL;
4346                 goto done;
4347         }
4348         aiov.iov_base = buf;
4349         aiov.iov_len = count;
4350         auio.uio_iov = &aiov;
4351         auio.uio_iovcnt = 1;
4352         auio.uio_rw = UIO_READ;
4353         auio.uio_segflg = direction;
4354         auio.uio_td = td;
4355         auio.uio_resid = count;
4356         loff = auio.uio_offset = fp->f_offset;
4357         error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
4358         fp->f_offset = auio.uio_offset;
4359         if (error)
4360                 goto done;
4361
4362         /*
4363          * WARNING!  *basep may not be wide enough to accomodate the
4364          * seek offset.   XXX should we hack this to return the upper 32 bits
4365          * for offsets greater then 4G?
4366          */
4367         if (basep) {
4368                 *basep = (long)loff;
4369         }
4370         *res = count - auio.uio_resid;
4371 done:
4372         fdrop(fp);
4373         return (error);
4374 }
4375
4376 /*
4377  * getdirentries_args(int fd, char *buf, u_int conut, long *basep)
4378  *
4379  * Read a block of directory entries in a file system independent format.
4380  */
4381 int
4382 sys_getdirentries(struct getdirentries_args *uap)
4383 {
4384         long base;
4385         int error;
4386
4387         error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
4388                                    &uap->sysmsg_result, UIO_USERSPACE);
4389
4390         if (error == 0 && uap->basep)
4391                 error = copyout(&base, uap->basep, sizeof(*uap->basep));
4392         return (error);
4393 }
4394
4395 /*
4396  * getdents_args(int fd, char *buf, size_t count)
4397  */
4398 int
4399 sys_getdents(struct getdents_args *uap)
4400 {
4401         int error;
4402
4403         error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
4404                                    &uap->sysmsg_result, UIO_USERSPACE);
4405
4406         return (error);
4407 }
4408
4409 /*
4410  * Set the mode mask for creation of filesystem nodes.
4411  *
4412  * umask(int newmask)
4413  */
4414 int
4415 sys_umask(struct umask_args *uap)
4416 {
4417         struct thread *td = curthread;
4418         struct proc *p = td->td_proc;
4419         struct filedesc *fdp;
4420
4421         fdp = p->p_fd;
4422         uap->sysmsg_result = fdp->fd_cmask;
4423         fdp->fd_cmask = uap->newmask & ALLPERMS;
4424         return (0);
4425 }
4426
4427 /*
4428  * revoke(char *path)
4429  *
4430  * Void all references to file by ripping underlying filesystem
4431  * away from vnode.
4432  */
4433 int
4434 sys_revoke(struct revoke_args *uap)
4435 {
4436         struct nlookupdata nd;
4437         struct vattr vattr;
4438         struct vnode *vp;
4439         struct ucred *cred;
4440         int error;
4441
4442         vp = NULL;
4443         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4444         if (error == 0)
4445                 error = nlookup(&nd);
4446         if (error == 0)
4447                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4448         cred = crhold(nd.nl_cred);
4449         nlookup_done(&nd);
4450         if (error == 0) {
4451                 if (error == 0)
4452                         error = VOP_GETATTR(vp, &vattr);
4453                 if (error == 0 && cred->cr_uid != vattr.va_uid)
4454                         error = priv_check_cred(cred, PRIV_VFS_REVOKE, 0);
4455                 if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
4456                         if (vcount(vp) > 0)
4457                                 error = vrevoke(vp, cred);
4458                 } else if (error == 0) {
4459                         error = vrevoke(vp, cred);
4460                 }
4461                 vrele(vp);
4462         }
4463         if (cred)
4464                 crfree(cred);
4465         return (error);
4466 }
4467
4468 /*
4469  * getfh_args(char *fname, fhandle_t *fhp)
4470  *
4471  * Get (NFS) file handle
4472  *
4473  * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4474  * mount.  This allows nullfs mounts to be explicitly exported.
4475  *
4476  * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4477  *
4478  *          nullfs mounts of subdirectories are not safe.  That is, it will
4479  *          work, but you do not really have protection against access to
4480  *          the related parent directories.
4481  */
4482 int
4483 sys_getfh(struct getfh_args *uap)
4484 {
4485         struct thread *td = curthread;
4486         struct nlookupdata nd;
4487         fhandle_t fh;
4488         struct vnode *vp;
4489         struct mount *mp;
4490         int error;
4491
4492         /*
4493          * Must be super user
4494          */
4495         if ((error = priv_check(td, PRIV_ROOT)) != 0)
4496                 return (error);
4497
4498         vp = NULL;
4499         error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
4500         if (error == 0)
4501                 error = nlookup(&nd);
4502         if (error == 0)
4503                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4504         mp = nd.nl_nch.mount;
4505         nlookup_done(&nd);
4506         if (error == 0) {
4507                 bzero(&fh, sizeof(fh));
4508                 fh.fh_fsid = mp->mnt_stat.f_fsid;
4509                 error = VFS_VPTOFH(vp, &fh.fh_fid);
4510                 vput(vp);
4511                 if (error == 0)
4512                         error = copyout(&fh, uap->fhp, sizeof(fh));
4513         }
4514         return (error);
4515 }
4516
4517 /*
4518  * fhopen_args(const struct fhandle *u_fhp, int flags)
4519  *
4520  * syscall for the rpc.lockd to use to translate a NFS file handle into
4521  * an open descriptor.
4522  *
4523  * warning: do not remove the priv_check() call or this becomes one giant
4524  * security hole.
4525  */
4526 int
4527 sys_fhopen(struct fhopen_args *uap)
4528 {
4529         struct thread *td = curthread;
4530         struct filedesc *fdp = td->td_proc->p_fd;
4531         struct mount *mp;
4532         struct vnode *vp;
4533         struct fhandle fhp;
4534         struct vattr vat;
4535         struct vattr *vap = &vat;
4536         struct flock lf;
4537         int fmode, mode, error = 0, type;
4538         struct file *nfp;
4539         struct file *fp;
4540         int indx;
4541
4542         /*
4543          * Must be super user
4544          */
4545         error = priv_check(td, PRIV_ROOT);
4546         if (error)
4547                 return (error);
4548
4549         fmode = FFLAGS(uap->flags);
4550
4551         /*
4552          * Why not allow a non-read/write open for our lockd?
4553          */
4554         if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4555                 return (EINVAL);
4556         error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4557         if (error)
4558                 return(error);
4559
4560         /*
4561          * Find the mount point
4562          */
4563         mp = vfs_getvfs(&fhp.fh_fsid);
4564         if (mp == NULL) {
4565                 error = ESTALE;
4566                 goto  done;
4567         }
4568         /* now give me my vnode, it gets returned to me locked */
4569         error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
4570         if (error)
4571                 goto done;
4572         /*
4573          * from now on we have to make sure not
4574          * to forget about the vnode
4575          * any error that causes an abort must vput(vp)
4576          * just set error = err and 'goto bad;'.
4577          */
4578
4579         /*
4580          * from vn_open
4581          */
4582         if (vp->v_type == VLNK) {
4583                 error = EMLINK;
4584                 goto bad;
4585         }
4586         if (vp->v_type == VSOCK) {
4587                 error = EOPNOTSUPP;
4588                 goto bad;
4589         }
4590         mode = 0;
4591         if (fmode & (FWRITE | O_TRUNC)) {
4592                 if (vp->v_type == VDIR) {
4593                         error = EISDIR;
4594                         goto bad;
4595                 }
4596                 error = vn_writechk(vp, NULL);
4597                 if (error)
4598                         goto bad;
4599                 mode |= VWRITE;
4600         }
4601         if (fmode & FREAD)
4602                 mode |= VREAD;
4603         if (mode) {
4604                 error = VOP_ACCESS(vp, mode, td->td_ucred);
4605                 if (error)
4606                         goto bad;
4607         }
4608         if (fmode & O_TRUNC) {
4609                 vn_unlock(vp);                          /* XXX */
4610                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
4611                 VATTR_NULL(vap);
4612                 vap->va_size = 0;
4613                 error = VOP_SETATTR(vp, vap, td->td_ucred);
4614                 if (error)
4615                         goto bad;
4616         }
4617
4618         /*
4619          * VOP_OPEN needs the file pointer so it can potentially override
4620          * it.
4621          *
4622          * WARNING! no f_nchandle will be associated when fhopen()ing a
4623          * directory.  XXX
4624          */
4625         if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
4626                 goto bad;
4627         fp = nfp;
4628
4629         error = VOP_OPEN(vp, fmode, td->td_ucred, fp);
4630         if (error) {
4631                 /*
4632                  * setting f_ops this way prevents VOP_CLOSE from being
4633                  * called or fdrop() releasing the vp from v_data.   Since
4634                  * the VOP_OPEN failed we don't want to VOP_CLOSE.
4635                  */
4636                 fp->f_ops = &badfileops;
4637                 fp->f_data = NULL;
4638                 goto bad_drop;
4639         }
4640
4641         /*
4642          * The fp is given its own reference, we still have our ref and lock.
4643          *
4644          * Assert that all regular files must be created with a VM object.
4645          */
4646         if (vp->v_type == VREG && vp->v_object == NULL) {
4647                 kprintf("fhopen: regular file did not have VM object: %p\n", vp);
4648                 goto bad_drop;
4649         }
4650
4651         /*
4652          * The open was successful.  Handle any locking requirements.
4653          */
4654         if (fmode & (O_EXLOCK | O_SHLOCK)) {
4655                 lf.l_whence = SEEK_SET;
4656                 lf.l_start = 0;
4657                 lf.l_len = 0;
4658                 if (fmode & O_EXLOCK)
4659                         lf.l_type = F_WRLCK;
4660                 else
4661                         lf.l_type = F_RDLCK;
4662                 if (fmode & FNONBLOCK)
4663                         type = 0;
4664                 else
4665                         type = F_WAIT;
4666                 vn_unlock(vp);
4667                 if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
4668                         /*
4669                          * release our private reference.
4670                          */
4671                         fsetfd(fdp, NULL, indx);
4672                         fdrop(fp);
4673                         vrele(vp);
4674                         goto done;
4675                 }
4676                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4677                 atomic_set_int(&fp->f_flag, FHASLOCK);  /* race ok */
4678         }
4679
4680         /*
4681          * Clean up.  Associate the file pointer with the previously
4682          * reserved descriptor and return it.
4683          */
4684         vput(vp);
4685         if (uap->flags & O_CLOEXEC)
4686                 fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
4687         fsetfd(fdp, fp, indx);
4688         fdrop(fp);
4689         uap->sysmsg_result = indx;
4690         return (error);
4691
4692 bad_drop:
4693         fsetfd(fdp, NULL, indx);
4694         fdrop(fp);
4695 bad:
4696         vput(vp);
4697 done:
4698         return (error);
4699 }
4700
4701 /*
4702  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4703  */
4704 int
4705 sys_fhstat(struct fhstat_args *uap)
4706 {
4707         struct thread *td = curthread;
4708         struct stat sb;
4709         fhandle_t fh;
4710         struct mount *mp;
4711         struct vnode *vp;
4712         int error;
4713
4714         /*
4715          * Must be super user
4716          */
4717         error = priv_check(td, PRIV_ROOT);
4718         if (error)
4719                 return (error);
4720
4721         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4722         if (error)
4723                 return (error);
4724
4725         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
4726                 error = ESTALE;
4727         if (error == 0) {
4728                 if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
4729                         error = vn_stat(vp, &sb, td->td_ucred);
4730                         vput(vp);
4731                 }
4732         }
4733         if (error == 0)
4734                 error = copyout(&sb, uap->sb, sizeof(sb));
4735         return (error);
4736 }
4737
4738 /*
4739  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
4740  */
4741 int
4742 sys_fhstatfs(struct fhstatfs_args *uap)
4743 {
4744         struct thread *td = curthread;
4745         struct proc *p = td->td_proc;
4746         struct statfs *sp;
4747         struct mount *mp;
4748         struct vnode *vp;
4749         struct statfs sb;
4750         char *fullpath, *freepath;
4751         fhandle_t fh;
4752         int error;
4753
4754         /*
4755          * Must be super user
4756          */
4757         if ((error = priv_check(td, PRIV_ROOT)))
4758                 return (error);
4759
4760         if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
4761                 return (error);
4762
4763         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
4764                 error = ESTALE;
4765                 goto done;
4766         }
4767         if (p != NULL && !chroot_visible_mnt(mp, p)) {
4768                 error = ESTALE;
4769                 goto done;
4770         }
4771
4772         if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
4773                 goto done;
4774         mp = vp->v_mount;
4775         sp = &mp->mnt_stat;
4776         vput(vp);
4777         if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
4778                 goto done;
4779
4780         error = mount_path(p, mp, &fullpath, &freepath);
4781         if (error)
4782                 goto done;
4783         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4784         strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
4785         kfree(freepath, M_TEMP);
4786
4787         sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4788         if (priv_check(td, PRIV_ROOT)) {
4789                 bcopy(sp, &sb, sizeof(sb));
4790                 sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
4791                 sp = &sb;
4792         }
4793         error = copyout(sp, uap->buf, sizeof(*sp));
4794 done:
4795         return (error);
4796 }
4797
4798 /*
4799  * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
4800  */
4801 int
4802 sys_fhstatvfs(struct fhstatvfs_args *uap)
4803 {
4804         struct thread *td = curthread;
4805         struct proc *p = td->td_proc;
4806         struct statvfs *sp;
4807         struct mount *mp;
4808         struct vnode *vp;
4809         fhandle_t fh;
4810         int error;
4811
4812         /*
4813          * Must be super user
4814          */
4815         if ((error = priv_check(td, PRIV_ROOT)))
4816                 return (error);
4817
4818         if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
4819                 return (error);
4820
4821         if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
4822                 error = ESTALE;
4823                 goto done;
4824         }
4825         if (p != NULL && !chroot_visible_mnt(mp, p)) {
4826                 error = ESTALE;
4827                 goto done;
4828         }
4829
4830         if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
4831                 goto done;
4832         mp = vp->v_mount;
4833         sp = &mp->mnt_vstat;
4834         vput(vp);
4835         if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
4836                 goto done;
4837
4838         sp->f_flag = 0;
4839         if (mp->mnt_flag & MNT_RDONLY)
4840                 sp->f_flag |= ST_RDONLY;
4841         if (mp->mnt_flag & MNT_NOSUID)
4842                 sp->f_flag |= ST_NOSUID;
4843         error = copyout(sp, uap->buf, sizeof(*sp));
4844 done:
4845         return (error);
4846 }
4847
4848
4849 /*
4850  * Syscall to push extended attribute configuration information into the
4851  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
4852  * a command (int cmd), and attribute name and misc data.  For now, the
4853  * attribute name is left in userspace for consumption by the VFS_op.
4854  * It will probably be changed to be copied into sysspace by the
4855  * syscall in the future, once issues with various consumers of the
4856  * attribute code have raised their hands.
4857  *
4858  * Currently this is used only by UFS Extended Attributes.
4859  */
4860 int
4861 sys_extattrctl(struct extattrctl_args *uap)
4862 {
4863         struct nlookupdata nd;
4864         struct vnode *vp;
4865         char attrname[EXTATTR_MAXNAMELEN];
4866         int error;
4867         size_t size;
4868
4869         attrname[0] = 0;
4870         vp = NULL;
4871         error = 0;
4872
4873         if (error == 0 && uap->filename) {
4874                 error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
4875                                      NLC_FOLLOW);
4876                 if (error == 0)
4877                         error = nlookup(&nd);
4878                 if (error == 0)
4879                         error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4880                 nlookup_done(&nd);
4881         }
4882
4883         if (error == 0 && uap->attrname) {
4884                 error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
4885                                   &size);
4886         }
4887
4888         if (error == 0) {
4889                 error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4890                 if (error == 0)
4891                         error = nlookup(&nd);
4892                 if (error == 0)
4893                         error = ncp_writechk(&nd.nl_nch);
4894                 if (error == 0) {
4895                         error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
4896                                                uap->attrnamespace,
4897                                                uap->attrname, nd.nl_cred);
4898                 }
4899                 nlookup_done(&nd);
4900         }
4901
4902         return (error);
4903 }
4904
4905 /*
4906  * Syscall to get a named extended attribute on a file or directory.
4907  */
4908 int
4909 sys_extattr_set_file(struct extattr_set_file_args *uap)
4910 {
4911         char attrname[EXTATTR_MAXNAMELEN];
4912         struct nlookupdata nd;
4913         struct vnode *vp;
4914         struct uio auio;
4915         struct iovec aiov;
4916         int error;
4917
4918         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4919         if (error)
4920                 return (error);
4921
4922         vp = NULL;
4923
4924         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4925         if (error == 0)
4926                 error = nlookup(&nd);
4927         if (error == 0)
4928                 error = ncp_writechk(&nd.nl_nch);
4929         if (error == 0)
4930                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4931         if (error) {
4932                 nlookup_done(&nd);
4933                 return (error);
4934         }
4935
4936         bzero(&auio, sizeof(auio));
4937         aiov.iov_base = uap->data;
4938         aiov.iov_len = uap->nbytes;
4939         auio.uio_iov = &aiov;
4940         auio.uio_iovcnt = 1;
4941         auio.uio_offset = 0;
4942         auio.uio_resid = uap->nbytes;
4943         auio.uio_rw = UIO_WRITE;
4944         auio.uio_td = curthread;
4945
4946         error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
4947                                &auio, nd.nl_cred);
4948
4949         vput(vp);
4950         nlookup_done(&nd);
4951         return (error);
4952 }
4953
4954 /*
4955  * Syscall to get a named extended attribute on a file or directory.
4956  */
4957 int
4958 sys_extattr_get_file(struct extattr_get_file_args *uap)
4959 {
4960         char attrname[EXTATTR_MAXNAMELEN];
4961         struct nlookupdata nd;
4962         struct uio auio;
4963         struct iovec aiov;
4964         struct vnode *vp;
4965         int error;
4966
4967         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4968         if (error)
4969                 return (error);
4970
4971         vp = NULL;
4972
4973         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4974         if (error == 0)
4975                 error = nlookup(&nd);
4976         if (error == 0)
4977                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_SHARED, &vp);
4978         if (error) {
4979                 nlookup_done(&nd);
4980                 return (error);
4981         }
4982
4983         bzero(&auio, sizeof(auio));
4984         aiov.iov_base = uap->data;
4985         aiov.iov_len = uap->nbytes;
4986         auio.uio_iov = &aiov;
4987         auio.uio_iovcnt = 1;
4988         auio.uio_offset = 0;
4989         auio.uio_resid = uap->nbytes;
4990         auio.uio_rw = UIO_READ;
4991         auio.uio_td = curthread;
4992
4993         error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
4994                                 &auio, nd.nl_cred);
4995         uap->sysmsg_result = uap->nbytes - auio.uio_resid;
4996
4997         vput(vp);
4998         nlookup_done(&nd);
4999         return(error);
5000 }
5001
5002 /*
5003  * Syscall to delete a named extended attribute from a file or directory.
5004  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
5005  */
5006 int
5007 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
5008 {
5009         char attrname[EXTATTR_MAXNAMELEN];
5010         struct nlookupdata nd;
5011         struct vnode *vp;
5012         int error;
5013
5014         error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5015         if (error)
5016                 return(error);
5017
5018         error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5019         if (error == 0)
5020                 error = nlookup(&nd);
5021         if (error == 0)
5022                 error = ncp_writechk(&nd.nl_nch);
5023         if (error == 0) {
5024                 error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5025                 if (error == 0) {
5026                         error = VOP_SETEXTATTR(vp, uap->attrnamespace,
5027                                                attrname, NULL, nd.nl_cred);
5028                         vput(vp);
5029                 }
5030         }
5031         nlookup_done(&nd);
5032         return(error);
5033 }
5034
5035 /*
5036  * Determine if the mount is visible to the process.
5037  */
5038 static int
5039 chroot_visible_mnt(struct mount *mp, struct proc *p)
5040 {
5041         struct nchandle nch;
5042
5043         /*
5044          * Traverse from the mount point upwards.  If we hit the process
5045          * root then the mount point is visible to the process.
5046          */
5047         nch = mp->mnt_ncmountpt;
5048         while (nch.ncp) {
5049                 if (nch.mount == p->p_fd->fd_nrdir.mount &&
5050                     nch.ncp == p->p_fd->fd_nrdir.ncp) {
5051                         return(1);
5052                 }
5053                 if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
5054                         nch = nch.mount->mnt_ncmounton;
5055                 } else {
5056                         nch.ncp = nch.ncp->nc_parent;
5057                 }
5058         }
5059
5060         /*
5061          * If the mount point is not visible to the process, but the
5062          * process root is in a subdirectory of the mount, return
5063          * TRUE anyway.
5064          */
5065         if (p->p_fd->fd_nrdir.mount == mp)
5066                 return(1);
5067
5068         return(0);
5069 }
5070