sys/vfs/procfs/procfs_vnops.c

   1 /*
   2  * Copyright (c) 1993, 1995 Jan-Simon Pendry
   3  * Copyright (c) 1993, 1995
   4  *      The Regents of the University of California.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * Jan-Simon Pendry.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 3. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      @(#)procfs_vnops.c      8.18 (Berkeley) 5/21/95
  34  *
  35  * $FreeBSD: src/sys/miscfs/procfs/procfs_vnops.c,v 1.76.2.7 2002/01/22 17:22:59 nectar Exp $
  36  */
  37
  38 /*
  39  * procfs vnode interface
  40  */
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/time.h>
  45 #include <sys/kernel.h>
  46 #include <sys/lock.h>
  47 #include <sys/fcntl.h>
  48 #include <sys/proc.h>
  49 #include <sys/priv.h>
  50 #include <sys/signalvar.h>
  51 #include <sys/vnode.h>
  52 #include <sys/uio.h>
  53 #include <sys/mount.h>
  54 #include <sys/namei.h>
  55 #include <sys/dirent.h>
  56 #include <sys/malloc.h>
  57 #include <sys/reg.h>
  58 #include <vm/vm_zone.h>
  59 #include <vfs/procfs/procfs.h>
  60 #include <sys/pioctl.h>
  61
  62 #include <sys/spinlock2.h>
  63
  64 #include <machine/limits.h>
  65
  66 static int      procfs_access (struct vop_access_args *);
  67 static int      procfs_badop (struct vop_generic_args *);
  68 static int      procfs_bmap (struct vop_bmap_args *);
  69 static int      procfs_close (struct vop_close_args *);
  70 static int      procfs_getattr (struct vop_getattr_args *);
  71 static int      procfs_inactive (struct vop_inactive_args *);
  72 static int      procfs_ioctl (struct vop_ioctl_args *);
  73 static int      procfs_lookup (struct vop_old_lookup_args *);
  74 static int      procfs_open (struct vop_open_args *);
  75 static int      procfs_print (struct vop_print_args *);
  76 static int      procfs_readdir (struct vop_readdir_args *);
  77 static int      procfs_readlink (struct vop_readlink_args *);
  78 static int      procfs_reclaim (struct vop_reclaim_args *);
  79 static int      procfs_setattr (struct vop_setattr_args *);
  80
  81 static int      procfs_readdir_proc(struct vop_readdir_args *);
  82 static int      procfs_readdir_root(struct vop_readdir_args *);
  83
  84 /*
  85  * procfs vnode operations.
  86  */
  87 struct vop_ops procfs_vnode_vops = {
  88         .vop_default =          vop_defaultop,
  89         .vop_access =           procfs_access,
  90         .vop_advlock =          (void *)procfs_badop,
  91         .vop_bmap =             procfs_bmap,
  92         .vop_close =            procfs_close,
  93         .vop_old_create =       (void *)procfs_badop,
  94         .vop_getattr =          procfs_getattr,
  95         .vop_inactive =         procfs_inactive,
  96         .vop_old_link =         (void *)procfs_badop,
  97         .vop_old_lookup =       procfs_lookup,
  98         .vop_old_mkdir =        (void *)procfs_badop,
  99         .vop_old_mknod =        (void *)procfs_badop,
 100         .vop_open =             procfs_open,
 101         .vop_pathconf =         vop_stdpathconf,
 102         .vop_print =            procfs_print,
 103         .vop_read =             procfs_rw,
 104         .vop_readdir =          procfs_readdir,
 105         .vop_readlink =         procfs_readlink,
 106         .vop_reclaim =          procfs_reclaim,
 107         .vop_old_remove =       (void *)procfs_badop,
 108         .vop_old_rename =       (void *)procfs_badop,
 109         .vop_old_rmdir =        (void *)procfs_badop,
 110         .vop_setattr =          procfs_setattr,
 111         .vop_old_symlink =      (void *)procfs_badop,
 112         .vop_write =            (void *)procfs_rw,
 113         .vop_ioctl =            procfs_ioctl
 114 };
 115
 116
 117 /*
 118  * This is a list of the valid names in the
 119  * process-specific sub-directories.  It is
 120  * used in procfs_lookup and procfs_readdir
 121  */
 122 static struct proc_target {
 123         u_char  pt_type;
 124         u_char  pt_namlen;
 125         char    *pt_name;
 126         pfstype pt_pfstype;
 127         int     (*pt_valid) (struct lwp *p);
 128 } proc_targets[] = {
 129 #define N(s) sizeof(s)-1, s
 130         /*        name          type            validp */
 131         { DT_DIR, N("."),       Pproc,          NULL },
 132         { DT_DIR, N(".."),      Proot,          NULL },
 133         { DT_REG, N("mem"),     Pmem,           NULL },
 134         { DT_REG, N("regs"),    Pregs,          procfs_validregs },
 135         { DT_REG, N("fpregs"),  Pfpregs,        procfs_validfpregs },
 136         { DT_REG, N("dbregs"),  Pdbregs,        procfs_validdbregs },
 137         { DT_REG, N("ctl"),     Pctl,           NULL },
 138         { DT_REG, N("status"),  Pstatus,        NULL },
 139         { DT_REG, N("note"),    Pnote,          NULL },
 140         { DT_REG, N("notepg"),  Pnotepg,        NULL },
 141         { DT_REG, N("map"),     Pmap,           procfs_validmap },
 142         { DT_REG, N("etype"),   Ptype,          procfs_validtype },
 143         { DT_REG, N("cmdline"), Pcmdline,       NULL },
 144         { DT_REG, N("rlimit"),  Prlimit,        NULL },
 145         { DT_LNK, N("file"),    Pfile,          NULL },
 146 #undef N
 147 };
 148 static const int nproc_targets = NELEM(proc_targets);
 149
 150 static pid_t atopid (const char *, u_int);
 151
 152 /*
 153  * set things up for doing i/o on
 154  * the pfsnode (vp).  (vp) is locked
 155  * on entry, and should be left locked
 156  * on exit.
 157  *
 158  * for procfs we don't need to do anything
 159  * in particular for i/o.  all that is done
 160  * is to support exclusive open on process
 161  * memory images.
 162  *
 163  * procfs_open(struct vnode *a_vp, int a_mode, struct ucred *a_cred,
 164  *             struct file *a_fp)
 165  */
 166 static int
 167 procfs_open(struct vop_open_args *ap)
 168 {
 169         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 170         struct proc *p1, *p2;
 171         int error;
 172
 173         p2 = pfs_pfind(pfs->pfs_pid);
 174         if (p2 == NULL)
 175                 return (ENOENT);
 176         if (pfs->pfs_pid && !PRISON_CHECK(ap->a_cred, p2->p_ucred)) {
 177                 error = ENOENT;
 178                 goto done;
 179         }
 180
 181         switch (pfs->pfs_type) {
 182         case Pmem:
 183                 if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) ||
 184                     ((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) {
 185                         error = EBUSY;
 186                         goto done;
 187                 }
 188
 189                 p1 = curproc;
 190                 KKASSERT(p1);
 191                 /* Can't trace a process that's currently exec'ing. */
 192                 if ((p2->p_flags & P_INEXEC) != 0) {
 193                         error = EAGAIN;
 194                         goto done;
 195                 }
 196                 if (!CHECKIO(p1, p2) || p_trespass(ap->a_cred, p2->p_ucred)) {
 197                         error = EPERM;
 198                         goto done;
 199                 }
 200
 201                 if (ap->a_mode & FWRITE)
 202                         pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL);
 203
 204                 break;
 205
 206         default:
 207                 break;
 208         }
 209         error = vop_stdopen(ap);
 210 done:
 211         pfs_pdone(p2);
 212         return error;
 213 }
 214
 215 /*
 216  * close the pfsnode (vp) after doing i/o.
 217  * (vp) is not locked on entry or exit.
 218  *
 219  * nothing to do for procfs other than undo
 220  * any exclusive open flag (see _open above).
 221  *
 222  * procfs_close(struct vnode *a_vp, int a_fflag, struct ucred *a_cred)
 223  */
 224 static int
 225 procfs_close(struct vop_close_args *ap)
 226 {
 227         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 228         struct proc *p;
 229
 230         /*
 231          * Make sure the lock is exclusive for opencount tests
 232          */
 233         vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY);
 234
 235         switch (pfs->pfs_type) {
 236         case Pmem:
 237                 if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL))
 238                         pfs->pfs_flags &= ~(FWRITE|O_EXCL);
 239                 /*
 240                  * v_opencount determines the last real close on the vnode.
 241                  *
 242                  * If this is the last close, then it checks to see if
 243                  * the target process has PF_LINGER set in p_pfsflags,
 244                  * if this is *not* the case, then the process' stop flags
 245                  * are cleared, and the process is woken up.  This is
 246                  * to help prevent the case where a process has been
 247                  * told to stop on an event, but then the requesting process
 248                  * has gone away or forgotten about it.
 249                  */
 250                 p = NULL;
 251                 if ((ap->a_vp->v_opencount < 2)
 252                     && ((p = pfs_pfind(pfs->pfs_pid)) != NULL ||
 253                         (p = pfs_zpfind(pfs->pfs_pid)) != NULL)
 254                     && !(p->p_pfsflags & PF_LINGER)) {
 255                         spin_lock(&p->p_spin);
 256                         p->p_stops = 0;
 257                         p->p_step = 0;
 258                         spin_unlock(&p->p_spin);
 259                         wakeup(&p->p_stype);
 260                         wakeup(&p->p_step);
 261                 }
 262                 pfs_pdone(p);
 263                 break;
 264         default:
 265                 break;
 266         }
 267
 268         return (vop_stdclose(ap));
 269 }
 270
 271 /*
 272  * do an ioctl operation on a pfsnode (vp).
 273  * (vp) is not locked on entry or exit.
 274  */
 275 static int
 276 procfs_ioctl(struct vop_ioctl_args *ap)
 277 {
 278         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 279         struct proc *procp;
 280         struct proc *p;
 281         int error;
 282         int signo;
 283         struct procfs_status *psp;
 284         unsigned char flags;
 285
 286         procp = pfs_pfind(pfs->pfs_pid);
 287         if (procp == NULL)
 288                 return ENOTTY;
 289         p = curproc;
 290         if (p == NULL) {
 291                 error = EINVAL;
 292                 goto done;
 293         }
 294
 295         /* Can't trace a process that's currently exec'ing. */
 296         if ((procp->p_flags & P_INEXEC) != 0) {
 297                 error = EAGAIN;
 298                 goto done;
 299         }
 300         if (!CHECKIO(p, procp) || p_trespass(ap->a_cred, procp->p_ucred)) {
 301                 error = EPERM;
 302                 goto done;
 303         }
 304
 305         switch (ap->a_command) {
 306         case PIOCBIS:
 307           spin_lock(&procp->p_spin);
 308           procp->p_stops |= *(unsigned int*)ap->a_data;
 309           spin_unlock(&procp->p_spin);
 310           break;
 311         case PIOCBIC:
 312           spin_lock(&procp->p_spin);
 313           procp->p_stops &= ~*(unsigned int*)ap->a_data;
 314           spin_unlock(&procp->p_spin);
 315           break;
 316         case PIOCSFL:
 317           /*
 318            * NFLAGS is "non-suser_xxx flags" -- currently, only
 319            * PFS_ISUGID ("ignore set u/g id");
 320            */
 321 #define NFLAGS  (PF_ISUGID)
 322           flags = (unsigned char)*(unsigned int*)ap->a_data;
 323           if (flags & NFLAGS && (error = priv_check_cred(ap->a_cred, PRIV_ROOT, 0)))
 324             goto done;
 325           procp->p_pfsflags = flags;
 326           break;
 327         case PIOCGFL:
 328           *(unsigned int*)ap->a_data = (unsigned int)procp->p_pfsflags;
 329           break;
 330         case PIOCSTATUS:
 331           /*
 332            * NOTE: syscall entry deals with stopevents and may run without
 333            *       the MP lock.
 334            */
 335           psp = (struct procfs_status *)ap->a_data;
 336           psp->flags = procp->p_pfsflags;
 337           psp->events = procp->p_stops;
 338           spin_lock(&procp->p_spin);
 339           if (procp->p_step) {
 340             psp->state = 0;
 341             psp->why = procp->p_stype;
 342             psp->val = procp->p_xstat;
 343             spin_unlock(&procp->p_spin);
 344           } else {
 345             psp->state = 1;
 346             spin_unlock(&procp->p_spin);
 347             psp->why = 0;       /* Not defined values */
 348             psp->val = 0;       /* Not defined values */
 349           }
 350           break;
 351         case PIOCWAIT:
 352           /*
 353            * NOTE: syscall entry deals with stopevents and may run without
 354            *       the MP lock.
 355            */
 356           psp = (struct procfs_status *)ap->a_data;
 357           spin_lock(&procp->p_spin);
 358           while (procp->p_step == 0) {
 359             tsleep_interlock(&procp->p_stype, PCATCH);
 360             spin_unlock(&procp->p_spin);
 361             if (procp->p_stops == 0) {
 362                 error = 0;
 363                 goto done;
 364             }
 365             if (procp->p_flags & P_POSTEXIT) {
 366                 error = EINVAL;
 367                 goto done;
 368             }
 369             if (procp->p_flags & P_INEXEC) {
 370                 error = EAGAIN;
 371                 goto done;
 372             }
 373             error = tsleep(&procp->p_stype, PCATCH | PINTERLOCKED,
 374                            "piocwait", 0);
 375             if (error)
 376               goto done;
 377             spin_lock(&procp->p_spin);
 378           }
 379           spin_unlock(&procp->p_spin);
 380           psp->state = 1;       /* It stopped */
 381           psp->flags = procp->p_pfsflags;
 382           psp->events = procp->p_stops;
 383           psp->why = procp->p_stype;    /* why it stopped */
 384           psp->val = procp->p_xstat;    /* any extra info */
 385           break;
 386         case PIOCCONT:  /* Restart a proc */
 387           /*
 388            * NOTE: syscall entry deals with stopevents and may run without
 389            *       the MP lock.  However, the caller is presumably interlocked
 390            *       by having waited.
 391            */
 392           if (procp->p_step == 0) {
 393             error = EINVAL;     /* Can only start a stopped process */
 394             goto done;
 395           }
 396           if ((signo = *(int*)ap->a_data) != 0) {
 397             if (signo >= NSIG || signo <= 0) {
 398               error = EINVAL;
 399               goto done;
 400             }
 401             ksignal(procp, signo);
 402           }
 403           procp->p_step = 0;
 404           wakeup(&procp->p_step);
 405           break;
 406         default:
 407           error = ENOTTY;
 408           goto done;
 409         }
 410         error = 0;
 411 done:
 412         pfs_pdone(procp);
 413         return error;
 414 }
 415
 416 /*
 417  * do block mapping for pfsnode (vp).
 418  * since we don't use the buffer cache
 419  * for procfs this function should never
 420  * be called.  in any case, it's not clear
 421  * what part of the kernel ever makes use
 422  * of this function.  for sanity, this is the
 423  * usual no-op bmap, although returning
 424  * (EIO) would be a reasonable alternative.
 425  *
 426  * XXX mmap assumes buffer cache operation
 427  *
 428  * procfs_bmap(struct vnode *a_vp, off_t a_loffset,
 429  *              off_t *a_doffsetp, int *a_runp, int *a_runb)
 430  */
 431 static int
 432 procfs_bmap(struct vop_bmap_args *ap)
 433 {
 434         if (ap->a_doffsetp != NULL)
 435                 *ap->a_doffsetp = ap->a_loffset;
 436         if (ap->a_runp != NULL)
 437                 *ap->a_runp = 0;
 438         if (ap->a_runb != NULL)
 439                 *ap->a_runb = 0;
 440         return (0);
 441 }
 442
 443 /*
 444  * procfs_inactive is called when the pfsnode
 445  * is vrele'd and the reference count goes
 446  * to zero.  (vp) will be on the vnode free
 447  * list, so to get it back vget() must be
 448  * used.
 449  *
 450  * (vp) is locked on entry, but must be unlocked on exit.
 451  *
 452  * procfs_inactive(struct vnode *a_vp)
 453  */
 454 static int
 455 procfs_inactive(struct vop_inactive_args *ap)
 456 {
 457         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 458
 459         if (pfs->pfs_pid & PFS_DEAD)
 460                 vrecycle(ap->a_vp);
 461         return (0);
 462 }
 463
 464 /*
 465  * _reclaim is called when getnewvnode()
 466  * wants to make use of an entry on the vnode
 467  * free list.  at this time the filesystem needs
 468  * to free any private data and remove the node
 469  * from any private lists.
 470  *
 471  * procfs_reclaim(struct vnode *a_vp)
 472  */
 473 static int
 474 procfs_reclaim(struct vop_reclaim_args *ap)
 475 {
 476         return (procfs_freevp(ap->a_vp));
 477 }
 478
 479 /*
 480  * _print is used for debugging.
 481  * just print a readable description
 482  * of (vp).
 483  *
 484  * procfs_print(struct vnode *a_vp)
 485  */
 486 static int
 487 procfs_print(struct vop_print_args *ap)
 488 {
 489         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 490
 491         kprintf("tag VT_PROCFS, type %d, pid %ld, mode %x, flags %lx\n",
 492             pfs->pfs_type, (long)pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags);
 493         return (0);
 494 }
 495
 496 /*
 497  * generic entry point for unsupported operations
 498  */
 499 static int
 500 procfs_badop(struct vop_generic_args *ap)
 501 {
 502         return (EIO);
 503 }
 504
 505 /*
 506  * Invent attributes for pfsnode (vp) and store
 507  * them in (vap).
 508  * Directories lengths are returned as zero since
 509  * any real length would require the genuine size
 510  * to be computed, and nothing cares anyway.
 511  *
 512  * this is relatively minimal for procfs.
 513  *
 514  * procfs_getattr(struct vnode *a_vp, struct vattr *a_vap)
 515  */
 516 static int
 517 procfs_getattr(struct vop_getattr_args *ap)
 518 {
 519         struct pfsnode *pfs = VTOPFS(ap->a_vp);
 520         struct vattr *vap = ap->a_vap;
 521         struct proc *procp;
 522         int error;
 523
 524         /*
 525          * First make sure that the process and its credentials
 526          * still exist.
 527          */
 528         switch (pfs->pfs_type) {
 529         case Proot:
 530         case Pcurproc:
 531                 procp = NULL;
 532                 break;
 533         default:
 534                 procp = pfs_pfind(pfs->pfs_pid);
 535                 if (procp == NULL || procp->p_ucred == NULL) {
 536                         error = ENOENT;
 537                         goto done;
 538                 }
 539                 break;
 540         }
 541
 542         error = 0;
 543
 544         /* start by zeroing out the attributes */
 545         VATTR_NULL(vap);
 546
 547         /* next do all the common fields */
 548         vap->va_type = ap->a_vp->v_type;
 549         vap->va_mode = pfs->pfs_mode;
 550         vap->va_fileid = pfs->pfs_fileno;
 551         vap->va_flags = 0;
 552         vap->va_blocksize = PAGE_SIZE;
 553         vap->va_bytes = vap->va_size = 0;
 554         vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 555
 556         /*
 557          * Make all times be current TOD.
 558          * It would be possible to get the process start
 559          * time from the p_stat structure, but there's
 560          * no "file creation" time stamp anyway, and the
 561          * p_stat structure is not addressible if u. gets
 562          * swapped out for that process.
 563          */
 564         nanotime(&vap->va_ctime);
 565         vap->va_atime = vap->va_mtime = vap->va_ctime;
 566
 567         /*
 568          * If the process has exercised some setuid or setgid
 569          * privilege, then rip away read/write permission so
 570          * that only root can gain access.
 571          */
 572         switch (pfs->pfs_type) {
 573         case Pctl:
 574         case Pregs:
 575         case Pfpregs:
 576         case Pdbregs:
 577         case Pmem:
 578                 if (procp->p_flags & P_SUGID) {
 579                         vap->va_mode &= ~((VREAD|VWRITE)|
 580                                           ((VREAD|VWRITE)>>3)|
 581                                           ((VREAD|VWRITE)>>6));
 582                 }
 583                 break;
 584         default:
 585                 break;
 586         }
 587
 588         /*
 589          * now do the object specific fields
 590          *
 591          * The size could be set from struct reg, but it's hardly
 592          * worth the trouble, and it puts some (potentially) machine
 593          * dependent data into this machine-independent code.  If it
 594          * becomes important then this function should break out into
 595          * a per-file stat function in the corresponding .c file.
 596          */
 597
 598         vap->va_nlink = 1;
 599         if (procp) {
 600                 if (procp->p_ucred) {
 601                         vap->va_uid = procp->p_ucred->cr_uid;
 602                         vap->va_gid = procp->p_ucred->cr_gid;
 603                 } else {
 604                         vap->va_uid = -1;
 605                         vap->va_gid = -1;
 606                 }
 607         }
 608
 609         switch (pfs->pfs_type) {
 610         case Proot:
 611                 /*
 612                  * Set nlink to 1 to tell fts(3) we don't actually know.
 613                  */
 614                 vap->va_nlink = 1;
 615                 vap->va_uid = 0;
 616                 vap->va_gid = 0;
 617                 vap->va_size = vap->va_bytes = DEV_BSIZE;
 618                 break;
 619
 620         case Pcurproc: {
 621                 char buf[16];           /* should be enough */
 622
 623                 vap->va_uid = 0;
 624                 vap->va_gid = 0;
 625                 vap->va_size = ksnprintf(buf, sizeof(buf),
 626                                          "%ld", (long)curproc->p_pid);
 627                 vap->va_bytes = vap->va_size;
 628                 break;
 629         }
 630
 631         case Pproc:
 632                 vap->va_nlink = nproc_targets;
 633                 vap->va_size = vap->va_bytes = DEV_BSIZE;
 634                 break;
 635
 636         case Pfile: {
 637                 char *fullpath, *freepath;
 638
 639                 if (procp->p_textnch.ncp) {
 640                         struct nchandle nch;
 641
 642                         cache_copy(&procp->p_textnch, &nch);
 643                         error = cache_fullpath(procp, &nch, NULL,
 644                                                &fullpath, &freepath, 0);
 645                         cache_drop(&nch);
 646                 } else {
 647                         error = EINVAL;
 648                 }
 649
 650                 if (error == 0) {
 651                         vap->va_size = strlen(fullpath);
 652                         kfree(freepath, M_TEMP);
 653                 } else {
 654                         vap->va_size = sizeof("unknown") - 1;
 655                         error = 0;
 656                 }
 657                 vap->va_bytes = vap->va_size;
 658                 break;
 659         }
 660
 661         case Pmem:
 662                 /*
 663                  * If we denied owner access earlier, then we have to
 664                  * change the owner to root - otherwise 'ps' and friends
 665                  * will break even though they are setgid kmem. *SIGH*
 666                  */
 667                 if (procp->p_flags & P_SUGID)
 668                         vap->va_uid = 0;
 669                 else if (procp->p_ucred)
 670                         vap->va_uid = procp->p_ucred->cr_uid;
 671                 else
 672                         vap->va_uid = -1;
 673                 break;
 674
 675         case Pregs:
 676                 vap->va_bytes = vap->va_size = sizeof(struct reg);
 677                 break;
 678
 679         case Pfpregs:
 680                 vap->va_bytes = vap->va_size = sizeof(struct fpreg);
 681                 break;
 682
 683         case Pdbregs:
 684                 vap->va_bytes = vap->va_size = sizeof(struct dbreg);
 685                 break;
 686
 687         case Ptype:
 688         case Pmap:
 689         case Pctl:
 690         case Pstatus:
 691         case Pnote:
 692         case Pnotepg:
 693         case Pcmdline:
 694         case Prlimit:
 695                 break;
 696
 697         default:
 698                 panic("procfs_getattr");
 699         }
 700 done:
 701         pfs_pdone(procp);
 702         return (error);
 703 }
 704
 705 /*
 706  * procfs_setattr(struct vnode *a_vp, struct vattr *a_vap,
 707  *                struct ucred *a_cred)
 708  */
 709 static int
 710 procfs_setattr(struct vop_setattr_args *ap)
 711 {
 712         if (ap->a_vap->va_flags != VNOVAL)
 713                 return (EOPNOTSUPP);
 714
 715         /*
 716          * just fake out attribute setting
 717          * it's not good to generate an error
 718          * return, otherwise things like creat()
 719          * will fail when they try to set the
 720          * file length to 0.  worse, this means
 721          * that echo $note > /proc/$pid/note will fail.
 722          */
 723
 724         return (0);
 725 }
 726
 727 /*
 728  * implement access checking.
 729  *
 730  * procfs_access(struct vnode *a_vp, int a_mode, struct ucred *a_cred)
 731  */
 732 static int
 733 procfs_access(struct vop_access_args *ap)
 734 {
 735         struct vattr vattr;
 736         int error;
 737
 738         error = VOP_GETATTR(ap->a_vp, &vattr);
 739         if (!error)
 740                 error = vop_helper_access(ap, vattr.va_uid, vattr.va_gid,
 741                                 vattr.va_mode, 0);
 742         return (error);
 743 }
 744
 745 /*
 746  * lookup.  this is incredibly complicated in the general case, however
 747  * for most pseudo-filesystems very little needs to be done.
 748  *
 749  * procfs_lookup(struct vnode *a_dvp, struct vnode **a_vpp,
 750  *               struct componentname *a_cnp)
 751  */
 752 static int
 753 procfs_lookup(struct vop_old_lookup_args *ap)
 754 {
 755         struct componentname *cnp = ap->a_cnp;
 756         struct vnode **vpp = ap->a_vpp;
 757         struct vnode *dvp = ap->a_dvp;
 758         char *pname = cnp->cn_nameptr;
 759         /* struct proc *curp = cnp->cn_proc; */
 760         struct proc_target *pt;
 761         pid_t pid;
 762         struct pfsnode *pfs;
 763         struct proc *p;
 764         struct lwp *lp;
 765         int i;
 766         int error;
 767
 768         *vpp = NULL;
 769
 770         if (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)
 771                 return (EROFS);
 772
 773         p = NULL;
 774         error = 0;
 775         if (cnp->cn_namelen == 1 && *pname == '.') {
 776                 *vpp = dvp;
 777                 vref(*vpp);
 778                 goto out;
 779         }
 780
 781         pfs = VTOPFS(dvp);
 782         switch (pfs->pfs_type) {
 783         case Proot:
 784                 if (cnp->cn_flags & CNP_ISDOTDOT)
 785                         return (EIO);
 786
 787                 if (CNEQ(cnp, "curproc", 7)) {
 788                         error = procfs_allocvp(dvp->v_mount, vpp, 0, Pcurproc);
 789                         goto out;
 790                 }
 791
 792                 pid = atopid(pname, cnp->cn_namelen);
 793                 if (pid == NO_PID)
 794                         break;
 795
 796                 p = pfs_pfind(pid);
 797                 if (p == NULL)
 798                         break;
 799
 800                 if (!PRISON_CHECK(ap->a_cnp->cn_cred, p->p_ucred))
 801                         break;
 802
 803                 if (ps_showallprocs == 0 && ap->a_cnp->cn_cred->cr_uid != 0 &&
 804                     ap->a_cnp->cn_cred->cr_uid != p->p_ucred->cr_uid)
 805                         break;
 806
 807                 error = procfs_allocvp(dvp->v_mount, vpp, pid, Pproc);
 808                 goto out;
 809
 810         case Pproc:
 811                 if (cnp->cn_flags & CNP_ISDOTDOT) {
 812                         error = procfs_root(dvp->v_mount, vpp);
 813                         goto out;
 814                 }
 815
 816                 p = pfs_pfind(pfs->pfs_pid);
 817                 if (p == NULL)
 818                         break;
 819                 /* XXX lwp */
 820                 lp = FIRST_LWP_IN_PROC(p);
 821
 822                 if (!PRISON_CHECK(ap->a_cnp->cn_cred, p->p_ucred))
 823                         break;
 824
 825                 if (ps_showallprocs == 0 && ap->a_cnp->cn_cred->cr_uid != 0 &&
 826                     ap->a_cnp->cn_cred->cr_uid != p->p_ucred->cr_uid)
 827                         break;
 828
 829                 for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) {
 830                         if (cnp->cn_namelen == pt->pt_namlen &&
 831                             bcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
 832                             (pt->pt_valid == NULL || (*pt->pt_valid)(lp)))
 833                                 goto found;
 834                 }
 835                 break;
 836         found:
 837                 error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
 838                                         pt->pt_pfstype);
 839                 goto out;
 840
 841         default:
 842                 error = ENOTDIR;
 843                 goto out;
 844         }
 845         if (cnp->cn_nameiop == NAMEI_LOOKUP)
 846                 error = ENOENT;
 847         else
 848                 error = EROFS;
 849         /*
 850          * If no error occured *vpp will hold a referenced locked vnode.
 851          * dvp was passed to us locked and *vpp must be returned locked.
 852          * If *vpp != dvp then we should unlock dvp if (1) this is not the
 853          * last component or (2) CNP_LOCKPARENT is not set.
 854          */
 855 out:
 856         if (error == 0 && *vpp != dvp) {
 857                 if ((cnp->cn_flags & CNP_LOCKPARENT) == 0) {
 858                         cnp->cn_flags |= CNP_PDIRUNLOCK;
 859                         vn_unlock(dvp);
 860                 }
 861         }
 862         pfs_pdone(p);
 863         return (error);
 864 }
 865
 866 /*
 867  * Does this process have a text file?
 868  */
 869 int
 870 procfs_validfile(struct lwp *lp)
 871 {
 872         return (procfs_findtextvp(lp->lwp_proc) != NULLVP);
 873 }
 874
 875 /*
 876  * readdir() returns directory entries from pfsnode (vp).
 877  *
 878  * We generate just one directory entry at a time, as it would probably
 879  * not pay off to buffer several entries locally to save uiomove calls.
 880  *
 881  * procfs_readdir(struct vnode *a_vp, struct uio *a_uio, struct ucred *a_cred,
 882  *                int *a_eofflag, int *a_ncookies, off_t **a_cookies)
 883  */
 884 static int
 885 procfs_readdir(struct vop_readdir_args *ap)
 886 {
 887         struct pfsnode *pfs;
 888         int error;
 889
 890         if (ap->a_uio->uio_offset < 0 || ap->a_uio->uio_offset > INT_MAX)
 891                 return (EINVAL);
 892         error = vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
 893         if (error)
 894                 return (error);
 895         pfs = VTOPFS(ap->a_vp);
 896
 897         switch (pfs->pfs_type) {
 898         case Pproc:
 899                 /*
 900                  * this is for the process-specific sub-directories.
 901                  * all that is needed to is copy out all the entries
 902                  * from the procent[] table (top of this file).
 903                  */
 904                 error = procfs_readdir_proc(ap);
 905                 break;
 906         case Proot:
 907                 /*
 908                  * this is for the root of the procfs filesystem
 909                  * what is needed is a special entry for "curproc"
 910                  * followed by an entry for each process on allproc
 911                  */
 912                 error = procfs_readdir_root(ap);
 913                 break;
 914         default:
 915                 error = ENOTDIR;
 916                 break;
 917         }
 918
 919         vn_unlock(ap->a_vp);
 920         return (error);
 921 }
 922
 923 static int
 924 procfs_readdir_proc(struct vop_readdir_args *ap)
 925 {
 926         struct pfsnode *pfs;
 927         int error, i, retval;
 928         struct proc *p;
 929         struct lwp *lp;
 930         struct proc_target *pt;
 931         struct uio *uio = ap->a_uio;
 932
 933         pfs = VTOPFS(ap->a_vp);
 934         p = pfs_pfind(pfs->pfs_pid);
 935         if (p == NULL)
 936                 return(0);
 937         if (!PRISON_CHECK(ap->a_cred, p->p_ucred)) {
 938                 error = 0;
 939                 goto done;
 940         }
 941         /* XXX lwp, not MPSAFE */
 942         lp = FIRST_LWP_IN_PROC(p);
 943
 944         error = 0;
 945         i = (int)uio->uio_offset;
 946         if (i < 0) {
 947                 error = EINVAL;
 948                 goto done;
 949         }
 950
 951         for (pt = &proc_targets[i];
 952              !error && uio->uio_resid > 0 && i < nproc_targets; pt++, i++) {
 953                 if (pt->pt_valid && (*pt->pt_valid)(lp) == 0)
 954                         continue;
 955
 956                 retval = vop_write_dirent(&error, uio,
 957                     PROCFS_FILENO(pfs->pfs_pid, pt->pt_pfstype), pt->pt_type,
 958                     pt->pt_namlen, pt->pt_name);
 959                 if (retval)
 960                         break;
 961         }
 962
 963         uio->uio_offset = (off_t)i;
 964         error = 0;
 965 done:
 966         pfs_pdone(p);
 967         return error;
 968 }
 969
 970 struct procfs_readdir_root_info {
 971         int error;
 972         int i;
 973         int pcnt;
 974         struct uio *uio;
 975         struct ucred *cred;
 976 };
 977
 978 static int procfs_readdir_root_callback(struct proc *p, void *data);
 979
 980 static int
 981 procfs_readdir_root(struct vop_readdir_args *ap)
 982 {
 983         struct procfs_readdir_root_info info;
 984         struct uio *uio = ap->a_uio;
 985         int res;
 986
 987         res = 0;
 988         info.error = 0;
 989         info.i = (int)uio->uio_offset;
 990
 991         if (info.i < 0)
 992                 return (EINVAL);
 993
 994         info.pcnt = 0;
 995         info.uio = uio;
 996         info.cred = ap->a_cred;
 997         while (info.pcnt < 3) {
 998                 res = procfs_readdir_root_callback(NULL, &info);
 999                 if (res < 0)
1000                         break;
1001         }
1002         if (res >= 0)
1003                 allproc_scan(procfs_readdir_root_callback, &info, 0);
1004         uio->uio_offset = (off_t)info.i;
1005
1006         return (info.error);
1007 }
1008
1009 static int
1010 procfs_readdir_root_callback(struct proc *p, void *data)
1011 {
1012         struct procfs_readdir_root_info *info = data;
1013         struct uio *uio;
1014         int retval;
1015         ino_t d_ino;
1016         const char *d_name;
1017         char d_name_pid[20];
1018         size_t d_namlen;
1019         uint8_t d_type;
1020
1021         uio = info->uio;
1022
1023         if (uio->uio_resid <= 0 || info->error)
1024                 return(-1);
1025
1026         switch (info->pcnt) {
1027         case 0:         /* `.' */
1028                 d_ino = PROCFS_FILENO(0, Proot);
1029                 d_name = ".";
1030                 d_namlen = 1;
1031                 d_type = DT_DIR;
1032                 break;
1033         case 1:         /* `..' */
1034                 d_ino = PROCFS_FILENO(0, Proot);
1035                 d_name = "..";
1036                 d_namlen = 2;
1037                 d_type = DT_DIR;
1038                 break;
1039
1040         case 2:
1041                 d_ino = PROCFS_FILENO(0, Pcurproc);
1042                 d_namlen = 7;
1043                 d_name = "curproc";
1044                 d_type = DT_LNK;
1045                 break;
1046
1047
1048         default:
1049                 if (!PRISON_CHECK(info->cred, p->p_ucred))
1050                         return(0);
1051                 if (ps_showallprocs == 0 &&
1052                     info->cred->cr_uid != 0 &&
1053                     info->cred->cr_uid != p->p_ucred->cr_uid) {
1054                         return(0);
1055                 }
1056
1057                 /*
1058                  * Skip entries we have already returned (optimization)
1059                  */
1060                 if (info->pcnt < info->i) {
1061                         ++info->pcnt;
1062                         return(0);
1063                 }
1064
1065                 d_ino = PROCFS_FILENO(p->p_pid, Pproc);
1066                 d_namlen = ksnprintf(d_name_pid, sizeof(d_name_pid),
1067                     "%ld", (long)p->p_pid);
1068                 d_name = d_name_pid;
1069                 d_type = DT_DIR;
1070                 break;
1071         }
1072
1073         /*
1074          * Skip entries we have already returned (optimization)
1075          */
1076         if (info->pcnt < info->i) {
1077                 ++info->pcnt;
1078                 return(0);
1079         }
1080
1081         retval = vop_write_dirent(&info->error, uio,
1082                                   d_ino, d_type, d_namlen, d_name);
1083         if (retval)
1084                 return(-1);
1085         ++info->pcnt;
1086         ++info->i;
1087         return(0);
1088 }
1089
1090 /*
1091  * readlink reads the link of `curproc' or `file'
1092  */
1093 static int
1094 procfs_readlink(struct vop_readlink_args *ap)
1095 {
1096         char buf[16];           /* should be enough */
1097         struct proc *procp;
1098         struct vnode *vp = ap->a_vp;
1099         struct pfsnode *pfs = VTOPFS(vp);
1100         char *fullpath, *freepath;
1101         int error, len;
1102
1103         switch (pfs->pfs_type) {
1104         case Pcurproc:
1105                 if (pfs->pfs_fileno != PROCFS_FILENO(0, Pcurproc))
1106                         return (EINVAL);
1107
1108                 len = ksnprintf(buf, sizeof(buf), "%ld", (long)curproc->p_pid);
1109
1110                 return (uiomove(buf, len, ap->a_uio));
1111         case Pfile:
1112                 /*
1113                  * procfs's directory topology is somewhat asynchronous from
1114                  * reality so it is possible for pid requests to race exiting
1115                  * processes.  In this situation, bit 31 is set in
1116                  * pfs->pfs_pid which guarantees that pfs_pfind() will return
1117                  * NULL.
1118                  *
1119                  * It is also possible to catch a process in the middle of
1120                  * an exit sequence so various fields might wind up being
1121                  * NULL that are not normally NULL.
1122                  */
1123                 procp = pfs_pfind(pfs->pfs_pid);
1124                 if (procp == NULL || procp->p_ucred == NULL) {
1125                         pfs_pdone(procp);
1126                         return (uiomove("unknown", sizeof("unknown") - 1,
1127                                         ap->a_uio));
1128                 }
1129                 if (procp->p_textnch.ncp) {
1130                         struct nchandle nch;
1131
1132                         cache_copy(&procp->p_textnch, &nch);
1133                         error = cache_fullpath(procp, &nch, NULL,
1134                                                &fullpath, &freepath, 0);
1135                         cache_drop(&nch);
1136                 } else {
1137                         error = EINVAL;
1138                 }
1139
1140                 if (error != 0) {
1141                         pfs_pdone(procp);
1142                         return (uiomove("unknown", sizeof("unknown") - 1,
1143                                         ap->a_uio));
1144                 }
1145                 error = uiomove(fullpath, strlen(fullpath), ap->a_uio);
1146                 kfree(freepath, M_TEMP);
1147                 pfs_pdone(procp);
1148                 return (error);
1149         default:
1150                 return (EINVAL);
1151         }
1152 }
1153
1154 /*
1155  * convert decimal ascii to pid_t
1156  */
1157 static pid_t
1158 atopid(const char *b, u_int len)
1159 {
1160         pid_t p = 0;
1161
1162         while (len--) {
1163                 char c = *b++;
1164                 if (c < '0' || c > '9')
1165                         return (NO_PID);
1166                 p = 10 * p + (c - '0');
1167                 if (p > PID_MAX)
1168                         return (NO_PID);
1169         }
1170
1171         return (p);
1172 }
1173