kernel/fs/fdfs/fdops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2017 by Delphix. All rights reserved.
  24  */
  25
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All rights reserved.          */
  28
  29
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/cmn_err.h>
  33 #include <sys/debug.h>
  34 #include <sys/dirent.h>
  35 #include <sys/errno.h>
  36 #include <sys/file.h>
  37 #include <sys/inline.h>
  38 #include <sys/kmem.h>
  39 #include <sys/pathname.h>
  40 #include <sys/resource.h>
  41 #include <sys/statvfs.h>
  42 #include <sys/mount.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/systm.h>
  45 #include <sys/uio.h>
  46 #include <sys/vfs.h>
  47 #include <sys/vnode.h>
  48 #include <sys/cred.h>
  49 #include <sys/mntent.h>
  50 #include <sys/mount.h>
  51 #include <sys/user.h>
  52 #include <sys/t_lock.h>
  53 #include <sys/modctl.h>
  54 #include <sys/policy.h>
  55 #include <sys/fs_subr.h>
  56 #include <sys/atomic.h>
  57 #include <sys/mkdev.h>
  58
/*
 * round(r) rounds r up to the next multiple of sizeof (int).  It is not
 * referenced anywhere in the visible part of this file.
 */
#define round(r)        (((r)+sizeof (int)-1)&(~(sizeof (int)-1)))
/*
 * fdtoi(n) turns descriptor number n into the inode number reported for
 * that entry (by fdread, fdreaddir and fdgetattr).
 */
#define fdtoi(n)        ((n)+100)

/* Size of the name field in the fixed-size directory record below. */
#define FDDIRSIZE 14
/*
 * Fixed-size directory record.  fdread() copies these records out
 * directly, and fdreaddir() uses the record size as the unit in which
 * directory offsets advance.
 */
struct fddirect {
        short   d_ino;                  /* inode number (see fdtoi) */
        char    d_name[FDDIRSIZE];      /* entry name */
};

/* Inode number reported for the root directory, "." and "..". */
#define FDROOTINO       2
/*
 * Size of one directory record.  This is a sizeof expression and so has
 * an unsigned type; arithmetic mixing it with signed offsets is carried
 * out unsigned.
 */
#define FDSDSIZE        sizeof (struct fddirect)
/*
 * Name length used to size the dirent buffer in fdreaddir() and reported
 * as f_namemax by fdstatvfs().
 */
#define FDNSIZE         10
  71
/*
 * File-scope state.
 *
 * fdfstype      - file system type index, recorded by fdinit().
 * fdfsmaj       - major number used for vfs_dev of each mount (fdmount).
 * fdfsmin       - last minor number handed out for a mount; advanced in
 *                 fdmount() while holding fd_minor_lock.
 * fdrmaj        - major number used for v_rdev of descriptor vnodes (fdget).
 * fd_minor_lock - serializes the minor number search in fdmount().
 */
static int              fdfstype = 0;
static major_t          fdfsmaj;
static minor_t          fdfsmin;
static major_t          fdrmaj;
static kmutex_t         fd_minor_lock;

/* Defined after the vnode ops table; used by fdlookup() and fdcreate(). */
static int fdget(vnode_t *, char *, vnode_t **);
  79
  80 /* ARGSUSED */
  81 static int
  82 fdopen(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct)
  83 {
  84         if ((*vpp)->v_type != VDIR) {
  85                 mutex_enter(&(*vpp)->v_lock);
  86                 (*vpp)->v_flag |= VDUP;
  87                 mutex_exit(&(*vpp)->v_lock);
  88         }
  89         return (0);
  90 }
  91
  92 /* ARGSUSED */
  93 static int
  94 fdclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
  95     caller_context_t *ct)
  96 {
  97         return (0);
  98 }
  99
 100 /* ARGSUSED */
 101 static int
 102 fdread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
 103 {
 104         static struct fddirect dotbuf[] = {
 105                 { FDROOTINO, "."  },
 106                 { FDROOTINO, ".." }
 107         };
 108         struct fddirect dirbuf;
 109         int i, n;
 110         int minfd, maxfd, modoff, error = 0;
 111         int nentries;
 112         rctl_qty_t fdno_ctl;
 113         int endoff;
 114
 115         if (vp->v_type != VDIR)
 116                 return (ENOSYS);
 117
 118         mutex_enter(&curproc->p_lock);
 119         fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
 120             curproc->p_rctls, curproc);
 121         nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
 122         mutex_exit(&curproc->p_lock);
 123
 124         endoff = (nentries + 2) * FDSDSIZE;
 125
 126         /*
 127          * Fake up ".", "..", and the /dev/fd directory entries.
 128          */
 129         if (uiop->uio_loffset < 0 ||
 130             uiop->uio_loffset >= (offset_t)endoff ||
 131             uiop->uio_resid <= 0)
 132                 return (0);
 133         ASSERT(uiop->uio_loffset <= MAXOFF_T);
 134         if (uiop->uio_offset < 2*FDSDSIZE) {
 135                 error = uiomove((caddr_t)dotbuf + uiop->uio_offset,
 136                     MIN(uiop->uio_resid, 2*FDSDSIZE - uiop->uio_offset),
 137                     UIO_READ, uiop);
 138                 if (uiop->uio_resid <= 0 || error)
 139                         return (error);
 140         }
 141         minfd = (uiop->uio_offset - 2*FDSDSIZE)/FDSDSIZE;
 142         maxfd = (uiop->uio_offset + uiop->uio_resid - 1)/FDSDSIZE;
 143         modoff = uiop->uio_offset % FDSDSIZE;
 144
 145         for (i = 0; i < FDDIRSIZE; i++)
 146                 dirbuf.d_name[i] = '\0';
 147         for (i = minfd; i < MIN(maxfd, nentries); i++) {
 148                 n = i;
 149                 dirbuf.d_ino = fdtoi(n);
 150                 numtos((ulong_t)n, dirbuf.d_name);
 151                 error = uiomove((caddr_t)&dirbuf + modoff,
 152                     MIN(uiop->uio_resid, FDSDSIZE - modoff),
 153                     UIO_READ, uiop);
 154                 if (uiop->uio_resid <= 0 || error)
 155                         return (error);
 156                 modoff = 0;
 157         }
 158
 159         return (error);
 160 }
 161
 162 /* ARGSUSED */
 163 static int
 164 fdgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 165     caller_context_t *ct)
 166 {
 167         vfs_t *vfsp = vp->v_vfsp;
 168         timestruc_t now;
 169
 170         if (vp->v_type == VDIR) {
 171                 vap->va_nlink = 2;
 172                 vap->va_size = (uoff_t)
 173                     ((P_FINFO(curproc)->fi_nfiles + 2) * FDSDSIZE);
 174                 vap->va_mode = 0555;
 175                 vap->va_nodeid = (ino64_t)FDROOTINO;
 176         } else {
 177                 vap->va_nlink = 1;
 178                 vap->va_size = 0;
 179                 vap->va_mode = 0666;
 180                 vap->va_nodeid = (ino64_t)fdtoi(getminor(vp->v_rdev));
 181         }
 182         vap->va_type = vp->v_type;
 183         vap->va_rdev = vp->v_rdev;
 184         vap->va_blksize = vfsp->vfs_bsize;
 185         vap->va_nblocks = (fsblkcnt64_t)0;
 186         gethrestime(&now);
 187         vap->va_atime = vap->va_mtime = vap->va_ctime = now;
 188         vap->va_uid = 0;
 189         vap->va_gid = 0;
 190         vap->va_fsid = vfsp->vfs_dev;
 191         vap->va_seq = 0;
 192         return (0);
 193 }
 194
 195 /* ARGSUSED */
 196 static int
 197 fdaccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
 198 {
 199         return (0);
 200 }
 201
 202 /* ARGSUSED */
 203 static int
 204 fdlookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pnp, int flags,
 205     vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags,
 206     pathname_t *realpnp)
 207 {
 208         if (comp[0] == 0 || strcmp(comp, ".") == 0 || strcmp(comp, "..") == 0) {
 209                 VN_HOLD(dp);
 210                 *vpp = dp;
 211                 return (0);
 212         }
 213         return (fdget(dp, comp, vpp));
 214 }
 215
 216 /* ARGSUSED */
 217 static int
 218 fdcreate(vnode_t *dvp, char *comp, vattr_t *vap, enum vcexcl excl, int mode,
 219     vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
 220     vsecattr_t *vsecp)
 221 {
 222         return (fdget(dvp, comp, vpp));
 223 }
 224
 225 /* ARGSUSED */
 226 static int
 227 fdreaddir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct,
 228     int flags)
 229 {
 230         /* bp holds one dirent structure */
 231         uoff_t bp[DIRENT64_RECLEN(FDNSIZE) / sizeof (uoff_t)];
 232         struct dirent64 *dirent = (struct dirent64 *)bp;
 233         int reclen, nentries;
 234         rctl_qty_t fdno_ctl;
 235         int  n;
 236         int oresid;
 237         off_t off;
 238
 239         if (uiop->uio_offset < 0 || uiop->uio_resid <= 0 ||
 240             (uiop->uio_offset % FDSDSIZE) != 0)
 241                 return (ENOENT);
 242
 243         ASSERT(uiop->uio_loffset <= MAXOFF_T);
 244         oresid = uiop->uio_resid;
 245         bzero(bp, sizeof (bp));
 246
 247         mutex_enter(&curproc->p_lock);
 248         fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
 249             curproc->p_rctls, curproc);
 250         nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
 251         mutex_exit(&curproc->p_lock);
 252
 253         while (uiop->uio_resid > 0) {
 254                 if ((off = uiop->uio_offset) == 0) {    /* "." */
 255                         dirent->d_ino = (ino64_t)FDROOTINO;
 256                         dirent->d_name[0] = '.';
 257                         dirent->d_name[1] = '\0';
 258                         reclen = DIRENT64_RECLEN(1);
 259                 } else if (off == FDSDSIZE) {           /* ".." */
 260                         dirent->d_ino = (ino64_t)FDROOTINO;
 261                         dirent->d_name[0] = '.';
 262                         dirent->d_name[1] = '.';
 263                         dirent->d_name[2] = '\0';
 264                         reclen = DIRENT64_RECLEN(2);
 265                 } else {
 266                         /*
 267                          * Return entries corresponding to the allowable
 268                          * number of file descriptors for this process.
 269                          */
 270                         if ((n = (off-2*FDSDSIZE)/FDSDSIZE) >= nentries)
 271                                 break;
 272                         dirent->d_ino = (ino64_t)fdtoi(n);
 273                         numtos((ulong_t)n, dirent->d_name);
 274                         reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
 275                 }
 276                 dirent->d_off = (offset_t)(uiop->uio_offset + FDSDSIZE);
 277                 dirent->d_reclen = (ushort_t)reclen;
 278
 279                 if (reclen > uiop->uio_resid) {
 280                         /*
 281                          * Error if no entries have been returned yet.
 282                          */
 283                         if (uiop->uio_resid == oresid)
 284                                 return (EINVAL);
 285                         break;
 286                 }
 287                 /*
 288                  * uiomove() updates both resid and offset by the same
 289                  * amount.  But we want offset to change in increments
 290                  * of FDSDSIZE, which is different from the number of bytes
 291                  * being returned to the user.  So we set uio_offset
 292                  * separately, ignoring what uiomove() does.
 293                  */
 294                 if (uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
 295                         return (EFAULT);
 296                 uiop->uio_offset = off + FDSDSIZE;
 297         }
 298         if (eofp)
 299                 *eofp = ((uiop->uio_offset-2*FDSDSIZE)/FDSDSIZE >= nentries);
 300         return (0);
 301 }
 302
 303 /* ARGSUSED */
 304 static void
 305 fdinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 306 {
 307         mutex_enter(&vp->v_lock);
 308         ASSERT(vp->v_count >= 1);
 309         VN_RELE_LOCKED(vp);
 310         if (vp->v_count != 0) {
 311                 mutex_exit(&vp->v_lock);
 312                 return;
 313         }
 314         mutex_exit(&vp->v_lock);
 315         vn_invalid(vp);
 316         vn_free(vp);
 317 }
 318
/*
 * Vnode operations vector installed on both the root directory vnode
 * (fdmount) and the per-descriptor vnodes (fdget).  Operations not named
 * here are left zero-initialized; how the VFS layer treats those is not
 * visible from this file.
 */
static const struct vnodeops fd_vnodeops = {
        .vnop_name = "fdfs",
        .vop_open = fdopen,
        .vop_close = fdclose,
        .vop_read = fdread,
        .vop_getattr = fdgetattr,
        .vop_access = fdaccess,
        .vop_lookup = fdlookup,
        .vop_create = fdcreate,
        .vop_readdir = fdreaddir,
        .vop_inactive = fdinactive,
        /* NOTE(review): presumably generic stubs from sys/fs_subr.h. */
        .vop_frlock = fs_nosys,
        .vop_poll = fs_nosys_poll,
        .vop_dispose = fs_nodispose,
};
 334
 335 static int
 336 fdget(struct vnode *dvp, char *comp, struct vnode **vpp)
 337 {
 338         int n = 0;
 339         struct vnode *vp;
 340
 341         while (*comp) {
 342                 if (*comp < '0' || *comp > '9')
 343                         return (ENOENT);
 344                 n = 10 * n + *comp++ - '0';
 345         }
 346         vp = vn_alloc(KM_SLEEP);
 347         vp->v_type = VCHR;
 348         vp->v_vfsp = dvp->v_vfsp;
 349         vn_setops(vp, &fd_vnodeops);
 350         vp->v_data = NULL;
 351         vp->v_flag = VNOMAP;
 352         vp->v_rdev = makedevice(fdrmaj, n);
 353         vn_exists(vp);
 354         *vpp = vp;
 355         return (0);
 356 }
 357
 358 /*
 359  * fdfs is mounted on /dev/fd, however, there are two interesting
 360  * possibilities - two threads racing to do the same mount (protected
 361  * by vfs locking), and two threads mounting fdfs in different places.
 362  */
 363 /*ARGSUSED*/
 364 static int
 365 fdmount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 366 {
 367         struct vnode *vp;
 368
 369         if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
 370                 return (EPERM);
 371         if (mvp->v_type != VDIR)
 372                 return (ENOTDIR);
 373
 374         mutex_enter(&mvp->v_lock);
 375         if ((uap->flags & MS_OVERLAY) == 0 &&
 376             (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
 377                 mutex_exit(&mvp->v_lock);
 378                 return (EBUSY);
 379         }
 380         mutex_exit(&mvp->v_lock);
 381
 382         /*
 383          * Having the resource be anything but "fd" doesn't make sense
 384          */
 385         vfs_setresource(vfsp, "fd", 0);
 386
 387         vp = vn_alloc(KM_SLEEP);
 388         vp->v_vfsp = vfsp;
 389         vn_setops(vp, &fd_vnodeops);
 390         vp->v_type = VDIR;
 391         vp->v_data = NULL;
 392         vp->v_flag |= VROOT;
 393         vfsp->vfs_fstype = fdfstype;
 394         vfsp->vfs_data = (char *)vp;
 395         mutex_enter(&fd_minor_lock);
 396         do {
 397                 fdfsmin = (fdfsmin + 1) & L_MAXMIN32;
 398                 vfsp->vfs_dev = makedevice(fdfsmaj, fdfsmin);
 399         } while (vfs_devismounted(vfsp->vfs_dev));
 400         mutex_exit(&fd_minor_lock);
 401         vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, fdfstype);
 402         vfsp->vfs_bsize = 1024;
 403         return (0);
 404 }
 405
 406 /* ARGSUSED */
 407 static int
 408 fdunmount(vfs_t *vfsp, int flag, cred_t *cr)
 409 {
 410         vnode_t *rvp;
 411
 412         if (secpolicy_fs_unmount(cr, vfsp) != 0)
 413                 return (EPERM);
 414
 415         /*
 416          * forced unmount is not supported by this file system
 417          * and thus, ENOTSUP, is being returned.
 418          */
 419         if (flag & MS_FORCE)
 420                 return (ENOTSUP);
 421
 422         rvp = (vnode_t *)vfsp->vfs_data;
 423         if (rvp->v_count > 1)
 424                 return (EBUSY);
 425
 426         VN_RELE(rvp);
 427         return (0);
 428 }
 429
 430 /* ARGSUSED */
 431 static int
 432 fdroot(vfs_t *vfsp, vnode_t **vpp)
 433 {
 434         vnode_t *vp = (vnode_t *)vfsp->vfs_data;
 435
 436         VN_HOLD(vp);
 437         *vpp = vp;
 438         return (0);
 439 }
 440
 441 /*
 442  * No locking required because I held the root vnode before calling this
 443  * function so the vfs won't disappear on me.  To be more explicit:
 444  * fdvrootp->v_count will be greater than 1 so fdunmount will just return.
 445  */
 446 static int
 447 fdstatvfs(struct vfs *vfsp, struct statvfs64 *sp)
 448 {
 449         dev32_t d32;
 450         rctl_qty_t fdno_ctl;
 451
 452         mutex_enter(&curproc->p_lock);
 453         fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
 454             curproc->p_rctls, curproc);
 455         mutex_exit(&curproc->p_lock);
 456
 457         bzero(sp, sizeof (*sp));
 458         sp->f_bsize = 1024;
 459         sp->f_frsize = 1024;
 460         sp->f_blocks = (fsblkcnt64_t)0;
 461         sp->f_bfree = (fsblkcnt64_t)0;
 462         sp->f_bavail = (fsblkcnt64_t)0;
 463         sp->f_files = (fsfilcnt64_t)
 464             (MIN(P_FINFO(curproc)->fi_nfiles, fdno_ctl + 2));
 465         sp->f_ffree = (fsfilcnt64_t)0;
 466         sp->f_favail = (fsfilcnt64_t)0;
 467         (void) cmpldev(&d32, vfsp->vfs_dev);
 468         sp->f_fsid = d32;
 469         (void) strcpy(sp->f_basetype, vfssw[fdfstype].vsw_name);
 470         sp->f_flag = vf_to_stf(vfsp->vfs_flag);
 471         sp->f_namemax = FDNSIZE;
 472         (void) strcpy(sp->f_fstr, "/dev/fd");
 473         (void) strcpy(&sp->f_fstr[8], "/dev/fd");
 474         return (0);
 475 }
 476
/*
 * VFS operations vector, registered for this file system type by
 * fdinit() through vfs_setfsops().  Only mount, unmount, root and statvfs
 * are provided; the remaining members are left zero-initialized.
 */
static const struct vfsops fd_vfsops = {
        .vfs_mount = fdmount,
        .vfs_unmount = fdunmount,
        .vfs_root = fdroot,
        .vfs_statvfs = fdstatvfs,
};
 483
 484 int
 485 fdinit(int fstype, char *name)
 486 {
 487         int error;
 488
 489         fdfstype = fstype;
 490         ASSERT(fdfstype != 0);
 491
 492         /*
 493          * Associate VFS ops vector with this fstype.
 494          */
 495         error = vfs_setfsops(fstype, &fd_vfsops);
 496         if (error != 0) {
 497                 cmn_err(CE_WARN, "fdinit: bad fstype");
 498                 return (error);
 499         }
 500
 501         /*
 502          * Assign unique "device" numbers (reported by stat(2)).
 503          */
 504         fdfsmaj = getudev();
 505         fdrmaj = getudev();
 506         if (fdfsmaj == (major_t)-1 || fdrmaj == (major_t)-1) {
 507                 cmn_err(CE_WARN, "fdinit: can't get unique device numbers");
 508                 if (fdfsmaj == (major_t)-1)
 509                         fdfsmaj = 0;
 510                 if (fdrmaj == (major_t)-1)
 511                         fdrmaj = 0;
 512         }
 513         mutex_init(&fd_minor_lock, NULL, MUTEX_DEFAULT, NULL);
 514         return (0);
 515 }
 516
/*
 * FDFS Mount options table
 */
/* Options cancelled when "rw" is given: just "ro". */
static char *rw_cancel[] = { MNTOPT_RO, NULL };

/*
 * Recognized mount options.  The first four fields of each entry follow
 * the column header below; the fifth field is not named there.
 * NOTE(review): presumably it is the entry's private-data pointer --
 * confirm against the mntopt_t definition.
 */
static mntopt_t mntopts[] = {
/*
 *      option name             cancel option   default arg     flags
 */
        { MNTOPT_RW,            rw_cancel,      NULL,           MO_DEFAULT,
                (void *)MNTOPT_NOINTR },
        { MNTOPT_IGNORE,        NULL,           NULL,           0,
                NULL },
};

/* Option table descriptor: entry count followed by the table itself. */
static mntopts_t fdfs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        mntopts
};
 536
/*
 * File system definition: type name "fd", initialization routine
 * fdinit(), and the mount option table above.
 * NOTE(review): the exact meaning of VSW_HASPROTO and VSW_ZMOUNT is not
 * visible from this file -- see their definitions.
 */
static vfsdef_t vfw = {
        VFSDEF_VERSION,
        "fd",
        fdinit,
        VSW_HASPROTO | VSW_ZMOUNT,
        &fdfs_mntopts
};

/* Loadable-module description of the file system. */
static struct modlfs modlfs = {
        &mod_fsops,
        "filesystem for fd",
        &vfw
};

/* Module linkage handed to mod_install() and mod_info() below. */
static struct modlinkage modlinkage = {
        MODREV_1,
        &modlfs,
        NULL
};
 556
 557 int
 558 _init(void)
 559 {
 560         return (mod_install(&modlinkage));
 561 }
 562
 563 int
 564 _info(struct modinfo *modinfop)
 565 {
 566         return (mod_info(&modlinkage, modinfop));
 567 }