sys/vfs/devfs/devfs_vnops.c

   1 /*
   2  * (MPSAFE)
   3  *
   4  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to The DragonFly Project
   7  * by Alex Hornung <ahornung@gmail.com>
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in
  17  *    the documentation and/or other materials provided with the
  18  *    distribution.
  19  * 3. Neither the name of The DragonFly Project nor the names of its
  20  *    contributors may be used to endorse or promote products derived
  21  *    from this software without specific, prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36 #include <sys/param.h>
  37 #include <sys/systm.h>
  38 #include <sys/time.h>
  39 #include <sys/kernel.h>
  40 #include <sys/lock.h>
  41 #include <sys/fcntl.h>
  42 #include <sys/proc.h>
  43 #include <sys/priv.h>
  44 #include <sys/signalvar.h>
  45 #include <sys/vnode.h>
  46 #include <sys/uio.h>
  47 #include <sys/mount.h>
  48 #include <sys/file.h>
  49 #include <sys/namei.h>
  50 #include <sys/dirent.h>
  51 #include <sys/malloc.h>
  52 #include <sys/stat.h>
  53 #include <sys/reg.h>
  54 #include <vm/vm_pager.h>
  55 #include <vm/vm_zone.h>
  56 #include <vm/vm_object.h>
  57 #include <sys/filio.h>
  58 #include <sys/ttycom.h>
  59 #include <sys/tty.h>
  60 #include <sys/diskslice.h>
  61 #include <sys/sysctl.h>
  62 #include <sys/devfs.h>
  63 #include <sys/pioctl.h>
  64 #include <vfs/fifofs/fifo.h>
  65
  66 #include <machine/limits.h>
  67
  68 #include <sys/buf2.h>
  69 #include <sys/sysref2.h>
  70 #include <vm/vm_page2.h>
  71
  72 #ifndef SPEC_CHAIN_DEBUG
  73 #define SPEC_CHAIN_DEBUG 0
  74 #endif
  75
  76 MALLOC_DECLARE(M_DEVFS);
  77 #define DEVFS_BADOP     (void *)devfs_vop_badop
  78
  79 static int devfs_vop_badop(struct vop_generic_args *);
  80 static int devfs_vop_access(struct vop_access_args *);
  81 static int devfs_vop_inactive(struct vop_inactive_args *);
  82 static int devfs_vop_reclaim(struct vop_reclaim_args *);
  83 static int devfs_vop_readdir(struct vop_readdir_args *);
  84 static int devfs_vop_getattr(struct vop_getattr_args *);
  85 static int devfs_vop_setattr(struct vop_setattr_args *);
  86 static int devfs_vop_readlink(struct vop_readlink_args *);
  87 static int devfs_vop_print(struct vop_print_args *);
  88
  89 static int devfs_vop_nresolve(struct vop_nresolve_args *);
  90 static int devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
  91 static int devfs_vop_nmkdir(struct vop_nmkdir_args *);
  92 static int devfs_vop_nsymlink(struct vop_nsymlink_args *);
  93 static int devfs_vop_nrmdir(struct vop_nrmdir_args *);
  94 static int devfs_vop_nremove(struct vop_nremove_args *);
  95
  96 static int devfs_spec_open(struct vop_open_args *);
  97 static int devfs_spec_close(struct vop_close_args *);
  98 static int devfs_spec_fsync(struct vop_fsync_args *);
  99
 100 static int devfs_spec_read(struct vop_read_args *);
 101 static int devfs_spec_write(struct vop_write_args *);
 102 static int devfs_spec_ioctl(struct vop_ioctl_args *);
 103 static int devfs_spec_kqfilter(struct vop_kqfilter_args *);
 104 static int devfs_spec_strategy(struct vop_strategy_args *);
 105 static void devfs_spec_strategy_done(struct bio *);
 106 static int devfs_spec_freeblks(struct vop_freeblks_args *);
 107 static int devfs_spec_bmap(struct vop_bmap_args *);
 108 static int devfs_spec_advlock(struct vop_advlock_args *);
 109 static void devfs_spec_getpages_iodone(struct bio *);
 110 static int devfs_spec_getpages(struct vop_getpages_args *);
 111
 112 static int devfs_fo_close(struct file *);
 113 static int devfs_fo_read(struct file *, struct uio *, struct ucred *, int);
 114 static int devfs_fo_write(struct file *, struct uio *, struct ucred *, int);
 115 static int devfs_fo_stat(struct file *, struct stat *, struct ucred *);
 116 static int devfs_fo_kqfilter(struct file *, struct knote *);
 117 static int devfs_fo_ioctl(struct file *, u_long, caddr_t,
 118                                 struct ucred *, struct sysmsg *);
 119 static __inline int sequential_heuristic(struct uio *, struct file *);
 120
 121 extern struct lock devfs_lock;
 122
 123 /*
 124  * devfs vnode operations for regular files.  All vnode ops are MPSAFE.
 125  */
 126 struct vop_ops devfs_vnode_norm_vops = {
 127         .vop_default =          vop_defaultop,
 128         .vop_access =           devfs_vop_access,
 129         .vop_advlock =          DEVFS_BADOP,
 130         .vop_bmap =             DEVFS_BADOP,
 131         .vop_close =            vop_stdclose,
 132         .vop_getattr =          devfs_vop_getattr,
 133         .vop_inactive =         devfs_vop_inactive,
 134         .vop_ncreate =          DEVFS_BADOP,
 135         .vop_nresolve =         devfs_vop_nresolve,
 136         .vop_nlookupdotdot =    devfs_vop_nlookupdotdot,
 137         .vop_nlink =            DEVFS_BADOP,
 138         .vop_nmkdir =           devfs_vop_nmkdir,
 139         .vop_nmknod =           DEVFS_BADOP,
 140         .vop_nremove =          devfs_vop_nremove,
 141         .vop_nrename =          DEVFS_BADOP,
 142         .vop_nrmdir =           devfs_vop_nrmdir,
 143         .vop_nsymlink =         devfs_vop_nsymlink,
 144         .vop_open =             vop_stdopen,
 145         .vop_pathconf =         vop_stdpathconf,
 146         .vop_print =            devfs_vop_print,
 147         .vop_read =             DEVFS_BADOP,
 148         .vop_readdir =          devfs_vop_readdir,
 149         .vop_readlink =         devfs_vop_readlink,
 150         .vop_reclaim =          devfs_vop_reclaim,
 151         .vop_setattr =          devfs_vop_setattr,
 152         .vop_write =            DEVFS_BADOP,
 153         .vop_ioctl =            DEVFS_BADOP
 154 };
 155
 156 /*
 157  * devfs vnode operations for character devices.  All vnode ops are MPSAFE.
 158  */
 159 struct vop_ops devfs_vnode_dev_vops = {
 160         .vop_default =          vop_defaultop,
 161         .vop_access =           devfs_vop_access,
 162         .vop_advlock =          devfs_spec_advlock,
 163         .vop_bmap =             devfs_spec_bmap,
 164         .vop_close =            devfs_spec_close,
 165         .vop_freeblks =         devfs_spec_freeblks,
 166         .vop_fsync =            devfs_spec_fsync,
 167         .vop_getattr =          devfs_vop_getattr,
 168         .vop_getpages =         devfs_spec_getpages,
 169         .vop_inactive =         devfs_vop_inactive,
 170         .vop_open =             devfs_spec_open,
 171         .vop_pathconf =         vop_stdpathconf,
 172         .vop_print =            devfs_vop_print,
 173         .vop_kqfilter =         devfs_spec_kqfilter,
 174         .vop_read =             devfs_spec_read,
 175         .vop_readdir =          DEVFS_BADOP,
 176         .vop_readlink =         DEVFS_BADOP,
 177         .vop_reclaim =          devfs_vop_reclaim,
 178         .vop_setattr =          devfs_vop_setattr,
 179         .vop_strategy =         devfs_spec_strategy,
 180         .vop_write =            devfs_spec_write,
 181         .vop_ioctl =            devfs_spec_ioctl
 182 };
 183
 184 /*
 185  * devfs file pointer operations.  All fileops are MPSAFE.
 186  */
 187 struct vop_ops *devfs_vnode_dev_vops_p = &devfs_vnode_dev_vops;
 188
 189 struct fileops devfs_dev_fileops = {
 190         .fo_read        = devfs_fo_read,
 191         .fo_write       = devfs_fo_write,
 192         .fo_ioctl       = devfs_fo_ioctl,
 193         .fo_kqfilter    = devfs_fo_kqfilter,
 194         .fo_stat        = devfs_fo_stat,
 195         .fo_close       = devfs_fo_close,
 196         .fo_shutdown    = nofo_shutdown
 197 };
 198
 199 /*
 200  * These two functions are possibly temporary hacks for devices (aka
 201  * the pty code) which want to control the node attributes themselves.
 202  *
 203  * XXX we may ultimately desire to simply remove the uid/gid/mode
 204  * from the node entirely.
 205  *
 206  * MPSAFE - sorta.  Theoretically the overwrite can compete since they
 207  *          are loading from the same fields.
 208  */
 209 static __inline void
 210 node_sync_dev_get(struct devfs_node *node)
 211 {
 212         cdev_t dev;
 213
 214         if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) {
 215                 node->uid = dev->si_uid;
 216                 node->gid = dev->si_gid;
 217                 node->mode = dev->si_perms;
 218         }
 219 }
 220
 221 static __inline void
 222 node_sync_dev_set(struct devfs_node *node)
 223 {
 224         cdev_t dev;
 225
 226         if ((dev = node->d_dev) && (dev->si_flags & SI_OVERRIDE)) {
 227                 dev->si_uid = node->uid;
 228                 dev->si_gid = node->gid;
 229                 dev->si_perms = node->mode;
 230         }
 231 }
 232
 233 /*
 234  * generic entry point for unsupported operations
 235  */
 236 static int
 237 devfs_vop_badop(struct vop_generic_args *ap)
 238 {
 239         return (EIO);
 240 }
 241
 242
 243 static int
 244 devfs_vop_access(struct vop_access_args *ap)
 245 {
 246         struct devfs_node *node = DEVFS_NODE(ap->a_vp);
 247         int error;
 248
 249         if (!devfs_node_is_accessible(node))
 250                 return ENOENT;
 251         node_sync_dev_get(node);
 252         error = vop_helper_access(ap, node->uid, node->gid,
 253                                   node->mode, node->flags);
 254
 255         return error;
 256 }
 257
 258
 259 static int
 260 devfs_vop_inactive(struct vop_inactive_args *ap)
 261 {
 262         struct devfs_node *node = DEVFS_NODE(ap->a_vp);
 263
 264         if (node == NULL || (node->flags & DEVFS_NODE_LINKED) == 0)
 265                 vrecycle(ap->a_vp);
 266         return 0;
 267 }
 268
 269
 270 static int
 271 devfs_vop_reclaim(struct vop_reclaim_args *ap)
 272 {
 273         struct devfs_node *node;
 274         struct vnode *vp;
 275         int locked;
 276
 277         /*
 278          * Check if it is locked already. if not, we acquire the devfs lock
 279          */
 280         if ((lockstatus(&devfs_lock, curthread)) != LK_EXCLUSIVE) {
 281                 lockmgr(&devfs_lock, LK_EXCLUSIVE);
 282                 locked = 1;
 283         } else {
 284                 locked = 0;
 285         }
 286
 287         /*
 288          * Get rid of the devfs_node if it is no longer linked into the
 289          * topology.
 290          */
 291         vp = ap->a_vp;
 292         if ((node = DEVFS_NODE(vp)) != NULL) {
 293                 node->v_node = NULL;
 294                 if ((node->flags & DEVFS_NODE_LINKED) == 0)
 295                         devfs_freep(node);
 296         }
 297
 298         if (locked)
 299                 lockmgr(&devfs_lock, LK_RELEASE);
 300
 301         /*
 302          * v_rdev needs to be properly released using v_release_rdev
 303          * Make sure v_data is NULL as well.
 304          */
 305         vp->v_data = NULL;
 306         v_release_rdev(vp);
 307         return 0;
 308 }
 309
 310
 311 static int
 312 devfs_vop_readdir(struct vop_readdir_args *ap)
 313 {
 314         struct devfs_node *dnode = DEVFS_NODE(ap->a_vp);
 315         struct devfs_node *node;
 316         int cookie_index;
 317         int ncookies;
 318         int error2;
 319         int error;
 320         int r;
 321         off_t *cookies;
 322         off_t saveoff;
 323
 324         devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_readdir() called!\n");
 325
 326         if (ap->a_uio->uio_offset < 0 || ap->a_uio->uio_offset > INT_MAX)
 327                 return (EINVAL);
 328         error = vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
 329         if (error)
 330                 return (error);
 331
 332         if (!devfs_node_is_accessible(dnode)) {
 333                 vn_unlock(ap->a_vp);
 334                 return ENOENT;
 335         }
 336
 337         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 338
 339         saveoff = ap->a_uio->uio_offset;
 340
 341         if (ap->a_ncookies) {
 342                 ncookies = ap->a_uio->uio_resid / 16 + 1; /* Why / 16 ?? */
 343                 if (ncookies > 256)
 344                         ncookies = 256;
 345                 cookies = kmalloc(256 * sizeof(off_t), M_TEMP, M_WAITOK);
 346                 cookie_index = 0;
 347         } else {
 348                 ncookies = -1;
 349                 cookies = NULL;
 350                 cookie_index = 0;
 351         }
 352
 353         nanotime(&dnode->atime);
 354
 355         if (saveoff == 0) {
 356                 r = vop_write_dirent(&error, ap->a_uio, dnode->d_dir.d_ino,
 357                                      DT_DIR, 1, ".");
 358                 if (r)
 359                         goto done;
 360                 if (cookies)
 361                         cookies[cookie_index] = saveoff;
 362                 saveoff++;
 363                 cookie_index++;
 364                 if (cookie_index == ncookies)
 365                         goto done;
 366         }
 367
 368         if (saveoff == 1) {
 369                 if (dnode->parent) {
 370                         r = vop_write_dirent(&error, ap->a_uio,
 371                                              dnode->parent->d_dir.d_ino,
 372                                              DT_DIR, 2, "..");
 373                 } else {
 374                         r = vop_write_dirent(&error, ap->a_uio,
 375                                              dnode->d_dir.d_ino,
 376                                              DT_DIR, 2, "..");
 377                 }
 378                 if (r)
 379                         goto done;
 380                 if (cookies)
 381                         cookies[cookie_index] = saveoff;
 382                 saveoff++;
 383                 cookie_index++;
 384                 if (cookie_index == ncookies)
 385                         goto done;
 386         }
 387
 388         TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
 389                 if ((node->flags & DEVFS_HIDDEN) ||
 390                     (node->flags & DEVFS_INVISIBLE)) {
 391                         continue;
 392                 }
 393
 394                 /*
 395                  * If the node type is a valid devfs alias, then we make
 396                  * sure that the target isn't hidden. If it is, we don't
 397                  * show the link in the directory listing.
 398                  */
 399                 if ((node->node_type == Nlink) && (node->link_target != NULL) &&
 400                         (node->link_target->flags & DEVFS_HIDDEN))
 401                         continue;
 402
 403                 if (node->cookie < saveoff)
 404                         continue;
 405
 406                 saveoff = node->cookie;
 407
 408                 error2 = vop_write_dirent(&error, ap->a_uio, node->d_dir.d_ino,
 409                                           node->d_dir.d_type,
 410                                           node->d_dir.d_namlen,
 411                                           node->d_dir.d_name);
 412
 413                 if (error2)
 414                         break;
 415
 416                 saveoff++;
 417
 418                 if (cookies)
 419                         cookies[cookie_index] = node->cookie;
 420                 ++cookie_index;
 421                 if (cookie_index == ncookies)
 422                         break;
 423         }
 424
 425 done:
 426         lockmgr(&devfs_lock, LK_RELEASE);
 427         vn_unlock(ap->a_vp);
 428
 429         ap->a_uio->uio_offset = saveoff;
 430         if (error && cookie_index == 0) {
 431                 if (cookies) {
 432                         kfree(cookies, M_TEMP);
 433                         *ap->a_ncookies = 0;
 434                         *ap->a_cookies = NULL;
 435                 }
 436         } else {
 437                 if (cookies) {
 438                         *ap->a_ncookies = cookie_index;
 439                         *ap->a_cookies = cookies;
 440                 }
 441         }
 442         return (error);
 443 }
 444
 445
 446 static int
 447 devfs_vop_nresolve(struct vop_nresolve_args *ap)
 448 {
 449         struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
 450         struct devfs_node *node, *found = NULL;
 451         struct namecache *ncp;
 452         struct vnode *vp = NULL;
 453         int error = 0;
 454         int len;
 455         int depth;
 456
 457         ncp = ap->a_nch->ncp;
 458         len = ncp->nc_nlen;
 459
 460         if (!devfs_node_is_accessible(dnode))
 461                 return ENOENT;
 462
 463         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 464
 465         if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir)) {
 466                 error = ENOENT;
 467                 cache_setvp(ap->a_nch, NULL);
 468                 goto out;
 469         }
 470
 471         TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
 472                 if (len == node->d_dir.d_namlen) {
 473                         if (!memcmp(ncp->nc_name, node->d_dir.d_name, len)) {
 474                                 found = node;
 475                                 break;
 476                         }
 477                 }
 478         }
 479
 480         if (found) {
 481                 depth = 0;
 482                 while ((found->node_type == Nlink) && (found->link_target)) {
 483                         if (depth >= 8) {
 484                                 devfs_debug(DEVFS_DEBUG_SHOW, "Recursive link or depth >= 8");
 485                                 break;
 486                         }
 487
 488                         found = found->link_target;
 489                         ++depth;
 490                 }
 491
 492                 if (!(found->flags & DEVFS_HIDDEN))
 493                         devfs_allocv(/*ap->a_dvp->v_mount, */ &vp, found);
 494         }
 495
 496         if (vp == NULL) {
 497                 error = ENOENT;
 498                 cache_setvp(ap->a_nch, NULL);
 499                 goto out;
 500
 501         }
 502         KKASSERT(vp);
 503         vn_unlock(vp);
 504         cache_setvp(ap->a_nch, vp);
 505         vrele(vp);
 506 out:
 507         lockmgr(&devfs_lock, LK_RELEASE);
 508
 509         return error;
 510 }
 511
 512
 513 static int
 514 devfs_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 515 {
 516         struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
 517
 518         *ap->a_vpp = NULL;
 519         if (!devfs_node_is_accessible(dnode))
 520                 return ENOENT;
 521
 522         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 523         if (dnode->parent != NULL) {
 524                 devfs_allocv(ap->a_vpp, dnode->parent);
 525                 vn_unlock(*ap->a_vpp);
 526         }
 527         lockmgr(&devfs_lock, LK_RELEASE);
 528
 529         return ((*ap->a_vpp == NULL) ? ENOENT : 0);
 530 }
 531
 532
 533 static int
 534 devfs_vop_getattr(struct vop_getattr_args *ap)
 535 {
 536         struct devfs_node *node = DEVFS_NODE(ap->a_vp);
 537         struct vattr *vap = ap->a_vap;
 538         struct partinfo pinfo;
 539         int error = 0;
 540
 541 #if 0
 542         if (!devfs_node_is_accessible(node))
 543                 return ENOENT;
 544 #endif
 545         node_sync_dev_get(node);
 546
 547         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 548
 549         /* start by zeroing out the attributes */
 550         VATTR_NULL(vap);
 551
 552         /* next do all the common fields */
 553         vap->va_type = ap->a_vp->v_type;
 554         vap->va_mode = node->mode;
 555         vap->va_fileid = DEVFS_NODE(ap->a_vp)->d_dir.d_ino ;
 556         vap->va_flags = 0;
 557         vap->va_blocksize = DEV_BSIZE;
 558         vap->va_bytes = vap->va_size = 0;
 559
 560         vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 561
 562         vap->va_atime = node->atime;
 563         vap->va_mtime = node->mtime;
 564         vap->va_ctime = node->ctime;
 565
 566         vap->va_nlink = 1; /* number of references to file */
 567
 568         vap->va_uid = node->uid;
 569         vap->va_gid = node->gid;
 570
 571         vap->va_rmajor = 0;
 572         vap->va_rminor = 0;
 573
 574         if ((node->node_type == Ndev) && node->d_dev)  {
 575                 reference_dev(node->d_dev);
 576                 vap->va_rminor = node->d_dev->si_uminor;
 577                 release_dev(node->d_dev);
 578         }
 579
 580         /* For a softlink the va_size is the length of the softlink */
 581         if (node->symlink_name != 0) {
 582                 vap->va_bytes = vap->va_size = node->symlink_namelen;
 583         }
 584
 585         /*
 586          * For a disk-type device, va_size is the size of the underlying
 587          * device, so that lseek() works properly.
 588          */
 589         if ((node->d_dev) && (dev_dflags(node->d_dev) & D_DISK)) {
 590                 bzero(&pinfo, sizeof(pinfo));
 591                 error = dev_dioctl(node->d_dev, DIOCGPART, (void *)&pinfo,
 592                                    0, proc0.p_ucred, NULL, NULL);
 593                 if ((error == 0) && (pinfo.media_blksize != 0)) {
 594                         vap->va_size = pinfo.media_size;
 595                 } else {
 596                         vap->va_size = 0;
 597                         error = 0;
 598                 }
 599         }
 600
 601         lockmgr(&devfs_lock, LK_RELEASE);
 602
 603         return (error);
 604 }
 605
 606
 607 static int
 608 devfs_vop_setattr(struct vop_setattr_args *ap)
 609 {
 610         struct devfs_node *node = DEVFS_NODE(ap->a_vp);
 611         struct vattr *vap;
 612         uid_t cur_uid;
 613         gid_t cur_gid;
 614         mode_t cur_mode;
 615         int error = 0;
 616
 617         if (!devfs_node_is_accessible(node))
 618                 return ENOENT;
 619         node_sync_dev_get(node);
 620
 621         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 622
 623         vap = ap->a_vap;
 624
 625         if ((vap->va_uid != (uid_t)VNOVAL) || (vap->va_gid != (gid_t)VNOVAL)) {
 626                 cur_uid = node->uid;
 627                 cur_gid = node->gid;
 628                 cur_mode = node->mode;
 629                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
 630                     ap->a_cred, &cur_uid, &cur_gid, &cur_mode);
 631                 if (error)
 632                         goto out;
 633
 634                 if (node->uid != cur_uid || node->gid != cur_gid) {
 635                         node->uid = cur_uid;
 636                         node->gid = cur_gid;
 637                         node->mode = cur_mode;
 638                 }
 639         }
 640
 641         if (vap->va_mode != (mode_t)VNOVAL) {
 642                 cur_mode = node->mode;
 643                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
 644                     node->uid, node->gid, &cur_mode);
 645                 if (error == 0 && node->mode != cur_mode) {
 646                         node->mode = cur_mode;
 647                 }
 648         }
 649
 650 out:
 651         node_sync_dev_set(node);
 652         nanotime(&node->ctime);
 653         lockmgr(&devfs_lock, LK_RELEASE);
 654
 655         return error;
 656 }
 657
 658
 659 static int
 660 devfs_vop_readlink(struct vop_readlink_args *ap)
 661 {
 662         struct devfs_node *node = DEVFS_NODE(ap->a_vp);
 663         int ret;
 664
 665         if (!devfs_node_is_accessible(node))
 666                 return ENOENT;
 667
 668         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 669         ret = uiomove(node->symlink_name, node->symlink_namelen, ap->a_uio);
 670         lockmgr(&devfs_lock, LK_RELEASE);
 671
 672         return ret;
 673 }
 674
 675
 676 static int
 677 devfs_vop_print(struct vop_print_args *ap)
 678 {
 679         return (0);
 680 }
 681
 682 static int
 683 devfs_vop_nmkdir(struct vop_nmkdir_args *ap)
 684 {
 685         struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
 686         struct devfs_node *node;
 687
 688         if (!devfs_node_is_accessible(dnode))
 689                 return ENOENT;
 690
 691         if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
 692                 goto out;
 693
 694         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 695         devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Ndir,
 696                       ap->a_nch->ncp->nc_name, dnode, NULL);
 697
 698         if (*ap->a_vpp) {
 699                 node = DEVFS_NODE(*ap->a_vpp);
 700                 node->flags |= DEVFS_USER_CREATED;
 701                 cache_setunresolved(ap->a_nch);
 702                 cache_setvp(ap->a_nch, *ap->a_vpp);
 703         }
 704         lockmgr(&devfs_lock, LK_RELEASE);
 705 out:
 706         return ((*ap->a_vpp == NULL) ? ENOTDIR : 0);
 707 }
 708
 709 static int
 710 devfs_vop_nsymlink(struct vop_nsymlink_args *ap)
 711 {
 712         struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
 713         struct devfs_node *node;
 714         size_t targetlen;
 715
 716         if (!devfs_node_is_accessible(dnode))
 717                 return ENOENT;
 718
 719         ap->a_vap->va_type = VLNK;
 720
 721         if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
 722                 goto out;
 723
 724         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 725         devfs_allocvp(ap->a_dvp->v_mount, ap->a_vpp, Nlink,
 726                       ap->a_nch->ncp->nc_name, dnode, NULL);
 727
 728         targetlen = strlen(ap->a_target);
 729         if (*ap->a_vpp) {
 730                 node = DEVFS_NODE(*ap->a_vpp);
 731                 node->flags |= DEVFS_USER_CREATED;
 732                 node->symlink_namelen = targetlen;
 733                 node->symlink_name = kmalloc(targetlen + 1, M_DEVFS, M_WAITOK);
 734                 memcpy(node->symlink_name, ap->a_target, targetlen);
 735                 node->symlink_name[targetlen] = '\0';
 736                 cache_setunresolved(ap->a_nch);
 737                 cache_setvp(ap->a_nch, *ap->a_vpp);
 738         }
 739         lockmgr(&devfs_lock, LK_RELEASE);
 740 out:
 741         return ((*ap->a_vpp == NULL) ? ENOTDIR : 0);
 742 }
 743
 744 static int
 745 devfs_vop_nrmdir(struct vop_nrmdir_args *ap)
 746 {
 747         struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
 748         struct devfs_node *node;
 749         struct namecache *ncp;
 750         int error = ENOENT;
 751
 752         ncp = ap->a_nch->ncp;
 753
 754         if (!devfs_node_is_accessible(dnode))
 755                 return ENOENT;
 756
 757         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 758
 759         if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
 760                 goto out;
 761
 762         TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
 763                 if (ncp->nc_nlen != node->d_dir.d_namlen)
 764                         continue;
 765                 if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen))
 766                         continue;
 767
 768                 /*
 769                  * only allow removal of user created dirs
 770                  */
 771                 if ((node->flags & DEVFS_USER_CREATED) == 0) {
 772                         error = EPERM;
 773                         goto out;
 774                 } else if (node->node_type != Ndir) {
 775                         error = ENOTDIR;
 776                         goto out;
 777                 } else if (node->nchildren > 2) {
 778                         error = ENOTEMPTY;
 779                         goto out;
 780                 } else {
 781                         if (node->v_node)
 782                                 cache_inval_vp(node->v_node, CINV_DESTROY);
 783                         devfs_unlinkp(node);
 784                         error = 0;
 785                         break;
 786                 }
 787         }
 788
 789         cache_unlink(ap->a_nch);
 790 out:
 791         lockmgr(&devfs_lock, LK_RELEASE);
 792         return error;
 793 }
 794
 795 static int
 796 devfs_vop_nremove(struct vop_nremove_args *ap)
 797 {
 798         struct devfs_node *dnode = DEVFS_NODE(ap->a_dvp);
 799         struct devfs_node *node;
 800         struct namecache *ncp;
 801         int error = ENOENT;
 802
 803         ncp = ap->a_nch->ncp;
 804
 805         if (!devfs_node_is_accessible(dnode))
 806                 return ENOENT;
 807
 808         lockmgr(&devfs_lock, LK_EXCLUSIVE);
 809
 810         if ((dnode->node_type != Nroot) && (dnode->node_type != Ndir))
 811                 goto out;
 812
 813         TAILQ_FOREACH(node, DEVFS_DENODE_HEAD(dnode), link) {
 814                 if (ncp->nc_nlen != node->d_dir.d_namlen)
 815                         continue;
 816                 if (memcmp(ncp->nc_name, node->d_dir.d_name, ncp->nc_nlen))
 817                         continue;
 818
 819                 /*
 820                  * only allow removal of user created stuff (e.g. symlinks)
 821                  */
 822                 if ((node->flags & DEVFS_USER_CREATED) == 0) {
 823                         error = EPERM;
 824                         goto out;
 825                 } else if (node->node_type == Ndir) {
 826                         error = EISDIR;
 827                         goto out;
 828                 } else {
 829                         if (node->v_node)
 830                                 cache_inval_vp(node->v_node, CINV_DESTROY);
 831                         devfs_unlinkp(node);
 832                         error = 0;
 833                         break;
 834                 }
 835         }
 836
 837         cache_unlink(ap->a_nch);
 838 out:
 839         lockmgr(&devfs_lock, LK_RELEASE);
 840         return error;
 841 }
 842
 843
 844 static int
 845 devfs_spec_open(struct vop_open_args *ap)
 846 {
 847         struct vnode *vp = ap->a_vp;
 848         struct vnode *orig_vp = NULL;
 849         struct devfs_node *node = DEVFS_NODE(vp);
 850         struct devfs_node *newnode;
 851         cdev_t dev, ndev = NULL;
 852         int error = 0;
 853
 854         if (node) {
 855                 if (node->d_dev == NULL)
 856                         return ENXIO;
 857                 if (!devfs_node_is_accessible(node))
 858                         return ENOENT;
 859         }
 860
 861         if ((dev = vp->v_rdev) == NULL)
 862                 return ENXIO;
 863
 864         vn_lock(vp, LK_UPGRADE | LK_RETRY);
 865
 866         if (node && ap->a_fp) {
 867                 int exists;
 868
 869                 devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_open: -1.1-\n");
 870                 lockmgr(&devfs_lock, LK_EXCLUSIVE);
 871
 872                 ndev = devfs_clone(dev, node->d_dir.d_name,
 873                                    node->d_dir.d_namlen,
 874                                    ap->a_mode, ap->a_cred);
 875                 if (ndev != NULL) {
 876                         newnode = devfs_create_device_node(
 877                                         DEVFS_MNTDATA(vp->v_mount)->root_node,
 878                                         ndev, &exists, NULL, NULL);
 879                         /* XXX: possibly destroy device if this happens */
 880
 881                         if (newnode != NULL) {
 882                                 dev = ndev;
 883                                 if (exists == 0)
 884                                         devfs_link_dev(dev);
 885
 886                                 devfs_debug(DEVFS_DEBUG_DEBUG,
 887                                                 "parent here is: %s, node is: |%s|\n",
 888                                                 ((node->parent->node_type == Nroot) ?
 889                                                 "ROOT!" : node->parent->d_dir.d_name),
 890                                                 newnode->d_dir.d_name);
 891                                 devfs_debug(DEVFS_DEBUG_DEBUG,
 892                                                 "test: %s\n",
 893                                                 ((struct devfs_node *)(TAILQ_LAST(DEVFS_DENODE_HEAD(node->parent), devfs_node_head)))->d_dir.d_name);
 894
 895                                 /*
 896                                  * orig_vp is set to the original vp if we
 897                                  * cloned.
 898                                  */
 899                                 /* node->flags |= DEVFS_CLONED; */
 900                                 devfs_allocv(&vp, newnode);
 901                                 orig_vp = ap->a_vp;
 902                                 ap->a_vp = vp;
 903                         }
 904                 }
 905                 lockmgr(&devfs_lock, LK_RELEASE);
 906                 /*
 907                  * Synchronize devfs here to make sure that, if the cloned
 908                  * device creates other device nodes in addition to the
 909                  * cloned one, all of them are created by the time we return
 910                  * from opening the cloned one.
 911                  */
 912                 if (ndev)
 913                         devfs_config();
 914         }
 915
 916         devfs_debug(DEVFS_DEBUG_DEBUG,
 917                     "devfs_spec_open() called on %s! \n",
 918                     dev->si_name);
 919
 920         /*
 921          * Make this field valid before any I/O in ->d_open
 922          */
 923         if (!dev->si_iosize_max)
 924                 /* XXX: old DFLTPHYS == 64KB dependency */
 925                 dev->si_iosize_max = min(MAXPHYS,64*1024);
 926
 927         if (dev_dflags(dev) & D_TTY)
 928                 vsetflags(vp, VISTTY);
 929
 930         /*
 931          * Open underlying device
 932          */
 933         vn_unlock(vp);
 934         error = dev_dopen(dev, ap->a_mode, S_IFCHR, ap->a_cred, ap->a_fp);
 935         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 936
 937         /*
 938          * Clean up any cloned vp if we error out.
 939          */
 940         if (error) {
 941                 if (orig_vp) {
 942                         vput(vp);
 943                         ap->a_vp = orig_vp;
 944                         /* orig_vp = NULL; */
 945                 }
 946                 return error;
 947         }
 948
 949         /*
 950          * This checks if the disk device is going to be opened for writing.
 951          * It will be only allowed in the cases where securelevel permits it
 952          * and it's not mounted R/W.
 953          */
 954         if ((dev_dflags(dev) & D_DISK) && (ap->a_mode & FWRITE) &&
 955             (ap->a_cred != FSCRED)) {
 956
 957                 /* Very secure mode. No open for writing allowed */
 958                 if (securelevel >= 2)
 959                         return EPERM;
 960
 961                 /*
 962                  * If it is mounted R/W, do not allow to open for writing.
 963                  * In the case it's mounted read-only but securelevel
 964                  * is >= 1, then do not allow opening for writing either.
 965                  */
 966                 if (vfs_mountedon(vp)) {
 967                         if (!(dev->si_mountpoint->mnt_flag & MNT_RDONLY))
 968                                 return EBUSY;
 969                         else if (securelevel >= 1)
 970                                 return EPERM;
 971                 }
 972         }
 973
 974         if (dev_dflags(dev) & D_TTY) {
 975                 if (dev->si_tty) {
 976                         struct tty *tp;
 977                         tp = dev->si_tty;
 978                         if (!tp->t_stop) {
 979                                 devfs_debug(DEVFS_DEBUG_DEBUG,
 980                                             "devfs: no t_stop\n");
 981                                 tp->t_stop = nottystop;
 982                         }
 983                 }
 984         }
 985
 986
 987         if (vn_isdisk(vp, NULL)) {
 988                 if (!dev->si_bsize_phys)
 989                         dev->si_bsize_phys = DEV_BSIZE;
 990                 vinitvmio(vp, IDX_TO_OFF(INT_MAX), PAGE_SIZE, -1);
 991         }
 992
 993         vop_stdopen(ap);
 994 #if 0
 995         if (node)
 996                 nanotime(&node->atime);
 997 #endif
 998
 999         /*
1000          * If we replaced the vp the vop_stdopen() call will have loaded
1001          * it into fp->f_data and vref()d the vp, giving us two refs.  So
1002          * instead of just unlocking it here we have to vput() it.
1003          */
1004         if (orig_vp)
1005                 vput(vp);
1006
1007         /* Ugly pty magic, to make pty devices appear once they are opened */
1008         if (node && (node->flags & DEVFS_PTY) == DEVFS_PTY)
1009                 node->flags &= ~DEVFS_INVISIBLE;
1010
1011         if (ap->a_fp) {
1012                 KKASSERT(ap->a_fp->f_type == DTYPE_VNODE);
1013                 KKASSERT((ap->a_fp->f_flag & FMASK) == (ap->a_mode & FMASK));
1014                 ap->a_fp->f_ops = &devfs_dev_fileops;
1015                 KKASSERT(ap->a_fp->f_data == (void *)vp);
1016         }
1017
1018         return 0;
1019 }
1020
/*
 * devfs_spec_close(struct vnode *a_vp, int a_fflag, struct file *a_fp)
 *
 * Vnode close operation for devfs device vnodes.
 *
 * The underlying device is actually closed via dev_dclose() only when the
 * vnode is flagged VRECLAIMED, the device has D_TRACKCLOSE set, or the
 * computed open count is exactly 1.  In every case the vnode's own open
 * accounting is updated through vop_stdclose() while v_opencount is still
 * positive.  If this vnode is the calling process' session terminal and
 * v_opencount is <= 1, the session's s_ttyvp reference is dropped.
 *
 * Returns the dev_dclose() result, or 0 when dev_dclose() was not called.
 */
static int
devfs_spec_close(struct vop_close_args *ap)
{
        struct devfs_node *node;
        struct proc *p = curproc;
        struct vnode *vp = ap->a_vp;
        cdev_t dev = vp->v_rdev;
        int error = 0;
        int needrelock;
        int opencount;

        /*
         * We do special tests on the opencount so unfortunately we need
         * an exclusive lock.
         */
        vn_lock(vp, LK_UPGRADE | LK_RETRY);

        if (dev)
                devfs_debug(DEVFS_DEBUG_DEBUG,
                            "devfs_spec_close() called on %s! \n",
                            dev->si_name);
        else
                devfs_debug(DEVFS_DEBUG_DEBUG,
                            "devfs_spec_close() called, null vode!\n");

        /*
         * A couple of hacks for devices and tty devices.  The
         * vnode ref count cannot be used to figure out the
         * last close, but we can use v_opencount now that
         * revoke works properly.
         *
         * Detect the last close on a controlling terminal and clear
         * the session (half-close).
         *
         * XXX opencount is not SMP safe.  The vnode is locked but there
         *     may be multiple vnodes referencing the same device.
         */
        if (dev) {
                /*
                 * NOTE: Try to avoid global tokens when testing opencount
                 * XXX hack, fixme. needs a struct lock and opencount in
                 * struct cdev itself.
                 */
                /* Hold the device until the matching release_dev() below. */
                reference_dev(dev);
                opencount = vp->v_opencount;
                /*
                 * Only when this vnode is on its last open do we consult
                 * count_dev() for the device-wide count.
                 */
                if (opencount <= 1)
                        opencount = count_dev(dev);   /* XXX NOT SMP SAFE */
        } else {
                opencount = 0;
        }

        /* Drop the session's terminal vnode reference on the last close. */
        if (p && vp->v_opencount <= 1 && vp == p->p_session->s_ttyvp) {
                p->p_session->s_ttyvp = NULL;
                vrele(vp);
        }

        /*
         * Vnodes can be opened and closed multiple times.  Do not really
         * close the device unless (1) it is being closed forcibly,
         * (2) the device wants to track closes, or (3) this is the last
         * vnode doing its last close on the device.
         *
         * XXX the VXLOCK (force close) case can leave vnodes referencing
         * a closed device.  This might not occur now that our revoke is
         * fixed.
         */
        devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -1- \n");
        if (dev && ((vp->v_flag & VRECLAIMED) ||
            (dev_dflags(dev) & D_TRACKCLOSE) ||
            (opencount == 1))) {
                /*
                 * Ugly pty magic, to make pty devices disappear again once
                 * they are closed.
                 */
                node = DEVFS_NODE(ap->a_vp);
                if (node && (node->flags & DEVFS_PTY))
                        node->flags |= DEVFS_INVISIBLE;

                /*
                 * Unlock around dev_dclose(), unless the vnode is
                 * undergoing a vgone/reclaim (during umount).
                 */
                needrelock = 0;
                if ((vp->v_flag & VRECLAIMED) == 0 && vn_islocked(vp)) {
                        needrelock = 1;
                        vn_unlock(vp);
                }

                /*
                 * WARNING!  If the device destroys itself the devfs node
                 *           can disappear here.
                 *
                 * WARNING!  vn_lock() will fail if the vp is in a VRECLAIM,
                 *           which can occur during umount.
                 */
                error = dev_dclose(dev, ap->a_fflag, S_IFCHR, ap->a_fp);
                /* node is now stale */

                /*
                 * Re-take the lock we dropped above; failure to do so is
                 * treated as fatal.
                 */
                if (needrelock) {
                        if (vn_lock(vp, LK_EXCLUSIVE |
                                        LK_RETRY |
                                        LK_FAILRECLAIM) != 0) {
                                panic("devfs_spec_close: vnode %p "
                                      "unexpectedly could not be relocked",
                                      vp);
                        }
                }
        } else {
                error = 0;
        }
        devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_spec_close() -2- \n");

        /*
         * Track the actual opens and closes on the vnode.  The last close
         * disassociates the rdev.  If the rdev is already disassociated or
         * the opencount is already 0, the vnode might have been revoked
         * and no further opencount tracking occurs.
         */
        if (dev)
                release_dev(dev);
        if (vp->v_opencount > 0)
                vop_stdclose(ap);
        return(error);

}
1146
1147
1148 static int
1149 devfs_fo_close(struct file *fp)
1150 {
1151         struct vnode *vp = (struct vnode *)fp->f_data;
1152         int error;
1153
1154         fp->f_ops = &badfileops;
1155         error = vn_close(vp, fp->f_flag, fp);
1156         devfs_clear_cdevpriv(fp);
1157
1158         return (error);
1159 }
1160
1161
1162 /*
1163  * Device-optimized file table vnode read routine.
1164  *
1165  * This bypasses the VOP table and talks directly to the device.  Most
1166  * filesystems just route to specfs and can make this optimization.
1167  */
1168 static int
1169 devfs_fo_read(struct file *fp, struct uio *uio,
1170                  struct ucred *cred, int flags)
1171 {
1172         struct devfs_node *node;
1173         struct vnode *vp;
1174         int ioflag;
1175         int error;
1176         cdev_t dev;
1177
1178         KASSERT(uio->uio_td == curthread,
1179                 ("uio_td %p is not td %p", uio->uio_td, curthread));
1180
1181         if (uio->uio_resid == 0)
1182                 return 0;
1183
1184         vp = (struct vnode *)fp->f_data;
1185         if (vp == NULL || vp->v_type == VBAD)
1186                 return EBADF;
1187
1188         node = DEVFS_NODE(vp);
1189
1190         if ((dev = vp->v_rdev) == NULL)
1191                 return EBADF;
1192
1193         reference_dev(dev);
1194
1195         if ((flags & O_FOFFSET) == 0)
1196                 uio->uio_offset = fp->f_offset;
1197
1198         ioflag = 0;
1199         if (flags & O_FBLOCKING) {
1200                 /* ioflag &= ~IO_NDELAY; */
1201         } else if (flags & O_FNONBLOCKING) {
1202                 ioflag |= IO_NDELAY;
1203         } else if (fp->f_flag & FNONBLOCK) {
1204                 ioflag |= IO_NDELAY;
1205         }
1206         if (fp->f_flag & O_DIRECT) {
1207                 ioflag |= IO_DIRECT;
1208         }
1209         ioflag |= sequential_heuristic(uio, fp);
1210
1211         error = dev_dread(dev, uio, ioflag, fp);
1212
1213         release_dev(dev);
1214         if (node)
1215                 nanotime(&node->atime);
1216         if ((flags & O_FOFFSET) == 0)
1217                 fp->f_offset = uio->uio_offset;
1218         fp->f_nextoff = uio->uio_offset;
1219
1220         return (error);
1221 }
1222
1223
1224 static int
1225 devfs_fo_write(struct file *fp, struct uio *uio,
1226                   struct ucred *cred, int flags)
1227 {
1228         struct devfs_node *node;
1229         struct vnode *vp;
1230         int ioflag;
1231         int error;
1232         cdev_t dev;
1233
1234         KASSERT(uio->uio_td == curthread,
1235                 ("uio_td %p is not p %p", uio->uio_td, curthread));
1236
1237         vp = (struct vnode *)fp->f_data;
1238         if (vp == NULL || vp->v_type == VBAD)
1239                 return EBADF;
1240
1241         node = DEVFS_NODE(vp);
1242
1243         if (vp->v_type == VREG)
1244                 bwillwrite(uio->uio_resid);
1245
1246         vp = (struct vnode *)fp->f_data;
1247
1248         if ((dev = vp->v_rdev) == NULL)
1249                 return EBADF;
1250
1251         reference_dev(dev);
1252
1253         if ((flags & O_FOFFSET) == 0)
1254                 uio->uio_offset = fp->f_offset;
1255
1256         ioflag = IO_UNIT;
1257         if (vp->v_type == VREG &&
1258            ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
1259                 ioflag |= IO_APPEND;
1260         }
1261
1262         if (flags & O_FBLOCKING) {
1263                 /* ioflag &= ~IO_NDELAY; */
1264         } else if (flags & O_FNONBLOCKING) {
1265                 ioflag |= IO_NDELAY;
1266         } else if (fp->f_flag & FNONBLOCK) {
1267                 ioflag |= IO_NDELAY;
1268         }
1269         if (fp->f_flag & O_DIRECT) {
1270                 ioflag |= IO_DIRECT;
1271         }
1272         if (flags & O_FASYNCWRITE) {
1273                 /* ioflag &= ~IO_SYNC; */
1274         } else if (flags & O_FSYNCWRITE) {
1275                 ioflag |= IO_SYNC;
1276         } else if (fp->f_flag & O_FSYNC) {
1277                 ioflag |= IO_SYNC;
1278         }
1279
1280         if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
1281                 ioflag |= IO_SYNC;
1282         ioflag |= sequential_heuristic(uio, fp);
1283
1284         error = dev_dwrite(dev, uio, ioflag, fp);
1285
1286         release_dev(dev);
1287         if (node) {
1288                 nanotime(&node->atime);
1289                 nanotime(&node->mtime);
1290         }
1291
1292         if ((flags & O_FOFFSET) == 0)
1293                 fp->f_offset = uio->uio_offset;
1294         fp->f_nextoff = uio->uio_offset;
1295
1296         return (error);
1297 }
1298
1299
1300 static int
1301 devfs_fo_stat(struct file *fp, struct stat *sb, struct ucred *cred)
1302 {
1303         struct vnode *vp;
1304         struct vattr vattr;
1305         struct vattr *vap;
1306         u_short mode;
1307         cdev_t dev;
1308         int error;
1309
1310         vp = (struct vnode *)fp->f_data;
1311         if (vp == NULL || vp->v_type == VBAD)
1312                 return EBADF;
1313
1314         error = vn_stat(vp, sb, cred);
1315         if (error)
1316                 return (error);
1317
1318         vap = &vattr;
1319         error = VOP_GETATTR(vp, vap);
1320         if (error)
1321                 return (error);
1322
1323         /*
1324          * Zero the spare stat fields
1325          */
1326         sb->st_lspare = 0;
1327         sb->st_qspare1 = 0;
1328         sb->st_qspare2 = 0;
1329
1330         /*
1331          * Copy from vattr table ... or not in case it's a cloned device
1332          */
1333         if (vap->va_fsid != VNOVAL)
1334                 sb->st_dev = vap->va_fsid;
1335         else
1336                 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1337
1338         sb->st_ino = vap->va_fileid;
1339
1340         mode = vap->va_mode;
1341         mode |= S_IFCHR;
1342         sb->st_mode = mode;
1343
1344         if (vap->va_nlink > (nlink_t)-1)
1345                 sb->st_nlink = (nlink_t)-1;
1346         else
1347                 sb->st_nlink = vap->va_nlink;
1348
1349         sb->st_uid = vap->va_uid;
1350         sb->st_gid = vap->va_gid;
1351         sb->st_rdev = dev2udev(DEVFS_NODE(vp)->d_dev);
1352         sb->st_size = vap->va_bytes;
1353         sb->st_atimespec = vap->va_atime;
1354         sb->st_mtimespec = vap->va_mtime;
1355         sb->st_ctimespec = vap->va_ctime;
1356
1357         /*
1358          * A VCHR and VBLK device may track the last access and last modified
1359          * time independantly of the filesystem.  This is particularly true
1360          * because device read and write calls may bypass the filesystem.
1361          */
1362         if (vp->v_type == VCHR || vp->v_type == VBLK) {
1363                 dev = vp->v_rdev;
1364                 if (dev != NULL) {
1365                         if (dev->si_lastread) {
1366                                 sb->st_atimespec.tv_sec = time_second +
1367                                                           (time_uptime -
1368                                                            dev->si_lastread);
1369                                 sb->st_atimespec.tv_nsec = 0;
1370                         }
1371                         if (dev->si_lastwrite) {
1372                                 sb->st_atimespec.tv_sec = time_second +
1373                                                           (time_uptime -
1374                                                            dev->si_lastwrite);
1375                                 sb->st_atimespec.tv_nsec = 0;
1376                         }
1377                 }
1378         }
1379
1380         /*
1381          * According to www.opengroup.org, the meaning of st_blksize is
1382          *   "a filesystem-specific preferred I/O block size for this
1383          *    object.  In some filesystem types, this may vary from file
1384          *    to file"
1385          * Default to PAGE_SIZE after much discussion.
1386          */
1387
1388         sb->st_blksize = PAGE_SIZE;
1389
1390         sb->st_flags = vap->va_flags;
1391
1392         error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
1393         if (error)
1394                 sb->st_gen = 0;
1395         else
1396                 sb->st_gen = (u_int32_t)vap->va_gen;
1397
1398         sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1399
1400         return (0);
1401 }
1402
1403
1404 static int
1405 devfs_fo_kqfilter(struct file *fp, struct knote *kn)
1406 {
1407         struct vnode *vp;
1408         int error;
1409         cdev_t dev;
1410
1411         vp = (struct vnode *)fp->f_data;
1412         if (vp == NULL || vp->v_type == VBAD) {
1413                 error = EBADF;
1414                 goto done;
1415         }
1416         if ((dev = vp->v_rdev) == NULL) {
1417                 error = EBADF;
1418                 goto done;
1419         }
1420         reference_dev(dev);
1421
1422         error = dev_dkqfilter(dev, kn, fp);
1423
1424         release_dev(dev);
1425
1426 done:
1427         return (error);
1428 }
1429
1430 static int
1431 devfs_fo_ioctl(struct file *fp, u_long com, caddr_t data,
1432                   struct ucred *ucred, struct sysmsg *msg)
1433 {
1434 #if 0
1435         struct devfs_node *node;
1436 #endif
1437         struct vnode *vp;
1438         struct vnode *ovp;
1439         cdev_t  dev;
1440         int error;
1441         struct fiodname_args *name_args;
1442         size_t namlen;
1443         const char *name;
1444
1445         vp = ((struct vnode *)fp->f_data);
1446
1447         if ((dev = vp->v_rdev) == NULL)
1448                 return EBADF;           /* device was revoked */
1449
1450         reference_dev(dev);
1451
1452 #if 0
1453         node = DEVFS_NODE(vp);
1454 #endif
1455
1456         devfs_debug(DEVFS_DEBUG_DEBUG,
1457                     "devfs_fo_ioctl() called! for dev %s\n",
1458                     dev->si_name);
1459
1460         if (com == FIODTYPE) {
1461                 *(int *)data = dev_dflags(dev) & D_TYPEMASK;
1462                 error = 0;
1463                 goto out;
1464         } else if (com == FIODNAME) {
1465                 name_args = (struct fiodname_args *)data;
1466                 name = dev->si_name;
1467                 namlen = strlen(name) + 1;
1468
1469                 devfs_debug(DEVFS_DEBUG_DEBUG,
1470                             "ioctl, got: FIODNAME for %s\n", name);
1471
1472                 if (namlen <= name_args->len)
1473                         error = copyout(dev->si_name, name_args->name, namlen);
1474                 else
1475                         error = EINVAL;
1476
1477                 devfs_debug(DEVFS_DEBUG_DEBUG,
1478                             "ioctl stuff: error: %d\n", error);
1479                 goto out;
1480         }
1481
1482         error = dev_dioctl(dev, com, data, fp->f_flag, ucred, msg, fp);
1483
1484 #if 0
1485         if (node) {
1486                 nanotime(&node->atime);
1487                 nanotime(&node->mtime);
1488         }
1489 #endif
1490         if (com == TIOCSCTTY) {
1491                 devfs_debug(DEVFS_DEBUG_DEBUG,
1492                             "devfs_fo_ioctl: got TIOCSCTTY on %s\n",
1493                             dev->si_name);
1494         }
1495         if (error == 0 && com == TIOCSCTTY) {
1496                 struct proc *p = curthread->td_proc;
1497                 struct session *sess;
1498
1499                 devfs_debug(DEVFS_DEBUG_DEBUG,
1500                             "devfs_fo_ioctl: dealing with TIOCSCTTY on %s\n",
1501                             dev->si_name);
1502                 if (p == NULL) {
1503                         error = ENOTTY;
1504                         goto out;
1505                 }
1506                 sess = p->p_session;
1507
1508                 /*
1509                  * Do nothing if reassigning same control tty
1510                  */
1511                 if (sess->s_ttyvp == vp) {
1512                         error = 0;
1513                         goto out;
1514                 }
1515
1516                 /*
1517                  * Get rid of reference to old control tty
1518                  */
1519                 ovp = sess->s_ttyvp;
1520                 vref(vp);
1521                 sess->s_ttyvp = vp;
1522                 if (ovp)
1523                         vrele(ovp);
1524         }
1525
1526 out:
1527         release_dev(dev);
1528         devfs_debug(DEVFS_DEBUG_DEBUG, "devfs_fo_ioctl() finished! \n");
1529         return (error);
1530 }
1531
1532
1533 static int
1534 devfs_spec_fsync(struct vop_fsync_args *ap)
1535 {
1536         struct vnode *vp = ap->a_vp;
1537         int error;
1538
1539         if (!vn_isdisk(vp, NULL))
1540                 return (0);
1541
1542         /*
1543          * Flush all dirty buffers associated with a block device.
1544          */
1545         error = vfsync(vp, ap->a_waitfor, 10000, NULL, NULL);
1546         return (error);
1547 }
1548
1549 static int
1550 devfs_spec_read(struct vop_read_args *ap)
1551 {
1552         struct devfs_node *node;
1553         struct vnode *vp;
1554         struct uio *uio;
1555         cdev_t dev;
1556         int error;
1557
1558         vp = ap->a_vp;
1559         dev = vp->v_rdev;
1560         uio = ap->a_uio;
1561         node = DEVFS_NODE(vp);
1562
1563         if (dev == NULL)                /* device was revoked */
1564                 return (EBADF);
1565         if (uio->uio_resid == 0)
1566                 return (0);
1567
1568         vn_unlock(vp);
1569         error = dev_dread(dev, uio, ap->a_ioflag, NULL);
1570         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1571
1572         if (node)
1573                 nanotime(&node->atime);
1574
1575         return (error);
1576 }
1577
1578 /*
1579  * Vnode op for write
1580  *
1581  * spec_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
1582  *            struct ucred *a_cred)
1583  */
1584 static int
1585 devfs_spec_write(struct vop_write_args *ap)
1586 {
1587         struct devfs_node *node;
1588         struct vnode *vp;
1589         struct uio *uio;
1590         cdev_t dev;
1591         int error;
1592
1593         vp = ap->a_vp;
1594         dev = vp->v_rdev;
1595         uio = ap->a_uio;
1596         node = DEVFS_NODE(vp);
1597
1598         KKASSERT(uio->uio_segflg != UIO_NOCOPY);
1599
1600         if (dev == NULL)                /* device was revoked */
1601                 return (EBADF);
1602
1603         vn_unlock(vp);
1604         error = dev_dwrite(dev, uio, ap->a_ioflag, NULL);
1605         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1606
1607         if (node) {
1608                 nanotime(&node->atime);
1609                 nanotime(&node->mtime);
1610         }
1611
1612         return (error);
1613 }
1614
1615 /*
1616  * Device ioctl operation.
1617  *
1618  * spec_ioctl(struct vnode *a_vp, int a_command, caddr_t a_data,
1619  *            int a_fflag, struct ucred *a_cred, struct sysmsg *msg)
1620  */
1621 static int
1622 devfs_spec_ioctl(struct vop_ioctl_args *ap)
1623 {
1624         struct vnode *vp = ap->a_vp;
1625 #if 0
1626         struct devfs_node *node;
1627 #endif
1628         cdev_t dev;
1629
1630         if ((dev = vp->v_rdev) == NULL)
1631                 return (EBADF);         /* device was revoked */
1632 #if 0
1633         node = DEVFS_NODE(vp);
1634
1635         if (node) {
1636                 nanotime(&node->atime);
1637                 nanotime(&node->mtime);
1638         }
1639 #endif
1640
1641         return (dev_dioctl(dev, ap->a_command, ap->a_data, ap->a_fflag,
1642                            ap->a_cred, ap->a_sysmsg, NULL));
1643 }
1644
1645 /*
1646  * spec_kqfilter(struct vnode *a_vp, struct knote *a_kn)
1647  */
1648 /* ARGSUSED */
1649 static int
1650 devfs_spec_kqfilter(struct vop_kqfilter_args *ap)
1651 {
1652         struct vnode *vp = ap->a_vp;
1653 #if 0
1654         struct devfs_node *node;
1655 #endif
1656         cdev_t dev;
1657
1658         if ((dev = vp->v_rdev) == NULL)
1659                 return (EBADF);         /* device was revoked (EBADF) */
1660 #if 0
1661         node = DEVFS_NODE(vp);
1662
1663         if (node)
1664                 nanotime(&node->atime);
1665 #endif
1666
1667         return (dev_dkqfilter(dev, ap->a_kn, NULL));
1668 }
1669
/*
 * Convert a vnode strategy call into a device strategy call.  Vnode strategy
 * calls are not limited to device DMA limits so we have to deal with the
 * case.
 *
 * spec_strategy(struct vnode *a_vp, struct bio *a_bio)
 *
 * Requests that fit within the device's si_iosize_max, and requests that
 * are neither reads nor writes, are passed straight through with
 * dev_dstrategy_chain().  Larger reads/writes are issued through a freshly
 * allocated shadow buf, one chunk at a time; devfs_spec_strategy_done() is
 * installed as the completion callback of that shadow buf and receives the
 * original bio via bio_caller_info1 and the chunk size via
 * bio_caller_info2.
 *
 * Always returns 0.
 */
static int
devfs_spec_strategy(struct vop_strategy_args *ap)
{
        struct bio *bio = ap->a_bio;
        struct buf *bp = bio->bio_buf;
        struct buf *nbp;
        struct vnode *vp;
        struct mount *mp;
        int chunksize;
        int maxiosize;

        /* Non-read buffers that carry dependencies get buf_start() first. */
        if (bp->b_cmd != BUF_CMD_READ && LIST_FIRST(&bp->b_dep) != NULL)
                buf_start(bp);

        /*
         * Collect statistics on synchronous and asynchronous read
         * and write counts for disks that have associated filesystems.
         */
        vp = ap->a_vp;
        KKASSERT(vp->v_rdev != NULL);   /* XXX */
        if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) {
                if (bp->b_cmd == BUF_CMD_READ) {
                        if (bp->b_flags & BIO_SYNC)
                                mp->mnt_stat.f_syncreads++;
                        else
                                mp->mnt_stat.f_asyncreads++;
                } else {
                        if (bp->b_flags & BIO_SYNC)
                                mp->mnt_stat.f_syncwrites++;
                        else
                                mp->mnt_stat.f_asyncwrites++;
                }
        }

        /*
         * Device iosize limitations only apply to read and write.  Shortcut
         * the I/O if it fits.
         */
        if ((maxiosize = vp->v_rdev->si_iosize_max) == 0) {
                devfs_debug(DEVFS_DEBUG_DEBUG,
                            "%s: si_iosize_max not set!\n",
                            dev_dname(vp->v_rdev));
                maxiosize = MAXPHYS;
        }
#if SPEC_CHAIN_DEBUG & 2
        maxiosize = 4096;
#endif
        if (bp->b_bcount <= maxiosize ||
            (bp->b_cmd != BUF_CMD_READ && bp->b_cmd != BUF_CMD_WRITE)) {
                dev_dstrategy_chain(vp->v_rdev, bio);
                return (0);
        }

        /*
         * Clone the buffer and set up an I/O chain to chunk up the I/O.
         *
         * The shadow buf shares the original's data pointer and starts at
         * the original bio's offset.
         */
        nbp = kmalloc(sizeof(*bp), M_DEVBUF, M_INTWAIT|M_ZERO);
        initbufbio(nbp);
        buf_dep_init(nbp);
        BUF_LOCK(nbp, LK_EXCLUSIVE);
        BUF_KERNPROC(nbp);
        nbp->b_vp = vp;
        nbp->b_flags = B_PAGING | (bp->b_flags & B_BNOCLIP);
        nbp->b_data = bp->b_data;
        nbp->b_bio1.bio_done = devfs_spec_strategy_done;
        nbp->b_bio1.bio_offset = bio->bio_offset;
        nbp->b_bio1.bio_caller_info1.ptr = bio;

        /*
         * Start the first transfer
         *
         * The chunk size is maxiosize rounded down to a whole number of
         * blocks (si_bsize_phys for disks, DEV_BSIZE otherwise).
         *
         * NOTE(review): si_bsize_phys is used as a divisor here;
         * devfs_spec_open() defaults it to DEV_BSIZE for disks, but
         * confirm it cannot be 0 (or larger than maxiosize, which would
         * yield a zero chunk) on other paths.
         */
        if (vn_isdisk(vp, NULL))
                chunksize = vp->v_rdev->si_bsize_phys;
        else
                chunksize = DEV_BSIZE;
        chunksize = maxiosize / chunksize * chunksize;
#if SPEC_CHAIN_DEBUG & 1
        devfs_debug(DEVFS_DEBUG_DEBUG,
                    "spec_strategy chained I/O chunksize=%d\n",
                    chunksize);
#endif
        nbp->b_cmd = bp->b_cmd;
        nbp->b_bcount = chunksize;
        nbp->b_bufsize = chunksize;     /* used to detect a short I/O */
        nbp->b_bio1.bio_caller_info2.index = chunksize;

#if SPEC_CHAIN_DEBUG & 1
        devfs_debug(DEVFS_DEBUG_DEBUG,
                    "spec_strategy: chain %p offset %d/%d bcount %d\n",
                    bp, 0, bp->b_bcount, nbp->b_bcount);
#endif

        dev_dstrategy(vp->v_rdev, &nbp->b_bio1);

        /*
         * Timestamps are refreshed on this chained path only; the
         * pass-through path above returns before reaching this point.
         */
        if (DEVFS_NODE(vp)) {
                nanotime(&DEVFS_NODE(vp)->atime);
                nanotime(&DEVFS_NODE(vp)->mtime);
        }

        return (0);
}
1778
1779 /*
1780  * Chunked up transfer completion routine - chain transfers until done
1781  *
1782  * NOTE: MPSAFE callback.
1783  */
1784 static
1785 void
1786 devfs_spec_strategy_done(struct bio *nbio)
1787 {
1788         struct buf *nbp = nbio->bio_buf;
1789         struct bio *bio = nbio->bio_caller_info1.ptr;   /* original bio */
1790         struct buf *bp = bio->bio_buf;                  /* original bp */
1791         int chunksize = nbio->bio_caller_info2.index;   /* chunking */
1792         int boffset = nbp->b_data - bp->b_data;
1793
1794         if (nbp->b_flags & B_ERROR) {
1795                 /*
1796                  * An error terminates the chain, propogate the error back
1797                  * to the original bp
1798                  */
1799                 bp->b_flags |= B_ERROR;
1800                 bp->b_error = nbp->b_error;
1801                 bp->b_resid = bp->b_bcount - boffset +
1802                               (nbp->b_bcount - nbp->b_resid);
1803 #if SPEC_CHAIN_DEBUG & 1
1804                 devfs_debug(DEVFS_DEBUG_DEBUG,
1805                             "spec_strategy: chain %p error %d bcount %d/%d\n",
1806                             bp, bp->b_error, bp->b_bcount,
1807                             bp->b_bcount - bp->b_resid);
1808 #endif
1809         } else if (nbp->b_resid) {
1810                 /*
1811                  * A short read or write terminates the chain
1812                  */
1813                 bp->b_error = nbp->b_error;
1814                 bp->b_resid = bp->b_bcount - boffset +
1815                               (nbp->b_bcount - nbp->b_resid);
1816 #if SPEC_CHAIN_DEBUG & 1
1817                 devfs_debug(DEVFS_DEBUG_DEBUG,
1818                             "spec_strategy: chain %p short read(1) "
1819                             "bcount %d/%d\n",
1820                             bp, bp->b_bcount - bp->b_resid, bp->b_bcount);
1821 #endif
1822         } else if (nbp->b_bcount != nbp->b_bufsize) {
1823                 /*
1824                  * A short read or write can also occur by truncating b_bcount
1825                  */
1826 #if SPEC_CHAIN_DEBUG & 1
1827                 devfs_debug(DEVFS_DEBUG_DEBUG,
1828                             "spec_strategy: chain %p short read(2) "
1829                             "bcount %d/%d\n",
1830                             bp, nbp->b_bcount + boffset, bp->b_bcount);
1831 #endif
1832                 bp->b_error = 0;
1833                 bp->b_bcount = nbp->b_bcount + boffset;
1834                 bp->b_resid = nbp->b_resid;
1835         } else if (nbp->b_bcount + boffset == bp->b_bcount) {
1836                 /*
1837                  * No more data terminates the chain
1838                  */
1839 #if SPEC_CHAIN_DEBUG & 1
1840                 devfs_debug(DEVFS_DEBUG_DEBUG,
1841                             "spec_strategy: chain %p finished bcount %d\n",
1842                             bp, bp->b_bcount);
1843 #endif
1844                 bp->b_error = 0;
1845                 bp->b_resid = 0;
1846         } else {
1847                 /*
1848                  * Continue the chain
1849                  */
1850                 boffset += nbp->b_bcount;
1851                 nbp->b_data = bp->b_data + boffset;
1852                 nbp->b_bcount = bp->b_bcount - boffset;
1853                 if (nbp->b_bcount > chunksize)
1854                         nbp->b_bcount = chunksize;
1855                 nbp->b_bio1.bio_done = devfs_spec_strategy_done;
1856                 nbp->b_bio1.bio_offset = bio->bio_offset + boffset;
1857
1858 #if SPEC_CHAIN_DEBUG & 1
1859                 devfs_debug(DEVFS_DEBUG_DEBUG,
1860                             "spec_strategy: chain %p offset %d/%d bcount %d\n",
1861                             bp, boffset, bp->b_bcount, nbp->b_bcount);
1862 #endif
1863
1864                 dev_dstrategy(nbp->b_vp->v_rdev, &nbp->b_bio1);
1865                 return;
1866         }
1867
1868         /*
1869          * Fall through to here on termination.  biodone(bp) and
1870          * clean up and free nbp.
1871          */
1872         biodone(bio);
1873         BUF_UNLOCK(nbp);
1874         uninitbufbio(nbp);
1875         kfree(nbp, M_DEVBUF);
1876 }
1877
1878 /*
1879  * spec_freeblks(struct vnode *a_vp, daddr_t a_addr, daddr_t a_length)
1880  */
1881 static int
1882 devfs_spec_freeblks(struct vop_freeblks_args *ap)
1883 {
1884         struct buf *bp;
1885
1886         /*
1887          * Must be a synchronous operation
1888          */
1889         KKASSERT(ap->a_vp->v_rdev != NULL);
1890         if ((ap->a_vp->v_rdev->si_flags & SI_CANFREE) == 0)
1891                 return (0);
1892         bp = geteblk(ap->a_length);
1893         bp->b_cmd = BUF_CMD_FREEBLKS;
1894         bp->b_bio1.bio_flags |= BIO_SYNC;
1895         bp->b_bio1.bio_offset = ap->a_offset;
1896         bp->b_bio1.bio_done = biodone_sync;
1897         bp->b_bcount = ap->a_length;
1898         dev_dstrategy(ap->a_vp->v_rdev, &bp->b_bio1);
1899         biowait(&bp->b_bio1, "TRIM");
1900         brelse(bp);
1901
1902         return (0);
1903 }
1904
1905 /*
1906  * Implement degenerate case where the block requested is the block
1907  * returned, and assume that the entire device is contiguous in regards
1908  * to the contiguous block range (runp and runb).
1909  *
1910  * spec_bmap(struct vnode *a_vp, off_t a_loffset,
1911  *           off_t *a_doffsetp, int *a_runp, int *a_runb)
1912  */
1913 static int
1914 devfs_spec_bmap(struct vop_bmap_args *ap)
1915 {
1916         if (ap->a_doffsetp != NULL)
1917                 *ap->a_doffsetp = ap->a_loffset;
1918         if (ap->a_runp != NULL)
1919                 *ap->a_runp = MAXBSIZE;
1920         if (ap->a_runb != NULL) {
1921                 if (ap->a_loffset < MAXBSIZE)
1922                         *ap->a_runb = (int)ap->a_loffset;
1923                 else
1924                         *ap->a_runb = MAXBSIZE;
1925         }
1926         return (0);
1927 }
1928
1929
1930 /*
1931  * Special device advisory byte-level locks.
1932  *
1933  * spec_advlock(struct vnode *a_vp, caddr_t a_id, int a_op,
1934  *              struct flock *a_fl, int a_flags)
1935  */
1936 /* ARGSUSED */
1937 static int
1938 devfs_spec_advlock(struct vop_advlock_args *ap)
1939 {
1940         return ((ap->a_flags & F_POSIX) ? EINVAL : EOPNOTSUPP);
1941 }
1942
1943 /*
1944  * NOTE: MPSAFE callback.
1945  */
1946 static void
1947 devfs_spec_getpages_iodone(struct bio *bio)
1948 {
1949         bio->bio_buf->b_cmd = BUF_CMD_DONE;
1950         wakeup(bio->bio_buf);
1951 }
1952
1953 /*
1954  * spec_getpages() - get pages associated with device vnode.
1955  *
1956  * Note that spec_read and spec_write do not use the buffer cache, so we
1957  * must fully implement getpages here.
1958  */
1959 static int
1960 devfs_spec_getpages(struct vop_getpages_args *ap)
1961 {
1962         vm_offset_t kva;
1963         int error;
1964         int i, pcount, size;
1965         struct buf *bp;
1966         vm_page_t m;
1967         vm_ooffset_t offset;
1968         int toff, nextoff, nread;
1969         struct vnode *vp = ap->a_vp;
1970         int blksiz;
1971         int gotreqpage;
1972
1973         error = 0;
1974         pcount = round_page(ap->a_count) / PAGE_SIZE;
1975
1976         /*
1977          * Calculate the offset of the transfer and do sanity check.
1978          */
1979         offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;
1980
1981         /*
1982          * Round up physical size for real devices.  We cannot round using
1983          * v_mount's block size data because v_mount has nothing to do with
1984          * the device.  i.e. it's usually '/dev'.  We need the physical block
1985          * size for the device itself.
1986          *
1987          * We can't use v_rdev->si_mountpoint because it only exists when the
1988          * block device is mounted.  However, we can use v_rdev.
1989          */
1990         if (vn_isdisk(vp, NULL))
1991                 blksiz = vp->v_rdev->si_bsize_phys;
1992         else
1993                 blksiz = DEV_BSIZE;
1994
1995         size = roundup2(ap->a_count, blksiz);
1996
1997         bp = getpbuf_kva(NULL);
1998         kva = (vm_offset_t)bp->b_data;
1999
2000         /*
2001          * Map the pages to be read into the kva.
2002          */
2003         pmap_qenter(kva, ap->a_m, pcount);
2004
2005         /* Build a minimal buffer header. */
2006         bp->b_cmd = BUF_CMD_READ;
2007         bp->b_bcount = size;
2008         bp->b_resid = 0;
2009         bsetrunningbufspace(bp, size);
2010
2011         bp->b_bio1.bio_offset = offset;
2012         bp->b_bio1.bio_done = devfs_spec_getpages_iodone;
2013
2014         mycpu->gd_cnt.v_vnodein++;
2015         mycpu->gd_cnt.v_vnodepgsin += pcount;
2016
2017         /* Do the input. */
2018         vn_strategy(ap->a_vp, &bp->b_bio1);
2019
2020         crit_enter();
2021
2022         /* We definitely need to be at splbio here. */
2023         while (bp->b_cmd != BUF_CMD_DONE)
2024                 tsleep(bp, 0, "spread", 0);
2025
2026         crit_exit();
2027
2028         if (bp->b_flags & B_ERROR) {
2029                 if (bp->b_error)
2030                         error = bp->b_error;
2031                 else
2032                         error = EIO;
2033         }
2034
2035         /*
2036          * If EOF is encountered we must zero-extend the result in order
2037          * to ensure that the page does not contain garabge.  When no
2038          * error occurs, an early EOF is indicated if b_bcount got truncated.
2039          * b_resid is relative to b_bcount and should be 0, but some devices
2040          * might indicate an EOF with b_resid instead of truncating b_bcount.
2041          */
2042         nread = bp->b_bcount - bp->b_resid;
2043         if (nread < ap->a_count)
2044                 bzero((caddr_t)kva + nread, ap->a_count - nread);
2045         pmap_qremove(kva, pcount);
2046
2047         gotreqpage = 0;
2048         for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
2049                 nextoff = toff + PAGE_SIZE;
2050                 m = ap->a_m[i];
2051
2052                 /*
2053                  * NOTE: vm_page_undirty/clear_dirty etc do not clear the
2054                  *       pmap modified bit.  pmap modified bit should have
2055                  *       already been cleared.
2056                  */
2057                 if (nextoff <= nread) {
2058                         m->valid = VM_PAGE_BITS_ALL;
2059                         vm_page_undirty(m);
2060                 } else if (toff < nread) {
2061                         /*
2062                          * Since this is a VM request, we have to supply the
2063                          * unaligned offset to allow vm_page_set_valid()
2064                          * to zero sub-DEV_BSIZE'd portions of the page.
2065                          */
2066                         vm_page_set_valid(m, 0, nread - toff);
2067                         vm_page_clear_dirty_end_nonincl(m, 0, nread - toff);
2068                 } else {
2069                         m->valid = 0;
2070                         vm_page_undirty(m);
2071                 }
2072
2073                 if (i != ap->a_reqpage) {
2074                         /*
2075                          * Just in case someone was asking for this page we
2076                          * now tell them that it is ok to use.
2077                          */
2078                         if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
2079                                 if (m->valid) {
2080                                         if (m->flags & PG_REFERENCED) {
2081                                                 vm_page_activate(m);
2082                                         } else {
2083                                                 vm_page_deactivate(m);
2084                                         }
2085                                         vm_page_wakeup(m);
2086                                 } else {
2087                                         vm_page_free(m);
2088                                 }
2089                         } else {
2090                                 vm_page_free(m);
2091                         }
2092                 } else if (m->valid) {
2093                         gotreqpage = 1;
2094                         /*
2095                          * Since this is a VM request, we need to make the
2096                          * entire page presentable by zeroing invalid sections.
2097                          */
2098                         if (m->valid != VM_PAGE_BITS_ALL)
2099                             vm_page_zero_invalid(m, FALSE);
2100                 }
2101         }
2102         if (!gotreqpage) {
2103                 m = ap->a_m[ap->a_reqpage];
2104                 devfs_debug(DEVFS_DEBUG_WARNING,
2105             "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n",
2106                         devtoname(vp->v_rdev), error, bp, bp->b_vp);
2107                 devfs_debug(DEVFS_DEBUG_WARNING,
2108             "               size: %d, resid: %d, a_count: %d, valid: 0x%x\n",
2109                     size, bp->b_resid, ap->a_count, m->valid);
2110                 devfs_debug(DEVFS_DEBUG_WARNING,
2111             "               nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n",
2112                     nread, ap->a_reqpage, (u_long)m->pindex, pcount);
2113                 /*
2114                  * Free the buffer header back to the swap buffer pool.
2115                  */
2116                 relpbuf(bp, NULL);
2117                 return VM_PAGER_ERROR;
2118         }
2119         /*
2120          * Free the buffer header back to the swap buffer pool.
2121          */
2122         relpbuf(bp, NULL);
2123         if (DEVFS_NODE(ap->a_vp))
2124                 nanotime(&DEVFS_NODE(ap->a_vp)->mtime);
2125         return VM_PAGER_OK;
2126 }
2127
2128 static __inline
2129 int
2130 sequential_heuristic(struct uio *uio, struct file *fp)
2131 {
2132         /*
2133          * Sequential heuristic - detect sequential operation
2134          */
2135         if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
2136             uio->uio_offset == fp->f_nextoff) {
2137                 /*
2138                  * XXX we assume that the filesystem block size is
2139                  * the default.  Not true, but still gives us a pretty
2140                  * good indicator of how sequential the read operations
2141                  * are.
2142                  */
2143                 int tmpseq = fp->f_seqcount;
2144
2145                 tmpseq += (uio->uio_resid + MAXBSIZE - 1) / MAXBSIZE;
2146                 if (tmpseq > IO_SEQMAX)
2147                         tmpseq = IO_SEQMAX;
2148                 fp->f_seqcount = tmpseq;
2149                 return(fp->f_seqcount << IO_SEQSHIFT);
2150         }
2151
2152         /*
2153          * Not sequential, quick draw-down of seqcount
2154          */
2155         if (fp->f_seqcount > 1)
2156                 fp->f_seqcount = 1;
2157         else
2158                 fp->f_seqcount = 0;
2159         return(0);
2160 }