kernel/fs/hsfs/hsfs_vnops.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2017 by Delphix. All rights reserved.
30 * Vnode operations for the High Sierra filesystem
33 #include <sys/types.h>
34 #include <sys/t_lock.h>
35 #include <sys/param.h>
36 #include <sys/time.h>
37 #include <sys/systm.h>
38 #include <sys/sysmacros.h>
39 #include <sys/resource.h>
40 #include <sys/signal.h>
41 #include <sys/cred.h>
42 #include <sys/user.h>
43 #include <sys/buf.h>
44 #include <sys/vfs.h>
45 #include <sys/stat.h>
46 #include <sys/vnode.h>
47 #include <sys/mode.h>
48 #include <sys/proc.h>
49 #include <sys/disp.h>
50 #include <sys/file.h>
51 #include <sys/fcntl.h>
52 #include <sys/flock.h>
53 #include <sys/kmem.h>
54 #include <sys/uio.h>
55 #include <sys/conf.h>
56 #include <sys/errno.h>
57 #include <sys/mman.h>
58 #include <sys/pathname.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/cmn_err.h>
62 #include <sys/fbuf.h>
63 #include <sys/dirent.h>
64 #include <sys/errno.h>
65 #include <sys/dkio.h>
66 #include <sys/cmn_err.h>
67 #include <sys/atomic.h>
69 #include <vm/hat.h>
70 #include <vm/page.h>
71 #include <vm/pvn.h>
72 #include <vm/as.h>
73 #include <vm/seg.h>
74 #include <vm/seg_map.h>
75 #include <vm/seg_kmem.h>
76 #include <vm/seg_vn.h>
77 #include <vm/rm.h>
78 #include <vm/page.h>
79 #include <sys/swap.h>
80 #include <sys/avl.h>
81 #include <sys/sunldi.h>
82 #include <sys/ddi.h>
83 #include <sys/sunddi.h>
84 #include <sys/sdt.h>
87 * For struct modlinkage
89 #include <sys/modctl.h>
91 #include <sys/fs/hsfs_spec.h>
92 #include <sys/fs/hsfs_node.h>
93 #include <sys/fs/hsfs_impl.h>
94 #include <sys/fs/hsfs_susp.h>
95 #include <sys/fs/hsfs_rrip.h>
97 #include <sys/fs_subr.h>
99 /* # of contiguous requests to detect sequential access pattern */
100 static int seq_contig_requests = 2;
103 * This is the max number of taskq threads that will be created
104 * if required. Since we are using a Dynamic TaskQ, by default only
105 * one thread is created initially.
107 * NOTE: In the usual hsfs use case this per fs instance number
108 * of taskq threads should not place any undue load on a system.
109 * Even on an unusual system with say 100 CDROM drives, 800 threads
110 * will not be created unless all the drives are loaded and all
111 * of them are saturated with I/O at the same time! If there is at
112 * all a complaint of system load due to such an unusual case it
113 * should be easy enough to change to one per-machine Dynamic TaskQ
114 * for all hsfs mounts with a nthreads of say 32.
116 static int hsfs_taskq_nthreads = 8; /* # of taskq threads per fs */
118 /* Min count of adjacent bufs that will avoid buf coalescing */
119 static int hsched_coalesce_min = 2;
122 * Kmem caches for heavily used small allocations. Using these kmem
123 * caches provides a factor of 3 reduction in system time and greatly
124 * aids overall throughput esp. on SPARC.
126 struct kmem_cache *hio_cache;
127 struct kmem_cache *hio_info_cache;
130 * This tunable allows us to ignore inode numbers from rrip-1.12.
131 * In this case, we fall back to our default inode algorithm.
133 extern int use_rrip_inodes;
136 * Free behind logic from UFS to tame our thirst for
137 * the page cache.
138 * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
139 * explanation.
141 static int freebehind = 1;
142 static int smallfile = 0;
143 static int cache_read_ahead = 0;
144 static uoff_t smallfile64 = 32 * 1024;
145 #define SMALLFILE1_D 1000
146 #define SMALLFILE2_D 10
147 static uoff_t smallfile1 = 32 * 1024;
148 static uoff_t smallfile2 = 32 * 1024;
149 static clock_t smallfile_update = 0; /* when to recompute */
150 static uint_t smallfile1_d = SMALLFILE1_D;
151 static uint_t smallfile2_d = SMALLFILE2_D;
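/*
 * Illustrative annotation (not from the original source; the numbers are
 * hypothetical): with 2 GB of free memory spread across 4 online CPUs,
 * percpufreeb in hsfs_read() comes to 512 MB, so smallfile1 becomes
 * 512 MB / SMALLFILE1_D (1000) = ~512 KB and smallfile2 becomes
 * 512 MB / SMALLFILE2_D (10) = ~51 MB, each clamped to at least 32 KB.
 * A strictly sequential reader with read-ahead active is then eligible
 * for free-behind (SM_FREE | SM_ASYNC), and once its offset passes
 * smallfile2 the released pages are also marked SM_DONTNEED.
 */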
153 static int hsched_deadline_compare(const void *x1, const void *x2);
154 static int hsched_offset_compare(const void *x1, const void *x2);
155 static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
156 int hsched_invoke_strategy(struct hsfs *fsp);
158 /* ARGSUSED */
159 static int
160 hsfs_fsync(vnode_t *cp, int syncflag, cred_t *cred, caller_context_t *ct)
162 return (0);
166 /*ARGSUSED*/
167 static int
168 hsfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
169 struct caller_context *ct)
171 caddr_t base;
172 offset_t diff;
173 int error;
174 struct hsnode *hp;
175 uint_t filesize;
176 int dofree;
178 hp = VTOH(vp);
180 * if vp is of type VDIR, make sure dirent
181 * is filled up with all info (because of ptbl)
183 if (vp->v_type == VDIR) {
184 if (hp->hs_dirent.ext_size == 0)
185 hs_filldirent(vp, &hp->hs_dirent);
187 filesize = hp->hs_dirent.ext_size;
189 /* Sanity checks. */
190 if (uiop->uio_resid == 0 || /* No data wanted. */
191 uiop->uio_loffset > HS_MAXFILEOFF || /* Offset too big. */
192 uiop->uio_loffset >= filesize) /* Past EOF. */
193 return (0);
195 do {
197 * We want to ask for only the "right" amount of data.
198 * In this case that means:-
200 * We can't get data from beyond our EOF. If asked,
201 * we will give a short read.
203 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
204 * These buffers are always MAXBSIZE aligned.
205 * If our starting offset is not MAXBSIZE aligned,
206 * we can only ask for less than MAXBSIZE bytes.
208 * If our requested offset and length are such that
209 * they belong in different MAXBSIZE aligned slots
210 * then we'll be making more than one call on
211 * segmap_getmapflt.
213 * This diagram shows the variables we use and their
214 * relationships.
216 * |<-----MAXBSIZE----->|
217 * +--------------------------...+
218 * |.....mapon->|<--n-->|....*...|EOF
219 * +--------------------------...+
220 * uio_loffset->|
221 * uio_resid....|<---------->|
222 * diff.........|<-------------->|
224 * So, in this case our offset is not aligned
225 * and our request takes us outside of the
226 * MAXBSIZE window. We will break this up into
227 * two segmap_getmapflt calls.
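/*
 * Worked example (annotation, not part of the original comment), assuming
 * MAXBSIZE is 8192: for uio_loffset = 10000, uio_resid = 16384 and
 * filesize = 20000, mapon = 10000 & MAXBOFFSET = 1808,
 * nbytes = MIN(8192 - 1808, 16384) = 6384 and diff = 10000, so n = 6384
 * and the first pass stops at the next MAXBSIZE boundary (offset 16384).
 * The second pass has mapon = 0 and diff = 3616, so it reads the
 * remaining 3616 bytes up to EOF; the following pass sees n == 0 and
 * returns.
 */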
229 size_t nbytes;
230 offset_t mapon;
231 size_t n;
232 uint_t flags;
234 mapon = uiop->uio_loffset & MAXBOFFSET;
235 diff = filesize - uiop->uio_loffset;
236 nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
237 n = MIN(diff, nbytes);
238 if (n <= 0) {
239 /* EOF or request satisfied. */
240 return (0);
244 * Freebehind computation taken from:
245 * usr/src/uts/common/fs/ufs/ufs_vnops.c
247 if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
248 uint64_t percpufreeb;
249 if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
250 if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
251 percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
252 smallfile1 = percpufreeb / smallfile1_d;
253 smallfile2 = percpufreeb / smallfile2_d;
254 smallfile1 = MAX(smallfile1, smallfile);
255 smallfile1 = MAX(smallfile1, smallfile64);
256 smallfile2 = MAX(smallfile1, smallfile2);
257 smallfile_update = drv_hztousec(ddi_get_lbolt())
258 + 1000000;
261 dofree = freebehind &&
262 hp->hs_prev_offset == uiop->uio_loffset &&
263 hp->hs_ra_bytes > 0;
265 base = segmap_getmapflt(segkmap, vp,
266 (uoff_t)uiop->uio_loffset, n, 1, S_READ);
268 error = uiomove(base + mapon, n, UIO_READ, uiop);
270 if (error == 0) {
272 * if read a whole block, or read to eof,
273 * won't need this buffer again soon.
275 if (n + mapon == MAXBSIZE ||
276 uiop->uio_loffset == filesize)
277 flags = SM_DONTNEED;
278 else
279 flags = 0;
281 if (dofree) {
282 flags = SM_FREE | SM_ASYNC;
283 if ((cache_read_ahead == 0) &&
284 uiop->uio_loffset > smallfile2)
285 flags |= SM_DONTNEED;
288 error = segmap_release(segkmap, base, flags);
289 } else
290 (void) segmap_release(segkmap, base, 0);
291 } while (error == 0 && uiop->uio_resid > 0);
293 return (error);
296 /*ARGSUSED2*/
297 static int
298 hsfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
299 caller_context_t *ct)
301 struct hsnode *hp;
302 struct vfs *vfsp;
303 struct hsfs *fsp;
305 hp = VTOH(vp);
306 fsp = VFS_TO_HSFS(vp->v_vfsp);
307 vfsp = vp->v_vfsp;
309 if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
310 hs_filldirent(vp, &hp->hs_dirent);
312 vap->va_type = IFTOVT(hp->hs_dirent.mode);
313 vap->va_mode = hp->hs_dirent.mode;
314 vap->va_uid = hp->hs_dirent.uid;
315 vap->va_gid = hp->hs_dirent.gid;
317 vap->va_fsid = vfsp->vfs_dev;
318 vap->va_nodeid = (ino64_t)hp->hs_nodeid;
319 vap->va_nlink = hp->hs_dirent.nlink;
320 vap->va_size = (offset_t)hp->hs_dirent.ext_size;
322 vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
323 vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
324 vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
325 vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
326 vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
327 vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
328 if (vp->v_type == VCHR || vp->v_type == VBLK)
329 vap->va_rdev = hp->hs_dirent.r_dev;
330 else
331 vap->va_rdev = 0;
332 vap->va_blksize = vfsp->vfs_bsize;
333 /* no. of blocks = no. of data blocks + no. of xar blocks */
334 vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
335 (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
336 vap->va_seq = hp->hs_seq;
337 return (0);
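/*
 * Worked example (annotation, hypothetical numbers): for a 10000-byte
 * file with one extended attribute record (xar_len = 1) on a volume with
 * 2048-byte logical blocks (lbn_shift = 11), the xar contributes 2048
 * bytes, so va_nblocks = howmany(10000 + 2048, DEV_BSIZE) =
 * howmany(12048, 512) = 24 512-byte blocks.
 */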
340 /*ARGSUSED*/
341 static int
342 hsfs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred,
343 caller_context_t *ct)
345 struct hsnode *hp;
347 if (vp->v_type != VLNK)
348 return (EINVAL);
350 hp = VTOH(vp);
352 if (hp->hs_dirent.sym_link == NULL)
353 return (ENOENT);
355 return (uiomove(hp->hs_dirent.sym_link,
356 (size_t)MIN(hp->hs_dirent.ext_size,
357 uiop->uio_resid), UIO_READ, uiop));
360 /*ARGSUSED*/
361 static void
362 hsfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
364 struct hsnode *hp;
365 struct hsfs *fsp;
367 int nopage;
369 hp = VTOH(vp);
370 fsp = VFS_TO_HSFS(vp->v_vfsp);
372 * Note: acquiring and holding v_lock for quite a while
373 * here serializes on the vnode; this is unfortunate, but
374 * likely not to overly impact performance, as the underlying
375 * device (CDROM drive) is quite slow.
377 rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
378 mutex_enter(&hp->hs_contents_lock);
379 mutex_enter(&vp->v_lock);
381 if (vp->v_count < 1) {
382 panic("hsfs_inactive: v_count < 1");
383 /*NOTREACHED*/
386 VN_RELE_LOCKED(vp);
387 if (vp->v_count > 0 || (hp->hs_flags & HREF) == 0) {
388 mutex_exit(&vp->v_lock);
389 mutex_exit(&hp->hs_contents_lock);
390 rw_exit(&fsp->hsfs_hash_lock);
391 return;
393 if (vp->v_count == 0) {
395 * Free the hsnode.
396 * If there are no pages associated with the
397 * hsnode, give it back to the kmem_cache,
398 * else put at the end of this file system's
399 * internal free list.
401 nopage = !vn_has_cached_data(vp);
402 hp->hs_flags = 0;
404 * exit these locks now, since hs_freenode may
405 * kmem_free the hsnode and embedded vnode
407 mutex_exit(&vp->v_lock);
408 mutex_exit(&hp->hs_contents_lock);
409 hs_freenode(vp, fsp, nopage);
410 } else {
411 mutex_exit(&vp->v_lock);
412 mutex_exit(&hp->hs_contents_lock);
414 rw_exit(&fsp->hsfs_hash_lock);
418 /*ARGSUSED*/
419 static int
420 hsfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
421 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
422 caller_context_t *ct, int *direntflags, pathname_t *realpnp)
424 int error;
425 int namelen = (int)strlen(nm);
427 if (*nm == '\0') {
428 VN_HOLD(dvp);
429 *vpp = dvp;
430 return (0);
434 * If we're looking for ourself, life is simple.
436 if (namelen == 1 && *nm == '.') {
437 if (error = hs_access(dvp, (mode_t)VEXEC, cred))
438 return (error);
439 VN_HOLD(dvp);
440 *vpp = dvp;
441 return (0);
444 return (hs_dirlook(dvp, nm, namelen, vpp, cred));
448 /*ARGSUSED*/
449 static int
450 hsfs_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
451 caller_context_t *ct, int flags)
453 struct hsnode *dhp;
454 struct hsfs *fsp;
455 struct hs_direntry hd;
456 struct dirent64 *nd;
457 int error;
458 uint_t offset; /* real offset in directory */
459 uint_t dirsiz; /* real size of directory */
460 uchar_t *blkp;
461 int hdlen; /* length of hs directory entry */
462 long ndlen; /* length of dirent entry */
463 int bytes_wanted;
464 size_t bufsize; /* size of dirent buffer */
465 char *outbuf; /* ptr to dirent buffer */
466 char *dname;
467 int dnamelen;
468 size_t dname_size;
469 struct fbuf *fbp;
470 uint_t last_offset; /* last index into current dir block */
471 ino64_t dirino; /* temporary storage before storing in dirent */
472 off_t diroff;
474 dhp = VTOH(vp);
475 fsp = VFS_TO_HSFS(vp->v_vfsp);
476 if (dhp->hs_dirent.ext_size == 0)
477 hs_filldirent(vp, &dhp->hs_dirent);
478 dirsiz = dhp->hs_dirent.ext_size;
479 if (uiop->uio_loffset >= dirsiz) { /* at or beyond EOF */
480 if (eofp)
481 *eofp = 1;
482 return (0);
484 ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
485 offset = uiop->uio_loffset;
487 dname_size = fsp->hsfs_namemax + 1; /* 1 for the ending NUL */
488 dname = kmem_alloc(dname_size, KM_SLEEP);
489 bufsize = uiop->uio_resid + sizeof (struct dirent64);
491 outbuf = kmem_alloc(bufsize, KM_SLEEP);
492 nd = (struct dirent64 *)outbuf;
494 while (offset < dirsiz) {
495 bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));
497 error = fbread(vp, (offset_t)(offset & MAXBMASK),
498 (unsigned int)bytes_wanted, S_READ, &fbp);
499 if (error)
500 goto done;
502 blkp = (uchar_t *)fbp->fb_addr;
503 last_offset = (offset & MAXBMASK) + fbp->fb_count;
505 #define rel_offset(offset) ((offset) & MAXBOFFSET) /* index into blkp */
507 while (offset < last_offset) {
509 * Very similar validation code is found in
510 * process_dirblock(), hsfs_node.c.
511 * For an explanation, see there.
512 * It may make sense for the future to
513 * "consolidate" the code in hs_parsedir(),
514 * process_dirblock() and hsfs_readdir() into
515 * a single utility function.
517 hdlen = (int)((uchar_t)
518 HDE_DIR_LEN(&blkp[rel_offset(offset)]));
519 if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
520 offset + hdlen > last_offset) {
522 * advance to next sector boundary
524 offset = roundup(offset + 1, HS_SECTOR_SIZE);
525 if (hdlen)
526 hs_log_bogus_disk_warning(fsp,
527 HSFS_ERR_TRAILING_JUNK, 0);
529 continue;
532 bzero(&hd, sizeof (hd));
535 * Just ignore invalid directory entries.
536 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
538 if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
539 &hd, dname, &dnamelen, last_offset - offset)) {
541 * Determine if there is enough room
543 ndlen = (long)DIRENT64_RECLEN((dnamelen));
545 if ((ndlen + ((char *)nd - outbuf)) >
546 uiop->uio_resid) {
547 fbrelse(fbp, S_READ);
548 goto done; /* output buffer full */
551 diroff = offset + hdlen;
553 * If the media carries rrip-v1.12 or newer,
554 * and we trust the inodes from the rrip data
555 * (use_rrip_inodes != 0), use that data. If the
556 * media has been created by a recent mkisofs
557 * version, we may trust all numbers in the
558 * starting extent number; otherwise, we cannot
559 * do this for zero sized files and symlinks,
560 * because if we did we'd end up mapping all of
561 * them to the same node. We use HS_DUMMY_INO
562 * in this case and make sure that we will not
563 * map all files to the same meta data.
565 if (hd.inode != 0 && use_rrip_inodes) {
566 dirino = hd.inode;
567 } else if ((hd.ext_size == 0 ||
568 hd.sym_link != NULL) &&
569 (fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
570 dirino = HS_DUMMY_INO;
571 } else {
572 dirino = hd.ext_lbn;
575 /* strncpy(9f) will zero uninitialized bytes */
577 ASSERT(strlen(dname) + 1 <=
578 DIRENT64_NAMELEN(ndlen));
579 (void) strncpy(nd->d_name, dname,
580 DIRENT64_NAMELEN(ndlen));
581 nd->d_reclen = (ushort_t)ndlen;
582 nd->d_off = (offset_t)diroff;
583 nd->d_ino = dirino;
584 nd = (struct dirent64 *)((char *)nd + ndlen);
587 * free up space allocated for symlink
589 if (hd.sym_link != NULL) {
590 kmem_free(hd.sym_link,
591 (size_t)(hd.ext_size+1));
592 hd.sym_link = NULL;
595 offset += hdlen;
597 fbrelse(fbp, S_READ);
601 * Got here for one of the following reasons:
602 * 1) outbuf is full (error == 0)
603 * 2) end of directory reached (error == 0)
604 * 3) error reading directory sector (error != 0)
605 * 4) directory entry crosses sector boundary (error == 0)
607 * If any directory entries have been copied, don't report
608 * case 4. Instead, return the valid directory entries.
610 * If no entries have been copied, report the error.
611 * If case 4, this will be indistinguishable from EOF.
613 done:
614 ndlen = ((char *)nd - outbuf);
615 if (ndlen != 0) {
616 error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
617 uiop->uio_loffset = offset;
619 kmem_free(dname, dname_size);
620 kmem_free(outbuf, bufsize);
621 if (eofp && error == 0)
622 *eofp = (uiop->uio_loffset >= dirsiz);
623 return (error);
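/*
 * Annotation (not from the original source): DIRENT64_RECLEN() pads the
 * fixed dirent64 header plus the NUL-terminated name up to an 8-byte
 * multiple, and the "enough room" test above compares that reclen
 * against the space remaining in outbuf.  As a hypothetical example, if
 * the header occupies 18 bytes, a 10-character name needs
 * 18 + 10 + 1 = 29 bytes, rounded up to a 32-byte record.
 */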
626 /*ARGSUSED2*/
627 static int
628 hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
630 struct hsnode *hp;
631 struct hsfid *fid;
633 if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
634 fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
635 return (ENOSPC);
638 fid = (struct hsfid *)fidp;
639 fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
640 hp = VTOH(vp);
641 mutex_enter(&hp->hs_contents_lock);
642 fid->hf_dir_lbn = hp->hs_dir_lbn;
643 fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
644 fid->hf_ino = hp->hs_nodeid;
645 mutex_exit(&hp->hs_contents_lock);
646 return (0);
649 /*ARGSUSED*/
650 static int
651 hsfs_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
653 return (0);
656 /*ARGSUSED*/
657 static int
658 hsfs_close(struct vnode *vp, int flag, int count, offset_t offset,
659 struct cred *cred, caller_context_t *ct)
661 (void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
662 cleanshares(vp, ttoproc(curthread)->p_pid);
663 return (0);
666 /*ARGSUSED2*/
667 static int
668 hsfs_access(struct vnode *vp, int mode, int flags, cred_t *cred,
669 caller_context_t *ct)
671 return (hs_access(vp, (mode_t)mode, cred));
675 * the seek time of a CD-ROM is very slow, and data transfer
676 * rate is even worse (max. 150K per sec). The design
677 * decision is to reduce access to the CD-ROM as much as possible,
678 * and to transfer a sizable block (read-ahead) of data at a time.
679 * UFS-style read-ahead of one block at a time is not appropriate,
680 * and is not supported.
684 * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
686 #define KLUSTSIZE (56 * 1024)
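/*
 * Annotation: 56 * 1024 = 57344 bytes, i.e. 14 pages of 4 KB (or 7 pages
 * of 8 KB), so the constraint stated above holds as long as MAXPHYS is
 * at least 56 KB, which is its traditional default on this platform.
 */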
687 /* we don't support read ahead */
688 int hsfs_lostpage; /* no. of times we lost original page */
691 * Used to prevent biodone() from releasing buf resources that
692 * we didn't allocate in quite the usual way.
694 /*ARGSUSED*/
696 hsfs_iodone(struct buf *bp)
698 sema_v(&bp->b_io);
699 return (0);
703 * The taskq thread that invokes the scheduling function to ensure
704 * that all readaheads are complete and cleans up the associated
705 * memory and releases the page lock.
707 void
708 hsfs_ra_task(void *arg)
710 struct hio_info *info = arg;
711 uint_t count;
712 struct buf *wbuf;
714 ASSERT(info->pp != NULL);
716 for (count = 0; count < info->bufsused; count++) {
717 wbuf = &(info->bufs[count]);
719 DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
720 while (sema_tryp(&(info->sema[count])) == 0) {
721 if (hsched_invoke_strategy(info->fsp)) {
722 sema_p(&(info->sema[count]));
723 break;
726 sema_destroy(&(info->sema[count]));
727 DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
728 biofini(&(info->bufs[count]));
730 for (count = 0; count < info->bufsused; count++) {
731 if (info->vas[count] != NULL) {
732 ppmapout(info->vas[count]);
735 kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
736 kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
737 kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));
739 pvn_read_done(info->pp, 0);
740 kmem_cache_free(hio_info_cache, info);
744 * Submit asynchronous readahead requests to the I/O scheduler
745 * depending on the number of pages to read ahead. These requests
746 * are asynchronous to the calling thread but I/O requests issued
747 * subsequently by other threads with higher LBNs must wait for
748 * these readaheads to complete since we have a single ordered
749 * I/O pipeline. Thus these readaheads are semi-asynchronous.
750 * A TaskQ handles waiting for the readaheads to complete.
752 * This function is mostly a copy of hsfs_getapage but somewhat
753 * simpler. A readahead request is aborted if page allocation
754 * fails.
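/*
 * Annotation (flow summary, not from the original source): each buf
 * issued by hsfs_getpage_ra() below is paired with a semaphore
 * (fio_done[]) that is posted once the request completes.
 * hsfs_ra_task() above waits on those semaphores, driving
 * hsched_invoke_strategy() while it waits so the queue keeps draining,
 * then unmaps the pages, frees the per-request arrays and calls
 * pvn_read_done() to unlock the pages.
 */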
756 /*ARGSUSED*/
757 static int
758 hsfs_getpage_ra(struct vnode *vp, uoff_t off, struct seg *seg,
759 caddr_t addr, struct hsnode *hp, struct hsfs *fsp, int xarsiz,
760 offset_t bof, int chunk_lbn_count, int chunk_data_bytes)
762 struct buf *bufs;
763 caddr_t *vas;
764 caddr_t va;
765 struct page *pp, *searchp, *lastp;
766 struct vnode *devvp;
767 ulong_t byte_offset;
768 size_t io_len_tmp;
769 uint_t io_off, io_len;
770 uint_t xlen;
771 uint_t filsiz;
772 uint_t secsize;
773 uint_t bufcnt;
774 uint_t bufsused;
775 uint_t count;
776 uint_t io_end;
777 uint_t which_chunk_lbn;
778 uint_t offset_lbn;
779 uint_t offset_extra;
780 offset_t offset_bytes;
781 uint_t remaining_bytes;
782 uint_t extension;
783 int remainder; /* must be signed */
784 diskaddr_t driver_block;
785 uoff_t io_off_tmp;
786 ksema_t *fio_done;
787 struct hio_info *info;
788 size_t len;
790 ASSERT(fsp->hqueue != NULL);
792 if (addr >= seg->s_base + seg->s_size) {
793 return (-1);
796 devvp = fsp->hsfs_devvp;
797 secsize = fsp->hsfs_vol.lbn_size; /* bytes per logical block */
799 /* file data size */
800 filsiz = hp->hs_dirent.ext_size;
802 if (off >= filsiz)
803 return (0);
805 extension = 0;
806 pp = NULL;
808 extension += hp->hs_ra_bytes;
811 * Some CD writers (e.g. Kodak Photo CD writers)
812 * create CDs in TAO mode and reserve tracks that
813 * are not completely written. Some sectors remain
814 * unreadable for this reason and give I/O errors.
815 * Also, there's no point in reading sectors
816 * we'll never look at. So, if we're asked to go
817 * beyond the end of a file, truncate to the length
818 * of that file.
820 * Additionally, this behaviour is required by section
821 * 6.4.5 of ISO 9660:1988(E).
823 len = MIN(extension ? extension : PAGESIZE, filsiz - off);
825 /* A little paranoia */
826 if (len <= 0)
827 return (-1);
830 * After all that, make sure we're asking for things in units
831 * that bdev_strategy() will understand (see bug 4202551).
833 len = roundup(len, DEV_BSIZE);
835 pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
836 &io_len_tmp, off, len, 1);
838 if (pp == NULL) {
839 hp->hs_num_contig = 0;
840 hp->hs_ra_bytes = 0;
841 hp->hs_prev_offset = 0;
842 return (-1);
845 io_off = (uint_t)io_off_tmp;
846 io_len = (uint_t)io_len_tmp;
848 /* check for truncation */
850 * xxx Clean up and return EIO instead?
851 * xxx Ought to go to uoff_t for everything, but we
852 * xxx call lots of things that want uint_t arguments.
854 ASSERT(io_off == io_off_tmp);
857 * get enough buffers for worst-case scenario
858 * (i.e., no coalescing possible).
860 bufcnt = (len + secsize - 1) / secsize;
861 bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
862 vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
865 * Allocate an array of semaphores since we are doing I/O
866 * scheduling.
868 fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);
871 * If our filesize is not an integer multiple of PAGESIZE,
872 * we zero that part of the last page that's between EOF and
873 * the PAGESIZE boundary.
875 xlen = io_len & PAGEOFFSET;
876 if (xlen != 0)
877 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
879 DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);
881 va = NULL;
882 lastp = NULL;
883 searchp = pp;
884 io_end = io_off + io_len;
885 for (count = 0, byte_offset = io_off;
886 byte_offset < io_end;
887 count++) {
888 ASSERT(count < bufcnt);
890 bioinit(&bufs[count]);
891 bufs[count].b_edev = devvp->v_rdev;
892 bufs[count].b_dev = cmpdev(devvp->v_rdev);
893 bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
894 bufs[count].b_iodone = hsfs_iodone;
895 bufs[count].b_vp = vp;
896 bufs[count].b_file = vp;
898 /* Compute disk address for interleaving. */
900 /* considered without skips */
901 which_chunk_lbn = byte_offset / chunk_data_bytes;
903 /* factor in skips */
904 offset_lbn = which_chunk_lbn * chunk_lbn_count;
906 /* convert to physical byte offset for lbn */
907 offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
909 /* don't forget offset into lbn */
910 offset_extra = byte_offset % chunk_data_bytes;
912 /* get virtual block number for driver */
913 driver_block = lbtodb(bof + xarsiz
914 + offset_bytes + offset_extra);
916 if (lastp != searchp) {
917 /* this branch taken first time through loop */
918 va = vas[count] = ppmapin(searchp, PROT_WRITE,
919 (caddr_t)-1);
920 /* ppmapin() guarantees not to return NULL */
921 } else {
922 vas[count] = NULL;
925 bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
926 bufs[count].b_offset =
927 (offset_t)(byte_offset - io_off + off);
930 * We specifically use the b_lblkno member here
931 * as even in the 32 bit world driver_block can
932 * get very large in line with the ISO9660 spec.
935 bufs[count].b_lblkno = driver_block;
937 remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
938 - byte_offset;
941 * remaining_bytes can't be zero, as we derived
942 * which_chunk_lbn directly from byte_offset.
944 if ((remaining_bytes + byte_offset) < (off + len)) {
945 /* coalesce-read the rest of the chunk */
946 bufs[count].b_bcount = remaining_bytes;
947 } else {
948 /* get the final bits */
949 bufs[count].b_bcount = off + len - byte_offset;
952 remainder = PAGESIZE - (byte_offset % PAGESIZE);
953 if (bufs[count].b_bcount > remainder) {
954 bufs[count].b_bcount = remainder;
957 bufs[count].b_bufsize = bufs[count].b_bcount;
958 if (((offset_t)byte_offset + bufs[count].b_bcount) >
959 HS_MAXFILEOFF) {
960 break;
962 byte_offset += bufs[count].b_bcount;
965 * We are scheduling I/O so we need to enqueue
966 * requests rather than calling bdev_strategy
967 * here. A later invocation of the scheduling
968 * function will take care of doing the actual
969 * I/O as it selects requests from the queue as
970 * per the scheduling logic.
972 struct hio *hsio = kmem_cache_alloc(hio_cache,
973 KM_SLEEP);
975 sema_init(&fio_done[count], 0, NULL,
976 SEMA_DEFAULT, NULL);
977 hsio->bp = &bufs[count];
978 hsio->sema = &fio_done[count];
979 hsio->io_lblkno = bufs[count].b_lblkno;
980 hsio->nblocks = howmany(hsio->bp->b_bcount,
981 DEV_BSIZE);
983 /* used for deadline */
984 hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());
986 /* for I/O coalescing */
987 hsio->contig_chain = NULL;
988 hsched_enqueue_io(fsp, hsio, 1);
990 lwp_stat_update(LWP_STAT_INBLK, 1);
991 lastp = searchp;
992 if ((remainder - bufs[count].b_bcount) < 1) {
993 searchp = searchp->p_next;
997 bufsused = count;
998 info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
999 info->bufs = bufs;
1000 info->vas = vas;
1001 info->sema = fio_done;
1002 info->bufsused = bufsused;
1003 info->bufcnt = bufcnt;
1004 info->fsp = fsp;
1005 info->pp = pp;
1007 (void) taskq_dispatch(fsp->hqueue->ra_task,
1008 hsfs_ra_task, info, KM_SLEEP);
1010 * The I/O locked pages are unlocked in our taskq thread.
1012 return (0);
1016 * Each file may have a different interleaving on disk. This makes
1017 * things somewhat interesting. The gist is that there are some
1018 * number of contiguous data sectors, followed by some other number
1019 * of contiguous skip sectors. The sum of those two sets of sectors
1020 * defines the interleave size. Unfortunately, it means that we generally
1021 * can't simply read N sectors starting at a given offset to satisfy
1022 * any given request.
1024 * What we do is get the relevant memory pages via pvn_read_kluster(),
1025 * then stride through the interleaves, setting up a buf for each
1026 * sector that needs to be brought in. Instead of kmem_alloc'ing
1027 * space for the sectors, though, we just point at the appropriate
1028 * spot in the relevant page for each of them. This saves us a bunch
1029 * of copying.
1031 * NOTICE: The code below in hsfs_getapage is mostly the same as the code
1032 * in hsfs_getpage_ra above (with some omissions). If you are
1033 * making any change to this function, please also look at
1034 * hsfs_getpage_ra.
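/*
 * Worked example (annotation, hypothetical layout): with 2048-byte
 * logical blocks, intlf_sz = 4 and intlf_sk = 2, each interleave chunk
 * holds chunk_data_bytes = 4 * 2048 = 8192 data bytes and spans
 * chunk_lbn_count = 6 logical blocks on disk.  For byte_offset = 10000,
 * which_chunk_lbn = 10000 / 8192 = 1, offset_lbn = 1 * 6 = 6,
 * offset_bytes = 6 * 2048 = 12288 and offset_extra = 10000 % 8192 = 1808,
 * so that sector is read from bof + xarsiz + 12288 + 1808.
 */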
1036 /*ARGSUSED*/
1037 static int
1038 hsfs_getapage(struct vnode *vp, uoff_t off, size_t len, uint_t *protp,
1039 struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
1040 enum seg_rw rw, struct cred *cred)
1042 struct hsnode *hp;
1043 struct hsfs *fsp;
1044 int err;
1045 struct buf *bufs;
1046 caddr_t *vas;
1047 caddr_t va;
1048 struct page *pp, *searchp, *lastp;
1049 page_t *pagefound;
1050 offset_t bof;
1051 struct vnode *devvp;
1052 ulong_t byte_offset;
1053 size_t io_len_tmp;
1054 uint_t io_off, io_len;
1055 uint_t xlen;
1056 uint_t filsiz;
1057 uint_t secsize;
1058 uint_t bufcnt;
1059 uint_t bufsused;
1060 uint_t count;
1061 uint_t io_end;
1062 uint_t which_chunk_lbn;
1063 uint_t offset_lbn;
1064 uint_t offset_extra;
1065 offset_t offset_bytes;
1066 uint_t remaining_bytes;
1067 uint_t extension;
1068 int remainder; /* must be signed */
1069 int chunk_lbn_count;
1070 int chunk_data_bytes;
1071 int xarsiz;
1072 diskaddr_t driver_block;
1073 uoff_t io_off_tmp;
1074 ksema_t *fio_done;
1075 int calcdone;
1078 * We don't support asynchronous operation at the moment, so
1079 * just pretend we did it. If the pages are ever actually
1080 * needed, they'll get brought in then.
1082 if (pl == NULL)
1083 return (0);
1085 hp = VTOH(vp);
1086 fsp = VFS_TO_HSFS(vp->v_vfsp);
1087 devvp = fsp->hsfs_devvp;
1088 secsize = fsp->hsfs_vol.lbn_size; /* bytes per logical block */
1090 /* file data size */
1091 filsiz = hp->hs_dirent.ext_size;
1093 /* disk addr for start of file */
1094 bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);
1096 /* xarsiz byte must be skipped for data */
1097 xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;
1099 /* how many logical blocks in an interleave (data+skip) */
1100 chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;
1102 if (chunk_lbn_count == 0) {
1103 chunk_lbn_count = 1;
1107 * Convert interleaving size into bytes. The zero case
1108 * (no interleaving) optimization is handled as a side-
1109 * effect of the read-ahead logic.
1111 if (hp->hs_dirent.intlf_sz == 0) {
1112 chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
1114 * Optimization: If our pagesize is a multiple of LBN
1115 * bytes, we can avoid breaking up a page into individual
1116 * lbn-sized requests.
1118 if (PAGESIZE % chunk_data_bytes == 0) {
1119 chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
1120 chunk_data_bytes = PAGESIZE;
1122 } else {
1123 chunk_data_bytes =
1124 LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
1127 reread:
1128 err = 0;
1129 pagefound = 0;
1130 calcdone = 0;
1133 * Do some read-ahead. This mostly saves us a bit of
1134 * system cpu time more than anything else when doing
1135 * sequential reads. At some point, could do the
1136 * read-ahead asynchronously which might gain us something
1137 * on wall time, but it seems unlikely....
1139 * We do the easy case here, which is to read through
1140 * the end of the chunk, minus whatever's at the end that
1141 * won't exactly fill a page.
1143 if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
1144 which_chunk_lbn = (off + len) / chunk_data_bytes;
1145 extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
1146 extension -= (extension % PAGESIZE);
1147 } else {
1148 extension = roundup(len, PAGESIZE);
1151 atomic_inc_64(&fsp->total_pages_requested);
1153 pp = NULL;
1154 again:
1155 /* search for page in buffer */
1156 if ((pagefound = page_exists(&vp->v_object, off)) == 0) {
1158 * Need to really do disk IO to get the page.
1160 if (!calcdone) {
1161 extension += hp->hs_ra_bytes;
1163 len = (extension != 0) ? extension : PAGESIZE;
1166 * Some cd writers don't write sectors that aren't
1167 * used. Also, there's no point in reading sectors
1168 * we'll never look at. So, if we're asked to go
1169 * beyond the end of a file, truncate to the length
1170 * of that file.
1172 * Additionally, this behaviour is required by section
1173 * 6.4.5 of ISO 9660:1988(E).
1175 if (off < filsiz && off + len > filsiz)
1176 len = filsiz - off;
1179 * After all that, make sure we're asking for things
1180 * in units that bdev_strategy() will understand.
1182 len = roundup(len, DEV_BSIZE);
1183 calcdone = 1;
1186 pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
1187 &io_len_tmp, off, len, 0);
1189 if (pp == NULL) {
1191 * Pressure on memory, roll back readahead
1193 hp->hs_num_contig = 0;
1194 hp->hs_ra_bytes = 0;
1195 hp->hs_prev_offset = 0;
1196 goto again;
1199 io_off = (uint_t)io_off_tmp;
1200 io_len = (uint_t)io_len_tmp;
1202 /* check for truncation */
1204 * xxx Clean up and return EIO instead?
1205 * xxx Ought to go to uoff_t for everything, but we
1206 * xxx call lots of things that want uint_t arguments.
1208 ASSERT(io_off == io_off_tmp);
1211 * get enough buffers for worst-case scenario
1212 * (i.e., no coalescing possible).
1214 bufcnt = (len + secsize - 1) / secsize;
1215 bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
1216 vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
1219 * Allocate an array of semaphores if we are doing I/O
1220 * scheduling.
1222 if (fsp->hqueue != NULL)
1223 fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
1224 KM_SLEEP);
1225 for (count = 0; count < bufcnt; count++) {
1226 bioinit(&bufs[count]);
1227 bufs[count].b_edev = devvp->v_rdev;
1228 bufs[count].b_dev = cmpdev(devvp->v_rdev);
1229 bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
1230 bufs[count].b_iodone = hsfs_iodone;
1231 bufs[count].b_vp = vp;
1232 bufs[count].b_file = vp;
1236 * If our filesize is not an integer multiple of PAGESIZE,
1237 * we zero that part of the last page that's between EOF and
1238 * the PAGESIZE boundary.
1240 xlen = io_len & PAGEOFFSET;
1241 if (xlen != 0)
1242 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
1244 va = NULL;
1245 lastp = NULL;
1246 searchp = pp;
1247 io_end = io_off + io_len;
1248 for (count = 0, byte_offset = io_off;
1249 byte_offset < io_end; count++) {
1250 ASSERT(count < bufcnt);
1252 /* Compute disk address for interleaving. */
1254 /* considered without skips */
1255 which_chunk_lbn = byte_offset / chunk_data_bytes;
1257 /* factor in skips */
1258 offset_lbn = which_chunk_lbn * chunk_lbn_count;
1260 /* convert to physical byte offset for lbn */
1261 offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
1263 /* don't forget offset into lbn */
1264 offset_extra = byte_offset % chunk_data_bytes;
1266 /* get virtual block number for driver */
1267 driver_block =
1268 lbtodb(bof + xarsiz + offset_bytes + offset_extra);
1270 if (lastp != searchp) {
1271 /* this branch taken first time through loop */
1272 va = vas[count] =
1273 ppmapin(searchp, PROT_WRITE, (caddr_t)-1);
1274 /* ppmapin() guarantees not to return NULL */
1275 } else {
1276 vas[count] = NULL;
1279 bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
1280 bufs[count].b_offset =
1281 (offset_t)(byte_offset - io_off + off);
1284 * We specifically use the b_lblkno member here
1285 * as even in the 32 bit world driver_block can
1286 * get very large in line with the ISO9660 spec.
1289 bufs[count].b_lblkno = driver_block;
1291 remaining_bytes =
1292 ((which_chunk_lbn + 1) * chunk_data_bytes)
1293 - byte_offset;
1296 * remaining_bytes can't be zero, as we derived
1297 * which_chunk_lbn directly from byte_offset.
1299 if ((remaining_bytes + byte_offset) < (off + len)) {
1300 /* coalesce-read the rest of the chunk */
1301 bufs[count].b_bcount = remaining_bytes;
1302 } else {
1303 /* get the final bits */
1304 bufs[count].b_bcount = off + len - byte_offset;
1308 * It would be nice to do multiple pages'
1309 * worth at once here when the opportunity
1310 * arises, as that has been shown to improve
1311 * our wall time. However, to do that
1312 * requires that we use the pageio subsystem,
1313 * which doesn't mix well with what we're
1314 * already using here. We can't use pageio
1315 * all the time, because that subsystem
1316 * assumes that a page is stored in N
1317 * contiguous blocks on the device.
1318 * Interleaving violates that assumption.
1320 * Update: This is now not so big a problem
1321 * because of the I/O scheduler sitting below
1322 * that can re-order and coalesce I/O requests.
1325 remainder = PAGESIZE - (byte_offset % PAGESIZE);
1326 if (bufs[count].b_bcount > remainder) {
1327 bufs[count].b_bcount = remainder;
1330 bufs[count].b_bufsize = bufs[count].b_bcount;
1331 if (((offset_t)byte_offset + bufs[count].b_bcount) >
1332 HS_MAXFILEOFF) {
1333 break;
1335 byte_offset += bufs[count].b_bcount;
1337 if (fsp->hqueue == NULL) {
1338 (void) bdev_strategy(&bufs[count]);
1340 } else {
1342 * We are scheduling I/O so we need to enqueue
1343 * requests rather than calling bdev_strategy
1344 * here. A later invocation of the scheduling
1345 * function will take care of doing the actual
1346 * I/O as it selects requests from the queue as
1347 * per the scheduling logic.
1349 struct hio *hsio = kmem_cache_alloc(hio_cache,
1350 KM_SLEEP);
1352 sema_init(&fio_done[count], 0, NULL,
1353 SEMA_DEFAULT, NULL);
1354 hsio->bp = &bufs[count];
1355 hsio->sema = &fio_done[count];
1356 hsio->io_lblkno = bufs[count].b_lblkno;
1357 hsio->nblocks = howmany(hsio->bp->b_bcount,
1358 DEV_BSIZE);
1360 /* used for deadline */
1361 hsio->io_timestamp =
1362 drv_hztousec(ddi_get_lbolt());
1364 /* for I/O coalescing */
1365 hsio->contig_chain = NULL;
1366 hsched_enqueue_io(fsp, hsio, 0);
1369 lwp_stat_update(LWP_STAT_INBLK, 1);
1370 lastp = searchp;
1371 if ((remainder - bufs[count].b_bcount) < 1) {
1372 searchp = searchp->p_next;
1376 bufsused = count;
1377 /* Now wait for everything to come in */
1378 if (fsp->hqueue == NULL) {
1379 for (count = 0; count < bufsused; count++) {
1380 if (err == 0) {
1381 err = biowait(&bufs[count]);
1382 } else
1383 (void) biowait(&bufs[count]);
1385 } else {
1386 for (count = 0; count < bufsused; count++) {
1387 struct buf *wbuf;
1390 * Invoke scheduling function till our buf
1391 * is processed. In doing this it might
1392 * process bufs enqueued by other threads
1393 * which is good.
1395 wbuf = &bufs[count];
1396 DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
1397 while (sema_tryp(&fio_done[count]) == 0) {
1399 * hsched_invoke_strategy will return 1
1400 * if the I/O queue is empty. This means
1401 * that there is another thread who has
1402 * issued our buf and is waiting. So we
1403 * just block instead of spinning.
1405 if (hsched_invoke_strategy(fsp)) {
1406 sema_p(&fio_done[count]);
1407 break;
1410 sema_destroy(&fio_done[count]);
1411 DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
1413 if (err == 0) {
1414 err = geterror(wbuf);
1417 kmem_free(fio_done, bufcnt * sizeof (ksema_t));
1420 /* Don't leak resources */
1421 for (count = 0; count < bufcnt; count++) {
1422 biofini(&bufs[count]);
1423 if (count < bufsused && vas[count] != NULL) {
1424 ppmapout(vas[count]);
1428 kmem_free(vas, bufcnt * sizeof (caddr_t));
1429 kmem_free(bufs, bufcnt * sizeof (struct buf));
1432 if (err) {
1433 pvn_read_done(pp, B_ERROR);
1434 return (err);
1438 * Lock the requested page, and the one after it if possible.
1439 * Don't bother if our caller hasn't given us a place to stash
1440 * the page pointers, since otherwise we'd lock pages that would
1441 * never get unlocked.
1443 if (pagefound) {
1444 int index;
1445 ulong_t soff;
1448 * Make sure it's in memory before we say it's here.
1450 if ((pp = page_lookup(&vp->v_object, off, SE_SHARED)) == NULL) {
1451 hsfs_lostpage++;
1452 goto reread;
1455 pl[0] = pp;
1456 index = 1;
1457 atomic_inc_64(&fsp->cache_read_pages);
1460 * Try to lock the next page, if it exists, without
1461 * blocking.
1463 plsz -= PAGESIZE;
1464 /* LINTED (plsz is unsigned) */
1465 for (soff = off + PAGESIZE; plsz > 0;
1466 soff += PAGESIZE, plsz -= PAGESIZE) {
1467 pp = page_lookup_nowait(&vp->v_object, (uoff_t)soff,
1468 SE_SHARED);
1469 if (pp == NULL)
1470 break;
1471 pl[index++] = pp;
1473 pl[index] = NULL;
1476 * Schedule a semi-asynchronous readahead if we are
1477 * accessing the last cached page for the current
1478 * file.
1480 * Doing this here means that readaheads will be
1481 * issued only if cache-hits occur. This is an advantage
1482 * since cache-hits would mean that readahead is giving
1483 * the desired benefit. If cache-hits do not occur there
1484 * is no point in reading ahead of time - the system
1485 * is loaded anyway.
1487 if (fsp->hqueue != NULL &&
1488 hp->hs_prev_offset - off == PAGESIZE &&
1489 hp->hs_prev_offset < filsiz &&
1490 hp->hs_ra_bytes > 0 &&
1491 !page_exists(&vp->v_object, hp->hs_prev_offset)) {
1492 (void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
1493 addr + PAGESIZE, hp, fsp, xarsiz, bof,
1494 chunk_lbn_count, chunk_data_bytes);
1497 return (0);
1500 if (pp != NULL) {
1501 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
1504 return (err);
1507 /*ARGSUSED*/
1508 static int
1509 hsfs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
1510 struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
1511 enum seg_rw rw, struct cred *cred, caller_context_t *ct)
1513 uint_t filsiz;
1514 struct hsfs *fsp;
1515 struct hsnode *hp;
1517 fsp = VFS_TO_HSFS(vp->v_vfsp);
1518 hp = VTOH(vp);
1520 /* does not support write */
1521 if (rw == S_WRITE) {
1522 return (EROFS);
1525 if (vp->v_flag & VNOMAP) {
1526 return (ENOSYS);
1529 ASSERT(off <= HS_MAXFILEOFF);
1532 * Determine file data size for EOF check.
1534 filsiz = hp->hs_dirent.ext_size;
1535 if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
1536 return (EFAULT); /* beyond EOF */
1539 * Async Read-ahead computation.
1540 * This attempts to detect sequential access pattern and
1541 * enables reading extra pages ahead of time.
1543 if (fsp->hqueue != NULL) {
1545 * This check for sequential access also takes into
1546 * account segmap weirdness when reading in chunks
1547 * less than the segmap size of 8K.
1549 if (hp->hs_prev_offset == off || (off <
1550 hp->hs_prev_offset && off + MAX(len, PAGESIZE)
1551 >= hp->hs_prev_offset)) {
1552 if (hp->hs_num_contig <
1553 (seq_contig_requests - 1)) {
1554 hp->hs_num_contig++;
1556 } else {
1558 * We increase the readahead quantum till
1559 * a predefined max (max_ra_bytes), which
1560 * is a multiple of PAGESIZE.
1562 if (hp->hs_ra_bytes <
1563 fsp->hqueue->max_ra_bytes) {
1564 hp->hs_ra_bytes += PAGESIZE;
1567 } else {
1569 * Not contiguous so reduce read ahead counters.
1571 if (hp->hs_ra_bytes > 0)
1572 hp->hs_ra_bytes -= PAGESIZE;
1574 if (hp->hs_ra_bytes <= 0) {
1575 hp->hs_ra_bytes = 0;
1576 if (hp->hs_num_contig > 0)
1577 hp->hs_num_contig--;
1581 * Length must be rounded up to a page boundary,
1582 * since we read in units of pages.
1584 hp->hs_prev_offset = off + roundup(len, PAGESIZE);
1585 DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp);
1587 if (protp != NULL)
1588 *protp = PROT_ALL;
1590 return (pvn_getpages(hsfs_getapage, vp, off, len, protp, pl, plsz,
1591 seg, addr, rw, cred));
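/*
 * Worked example (annotation, hypothetical numbers): with
 * seq_contig_requests = 2, the first contiguous request only bumps
 * hs_num_contig; from the second contiguous request onwards hs_ra_bytes
 * grows by PAGESIZE per request up to hqueue->max_ra_bytes
 * (PAGESIZE * 8, i.e. 32 KB with 4 KB pages), e.g. 0 -> 4K -> 8K ->
 * ... -> 32K.  A non-contiguous request shrinks hs_ra_bytes by PAGESIZE
 * and, once it reaches zero, decays hs_num_contig as well.
 */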
1597 * This function should never be called. We need to have it to pass
1598 * it as an argument to other functions.
1600 /*ARGSUSED*/
1602 hsfs_putapage(vnode_t *vp, page_t *pp, uoff_t *offp, size_t *lenp,
1603 int flags, cred_t *cr)
1605 /* should never happen - just destroy it */
1606 cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page");
1607 pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags);
1608 return (0);
1613 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
1614 * B_INVAL is set by:
1616 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
1617 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
1618 * which translates to an MC_SYNC with the MS_INVALIDATE flag.
1620 * The B_FREE (as well as the B_DONTNEED) flag is set when the
1621 * MADV_SEQUENTIAL advice has been used. fop_putpage is invoked
1622 * from SEGVN to release pages behind a pagefault.
1624 /*ARGSUSED*/
1625 static int
1626 hsfs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
1627 struct cred *cr, caller_context_t *ct)
1629 int error = 0;
1631 if (vp->v_count == 0) {
1632 panic("hsfs_putpage: bad v_count");
1633 /*NOTREACHED*/
1636 if (vp->v_flag & VNOMAP)
1637 return (ENOSYS);
1639 ASSERT(off <= HS_MAXFILEOFF);
1641 if (!vn_has_cached_data(vp)) /* no pages mapped */
1642 return (0);
1644 if (len == 0) { /* from 'off' to EOF */
1645 error = pvn_vplist_dirty(vp, off, hsfs_putapage, flags, cr);
1646 } else {
1647 offset_t end_off = off + len;
1648 offset_t file_size = VTOH(vp)->hs_dirent.ext_size;
1649 offset_t io_off;
1651 file_size = (file_size + PAGESIZE - 1) & PAGEMASK;
1652 if (end_off > file_size)
1653 end_off = file_size;
1655 for (io_off = off; io_off < end_off; io_off += PAGESIZE) {
1656 page_t *pp;
1659 * We insist on getting the page only if we are
1660 * about to invalidate, free or write it and
1661 * the B_ASYNC flag is not set.
1663 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
1664 pp = page_lookup(&vp->v_object, io_off,
1665 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
1666 } else {
1667 pp = page_lookup_nowait(&vp->v_object,
1668 io_off,
1669 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
1672 if (pp == NULL)
1673 continue;
1676 * Normally pvn_getdirty() should return 0, which
1677 * implies that it has done the job for us.
1678 * The shouldn't-happen scenario is when it returns 1.
1679 * This means that the page has been modified and
1680 * needs to be put back.
1681 * Since we can't write on a CD, we fake a failed
1682 * I/O and force pvn_write_done() to destroy the page.
1684 if (pvn_getdirty(pp, flags) == 1) {
1685 cmn_err(CE_NOTE,
1686 "hsfs_putpage: dirty HSFS page");
1687 pvn_write_done(pp, flags |
1688 B_ERROR | B_WRITE | B_INVAL | B_FORCE);
1692 return (error);
1696 /*ARGSUSED*/
1697 static int
1698 hsfs_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
1699 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cred,
1700 caller_context_t *ct)
1702 struct segvn_crargs vn_a;
1703 int error;
1705 /* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
1707 if (vp->v_flag & VNOMAP)
1708 return (ENOSYS);
1710 if ((prot & PROT_WRITE) && (flags & MAP_SHARED))
1711 return (ENOSYS);
1713 if (off > HS_MAXFILEOFF || off < 0 ||
1714 (off + len) < 0 || (off + len) > HS_MAXFILEOFF)
1715 return (ENXIO);
1717 if (vp->v_type != VREG) {
1718 return (ENODEV);
1722 * If file is being locked, disallow mapping.
1724 if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode))
1725 return (EAGAIN);
1727 as_rangelock(as);
1728 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
1729 if (error != 0) {
1730 as_rangeunlock(as);
1731 return (error);
1734 vn_a.vp = vp;
1735 vn_a.offset = off;
1736 vn_a.type = flags & MAP_TYPE;
1737 vn_a.prot = prot;
1738 vn_a.maxprot = maxprot;
1739 vn_a.flags = flags & ~MAP_TYPE;
1740 vn_a.cred = cred;
1741 vn_a.amp = NULL;
1742 vn_a.szc = 0;
1743 vn_a.lgrp_mem_policy_flags = 0;
1745 error = as_map(as, *addrp, len, segvn_create, &vn_a);
1746 as_rangeunlock(as);
1747 return (error);
1750 /* ARGSUSED */
1751 static int
1752 hsfs_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
1753 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
1754 caller_context_t *ct)
1756 struct hsnode *hp;
1758 if (vp->v_flag & VNOMAP)
1759 return (ENOSYS);
1761 hp = VTOH(vp);
1762 mutex_enter(&hp->hs_contents_lock);
1763 hp->hs_mapcnt += btopr(len);
1764 mutex_exit(&hp->hs_contents_lock);
1765 return (0);
1768 /*ARGSUSED*/
1769 static int
1770 hsfs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
1771 size_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr,
1772 caller_context_t *ct)
1774 struct hsnode *hp;
1776 if (vp->v_flag & VNOMAP)
1777 return (ENOSYS);
1779 hp = VTOH(vp);
1780 mutex_enter(&hp->hs_contents_lock);
1781 hp->hs_mapcnt -= btopr(len); /* Count released mappings */
1782 ASSERT(hp->hs_mapcnt >= 0);
1783 mutex_exit(&hp->hs_contents_lock);
1784 return (0);
1787 /* ARGSUSED */
1788 static int
1789 hsfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1790 caller_context_t *ct)
1792 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1795 /* ARGSUSED */
1796 static int
1797 hsfs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
1798 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
1799 caller_context_t *ct)
1801 struct hsnode *hp = VTOH(vp);
1804 * If the file is being mapped, disallow fs_frlock.
1805 * We are not holding the hs_contents_lock while checking
1806 * hs_mapcnt because the current locking strategy drops all
1807 * locks before calling fs_frlock.
1808 * So, hs_mapcnt could change before we enter fs_frlock making
1809 * it meaningless to have held hs_contents_lock in the first place.
1811 if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode))
1812 return (EAGAIN);
1814 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1817 static int
1818 hsched_deadline_compare(const void *x1, const void *x2)
1820 const struct hio *h1 = x1;
1821 const struct hio *h2 = x2;
1823 if (h1->io_timestamp < h2->io_timestamp)
1824 return (-1);
1825 if (h1->io_timestamp > h2->io_timestamp)
1826 return (1);
1828 if (h1->io_lblkno < h2->io_lblkno)
1829 return (-1);
1830 if (h1->io_lblkno > h2->io_lblkno)
1831 return (1);
1833 if (h1 < h2)
1834 return (-1);
1835 if (h1 > h2)
1836 return (1);
1838 return (0);
1841 static int
1842 hsched_offset_compare(const void *x1, const void *x2)
1844 const struct hio *h1 = x1;
1845 const struct hio *h2 = x2;
1847 if (h1->io_lblkno < h2->io_lblkno)
1848 return (-1);
1849 if (h1->io_lblkno > h2->io_lblkno)
1850 return (1);
1852 if (h1 < h2)
1853 return (-1);
1854 if (h1 > h2)
1855 return (1);
1857 return (0);
1860 void
1861 hsched_init_caches(void)
1863 hio_cache = kmem_cache_create("hsfs_hio_cache",
1864 sizeof (struct hio), 0, NULL,
1865 NULL, NULL, NULL, NULL, 0);
1867 hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
1868 sizeof (struct hio_info), 0, NULL,
1869 NULL, NULL, NULL, NULL, 0);
1872 void
1873 hsched_fini_caches(void)
1875 kmem_cache_destroy(hio_cache);
1876 kmem_cache_destroy(hio_info_cache);
1880 * Initialize I/O scheduling structures. This is called via hsfs_mount
1882 void
1883 hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
1885 struct hsfs_queue *hqueue = fsp->hqueue;
1886 struct vnode *vp = fsp->hsfs_devvp;
1888 /* TaskQ name of the form: hsched_task_ + stringof(int) */
1889 char namebuf[23];
1890 int error, err;
1891 struct dk_cinfo info;
1892 ldi_handle_t lh;
1893 ldi_ident_t li;
1896 * Default maxtransfer = 16k chunk
1898 hqueue->dev_maxtransfer = 16384;
1901 * Try to fetch the maximum device transfer size. This is used to
1902 * ensure that a coalesced block does not exceed the maxtransfer.
1904 err = ldi_ident_from_mod(modlinkage, &li);
1905 if (err) {
1906 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
1907 cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n",
1908 err);
1909 goto set_ra;
1912 err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li);
1913 ldi_ident_release(li);
1914 if (err) {
1915 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
1916 cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err);
1917 goto set_ra;
1920 error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL,
1921 CRED(), &err);
1922 err = ldi_close(lh, FREAD, CRED());
1923 if (err) {
1924 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
1925 cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err);
1928 if (error == 0) {
1929 hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer);
1932 set_ra:
1934 * Max size of data to read ahead for sequential access pattern.
1935 * Be conservative to avoid letting the underlying CD drive spin
1936 * down if the application is reading slowly.
1937 * We read ahead up to a max of 8 pages (PAGESIZE * 8).
1939 hqueue->max_ra_bytes = PAGESIZE * 8;
1941 mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
1942 mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
1943 avl_create(&(hqueue->read_tree), hsched_offset_compare,
1944 sizeof (struct hio), offsetof(struct hio, io_offset_node));
1945 avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
1946 sizeof (struct hio), offsetof(struct hio, io_deadline_node));
1948 (void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
1949 hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
1950 minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);
1952 hqueue->next = NULL;
1953 hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1956 void
1957 hsched_fini(struct hsfs_queue *hqueue)
1959 if (hqueue != NULL) {
1961 * Remove the sentinel if there was one.
1963 if (hqueue->next != NULL) {
1964 avl_remove(&hqueue->read_tree, hqueue->next);
1965 kmem_cache_free(hio_cache, hqueue->next);
1967 avl_destroy(&(hqueue->read_tree));
1968 avl_destroy(&(hqueue->deadline_tree));
1969 mutex_destroy(&(hqueue->hsfs_queue_lock));
1970 mutex_destroy(&(hqueue->strategy_lock));
1973 * If there are any existing readahead threads running
1974 * taskq_destroy will wait for them to finish.
1976 taskq_destroy(hqueue->ra_task);
1977 kmem_free(hqueue->nbuf, sizeof (struct buf));
1982 * Determine if two I/O requests are adjacent to each other so
1983 * that they can be coalesced.
1985 #define IS_ADJACENT(io, nio) \
1986 (((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
1987 (io)->bp->b_edev == (nio)->bp->b_edev)
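/*
 * Annotation (hypothetical values): a request with io_lblkno = 100 and
 * nblocks = 4 is adjacent to one starting at io_lblkno = 104 on the same
 * b_edev, so the two can be chained together by hsched_invoke_strategy()
 * below.
 */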
1990 * This performs the actual I/O scheduling logic. We use the Circular
1991 * Look algorithm here. Sort the I/O requests in ascending order of
1992 * logical block number and process them starting with the lowest
1993 * numbered block and progressing towards higher block numbers in the
1994 * queue. Once there are no more higher numbered blocks, start again
1995 * with the lowest one. This is good for CD/DVD as you keep moving
1996 * the head in one direction along the outward spiral track and avoid
1997 * seeks as much as possible. The re-ordering also allows
1998 * us to coalesce adjacent requests into one larger request.
1999 * This is thus essentially a 1-way Elevator with front merging.
2001 * In addition each read request here has a deadline and will be
2002 * processed out of turn if the deadline (500ms) expires.
2004 * This function is necessarily serialized via hqueue->strategy_lock.
2005 * This function sits just below hsfs_getapage and processes all read
2006 * requests originating from that function.
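/*
 * Worked example (annotation, hypothetical queue): if the pending
 * requests start at LBNs 40, 90 and 10 and the sentinel left behind by
 * the previous invocation sits at LBN 35, the next passes service 40,
 * then 90, and only then wrap around to 10 -- unless 10 has been waiting
 * longer than HSFS_READ_DEADLINE, in which case it is picked immediately
 * from the deadline tree.
 */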
2009 hsched_invoke_strategy(struct hsfs *fsp)
2011 struct hsfs_queue *hqueue;
2012 struct buf *nbuf;
2013 struct hio *fio, *nio, *tio, *prev, *last;
2014 size_t bsize, soffset, offset, data;
2015 int bioret, bufcount;
2016 struct vnode *fvp;
2017 ksema_t *io_done;
2018 caddr_t iodata;
2020 hqueue = fsp->hqueue;
2021 mutex_enter(&hqueue->strategy_lock);
2022 mutex_enter(&hqueue->hsfs_queue_lock);
2025 * Check for Deadline expiration first
2027 fio = avl_first(&hqueue->deadline_tree);
2030 * Paranoid check for empty I/O queue. Both deadline
2031 * and read trees contain the same data sorted in different
2032 * ways. So empty deadline tree = empty read tree.
2034 if (fio == NULL) {
2036 * Remove the sentinel if there was one.
2038 if (hqueue->next != NULL) {
2039 avl_remove(&hqueue->read_tree, hqueue->next);
2040 kmem_cache_free(hio_cache, hqueue->next);
2041 hqueue->next = NULL;
2043 mutex_exit(&hqueue->hsfs_queue_lock);
2044 mutex_exit(&hqueue->strategy_lock);
2045 return (1);
2048 if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp
2049 < HSFS_READ_DEADLINE) {
2051 * Apply standard scheduling logic. This uses the
2052 * C-LOOK approach. Process I/O requests in ascending
2053 * order of logical block address till no subsequent
2054 * higher numbered block request remains. Then start
2055 * again from the lowest numbered block in the queue.
2057 * We do this cheaply here by means of a sentinel.
2058 * The last processed I/O structure from the previous
2059 * invocation of this function is left dangling in the
2060 * read_tree so that we can easily scan to the next
2061 * higher numbered request and then remove the sentinel.
2063 fio = NULL;
2064 if (hqueue->next != NULL) {
2065 fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
2066 avl_remove(&hqueue->read_tree, hqueue->next);
2067 kmem_cache_free(hio_cache, hqueue->next);
2068 hqueue->next = NULL;
2070 if (fio == NULL) {
2071 fio = avl_first(&hqueue->read_tree);
2073 } else if (hqueue->next != NULL) {
2074 DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio);
2076 avl_remove(&hqueue->read_tree, hqueue->next);
2077 kmem_cache_free(hio_cache, hqueue->next);
2078 hqueue->next = NULL;
2082 * In addition, we try to coalesce contiguous
2083 * requests into one bigger request.
2085 bufcount = 1;
2086 bsize = ldbtob(fio->nblocks);
2087 fvp = fio->bp->b_file;
2088 nio = AVL_NEXT(&hqueue->read_tree, fio);
2089 tio = fio;
2090 while (nio != NULL && IS_ADJACENT(tio, nio) &&
2091 bsize < hqueue->dev_maxtransfer) {
2092 avl_remove(&hqueue->deadline_tree, tio);
2093 avl_remove(&hqueue->read_tree, tio);
2094 tio->contig_chain = nio;
2095 bsize += ldbtob(nio->nblocks);
2096 prev = tio;
2097 tio = nio;
2100 * This check is required to detect the case where
2101 * we are merging adjacent buffers belonging to
2102 * different files. fvp is used to set the b_file
2103 * parameter in the coalesced buf. b_file is used
2104 * by DTrace, so we do not want DTrace to attribute
2105 * requests belonging to two different files to any one file.
2107 if (fvp && tio->bp->b_file != fvp) {
2108 fvp = NULL;
2111 nio = AVL_NEXT(&hqueue->read_tree, nio);
2112 bufcount++;
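	/*
	 * For example, three adjacent 8-block requests at lblkno
	 * 100, 108 and 116 are chained into one 24-block transfer
	 * (ldbtob(24) == 12288 bytes), as long as the running
	 * bsize stays below hqueue->dev_maxtransfer.
	 */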
2116 * tio is not removed from the read_tree as it serves as a sentinel
2117 * to cheaply allow us to scan to the next higher numbered I/O
2118 * request.
2120 hqueue->next = tio;
2121 avl_remove(&hqueue->deadline_tree, tio);
2122 mutex_exit(&hqueue->hsfs_queue_lock);
2123 DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount,
2124 size_t, bsize);
2127 * The benefit of coalescing occurs if the savings in I/O outweigh
2128 * the cost of doing the additional work below.
2129 * It was observed that coalescing just 2 buffers yields diminishing
2130 * returns, so we only coalesce when we have more than 2 adjacent bufs.
2132 if (bufcount > hsched_coalesce_min) {
2134 * We have coalesced blocks. First allocate mem and a buf for
2135 * the entire coalesced chunk.
2136 * Since we are guaranteed to be single-threaded here, we
2137 * pre-allocate one buf at mount time and re-use it every time.
2138 * This is a synthesized buf structure that uses a kmem_alloc'ed
2139 * data chunk, not quite a normal buf attached to pages.
2141 fsp->coalesced_bytes += bsize;
2142 nbuf = hqueue->nbuf;
2143 bioinit(nbuf);
2144 nbuf->b_edev = fio->bp->b_edev;
2145 nbuf->b_dev = fio->bp->b_dev;
2146 nbuf->b_flags = fio->bp->b_flags;
2147 nbuf->b_iodone = fio->bp->b_iodone;
2148 iodata = kmem_alloc(bsize, KM_SLEEP);
2149 nbuf->b_un.b_addr = iodata;
2150 nbuf->b_lblkno = fio->bp->b_lblkno;
2151 nbuf->b_vp = fvp;
2152 nbuf->b_file = fvp;
2153 nbuf->b_bcount = bsize;
2154 nbuf->b_bufsize = bsize;
2156 DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int,
2157 bufcount, size_t, bsize);
2160 * Perform I/O for the coalesced block.
2162 (void) bdev_strategy(nbuf);
2165 * Duplicate the last IO node to leave the sentinel alone.
2166 * The sentinel is freed in the next invocation of this
2167 * function.
2169 prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP);
2170 prev->contig_chain->bp = tio->bp;
2171 prev->contig_chain->sema = tio->sema;
2172 tio = prev->contig_chain;
2173 tio->contig_chain = NULL;
2174 soffset = ldbtob(fio->bp->b_lblkno);
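	/*
	 * ldbtob() converts 512-byte disk blocks to bytes, so soffset is
	 * the byte offset of the first coalesced buf (e.g. b_lblkno 1000
	 * gives soffset 512000).  Each member buf's data is later found
	 * at ldbtob(b_lblkno) - soffset within iodata.
	 */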
2175 nio = fio;
2177 bioret = biowait(nbuf);
2178 data = bsize - nbuf->b_resid;
2179 biofini(nbuf);
2180 mutex_exit(&hqueue->strategy_lock);
2183 * We use the b_resid parameter to detect how much
2184 * data was successfully transferred. We signal
2185 * success to all the original bufs that were fully
2186 * retrieved before coalescing; the rest, if any,
2187 * are signaled as errors.
2189 tio = nio;
2190 DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio,
2191 int, bioret, size_t, data);
2194 * Copy data and signal success to all the bufs
2195 * which can be fully satisfied from b_resid.
2197 while (nio != NULL && data >= nio->bp->b_bcount) {
2198 offset = ldbtob(nio->bp->b_lblkno) - soffset;
2199 bcopy(iodata + offset, nio->bp->b_un.b_addr,
2200 nio->bp->b_bcount);
2201 data -= nio->bp->b_bcount;
2202 bioerror(nio->bp, 0);
2203 biodone(nio->bp);
2204 sema_v(nio->sema);
2205 tio = nio;
2206 nio = nio->contig_chain;
2207 kmem_cache_free(hio_cache, tio);
2211 * Signal error to all the leftover bufs (if any)
2212 * after b_resid data is exhausted.
2214 while (nio != NULL) {
2215 nio->bp->b_resid = nio->bp->b_bcount - data;
2216 bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid);
2217 bioerror(nio->bp, bioret);
2218 biodone(nio->bp);
2219 sema_v(nio->sema);
2220 tio = nio;
2221 nio = nio->contig_chain;
2222 kmem_cache_free(hio_cache, tio);
2223 data = 0;
2225 kmem_free(iodata, bsize);
2226 } else {
2228 nbuf = tio->bp;
2229 io_done = tio->sema;
2230 nio = fio;
2231 last = tio;
2233 while (nio != NULL) {
2234 (void) bdev_strategy(nio->bp);
2235 nio = nio->contig_chain;
2237 nio = fio;
2238 mutex_exit(&hqueue->strategy_lock);
2240 while (nio != NULL) {
2241 if (nio == last) {
2242 (void) biowait(nbuf);
2243 sema_v(io_done);
2244 break;
2245 /* The sentinel 'last' is not freed here. See above. */
2246 } else {
2247 (void) biowait(nio->bp);
2248 sema_v(nio->sema);
2250 tio = nio;
2251 nio = nio->contig_chain;
2252 kmem_cache_free(hio_cache, tio);
2255 return (0);
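/*
 * Illustrative sketch only (not part of hsfs, fenced off from the build):
 * a minimal userland model of the C-LOOK selection described above.
 * Requests are kept sorted by block number; we service the first pending
 * request at or beyond the last serviced block and wrap around to the
 * lowest pending block when nothing lies ahead.  The AVL trees, deadline
 * handling, coalescing and locking of the real scheduler are omitted.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static int
blkcmp(const void *a, const void *b)
{
	long x = *(const long *)a, y = *(const long *)b;

	return ((x < y) ? -1 : (x > y));
}

/* Return the index of the next pending request to service under C-LOOK. */
static int
clook_next(const long *blks, const int *done, int n, long last)
{
	int i;

	for (i = 0; i < n; i++)		/* keep sweeping upward */
		if (!done[i] && blks[i] >= last)
			return (i);
	for (i = 0; i < n; i++)		/* wrap to the lowest pending block */
		if (!done[i])
			return (i);
	return (-1);
}

int
main(void)
{
	long blks[] = { 120, 40, 200, 80, 160 };
	int done[5] = { 0 };
	int i, idx;
	long last = 100;		/* head position after the last I/O */

	qsort(blks, 5, sizeof (long), blkcmp);
	for (i = 0; i < 5; i++) {
		idx = clook_next(blks, done, 5, last);
		printf("service block %ld\n", blks[idx]);
		done[idx] = 1;
		last = blks[idx];
	}
	/* prints 120, 160, 200, then wraps to 40, 80 */
	return (0);
}
#endif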
2259 * Insert an I/O request into the I/O scheduler's pipeline.
2260 * Using an AVL tree makes it easy to reorder the I/O requests
2261 * based on logical block number.
2263 static void
2264 hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
2266 struct hsfs_queue *hqueue = fsp->hqueue;
2268 mutex_enter(&hqueue->hsfs_queue_lock);
2270 fsp->physical_read_bytes += hsio->bp->b_bcount;
2271 if (ra)
2272 fsp->readahead_bytes += hsio->bp->b_bcount;
2274 avl_add(&hqueue->deadline_tree, hsio);
2275 avl_add(&hqueue->read_tree, hsio);
2277 DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
2278 struct hsfs_queue *, hqueue, int, ra);
2280 mutex_exit(&hqueue->hsfs_queue_lock);
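/*
 * A hedged sketch of the producer side, for orientation only: the actual
 * read paths in this file (hsfs_getapage and its readahead code) build a
 * struct buf, wrap it in a struct hio, enqueue it and then wait on the
 * semaphore.  The helper name below and the exact buf setup are invented
 * for illustration; only the struct hio fields shown are taken from the
 * code above.
 */
#if 0
static int
hsched_issue_and_wait(struct hsfs *fsp, struct buf *bp, ksema_t *sema)
{
	struct hio *hsio;

	hsio = kmem_cache_alloc(hio_cache, KM_SLEEP);
	hsio->bp = bp;
	hsio->sema = sema;
	hsio->io_lblkno = bp->b_lblkno;
	hsio->nblocks = btodb(bp->b_bcount);
	hsio->contig_chain = NULL;
	hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());

	hsched_enqueue_io(fsp, hsio, 0);	/* 0: not a readahead request */
	(void) hsched_invoke_strategy(fsp);	/* may issue the queued I/O */
	sema_p(sema);				/* released via sema_v() above */
	return (geterror(bp));
}
#endif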
2283 /* ARGSUSED */
2284 static int
2285 hsfs_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
2286 caller_context_t *ct)
2288 struct hsfs *fsp;
2290 int error = 0;
2292 switch (cmd) {
2294 case _PC_NAME_MAX:
2295 fsp = VFS_TO_HSFS(vp->v_vfsp);
2296 *valp = fsp->hsfs_namemax;
2297 break;
2299 case _PC_FILESIZEBITS:
2300 *valp = 33; /* Without multi extent support: 4 GB - 2k */
2301 break;
2303 case _PC_TIMESTAMP_RESOLUTION:
2305 * HSFS keeps, at best, 1/100 second (10,000,000 ns) timestamp resolution.
2307 *valp = 10000000L;
2308 break;
2310 default:
2311 error = fs_pathconf(vp, cmd, valp, cr, ct);
2312 break;
2315 return (error);
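/*
 * Userland usage sketch (hypothetical mount path, not from this source):
 * the values exported above are visible through pathconf(2), e.g.
 * _PC_FILESIZEBITS reports 33 and _PC_TIMESTAMP_RESOLUTION reports
 * 10000000 nanoseconds for a file on a mounted hsfs image.
 */
#if 0
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	const char *path = "/media/cdrom0/file";	/* hypothetical */

	printf("NAME_MAX:      %ld\n", pathconf(path, _PC_NAME_MAX));
	printf("FILESIZEBITS:  %ld\n", pathconf(path, _PC_FILESIZEBITS));
	printf("TIMESTAMP_RES: %ld\n",
	    pathconf(path, _PC_TIMESTAMP_RESOLUTION));
	return (0);
}
#endif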
2318 const struct vnodeops hsfs_vnodeops = {
2319 .vnop_name = "hsfs",
2320 .vop_open = hsfs_open,
2321 .vop_close = hsfs_close,
2322 .vop_read = hsfs_read,
2323 .vop_getattr = hsfs_getattr,
2324 .vop_access = hsfs_access,
2325 .vop_lookup = hsfs_lookup,
2326 .vop_readdir = hsfs_readdir,
2327 .vop_readlink = hsfs_readlink,
2328 .vop_fsync = hsfs_fsync,
2329 .vop_inactive = hsfs_inactive,
2330 .vop_fid = hsfs_fid,
2331 .vop_seek = hsfs_seek,
2332 .vop_frlock = hsfs_frlock,
2333 .vop_getpage = hsfs_getpage,
2334 .vop_putpage = hsfs_putpage,
2335 .vop_map = hsfs_map,
2336 .vop_addmap = hsfs_addmap,
2337 .vop_delmap = hsfs_delmap,
2338 .vop_pathconf = hsfs_pathconf,