 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 * Vnode operations for the High Sierra filesystem

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/errno.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>

#include <sys/sunldi.h>
#include <sys/sunddi.h>

 * For struct modlinkage
#include <sys/modctl.h>

#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
#include <sys/fs/hsfs_impl.h>
#include <sys/fs/hsfs_susp.h>
#include <sys/fs/hsfs_rrip.h>
#include <sys/fs_subr.h>
/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;

 * This is the max number of taskq threads that will be created
 * if required. Since we are using a Dynamic TaskQ by default only
 * one thread is created initially.
 *
 * NOTE: In the usual hsfs use case this per fs instance number
 * of taskq threads should not place any undue load on a system.
 * Even on an unusual system with say 100 CDROM drives, 800 threads
 * will not be created unless all the drives are loaded and all
 * of them are saturated with I/O at the same time! If there is at
 * all a complaint of system load due to such an unusual case it
 * should be easy enough to change to one per-machine Dynamic TaskQ
 * for all hsfs mounts with a nthreads of say 32.
static int hsfs_taskq_nthreads = 8;	/* # of taskq threads per fs */

/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;

 * Kmem caches for heavily used small allocations. Using these kmem
 * caches provides a factor of 3 reduction in system time and greatly
 * aids overall throughput esp. on SPARC.
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;

 * This tunable allows us to ignore inode numbers from rrip-1.12.
 * In this case, we fall back to our default inode algorithm.
extern int use_rrip_inodes;

 * Free behind logic from UFS to tame our thirst for
 * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
static int freebehind = 1;
static int smallfile = 0;
static int cache_read_ahead = 0;
static uoff_t smallfile64 = 32 * 1024;
#define	SMALLFILE1_D 1000
#define	SMALLFILE2_D 10
static uoff_t smallfile1 = 32 * 1024;
static uoff_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0;	/* when to recompute */
static uint_t smallfile1_d = SMALLFILE1_D;
static uint_t smallfile2_d = SMALLFILE2_D;
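
/*
 * Illustrative example of how the freebehind thresholds above scale
 * (hypothetical numbers, not from any measured system): with 4 GB of
 * free memory and 4 online CPUs, percpufreeb in hsfs_read below is
 * 1 GB, so smallfile1 becomes roughly 1 GB / SMALLFILE1_D = ~1 MB and
 * smallfile2 roughly 1 GB / SMALLFILE2_D = ~100 MB, with smallfile1
 * floored at the 32k default above. Reads whose offset is past
 * smallfile2 may have their segmap slot released with SM_DONTNEED.
 */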
static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
int hsched_invoke_strategy(struct hsfs *fsp);
hsfs_fsync(vnode_t *cp, int syncflag, cred_t *cred, caller_context_t *ct)

hsfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
    struct caller_context *ct)

	 * if vp is of type VDIR, make sure dirent
	 * is filled up with all info (because of ptbl)
	if (vp->v_type == VDIR) {
		if (hp->hs_dirent.ext_size == 0)
			hs_filldirent(vp, &hp->hs_dirent);

	filesize = hp->hs_dirent.ext_size;

	if (uiop->uio_resid == 0 ||		/* No data wanted. */
	    uiop->uio_loffset > HS_MAXFILEOFF ||	/* Offset too big. */
	    uiop->uio_loffset >= filesize)	/* Past EOF. */

	 * We want to ask for only the "right" amount of data.
	 * In this case that means:-
	 *
	 * We can't get data from beyond our EOF. If asked,
	 * we will give a short read.
	 *
	 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
	 * These buffers are always MAXBSIZE aligned.
	 * If our starting offset is not MAXBSIZE aligned,
	 * we can only ask for less than MAXBSIZE bytes.
	 *
	 * If our requested offset and length are such that
	 * they belong in different MAXBSIZE aligned slots
	 * then we'll be making more than one call on
	 *
	 * This diagram shows the variables we use and their
	 *
	 *	|<-----MAXBSIZE----->|
	 *	+--------------------------...+
	 *	|.....mapon->|<--n-->|....*...|EOF
	 *	+--------------------------...+
	 *	uio_resid....|<---------->|
	 *	diff.........|<-------------->|
	 *
	 * So, in this case our offset is not aligned
	 * and our request takes us outside of the
	 * MAXBSIZE window. We will break this up into
	 * two segmap_getmapflt calls.
		mapon = uiop->uio_loffset & MAXBOFFSET;
		diff = filesize - uiop->uio_loffset;
		nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
		n = MIN(diff, nbytes);
		/* EOF or request satisfied. */

		 * Freebehind computation taken from:
		 * usr/src/uts/common/fs/ufs/ufs_vnops.c
		if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
			uint64_t percpufreeb;
			if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
			if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
			percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
			smallfile1 = percpufreeb / smallfile1_d;
			smallfile2 = percpufreeb / smallfile2_d;
			smallfile1 = MAX(smallfile1, smallfile);
			smallfile1 = MAX(smallfile1, smallfile64);
			smallfile2 = MAX(smallfile1, smallfile2);
			smallfile_update = drv_hztousec(ddi_get_lbolt())

		dofree = freebehind &&
		    hp->hs_prev_offset == uiop->uio_loffset &&

		base = segmap_getmapflt(segkmap, vp,
		    (uoff_t)uiop->uio_loffset, n, 1, S_READ);

		error = uiomove(base + mapon, n, UIO_READ, uiop);

			 * if read a whole block, or read to eof,
			 * won't need this buffer again soon.
			if (n + mapon == MAXBSIZE ||
			    uiop->uio_loffset == filesize)
				flags = SM_FREE | SM_ASYNC;
			if ((cache_read_ahead == 0) &&
			    uiop->uio_loffset > smallfile2)
				flags |= SM_DONTNEED;
			error = segmap_release(segkmap, base, flags);
			(void) segmap_release(segkmap, base, 0);
	} while (error == 0 && uiop->uio_resid > 0);
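
/*
 * Worked example of the window arithmetic above (hypothetical values,
 * assuming MAXBSIZE is 8k): a read of 6000 bytes at uio_loffset 10000
 * in a 64k file gives mapon = 10000 & 8191 = 1808, diff = 55536 and
 * nbytes = MIN(8192 - 1808, 6000) = 6000, so n = 6000 and the request
 * is satisfied by a single segmap_getmapflt call. A 7000 byte request
 * at the same offset would be capped at 6384 bytes here, with the
 * remainder picked up on the next pass of the do/while loop.
 */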
hsfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
    caller_context_t *ct)

	fsp = VFS_TO_HSFS(vp->v_vfsp);

	if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
		hs_filldirent(vp, &hp->hs_dirent);

	vap->va_type = IFTOVT(hp->hs_dirent.mode);
	vap->va_mode = hp->hs_dirent.mode;
	vap->va_uid = hp->hs_dirent.uid;
	vap->va_gid = hp->hs_dirent.gid;

	vap->va_fsid = vfsp->vfs_dev;
	vap->va_nodeid = (ino64_t)hp->hs_nodeid;
	vap->va_nlink = hp->hs_dirent.nlink;
	vap->va_size = (offset_t)hp->hs_dirent.ext_size;

	vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
	vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
	vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
	vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
	vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
	vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
	if (vp->v_type == VCHR || vp->v_type == VBLK)
		vap->va_rdev = hp->hs_dirent.r_dev;

	vap->va_blksize = vfsp->vfs_bsize;
	/* no. of blocks = no. of data blocks + no. of xar blocks */
	vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
	    (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
	vap->va_seq = hp->hs_seq;
hsfs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred,
    caller_context_t *ct)

	if (vp->v_type != VLNK)

	if (hp->hs_dirent.sym_link == NULL)

	return (uiomove(hp->hs_dirent.sym_link,
	    (size_t)MIN(hp->hs_dirent.ext_size,
	    uiop->uio_resid), UIO_READ, uiop));
hsfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)

	fsp = VFS_TO_HSFS(vp->v_vfsp);

	 * Note: acquiring and holding v_lock for quite a while
	 * here serializes on the vnode; this is unfortunate, but
	 * likely not to overly impact performance, as the underlying
	 * device (CDROM drive) is quite slow.
	rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
	mutex_enter(&hp->hs_contents_lock);
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 1) {
		panic("hsfs_inactive: v_count < 1");

	if (vp->v_count > 0 || (hp->hs_flags & HREF) == 0) {
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		rw_exit(&fsp->hsfs_hash_lock);

	if (vp->v_count == 0) {
		 * If there are no pages associated with the
		 * hsnode, give it back to the kmem_cache,
		 * else put at the end of this file system's
		 * internal free list.
		nopage = !vn_has_cached_data(vp);
		 * exit these locks now, since hs_freenode may
		 * kmem_free the hsnode and embedded vnode
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		hs_freenode(vp, fsp, nopage);
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
	rw_exit(&fsp->hsfs_hash_lock);
hsfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
    caller_context_t *ct, int *direntflags, pathname_t *realpnp)

	int namelen = (int)strlen(nm);

	 * If we're looking for ourself, life is simple.
	if (namelen == 1 && *nm == '.') {
		if (error = hs_access(dvp, (mode_t)VEXEC, cred))

	return (hs_dirlook(dvp, nm, namelen, vpp, cred));
hsfs_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)

	struct hs_direntry hd;
	uint_t offset;		/* real offset in directory */
	uint_t dirsiz;		/* real size of directory */
	int hdlen;		/* length of hs directory entry */
	long ndlen;		/* length of dirent entry */
	size_t bufsize;		/* size of dirent buffer */
	char *outbuf;		/* ptr to dirent buffer */
	uint_t last_offset;	/* last index into current dir block */
	ino64_t dirino;		/* temporary storage before storing in dirent */

	fsp = VFS_TO_HSFS(vp->v_vfsp);
	if (dhp->hs_dirent.ext_size == 0)
		hs_filldirent(vp, &dhp->hs_dirent);
	dirsiz = dhp->hs_dirent.ext_size;
	if (uiop->uio_loffset >= dirsiz) {	/* at or beyond EOF */

	ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
	offset = uiop->uio_loffset;

	dname_size = fsp->hsfs_namemax + 1;	/* 1 for the ending NUL */
	dname = kmem_alloc(dname_size, KM_SLEEP);
	bufsize = uiop->uio_resid + sizeof (struct dirent64);
	outbuf = kmem_alloc(bufsize, KM_SLEEP);
	nd = (struct dirent64 *)outbuf;

	while (offset < dirsiz) {
		bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));
		error = fbread(vp, (offset_t)(offset & MAXBMASK),
		    (unsigned int)bytes_wanted, S_READ, &fbp);

		blkp = (uchar_t *)fbp->fb_addr;
		last_offset = (offset & MAXBMASK) + fbp->fb_count;

#define	rel_offset(offset) ((offset) & MAXBOFFSET)	/* index into blkp */

		while (offset < last_offset) {
			 * Very similar validation code is found in
			 * process_dirblock(), hsfs_node.c.
			 * For an explanation, see there.
			 * It may make sense for the future to
			 * "consolidate" the code in hs_parsedir(),
			 * process_dirblock() and hsfs_readdir() into
			 * a single utility function.
			hdlen = (int)((uchar_t)
			    HDE_DIR_LEN(&blkp[rel_offset(offset)]));
			if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
			    offset + hdlen > last_offset) {
				 * advance to next sector boundary
				offset = roundup(offset + 1, HS_SECTOR_SIZE);
				hs_log_bogus_disk_warning(fsp,
				    HSFS_ERR_TRAILING_JUNK, 0);

			bzero(&hd, sizeof (hd));

			 * Just ignore invalid directory entries.
			 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
			if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
			    &hd, dname, &dnamelen, last_offset - offset)) {
				 * Determine if there is enough room
				ndlen = (long)DIRENT64_RECLEN((dnamelen));

				if ((ndlen + ((char *)nd - outbuf)) >
					fbrelse(fbp, S_READ);
					goto done; /* output buffer full */

				diroff = offset + hdlen;
				 * If the media carries rrip-v1.12 or newer,
				 * and we trust the inodes from the rrip data
				 * (use_rrip_inodes != 0), use that data. If the
				 * media has been created by a recent mkisofs
				 * version, we may trust all numbers in the
				 * starting extent number; otherwise, we cannot
				 * do this for zero sized files and symlinks,
				 * because if we did we'd end up mapping all of
				 * them to the same node. We use HS_DUMMY_INO
				 * in this case and make sure that we will not
				 * map all files to the same meta data.
				if (hd.inode != 0 && use_rrip_inodes) {
				} else if ((hd.ext_size == 0 ||
				    hd.sym_link != NULL) &&
				    (fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
					dirino = HS_DUMMY_INO;

				/* strncpy(9f) will zero uninitialized bytes */
				ASSERT(strlen(dname) + 1 <=
				    DIRENT64_NAMELEN(ndlen));
				(void) strncpy(nd->d_name, dname,
				    DIRENT64_NAMELEN(ndlen));
				nd->d_reclen = (ushort_t)ndlen;
				nd->d_off = (offset_t)diroff;
				nd = (struct dirent64 *)((char *)nd + ndlen);

				 * free up space allocated for symlink
				if (hd.sym_link != NULL) {
					kmem_free(hd.sym_link,
					    (size_t)(hd.ext_size+1));

	fbrelse(fbp, S_READ);

	 * Got here for one of the following reasons:
	 * 1) outbuf is full (error == 0)
	 * 2) end of directory reached (error == 0)
	 * 3) error reading directory sector (error != 0)
	 * 4) directory entry crosses sector boundary (error == 0)
	 *
	 * If any directory entries have been copied, don't report
	 * case 4. Instead, return the valid directory entries.
	 *
	 * If no entries have been copied, report the error.
	 * If case 4, this will be indistinguishable from EOF.
	ndlen = ((char *)nd - outbuf);
	error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
	uiop->uio_loffset = offset;

	kmem_free(dname, dname_size);
	kmem_free(outbuf, bufsize);
	if (eofp && error == 0)
		*eofp = (uiop->uio_loffset >= dirsiz);
hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)

	if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
		fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);

	fid = (struct hsfid *)fidp;
	fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
	mutex_enter(&hp->hs_contents_lock);
	fid->hf_dir_lbn = hp->hs_dir_lbn;
	fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
	fid->hf_ino = hp->hs_nodeid;
	mutex_exit(&hp->hs_contents_lock);
hsfs_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)

hsfs_close(struct vnode *vp, int flag, int count, offset_t offset,
    struct cred *cred, caller_context_t *ct)

	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);

hsfs_access(struct vnode *vp, int mode, int flags, cred_t *cred,
    caller_context_t *ct)

	return (hs_access(vp, (mode_t)mode, cred));
 * the seek time of a CD-ROM is very slow, and data transfer
 * rate is even worse (max. 150K per sec). The design
 * decision is to reduce access to cd-rom as much as possible,
 * and to transfer a sizable block (read-ahead) of data at a time.
 * UFS-style read ahead of one block at a time is not appropriate,
 * and is not supported
 *
 * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
#define	KLUSTSIZE	(56 * 1024)
/* we don't support read ahead */
int hsfs_lostpage;	/* no. of times we lost original page */

 * Used to prevent biodone() from releasing buf resources that
 * we didn't allocate in quite the usual way.
hsfs_iodone(struct buf *bp)
 * The taskq thread that invokes the scheduling function to ensure
 * that all readaheads are complete and cleans up the associated
 * memory and releases the page lock.
hsfs_ra_task(void *arg)

	struct hio_info *info = arg;

	ASSERT(info->pp != NULL);

	for (count = 0; count < info->bufsused; count++) {
		wbuf = &(info->bufs[count]);

		DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
		while (sema_tryp(&(info->sema[count])) == 0) {
			if (hsched_invoke_strategy(info->fsp)) {
				sema_p(&(info->sema[count]));

		sema_destroy(&(info->sema[count]));
		DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
		biofini(&(info->bufs[count]));

	for (count = 0; count < info->bufsused; count++) {
		if (info->vas[count] != NULL) {
			ppmapout(info->vas[count]);

	kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
	kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
	kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));

	pvn_read_done(info->pp, 0);
	kmem_cache_free(hio_info_cache, info);
 * Submit asynchronous readahead requests to the I/O scheduler
 * depending on the number of pages to read ahead. These requests
 * are asynchronous to the calling thread but I/O requests issued
 * subsequently by other threads with higher LBNs must wait for
 * these readaheads to complete since we have a single ordered
 * I/O pipeline. Thus these readaheads are semi-asynchronous.
 * A TaskQ handles waiting for the readaheads to complete.
 *
 * This function is mostly a copy of hsfs_getapage but somewhat
 * simpler. A readahead request is aborted if page allocation
hsfs_getpage_ra(struct vnode *vp, uoff_t off, struct seg *seg,
    caddr_t addr, struct hsnode *hp, struct hsfs *fsp, int xarsiz,
    offset_t bof, int chunk_lbn_count, int chunk_data_bytes)

	struct page *pp, *searchp, *lastp;
	uint_t io_off, io_len;
	uint_t which_chunk_lbn;
	offset_t offset_bytes;
	uint_t remaining_bytes;
	int remainder;		/* must be signed */
	diskaddr_t driver_block;
	struct hio_info *info;

	ASSERT(fsp->hqueue != NULL);

	if (addr >= seg->s_base + seg->s_size) {

	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;	/* bytes per logical block */

	filsiz = hp->hs_dirent.ext_size;

	extension += hp->hs_ra_bytes;

	 * Some CD writers (e.g. Kodak Photo CD writers)
	 * create CDs in TAO mode and reserve tracks that
	 * are not completely written. Some sectors remain
	 * unreadable for this reason and give I/O errors.
	 * Also, there's no point in reading sectors
	 * we'll never look at. So, if we're asked to go
	 * beyond the end of a file, truncate to the length
	 *
	 * Additionally, this behaviour is required by section
	 * 6.4.5 of ISO 9660:1988(E).
	len = MIN(extension ? extension : PAGESIZE, filsiz - off);

	/* A little paranoia */

	 * After all that, make sure we're asking for things in units
	 * that bdev_strategy() will understand (see bug 4202551).
	len = roundup(len, DEV_BSIZE);

	pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
	    &io_len_tmp, off, len, 1);

		hp->hs_num_contig = 0;
		hp->hs_prev_offset = 0;

	io_off = (uint_t)io_off_tmp;
	io_len = (uint_t)io_len_tmp;

	/* check for truncation */
	 * xxx Clean up and return EIO instead?
	 * xxx Ought to go to uoff_t for everything, but we
	 * xxx call lots of things that want uint_t arguments.
	ASSERT(io_off == io_off_tmp);

	 * get enough buffers for worst-case scenario
	 * (i.e., no coalescing possible).
	bufcnt = (len + secsize - 1) / secsize;
	bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
	vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);

	 * Allocate an array of semaphores since we are doing I/O
	fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);

	 * If our filesize is not an integer multiple of PAGESIZE,
	 * we zero that part of the last page that's between EOF and
	 * the PAGESIZE boundary.
	xlen = io_len & PAGEOFFSET;
		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

	DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);

	io_end = io_off + io_len;
	for (count = 0, byte_offset = io_off;
	    byte_offset < io_end;
		ASSERT(count < bufcnt);

		bioinit(&bufs[count]);
		bufs[count].b_edev = devvp->v_rdev;
		bufs[count].b_dev = cmpdev(devvp->v_rdev);
		bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
		bufs[count].b_iodone = hsfs_iodone;
		bufs[count].b_vp = vp;
		bufs[count].b_file = vp;

		/* Compute disk address for interleaving. */

		/* considered without skips */
		which_chunk_lbn = byte_offset / chunk_data_bytes;

		/* factor in skips */
		offset_lbn = which_chunk_lbn * chunk_lbn_count;

		/* convert to physical byte offset for lbn */
		offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);

		/* don't forget offset into lbn */
		offset_extra = byte_offset % chunk_data_bytes;

		/* get virtual block number for driver */
		driver_block = lbtodb(bof + xarsiz
		    + offset_bytes + offset_extra);

		if (lastp != searchp) {
			/* this branch taken first time through loop */
			va = vas[count] = ppmapin(searchp, PROT_WRITE,
			/* ppmapin() guarantees not to return NULL */

		bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
		bufs[count].b_offset =
		    (offset_t)(byte_offset - io_off + off);

		 * We specifically use the b_lblkno member here
		 * as even in the 32 bit world driver_block can
		 * get very large in line with the ISO9660 spec.
		bufs[count].b_lblkno = driver_block;

		remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)

		 * remaining_bytes can't be zero, as we derived
		 * which_chunk_lbn directly from byte_offset.
		if ((remaining_bytes + byte_offset) < (off + len)) {
			/* coalesce-read the rest of the chunk */
			bufs[count].b_bcount = remaining_bytes;
			/* get the final bits */
			bufs[count].b_bcount = off + len - byte_offset;

		remainder = PAGESIZE - (byte_offset % PAGESIZE);
		if (bufs[count].b_bcount > remainder) {
			bufs[count].b_bcount = remainder;

		bufs[count].b_bufsize = bufs[count].b_bcount;
		if (((offset_t)byte_offset + bufs[count].b_bcount) >

		byte_offset += bufs[count].b_bcount;

		 * We are scheduling I/O so we need to enqueue
		 * requests rather than calling bdev_strategy
		 * here. A later invocation of the scheduling
		 * function will take care of doing the actual
		 * I/O as it selects requests from the queue as
		 * per the scheduling logic.
		struct hio *hsio = kmem_cache_alloc(hio_cache,

		sema_init(&fio_done[count], 0, NULL,
		hsio->bp = &bufs[count];
		hsio->sema = &fio_done[count];
		hsio->io_lblkno = bufs[count].b_lblkno;
		hsio->nblocks = howmany(hsio->bp->b_bcount,

		/* used for deadline */
		hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());

		/* for I/O coalescing */
		hsio->contig_chain = NULL;
		hsched_enqueue_io(fsp, hsio, 1);

		lwp_stat_update(LWP_STAT_INBLK, 1);

		if ((remainder - bufs[count].b_bcount) < 1) {
			searchp = searchp->p_next;

	info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);

	info->sema = fio_done;
	info->bufsused = bufsused;
	info->bufcnt = bufcnt;

	(void) taskq_dispatch(fsp->hqueue->ra_task,
	    hsfs_ra_task, info, KM_SLEEP);

	 * The I/O locked pages are unlocked in our taskq thread.
 * Each file may have a different interleaving on disk. This makes
 * things somewhat interesting. The gist is that there are some
 * number of contiguous data sectors, followed by some other number
 * of contiguous skip sectors. The sum of those two sets of sectors
 * defines the interleave size. Unfortunately, it means that we generally
 * can't simply read N sectors starting at a given offset to satisfy
 * any given request.
 *
 * What we do is get the relevant memory pages via pvn_read_kluster(),
 * then stride through the interleaves, setting up a buf for each
 * sector that needs to be brought in. Instead of kmem_alloc'ing
 * space for the sectors, though, we just point at the appropriate
 * spot in the relevant page for each of them. This saves us a bunch
 *
 * NOTICE: The code below in hsfs_getapage is mostly the same as the code
 * in hsfs_getpage_ra above (with some omissions). If you are
 * making any change to this function, please also look at
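
/*
 * Worked example of the interleave arithmetic used below (hypothetical
 * media layout): with 2k logical blocks, intlf_sz = 4 and intlf_sk = 2,
 * chunk_lbn_count is 6 and chunk_data_bytes is 8k. A request at file
 * byte_offset 20000 then yields which_chunk_lbn = 20000 / 8192 = 2,
 * offset_lbn = 2 * 6 = 12, offset_bytes = 24576 and offset_extra =
 * 20000 % 8192 = 3616, so the sector is read from byte 28192 past the
 * start of the file's extent (plus any XAR bytes).
 */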
hsfs_getapage(struct vnode *vp, uoff_t off, size_t len, uint_t *protp,
    struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cred)

	struct page *pp, *searchp, *lastp;
	struct vnode *devvp;
	ulong_t byte_offset;
	uint_t io_off, io_len;
	uint_t which_chunk_lbn;
	uint_t offset_extra;
	offset_t offset_bytes;
	uint_t remaining_bytes;
	int remainder;		/* must be signed */
	int chunk_lbn_count;
	int chunk_data_bytes;
	diskaddr_t driver_block;

	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it. If the pages are ever actually
	 * needed, they'll get brought in then.

	fsp = VFS_TO_HSFS(vp->v_vfsp);
	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;	/* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	/* disk addr for start of file */
	bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);

	/* xarsiz byte must be skipped for data */
	xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;

	/* how many logical blocks in an interleave (data+skip) */
	chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;

	if (chunk_lbn_count == 0) {
		chunk_lbn_count = 1;

	 * Convert interleaving size into bytes. The zero case
	 * (no interleaving) optimization is handled as a side-
	 * effect of the read-ahead logic.
	if (hp->hs_dirent.intlf_sz == 0) {
		chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
		 * Optimization: If our pagesize is a multiple of LBN
		 * bytes, we can avoid breaking up a page into individual
		 * lbn-sized requests.
		if (PAGESIZE % chunk_data_bytes == 0) {
			chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
			chunk_data_bytes = PAGESIZE;
		    LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);

	 * Do some read-ahead. This mostly saves us a bit of
	 * system cpu time more than anything else when doing
	 * sequential reads. At some point, could do the
	 * read-ahead asynchronously which might gain us something
	 * on wall time, but it seems unlikely....
	 *
	 * We do the easy case here, which is to read through
	 * the end of the chunk, minus whatever's at the end that
	 * won't exactly fill a page.
	if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
		which_chunk_lbn = (off + len) / chunk_data_bytes;
		extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
		extension -= (extension % PAGESIZE);
		extension = roundup(len, PAGESIZE);

	atomic_inc_64(&fsp->total_pages_requested);

	/* search for page in buffer */
	if ((pagefound = page_exists(&vp->v_object, off)) == 0) {
		 * Need to really do disk IO to get the page.
		extension += hp->hs_ra_bytes;

		len = (extension != 0) ? extension : PAGESIZE;

		 * Some cd writers don't write sectors that aren't
		 * used. Also, there's no point in reading sectors
		 * we'll never look at. So, if we're asked to go
		 * beyond the end of a file, truncate to the length
		 *
		 * Additionally, this behaviour is required by section
		 * 6.4.5 of ISO 9660:1988(E).
		if (off < filsiz && off + len > filsiz)

		 * After all that, make sure we're asking for things
		 * in units that bdev_strategy() will understand.
		len = roundup(len, DEV_BSIZE);

		pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
		    &io_len_tmp, off, len, 0);

			 * Pressure on memory, roll back readahead
			hp->hs_num_contig = 0;
			hp->hs_ra_bytes = 0;
			hp->hs_prev_offset = 0;

		io_off = (uint_t)io_off_tmp;
		io_len = (uint_t)io_len_tmp;

		/* check for truncation */
		 * xxx Clean up and return EIO instead?
		 * xxx Ought to go to uoff_t for everything, but we
		 * xxx call lots of things that want uint_t arguments.
		ASSERT(io_off == io_off_tmp);

		 * get enough buffers for worst-case scenario
		 * (i.e., no coalescing possible).
		bufcnt = (len + secsize - 1) / secsize;
		bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
		vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);

		 * Allocate an array of semaphores if we are doing I/O
		if (fsp->hqueue != NULL)
			fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
		for (count = 0; count < bufcnt; count++) {
			bioinit(&bufs[count]);
			bufs[count].b_edev = devvp->v_rdev;
			bufs[count].b_dev = cmpdev(devvp->v_rdev);
			bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
			bufs[count].b_iodone = hsfs_iodone;
			bufs[count].b_vp = vp;
			bufs[count].b_file = vp;

		 * If our filesize is not an integer multiple of PAGESIZE,
		 * we zero that part of the last page that's between EOF and
		 * the PAGESIZE boundary.
		xlen = io_len & PAGEOFFSET;
			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

		io_end = io_off + io_len;
		for (count = 0, byte_offset = io_off;
		    byte_offset < io_end; count++) {
			ASSERT(count < bufcnt);

			/* Compute disk address for interleaving. */

			/* considered without skips */
			which_chunk_lbn = byte_offset / chunk_data_bytes;

			/* factor in skips */
			offset_lbn = which_chunk_lbn * chunk_lbn_count;

			/* convert to physical byte offset for lbn */
			offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);

			/* don't forget offset into lbn */
			offset_extra = byte_offset % chunk_data_bytes;

			/* get virtual block number for driver */
			    lbtodb(bof + xarsiz + offset_bytes + offset_extra);

			if (lastp != searchp) {
				/* this branch taken first time through loop */
				    ppmapin(searchp, PROT_WRITE, (caddr_t)-1);
				/* ppmapin() guarantees not to return NULL */

			bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
			bufs[count].b_offset =
			    (offset_t)(byte_offset - io_off + off);

			 * We specifically use the b_lblkno member here
			 * as even in the 32 bit world driver_block can
			 * get very large in line with the ISO9660 spec.
			bufs[count].b_lblkno = driver_block;

			    ((which_chunk_lbn + 1) * chunk_data_bytes)

			 * remaining_bytes can't be zero, as we derived
			 * which_chunk_lbn directly from byte_offset.
			if ((remaining_bytes + byte_offset) < (off + len)) {
				/* coalesce-read the rest of the chunk */
				bufs[count].b_bcount = remaining_bytes;
				/* get the final bits */
				bufs[count].b_bcount = off + len - byte_offset;

			 * It would be nice to do multiple pages'
			 * worth at once here when the opportunity
			 * arises, as that has been shown to improve
			 * our wall time. However, to do that
			 * requires that we use the pageio subsystem,
			 * which doesn't mix well with what we're
			 * already using here. We can't use pageio
			 * all the time, because that subsystem
			 * assumes that a page is stored in N
			 * contiguous blocks on the device.
			 * Interleaving violates that assumption.
			 *
			 * Update: This is now not so big a problem
			 * because of the I/O scheduler sitting below
			 * that can re-order and coalesce I/O requests.
			remainder = PAGESIZE - (byte_offset % PAGESIZE);
			if (bufs[count].b_bcount > remainder) {
				bufs[count].b_bcount = remainder;

			bufs[count].b_bufsize = bufs[count].b_bcount;
			if (((offset_t)byte_offset + bufs[count].b_bcount) >

			byte_offset += bufs[count].b_bcount;

			if (fsp->hqueue == NULL) {
				(void) bdev_strategy(&bufs[count]);
				 * We are scheduling I/O so we need to enqueue
				 * requests rather than calling bdev_strategy
				 * here. A later invocation of the scheduling
				 * function will take care of doing the actual
				 * I/O as it selects requests from the queue as
				 * per the scheduling logic.
				struct hio *hsio = kmem_cache_alloc(hio_cache,

				sema_init(&fio_done[count], 0, NULL,
				    SEMA_DEFAULT, NULL);
				hsio->bp = &bufs[count];
				hsio->sema = &fio_done[count];
				hsio->io_lblkno = bufs[count].b_lblkno;
				hsio->nblocks = howmany(hsio->bp->b_bcount,

				/* used for deadline */
				hsio->io_timestamp =
				    drv_hztousec(ddi_get_lbolt());

				/* for I/O coalescing */
				hsio->contig_chain = NULL;
				hsched_enqueue_io(fsp, hsio, 0);

			lwp_stat_update(LWP_STAT_INBLK, 1);

			if ((remainder - bufs[count].b_bcount) < 1) {
				searchp = searchp->p_next;

		/* Now wait for everything to come in */
		if (fsp->hqueue == NULL) {
			for (count = 0; count < bufsused; count++) {
					err = biowait(&bufs[count]);
					(void) biowait(&bufs[count]);
			for (count = 0; count < bufsused; count++) {
				 * Invoke scheduling function till our buf
				 * is processed. In doing this it might
				 * process bufs enqueued by other threads
				wbuf = &bufs[count];
				DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
				while (sema_tryp(&fio_done[count]) == 0) {
					 * hsched_invoke_strategy will return 1
					 * if the I/O queue is empty. This means
					 * that there is another thread who has
					 * issued our buf and is waiting. So we
					 * just block instead of spinning.
					if (hsched_invoke_strategy(fsp)) {
						sema_p(&fio_done[count]);
				sema_destroy(&fio_done[count]);
				DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
					err = geterror(wbuf);
			kmem_free(fio_done, bufcnt * sizeof (ksema_t));

		/* Don't leak resources */
		for (count = 0; count < bufcnt; count++) {
			biofini(&bufs[count]);
			if (count < bufsused && vas[count] != NULL) {
				ppmapout(vas[count]);

		kmem_free(vas, bufcnt * sizeof (caddr_t));
		kmem_free(bufs, bufcnt * sizeof (struct buf));

			pvn_read_done(pp, B_ERROR);

	 * Lock the requested page, and the one after it if possible.
	 * Don't bother if our caller hasn't given us a place to stash
	 * the page pointers, since otherwise we'd lock pages that would
	 * never get unlocked.
	 * Make sure it's in memory before we say it's here.
	if ((pp = page_lookup(&vp->v_object, off, SE_SHARED)) == NULL) {

	atomic_inc_64(&fsp->cache_read_pages);

	 * Try to lock the next page, if it exists, without
	/* LINTED (plsz is unsigned) */
	for (soff = off + PAGESIZE; plsz > 0;
	    soff += PAGESIZE, plsz -= PAGESIZE) {
		pp = page_lookup_nowait(&vp->v_object, (uoff_t)soff,

	 * Schedule a semi-asynchronous readahead if we are
	 * accessing the last cached page for the current
	 *
	 * Doing this here means that readaheads will be
	 * issued only if cache-hits occur. This is an advantage
	 * since cache-hits would mean that readahead is giving
	 * the desired benefit. If cache-hits do not occur there
	 * is no point in reading ahead of time - the system
	if (fsp->hqueue != NULL &&
	    hp->hs_prev_offset - off == PAGESIZE &&
	    hp->hs_prev_offset < filsiz &&
	    hp->hs_ra_bytes > 0 &&
	    !page_exists(&vp->v_object, hp->hs_prev_offset)) {
		(void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
		    addr + PAGESIZE, hp, fsp, xarsiz, bof,
		    chunk_lbn_count, chunk_data_bytes);

	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
1509 hsfs_getpage(struct vnode
*vp
, offset_t off
, size_t len
, uint_t
*protp
,
1510 struct page
*pl
[], size_t plsz
, struct seg
*seg
, caddr_t addr
,
1511 enum seg_rw rw
, struct cred
*cred
, caller_context_t
*ct
)
1517 fsp
= VFS_TO_HSFS(vp
->v_vfsp
);
1520 /* does not support write */
1521 if (rw
== S_WRITE
) {
1525 if (vp
->v_flag
& VNOMAP
) {
1529 ASSERT(off
<= HS_MAXFILEOFF
);
1532 * Determine file data size for EOF check.
1534 filsiz
= hp
->hs_dirent
.ext_size
;
1535 if ((off
+ len
) > (offset_t
)(filsiz
+ PAGEOFFSET
) && seg
!= segkmap
)
1536 return (EFAULT
); /* beyond EOF */
1539 * Async Read-ahead computation.
1540 * This attempts to detect sequential access pattern and
1541 * enables reading extra pages ahead of time.
1543 if (fsp
->hqueue
!= NULL
) {
1545 * This check for sequential access also takes into
1546 * account segmap weirdness when reading in chunks
1547 * less than the segmap size of 8K.
1549 if (hp
->hs_prev_offset
== off
|| (off
<
1550 hp
->hs_prev_offset
&& off
+ MAX(len
, PAGESIZE
)
1551 >= hp
->hs_prev_offset
)) {
1552 if (hp
->hs_num_contig
<
1553 (seq_contig_requests
- 1)) {
1554 hp
->hs_num_contig
++;
1558 * We increase readahead quantum till
1559 * a predefined max. max_readahead_bytes
1560 * is a multiple of PAGESIZE.
1562 if (hp
->hs_ra_bytes
<
1563 fsp
->hqueue
->max_ra_bytes
) {
1564 hp
->hs_ra_bytes
+= PAGESIZE
;
1569 * Not contiguous so reduce read ahead counters.
1571 if (hp
->hs_ra_bytes
> 0)
1572 hp
->hs_ra_bytes
-= PAGESIZE
;
1574 if (hp
->hs_ra_bytes
<= 0) {
1575 hp
->hs_ra_bytes
= 0;
1576 if (hp
->hs_num_contig
> 0)
1577 hp
->hs_num_contig
--;
1581 * Length must be rounded up to page boundary.
1582 * since we read in units of pages.
1584 hp
->hs_prev_offset
= off
+ roundup(len
, PAGESIZE
);
1585 DTRACE_PROBE1(hsfs_compute_ra
, struct hsnode
*, hp
);
1590 return (pvn_getpages(hsfs_getapage
, vp
, off
, len
, protp
, pl
, plsz
,
1591 seg
, addr
, rw
, cred
));
1597 * This function should never be called. We need to have it to pass
1598 * it as an argument to other functions.
1602 hsfs_putapage(vnode_t
*vp
, page_t
*pp
, uoff_t
*offp
, size_t *lenp
,
1603 int flags
, cred_t
*cr
)
1605 /* should never happen - just destroy it */
1606 cmn_err(CE_NOTE
, "hsfs_putapage: dirty HSFS page");
1607 pvn_write_done(pp
, B_ERROR
| B_WRITE
| B_INVAL
| B_FORCE
| flags
);
1613 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
1614 * B_INVAL is set by:
1616 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
1617 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
1618 * which translates to an MC_SYNC with the MS_INVALIDATE flag.
1620 * The B_FREE (as well as the B_DONTNEED) flag is set when the
1621 * MADV_SEQUENTIAL advice has been used. fop_putpage is invoked
1622 * from SEGVN to release pages behind a pagefault.
1626 hsfs_putpage(struct vnode
*vp
, offset_t off
, size_t len
, int flags
,
1627 struct cred
*cr
, caller_context_t
*ct
)
1631 if (vp
->v_count
== 0) {
1632 panic("hsfs_putpage: bad v_count");
1636 if (vp
->v_flag
& VNOMAP
)
1639 ASSERT(off
<= HS_MAXFILEOFF
);
1641 if (!vn_has_cached_data(vp
)) /* no pages mapped */
1644 if (len
== 0) { /* from 'off' to EOF */
1645 error
= pvn_vplist_dirty(vp
, off
, hsfs_putapage
, flags
, cr
);
1647 offset_t end_off
= off
+ len
;
1648 offset_t file_size
= VTOH(vp
)->hs_dirent
.ext_size
;
1651 file_size
= (file_size
+ PAGESIZE
- 1) & PAGEMASK
;
1652 if (end_off
> file_size
)
1653 end_off
= file_size
;
1655 for (io_off
= off
; io_off
< end_off
; io_off
+= PAGESIZE
) {
1659 * We insist on getting the page only if we are
1660 * about to invalidate, free or write it and
1661 * the B_ASYNC flag is not set.
1663 if ((flags
& B_INVAL
) || ((flags
& B_ASYNC
) == 0)) {
1664 pp
= page_lookup(&vp
->v_object
, io_off
,
1665 (flags
& (B_INVAL
| B_FREE
)) ? SE_EXCL
: SE_SHARED
);
1667 pp
= page_lookup_nowait(&vp
->v_object
,
1669 (flags
& B_FREE
) ? SE_EXCL
: SE_SHARED
);
1676 * Normally pvn_getdirty() should return 0, which
1677 * implies that it has done the job for us.
1678 * The shouldn't-happen scenario is when it returns 1.
1679 * This means that the page has been modified and
1680 * needs to be put back.
1681 * Since we can't write on a CD, we fake a failed
1682 * I/O and force pvn_write_done() to destroy the page.
1684 if (pvn_getdirty(pp
, flags
) == 1) {
1686 "hsfs_putpage: dirty HSFS page");
1687 pvn_write_done(pp
, flags
|
1688 B_ERROR
| B_WRITE
| B_INVAL
| B_FORCE
);
1698 hsfs_map(struct vnode
*vp
, offset_t off
, struct as
*as
, caddr_t
*addrp
,
1699 size_t len
, uchar_t prot
, uchar_t maxprot
, uint_t flags
, struct cred
*cred
,
1700 caller_context_t
*ct
)
1702 struct segvn_crargs vn_a
;
1705 /* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
1707 if (vp
->v_flag
& VNOMAP
)
1710 if ((prot
& PROT_WRITE
) && (flags
& MAP_SHARED
))
1713 if (off
> HS_MAXFILEOFF
|| off
< 0 ||
1714 (off
+ len
) < 0 || (off
+ len
) > HS_MAXFILEOFF
)
1717 if (vp
->v_type
!= VREG
) {
1722 * If file is being locked, disallow mapping.
1724 if (vn_has_mandatory_locks(vp
, VTOH(vp
)->hs_dirent
.mode
))
1728 error
= choose_addr(as
, addrp
, len
, off
, ADDR_VACALIGN
, flags
);
1736 vn_a
.type
= flags
& MAP_TYPE
;
1738 vn_a
.maxprot
= maxprot
;
1739 vn_a
.flags
= flags
& ~MAP_TYPE
;
1743 vn_a
.lgrp_mem_policy_flags
= 0;
1745 error
= as_map(as
, *addrp
, len
, segvn_create
, &vn_a
);
1752 hsfs_addmap(struct vnode
*vp
, offset_t off
, struct as
*as
, caddr_t addr
,
1753 size_t len
, uchar_t prot
, uchar_t maxprot
, uint_t flags
, struct cred
*cr
,
1754 caller_context_t
*ct
)
1758 if (vp
->v_flag
& VNOMAP
)
1762 mutex_enter(&hp
->hs_contents_lock
);
1763 hp
->hs_mapcnt
+= btopr(len
);
1764 mutex_exit(&hp
->hs_contents_lock
);
1770 hsfs_delmap(struct vnode
*vp
, offset_t off
, struct as
*as
, caddr_t addr
,
1771 size_t len
, uint_t prot
, uint_t maxprot
, uint_t flags
, struct cred
*cr
,
1772 caller_context_t
*ct
)
1776 if (vp
->v_flag
& VNOMAP
)
1780 mutex_enter(&hp
->hs_contents_lock
);
1781 hp
->hs_mapcnt
-= btopr(len
); /* Count released mappings */
1782 ASSERT(hp
->hs_mapcnt
>= 0);
1783 mutex_exit(&hp
->hs_contents_lock
);
1789 hsfs_seek(struct vnode
*vp
, offset_t ooff
, offset_t
*noffp
,
1790 caller_context_t
*ct
)
1792 return ((*noffp
< 0 || *noffp
> MAXOFFSET_T
) ? EINVAL
: 0);
1797 hsfs_frlock(struct vnode
*vp
, int cmd
, struct flock64
*bfp
, int flag
,
1798 offset_t offset
, struct flk_callback
*flk_cbp
, cred_t
*cr
,
1799 caller_context_t
*ct
)
1801 struct hsnode
*hp
= VTOH(vp
);
1804 * If the file is being mapped, disallow fs_frlock.
1805 * We are not holding the hs_contents_lock while checking
1806 * hs_mapcnt because the current locking strategy drops all
1807 * locks before calling fs_frlock.
1808 * So, hs_mapcnt could change before we enter fs_frlock making
1809 * it meaningless to have held hs_contents_lock in the first place.
1811 if (hp
->hs_mapcnt
> 0 && MANDLOCK(vp
, hp
->hs_dirent
.mode
))
1814 return (fs_frlock(vp
, cmd
, bfp
, flag
, offset
, flk_cbp
, cr
, ct
));
1818 hsched_deadline_compare(const void *x1
, const void *x2
)
1820 const struct hio
*h1
= x1
;
1821 const struct hio
*h2
= x2
;
1823 if (h1
->io_timestamp
< h2
->io_timestamp
)
1825 if (h1
->io_timestamp
> h2
->io_timestamp
)
1828 if (h1
->io_lblkno
< h2
->io_lblkno
)
1830 if (h1
->io_lblkno
> h2
->io_lblkno
)
1842 hsched_offset_compare(const void *x1
, const void *x2
)
1844 const struct hio
*h1
= x1
;
1845 const struct hio
*h2
= x2
;
1847 if (h1
->io_lblkno
< h2
->io_lblkno
)
1849 if (h1
->io_lblkno
> h2
->io_lblkno
)
1861 hsched_init_caches(void)
1863 hio_cache
= kmem_cache_create("hsfs_hio_cache",
1864 sizeof (struct hio
), 0, NULL
,
1865 NULL
, NULL
, NULL
, NULL
, 0);
1867 hio_info_cache
= kmem_cache_create("hsfs_hio_info_cache",
1868 sizeof (struct hio_info
), 0, NULL
,
1869 NULL
, NULL
, NULL
, NULL
, 0);
1873 hsched_fini_caches(void)
1875 kmem_cache_destroy(hio_cache
);
1876 kmem_cache_destroy(hio_info_cache
);
1880 * Initialize I/O scheduling structures. This is called via hsfs_mount
1883 hsched_init(struct hsfs
*fsp
, int fsid
, struct modlinkage
*modlinkage
)
1885 struct hsfs_queue
*hqueue
= fsp
->hqueue
;
1886 struct vnode
*vp
= fsp
->hsfs_devvp
;
1888 /* TaskQ name of the form: hsched_task_ + stringof(int) */
1891 struct dk_cinfo info
;
1896 * Default maxtransfer = 16k chunk
1898 hqueue
->dev_maxtransfer
= 16384;
1901 * Try to fetch the maximum device transfer size. This is used to
1902 * ensure that a coalesced block does not exceed the maxtransfer.
1904 err
= ldi_ident_from_mod(modlinkage
, &li
);
1906 cmn_err(CE_NOTE
, "hsched_init: Querying device failed");
1907 cmn_err(CE_NOTE
, "hsched_init: ldi_ident_from_mod err=%d\n",
1912 err
= ldi_open_by_dev(&(vp
->v_rdev
), OTYP_CHR
, FREAD
, CRED(), &lh
, li
);
1913 ldi_ident_release(li
);
1915 cmn_err(CE_NOTE
, "hsched_init: Querying device failed");
1916 cmn_err(CE_NOTE
, "hsched_init: ldi_open err=%d\n", err
);
1920 error
= ldi_ioctl(lh
, DKIOCINFO
, (intptr_t)&info
, FKIOCTL
,
1922 err
= ldi_close(lh
, FREAD
, CRED());
1924 cmn_err(CE_NOTE
, "hsched_init: Querying device failed");
1925 cmn_err(CE_NOTE
, "hsched_init: ldi_close err=%d\n", err
);
1929 hqueue
->dev_maxtransfer
= ldbtob(info
.dki_maxtransfer
);
1934 * Max size of data to read ahead for sequential access pattern.
1935 * Conservative, to avoid letting the underlying CD drive spin
1936 * down, in case the application is reading slowly.
1937 * We read ahead up to a max of 8 pages.
1939 hqueue
->max_ra_bytes
= PAGESIZE
* 8;
1941 mutex_init(&(hqueue
->hsfs_queue_lock
), NULL
, MUTEX_DEFAULT
, NULL
);
1942 mutex_init(&(hqueue
->strategy_lock
), NULL
, MUTEX_DEFAULT
, NULL
);
1943 avl_create(&(hqueue
->read_tree
), hsched_offset_compare
,
1944 sizeof (struct hio
), offsetof(struct hio
, io_offset_node
));
1945 avl_create(&(hqueue
->deadline_tree
), hsched_deadline_compare
,
1946 sizeof (struct hio
), offsetof(struct hio
, io_deadline_node
));
1948 (void) snprintf(namebuf
, sizeof (namebuf
), "hsched_task_%d", fsid
);
1949 hqueue
->ra_task
= taskq_create(namebuf
, hsfs_taskq_nthreads
,
1950 minclsyspri
+ 2, 1, 104857600 / PAGESIZE
, TASKQ_DYNAMIC
);
1952 hqueue
->next
= NULL
;
1953 hqueue
->nbuf
= kmem_zalloc(sizeof (struct buf
), KM_SLEEP
);
1957 hsched_fini(struct hsfs_queue
*hqueue
)
1959 if (hqueue
!= NULL
) {
1961 * Remove the sentinel if there was one.
1963 if (hqueue
->next
!= NULL
) {
1964 avl_remove(&hqueue
->read_tree
, hqueue
->next
);
1965 kmem_cache_free(hio_cache
, hqueue
->next
);
1967 avl_destroy(&(hqueue
->read_tree
));
1968 avl_destroy(&(hqueue
->deadline_tree
));
1969 mutex_destroy(&(hqueue
->hsfs_queue_lock
));
1970 mutex_destroy(&(hqueue
->strategy_lock
));
1973 * If there are any existing readahead threads running
1974 * taskq_destroy will wait for them to finish.
1976 taskq_destroy(hqueue
->ra_task
);
1977 kmem_free(hqueue
->nbuf
, sizeof (struct buf
));
1982 * Determine if two I/O requests are adjacent to each other so
1983 * that they can coalesced.
1985 #define IS_ADJACENT(io, nio) \
1986 (((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
1987 (io)->bp->b_edev == (nio)->bp->b_edev)
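
/*
 * For example (hypothetical requests): an hio at io_lblkno 100 with
 * nblocks 16 and a following hio at io_lblkno 116 on the same b_edev
 * satisfy IS_ADJACENT and may be coalesced; one starting at io_lblkno
 * 120 would leave a 4-block gap and is not coalesced.
 */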
1990 * This performs the actual I/O scheduling logic. We use the Circular
1991 * Look algorithm here. Sort the I/O requests in ascending order of
1992 * logical block number and process them starting with the lowest
1993 * numbered block and progressing towards higher block numbers in the
1994 * queue. Once there are no more higher numbered blocks, start again
1995 * with the lowest one. This is good for CD/DVD as you keep moving
1996 * the head in one direction along the outward spiral track and avoid
1997 * too many seeks as much as possible. The re-ordering also allows
1998 * us to coalesce adjacent requests into one larger request.
1999 * This is thus essentially a 1-way Elevator with front merging.
2001 * In addition each read request here has a deadline and will be
2002 * processed out of turn if the deadline (500ms) expires.
2004 * This function is necessarily serialized via hqueue->strategy_lock.
2005 * This function sits just below hsfs_getapage and processes all read
2006 * requests originating from that function.
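
/*
 * For example (hypothetical queue contents): with requests queued at
 * logical blocks 40, 10, 25 and 70, and the previous pass having ended
 * at block 30, C-LOOK issues 40 and 70 next, then wraps to the lowest
 * remaining block and issues 10 and 25, unless one of them has been
 * waiting longer than HSFS_READ_DEADLINE, in which case that request
 * is handled out of turn.
 */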
2009 hsched_invoke_strategy(struct hsfs
*fsp
)
2011 struct hsfs_queue
*hqueue
;
2013 struct hio
*fio
, *nio
, *tio
, *prev
, *last
;
2014 size_t bsize
, soffset
, offset
, data
;
2015 int bioret
, bufcount
;
2020 hqueue
= fsp
->hqueue
;
2021 mutex_enter(&hqueue
->strategy_lock
);
2022 mutex_enter(&hqueue
->hsfs_queue_lock
);
2025 * Check for Deadline expiration first
2027 fio
= avl_first(&hqueue
->deadline_tree
);
2030 * Paranoid check for empty I/O queue. Both deadline
2031 * and read trees contain same data sorted in different
2032 * ways. So empty deadline tree = empty read tree.
2036 * Remove the sentinel if there was one.
2038 if (hqueue
->next
!= NULL
) {
2039 avl_remove(&hqueue
->read_tree
, hqueue
->next
);
2040 kmem_cache_free(hio_cache
, hqueue
->next
);
2041 hqueue
->next
= NULL
;
2043 mutex_exit(&hqueue
->hsfs_queue_lock
);
2044 mutex_exit(&hqueue
->strategy_lock
);
2048 if (drv_hztousec(ddi_get_lbolt()) - fio
->io_timestamp
2049 < HSFS_READ_DEADLINE
) {
2051 * Apply standard scheduling logic. This uses the
2052 * C-LOOK approach. Process I/O requests in ascending
2053 * order of logical block address till no subsequent
2054 * higher numbered block request remains. Then start
2055 * again from the lowest numbered block in the queue.
2057 * We do this cheaply here by means of a sentinel.
2058 * The last processed I/O structure from the previous
2059 * invocation of this func, is left dangling in the
2060 * read_tree so that we can easily scan to the next
2061 * higher numbered request and remove the sentinel.
2064 if (hqueue
->next
!= NULL
) {
2065 fio
= AVL_NEXT(&hqueue
->read_tree
, hqueue
->next
);
2066 avl_remove(&hqueue
->read_tree
, hqueue
->next
);
2067 kmem_cache_free(hio_cache
, hqueue
->next
);
2068 hqueue
->next
= NULL
;
2071 fio
= avl_first(&hqueue
->read_tree
);
2073 } else if (hqueue
->next
!= NULL
) {
2074 DTRACE_PROBE1(hsfs_deadline_expiry
, struct hio
*, fio
);
2076 avl_remove(&hqueue
->read_tree
, hqueue
->next
);
2077 kmem_cache_free(hio_cache
, hqueue
->next
);
2078 hqueue
->next
= NULL
;
2082 * In addition we try to coalesce contiguous
2083 * requests into one bigger request.
2086 bsize
= ldbtob(fio
->nblocks
);
2087 fvp
= fio
->bp
->b_file
;
2088 nio
= AVL_NEXT(&hqueue
->read_tree
, fio
);
2090 while (nio
!= NULL
&& IS_ADJACENT(tio
, nio
) &&
2091 bsize
< hqueue
->dev_maxtransfer
) {
2092 avl_remove(&hqueue
->deadline_tree
, tio
);
2093 avl_remove(&hqueue
->read_tree
, tio
);
2094 tio
->contig_chain
= nio
;
2095 bsize
+= ldbtob(nio
->nblocks
);
2100 * This check is required to detect the case where
2101 * we are merging adjacent buffers belonging to
2102 * different files. fvp is used to set the b_file
2103 * parameter in the coalesced buf. b_file is used
2104 * by DTrace so we do not want DTrace to accrue
2105 * requests to two different files to any one file.
2107 if (fvp
&& tio
->bp
->b_file
!= fvp
) {
2111 nio
= AVL_NEXT(&hqueue
->read_tree
, nio
);
2116 * tio is not removed from the read_tree as it serves as a sentinel
2117 * to cheaply allow us to scan to the next higher numbered I/O
2121 avl_remove(&hqueue
->deadline_tree
, tio
);
2122 mutex_exit(&hqueue
->hsfs_queue_lock
);
2123 DTRACE_PROBE3(hsfs_io_dequeued
, struct hio
*, fio
, int, bufcount
,
2127 * The benefit of coalescing occurs if the savings in I/O outweighs
2128 * the cost of doing the additional work below.
2129 * It was observed that coalescing 2 buffers results in diminishing
2130 * returns, so we do coalescing if we have >2 adjacent bufs.
2132 if (bufcount
> hsched_coalesce_min
) {
2134 * We have coalesced blocks. First allocate mem and buf for
2135 * the entire coalesced chunk.
2136 * Since we are guaranteed single-threaded here we pre-allocate
2137 * one buf at mount time and that is re-used every time. This
2138 * is a synthesized buf structure that uses kmem_alloced chunk.
2139 * Not quite a normal buf attached to pages.
2141 fsp
->coalesced_bytes
+= bsize
;
2142 nbuf
= hqueue
->nbuf
;
2144 nbuf
->b_edev
= fio
->bp
->b_edev
;
2145 nbuf
->b_dev
= fio
->bp
->b_dev
;
2146 nbuf
->b_flags
= fio
->bp
->b_flags
;
2147 nbuf
->b_iodone
= fio
->bp
->b_iodone
;
2148 iodata
= kmem_alloc(bsize
, KM_SLEEP
);
2149 nbuf
->b_un
.b_addr
= iodata
;
2150 nbuf
->b_lblkno
= fio
->bp
->b_lblkno
;
2153 nbuf
->b_bcount
= bsize
;
2154 nbuf
->b_bufsize
= bsize
;
2156 DTRACE_PROBE3(hsfs_coalesced_io_start
, struct hio
*, fio
, int,
2157 bufcount
, size_t, bsize
);
2160 * Perform I/O for the coalesced block.
2162 (void) bdev_strategy(nbuf
);
2165 * Duplicate the last IO node to leave the sentinel alone.
2166 * The sentinel is freed in the next invocation of this
2169 prev
->contig_chain
= kmem_cache_alloc(hio_cache
, KM_SLEEP
);
2170 prev
->contig_chain
->bp
= tio
->bp
;
2171 prev
->contig_chain
->sema
= tio
->sema
;
2172 tio
= prev
->contig_chain
;
2173 tio
->contig_chain
= NULL
;
2174 soffset
= ldbtob(fio
->bp
->b_lblkno
);
2177 bioret
= biowait(nbuf
);
2178 data
= bsize
- nbuf
->b_resid
;
2180 mutex_exit(&hqueue
->strategy_lock
);
2183 * We use the b_resid parameter to detect how much
2184 * data was successfully transferred. We will signal
2185 * a success to all the fully retrieved actual bufs
2186 * before coalescing, rest is signaled as error,
2190 DTRACE_PROBE3(hsfs_coalesced_io_done
, struct hio
*, nio
,
2191 int, bioret
, size_t, data
);
2194 * Copy data and signal success to all the bufs
2195 * which can be fully satisfied from b_resid.
2197 while (nio
!= NULL
&& data
>= nio
->bp
->b_bcount
) {
2198 offset
= ldbtob(nio
->bp
->b_lblkno
) - soffset
;
2199 bcopy(iodata
+ offset
, nio
->bp
->b_un
.b_addr
,
2201 data
-= nio
->bp
->b_bcount
;
2202 bioerror(nio
->bp
, 0);
2206 nio
= nio
->contig_chain
;
2207 kmem_cache_free(hio_cache
, tio
);
2211 * Signal error to all the leftover bufs (if any)
2212 * after b_resid data is exhausted.
2214 while (nio
!= NULL
) {
2215 nio
->bp
->b_resid
= nio
->bp
->b_bcount
- data
;
2216 bzero(nio
->bp
->b_un
.b_addr
+ data
, nio
->bp
->b_resid
);
2217 bioerror(nio
->bp
, bioret
);
2221 nio
= nio
->contig_chain
;
2222 kmem_cache_free(hio_cache
, tio
);
2225 kmem_free(iodata
, bsize
);
2229 io_done
= tio
->sema
;
2233 while (nio
!= NULL
) {
2234 (void) bdev_strategy(nio
->bp
);
2235 nio
= nio
->contig_chain
;
2238 mutex_exit(&hqueue
->strategy_lock
);
2240 while (nio
!= NULL
) {
2242 (void) biowait(nbuf
);
2245 /* sentinel last not freed. See above. */
2247 (void) biowait(nio
->bp
);
2251 nio
= nio
->contig_chain
;
2252 kmem_cache_free(hio_cache
, tio
);
2259 * Insert an I/O request in the I/O scheduler's pipeline
2260 * Using AVL tree makes it easy to reorder the I/O request
2261 * based on logical block number.
2264 hsched_enqueue_io(struct hsfs
*fsp
, struct hio
*hsio
, int ra
)
2266 struct hsfs_queue
*hqueue
= fsp
->hqueue
;
2268 mutex_enter(&hqueue
->hsfs_queue_lock
);
2270 fsp
->physical_read_bytes
+= hsio
->bp
->b_bcount
;
2272 fsp
->readahead_bytes
+= hsio
->bp
->b_bcount
;
2274 avl_add(&hqueue
->deadline_tree
, hsio
);
2275 avl_add(&hqueue
->read_tree
, hsio
);
2277 DTRACE_PROBE3(hsfs_io_enqueued
, struct hio
*, hsio
,
2278 struct hsfs_queue
*, hqueue
, int, ra
);
2280 mutex_exit(&hqueue
->hsfs_queue_lock
);
2285 hsfs_pathconf(struct vnode
*vp
, int cmd
, ulong_t
*valp
, struct cred
*cr
,
2286 caller_context_t
*ct
)
2295 fsp
= VFS_TO_HSFS(vp
->v_vfsp
);
2296 *valp
= fsp
->hsfs_namemax
;
2299 case _PC_FILESIZEBITS
:
2300 *valp
= 33; /* Without multi extent support: 4 GB - 2k */
2303 case _PC_TIMESTAMP_RESOLUTION
:
2305 * HSFS keeps, at best, 1/100 second timestamp resolution.
2311 error
= fs_pathconf(vp
, cmd
, valp
, cr
, ct
);
2318 const struct vnodeops hsfs_vnodeops
= {
2319 .vnop_name
= "hsfs",
2320 .vop_open
= hsfs_open
,
2321 .vop_close
= hsfs_close
,
2322 .vop_read
= hsfs_read
,
2323 .vop_getattr
= hsfs_getattr
,
2324 .vop_access
= hsfs_access
,
2325 .vop_lookup
= hsfs_lookup
,
2326 .vop_readdir
= hsfs_readdir
,
2327 .vop_readlink
= hsfs_readlink
,
2328 .vop_fsync
= hsfs_fsync
,
2329 .vop_inactive
= hsfs_inactive
,
2330 .vop_fid
= hsfs_fid
,
2331 .vop_seek
= hsfs_seek
,
2332 .vop_frlock
= hsfs_frlock
,
2333 .vop_getpage
= hsfs_getpage
,
2334 .vop_putpage
= hsfs_putpage
,
2335 .vop_map
= hsfs_map
,
2336 .vop_addmap
= hsfs_addmap
,
2337 .vop_delmap
= hsfs_delmap
,
2338 .vop_pathconf
= hsfs_pathconf
,