/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * Vnode operations for the High Sierra filesystem
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/errno.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>

#include <sys/sunldi.h>
#include <sys/sunddi.h>

/*
 * For struct modlinkage
 */
#include <sys/modctl.h>

#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
#include <sys/fs/hsfs_impl.h>
#include <sys/fs/hsfs_susp.h>
#include <sys/fs/hsfs_rrip.h>

#include <fs/fs_subr.h>
/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;
/*
 * This is the max number of taskq threads that will be created
 * if required. Since we are using a Dynamic TaskQ by default only
 * one thread is created initially.
 *
 * NOTE: In the usual hsfs use case this per fs instance number
 * of taskq threads should not place any undue load on a system.
 * Even on an unusual system with say 100 CDROM drives, 800 threads
 * will not be created unless all the drives are loaded and all
 * of them are saturated with I/O at the same time! If there is at
 * all a complaint of system load due to such an unusual case it
 * should be easy enough to change to one per-machine Dynamic TaskQ
 * for all hsfs mounts with a nthreads of say 32.
 */
static int hsfs_taskq_nthreads = 8;	/* # of taskq threads per fs */
/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;
/*
 * Kmem caches for heavily used small allocations. Using these kmem
 * caches provides a factor of 3 reduction in system time and greatly
 * aids overall throughput esp. on SPARC.
 */
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;
/*
 * This tunable allows us to ignore inode numbers from rrip-1.12.
 * In this case, we fall back to our default inode algorithm.
 */
extern int use_rrip_inodes;
/*
 * Free behind logic from UFS to tame our thirst for
 * the page cache.
 * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
 * information.
 */
static int	freebehind = 1;
static int	smallfile = 0;
static int	cache_read_ahead = 0;
static u_offset_t smallfile64 = 32 * 1024;
#define	SMALLFILE1_D 1000
#define	SMALLFILE2_D 10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0;	/* when to recompute */
static uint_t smallfile1_d = SMALLFILE1_D;
static uint_t smallfile2_d = SMALLFILE2_D;
static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
int hsched_invoke_strategy(struct hsfs *fsp);
hsfs_fsync(vnode_t *cp,
	caller_context_t *ct)

hsfs_read(struct vnode *vp,
	struct caller_context *ct)
	/*
	 * if vp is of type VDIR, make sure dirent
	 * is filled up with all info (because of ptbl)
	 */
	if (vp->v_type == VDIR) {
		if (hp->hs_dirent.ext_size == 0)
			hs_filldirent(vp, &hp->hs_dirent);
	}
	filesize = hp->hs_dirent.ext_size;

	if (uiop->uio_resid == 0 ||		/* No data wanted. */
	    uiop->uio_loffset > HS_MAXFILEOFF ||	/* Offset too big. */
	    uiop->uio_loffset >= filesize)	/* Past EOF. */
		return (0);
	/*
	 * We want to ask for only the "right" amount of data.
	 * In this case that means:-
	 *
	 * We can't get data from beyond our EOF. If asked,
	 * we will give a short read.
	 *
	 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
	 * These buffers are always MAXBSIZE aligned.
	 * If our starting offset is not MAXBSIZE aligned,
	 * we can only ask for less than MAXBSIZE bytes.
	 *
	 * If our requested offset and length are such that
	 * they belong in different MAXBSIZE aligned slots
	 * then we'll be making more than one call on
	 * segmap_getmapflt.
	 *
	 * This diagram shows the variables we use and their
	 * relationships.
	 *
	 * |<-----MAXBSIZE----->|
	 * +--------------------------...+
	 * |.....mapon->|<--n-->|....*...|EOF
	 * +--------------------------...+
	 * uio_loffset->|
	 * uio_resid....|<---------->|
	 * diff.........|<-------------->|
	 *
	 * So, in this case our offset is not aligned
	 * and our request takes us outside of the
	 * MAXBSIZE window. We will break this up into
	 * two segmap_getmapflt calls.
	 */
		mapon = uiop->uio_loffset & MAXBOFFSET;
		diff = filesize - uiop->uio_loffset;
		nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
		n = MIN(diff, nbytes);
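		/*
		 * Editor's note: a worked example of the window math above,
		 * with hypothetical numbers (not taken from the original
		 * source).  Assume MAXBSIZE = 8192, uio_loffset = 6000,
		 * uio_resid = 4096 and a large file:
		 *
		 *	mapon  = 6000 & MAXBOFFSET = 6000
		 *	nbytes = MIN(8192 - 6000, 4096) = 2192
		 *	n      = MIN(diff, 2192) = 2192
		 *
		 * Only 2192 bytes fit in the current MAXBSIZE window; the
		 * remaining 1904 bytes of the request are picked up by the
		 * next pass of the surrounding do/while loop.
		 */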
		/* EOF or request satisfied. */
		/*
		 * Freebehind computation taken from:
		 * usr/src/uts/common/fs/ufs/ufs_vnops.c
		 */
		if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
			uint64_t percpufreeb;
			if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
			if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
			percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
			smallfile1 = percpufreeb / smallfile1_d;
			smallfile2 = percpufreeb / smallfile2_d;
			smallfile1 = MAX(smallfile1, smallfile);
			smallfile1 = MAX(smallfile1, smallfile64);
			smallfile2 = MAX(smallfile1, smallfile2);
			smallfile_update = drv_hztousec(ddi_get_lbolt())
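			/*
			 * Editor's note: a hypothetical illustration of the
			 * thresholds computed above (numbers are made up, not
			 * from the original source).  On a machine with 8 GB
			 * of freemem and 8 online CPUs, percpufreeb is 1 GB,
			 * so smallfile1 is about 1 MB and smallfile2 about
			 * 100 MB.  The free-behind logic further down uses
			 * these thresholds; for example, SM_DONTNEED is only
			 * added once uio_loffset has passed smallfile2.
			 */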
267 dofree
= freebehind
&&
268 hp
->hs_prev_offset
== uiop
->uio_loffset
&&
		base = segmap_getmapflt(segkmap, vp,
		    (u_offset_t)uiop->uio_loffset, n, 1, S_READ);

		error = uiomove(base + mapon, n, UIO_READ, uiop);

		/*
		 * if read a whole block, or read to eof,
		 * won't need this buffer again soon.
		 */
		if (n + mapon == MAXBSIZE ||
		    uiop->uio_loffset == filesize)
			flags = SM_DONTNEED;
		else
			flags = 0;

		if (dofree) {
			flags = SM_FREE | SM_ASYNC;
			if ((cache_read_ahead == 0) &&
			    uiop->uio_loffset > smallfile2)
				flags |= SM_DONTNEED;
		}

		if (error == 0)
			error = segmap_release(segkmap, base, flags);
		else
			(void) segmap_release(segkmap, base, 0);
	} while (error == 0 && uiop->uio_resid > 0);
	caller_context_t *ct)

	fsp = VFS_TO_HSFS(vp->v_vfsp);

	if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
		hs_filldirent(vp, &hp->hs_dirent);
	}
	vap->va_type = IFTOVT(hp->hs_dirent.mode);
	vap->va_mode = hp->hs_dirent.mode;
	vap->va_uid = hp->hs_dirent.uid;
	vap->va_gid = hp->hs_dirent.gid;

	vap->va_fsid = vfsp->vfs_dev;
	vap->va_nodeid = (ino64_t)hp->hs_nodeid;
	vap->va_nlink = hp->hs_dirent.nlink;
	vap->va_size = (offset_t)hp->hs_dirent.ext_size;

	vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
	vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
	vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
	vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
	vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
	vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
	if (vp->v_type == VCHR || vp->v_type == VBLK)
		vap->va_rdev = hp->hs_dirent.r_dev;

	vap->va_blksize = vfsp->vfs_bsize;
	/* no. of blocks = no. of data blocks + no. of xar blocks */
	vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
	    (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
	vap->va_seq = hp->hs_seq;
hsfs_readlink(struct vnode *vp,
	caller_context_t *ct)

	if (vp->v_type != VLNK)
		return (EINVAL);

	if (hp->hs_dirent.sym_link == (char *)NULL)
		return (ENOENT);

	return (uiomove(hp->hs_dirent.sym_link,
	    (size_t)MIN(hp->hs_dirent.ext_size,
	    uiop->uio_resid), UIO_READ, uiop));
hsfs_inactive(struct vnode *vp,
	caller_context_t *ct)

	fsp = VFS_TO_HSFS(vp->v_vfsp);
	/*
	 * Note: acquiring and holding v_lock for quite a while
	 * here serializes on the vnode; this is unfortunate, but
	 * likely not to overly impact performance, as the underlying
	 * device (CDROM drive) is quite slow.
	 */
	rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
	mutex_enter(&hp->hs_contents_lock);
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 1) {
		panic("hsfs_inactive: v_count < 1");
	}

	if (vp->v_count > 1 || (hp->hs_flags & HREF) == 0) {
		vp->v_count--;	/* release hold from vn_rele */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		rw_exit(&fsp->hsfs_hash_lock);
		return;
	}
	vp->v_count--;	/* release hold from vn_rele */
	if (vp->v_count == 0) {
		/*
		 * If there are no pages associated with the
		 * hsnode, give it back to the kmem_cache,
		 * else put at the end of this file system's
		 * internal free list.
		 */
		nopage = !vn_has_cached_data(vp);
		/*
		 * exit these locks now, since hs_freenode may
		 * kmem_free the hsnode and embedded vnode
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		hs_freenode(vp, fsp, nopage);
	} else {
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
	}
	rw_exit(&fsp->hsfs_hash_lock);
439 struct pathname
*pnp
,
443 caller_context_t
*ct
,
448 int namelen
= (int)strlen(nm
);
457 * If we're looking for ourself, life is simple.
459 if (namelen
== 1 && *nm
== '.') {
460 if (error
= hs_access(dvp
, (mode_t
)VEXEC
, cred
))
467 return (hs_dirlook(dvp
, nm
, namelen
, vpp
, cred
));
478 caller_context_t
*ct
,
483 struct hs_direntry hd
;
486 uint_t offset
; /* real offset in directory */
487 uint_t dirsiz
; /* real size of directory */
489 int hdlen
; /* length of hs directory entry */
490 long ndlen
; /* length of dirent entry */
492 size_t bufsize
; /* size of dirent buffer */
493 char *outbuf
; /* ptr to dirent buffer */
498 uint_t last_offset
; /* last index into current dir block */
499 ino64_t dirino
; /* temporary storage before storing in dirent */
503 fsp
= VFS_TO_HSFS(vp
->v_vfsp
);
504 if (dhp
->hs_dirent
.ext_size
== 0)
505 hs_filldirent(vp
, &dhp
->hs_dirent
);
506 dirsiz
= dhp
->hs_dirent
.ext_size
;
507 if (uiop
->uio_loffset
>= dirsiz
) { /* at or beyond EOF */
512 ASSERT(uiop
->uio_loffset
<= HS_MAXFILEOFF
);
513 offset
= uiop
->uio_loffset
;
515 dname_size
= fsp
->hsfs_namemax
+ 1; /* 1 for the ending NUL */
516 dname
= kmem_alloc(dname_size
, KM_SLEEP
);
517 bufsize
= uiop
->uio_resid
+ sizeof (struct dirent64
);
519 outbuf
= kmem_alloc(bufsize
, KM_SLEEP
);
520 nd
= (struct dirent64
*)outbuf
;
522 while (offset
< dirsiz
) {
523 bytes_wanted
= MIN(MAXBSIZE
, dirsiz
- (offset
& MAXBMASK
));
525 error
= fbread(vp
, (offset_t
)(offset
& MAXBMASK
),
526 (unsigned int)bytes_wanted
, S_READ
, &fbp
);
530 blkp
= (uchar_t
*)fbp
->fb_addr
;
531 last_offset
= (offset
& MAXBMASK
) + fbp
->fb_count
;
533 #define rel_offset(offset) ((offset) & MAXBOFFSET) /* index into blkp */
535 while (offset
< last_offset
) {
537 * Very similar validation code is found in
538 * process_dirblock(), hsfs_node.c.
539 * For an explanation, see there.
540 * It may make sense for the future to
541 * "consolidate" the code in hs_parsedir(),
542 * process_dirblock() and hsfs_readdir() into
543 * a single utility function.
545 hdlen
= (int)((uchar_t
)
546 HDE_DIR_LEN(&blkp
[rel_offset(offset
)]));
547 if (hdlen
< HDE_ROOT_DIR_REC_SIZE
||
548 offset
+ hdlen
> last_offset
) {
550 * advance to next sector boundary
552 offset
= roundup(offset
+ 1, HS_SECTOR_SIZE
);
554 hs_log_bogus_disk_warning(fsp
,
555 HSFS_ERR_TRAILING_JUNK
, 0);
560 bzero(&hd
, sizeof (hd
));
563 * Just ignore invalid directory entries.
564 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
566 if (!hs_parsedir(fsp
, &blkp
[rel_offset(offset
)],
567 &hd
, dname
, &dnamelen
, last_offset
- offset
)) {
569 * Determine if there is enough room
571 ndlen
= (long)DIRENT64_RECLEN((dnamelen
));
573 if ((ndlen
+ ((char *)nd
- outbuf
)) >
575 fbrelse(fbp
, S_READ
);
576 goto done
; /* output buffer full */
579 diroff
= offset
+ hdlen
;
581 * If the media carries rrip-v1.12 or newer,
582 * and we trust the inodes from the rrip data
583 * (use_rrip_inodes != 0), use that data. If the
584 * media has been created by a recent mkisofs
585 * version, we may trust all numbers in the
586 * starting extent number; otherwise, we cannot
587 * do this for zero sized files and symlinks,
588 * because if we did we'd end up mapping all of
589 * them to the same node. We use HS_DUMMY_INO
590 * in this case and make sure that we will not
591 * map all files to the same meta data.
593 if (hd
.inode
!= 0 && use_rrip_inodes
) {
595 } else if ((hd
.ext_size
== 0 ||
596 hd
.sym_link
!= (char *)NULL
) &&
597 (fsp
->hsfs_flags
& HSFSMNT_INODE
) == 0) {
598 dirino
= HS_DUMMY_INO
;
603 /* strncpy(9f) will zero uninitialized bytes */
605 ASSERT(strlen(dname
) + 1 <=
606 DIRENT64_NAMELEN(ndlen
));
607 (void) strncpy(nd
->d_name
, dname
,
608 DIRENT64_NAMELEN(ndlen
));
609 nd
->d_reclen
= (ushort_t
)ndlen
;
610 nd
->d_off
= (offset_t
)diroff
;
612 nd
= (struct dirent64
*)((char *)nd
+ ndlen
);
615 * free up space allocated for symlink
617 if (hd
.sym_link
!= (char *)NULL
) {
618 kmem_free(hd
.sym_link
,
619 (size_t)(hd
.ext_size
+1));
620 hd
.sym_link
= (char *)NULL
;
625 fbrelse(fbp
, S_READ
);
	/*
	 * Got here for one of the following reasons:
	 * 1) outbuf is full (error == 0)
	 * 2) end of directory reached (error == 0)
	 * 3) error reading directory sector (error != 0)
	 * 4) directory entry crosses sector boundary (error == 0)
	 *
	 * If any directory entries have been copied, don't report
	 * case 4. Instead, return the valid directory entries.
	 *
	 * If no entries have been copied, report the error.
	 * If case 4, this will be indistinguishable from EOF.
	 */
642 ndlen
= ((char *)nd
- outbuf
);
644 error
= uiomove(outbuf
, (size_t)ndlen
, UIO_READ
, uiop
);
645 uiop
->uio_loffset
= offset
;
647 kmem_free(dname
, dname_size
);
648 kmem_free(outbuf
, bufsize
);
649 if (eofp
&& error
== 0)
650 *eofp
= (uiop
->uio_loffset
>= dirsiz
);
hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)

	if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
		fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
		return (ENOSPC);
	}

	fid = (struct hsfid *)fidp;
	fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
	mutex_enter(&hp->hs_contents_lock);
	fid->hf_dir_lbn = hp->hs_dir_lbn;
	fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
	fid->hf_ino = hp->hs_nodeid;
	mutex_exit(&hp->hs_contents_lock);
	return (0);
679 hsfs_open(struct vnode
**vpp
,
682 caller_context_t
*ct
)
695 caller_context_t
*ct
)
697 (void) cleanlocks(vp
, ttoproc(curthread
)->p_pid
, 0);
698 cleanshares(vp
, ttoproc(curthread
)->p_pid
);
704 hsfs_access(struct vnode
*vp
,
708 caller_context_t
*ct
)
710 return (hs_access(vp
, (mode_t
)mode
, cred
));
/*
 * the seek time of a CD-ROM is very slow, and data transfer
 * rate is even worse (max. 150K per sec). The design
 * decision is to reduce access to cd-rom as much as possible,
 * and to transfer a sizable block (read-ahead) of data at a time.
 * UFS style of read ahead one block at a time is not appropriate,
 * and is not supported
 */

/*
 * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)
/* we don't support read ahead */
int hsfs_lostpage;	/* no. of times we lost original page */
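/*
 * Editor's note: a quick sanity check of the constant above, using a
 * hypothetical 8K PAGESIZE: 56 * 1024 = 57344 bytes is exactly 7 pages, so
 * KLUSTSIZE is a whole multiple of PAGESIZE and satisfies the <= MAXPHYS
 * requirement for the common MAXPHYS settings (56K and larger).
 */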
/*
 * Used to prevent biodone() from releasing buf resources that
 * we didn't allocate in quite the usual way.
 */
hsfs_iodone(struct buf *bp)
/*
 * The taskq thread that invokes the scheduling function to ensure
 * that all readaheads are complete and cleans up the associated
 * memory and releases the page lock.
 */
hsfs_ra_task(void *arg)
749 struct hio_info
*info
= arg
;
753 ASSERT(info
->pp
!= NULL
);
755 for (count
= 0; count
< info
->bufsused
; count
++) {
756 wbuf
= &(info
->bufs
[count
]);
758 DTRACE_PROBE1(hsfs_io_wait_ra
, struct buf
*, wbuf
);
759 while (sema_tryp(&(info
->sema
[count
])) == 0) {
760 if (hsched_invoke_strategy(info
->fsp
)) {
761 sema_p(&(info
->sema
[count
]));
765 sema_destroy(&(info
->sema
[count
]));
766 DTRACE_PROBE1(hsfs_io_done_ra
, struct buf
*, wbuf
);
767 biofini(&(info
->bufs
[count
]));
769 for (count
= 0; count
< info
->bufsused
; count
++) {
770 if (info
->vas
[count
] != NULL
) {
771 ppmapout(info
->vas
[count
]);
774 kmem_free(info
->vas
, info
->bufcnt
* sizeof (caddr_t
));
775 kmem_free(info
->bufs
, info
->bufcnt
* sizeof (struct buf
));
776 kmem_free(info
->sema
, info
->bufcnt
* sizeof (ksema_t
));
778 pvn_read_done(info
->pp
, 0);
779 kmem_cache_free(hio_info_cache
, info
);
/*
 * Submit asynchronous readahead requests to the I/O scheduler
 * depending on the number of pages to read ahead. These requests
 * are asynchronous to the calling thread but I/O requests issued
 * subsequently by other threads with higher LBNs must wait for
 * these readaheads to complete since we have a single ordered
 * I/O pipeline. Thus these readaheads are semi-asynchronous.
 * A TaskQ handles waiting for the readaheads to complete.
 *
 * This function is mostly a copy of hsfs_getapage but somewhat
 * simpler. A readahead request is aborted if page allocation fails.
 */
)
812 struct page
*pp
, *searchp
, *lastp
;
816 uint_t io_off
, io_len
;
824 uint_t which_chunk_lbn
;
827 offset_t offset_bytes
;
828 uint_t remaining_bytes
;
830 int remainder
; /* must be signed */
831 diskaddr_t driver_block
;
832 u_offset_t io_off_tmp
;
834 struct hio_info
*info
;
837 ASSERT(fsp
->hqueue
!= NULL
);
839 if (addr
>= seg
->s_base
+ seg
->s_size
) {
843 devvp
= fsp
->hsfs_devvp
;
844 secsize
= fsp
->hsfs_vol
.lbn_size
; /* bytes per logical block */
847 filsiz
= hp
->hs_dirent
.ext_size
;
855 extension
+= hp
->hs_ra_bytes
;
	/*
	 * Some CD writers (e.g. Kodak Photo CD writers)
	 * create CDs in TAO mode and reserve tracks that
	 * are not completely written. Some sectors remain
	 * unreadable for this reason and give I/O errors.
	 * Also, there's no point in reading sectors
	 * we'll never look at. So, if we're asked to go
	 * beyond the end of a file, truncate to the length
	 * of that file.
	 *
	 * Additionally, this behaviour is required by section
	 * 6.4.5 of ISO 9660:1988(E).
	 */
870 len
= MIN(extension
? extension
: PAGESIZE
, filsiz
- off
);
872 /* A little paranoia */
877 * After all that, make sure we're asking for things in units
878 * that bdev_strategy() will understand (see bug 4202551).
880 len
= roundup(len
, DEV_BSIZE
);
882 pp
= pvn_read_kluster(vp
, off
, seg
, addr
, &io_off_tmp
,
883 &io_len_tmp
, off
, len
, 1);
886 hp
->hs_num_contig
= 0;
888 hp
->hs_prev_offset
= 0;
892 io_off
= (uint_t
)io_off_tmp
;
893 io_len
= (uint_t
)io_len_tmp
;
895 /* check for truncation */
897 * xxx Clean up and return EIO instead?
898 * xxx Ought to go to u_offset_t for everything, but we
899 * xxx call lots of things that want uint_t arguments.
901 ASSERT(io_off
== io_off_tmp
);
904 * get enough buffers for worst-case scenario
905 * (i.e., no coalescing possible).
907 bufcnt
= (len
+ secsize
- 1) / secsize
;
908 bufs
= kmem_alloc(bufcnt
* sizeof (struct buf
), KM_SLEEP
);
909 vas
= kmem_alloc(bufcnt
* sizeof (caddr_t
), KM_SLEEP
);
	 * Allocate an array of semaphores since we are doing I/O
915 fio_done
= kmem_alloc(bufcnt
* sizeof (ksema_t
), KM_SLEEP
);
918 * If our filesize is not an integer multiple of PAGESIZE,
919 * we zero that part of the last page that's between EOF and
920 * the PAGESIZE boundary.
922 xlen
= io_len
& PAGEOFFSET
;
924 pagezero(pp
->p_prev
, xlen
, PAGESIZE
- xlen
);
926 DTRACE_PROBE2(hsfs_readahead
, struct vnode
*, vp
, uint_t
, io_len
);
931 io_end
= io_off
+ io_len
;
932 for (count
= 0, byte_offset
= io_off
;
933 byte_offset
< io_end
;
935 ASSERT(count
< bufcnt
);
937 bioinit(&bufs
[count
]);
938 bufs
[count
].b_edev
= devvp
->v_rdev
;
939 bufs
[count
].b_dev
= cmpdev(devvp
->v_rdev
);
940 bufs
[count
].b_flags
= B_NOCACHE
|B_BUSY
|B_READ
;
941 bufs
[count
].b_iodone
= hsfs_iodone
;
942 bufs
[count
].b_vp
= vp
;
943 bufs
[count
].b_file
= vp
;
945 /* Compute disk address for interleaving. */
947 /* considered without skips */
948 which_chunk_lbn
= byte_offset
/ chunk_data_bytes
;
950 /* factor in skips */
951 offset_lbn
= which_chunk_lbn
* chunk_lbn_count
;
953 /* convert to physical byte offset for lbn */
954 offset_bytes
= LBN_TO_BYTE(offset_lbn
, vp
->v_vfsp
);
956 /* don't forget offset into lbn */
957 offset_extra
= byte_offset
% chunk_data_bytes
;
959 /* get virtual block number for driver */
960 driver_block
= lbtodb(bof
+ xarsiz
961 + offset_bytes
+ offset_extra
);
963 if (lastp
!= searchp
) {
964 /* this branch taken first time through loop */
965 va
= vas
[count
] = ppmapin(searchp
, PROT_WRITE
,
967 /* ppmapin() guarantees not to return NULL */
972 bufs
[count
].b_un
.b_addr
= va
+ byte_offset
% PAGESIZE
;
973 bufs
[count
].b_offset
=
974 (offset_t
)(byte_offset
- io_off
+ off
);
977 * We specifically use the b_lblkno member here
978 * as even in the 32 bit world driver_block can
979 * get very large in line with the ISO9660 spec.
982 bufs
[count
].b_lblkno
= driver_block
;
984 remaining_bytes
= ((which_chunk_lbn
+ 1) * chunk_data_bytes
)
988 * remaining_bytes can't be zero, as we derived
989 * which_chunk_lbn directly from byte_offset.
991 if ((remaining_bytes
+ byte_offset
) < (off
+ len
)) {
992 /* coalesce-read the rest of the chunk */
993 bufs
[count
].b_bcount
= remaining_bytes
;
995 /* get the final bits */
996 bufs
[count
].b_bcount
= off
+ len
- byte_offset
;
999 remainder
= PAGESIZE
- (byte_offset
% PAGESIZE
);
1000 if (bufs
[count
].b_bcount
> remainder
) {
1001 bufs
[count
].b_bcount
= remainder
;
1004 bufs
[count
].b_bufsize
= bufs
[count
].b_bcount
;
1005 if (((offset_t
)byte_offset
+ bufs
[count
].b_bcount
) >
1009 byte_offset
+= bufs
[count
].b_bcount
;
1012 * We are scheduling I/O so we need to enqueue
1013 * requests rather than calling bdev_strategy
1014 * here. A later invocation of the scheduling
1015 * function will take care of doing the actual
1016 * I/O as it selects requests from the queue as
1017 * per the scheduling logic.
1019 struct hio
*hsio
= kmem_cache_alloc(hio_cache
,
1022 sema_init(&fio_done
[count
], 0, NULL
,
1023 SEMA_DEFAULT
, NULL
);
1024 hsio
->bp
= &bufs
[count
];
1025 hsio
->sema
= &fio_done
[count
];
1026 hsio
->io_lblkno
= bufs
[count
].b_lblkno
;
1027 hsio
->nblocks
= howmany(hsio
->bp
->b_bcount
,
1030 /* used for deadline */
1031 hsio
->io_timestamp
= drv_hztousec(ddi_get_lbolt());
1033 /* for I/O coalescing */
1034 hsio
->contig_chain
= NULL
;
1035 hsched_enqueue_io(fsp
, hsio
, 1);
1037 lwp_stat_update(LWP_STAT_INBLK
, 1);
1039 if ((remainder
- bufs
[count
].b_bcount
) < 1) {
1040 searchp
= searchp
->p_next
;
1045 info
= kmem_cache_alloc(hio_info_cache
, KM_SLEEP
);
1048 info
->sema
= fio_done
;
1049 info
->bufsused
= bufsused
;
1050 info
->bufcnt
= bufcnt
;
1054 (void) taskq_dispatch(fsp
->hqueue
->ra_task
,
1055 hsfs_ra_task
, info
, KM_SLEEP
);
1057 * The I/O locked pages are unlocked in our taskq thread.
/*
 * Each file may have a different interleaving on disk. This makes
 * things somewhat interesting. The gist is that there are some
 * number of contiguous data sectors, followed by some other number
 * of contiguous skip sectors. The sum of those two sets of sectors
 * defines the interleave size. Unfortunately, it means that we generally
 * can't simply read N sectors starting at a given offset to satisfy
 * any given request.
 *
 * What we do is get the relevant memory pages via pvn_read_kluster(),
 * then stride through the interleaves, setting up a buf for each
 * sector that needs to be brought in. Instead of kmem_alloc'ing
 * space for the sectors, though, we just point at the appropriate
 * spot in the relevant page for each of them. This saves us a bunch
 * of copying.
 *
 * NOTICE: The code below in hsfs_getapage is mostly the same as the code
 * in hsfs_getpage_ra above (with some omissions). If you are
 * making any change to this function, please also look at
 * hsfs_getpage_ra.
 */
*pp
, *searchp
, *lastp
;
1106 struct vnode
*devvp
;
1107 ulong_t byte_offset
;
1109 uint_t io_off
, io_len
;
1117 uint_t which_chunk_lbn
;
1119 uint_t offset_extra
;
1120 offset_t offset_bytes
;
1121 uint_t remaining_bytes
;
1123 int remainder
; /* must be signed */
1124 int chunk_lbn_count
;
1125 int chunk_data_bytes
;
1127 diskaddr_t driver_block
;
1128 u_offset_t io_off_tmp
;
1133 * We don't support asynchronous operation at the moment, so
1134 * just pretend we did it. If the pages are ever actually
1135 * needed, they'll get brought in then.
1141 fsp
= VFS_TO_HSFS(vp
->v_vfsp
);
1142 devvp
= fsp
->hsfs_devvp
;
1143 secsize
= fsp
->hsfs_vol
.lbn_size
; /* bytes per logical block */
1145 /* file data size */
1146 filsiz
= hp
->hs_dirent
.ext_size
;
1148 /* disk addr for start of file */
1149 bof
= LBN_TO_BYTE((offset_t
)hp
->hs_dirent
.ext_lbn
, vp
->v_vfsp
);
1151 /* xarsiz byte must be skipped for data */
1152 xarsiz
= hp
->hs_dirent
.xar_len
<< fsp
->hsfs_vol
.lbn_shift
;
1154 /* how many logical blocks in an interleave (data+skip) */
1155 chunk_lbn_count
= hp
->hs_dirent
.intlf_sz
+ hp
->hs_dirent
.intlf_sk
;
1157 if (chunk_lbn_count
== 0) {
1158 chunk_lbn_count
= 1;
1162 * Convert interleaving size into bytes. The zero case
1163 * (no interleaving) optimization is handled as a side-
1164 * effect of the read-ahead logic.
1166 if (hp
->hs_dirent
.intlf_sz
== 0) {
1167 chunk_data_bytes
= LBN_TO_BYTE(1, vp
->v_vfsp
);
1169 * Optimization: If our pagesize is a multiple of LBN
1170 * bytes, we can avoid breaking up a page into individual
1171 * lbn-sized requests.
1173 if (PAGESIZE
% chunk_data_bytes
== 0) {
1174 chunk_lbn_count
= BYTE_TO_LBN(PAGESIZE
, vp
->v_vfsp
);
1175 chunk_data_bytes
= PAGESIZE
;
1179 LBN_TO_BYTE(hp
->hs_dirent
.intlf_sz
, vp
->v_vfsp
);
1188 * Do some read-ahead. This mostly saves us a bit of
1189 * system cpu time more than anything else when doing
1190 * sequential reads. At some point, could do the
1191 * read-ahead asynchronously which might gain us something
1192 * on wall time, but it seems unlikely....
1194 * We do the easy case here, which is to read through
1195 * the end of the chunk, minus whatever's at the end that
1196 * won't exactly fill a page.
1198 if (hp
->hs_ra_bytes
> 0 && chunk_data_bytes
!= PAGESIZE
) {
1199 which_chunk_lbn
= (off
+ len
) / chunk_data_bytes
;
1200 extension
= ((which_chunk_lbn
+ 1) * chunk_data_bytes
) - off
;
1201 extension
-= (extension
% PAGESIZE
);
1203 extension
= roundup(len
, PAGESIZE
);
1206 atomic_inc_64(&fsp
->total_pages_requested
);
1210 /* search for page in buffer */
1211 if ((pagefound
= page_exists(vp
, off
)) == 0) {
1213 * Need to really do disk IO to get the page.
1216 extension
+= hp
->hs_ra_bytes
;
1219 * Some cd writers don't write sectors that aren't
1220 * used. Also, there's no point in reading sectors
1221 * we'll never look at. So, if we're asked to go
1222 * beyond the end of a file, truncate to the length
1225 * Additionally, this behaviour is required by section
1226 * 6.4.5 of ISO 9660:1988(E).
1228 len
= MIN(extension
? extension
: PAGESIZE
,
1231 /* A little paranoia. */
1235 * After all that, make sure we're asking for things
1236 * in units that bdev_strategy() will understand
1237 * (see bug 4202551).
1239 len
= roundup(len
, DEV_BSIZE
);
1243 pp
= pvn_read_kluster(vp
, off
, seg
, addr
, &io_off_tmp
,
1244 &io_len_tmp
, off
, len
, 0);
1248 * Pressure on memory, roll back readahead
1250 hp
->hs_num_contig
= 0;
1251 hp
->hs_ra_bytes
= 0;
1252 hp
->hs_prev_offset
= 0;
1256 io_off
= (uint_t
)io_off_tmp
;
1257 io_len
= (uint_t
)io_len_tmp
;
1259 /* check for truncation */
1261 * xxx Clean up and return EIO instead?
1262 * xxx Ought to go to u_offset_t for everything, but we
1263 * xxx call lots of things that want uint_t arguments.
1265 ASSERT(io_off
== io_off_tmp
);
1268 * get enough buffers for worst-case scenario
1269 * (i.e., no coalescing possible).
1271 bufcnt
= (len
+ secsize
- 1) / secsize
;
1272 bufs
= kmem_zalloc(bufcnt
* sizeof (struct buf
), KM_SLEEP
);
1273 vas
= kmem_alloc(bufcnt
* sizeof (caddr_t
), KM_SLEEP
);
		 * Allocate an array of semaphores if we are doing I/O
1279 if (fsp
->hqueue
!= NULL
)
1280 fio_done
= kmem_alloc(bufcnt
* sizeof (ksema_t
),
1282 for (count
= 0; count
< bufcnt
; count
++) {
1283 bioinit(&bufs
[count
]);
1284 bufs
[count
].b_edev
= devvp
->v_rdev
;
1285 bufs
[count
].b_dev
= cmpdev(devvp
->v_rdev
);
1286 bufs
[count
].b_flags
= B_NOCACHE
|B_BUSY
|B_READ
;
1287 bufs
[count
].b_iodone
= hsfs_iodone
;
1288 bufs
[count
].b_vp
= vp
;
1289 bufs
[count
].b_file
= vp
;
1293 * If our filesize is not an integer multiple of PAGESIZE,
1294 * we zero that part of the last page that's between EOF and
1295 * the PAGESIZE boundary.
1297 xlen
= io_len
& PAGEOFFSET
;
1299 pagezero(pp
->p_prev
, xlen
, PAGESIZE
- xlen
);
1304 io_end
= io_off
+ io_len
;
1305 for (count
= 0, byte_offset
= io_off
;
1306 byte_offset
< io_end
; count
++) {
1307 ASSERT(count
< bufcnt
);
1309 /* Compute disk address for interleaving. */
1311 /* considered without skips */
1312 which_chunk_lbn
= byte_offset
/ chunk_data_bytes
;
1314 /* factor in skips */
1315 offset_lbn
= which_chunk_lbn
* chunk_lbn_count
;
1317 /* convert to physical byte offset for lbn */
1318 offset_bytes
= LBN_TO_BYTE(offset_lbn
, vp
->v_vfsp
);
1320 /* don't forget offset into lbn */
1321 offset_extra
= byte_offset
% chunk_data_bytes
;
1323 /* get virtual block number for driver */
1325 lbtodb(bof
+ xarsiz
+ offset_bytes
+ offset_extra
);
1327 if (lastp
!= searchp
) {
1328 /* this branch taken first time through loop */
1330 ppmapin(searchp
, PROT_WRITE
, (caddr_t
)-1);
1331 /* ppmapin() guarantees not to return NULL */
1336 bufs
[count
].b_un
.b_addr
= va
+ byte_offset
% PAGESIZE
;
1337 bufs
[count
].b_offset
=
1338 (offset_t
)(byte_offset
- io_off
+ off
);
1341 * We specifically use the b_lblkno member here
1342 * as even in the 32 bit world driver_block can
1343 * get very large in line with the ISO9660 spec.
1346 bufs
[count
].b_lblkno
= driver_block
;
1349 ((which_chunk_lbn
+ 1) * chunk_data_bytes
)
1353 * remaining_bytes can't be zero, as we derived
1354 * which_chunk_lbn directly from byte_offset.
1356 if ((remaining_bytes
+ byte_offset
) < (off
+ len
)) {
1357 /* coalesce-read the rest of the chunk */
1358 bufs
[count
].b_bcount
= remaining_bytes
;
1360 /* get the final bits */
1361 bufs
[count
].b_bcount
= off
+ len
- byte_offset
;
1365 * It would be nice to do multiple pages'
1366 * worth at once here when the opportunity
1367 * arises, as that has been shown to improve
1368 * our wall time. However, to do that
1369 * requires that we use the pageio subsystem,
1370 * which doesn't mix well with what we're
1371 * already using here. We can't use pageio
1372 * all the time, because that subsystem
1373 * assumes that a page is stored in N
1374 * contiguous blocks on the device.
1375 * Interleaving violates that assumption.
1377 * Update: This is now not so big a problem
1378 * because of the I/O scheduler sitting below
1379 * that can re-order and coalesce I/O requests.
1382 remainder
= PAGESIZE
- (byte_offset
% PAGESIZE
);
1383 if (bufs
[count
].b_bcount
> remainder
) {
1384 bufs
[count
].b_bcount
= remainder
;
1387 bufs
[count
].b_bufsize
= bufs
[count
].b_bcount
;
1388 if (((offset_t
)byte_offset
+ bufs
[count
].b_bcount
) >
1392 byte_offset
+= bufs
[count
].b_bcount
;
1394 if (fsp
->hqueue
== NULL
) {
1395 (void) bdev_strategy(&bufs
[count
]);
1399 * We are scheduling I/O so we need to enqueue
1400 * requests rather than calling bdev_strategy
1401 * here. A later invocation of the scheduling
1402 * function will take care of doing the actual
1403 * I/O as it selects requests from the queue as
1404 * per the scheduling logic.
1406 struct hio
*hsio
= kmem_cache_alloc(hio_cache
,
1409 sema_init(&fio_done
[count
], 0, NULL
,
1410 SEMA_DEFAULT
, NULL
);
1411 hsio
->bp
= &bufs
[count
];
1412 hsio
->sema
= &fio_done
[count
];
1413 hsio
->io_lblkno
= bufs
[count
].b_lblkno
;
1414 hsio
->nblocks
= howmany(hsio
->bp
->b_bcount
,
1417 /* used for deadline */
1418 hsio
->io_timestamp
=
1419 drv_hztousec(ddi_get_lbolt());
1421 /* for I/O coalescing */
1422 hsio
->contig_chain
= NULL
;
1423 hsched_enqueue_io(fsp
, hsio
, 0);
1426 lwp_stat_update(LWP_STAT_INBLK
, 1);
1428 if ((remainder
- bufs
[count
].b_bcount
) < 1) {
1429 searchp
= searchp
->p_next
;
1434 /* Now wait for everything to come in */
1435 if (fsp
->hqueue
== NULL
) {
1436 for (count
= 0; count
< bufsused
; count
++) {
1438 err
= biowait(&bufs
[count
]);
1440 (void) biowait(&bufs
[count
]);
1443 for (count
= 0; count
< bufsused
; count
++) {
1447 * Invoke scheduling function till our buf
1448 * is processed. In doing this it might
1449 * process bufs enqueued by other threads
1452 wbuf
= &bufs
[count
];
1453 DTRACE_PROBE1(hsfs_io_wait
, struct buf
*, wbuf
);
1454 while (sema_tryp(&fio_done
[count
]) == 0) {
1456 * hsched_invoke_strategy will return 1
1457 * if the I/O queue is empty. This means
1458 * that there is another thread who has
1459 * issued our buf and is waiting. So we
1460 * just block instead of spinning.
1462 if (hsched_invoke_strategy(fsp
)) {
1463 sema_p(&fio_done
[count
]);
1467 sema_destroy(&fio_done
[count
]);
1468 DTRACE_PROBE1(hsfs_io_done
, struct buf
*, wbuf
);
1471 err
= geterror(wbuf
);
1474 kmem_free(fio_done
, bufcnt
* sizeof (ksema_t
));
1477 /* Don't leak resources */
1478 for (count
= 0; count
< bufcnt
; count
++) {
1479 biofini(&bufs
[count
]);
1480 if (count
< bufsused
&& vas
[count
] != NULL
) {
1481 ppmapout(vas
[count
]);
1485 kmem_free(vas
, bufcnt
* sizeof (caddr_t
));
1486 kmem_free(bufs
, bufcnt
* sizeof (struct buf
));
1490 pvn_read_done(pp
, B_ERROR
);
1495 * Lock the requested page, and the one after it if possible.
1496 * Don't bother if our caller hasn't given us a place to stash
1497 * the page pointers, since otherwise we'd lock pages that would
1498 * never get unlocked.
1505 * Make sure it's in memory before we say it's here.
1507 if ((pp
= page_lookup(vp
, off
, SE_SHARED
)) == NULL
) {
1514 atomic_inc_64(&fsp
->cache_read_pages
);
1517 * Try to lock the next page, if it exists, without
1521 /* LINTED (plsz is unsigned) */
1522 for (soff
= off
+ PAGESIZE
; plsz
> 0;
1523 soff
+= PAGESIZE
, plsz
-= PAGESIZE
) {
1524 pp
= page_lookup_nowait(vp
, (u_offset_t
)soff
,
1533 * Schedule a semi-asynchronous readahead if we are
1534 * accessing the last cached page for the current
1537 * Doing this here means that readaheads will be
1538 * issued only if cache-hits occur. This is an advantage
1539 * since cache-hits would mean that readahead is giving
1540 * the desired benefit. If cache-hits do not occur there
1541 * is no point in reading ahead of time - the system
1544 if (fsp
->hqueue
!= NULL
&&
1545 hp
->hs_prev_offset
- off
== PAGESIZE
&&
1546 hp
->hs_prev_offset
< filsiz
&&
1547 hp
->hs_ra_bytes
> 0 &&
1548 !page_exists(vp
, hp
->hs_prev_offset
)) {
1549 (void) hsfs_getpage_ra(vp
, hp
->hs_prev_offset
, seg
,
1550 addr
+ PAGESIZE
, hp
, fsp
, xarsiz
, bof
,
1551 chunk_lbn_count
, chunk_data_bytes
);
1558 pvn_plist_init(pp
, pl
, plsz
, off
, io_len
, rw
);
1577 caller_context_t
*ct
)
1583 fsp
= VFS_TO_HSFS(vp
->v_vfsp
);
1586 /* does not support write */
1587 if (rw
== S_WRITE
) {
1591 if (vp
->v_flag
& VNOMAP
) {
1595 ASSERT(off
<= HS_MAXFILEOFF
);
1598 * Determine file data size for EOF check.
1600 filsiz
= hp
->hs_dirent
.ext_size
;
1601 if ((off
+ len
) > (offset_t
)(filsiz
+ PAGEOFFSET
) && seg
!= segkmap
)
1602 return (EFAULT
); /* beyond EOF */
1605 * Async Read-ahead computation.
1606 * This attempts to detect sequential access pattern and
1607 * enables reading extra pages ahead of time.
1609 if (fsp
->hqueue
!= NULL
) {
1611 * This check for sequential access also takes into
1612 * account segmap weirdness when reading in chunks
1613 * less than the segmap size of 8K.
1615 if (hp
->hs_prev_offset
== off
|| (off
<
1616 hp
->hs_prev_offset
&& off
+ MAX(len
, PAGESIZE
)
1617 >= hp
->hs_prev_offset
)) {
1618 if (hp
->hs_num_contig
<
1619 (seq_contig_requests
- 1)) {
1620 hp
->hs_num_contig
++;
1624 * We increase readahead quantum till
1625 * a predefined max. max_readahead_bytes
1626 * is a multiple of PAGESIZE.
1628 if (hp
->hs_ra_bytes
<
1629 fsp
->hqueue
->max_ra_bytes
) {
1630 hp
->hs_ra_bytes
+= PAGESIZE
;
1635 * Not contiguous so reduce read ahead counters.
1637 if (hp
->hs_ra_bytes
> 0)
1638 hp
->hs_ra_bytes
-= PAGESIZE
;
1640 if (hp
->hs_ra_bytes
<= 0) {
1641 hp
->hs_ra_bytes
= 0;
1642 if (hp
->hs_num_contig
> 0)
1643 hp
->hs_num_contig
--;
		/*
		 * Length must be rounded up to a page boundary,
		 * since we read in units of pages.
		 */
1650 hp
->hs_prev_offset
= off
+ roundup(len
, PAGESIZE
);
1651 DTRACE_PROBE1(hsfs_compute_ra
, struct hsnode
*, hp
);
1656 return (pvn_getpages(hsfs_getapage
, vp
, off
, len
, protp
, pl
, plsz
,
1657 seg
, addr
, rw
, cred
));
1663 * This function should never be called. We need to have it to pass
1664 * it as an argument to other functions.
1676 /* should never happen - just destroy it */
1677 cmn_err(CE_NOTE
, "hsfs_putapage: dirty HSFS page");
1678 pvn_write_done(pp
, B_ERROR
| B_WRITE
| B_INVAL
| B_FORCE
| flags
);
1684 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
1685 * B_INVAL is set by:
1687 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
1688 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
1689 * which translates to an MC_SYNC with the MS_INVALIDATE flag.
1691 * The B_FREE (as well as the B_DONTNEED) flag is set when the
1692 * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
1693 * from SEGVN to release pages behind a pagefault.
1703 caller_context_t
*ct
)
1707 if (vp
->v_count
== 0) {
1708 panic("hsfs_putpage: bad v_count");
1712 if (vp
->v_flag
& VNOMAP
)
1715 ASSERT(off
<= HS_MAXFILEOFF
);
1717 if (!vn_has_cached_data(vp
)) /* no pages mapped */
1720 if (len
== 0) { /* from 'off' to EOF */
1721 error
= pvn_vplist_dirty(vp
, off
, hsfs_putapage
, flags
, cr
);
1723 offset_t end_off
= off
+ len
;
1724 offset_t file_size
= VTOH(vp
)->hs_dirent
.ext_size
;
1727 file_size
= (file_size
+ PAGESIZE
- 1) & PAGEMASK
;
1728 if (end_off
> file_size
)
1729 end_off
= file_size
;
1731 for (io_off
= off
; io_off
< end_off
; io_off
+= PAGESIZE
) {
1735 * We insist on getting the page only if we are
1736 * about to invalidate, free or write it and
1737 * the B_ASYNC flag is not set.
1739 if ((flags
& B_INVAL
) || ((flags
& B_ASYNC
) == 0)) {
1740 pp
= page_lookup(vp
, io_off
,
1741 (flags
& (B_INVAL
| B_FREE
)) ?
1742 SE_EXCL
: SE_SHARED
);
1744 pp
= page_lookup_nowait(vp
, io_off
,
1745 (flags
& B_FREE
) ? SE_EXCL
: SE_SHARED
);
			/*
			 * Normally pvn_getdirty() should return 0, which
			 * implies that it has done the job for us.
			 * The shouldn't-happen scenario is when it returns 1.
			 * This means that the page has been modified and
			 * needs to be put back.
			 * Since we can't write on a CD, we fake a failed
			 * I/O and force pvn_write_done() to destroy the page.
			 */
1760 if (pvn_getdirty(pp
, flags
) == 1) {
1762 "hsfs_putpage: dirty HSFS page");
1763 pvn_write_done(pp
, flags
|
1764 B_ERROR
| B_WRITE
| B_INVAL
| B_FORCE
);
1784 caller_context_t
*ct
)
1786 struct segvn_crargs vn_a
;
1789 /* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
1791 if (vp
->v_flag
& VNOMAP
)
1794 if ((prot
& PROT_WRITE
) && (flags
& MAP_SHARED
))
1797 if (off
> HS_MAXFILEOFF
|| off
< 0 ||
1798 (off
+ len
) < 0 || (off
+ len
) > HS_MAXFILEOFF
)
1801 if (vp
->v_type
!= VREG
) {
1806 * If file is being locked, disallow mapping.
1808 if (vn_has_mandatory_locks(vp
, VTOH(vp
)->hs_dirent
.mode
))
1812 error
= choose_addr(as
, addrp
, len
, off
, ADDR_VACALIGN
, flags
);
1820 vn_a
.type
= flags
& MAP_TYPE
;
1822 vn_a
.maxprot
= maxprot
;
1823 vn_a
.flags
= flags
& ~MAP_TYPE
;
1827 vn_a
.lgrp_mem_policy_flags
= 0;
1829 error
= as_map(as
, *addrp
, len
, segvn_create
, &vn_a
);
1846 caller_context_t
*ct
)
1850 if (vp
->v_flag
& VNOMAP
)
1854 mutex_enter(&hp
->hs_contents_lock
);
1855 hp
->hs_mapcnt
+= btopr(len
);
1856 mutex_exit(&hp
->hs_contents_lock
);
1872 caller_context_t
*ct
)
1876 if (vp
->v_flag
& VNOMAP
)
1880 mutex_enter(&hp
->hs_contents_lock
);
1881 hp
->hs_mapcnt
-= btopr(len
); /* Count released mappings */
1882 ASSERT(hp
->hs_mapcnt
>= 0);
1883 mutex_exit(&hp
->hs_contents_lock
);
1893 caller_context_t
*ct
)
1895 return ((*noffp
< 0 || *noffp
> MAXOFFSET_T
) ? EINVAL
: 0);
1903 struct flock64
*bfp
,
1906 struct flk_callback
*flk_cbp
,
1908 caller_context_t
*ct
)
1910 struct hsnode
*hp
= VTOH(vp
);
	/*
	 * If the file is being mapped, disallow fs_frlock.
	 * We are not holding the hs_contents_lock while checking
	 * hs_mapcnt because the current locking strategy drops all
	 * locks before calling fs_frlock.
	 * So, hs_mapcnt could change before we enter fs_frlock making
	 * it meaningless to have held hs_contents_lock in the first place.
	 */
	if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode))
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
static int
hsched_deadline_compare(const void *x1, const void *x2)
{
	const struct hio *h1 = x1;
	const struct hio *h2 = x2;

	if (h1->io_timestamp < h2->io_timestamp)
		return (-1);
	if (h1->io_timestamp > h2->io_timestamp)
		return (1);

	if (h1->io_lblkno < h2->io_lblkno)
		return (-1);
	if (h1->io_lblkno > h2->io_lblkno)
		return (1);

	return (0);
}

static int
hsched_offset_compare(const void *x1, const void *x2)
{
	const struct hio *h1 = x1;
	const struct hio *h2 = x2;

	if (h1->io_lblkno < h2->io_lblkno)
		return (-1);
	if (h1->io_lblkno > h2->io_lblkno)
		return (1);

	return (0);
}
hsched_init_caches(void)
{
	hio_cache = kmem_cache_create("hsfs_hio_cache",
	    sizeof (struct hio), 0, NULL,
	    NULL, NULL, NULL, NULL, 0);

	hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
	    sizeof (struct hio_info), 0, NULL,
	    NULL, NULL, NULL, NULL, 0);
}

hsched_fini_caches(void)
{
	kmem_cache_destroy(hio_cache);
	kmem_cache_destroy(hio_info_cache);
}
/*
 * Initialize I/O scheduling structures. This is called via hsfs_mount
 */
void
hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
{
	struct hsfs_queue *hqueue = fsp->hqueue;
	struct vnode *vp = fsp->hsfs_devvp;

	/* TaskQ name of the form: hsched_task_ + stringof(int) */
	struct dk_cinfo info;
2005 * Default maxtransfer = 16k chunk
2007 hqueue
->dev_maxtransfer
= 16384;
2010 * Try to fetch the maximum device transfer size. This is used to
2011 * ensure that a coalesced block does not exceed the maxtransfer.
2013 err
= ldi_ident_from_mod(modlinkage
, &li
);
2015 cmn_err(CE_NOTE
, "hsched_init: Querying device failed");
2016 cmn_err(CE_NOTE
, "hsched_init: ldi_ident_from_mod err=%d\n",
2021 err
= ldi_open_by_dev(&(vp
->v_rdev
), OTYP_CHR
, FREAD
, CRED(), &lh
, li
);
2022 ldi_ident_release(li
);
2024 cmn_err(CE_NOTE
, "hsched_init: Querying device failed");
2025 cmn_err(CE_NOTE
, "hsched_init: ldi_open err=%d\n", err
);
2029 error
= ldi_ioctl(lh
, DKIOCINFO
, (intptr_t)&info
, FKIOCTL
,
2031 err
= ldi_close(lh
, FREAD
, CRED());
2033 cmn_err(CE_NOTE
, "hsched_init: Querying device failed");
2034 cmn_err(CE_NOTE
, "hsched_init: ldi_close err=%d\n", err
);
2038 hqueue
->dev_maxtransfer
= ldbtob(info
.dki_maxtransfer
);
	/*
	 * Max size of data to read ahead for sequential access pattern.
	 * Conservative to avoid letting the underlying CD drive spin
	 * down, in case the application is reading slowly.
	 * We read ahead up to a max of 8 pages.
	 */
	hqueue->max_ra_bytes = PAGESIZE * 8;
	mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
	avl_create(&(hqueue->read_tree), hsched_offset_compare,
	    sizeof (struct hio), offsetof(struct hio, io_offset_node));
	avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
	    sizeof (struct hio), offsetof(struct hio, io_deadline_node));

	(void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
	hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
	    minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);

	hqueue->next = NULL;
	hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
}
hsched_fini(struct hsfs_queue *hqueue)
{
	if (hqueue != NULL) {
		/*
		 * Remove the sentinel if there was one.
		 */
		if (hqueue->next != NULL) {
			avl_remove(&hqueue->read_tree, hqueue->next);
			kmem_cache_free(hio_cache, hqueue->next);
		}
		avl_destroy(&(hqueue->read_tree));
		avl_destroy(&(hqueue->deadline_tree));
		mutex_destroy(&(hqueue->hsfs_queue_lock));
		mutex_destroy(&(hqueue->strategy_lock));

		/*
		 * If there are any existing readahead threads running
		 * taskq_destroy will wait for them to finish.
		 */
		taskq_destroy(hqueue->ra_task);
		kmem_free(hqueue->nbuf, sizeof (struct buf));
	}
}
/*
 * Determine if two I/O requests are adjacent to each other so
 * that they can be coalesced.
 */
#define	IS_ADJACENT(io, nio) \
	(((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
	(io)->bp->b_edev == (nio)->bp->b_edev)
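/*
 * Editor's note: a hypothetical example of the adjacency test above (the
 * numbers are illustrative only).  An hio with io_lblkno = 100 and
 * nblocks = 8 is adjacent to an hio with io_lblkno = 108 on the same
 * b_edev, so the two may be chained together and issued as one transfer,
 * subject to the dev_maxtransfer limit checked in hsched_invoke_strategy().
 */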
/*
 * This performs the actual I/O scheduling logic. We use the Circular
 * Look algorithm here. Sort the I/O requests in ascending order of
 * logical block number and process them starting with the lowest
 * numbered block and progressing towards higher block numbers in the
 * queue. Once there are no more higher numbered blocks, start again
 * with the lowest one. This is good for CD/DVD as you keep moving
 * the head in one direction along the outward spiral track and avoid
 * too many seeks as much as possible. The re-ordering also allows
 * us to coalesce adjacent requests into one larger request.
 * This is thus essentially a 1-way Elevator with front merging.
 *
 * In addition each read request here has a deadline and will be
 * processed out of turn if the deadline (500ms) expires.
 *
 * This function is necessarily serialized via hqueue->strategy_lock.
 * This function sits just below hsfs_getapage and processes all read
 * requests originating from that function.
 */
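/*
 * Editor's sketch of the selection step below (a simplified summary only,
 * not a drop-in replacement for the real code):
 *
 *	fio = avl_first(&hqueue->deadline_tree);
 *	if (now - fio->io_timestamp < HSFS_READ_DEADLINE) {
 *		if (hqueue->next != NULL)	(sentinel from the last pass)
 *			fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
 *		else
 *			fio = avl_first(&hqueue->read_tree);
 *	}
 *	(else: the oldest request's deadline expired; service it first)
 *
 * With queued blocks { 40, 120, 350, 700, 900 } and a sentinel left at
 * block 300, the resulting service order is 350, 700, 900, then 40, 120.
 */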
2118 hsched_invoke_strategy(struct hsfs
*fsp
)
2120 struct hsfs_queue
*hqueue
;
2122 struct hio
*fio
, *nio
, *tio
, *prev
, *last
;
2123 size_t bsize
, soffset
, offset
, data
;
2124 int bioret
, bufcount
;
2129 hqueue
= fsp
->hqueue
;
2130 mutex_enter(&hqueue
->strategy_lock
);
2131 mutex_enter(&hqueue
->hsfs_queue_lock
);
2134 * Check for Deadline expiration first
2136 fio
= avl_first(&hqueue
->deadline_tree
);
2139 * Paranoid check for empty I/O queue. Both deadline
2140 * and read trees contain same data sorted in different
2141 * ways. So empty deadline tree = empty read tree.
2145 * Remove the sentinel if there was one.
2147 if (hqueue
->next
!= NULL
) {
2148 avl_remove(&hqueue
->read_tree
, hqueue
->next
);
2149 kmem_cache_free(hio_cache
, hqueue
->next
);
2150 hqueue
->next
= NULL
;
2152 mutex_exit(&hqueue
->hsfs_queue_lock
);
2153 mutex_exit(&hqueue
->strategy_lock
);
2157 if (drv_hztousec(ddi_get_lbolt()) - fio
->io_timestamp
2158 < HSFS_READ_DEADLINE
) {
2160 * Apply standard scheduling logic. This uses the
2161 * C-LOOK approach. Process I/O requests in ascending
2162 * order of logical block address till no subsequent
2163 * higher numbered block request remains. Then start
2164 * again from the lowest numbered block in the queue.
2166 * We do this cheaply here by means of a sentinel.
2167 * The last processed I/O structure from the previous
2168 * invocation of this func, is left dangling in the
2169 * read_tree so that we can easily scan to the next
2170 * higher numbered request and remove the sentinel.
2173 if (hqueue
->next
!= NULL
) {
2174 fio
= AVL_NEXT(&hqueue
->read_tree
, hqueue
->next
);
2175 avl_remove(&hqueue
->read_tree
, hqueue
->next
);
2176 kmem_cache_free(hio_cache
, hqueue
->next
);
2177 hqueue
->next
= NULL
;
2180 fio
= avl_first(&hqueue
->read_tree
);
2182 } else if (hqueue
->next
!= NULL
) {
2183 DTRACE_PROBE1(hsfs_deadline_expiry
, struct hio
*, fio
);
2185 avl_remove(&hqueue
->read_tree
, hqueue
->next
);
2186 kmem_cache_free(hio_cache
, hqueue
->next
);
2187 hqueue
->next
= NULL
;
2191 * In addition we try to coalesce contiguous
2192 * requests into one bigger request.
2195 bsize
= ldbtob(fio
->nblocks
);
2196 fvp
= fio
->bp
->b_file
;
2197 nio
= AVL_NEXT(&hqueue
->read_tree
, fio
);
2199 while (nio
!= NULL
&& IS_ADJACENT(tio
, nio
) &&
2200 bsize
< hqueue
->dev_maxtransfer
) {
2201 avl_remove(&hqueue
->deadline_tree
, tio
);
2202 avl_remove(&hqueue
->read_tree
, tio
);
2203 tio
->contig_chain
= nio
;
2204 bsize
+= ldbtob(nio
->nblocks
);
2209 * This check is required to detect the case where
2210 * we are merging adjacent buffers belonging to
2211 * different files. fvp is used to set the b_file
2212 * parameter in the coalesced buf. b_file is used
2213 * by DTrace so we do not want DTrace to accrue
2214 * requests to two different files to any one file.
2216 if (fvp
&& tio
->bp
->b_file
!= fvp
) {
2220 nio
= AVL_NEXT(&hqueue
->read_tree
, nio
);
2225 * tio is not removed from the read_tree as it serves as a sentinel
2226 * to cheaply allow us to scan to the next higher numbered I/O
2230 avl_remove(&hqueue
->deadline_tree
, tio
);
2231 mutex_exit(&hqueue
->hsfs_queue_lock
);
2232 DTRACE_PROBE3(hsfs_io_dequeued
, struct hio
*, fio
, int, bufcount
,
	/*
	 * The benefit of coalescing occurs if the savings in I/O outweighs
	 * the cost of doing the additional work below.
	 * It was observed that coalescing 2 buffers results in diminishing
	 * returns, so we do coalescing if we have >2 adjacent bufs.
	 */
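	/*
	 * Editor's note: with the default hsched_coalesce_min of 2, the
	 * coalescing path below is taken only when bufcount >= 3, i.e. at
	 * least three adjacent bufs were chained together above; runs of one
	 * or two adjacent bufs are simply issued individually.
	 */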
2241 if (bufcount
> hsched_coalesce_min
) {
		/*
		 * We have coalesced blocks. First allocate mem and buf for
		 * the entire coalesced chunk.
		 * Since we are guaranteed single-threaded here we pre-allocate
		 * one buf at mount time and that is re-used every time. This
		 * is a synthesized buf structure that uses a kmem_alloc'ed
		 * chunk.  Not quite a normal buf attached to pages.
		 */
2250 fsp
->coalesced_bytes
+= bsize
;
2251 nbuf
= hqueue
->nbuf
;
2253 nbuf
->b_edev
= fio
->bp
->b_edev
;
2254 nbuf
->b_dev
= fio
->bp
->b_dev
;
2255 nbuf
->b_flags
= fio
->bp
->b_flags
;
2256 nbuf
->b_iodone
= fio
->bp
->b_iodone
;
2257 iodata
= kmem_alloc(bsize
, KM_SLEEP
);
2258 nbuf
->b_un
.b_addr
= iodata
;
2259 nbuf
->b_lblkno
= fio
->bp
->b_lblkno
;
2262 nbuf
->b_bcount
= bsize
;
2263 nbuf
->b_bufsize
= bsize
;
2265 DTRACE_PROBE3(hsfs_coalesced_io_start
, struct hio
*, fio
, int,
2266 bufcount
, size_t, bsize
);
2269 * Perform I/O for the coalesced block.
2271 (void) bdev_strategy(nbuf
);
		/*
		 * Duplicate the last IO node to leave the sentinel alone.
		 * The sentinel is freed in the next invocation of this
		 * function.
		 */
2278 prev
->contig_chain
= kmem_cache_alloc(hio_cache
, KM_SLEEP
);
2279 prev
->contig_chain
->bp
= tio
->bp
;
2280 prev
->contig_chain
->sema
= tio
->sema
;
2281 tio
= prev
->contig_chain
;
2282 tio
->contig_chain
= NULL
;
2283 soffset
= ldbtob(fio
->bp
->b_lblkno
);
2286 bioret
= biowait(nbuf
);
2287 data
= bsize
- nbuf
->b_resid
;
2289 mutex_exit(&hqueue
->strategy_lock
);
		/*
		 * We use the b_resid parameter to detect how much
		 * data was successfully transferred. We will signal
		 * a success to all the fully retrieved actual bufs
		 * before coalescing, rest is signaled as error,
		 * if any.
		 */
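		/*
		 * Editor's note: a hypothetical example of the partial-
		 * completion handling below.  If four 16K bufs were coalesced
		 * into one 64K transfer and the device reports b_resid = 20K,
		 * then data = 44K: the first two bufs (2 * 16K = 32K) can be
		 * fully satisfied and are signaled as successful; the
		 * remaining bufs are signaled as errors, with their unfilled
		 * portions zeroed.
		 */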
2299 DTRACE_PROBE3(hsfs_coalesced_io_done
, struct hio
*, nio
,
2300 int, bioret
, size_t, data
);
2303 * Copy data and signal success to all the bufs
2304 * which can be fully satisfied from b_resid.
2306 while (nio
!= NULL
&& data
>= nio
->bp
->b_bcount
) {
2307 offset
= ldbtob(nio
->bp
->b_lblkno
) - soffset
;
2308 bcopy(iodata
+ offset
, nio
->bp
->b_un
.b_addr
,
2310 data
-= nio
->bp
->b_bcount
;
2311 bioerror(nio
->bp
, 0);
2315 nio
= nio
->contig_chain
;
2316 kmem_cache_free(hio_cache
, tio
);
2320 * Signal error to all the leftover bufs (if any)
2321 * after b_resid data is exhausted.
2323 while (nio
!= NULL
) {
2324 nio
->bp
->b_resid
= nio
->bp
->b_bcount
- data
;
2325 bzero(nio
->bp
->b_un
.b_addr
+ data
, nio
->bp
->b_resid
);
2326 bioerror(nio
->bp
, bioret
);
2330 nio
= nio
->contig_chain
;
2331 kmem_cache_free(hio_cache
, tio
);
2334 kmem_free(iodata
, bsize
);
2338 io_done
= tio
->sema
;
2342 while (nio
!= NULL
) {
2343 (void) bdev_strategy(nio
->bp
);
2344 nio
= nio
->contig_chain
;
2347 mutex_exit(&hqueue
->strategy_lock
);
2349 while (nio
!= NULL
) {
2351 (void) biowait(nbuf
);
2354 /* sentinel last not freed. See above. */
2356 (void) biowait(nio
->bp
);
2360 nio
= nio
->contig_chain
;
2361 kmem_cache_free(hio_cache
, tio
);
/*
 * Insert an I/O request in the I/O scheduler's pipeline
 * Using AVL tree makes it easy to reorder the I/O request
 * based on logical block number.
 */
static void
hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
{
	struct hsfs_queue *hqueue = fsp->hqueue;

	mutex_enter(&hqueue->hsfs_queue_lock);

	fsp->physical_read_bytes += hsio->bp->b_bcount;
	if (ra)
		fsp->readahead_bytes += hsio->bp->b_bcount;

	avl_add(&hqueue->deadline_tree, hsio);
	avl_add(&hqueue->read_tree, hsio);

	DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
	    struct hsfs_queue *, hqueue, int, ra);

	mutex_exit(&hqueue->hsfs_queue_lock);
}
2394 hsfs_pathconf(struct vnode
*vp
,
2398 caller_context_t
*ct
)
2407 fsp
= VFS_TO_HSFS(vp
->v_vfsp
);
2408 *valp
= fsp
->hsfs_namemax
;
2411 case _PC_FILESIZEBITS
:
2412 *valp
= 33; /* Without multi extent support: 4 GB - 2k */
2415 case _PC_TIMESTAMP_RESOLUTION
:
2417 * HSFS keeps, at best, 1/100 second timestamp resolution.
2423 error
= fs_pathconf(vp
, cmd
, valp
, cr
, ct
);
const fs_operation_def_t hsfs_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = hsfs_open },
	VOPNAME_CLOSE,		{ .vop_close = hsfs_close },
	VOPNAME_READ,		{ .vop_read = hsfs_read },
	VOPNAME_GETATTR,	{ .vop_getattr = hsfs_getattr },
	VOPNAME_ACCESS,		{ .vop_access = hsfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = hsfs_lookup },
	VOPNAME_READDIR,	{ .vop_readdir = hsfs_readdir },
	VOPNAME_READLINK,	{ .vop_readlink = hsfs_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = hsfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = hsfs_inactive },
	VOPNAME_FID,		{ .vop_fid = hsfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = hsfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = hsfs_frlock },
	VOPNAME_GETPAGE,	{ .vop_getpage = hsfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = hsfs_putpage },
	VOPNAME_MAP,		{ .vop_map = hsfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = hsfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = hsfs_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = hsfs_pathconf },
	NULL,			NULL
};

struct vnodeops *hsfs_vnodeops;