/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2015, Joyent, Inc.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/errno.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/statvfs.h>
#include <sys/mount.h>
#include <sys/sunddi.h>
#include <sys/bootconf.h>
#include <sys/policy.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/fs_subr.h>
#include <sys/fs/udf_volume.h>
#include <sys/fs/udf_inode.h>
static int32_t udf_open(struct vnode **,
	int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
	caller_context_t *);
static int32_t udf_getattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
	int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
	char *, struct vnode **, struct pathname *,
	int32_t, struct vnode *, struct cred *,
	caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
	char *, struct vattr *, enum vcexcl,
	int32_t, struct vnode **, struct cred *, int32_t,
	caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
	char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
	struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
	char *, struct vattr *, struct vnode **, struct cred *,
	caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
	char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
	struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
	int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
	struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
	caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
	caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
	size_t, uint32_t *, struct page **, size_t,
	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
	size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
	ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
	uoff_t, size_t, int32_t, struct cred *, caller_context_t *);

int32_t ud_getpage_miss(struct vnode *, uoff_t,
	size_t, struct seg *, caddr_t, page_t *pl[],
	size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, uoff_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
	uoff_t, uint32_t, uoff_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, uoff_t);
int32_t ud_slave_done(struct buf *);
/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
uint32_t master_index = 0;
typedef struct mio_master {
	kmutex_t	mm_mutex;	/* protect the fields below */
	buf_t		*mm_bp;		/* original bp */
	int32_t		mm_resid;	/* bytes remaining to transfer */
	int32_t		mm_error;	/* accumulated error from slaves */
	int32_t		mm_index;	/* XXX debugging */
} mio_master_t;

typedef struct mio_slave {
	buf_t		ms_buf;		/* working buffer for this IO chunk */
	mio_master_t	*ms_ptr;	/* pointer to master */
} mio_slave_t;
const struct vnodeops udf_vnodeops = {
	.vop_open = udf_open,
	.vop_close = udf_close,
	.vop_read = udf_read,
	.vop_write = udf_write,
	.vop_ioctl = udf_ioctl,
	.vop_getattr = udf_getattr,
	.vop_setattr = udf_setattr,
	.vop_access = udf_access,
	.vop_lookup = udf_lookup,
	.vop_create = udf_create,
	.vop_remove = udf_remove,
	.vop_link = udf_link,
	.vop_rename = udf_rename,
	.vop_mkdir = udf_mkdir,
	.vop_rmdir = udf_rmdir,
	.vop_readdir = udf_readdir,
	.vop_symlink = udf_symlink,
	.vop_readlink = udf_readlink,
	.vop_fsync = udf_fsync,
	.vop_inactive = udf_inactive,
	.vop_fid = udf_fid,
	.vop_rwlock = udf_rwlock,
	.vop_rwunlock = udf_rwunlock,
	.vop_seek = udf_seek,
	.vop_frlock = udf_frlock,
	.vop_space = udf_space,
	.vop_getpage = udf_getpage,
	.vop_putpage = udf_putpage,
	.vop_map = udf_map,
	.vop_addmap = udf_addmap,
	.vop_delmap = udf_delmap,
	.vop_pathconf = udf_l_pathconf,
	.vop_pageio = udf_pageio,
	.vop_vnevent = fs_vnevent_support,
};
242 caller_context_t
*ct
)
244 ud_printf("udf_open\n");
257 caller_context_t
*ct
)
259 struct ud_inode
*ip
= VTOI(vp
);
261 ud_printf("udf_close\n");
265 cleanlocks(vp
, ttoproc(curthread
)->p_pid
, 0);
266 cleanshares(vp
, ttoproc(curthread
)->p_pid
);
269 * Push partially filled cluster at last close.
270 * ``last close'' is approximated because the dnlc
271 * may have a hold on the vnode.
273 if (vp
->v_count
<= 2 && vp
->v_type
!= VBAD
) {
274 struct ud_inode
*ip
= VTOI(vp
);
275 if (ip
->i_delaylen
) {
276 (void) ud_putpages(vp
, ip
->i_delayoff
, ip
->i_delaylen
,
277 B_ASYNC
| B_FREE
, cr
);
292 caller_context_t
*ct
)
294 struct ud_inode
*ip
= VTOI(vp
);
297 ud_printf("udf_read\n");
300 rw_enter(&ip
->i_rwlock
, RW_READER
);
303 ASSERT(RW_READ_HELD(&ip
->i_rwlock
));
305 if (MANDLOCK(vp
, ip
->i_char
)) {
307 * udf_getattr ends up being called by chklock
309 error
= chklock(vp
, FREAD
, uiop
->uio_loffset
,
310 uiop
->uio_resid
, uiop
->uio_fmode
, ct
);
316 rw_enter(&ip
->i_contents
, RW_READER
);
317 error
= ud_rdip(ip
, uiop
, ioflag
, cr
);
318 rw_exit(&ip
->i_contents
);
322 rw_exit(&ip
->i_rwlock
);
329 int32_t ud_WRITES
= 1;
330 int32_t ud_HW
= 96 * 1024;
331 int32_t ud_LW
= 64 * 1024;
332 int32_t ud_throttles
= 0;
341 caller_context_t
*ct
)
343 struct ud_inode
*ip
= VTOI(vp
);
346 ud_printf("udf_write\n");
349 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
352 ASSERT(RW_WRITE_HELD(&ip
->i_rwlock
));
354 if (MANDLOCK(vp
, ip
->i_char
)) {
356 * ud_getattr ends up being called by chklock
358 error
= chklock(vp
, FWRITE
, uiop
->uio_loffset
,
359 uiop
->uio_resid
, uiop
->uio_fmode
, ct
);
367 mutex_enter(&ip
->i_tlock
);
368 if (ud_WRITES
&& (ip
->i_writes
> ud_HW
)) {
369 while (ip
->i_writes
> ud_HW
) {
371 cv_wait(&ip
->i_wrcv
, &ip
->i_tlock
);
374 mutex_exit(&ip
->i_tlock
);
379 rw_enter(&ip
->i_contents
, RW_WRITER
);
380 if ((ioflag
& FAPPEND
) != 0 && (ip
->i_type
== VREG
)) {
382 * In append mode start at end of file.
384 uiop
->uio_loffset
= ip
->i_size
;
386 error
= ud_wrip(ip
, uiop
, ioflag
, cr
);
387 rw_exit(&ip
->i_contents
);
391 rw_exit(&ip
->i_rwlock
);
406 caller_context_t
*ct
)
418 caller_context_t
*ct
)
420 struct ud_inode
*ip
= VTOI(vp
);
422 ud_printf("udf_getattr\n");
424 if (vap
->va_mask
== VATTR_SIZE
) {
426 * for performance, if only the size is requested don't bother
427 * with anything else.
429 vap
->va_size
= ip
->i_size
;
433 rw_enter(&ip
->i_contents
, RW_READER
);
435 vap
->va_type
= vp
->v_type
;
436 vap
->va_mode
= UD2VA_PERM(ip
->i_perm
) | ip
->i_char
;
438 vap
->va_uid
= ip
->i_uid
;
439 vap
->va_gid
= ip
->i_gid
;
440 vap
->va_fsid
= ip
->i_dev
;
441 vap
->va_nodeid
= ip
->i_icb_lbano
;
442 vap
->va_nlink
= ip
->i_nlink
;
443 vap
->va_size
= ip
->i_size
;
444 vap
->va_seq
= ip
->i_seq
;
445 if (vp
->v_type
== VCHR
|| vp
->v_type
== VBLK
) {
446 vap
->va_rdev
= ip
->i_rdev
;
451 mutex_enter(&ip
->i_tlock
);
452 ITIMES_NOLOCK(ip
); /* mark correct time in inode */
453 vap
->va_atime
.tv_sec
= (time_t)ip
->i_atime
.tv_sec
;
454 vap
->va_atime
.tv_nsec
= ip
->i_atime
.tv_nsec
;
455 vap
->va_mtime
.tv_sec
= (time_t)ip
->i_mtime
.tv_sec
;
456 vap
->va_mtime
.tv_nsec
= ip
->i_mtime
.tv_nsec
;
457 vap
->va_ctime
.tv_sec
= (time_t)ip
->i_ctime
.tv_sec
;
458 vap
->va_ctime
.tv_nsec
= ip
->i_ctime
.tv_nsec
;
459 mutex_exit(&ip
->i_tlock
);
461 switch (ip
->i_type
) {
463 vap
->va_blksize
= MAXBSIZE
;
466 vap
->va_blksize
= MAXBSIZE
;
469 vap
->va_blksize
= ip
->i_udf
->udf_lbsize
;
472 vap
->va_nblocks
= ip
->i_lbr
<< ip
->i_udf
->udf_l2d_shift
;
474 rw_exit(&ip
->i_contents
);
480 ud_iaccess_vmode(void *ip
, int mode
, struct cred
*cr
)
482 return (ud_iaccess(ip
, UD_UPERM2DPERM(mode
), cr
, 0));
492 caller_context_t
*ct
)
495 uint32_t mask
= vap
->va_mask
;
500 ud_printf("udf_setattr\n");
505 * no updates allowed to 4096 files
507 if (ip
->i_astrat
== STRAT_TYPE4096
) {
512 * Cannot set these attributes
514 if (mask
& VATTR_NOSET
) {
518 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
519 rw_enter(&ip
->i_contents
, RW_WRITER
);
521 ovap
.va_uid
= ip
->i_uid
;
522 ovap
.va_mode
= UD2VA_PERM(ip
->i_perm
) | ip
->i_char
;
523 error
= secpolicy_vnode_setattr(cr
, vp
, vap
, &ovap
, flags
,
524 ud_iaccess_vmode
, ip
);
530 * Change file access modes.
532 if (mask
& VATTR_MODE
) {
533 ip
->i_perm
= VA2UD_PERM(vap
->va_mode
);
534 ip
->i_char
= vap
->va_mode
& (VSUID
| VSGID
| VSVTX
);
535 mutex_enter(&ip
->i_tlock
);
537 mutex_exit(&ip
->i_tlock
);
539 if (mask
& (VATTR_UID
|VATTR_GID
)) {
540 if (mask
& VATTR_UID
) {
541 ip
->i_uid
= vap
->va_uid
;
543 if (mask
& VATTR_GID
) {
544 ip
->i_gid
= vap
->va_gid
;
546 mutex_enter(&ip
->i_tlock
);
548 mutex_exit(&ip
->i_tlock
);
551 * Truncate file. Must have write permission and not be a directory.
553 if (mask
& VATTR_SIZE
) {
554 if (vp
->v_type
== VDIR
) {
558 if (error
= ud_iaccess(ip
, IWRITE
, cr
, 0)) {
561 if (vap
->va_size
> MAXOFFSET_T
) {
565 if (error
= ud_itrunc(ip
, vap
->va_size
, 0, cr
)) {
569 if (vap
->va_size
== 0)
570 vnevent_truncate(vp
, ct
);
573 * Change file access or modified times.
575 if (mask
& (VATTR_ATIME
|VATTR_MTIME
)) {
576 mutex_enter(&ip
->i_tlock
);
577 if (mask
& VATTR_ATIME
) {
578 ip
->i_atime
.tv_sec
= vap
->va_atime
.tv_sec
;
579 ip
->i_atime
.tv_nsec
= vap
->va_atime
.tv_nsec
;
582 if (mask
& VATTR_MTIME
) {
583 ip
->i_mtime
.tv_sec
= vap
->va_mtime
.tv_sec
;
584 ip
->i_mtime
.tv_nsec
= vap
->va_mtime
.tv_nsec
;
586 ip
->i_ctime
.tv_sec
= now
.tv_sec
;
587 ip
->i_ctime
.tv_nsec
= now
.tv_nsec
;
588 ip
->i_flag
&= ~(IUPD
|ICHG
);
589 ip
->i_flag
|= IMODTIME
;
592 mutex_exit(&ip
->i_tlock
);
596 if (curthread
->t_flag
& T_DONTPEND
) {
601 rw_exit(&ip
->i_contents
);
602 rw_exit(&ip
->i_rwlock
);
614 caller_context_t
*ct
)
616 struct ud_inode
*ip
= VTOI(vp
);
618 ud_printf("udf_access\n");
620 if (ip
->i_udf
== NULL
) {
624 return (ud_iaccess(ip
, UD_UPERM2DPERM(mode
), cr
, 1));
627 int32_t udfs_stickyhack
= 1;
635 struct pathname
*pnp
,
639 caller_context_t
*ct
,
645 struct ud_inode
*ip
, *xip
;
647 ud_printf("udf_lookup\n");
649 * Null component name is a synonym for directory being searched.
659 * Fast path: Check the directory name lookup cache.
662 if (vp
= dnlc_lookup(dvp
, nm
)) {
664 * Check accessibility of directory.
666 if ((error
= ud_iaccess(ip
, IEXEC
, cr
, 1)) != 0) {
671 error
= ud_dirlook(ip
, nm
, &xip
, cr
, 1);
678 if ((ip
->i_type
!= VDIR
) &&
679 (ip
->i_char
& ISVTX
) &&
680 ((ip
->i_perm
& IEXEC
) == 0) &&
682 mutex_enter(&(*vpp
)->v_lock
);
683 (*vpp
)->v_flag
|= VISSWAP
;
684 mutex_exit(&(*vpp
)->v_lock
);
688 * If vnode is a device return special vnode instead.
690 if (IS_DEVVP(*vpp
)) {
692 newvp
= specvp(*vpp
, (*vpp
)->v_rdev
,
717 caller_context_t
*ct
,
721 struct ud_inode
*ip
= VTOI(dvp
), *xip
;
723 ud_printf("udf_create\n");
725 if ((vap
->va_mode
& VSVTX
) && secpolicy_vnode_stky_modify(cr
) != 0)
726 vap
->va_mode
&= ~VSVTX
;
730 * Null component name refers to the directory itself.
737 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
738 error
= ud_direnter(ip
, name
, DE_CREATE
, NULL
, NULL
, vap
,
740 rw_exit(&ip
->i_rwlock
);
745 rw_enter(&ip
->i_contents
, RW_WRITER
);
748 rw_enter(&ip
->i_contents
, RW_WRITER
);
753 * If the file already exists and this is a non-exclusive create,
754 * check permissions and allow access for non-directories.
755 * Read-only create of an existing directory is also allowed.
756 * We fail an exclusive create of anything which already exists.
758 if (error
== EEXIST
) {
759 if (excl
== NONEXCL
) {
760 if ((ip
->i_type
== VDIR
) && (mode
& VWRITE
)) {
763 error
= ud_iaccess(ip
,
764 UD_UPERM2DPERM(mode
), cr
, 0);
770 rw_exit(&ip
->i_contents
);
773 } else if ((ip
->i_type
== VREG
) &&
774 (vap
->va_mask
& VATTR_SIZE
) && vap
->va_size
== 0) {
776 * Truncate regular files, if requested by caller.
777 * Grab i_rwlock to make sure no one else is
778 * currently writing to the file (we promised
779 * bmap we would do this).
780 * Must get the locks in the correct order.
782 if (ip
->i_size
== 0) {
783 ip
->i_flag
|= ICHG
| IUPD
;
785 rw_exit(&ip
->i_contents
);
786 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
787 rw_enter(&ip
->i_contents
, RW_WRITER
);
788 (void) ud_itrunc(ip
, 0, 0, cr
);
789 rw_exit(&ip
->i_rwlock
);
791 vnevent_create(ITOV(ip
), ct
);
800 rw_exit(&ip
->i_contents
);
803 rw_exit(&ip
->i_contents
);
811 * If vnode is a device return special vnode instead.
813 if (!error
&& IS_DEVVP(*vpp
)) {
816 newvp
= specvp(*vpp
, (*vpp
)->v_rdev
, (*vpp
)->v_type
, cr
);
834 caller_context_t
*ct
,
838 struct ud_inode
*ip
= VTOI(vp
);
840 ud_printf("udf_remove\n");
842 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
843 error
= ud_dirremove(ip
, nm
,
844 NULL
, NULL
, DR_REMOVE
, cr
, ct
);
845 rw_exit(&ip
->i_rwlock
);
858 caller_context_t
*ct
,
862 struct vnode
*realvp
;
863 struct ud_inode
*sip
;
864 struct ud_inode
*tdp
;
866 ud_printf("udf_link\n");
867 if (fop_realvp(svp
, &realvp
, ct
) == 0) {
872 * Do not allow links to directories
874 if (svp
->v_type
== VDIR
) {
880 if (sip
->i_uid
!= crgetuid(cr
) && secpolicy_basic_link(cr
) != 0)
885 rw_enter(&tdp
->i_rwlock
, RW_WRITER
);
886 error
= ud_direnter(tdp
, tnm
, DE_LINK
, NULL
,
887 sip
, NULL
, (struct ud_inode
**)0, cr
, ct
);
888 rw_exit(&tdp
->i_rwlock
);
893 vnevent_link(svp
, ct
);
907 caller_context_t
*ct
,
911 struct udf_vfs
*udf_vfsp
;
912 struct ud_inode
*sip
; /* source inode */
913 struct ud_inode
*tip
; /* target inode */
914 struct ud_inode
*sdp
, *tdp
; /* source and target parent inode */
915 struct vnode
*realvp
;
917 ud_printf("udf_rename\n");
919 if (fop_realvp(tdvp
, &realvp
, ct
) == 0) {
926 udf_vfsp
= sdp
->i_udf
;
928 mutex_enter(&udf_vfsp
->udf_rename_lck
);
930 * Look up inode of file we're supposed to rename.
932 if (error
= ud_dirlook(sdp
, snm
, &sip
, cr
, 0)) {
933 mutex_exit(&udf_vfsp
->udf_rename_lck
);
937 * be sure this is not a directory with another file system mounted
938 * over it. If it is, just give up the locks and return with
941 if (vn_mountedvfs(ITOV(sip
)) != NULL
) {
946 * Make sure we can delete the source entry. This requires
947 * write permission on the containing directory. If that
948 * directory is "sticky" it further requires (except for
949 * privileged users) that the user own the directory or the
950 * source entry, or else have permission to write the source
953 rw_enter(&sdp
->i_contents
, RW_READER
);
954 rw_enter(&sip
->i_contents
, RW_READER
);
955 if ((error
= ud_iaccess(sdp
, IWRITE
, cr
, 0)) != 0 ||
956 (error
= ud_sticky_remove_access(sdp
, sip
, cr
)) != 0) {
957 rw_exit(&sip
->i_contents
);
958 rw_exit(&sdp
->i_contents
);
964 * Check for renaming '.' or '..' or alias of '.'
966 if ((strcmp(snm
, ".") == 0) ||
967 (strcmp(snm
, "..") == 0) ||
970 rw_exit(&sip
->i_contents
);
971 rw_exit(&sdp
->i_contents
);
975 rw_exit(&sip
->i_contents
);
976 rw_exit(&sdp
->i_contents
);
978 if (ud_dirlook(tdp
, tnm
, &tip
, cr
, 0) == 0) {
979 vnevent_pre_rename_dest(ITOV(tip
), tdvp
, tnm
, ct
);
983 /* Notify the target dir. if not the same as the source dir. */
985 vnevent_pre_rename_dest_dir(tdvp
, ITOV(sip
), tnm
, ct
);
987 vnevent_pre_rename_src(ITOV(sip
), sdvp
, snm
, ct
);
990 * Link source to the target.
992 rw_enter(&tdp
->i_rwlock
, RW_WRITER
);
993 if (error
= ud_direnter(tdp
, tnm
, DE_RENAME
, sdp
, sip
,
994 NULL
, (struct ud_inode
**)0, cr
, ct
)) {
996 * ESAME isn't really an error; it indicates that the
997 * operation should not be done because the source and target
998 * are the same file, but that no error should be reported.
1000 if (error
== ESAME
) {
1003 rw_exit(&tdp
->i_rwlock
);
1006 rw_exit(&tdp
->i_rwlock
);
1008 rw_enter(&sdp
->i_rwlock
, RW_WRITER
);
1010 * Unlink the source.
1011 * Remove the source entry. ud_dirremove() checks that the entry
1012 * still reflects sip, and returns an error if it doesn't.
1013 * If the entry has changed just forget about it. Release
1016 if ((error
= ud_dirremove(sdp
, snm
, sip
, NULL
,
1017 DR_RENAME
, cr
, ct
)) == ENOENT
) {
1020 rw_exit(&sdp
->i_rwlock
);
1023 vnevent_rename_src(ITOV(sip
), sdvp
, snm
, ct
);
1025 * vnevent_rename_dest and vnevent_rename_dest_dir are called
1034 mutex_exit(&udf_vfsp
->udf_rename_lck
);
1047 caller_context_t
*ct
,
1052 struct ud_inode
*ip
;
1053 struct ud_inode
*xip
;
1055 ASSERT((vap
->va_mask
& (VATTR_TYPE
|VATTR_MODE
)) == (VATTR_TYPE
|VATTR_MODE
));
1057 ud_printf("udf_mkdir\n");
1060 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
1061 error
= ud_direnter(ip
, dirname
, DE_MKDIR
,
1062 NULL
, NULL
, vap
, &xip
, cr
, ct
);
1063 rw_exit(&ip
->i_rwlock
);
1069 } else if (error
== EEXIST
) {
1084 caller_context_t
*ct
,
1088 struct ud_inode
*ip
= VTOI(vp
);
1090 ud_printf("udf_rmdir\n");
1092 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
1093 error
= ud_dirremove(ip
, nm
, NULL
, cdir
, DR_RMDIR
,
1095 rw_exit(&ip
->i_rwlock
);
1108 caller_context_t
*ct
,
1111 struct ud_inode
*ip
;
1112 struct dirent64
*nd
;
1113 struct udf_vfs
*udf_vfsp
;
1114 int32_t error
= 0, len
, outcount
= 0;
1115 uint32_t dirsiz
, offset
;
1116 uint32_t bufsize
, ndlen
, dummy
;
1118 caddr_t outb
, end_outb
;
1124 uint8_t *buf
= NULL
;
1126 struct fbuf
*fbp
= NULL
;
1127 struct file_id
*fid
;
1131 ud_printf("udf_readdir\n");
1134 udf_vfsp
= ip
->i_udf
;
1136 dirsiz
= ip
->i_size
;
1137 if ((uiop
->uio_offset
>= dirsiz
) ||
1138 (ip
->i_nlink
<= 0)) {
1145 offset
= uiop
->uio_offset
;
1146 iovp
= uiop
->uio_iov
;
1147 bufsize
= iovp
->iov_len
;
1149 outb
= outbuf
= kmem_alloc((uint32_t)bufsize
, KM_SLEEP
);
1150 end_outb
= outb
+ bufsize
;
1151 nd
= (struct dirent64
*)outbuf
;
1153 dname
= kmem_zalloc(1024, KM_SLEEP
);
1154 buf
= kmem_zalloc(udf_vfsp
->udf_lbsize
, KM_SLEEP
);
1157 len
= DIRENT64_RECLEN(1);
1158 if (((caddr_t
)nd
+ len
) >= end_outb
) {
1162 nd
->d_ino
= ip
->i_icb_lbano
;
1163 nd
->d_reclen
= (uint16_t)len
;
1165 nd
->d_name
[0] = '.';
1166 bzero(&nd
->d_name
[1], DIRENT64_NAMELEN(len
) - 1);
1167 nd
= (struct dirent64
*)((char *)nd
+ nd
->d_reclen
);
1169 } else if (offset
== 0x10) {
1173 while (offset
< dirsiz
) {
1174 error
= ud_get_next_fid(ip
, &fbp
,
1175 offset
, &fid
, &name
, buf
);
1180 if ((fid
->fid_flags
& FID_DELETED
) == 0) {
1181 if (fid
->fid_flags
& FID_PARENT
) {
1183 len
= DIRENT64_RECLEN(2);
1184 if (((caddr_t
)nd
+ len
) >= end_outb
) {
1189 nd
->d_ino
= ip
->i_icb_lbano
;
1190 nd
->d_reclen
= (uint16_t)len
;
1191 nd
->d_off
= offset
+ FID_LEN(fid
);
1192 nd
->d_name
[0] = '.';
1193 nd
->d_name
[1] = '.';
1194 bzero(&nd
->d_name
[2],
1195 DIRENT64_NAMELEN(len
) - 2);
1196 nd
= (struct dirent64
*)
1197 ((char *)nd
+ nd
->d_reclen
);
1199 if ((error
= ud_uncompress(fid
->fid_idlen
,
1200 &length
, name
, dname
)) != 0) {
1204 offset
+= FID_LEN(fid
);
1207 len
= DIRENT64_RECLEN(length
);
1208 if (((caddr_t
)nd
+ len
) >= end_outb
) {
1214 (void) strncpy(nd
->d_name
,
1215 (caddr_t
)dname
, length
);
1216 bzero(&nd
->d_name
[length
],
1217 DIRENT64_NAMELEN(len
) - length
);
1218 nd
->d_ino
= ud_xlate_to_daddr(udf_vfsp
,
1219 SWAP_16(fid
->fid_icb
.lad_ext_prn
),
1220 SWAP_32(fid
->fid_icb
.lad_ext_loc
), 1,
1222 nd
->d_reclen
= (uint16_t)len
;
1223 nd
->d_off
= offset
+ FID_LEN(fid
);
1224 nd
= (struct dirent64
*)
1225 ((char *)nd
+ nd
->d_reclen
);
1230 offset
+= FID_LEN(fid
);
1235 fbrelse(fbp
, S_OTHER
);
1237 ndlen
= ((char *)nd
- outbuf
);
1239 * In case of error do not call uiomove.
1240 * Return the error to the caller.
1242 if ((error
== 0) && (ndlen
!= 0)) {
1243 error
= uiomove(outbuf
, (long)ndlen
, UIO_READ
, uiop
);
1244 uiop
->uio_offset
= offset
;
1246 kmem_free((caddr_t
)buf
, udf_vfsp
->udf_lbsize
);
1247 kmem_free((caddr_t
)dname
, 1024);
1248 kmem_free(outbuf
, (uint32_t)bufsize
);
1249 if (eofp
&& error
== 0) {
1250 *eofp
= (uiop
->uio_offset
>= dirsiz
);
1263 caller_context_t
*ct
,
1266 int32_t error
= 0, outlen
;
1267 uint32_t ioflag
= 0;
1268 struct ud_inode
*ip
, *dip
= VTOI(dvp
);
1270 struct path_comp
*pc
;
1271 int8_t *dname
= NULL
, *uname
= NULL
, *sp
;
1273 ud_printf("udf_symlink\n");
1276 vap
->va_type
= VLNK
;
1279 rw_enter(&dip
->i_rwlock
, RW_WRITER
);
1280 error
= ud_direnter(dip
, linkname
, DE_CREATE
,
1281 NULL
, NULL
, vap
, &ip
, cr
, ct
);
1282 rw_exit(&dip
->i_rwlock
);
1284 dname
= kmem_zalloc(1024, KM_SLEEP
);
1285 uname
= kmem_zalloc(PAGESIZE
, KM_SLEEP
);
1287 pc
= (struct path_comp
*)uname
;
1289 * If the first character in target is "/"
1290 * then skip it and create entry for it
1292 if (*target
== '/') {
1295 pc
= (struct path_comp
*)(((char *)pc
) + 4);
1296 while (*target
== '/') {
1301 while (*target
!= '\0') {
1303 while ((*target
!= '/') && (*target
!= '\0')) {
1307 * We got the next component of the
1308 * path name. Create path_comp of
1311 if (((target
- sp
) == 1) && (*sp
== '.')) {
1316 pc
= (struct path_comp
*)(((char *)pc
) + 4);
1317 } else if (((target
- sp
) == 2) &&
1318 (*sp
== '.') && ((*(sp
+ 1)) == '.')) {
1323 pc
= (struct path_comp
*)(((char *)pc
) + 4);
1326 * convert the user given name
1327 * into appropriate form to be put
1330 outlen
= 1024; /* set to size of dname */
1331 if (error
= ud_compress(target
- sp
, &outlen
,
1332 (uint8_t *)sp
, (uint8_t *)dname
)) {
1337 pc
->pc_len
= outlen
;
1338 dname
[outlen
] = '\0';
1339 (void) strcpy((char *)pc
->pc_id
, dname
);
1340 pc
= (struct path_comp
*)
1341 (((char *)pc
) + 4 + outlen
);
1343 while (*target
== '/') {
1346 if (*target
== '\0') {
1351 rw_enter(&ip
->i_contents
, RW_WRITER
);
1354 if (curthread
->t_flag
& T_DONTPEND
) {
1357 error
= ud_rdwri(UIO_WRITE
, ioflag
, ip
,
1358 uname
, ((int8_t *)pc
) - uname
,
1359 0, UIO_SYSSPACE
, (int32_t *)0, cr
);
1363 rw_exit(&ip
->i_contents
);
1364 rw_enter(&dip
->i_rwlock
, RW_WRITER
);
1365 (void) ud_dirremove(dip
, linkname
, NULL
,
1366 NULL
, DR_REMOVE
, cr
, ct
);
1367 rw_exit(&dip
->i_rwlock
);
1370 rw_exit(&ip
->i_contents
);
1373 if ((error
== 0) || (error
== EEXIST
)) {
1379 if (uname
!= NULL
) {
1380 kmem_free(uname
, PAGESIZE
);
1382 if (dname
!= NULL
) {
1383 kmem_free(dname
, 1024);
1395 caller_context_t
*ct
)
1397 int32_t error
= 0, off
, id_len
, size
, len
;
1398 int8_t *dname
= NULL
, *uname
= NULL
;
1399 struct ud_inode
*ip
;
1400 struct fbuf
*fbp
= NULL
;
1401 struct path_comp
*pc
;
1403 ud_printf("udf_readlink\n");
1405 if (vp
->v_type
!= VLNK
) {
1411 if (size
> PAGESIZE
) {
1419 dname
= kmem_zalloc(1024, KM_SLEEP
);
1420 uname
= kmem_zalloc(PAGESIZE
, KM_SLEEP
);
1422 rw_enter(&ip
->i_contents
, RW_READER
);
1424 if ((error
= fbread(vp
, 0, size
, S_READ
, &fbp
)) != 0) {
1430 while (off
< size
) {
1431 pc
= (struct path_comp
*)(fbp
->fb_addr
+ off
);
1432 switch (pc
->pc_type
) {
1434 (void) strcpy(uname
, ip
->i_udf
->udf_fsmnt
);
1435 (void) strcat(uname
, "/");
1438 if (pc
->pc_len
!= 0) {
1445 (void) strcat(uname
, "../");
1448 (void) strcat(uname
, "./");
1451 if ((error
= ud_uncompress(pc
->pc_len
, &id_len
,
1452 pc
->pc_id
, (uint8_t *)dname
)) != 0) {
1455 dname
[id_len
] = '\0';
1456 (void) strcat(uname
, dname
);
1457 (void) strcat(uname
, "/");
1463 off
+= 4 + pc
->pc_len
;
1465 len
= strlen(uname
) - 1;
1466 if (uname
[len
] == '/') {
1469 * special case link to /
1477 error
= uiomove(uname
, len
, UIO_READ
, uiop
);
1483 fbrelse(fbp
, S_OTHER
);
1485 rw_exit(&ip
->i_contents
);
1486 if (uname
!= NULL
) {
1487 kmem_free(uname
, PAGESIZE
);
1489 if (dname
!= NULL
) {
1490 kmem_free(dname
, 1024);
1501 caller_context_t
*ct
)
1504 struct ud_inode
*ip
= VTOI(vp
);
1506 ud_printf("udf_fsync\n");
1508 rw_enter(&ip
->i_contents
, RW_WRITER
);
1509 if (!(IS_SWAPVP(vp
))) {
1510 error
= ud_syncip(ip
, 0, I_SYNC
); /* Do synchronous writes */
1513 error
= ud_sync_indir(ip
);
1515 ITIMES(ip
); /* XXX: is this necessary ??? */
1516 rw_exit(&ip
->i_contents
);
static void
udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
{
	ud_printf("udf_iinactive\n");

	ud_iinactive(VTOI(vp), cr);
}
static int32_t
udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct udf_fid *udfidp;
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_fid\n");

	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
		return (ENOSPC);
	}

	udfidp = (struct udf_fid *)fidp;
	bzero((char *)udfidp, sizeof (struct udf_fid));
	rw_enter(&ip->i_contents, RW_READER);
	udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
	udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
	udfidp->udfid_prn = ip->i_icb_prn;
	udfidp->udfid_icb_lbn = ip->i_icb_block;
	rw_exit(&ip->i_contents);

	return (0);
}
static int
udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_rwlock\n");

	if (write_lock) {
		rw_enter(&ip->i_rwlock, RW_WRITER);
	} else {
		rw_enter(&ip->i_rwlock, RW_READER);
	}
#ifdef	__lock_lint
	rw_exit(&ip->i_rwlock);
#endif
	return (write_lock);
}
static void
udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_rwunlock\n");

#ifdef	__lock_lint
	rw_enter(&ip->i_rwlock, RW_WRITER);
#endif
	rw_exit(&ip->i_rwlock);
}
static int32_t
udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
{
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}
1602 struct flock64
*bfp
,
1605 struct flk_callback
*flk_cbp
,
1607 caller_context_t
*ct
)
1609 struct ud_inode
*ip
= VTOI(vp
);
1611 ud_printf("udf_frlock\n");
1614 * If file is being mapped, disallow frlock.
1615 * XXX I am not holding tlock while checking i_mapcnt because the
1616 * current locking strategy drops all locks before calling fs_frlock.
1617 * So, mapcnt could change before we enter fs_frlock making it
1618 * meaningless to have held tlock in the first place.
1620 if ((ip
->i_mapcnt
> 0) &&
1621 (MANDLOCK(vp
, ip
->i_char
))) {
1625 return (fs_frlock(vp
, cmd
, bfp
, flag
, offset
, flk_cbp
, cr
, ct
));
1633 struct flock64
*bfp
,
1637 caller_context_t
*ct
)
1641 ud_printf("udf_space\n");
1643 if (cmd
!= F_FREESP
) {
1645 } else if ((error
= convoff(vp
, bfp
, 0, offset
)) == 0) {
1646 error
= ud_freesp(vp
, bfp
, flag
, cr
);
1648 if (error
== 0 && bfp
->l_start
== 0)
1649 vnevent_truncate(vp
, ct
);
1662 struct page
**plarr
,
1668 caller_context_t
*ct
)
1670 struct ud_inode
*ip
= VTOI(vp
);
1671 int32_t error
, has_holes
, beyond_eof
, seqmode
, dolock
;
1672 int32_t pgsize
= PAGESIZE
;
1673 struct udf_vfs
*udf_vfsp
= ip
->i_udf
;
1675 uoff_t pgoff
, eoff
, uoff
;
1679 ud_printf("udf_getpage\n");
1681 uoff
= (uoff_t
)off
; /* type conversion */
1685 if (vp
->v_flag
& VNOMAP
) {
1688 seqmode
= ip
->i_nextr
== uoff
&& rw
!= S_CREATE
;
1691 dolock
= (rw_owner(&ip
->i_contents
) != curthread
);
1694 rw_enter(&ip
->i_contents
, rwtype
);
1697 rw_enter(&ip
->i_contents
, rwtype
);
1702 * We may be getting called as a side effect of a bmap using
1703 * fbread() when the blocks might be being allocated and the
1704 * size has not yet been up'ed. In this case we want to be
1705 * able to return zero pages if we get back UDF_HOLE from
1706 * calling bmap for a non write case here. We also might have
1707 * to read some frags from the disk into a page if we are
1708 * extending the number of frags for a given lbn in bmap().
1710 beyond_eof
= uoff
+ len
> ip
->i_size
+ PAGEOFFSET
;
1711 if (beyond_eof
&& seg
!= segkmap
) {
1713 rw_exit(&ip
->i_contents
);
1716 rw_exit(&ip
->i_contents
);
1723 * Must hold i_contents lock throughout the call to pvn_getpages
1724 * since locked pages are returned from each call to ud_getapage.
1725 * Must *not* return locked pages and then try for contents lock
1726 * due to lock ordering requirements (inode > page)
1729 has_holes
= ud_bmap_has_holes(ip
);
1731 if ((rw
== S_WRITE
|| rw
== S_CREATE
) && (has_holes
|| beyond_eof
)) {
1732 int32_t blk_size
, count
;
1736 * We must acquire the RW_WRITER lock in order to
1737 * call bmap_write().
1739 if (dolock
&& rwtype
== RW_READER
) {
1742 if (!rw_tryupgrade(&ip
->i_contents
)) {
1744 rw_exit(&ip
->i_contents
);
1751 * May be allocating disk blocks for holes here as
1752 * a result of mmap faults. write(2) does the bmap_write
1753 * in rdip/wrip, not here. We are not dealing with frags
1757 while ((offset
< uoff
+ len
) &&
1758 (offset
< ip
->i_size
)) {
1760 * the variable "bnp" is to simplify the expression for
1761 * the compiler; * just passing in &bn to bmap_write
1762 * causes a compiler "loop"
1765 blk_size
= udf_vfsp
->udf_lbsize
;
1766 if ((offset
+ blk_size
) > ip
->i_size
) {
1767 count
= ip
->i_size
- offset
;
1771 error
= ud_bmap_write(ip
, offset
, count
, 0, cr
);
1775 offset
+= count
; /* XXX - make this contig */
1780 * Can be a reader from now on.
1783 if (rwtype
== RW_WRITER
) {
1784 rw_downgrade(&ip
->i_contents
);
1787 if (dolock
&& rwtype
== RW_WRITER
) {
1788 rw_downgrade(&ip
->i_contents
);
1793 * We remove PROT_WRITE in cases when the file has UDF holes
1794 * because we don't want to call bmap_read() to check each
1795 * page if it is backed with a disk block.
1797 if (protp
&& has_holes
&& rw
!= S_WRITE
&& rw
!= S_CREATE
) {
1798 *protp
&= ~PROT_WRITE
;
1804 * The loop looks up pages in the range <off, off + len).
1805 * For each page, we first check if we should initiate an asynchronous
1806 * read ahead before we call page_lookup (we may sleep in page_lookup
1807 * for a previously initiated disk read).
1809 eoff
= (uoff
+ len
);
1810 for (pgoff
= uoff
, pgaddr
= addr
, pl
= plarr
;
1811 pgoff
< eoff
; /* empty */) {
1816 se
= ((rw
== S_CREATE
) ? SE_EXCL
: SE_SHARED
);
1819 * Handle async getpage (faultahead)
1821 if (plarr
== NULL
) {
1822 ip
->i_nextrio
= pgoff
;
1823 ud_getpage_ra(vp
, pgoff
, seg
, pgaddr
);
1830 * Check if we should initiate read ahead of next cluster.
1831 * We call page_exists only when we need to confirm that
1832 * we have the current page before we initiate the read ahead.
1834 nextrio
= ip
->i_nextrio
;
1836 pgoff
+ RD_CLUSTSZ(ip
) >= nextrio
&& pgoff
<= nextrio
&&
1837 nextrio
< ip
->i_size
&& page_exists(&vp
->v_object
, pgoff
))
1838 ud_getpage_ra(vp
, pgoff
, seg
, pgaddr
);
1840 if ((pp
= page_lookup(&vp
->v_object
, pgoff
, se
)) != NULL
) {
1843 * We found the page in the page cache.
1853 * We have to create the page, or read it from disk.
1855 if (error
= ud_getpage_miss(vp
, pgoff
, len
,
1856 seg
, pgaddr
, pl
, plsz
, rw
, seqmode
)) {
1860 while (*pl
!= NULL
) {
1871 * Return pages up to plsz if they are in the page cache.
1872 * We cannot return pages if there is a chance that they are
1873 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1875 if (plarr
&& !(has_holes
&& (rw
== S_WRITE
|| rw
== S_CREATE
))) {
1877 ASSERT((protp
== NULL
) ||
1878 !(has_holes
&& (*protp
& PROT_WRITE
)));
1880 eoff
= pgoff
+ plsz
;
1881 while (pgoff
< eoff
) {
1884 if ((pp
= page_lookup_nowait(&vp
->v_object
, pgoff
, SE_SHARED
)) == NULL
)
1894 *pl
= NULL
; /* Terminate page list */
1895 ip
->i_nextr
= pgoff
;
1898 if (error
&& plarr
) {
1900 * Release any pages we have locked.
1902 while (pl
> &plarr
[0])
1910 rw_exit(&ip
->i_contents
);
1913 rw_exit(&ip
->i_contents
);
1918 * If the inode is not already marked for IACC (in rwip() for read)
1919 * and the inode is not marked for no access time update (in rwip()
1920 * for write) then update the inode access time and mod time now.
1922 mutex_enter(&ip
->i_tlock
);
1923 if ((ip
->i_flag
& (IACC
| INOACC
)) == 0) {
1924 if ((rw
!= S_OTHER
) && (ip
->i_type
!= VDIR
)) {
1927 if (rw
== S_WRITE
) {
1932 mutex_exit(&ip
->i_tlock
);
1937 int32_t ud_delay
= 1;
1947 caller_context_t
*ct
)
1949 struct ud_inode
*ip
;
1952 ud_printf("udf_putpage\n");
1956 rw_enter(&ip
->i_contents
, RW_WRITER
);
1959 if (vp
->v_count
== 0) {
1960 cmn_err(CE_WARN
, "ud_putpage : bad v_count");
1965 if (vp
->v_flag
& VNOMAP
) {
1970 if (flags
& B_ASYNC
) {
1971 if (ud_delay
&& len
&&
1972 (flags
& ~(B_ASYNC
|B_DONTNEED
|B_FREE
)) == 0) {
1973 mutex_enter(&ip
->i_tlock
);
1976 * If nobody stalled, start a new cluster.
1978 if (ip
->i_delaylen
== 0) {
1979 ip
->i_delayoff
= off
;
1980 ip
->i_delaylen
= len
;
1981 mutex_exit(&ip
->i_tlock
);
1986 * If we have a full cluster or they are not contig,
1987 * then push last cluster and start over.
1989 if (ip
->i_delaylen
>= WR_CLUSTSZ(ip
) ||
1990 ip
->i_delayoff
+ ip
->i_delaylen
!= off
) {
1994 doff
= ip
->i_delayoff
;
1995 dlen
= ip
->i_delaylen
;
1996 ip
->i_delayoff
= off
;
1997 ip
->i_delaylen
= len
;
1998 mutex_exit(&ip
->i_tlock
);
1999 error
= ud_putpages(vp
, doff
, dlen
, flags
, cr
);
2000 /* LMXXX - flags are new val, not old */
2005 * There is something there, it's not full, and
2008 ip
->i_delaylen
+= len
;
2009 mutex_exit(&ip
->i_tlock
);
2014 * Must have weird flags or we are not clustering.
2018 error
= ud_putpages(vp
, off
, len
, flags
, cr
);
2022 rw_exit(&ip
->i_contents
);
2039 caller_context_t
*ct
)
2041 struct segvn_crargs vn_a
;
2044 ud_printf("udf_map\n");
2046 if (vp
->v_flag
& VNOMAP
) {
2052 ((off
+ len
) < 0)) {
2057 if (vp
->v_type
!= VREG
) {
2063 * If file is being locked, disallow mapping.
2065 if (vn_has_mandatory_locks(vp
, VTOI(vp
)->i_char
)) {
2071 error
= choose_addr(as
, addrp
, len
, off
, ADDR_VACALIGN
, flags
);
2079 vn_a
.type
= flags
& MAP_TYPE
;
2081 vn_a
.maxprot
= maxprot
;
2084 vn_a
.flags
= flags
& ~MAP_TYPE
;
2086 vn_a
.lgrp_mem_policy_flags
= 0;
2088 error
= as_map(as
, *addrp
, len
, segvn_create
, (caddr_t
)&vn_a
);
2097 udf_addmap(struct vnode
*vp
,
2106 caller_context_t
*ct
)
2108 struct ud_inode
*ip
= VTOI(vp
);
2110 ud_printf("udf_addmap\n");
2112 if (vp
->v_flag
& VNOMAP
) {
2116 mutex_enter(&ip
->i_tlock
);
2117 ip
->i_mapcnt
+= btopr(len
);
2118 mutex_exit(&ip
->i_tlock
);
2126 struct vnode
*vp
, offset_t off
,
2134 caller_context_t
*ct
)
2136 struct ud_inode
*ip
= VTOI(vp
);
2138 ud_printf("udf_delmap\n");
2140 if (vp
->v_flag
& VNOMAP
) {
2144 mutex_enter(&ip
->i_tlock
);
2145 ip
->i_mapcnt
-= btopr(len
); /* Count released mappings */
2146 ASSERT(ip
->i_mapcnt
>= 0);
2147 mutex_exit(&ip
->i_tlock
);
2159 caller_context_t
*ct
)
2163 ud_printf("udf_l_pathconf\n");
2165 if (cmd
== _PC_FILESIZEBITS
) {
2167 * udf supports 64 bits as file size
2168 * but there are several other restrictions
2169 * it only supports 32-bit block numbers and
2170 * daddr32_t is only an int32_t so taking these
2171 * into account we can stay just as where ufs is
2174 } else if (cmd
== _PC_TIMESTAMP_RESOLUTION
) {
2175 /* nanosecond timestamp resolution */
2178 error
= fs_pathconf(vp
, cmd
, valp
, cr
, ct
);
2184 uint32_t ud_pageio_reads
= 0, ud_pageio_writes
= 0;
2185 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads
))
2186 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes
))
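/*
 * ud_pageio_reads/ud_pageio_writes appear to be simple activity counters
 * for udf_pageio(); the _NOTE annotations above tell lint/warlock that
 * unsynchronized updates to them are acceptable.
 */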
2188 * Assumption is that there will not be a pageio request
2189 * to an embedded file
2200 caller_context_t
*ct
)
2204 struct ud_inode
*ip
= VTOI(vp
);
2205 int32_t dolock
, error
= 0, contig
, multi_io
;
2206 size_t done_len
= 0, cur_len
= 0;
2207 page_t
*npp
= NULL
, *opp
= NULL
, *cpp
= pp
;
2213 dolock
= (rw_owner(&ip
->i_contents
) != curthread
);
2216 * We need a better check. Ideally, we would use another
2217 * vnodeops so that hlocked and forcibly unmounted file
2218 * systems would return EIO where appropriate and w/o the
2219 * need for these checks.
2221 if (ip
->i_udf
== NULL
) {
2226 rw_enter(&ip
->i_contents
, RW_READER
);
2229 rw_enter(&ip
->i_contents
, RW_READER
);
2234 * Break the io request into chunks, one for each contiguous
2235 * stretch of disk blocks in the target file.
2237 while (done_len
< io_len
) {
2241 if (error
= ud_bmap_read(ip
, (uoff_t
)(io_off
+ done_len
),
2246 if (bn
== UDF_HOLE
) { /* No holey swapfiles */
2247 cmn_err(CE_WARN
, "SWAP file has HOLES");
2252 cur_len
= MIN(io_len
- done_len
, contig
);
2255 * Check if more than one I/O is
2256 * required to complete the given
2259 if (ip
->i_udf
->udf_lbsize
< PAGESIZE
) {
2260 if (cur_len
>= PAGESIZE
) {
2262 cur_len
&= PAGEMASK
;
2265 cur_len
= MIN(io_len
- done_len
, PAGESIZE
);
2268 page_list_break(&cpp
, &npp
, btop(cur_len
));
2270 bp
= pageio_setup(cpp
, cur_len
, ip
->i_devvp
, flags
);
2273 bp
->b_edev
= ip
->i_dev
;
2274 bp
->b_dev
= cmpdev(ip
->i_dev
);
2276 bp
->b_un
.b_addr
= (caddr_t
)0;
2278 bp
->b_offset
= (offset_t
)(io_off
+ done_len
);
2281 * ub.ub_pageios.value.ul++;
2283 if (multi_io
== 0) {
2284 (void) bdev_strategy(bp
);
2286 error
= ud_multi_strat(ip
, cpp
, bp
,
2287 (uoff_t
)(io_off
+ done_len
));
2293 if (flags
& B_READ
) {
2300 * If the request is not B_ASYNC, wait for i/o to complete
2301 * and re-assemble the page list to return to the caller.
2302 * If it is B_ASYNC we leave the page list in pieces and
2303 * cleanup() will dispose of them.
2305 if ((flags
& B_ASYNC
) == 0) {
2306 error
= biowait(bp
);
2311 page_list_concat(&opp
, &cpp
);
2315 done_len
+= cur_len
;
2318 ASSERT(error
|| (cpp
== NULL
&& npp
== NULL
&& done_len
== io_len
));
2320 if (flags
& B_ASYNC
) {
2321 /* Cleanup unprocessed parts of list */
2322 page_list_concat(&cpp
, &npp
);
2323 if (flags
& B_READ
) {
2324 pvn_read_done(cpp
, B_ERROR
);
2326 pvn_write_done(cpp
, B_ERROR
);
2329 /* Re-assemble list and let caller clean up */
2330 page_list_concat(&opp
, &cpp
);
2331 page_list_concat(&opp
, &npp
);
2336 rw_exit(&ip
->i_contents
);
2339 rw_exit(&ip
->i_contents
);
2348 /* -------------------- local functions --------------------------- */
2353 ud_rdwri(enum uio_rw rw
, int32_t ioflag
,
2354 struct ud_inode
*ip
, caddr_t base
, int32_t len
,
2355 offset_t offset
, enum uio_seg seg
, int32_t *aresid
, struct cred
*cr
)
2361 ud_printf("ud_rdwri\n");
2363 bzero((caddr_t
)&auio
, sizeof (uio_t
));
2364 bzero((caddr_t
)&aiov
, sizeof (iovec_t
));
2366 aiov
.iov_base
= base
;
2368 auio
.uio_iov
= &aiov
;
2369 auio
.uio_iovcnt
= 1;
2370 auio
.uio_loffset
= offset
;
2371 auio
.uio_segflg
= (int16_t)seg
;
2372 auio
.uio_resid
= len
;
2374 if (rw
== UIO_WRITE
) {
2375 auio
.uio_fmode
= FWRITE
;
2376 auio
.uio_extflg
= UIO_COPY_DEFAULT
;
2377 auio
.uio_llimit
= curproc
->p_fsz_ctl
;
2378 error
= ud_wrip(ip
, &auio
, ioflag
, cr
);
2380 auio
.uio_fmode
= FREAD
;
2381 auio
.uio_extflg
= UIO_COPY_CACHED
;
2382 auio
.uio_llimit
= MAXOFFSET_T
;
2383 error
= ud_rdip(ip
, &auio
, ioflag
, cr
);
2387 *aresid
= auio
.uio_resid
;
2388 } else if (auio
.uio_resid
) {
2395 * Free behind hacks. The pager is busted.
2396 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2397 * or B_FREE_IF_TIGHT_ON_MEMORY.
2399 int32_t ud_freebehind
= 1;
2400 int32_t ud_smallfile
= 32 * 1024;
2404 ud_getpage_miss(struct vnode
*vp
, uoff_t off
,
2405 size_t len
, struct seg
*seg
, caddr_t addr
, page_t
*pl
[],
2406 size_t plsz
, enum seg_rw rw
, int32_t seq
)
2408 struct ud_inode
*ip
= VTOI(vp
);
2418 * Figure out whether the page can be created, or must be
2419 * read from the disk
2421 if (rw
== S_CREATE
) {
2422 if ((pp
= page_create_va(&vp
->v_object
, off
,
2423 PAGESIZE
, PG_WAIT
, seg
, addr
)) == NULL
) {
2424 cmn_err(CE_WARN
, "ud_getpage_miss: page_create");
2429 pp
= pvn_read_kluster(vp
, off
, seg
, addr
, &io_off
,
2430 &io_len
, off
, PAGESIZE
, 0);
2433 * Some other thread has entered the page.
2434 * ud_getpage will retry page_lookup.
2441 * Fill the page with as much data as we can from the file.
2443 err
= ud_page_fill(ip
, pp
, off
, B_READ
, &pgoff
);
2445 pvn_read_done(pp
, B_ERROR
);
2450 * XXX ??? ufs has io_len instead of pgoff below
2452 ip
->i_nextrio
= off
+ ((pgoff
+ PAGESIZE
- 1) & PAGEMASK
);
2455 * If the file access is sequential, initiate read ahead
2456 * of the next cluster.
2458 if (seq
&& ip
->i_nextrio
< ip
->i_size
) {
2459 ud_getpage_ra(vp
, off
, seg
, addr
);
2464 pvn_plist_init(pp
, pl
, plsz
, (offset_t
)off
, io_len
, rw
);
2470 ud_getpage_ra(struct vnode
*vp
,
2471 uoff_t off
, struct seg
*seg
, caddr_t addr
)
2475 struct ud_inode
*ip
= VTOI(vp
);
2476 uoff_t io_off
= ip
->i_nextrio
, pgoff
;
2477 caddr_t addr2
= addr
+ (io_off
- off
);
2482 * Is this test needed?
2485 if (addr2
>= seg
->s_base
+ seg
->s_size
) {
2490 if (ud_bmap_read(ip
, io_off
, &bn
, &contig
) != 0 || bn
== UDF_HOLE
) {
2494 pp
= pvn_read_kluster(vp
, io_off
, seg
, addr2
,
2495 &io_off
, &io_len
, io_off
, PAGESIZE
, 1);
2498 * Some other thread has entered the page.
2499 * So no read ahead is done here (i.e. we will have to wait
2500 * for the read when needed).
2507 (void) ud_page_fill(ip
, pp
, io_off
, (B_READ
|B_ASYNC
), &pgoff
);
2508 ip
->i_nextrio
= io_off
+ ((pgoff
+ PAGESIZE
- 1) & PAGEMASK
);
2512 ud_page_fill(struct ud_inode
*ip
, page_t
*pp
, uoff_t off
,
2513 uint32_t bflgs
, uoff_t
*pg_off
)
2517 caddr_t kaddr
, caddr
;
2518 int32_t error
= 0, contig
= 0, multi_io
= 0;
2519 int32_t lbsize
= ip
->i_udf
->udf_lbsize
;
2520 int32_t lbmask
= ip
->i_udf
->udf_lbmask
;
2523 isize
= (ip
->i_size
+ lbmask
) & (~lbmask
);
2524 if (ip
->i_desc_type
== ICB_FLAG_ONE_AD
) {
2527 * Embedded file read file_entry
2528 * from buffer cache and copy the required
2531 bp
= ud_bread(ip
->i_dev
,
2532 ip
->i_icb_lbano
<< ip
->i_udf
->udf_l2d_shift
, lbsize
);
2533 if ((bp
->b_error
== 0) &&
2534 (bp
->b_resid
== 0)) {
2536 caddr
= bp
->b_un
.b_addr
+ ip
->i_data_off
;
2541 kaddr
= (caddr_t
)ppmapin(pp
,
2542 PROT_READ
| PROT_WRITE
, (caddr_t
)-1);
2543 (void) kcopy(caddr
, kaddr
, ip
->i_size
);
2551 contig
= ip
->i_size
;
2555 * Get the continuous size and block number
2558 if (error
= ud_bmap_read(ip
, off
, &bn
, &contig
))
2560 contig
= MIN(contig
, PAGESIZE
);
2561 contig
= (contig
+ lbmask
) & (~lbmask
);
2564 * Zero part of the page which we are not
2565 * going to read from the disk.
2568 if (bn
== UDF_HOLE
) {
2571 * This is a HOLE. Just zero out
2574 if (((off
+ contig
) == isize
) ||
2575 (contig
== PAGESIZE
)) {
2576 pagezero(pp
->p_prev
, 0, PAGESIZE
);
2581 if (contig
< PAGESIZE
) {
2584 count
= isize
- off
;
2585 if (contig
!= count
) {
2587 contig
= (int32_t)(MIN(count
, PAGESIZE
));
2589 pagezero(pp
->p_prev
, contig
, PAGESIZE
- contig
);
2594 * Get a bp and initialize it
2596 bp
= pageio_setup(pp
, contig
, ip
->i_devvp
, bflgs
);
2599 bp
->b_edev
= ip
->i_dev
;
2600 bp
->b_dev
= cmpdev(ip
->i_dev
);
2602 bp
->b_un
.b_addr
= 0;
2603 bp
->b_file
= ip
->i_vnode
;
2608 if (multi_io
== 0) {
2611 * Single I/O is sufficient for this page
2613 (void) bdev_strategy(bp
);
2617 * We need to do the I/O in
2620 error
= ud_multi_strat(ip
, pp
, bp
, off
);
2625 if ((bflgs
& B_ASYNC
) == 0) {
2628 * Wait for i/o to complete.
2631 error
= biowait(bp
);
2638 if ((off
+ contig
) >= ip
->i_size
) {
2639 contig
= ip
->i_size
- off
;
2648 ud_putpages(struct vnode
*vp
, offset_t off
,
2649 size_t len
, int32_t flags
, struct cred
*cr
)
2651 struct ud_inode
*ip
;
2659 ud_printf("ud_putpages\n");
2661 if (vp
->v_count
== 0) {
2662 cmn_err(CE_WARN
, "ud_putpages: bad v_count");
2669 * Acquire the readers/write inode lock before locking
2670 * any pages in this inode.
2671 * The inode lock is held during i/o.
2674 mutex_enter(&ip
->i_tlock
);
2675 ip
->i_delayoff
= ip
->i_delaylen
= 0;
2676 mutex_exit(&ip
->i_tlock
);
2679 rw_enter(&ip
->i_contents
, RW_READER
);
2681 dolock
= (rw_owner(&ip
->i_contents
) != curthread
);
2683 rw_enter(&ip
->i_contents
, RW_READER
);
2687 if (!vn_has_cached_data(vp
)) {
2689 rw_exit(&ip
->i_contents
);
2692 rw_exit(&ip
->i_contents
);
2700 * Search the entire vp list for pages >= off.
2702 err
= pvn_vplist_dirty(vp
, (uoff_t
)off
, ud_putapage
,
2706 * Loop over all offsets in the range looking for
2707 * pages to deal with.
2709 if ((eoff
= blkroundup(ip
->i_udf
, ip
->i_size
)) != 0) {
2710 eoff
= MIN(off
+ len
, eoff
);
2715 for (io_off
= off
; io_off
< eoff
; io_off
+= io_len
) {
2717 * If we are not invalidating, synchronously
2718 * freeing or writing pages, use the routine
2719 * page_lookup_nowait() to prevent reclaiming
2720 * them from the free list.
2722 if ((flags
& B_INVAL
) || ((flags
& B_ASYNC
) == 0)) {
2723 pp
= page_lookup(&vp
->v_object
, io_off
,
2724 (flags
& (B_INVAL
| B_FREE
)) ? SE_EXCL
: SE_SHARED
);
2726 pp
= page_lookup_nowait(&vp
->v_object
,
2728 (flags
& B_FREE
) ? SE_EXCL
: SE_SHARED
);
2731 if (pp
== NULL
|| pvn_getdirty(pp
, flags
) == 0) {
2735 err
= ud_putapage(vp
, pp
,
2736 &io_off
, &io_len
, flags
, cr
);
2741 * "io_off" and "io_len" are returned as
2742 * the range of pages we actually wrote.
2743 * This allows us to skip ahead more quickly
2744 * since several pages may've been dealt
2745 * with by this iteration of the loop.
2750 if (err
== 0 && off
== 0 && (len
== 0 || len
>= ip
->i_size
)) {
2752 * We have just sync'ed back all the pages on
2753 * the inode, turn off the IMODTIME flag.
2755 mutex_enter(&ip
->i_tlock
);
2756 ip
->i_flag
&= ~IMODTIME
;
2757 mutex_exit(&ip
->i_tlock
);
2760 rw_exit(&ip
->i_contents
);
2763 rw_exit(&ip
->i_contents
);
2771 ud_putapage(struct vnode
*vp
,
2772 page_t
*pp
, uoff_t
*offp
,
2773 size_t *lenp
, int32_t flags
, struct cred
*cr
)
2777 struct ud_inode
*ip
;
2778 int32_t error
= 0, contig
, multi_io
= 0;
2779 struct udf_vfs
*udf_vfsp
;
2781 caddr_t kaddr
, caddr
;
2782 struct buf
*bp
= NULL
;
2786 struct file_entry
*fe
;
2788 ud_printf("ud_putapage\n");
2792 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
2793 lbmask
= ip
->i_udf
->udf_lbmask
;
2794 isize
= (ip
->i_size
+ lbmask
) & (~lbmask
);
2796 udf_vfsp
= ip
->i_udf
;
2797 ASSERT(udf_vfsp
->udf_flags
& UDF_FL_RW
);
2800 * If the modified time on the inode has not already been
2801 * set elsewhere (e.g. for write/setattr) we set the time now.
2802 * This gives us approximate modified times for mmap'ed files
2803 * which are modified via stores in the user address space.
2805 if (((ip
->i_flag
& IMODTIME
) == 0) || (flags
& B_FORCE
)) {
2806 mutex_enter(&ip
->i_tlock
);
2809 mutex_exit(&ip
->i_tlock
);
2814 * Align the request to a block boundary (for old file systems),
2815 * and go ask bmap() how contiguous things are for this file.
2817 off
= pp
->p_offset
& ~(offset_t
)lbmask
;
2818 /* block align it */
2821 if (ip
->i_desc_type
== ICB_FLAG_ONE_AD
) {
2822 ASSERT(ip
->i_size
<= ip
->i_max_emb
);
2824 pp
= pvn_write_kluster(vp
, pp
, &io_off
,
2825 &io_len
, off
, PAGESIZE
, flags
);
2830 bp
= ud_bread(ip
->i_dev
,
2831 ip
->i_icb_lbano
<< udf_vfsp
->udf_l2d_shift
,
2832 udf_vfsp
->udf_lbsize
);
2833 fe
= (struct file_entry
*)bp
->b_un
.b_addr
;
2834 if ((bp
->b_flags
& B_ERROR
) ||
2835 (ud_verify_tag_and_desc(&fe
->fe_tag
, UD_FILE_ENTRY
,
2837 1, udf_vfsp
->udf_lbsize
) != 0)) {
2839 pvn_write_done(pp
, B_ERROR
| B_WRITE
| flags
);
2840 if (bp
->b_flags
& B_ERROR
) {
2848 if ((bp
->b_error
== 0) &&
2849 (bp
->b_resid
== 0)) {
2851 caddr
= bp
->b_un
.b_addr
+ ip
->i_data_off
;
2852 kaddr
= (caddr_t
)ppmapin(pp
,
2853 PROT_READ
| PROT_WRITE
, (caddr_t
)-1);
2854 (void) kcopy(kaddr
, caddr
, ip
->i_size
);
2857 crc_len
= offsetof(struct file_entry
, fe_spec
) +
2858 SWAP_32(fe
->fe_len_ear
);
2859 crc_len
+= ip
->i_size
;
2860 ud_make_tag(ip
->i_udf
, &fe
->fe_tag
,
2861 UD_FILE_ENTRY
, ip
->i_icb_block
, crc_len
);
2865 if (flags
& B_ASYNC
) {
2866 pvn_write_done(pp
, flags
);
2868 contig
= ip
->i_size
;
2871 if (error
= ud_bmap_read(ip
, off
, &bn
, &contig
)) {
2874 contig
= MIN(contig
, PAGESIZE
);
2875 contig
= (contig
+ lbmask
) & (~lbmask
);
2877 if (contig
< PAGESIZE
) {
2880 count
= isize
- off
;
2881 if (contig
!= count
) {
2883 contig
= (int32_t)(MIN(count
, PAGESIZE
));
2887 if ((off
+ contig
) > isize
) {
2888 contig
= isize
- off
;
2891 if (contig
> PAGESIZE
) {
2892 if (contig
& PAGEOFFSET
) {
2897 pp
= pvn_write_kluster(vp
, pp
, &io_off
,
2898 &io_len
, off
, contig
, flags
);
2903 bp
= pageio_setup(pp
, contig
, ip
->i_devvp
, B_WRITE
| flags
);
2906 bp
->b_edev
= ip
->i_dev
;
2907 bp
->b_dev
= cmpdev(ip
->i_dev
);
2909 bp
->b_un
.b_addr
= 0;
2911 bp
->b_offset
= (offset_t
)off
;
2917 ASSERT(bp
->b_iodone
== NULL
);
2918 bp
->b_iodone
= ud_iodone
;
2919 mutex_enter(&ip
->i_tlock
);
2920 ip
->i_writes
+= bp
->b_bcount
;
2921 mutex_exit(&ip
->i_tlock
);
2923 if (multi_io
== 0) {
2925 (void) bdev_strategy(bp
);
2927 error
= ud_multi_strat(ip
, pp
, bp
, off
);
2933 if ((flags
& B_ASYNC
) == 0) {
2935 * Wait for i/o to complete.
2937 error
= biowait(bp
);
2942 if ((flags
& B_ASYNC
) == 0) {
2943 pvn_write_done(pp
, ((error
) ? B_ERROR
: 0) | B_WRITE
| flags
);
2949 if (error
!= 0 && pp
!= NULL
) {
2950 pvn_write_done(pp
, B_ERROR
| B_WRITE
| flags
);
2965 ud_iodone(struct buf
*bp
)
2967 struct ud_inode
*ip
;
2969 VERIFY(bp
->b_pages
->p_object
!= NULL
);
2970 ASSERT(bp
->b_pages
->p_vnode
!= NULL
);
2971 ASSERT(!(bp
->b_flags
& B_READ
));
2973 bp
->b_iodone
= NULL
;
2975 ip
= VTOI(bp
->b_pages
->p_vnode
);
2977 mutex_enter(&ip
->i_tlock
);
2978 if (ip
->i_writes
>= ud_LW
) {
2979 if ((ip
->i_writes
-= bp
->b_bcount
) <= ud_LW
) {
2981 cv_broadcast(&ip
->i_wrcv
); /* wake all up */
2985 ip
->i_writes
-= bp
->b_bcount
;
2987 mutex_exit(&ip
->i_tlock
);
2994 ud_rdip(struct ud_inode
*ip
, struct uio
*uio
, int32_t ioflag
, cred_t
*cr
)
2997 struct udf_vfs
*udf_vfsp
;
3001 int32_t error
, n
, on
, mapon
, dofree
;
3003 long oresid
= uio
->uio_resid
;
3005 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
3006 if ((ip
->i_type
!= VREG
) &&
3007 (ip
->i_type
!= VDIR
) &&
3008 (ip
->i_type
!= VLNK
)) {
3012 if (uio
->uio_loffset
> MAXOFFSET_T
) {
3016 if ((uio
->uio_loffset
< 0) ||
3017 ((uio
->uio_loffset
+ uio
->uio_resid
) < 0)) {
3020 if (uio
->uio_resid
== 0) {
3025 udf_vfsp
= ip
->i_udf
;
3026 mutex_enter(&ip
->i_tlock
);
3028 mutex_exit(&ip
->i_tlock
);
3030 rwtype
= (rw_write_held(&ip
->i_contents
)?RW_WRITER
:RW_READER
);
3034 uoff_t uoff
= uio
->uio_loffset
;
3035 off
= uoff
& (offset_t
)MAXBMASK
;
3036 mapon
= (int)(uoff
& (offset_t
)MAXBOFFSET
);
3037 on
= (int)blkoff(udf_vfsp
, uoff
);
3038 n
= (int)MIN(udf_vfsp
->udf_lbsize
- on
, uio
->uio_resid
);
3040 diff
= ip
->i_size
- uoff
;
3046 if (diff
< (offset_t
)n
) {
3049 dofree
= ud_freebehind
&&
3050 ip
->i_nextr
== (off
& PAGEMASK
) &&
3054 if (rwtype
== RW_READER
) {
3055 rw_exit(&ip
->i_contents
);
3059 base
= segmap_getmapflt(segkmap
, vp
, (off
+ mapon
),
3060 (uint32_t)n
, 1, S_READ
);
3061 error
= uiomove(base
+ mapon
, (long)n
, UIO_READ
, uio
);
3066 * If read a whole block, or read to eof,
3067 * won't need this buffer again soon.
3069 if (n
+ on
== MAXBSIZE
&& ud_freebehind
&& dofree
&&
3070 freemem
< lotsfree
+ pages_before_pager
) {
3071 flags
= SM_FREE
| SM_DONTNEED
|SM_ASYNC
;
3074 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3075 * we want to make sure that the page which has
3076 * been read, is written on disk if it is dirty.
3077 * And corresponding indirect blocks should also
3080 if ((ioflag
& FRSYNC
) && (ioflag
& (FSYNC
|FDSYNC
))) {
3084 error
= segmap_release(segkmap
, base
, flags
);
3086 (void) segmap_release(segkmap
, base
, flags
);
3090 if (rwtype
== RW_READER
) {
3091 rw_enter(&ip
->i_contents
, rwtype
);
3094 } while (error
== 0 && uio
->uio_resid
> 0 && n
!= 0);
3097 * Inode is updated according to this table if FRSYNC is set.
3099 * FSYNC FDSYNC(posix.4)
3100 * --------------------------
3101 * always IATTCHG|IBDWRITE
3103 if (ioflag
& FRSYNC
) {
3104 if ((ioflag
& FSYNC
) ||
3105 ((ioflag
& FDSYNC
) &&
3106 (ip
->i_flag
& (IATTCHG
|IBDWRITE
)))) {
3107 rw_exit(&ip
->i_contents
);
3108 rw_enter(&ip
->i_contents
, RW_WRITER
);
3113 * If we've already done a partial read, terminate
3114 * the read but return no error.
3116 if (oresid
!= uio
->uio_resid
) {
3125 ud_wrip(struct ud_inode
*ip
, struct uio
*uio
, int ioflag
, struct cred
*cr
)
3129 struct udf_vfs
*udf_vfsp
;
3131 int32_t error
= 0, iupdat_flag
, n
, on
, mapon
, i_size_changed
= 0;
3132 int32_t pagecreate
, newpage
;
3133 uint64_t old_i_size
;
3135 long start_resid
= uio
->uio_resid
, premove_resid
;
3136 rlim64_t limit
= uio
->uio_limit
;
3139 ASSERT(RW_WRITE_HELD(&ip
->i_contents
));
3140 if ((ip
->i_type
!= VREG
) &&
3141 (ip
->i_type
!= VDIR
) &&
3142 (ip
->i_type
!= VLNK
)) {
3146 if (uio
->uio_loffset
>= MAXOFFSET_T
) {
3150 * see udf_l_pathconf
3152 if (limit
> (((uint64_t)1 << 40) - 1)) {
3153 limit
= ((uint64_t)1 << 40) - 1;
3155 if (uio
->uio_loffset
>= limit
) {
3156 proc_t
*p
= ttoproc(curthread
);
3158 mutex_enter(&p
->p_lock
);
3159 (void) rctl_action(rctlproc_legacy
[RLIMIT_FSIZE
], p
->p_rctls
,
3160 p
, RCA_UNSAFE_SIGINFO
);
3161 mutex_exit(&p
->p_lock
);
3164 if ((uio
->uio_loffset
< 0) ||
3165 ((uio
->uio_loffset
+ uio
->uio_resid
) < 0)) {
3168 if (uio
->uio_resid
== 0) {
3172 mutex_enter(&ip
->i_tlock
);
3173 ip
->i_flag
|= INOACC
;
3175 if (ioflag
& (FSYNC
| FDSYNC
)) {
3176 ip
->i_flag
|= ISYNC
;
3179 mutex_exit(&ip
->i_tlock
);
3181 udf_vfsp
= ip
->i_udf
;
	do {
		uoff_t uoff = uio->uio_loffset;
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(udf_vfsp, uoff);
		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		if (ip->i_type == VREG && uoff + n >= limit) {
			if (uoff >= limit) {
				error = EFBIG;
				goto out;
			}
			n = (int)(limit - (rlim64_t)uoff);
		}
		if (uoff + n > ip->i_size) {
			/*
			 * We are extending the length of the file.
			 * bmap is used so that we are sure that
			 * if we need to allocate new blocks, that it
			 * is done here before we up the file size.
			 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), mapon == 0, cr);
			if (error) {
				break;
			}
			i_size_changed = 1;
			old_i_size = ip->i_size;
			ip->i_size = uoff + n;
			/*
			 * If we are writing from the beginning of
			 * the mapping, we can just create the
			 * pages without having to read them.
			 */
			pagecreate = (mapon == 0);
		} else if (n == MAXBSIZE) {
			/*
			 * Going to do a whole mappings worth,
			 * so we can just create the pages w/o
			 * having to read them in. But before
			 * we do that, we need to make sure any
			 * needed blocks are allocated first.
			 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), 1, cr);
			if (error) {
				break;
			}
			pagecreate = 1;
		} else {
			pagecreate = 0;
		}

		rw_exit(&ip->i_contents);

		/*
		 * Touch the page and fault it in if it is not in
		 * core before segmap_getmapflt can lock it. This
		 * is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to
		 * write to.
		 */
		uio_prefaultpages((long)n, uio);

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
		    (uint32_t)n, !pagecreate, S_WRITE);

		/*
		 * segmap_pagecreate() returns 1 if it calls
		 * page_create_va() to allocate any pages.
		 */
		newpage = 0;
		if (pagecreate) {
			newpage = segmap_pagecreate(segkmap, base,
			    (size_t)n, 0);
		}

		premove_resid = uio->uio_resid;
		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);

		if (pagecreate &&
		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
			 */
			int nzero, nmoved;

			nmoved = (int)(uio->uio_loffset - (off + mapon));
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
		}

		/*
		 * Unlock the pages allocated by page_create_va()
		 * in segmap_pagecreate()
		 */
		if (newpage) {
			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
		}

		if (error) {
			/*
			 * If we failed on a write, we may have already
			 * allocated file blocks as well as pages. It's
			 * hard to undo the block allocation, but we must
			 * be sure to invalidate any pages that may have
			 * been allocated.
			 */
			(void) segmap_release(segkmap, base, SM_INVAL);
		} else {
			flags = 0;
			/*
			 * Force write back for synchronous write cases.
			 */
			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
				/*
				 * If the sticky bit is set but the
				 * execute bit is not set, we do a
				 * synchronous write back and free
				 * the page when done. We set up swap
				 * files to be handled this way to
				 * prevent servers from keeping around
				 * the client's swap pages too long.
				 * XXX - there ought to be a better way.
				 */
				if (IS_SWAPVP(vp)) {
					flags = SM_WRITE | SM_FREE |
					    SM_DONTNEED;
					iupdat_flag = 0;
				} else {
					flags = SM_WRITE;
				}
			} else if (((mapon + n) == MAXBSIZE) ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			}
			error = segmap_release(segkmap, base, flags);

			/*
			 * If the operation failed and is synchronous,
			 * then we need to unwind what uiomove() last
			 * did so we can potentially return an error to
			 * the caller. If this write operation was
			 * done in two pieces and the first succeeded,
			 * then we won't return an error for the second
			 * piece that failed. However, we only want to
			 * return a resid value that reflects what was
			 * really done.
			 *
			 * Failures for non-synchronous operations can
			 * be ignored since the page subsystem will
			 * retry the operation until it succeeds or the
			 * file system is unmounted.
			 */
			if (error) {
				if ((ioflag & (FSYNC | FDSYNC)) ||
				    ip->i_type == VDIR) {
					uio->uio_resid = premove_resid;
				} else {
					error = 0;
				}
			}
		}

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&ip->i_contents, RW_WRITER);
		/*
		 * If the uiomove() failed or if a synchronous
		 * page push failed, fix up i_size.
		 */
		if (error) {
			if (i_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks, so get rid
				 * of them.
				 */
				(void) ud_itrunc(ip, old_i_size, 0, cr);
			}
		} else {
			/*
			 * XXX - Can this be out of the loop?
			 */
			ip->i_flag |= IUPD | ICHG;
			if (i_size_changed) {
				ip->i_flag |= IATTCHG;
			}
			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
			    (IEXEC >> 10))) != 0 &&
			    (ip->i_char & (ISUID | ISGID)) != 0 &&
			    secpolicy_vnode_setid_retain(cr,
			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
				/*
				 * Clear Set-UID & Set-GID bits on
				 * successful write if not privileged
				 * and at least one of the execute bits
				 * is set. If we always clear Set-GID,
				 * mandatory file and record locking is
				 * unuseable.
				 */
				ip->i_char &= ~(ISUID | ISGID);
			}
		}
	} while (error == 0 && uio->uio_resid > 0 && n != 0);

out:
	/*
	 * Inode is updated according to this table -
	 *
	 *	FSYNC	FDSYNC(posix.4)
	 *	--------------------------
	 *	always@	IATTCHG|IBDWRITE
	 *
	 * @ -	If we are doing synchronous write the only time we should
	 *	not be sync'ing the ip here is if we have the stickyhack
	 *	activated, the file is marked with the sticky bit and
	 *	no exec bit, the file length has not been changed and
	 *	no new blocks have been allocated during this write.
	 */
	if ((ip->i_flag & ISYNC) != 0) {
		/*
		 * we have eliminated nosync
		 */
		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
		    ((ioflag & FSYNC) && iupdat_flag)) {
			ud_iupdat(ip, 1);
		}
	}

	/*
	 * If we've already done a partial-write, terminate
	 * the write but return no error.
	 */
	if (start_resid != uio->uio_resid) {
		error = 0;
	}
	ip->i_flag &= ~(INOACC | ISYNC);

	return (error);
}
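
/*
 * ud_multi_strat() handles I/O for a page whose backing blocks are not
 * contiguous on the media.  The original buf is cloned into one slave
 * buf per contiguous extent (holes are zero-filled for reads), the
 * slaves are issued with bdev_strategy(), and the master structure
 * tracks the outstanding byte count so that the last slave to complete
 * can finish the original buf in ud_slave_done().
 */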
int32_t
ud_multi_strat(struct ud_inode *ip,
	page_t *pp, struct buf *bp, uoff_t start)
{
	daddr_t bn;
	int32_t error = 0, io_count, contig, alloc_sz, i;
	uint32_t io_off;
	mio_master_t *mm = NULL;
	mio_slave_t *ms = NULL;
	struct buf *rbp;

	ASSERT(!(start & PAGEOFFSET));

	/*
	 * Figure out how many buffers to allocate
	 */
	io_count = 0;
	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
		contig = 0;
		if (error = ud_bmap_read(ip, (uoff_t)(start + io_off),
		    &bn, &contig)) {
			goto end;
		}
		if (contig == 0) {
			goto end;
		}
		contig = MIN(contig, PAGESIZE - io_off);
		if (bn != UDF_HOLE) {
			io_count++;
		} else {
			/*
			 * HOLE
			 */
			if (bp->b_flags & B_READ) {
				/*
				 * This is a hole and is read
				 * it should be filled with 0's
				 */
				pagezero(pp, io_off, contig);
			}
		}
	}

	if (io_count != 0) {

		/*
		 * Allocate memory for all the
		 * required number of buffers
		 */
		alloc_sz = sizeof (mio_master_t) +
		    (sizeof (mio_slave_t) * io_count);
		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);

		/*
		 * initialize master
		 */
		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
		mm->mm_size = alloc_sz;
		mm->mm_bp = bp;
		mm->mm_resid = 0;
		mm->mm_error = 0;
		mm->mm_index = master_index++;

		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));

		/*
		 * Initialize buffers
		 */
		io_count = 0;
		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
			contig = 0;
			if (error = ud_bmap_read(ip,
			    (uoff_t)(start + io_off),
			    &bn, &contig)) {
				goto end;
			}
			ASSERT(contig);
			if ((io_off + contig) > bp->b_bcount) {
				contig = bp->b_bcount - io_off;
			}
			if (bn != UDF_HOLE) {
				/*
				 * Clone the buffer
				 * and prepare to start I/O
				 */
				ms->ms_ptr = mm;
				bioinit(&ms->ms_buf);
				rbp = bioclone(bp, io_off, (size_t)contig,
				    bp->b_edev, bn, ud_slave_done,
				    &ms->ms_buf, KM_NOSLEEP);
				ASSERT(rbp == &ms->ms_buf);
				mm->mm_resid += contig;
				io_count++;
				ms++;
			}
		}

		/*
		 * Start I/O's
		 */
		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
		for (i = 0; i < io_count; i++) {
			(void) bdev_strategy(&ms->ms_buf);
			ms++;
		}
	}

end:
	if (error != 0) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
		if (mm != NULL) {
			mutex_destroy(&mm->mm_mutex);
			kmem_free(mm, mm->mm_size);
		}
	}
	return (error);
}
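
/*
 * Completion routine for the slave bufs cloned in ud_multi_strat().
 * Each slave folds its error and byte count into the master under
 * mm_mutex; the slave that brings mm_resid to zero tears the master
 * down and completes the original buf.
 */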
int32_t
ud_slave_done(struct buf *bp)
{
	mio_master_t *mm;
	int32_t resid;

	ASSERT(SEMA_HELD(&bp->b_sem));
	ASSERT((bp->b_flags & B_DONE) == 0);

	mm = ((mio_slave_t *)bp)->ms_ptr;
	ASSERT(mm != NULL);

	/*
	 * Propagate error and byte count info from slave struct to
	 * the master struct
	 */
	mutex_enter(&mm->mm_mutex);
	if (bp->b_flags & B_ERROR) {
		/*
		 * If multiple slave buffers get
		 * error we forget the old errors
		 * this is ok because we any way
		 * cannot return multiple errors
		 */
		mm->mm_error = bp->b_error;
	}
	mm->mm_resid -= bp->b_bcount;
	resid = mm->mm_resid;
	mutex_exit(&mm->mm_mutex);

	/*
	 * free up the resources allocated to cloned buffers.
	 */
	bp_mapout(bp);
	biofini(bp);

	if (resid == 0) {
		/*
		 * This is the last I/O operation
		 * clean up and return the original buffer
		 */
		if (mm->mm_error) {
			mm->mm_bp->b_flags |= B_ERROR;
			mm->mm_bp->b_error = mm->mm_error;
		}
		biodone(mm->mm_bp);
		mutex_destroy(&mm->mm_mutex);
		kmem_free(mm, mm->mm_size);