drop net-snmp dep
[unleashed.git] / kernel / fs / udfs / udf_vnops.c
blob49148cb6f7d00a5c18b0872c8295bb9b73add5b6
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
27 * Copyright 2015, Joyent, Inc.
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/stat.h>
43 #include <sys/vnode.h>
44 #include <sys/mode.h>
45 #include <sys/proc.h>
46 #include <sys/disp.h>
47 #include <sys/file.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/kmem.h>
51 #include <sys/uio.h>
52 #include <sys/dnlc.h>
53 #include <sys/conf.h>
54 #include <sys/errno.h>
55 #include <sys/mman.h>
56 #include <sys/fbuf.h>
57 #include <sys/pathname.h>
58 #include <sys/debug.h>
59 #include <sys/vmsystm.h>
60 #include <sys/cmn_err.h>
61 #include <sys/dirent.h>
62 #include <sys/errno.h>
63 #include <sys/modctl.h>
64 #include <sys/statvfs.h>
65 #include <sys/mount.h>
66 #include <sys/sunddi.h>
67 #include <sys/bootconf.h>
68 #include <sys/policy.h>
70 #include <vm/hat.h>
71 #include <vm/page.h>
72 #include <vm/pvn.h>
73 #include <vm/as.h>
74 #include <vm/seg.h>
75 #include <vm/seg_map.h>
76 #include <vm/seg_kmem.h>
77 #include <vm/seg_vn.h>
78 #include <vm/rm.h>
79 #include <vm/page.h>
80 #include <sys/swap.h>
82 #include <sys/fs_subr.h>
84 #include <sys/fs/udf_volume.h>
85 #include <sys/fs/udf_inode.h>
87 static int32_t udf_open(struct vnode **,
88 int32_t, struct cred *, caller_context_t *);
89 static int32_t udf_close(struct vnode *,
90 int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
91 static int32_t udf_read(struct vnode *,
92 struct uio *, int32_t, struct cred *, caller_context_t *);
93 static int32_t udf_write(struct vnode *,
94 struct uio *, int32_t, struct cred *, caller_context_t *);
95 static int32_t udf_ioctl(struct vnode *,
96 int32_t, intptr_t, int32_t, struct cred *, int32_t *,
97 caller_context_t *);
98 static int32_t udf_getattr(struct vnode *,
99 struct vattr *, int32_t, struct cred *, caller_context_t *);
100 static int32_t udf_setattr(struct vnode *,
101 struct vattr *, int32_t, struct cred *, caller_context_t *);
102 static int32_t udf_access(struct vnode *,
103 int32_t, int32_t, struct cred *, caller_context_t *);
104 static int32_t udf_lookup(struct vnode *,
105 char *, struct vnode **, struct pathname *,
106 int32_t, struct vnode *, struct cred *,
107 caller_context_t *, int *, pathname_t *);
108 static int32_t udf_create(struct vnode *,
109 char *, struct vattr *, enum vcexcl,
110 int32_t, struct vnode **, struct cred *, int32_t,
111 caller_context_t *, vsecattr_t *);
112 static int32_t udf_remove(struct vnode *,
113 char *, struct cred *, caller_context_t *, int);
114 static int32_t udf_link(struct vnode *,
115 struct vnode *, char *, struct cred *, caller_context_t *, int);
116 static int32_t udf_rename(struct vnode *,
117 char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
118 static int32_t udf_mkdir(struct vnode *,
119 char *, struct vattr *, struct vnode **, struct cred *,
120 caller_context_t *, int, vsecattr_t *);
121 static int32_t udf_rmdir(struct vnode *,
122 char *, struct vnode *, struct cred *, caller_context_t *, int);
123 static int32_t udf_readdir(struct vnode *,
124 struct uio *, struct cred *, int32_t *, caller_context_t *, int);
125 static int32_t udf_symlink(struct vnode *,
126 char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
127 static int32_t udf_readlink(struct vnode *,
128 struct uio *, struct cred *, caller_context_t *);
129 static int32_t udf_fsync(struct vnode *,
130 int32_t, struct cred *, caller_context_t *);
131 static void udf_inactive(struct vnode *,
132 struct cred *, caller_context_t *);
133 static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
134 static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
135 static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
136 static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
137 caller_context_t *);
138 static int32_t udf_frlock(struct vnode *, int32_t,
139 struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
140 caller_context_t *);
141 static int32_t udf_space(struct vnode *, int32_t,
142 struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
143 static int32_t udf_getpage(struct vnode *, offset_t,
144 size_t, uint32_t *, struct page **, size_t,
145 struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
146 static int32_t udf_putpage(struct vnode *, offset_t,
147 size_t, int32_t, struct cred *, caller_context_t *);
148 static int32_t udf_map(struct vnode *, offset_t, struct as *,
149 caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
150 caller_context_t *);
151 static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
152 caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
153 caller_context_t *);
154 static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
155 caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
156 caller_context_t *);
157 static int32_t udf_l_pathconf(struct vnode *, int32_t,
158 ulong_t *, struct cred *, caller_context_t *);
159 static int32_t udf_pageio(struct vnode *, struct page *,
160 uoff_t, size_t, int32_t, struct cred *, caller_context_t *);
162 int32_t ud_getpage_miss(struct vnode *, uoff_t,
163 size_t, struct seg *, caddr_t, page_t *pl[],
164 size_t, enum seg_rw, int32_t);
165 void ud_getpage_ra(struct vnode *, uoff_t, struct seg *, caddr_t);
166 int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
167 int32_t ud_page_fill(struct ud_inode *, page_t *,
168 uoff_t, uint32_t, uoff_t *);
169 int32_t ud_iodone(struct buf *);
170 int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
171 int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
172 int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, uoff_t);
173 int32_t ud_slave_done(struct buf *);
/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
183 uint32_t master_index = 0;
184 typedef struct mio_master {
185 kmutex_t mm_mutex; /* protect the fields below */
186 int32_t mm_size;
187 buf_t *mm_bp; /* original bp */
188 int32_t mm_resid; /* bytes remaining to transfer */
189 int32_t mm_error; /* accumulated error from slaves */
190 int32_t mm_index; /* XXX debugging */
191 } mio_master_t;
193 typedef struct mio_slave {
194 buf_t ms_buf; /* working buffer for this IO chunk */
195 mio_master_t *ms_ptr; /* pointer to master */
196 } mio_slave_t;
198 const struct vnodeops udf_vnodeops = {
199 .vnop_name = "udfs",
200 .vop_open = udf_open,
201 .vop_close = udf_close,
202 .vop_read = udf_read,
203 .vop_write = udf_write,
204 .vop_ioctl = udf_ioctl,
205 .vop_getattr = udf_getattr,
206 .vop_setattr = udf_setattr,
207 .vop_access = udf_access,
208 .vop_lookup = udf_lookup,
209 .vop_create = udf_create,
210 .vop_remove = udf_remove,
211 .vop_link = udf_link,
212 .vop_rename = udf_rename,
213 .vop_mkdir = udf_mkdir,
214 .vop_rmdir = udf_rmdir,
215 .vop_readdir = udf_readdir,
216 .vop_symlink = udf_symlink,
217 .vop_readlink = udf_readlink,
218 .vop_fsync = udf_fsync,
219 .vop_inactive = udf_inactive,
220 .vop_fid = udf_fid,
221 .vop_rwlock = udf_rwlock,
222 .vop_rwunlock = udf_rwunlock,
223 .vop_seek = udf_seek,
224 .vop_frlock = udf_frlock,
225 .vop_space = udf_space,
226 .vop_getpage = udf_getpage,
227 .vop_putpage = udf_putpage,
228 .vop_map = udf_map,
229 .vop_addmap = udf_addmap,
230 .vop_delmap = udf_delmap,
231 .vop_pathconf = udf_l_pathconf,
232 .vop_pageio = udf_pageio,
233 .vop_vnevent = fs_vnevent_support,
236 /* ARGSUSED */
237 static int32_t
238 udf_open(
239 struct vnode **vpp,
240 int32_t flag,
241 struct cred *cr,
242 caller_context_t *ct)
244 ud_printf("udf_open\n");
246 return (0);
249 /* ARGSUSED */
250 static int32_t
251 udf_close(
252 struct vnode *vp,
253 int32_t flag,
254 int32_t count,
255 offset_t offset,
256 struct cred *cr,
257 caller_context_t *ct)
259 struct ud_inode *ip = VTOI(vp);
261 ud_printf("udf_close\n");
263 ITIMES(ip);
265 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
266 cleanshares(vp, ttoproc(curthread)->p_pid);
269 * Push partially filled cluster at last close.
270 * ``last close'' is approximated because the dnlc
271 * may have a hold on the vnode.
273 if (vp->v_count <= 2 && vp->v_type != VBAD) {
274 struct ud_inode *ip = VTOI(vp);
275 if (ip->i_delaylen) {
276 (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
277 B_ASYNC | B_FREE, cr);
278 ip->i_delaylen = 0;
282 return (0);
285 /* ARGSUSED */
286 static int32_t
287 udf_read(
288 struct vnode *vp,
289 struct uio *uiop,
290 int32_t ioflag,
291 struct cred *cr,
292 caller_context_t *ct)
294 struct ud_inode *ip = VTOI(vp);
295 int32_t error;
297 ud_printf("udf_read\n");
299 #ifdef __lock_lint
300 rw_enter(&ip->i_rwlock, RW_READER);
301 #endif
303 ASSERT(RW_READ_HELD(&ip->i_rwlock));
305 if (MANDLOCK(vp, ip->i_char)) {
307 * udf_getattr ends up being called by chklock
309 error = chklock(vp, FREAD, uiop->uio_loffset,
310 uiop->uio_resid, uiop->uio_fmode, ct);
311 if (error) {
312 goto end;
316 rw_enter(&ip->i_contents, RW_READER);
317 error = ud_rdip(ip, uiop, ioflag, cr);
318 rw_exit(&ip->i_contents);
320 end:
321 #ifdef __lock_lint
322 rw_exit(&ip->i_rwlock);
323 #endif
325 return (error);
329 int32_t ud_WRITES = 1;
330 int32_t ud_HW = 96 * 1024;
331 int32_t ud_LW = 64 * 1024;
332 int32_t ud_throttles = 0;
334 /* ARGSUSED */
335 static int32_t
336 udf_write(
337 struct vnode *vp,
338 struct uio *uiop,
339 int32_t ioflag,
340 struct cred *cr,
341 caller_context_t *ct)
343 struct ud_inode *ip = VTOI(vp);
344 int32_t error = 0;
346 ud_printf("udf_write\n");
348 #ifdef __lock_lint
349 rw_enter(&ip->i_rwlock, RW_WRITER);
350 #endif
352 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
354 if (MANDLOCK(vp, ip->i_char)) {
356 * ud_getattr ends up being called by chklock
358 error = chklock(vp, FWRITE, uiop->uio_loffset,
359 uiop->uio_resid, uiop->uio_fmode, ct);
360 if (error) {
361 goto end;
365 * Throttle writes.
367 mutex_enter(&ip->i_tlock);
368 if (ud_WRITES && (ip->i_writes > ud_HW)) {
369 while (ip->i_writes > ud_HW) {
370 ud_throttles++;
371 cv_wait(&ip->i_wrcv, &ip->i_tlock);
374 mutex_exit(&ip->i_tlock);
377 * Write to the file
379 rw_enter(&ip->i_contents, RW_WRITER);
380 if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
382 * In append mode start at end of file.
384 uiop->uio_loffset = ip->i_size;
386 error = ud_wrip(ip, uiop, ioflag, cr);
387 rw_exit(&ip->i_contents);
389 end:
390 #ifdef __lock_lint
391 rw_exit(&ip->i_rwlock);
392 #endif
394 return (error);
397 /* ARGSUSED */
398 static int32_t
399 udf_ioctl(
400 struct vnode *vp,
401 int32_t cmd,
402 intptr_t arg,
403 int32_t flag,
404 struct cred *cr,
405 int32_t *rvalp,
406 caller_context_t *ct)
408 return (ENOTTY);
411 /* ARGSUSED */
412 static int32_t
413 udf_getattr(
414 struct vnode *vp,
415 struct vattr *vap,
416 int32_t flags,
417 struct cred *cr,
418 caller_context_t *ct)
420 struct ud_inode *ip = VTOI(vp);
422 ud_printf("udf_getattr\n");
424 if (vap->va_mask == VATTR_SIZE) {
426 * for performance, if only the size is requested don't bother
427 * with anything else.
429 vap->va_size = ip->i_size;
430 return (0);
433 rw_enter(&ip->i_contents, RW_READER);
435 vap->va_type = vp->v_type;
436 vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
438 vap->va_uid = ip->i_uid;
439 vap->va_gid = ip->i_gid;
440 vap->va_fsid = ip->i_dev;
441 vap->va_nodeid = ip->i_icb_lbano;
442 vap->va_nlink = ip->i_nlink;
443 vap->va_size = ip->i_size;
444 vap->va_seq = ip->i_seq;
445 if (vp->v_type == VCHR || vp->v_type == VBLK) {
446 vap->va_rdev = ip->i_rdev;
447 } else {
448 vap->va_rdev = 0;
451 mutex_enter(&ip->i_tlock);
452 ITIMES_NOLOCK(ip); /* mark correct time in inode */
453 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
454 vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
455 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
456 vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
457 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
458 vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
459 mutex_exit(&ip->i_tlock);
461 switch (ip->i_type) {
462 case VBLK:
463 vap->va_blksize = MAXBSIZE;
464 break;
465 case VCHR:
466 vap->va_blksize = MAXBSIZE;
467 break;
468 default:
469 vap->va_blksize = ip->i_udf->udf_lbsize;
470 break;
472 vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
474 rw_exit(&ip->i_contents);
476 return (0);
/*
 * Access-check callback handed to secpolicy_vnode_setattr() by
 * udf_setattr(): translates the requested mode with UD_UPERM2DPERM()
 * and forwards to ud_iaccess() with a final argument of 0.
 */
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
	int error;

	error = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0);
	return (error);
}
485 /*ARGSUSED4*/
486 static int32_t
487 udf_setattr(
488 struct vnode *vp,
489 struct vattr *vap,
490 int32_t flags,
491 struct cred *cr,
492 caller_context_t *ct)
494 int32_t error = 0;
495 uint32_t mask = vap->va_mask;
496 struct ud_inode *ip;
497 timestruc_t now;
498 struct vattr ovap;
500 ud_printf("udf_setattr\n");
502 ip = VTOI(vp);
505 * not updates allowed to 4096 files
507 if (ip->i_astrat == STRAT_TYPE4096) {
508 return (EINVAL);
512 * Cannot set these attributes
514 if (mask & VATTR_NOSET) {
515 return (EINVAL);
518 rw_enter(&ip->i_rwlock, RW_WRITER);
519 rw_enter(&ip->i_contents, RW_WRITER);
521 ovap.va_uid = ip->i_uid;
522 ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
523 error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
524 ud_iaccess_vmode, ip);
525 if (error)
526 goto update_inode;
528 mask = vap->va_mask;
530 * Change file access modes.
532 if (mask & VATTR_MODE) {
533 ip->i_perm = VA2UD_PERM(vap->va_mode);
534 ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
535 mutex_enter(&ip->i_tlock);
536 ip->i_flag |= ICHG;
537 mutex_exit(&ip->i_tlock);
539 if (mask & (VATTR_UID|VATTR_GID)) {
540 if (mask & VATTR_UID) {
541 ip->i_uid = vap->va_uid;
543 if (mask & VATTR_GID) {
544 ip->i_gid = vap->va_gid;
546 mutex_enter(&ip->i_tlock);
547 ip->i_flag |= ICHG;
548 mutex_exit(&ip->i_tlock);
551 * Truncate file. Must have write permission and not be a directory.
553 if (mask & VATTR_SIZE) {
554 if (vp->v_type == VDIR) {
555 error = EISDIR;
556 goto update_inode;
558 if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
559 goto update_inode;
561 if (vap->va_size > MAXOFFSET_T) {
562 error = EFBIG;
563 goto update_inode;
565 if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
566 goto update_inode;
569 if (vap->va_size == 0)
570 vnevent_truncate(vp, ct);
573 * Change file access or modified times.
575 if (mask & (VATTR_ATIME|VATTR_MTIME)) {
576 mutex_enter(&ip->i_tlock);
577 if (mask & VATTR_ATIME) {
578 ip->i_atime.tv_sec = vap->va_atime.tv_sec;
579 ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
580 ip->i_flag &= ~IACC;
582 if (mask & VATTR_MTIME) {
583 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
584 ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
585 gethrestime(&now);
586 ip->i_ctime.tv_sec = now.tv_sec;
587 ip->i_ctime.tv_nsec = now.tv_nsec;
588 ip->i_flag &= ~(IUPD|ICHG);
589 ip->i_flag |= IMODTIME;
591 ip->i_flag |= IMOD;
592 mutex_exit(&ip->i_tlock);
595 update_inode:
596 if (curthread->t_flag & T_DONTPEND) {
597 ud_iupdat(ip, 1);
598 } else {
599 ITIMES_NOLOCK(ip);
601 rw_exit(&ip->i_contents);
602 rw_exit(&ip->i_rwlock);
604 return (error);
607 /* ARGSUSED */
608 static int32_t
609 udf_access(
610 struct vnode *vp,
611 int32_t mode,
612 int32_t flags,
613 struct cred *cr,
614 caller_context_t *ct)
616 struct ud_inode *ip = VTOI(vp);
618 ud_printf("udf_access\n");
620 if (ip->i_udf == NULL) {
621 return (EIO);
624 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
627 int32_t udfs_stickyhack = 1;
629 /* ARGSUSED */
630 static int32_t
631 udf_lookup(
632 struct vnode *dvp,
633 char *nm,
634 struct vnode **vpp,
635 struct pathname *pnp,
636 int32_t flags,
637 struct vnode *rdir,
638 struct cred *cr,
639 caller_context_t *ct,
640 int *direntflags,
641 pathname_t *realpnp)
643 int32_t error;
644 struct vnode *vp;
645 struct ud_inode *ip, *xip;
647 ud_printf("udf_lookup\n");
649 * Null component name is a synonym for directory being searched.
651 if (*nm == '\0') {
652 VN_HOLD(dvp);
653 *vpp = dvp;
654 error = 0;
655 goto out;
659 * Fast path: Check the directory name lookup cache.
661 ip = VTOI(dvp);
662 if (vp = dnlc_lookup(dvp, nm)) {
664 * Check accessibility of directory.
666 if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
667 VN_RELE(vp);
669 xip = VTOI(vp);
670 } else {
671 error = ud_dirlook(ip, nm, &xip, cr, 1);
672 ITIMES(ip);
675 if (error == 0) {
676 ip = xip;
677 *vpp = ITOV(ip);
678 if ((ip->i_type != VDIR) &&
679 (ip->i_char & ISVTX) &&
680 ((ip->i_perm & IEXEC) == 0) &&
681 udfs_stickyhack) {
682 mutex_enter(&(*vpp)->v_lock);
683 (*vpp)->v_flag |= VISSWAP;
684 mutex_exit(&(*vpp)->v_lock);
686 ITIMES(ip);
688 * If vnode is a device return special vnode instead.
690 if (IS_DEVVP(*vpp)) {
691 struct vnode *newvp;
692 newvp = specvp(*vpp, (*vpp)->v_rdev,
693 (*vpp)->v_type, cr);
694 VN_RELE(*vpp);
695 if (newvp == NULL) {
696 error = ENOSYS;
697 } else {
698 *vpp = newvp;
702 out:
703 return (error);
706 /* ARGSUSED */
707 static int32_t
708 udf_create(
709 struct vnode *dvp,
710 char *name,
711 struct vattr *vap,
712 enum vcexcl excl,
713 int32_t mode,
714 struct vnode **vpp,
715 struct cred *cr,
716 int32_t flag,
717 caller_context_t *ct,
718 vsecattr_t *vsecp)
720 int32_t error;
721 struct ud_inode *ip = VTOI(dvp), *xip;
723 ud_printf("udf_create\n");
725 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
726 vap->va_mode &= ~VSVTX;
728 if (*name == '\0') {
730 * Null component name refers to the directory itself.
732 VN_HOLD(dvp);
733 ITIMES(ip);
734 error = EEXIST;
735 } else {
736 xip = NULL;
737 rw_enter(&ip->i_rwlock, RW_WRITER);
738 error = ud_direnter(ip, name, DE_CREATE, NULL, NULL, vap,
739 &xip, cr, ct);
740 rw_exit(&ip->i_rwlock);
741 ITIMES(ip);
742 ip = xip;
744 #ifdef __lock_lint
745 rw_enter(&ip->i_contents, RW_WRITER);
746 #else
747 if (ip != NULL) {
748 rw_enter(&ip->i_contents, RW_WRITER);
750 #endif
753 * If the file already exists and this is a non-exclusive create,
754 * check permissions and allow access for non-directories.
755 * Read-only create of an existing directory is also allowed.
756 * We fail an exclusive create of anything which already exists.
758 if (error == EEXIST) {
759 if (excl == NONEXCL) {
760 if ((ip->i_type == VDIR) && (mode & VWRITE)) {
761 error = EISDIR;
762 } else if (mode) {
763 error = ud_iaccess(ip,
764 UD_UPERM2DPERM(mode), cr, 0);
765 } else {
766 error = 0;
769 if (error) {
770 rw_exit(&ip->i_contents);
771 VN_RELE(ITOV(ip));
772 goto out;
773 } else if ((ip->i_type == VREG) &&
774 (vap->va_mask & VATTR_SIZE) && vap->va_size == 0) {
776 * Truncate regular files, if requested by caller.
777 * Grab i_rwlock to make sure no one else is
778 * currently writing to the file (we promised
779 * bmap we would do this).
780 * Must get the locks in the correct order.
782 if (ip->i_size == 0) {
783 ip->i_flag |= ICHG | IUPD;
784 } else {
785 rw_exit(&ip->i_contents);
786 rw_enter(&ip->i_rwlock, RW_WRITER);
787 rw_enter(&ip->i_contents, RW_WRITER);
788 (void) ud_itrunc(ip, 0, 0, cr);
789 rw_exit(&ip->i_rwlock);
791 vnevent_create(ITOV(ip), ct);
795 if (error == 0) {
796 *vpp = ITOV(ip);
797 ITIMES(ip);
799 #ifdef __lock_lint
800 rw_exit(&ip->i_contents);
801 #else
802 if (ip != NULL) {
803 rw_exit(&ip->i_contents);
805 #endif
806 if (error) {
807 goto out;
811 * If vnode is a device return special vnode instead.
813 if (!error && IS_DEVVP(*vpp)) {
814 struct vnode *newvp;
816 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
817 VN_RELE(*vpp);
818 if (newvp == NULL) {
819 error = ENOSYS;
820 goto out;
822 *vpp = newvp;
824 out:
825 return (error);
828 /* ARGSUSED */
829 static int32_t
830 udf_remove(
831 struct vnode *vp,
832 char *nm,
833 struct cred *cr,
834 caller_context_t *ct,
835 int flags)
837 int32_t error;
838 struct ud_inode *ip = VTOI(vp);
840 ud_printf("udf_remove\n");
842 rw_enter(&ip->i_rwlock, RW_WRITER);
843 error = ud_dirremove(ip, nm,
844 NULL, NULL, DR_REMOVE, cr, ct);
845 rw_exit(&ip->i_rwlock);
846 ITIMES(ip);
848 return (error);
851 /* ARGSUSED */
852 static int32_t
853 udf_link(
854 struct vnode *tdvp,
855 struct vnode *svp,
856 char *tnm,
857 struct cred *cr,
858 caller_context_t *ct,
859 int flags)
861 int32_t error;
862 struct vnode *realvp;
863 struct ud_inode *sip;
864 struct ud_inode *tdp;
866 ud_printf("udf_link\n");
867 if (fop_realvp(svp, &realvp, ct) == 0) {
868 svp = realvp;
872 * Do not allow links to directories
874 if (svp->v_type == VDIR) {
875 return (EPERM);
878 sip = VTOI(svp);
880 if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
881 return (EPERM);
883 tdp = VTOI(tdvp);
885 rw_enter(&tdp->i_rwlock, RW_WRITER);
886 error = ud_direnter(tdp, tnm, DE_LINK, NULL,
887 sip, NULL, (struct ud_inode **)0, cr, ct);
888 rw_exit(&tdp->i_rwlock);
889 ITIMES(sip);
890 ITIMES(tdp);
892 if (error == 0) {
893 vnevent_link(svp, ct);
896 return (error);
899 /* ARGSUSED */
900 static int32_t
901 udf_rename(
902 struct vnode *sdvp,
903 char *snm,
904 struct vnode *tdvp,
905 char *tnm,
906 struct cred *cr,
907 caller_context_t *ct,
908 int flags)
910 int32_t error = 0;
911 struct udf_vfs *udf_vfsp;
912 struct ud_inode *sip; /* source inode */
913 struct ud_inode *tip; /* target inode */
914 struct ud_inode *sdp, *tdp; /* source and target parent inode */
915 struct vnode *realvp;
917 ud_printf("udf_rename\n");
919 if (fop_realvp(tdvp, &realvp, ct) == 0) {
920 tdvp = realvp;
923 sdp = VTOI(sdvp);
924 tdp = VTOI(tdvp);
926 udf_vfsp = sdp->i_udf;
928 mutex_enter(&udf_vfsp->udf_rename_lck);
930 * Look up inode of file we're supposed to rename.
932 if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
933 mutex_exit(&udf_vfsp->udf_rename_lck);
934 return (error);
937 * be sure this is not a directory with another file system mounted
938 * over it. If it is just give up the locks, and return with
939 * EBUSY
941 if (vn_mountedvfs(ITOV(sip)) != NULL) {
942 error = EBUSY;
943 goto errout;
946 * Make sure we can delete the source entry. This requires
947 * write permission on the containing directory. If that
948 * directory is "sticky" it further requires (except for
949 * privileged users) that the user own the directory or the
950 * source entry, or else have permission to write the source
951 * entry.
953 rw_enter(&sdp->i_contents, RW_READER);
954 rw_enter(&sip->i_contents, RW_READER);
955 if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
956 (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
957 rw_exit(&sip->i_contents);
958 rw_exit(&sdp->i_contents);
959 ITIMES(sip);
960 goto errout;
964 * Check for renaming '.' or '..' or alias of '.'
966 if ((strcmp(snm, ".") == 0) ||
967 (strcmp(snm, "..") == 0) ||
968 (sdp == sip)) {
969 error = EINVAL;
970 rw_exit(&sip->i_contents);
971 rw_exit(&sdp->i_contents);
972 goto errout;
975 rw_exit(&sip->i_contents);
976 rw_exit(&sdp->i_contents);
978 if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
979 vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
980 VN_RELE(ITOV(tip));
983 /* Notify the target dir. if not the same as the source dir. */
984 if (sdvp != tdvp)
985 vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);
987 vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);
990 * Link source to the target.
992 rw_enter(&tdp->i_rwlock, RW_WRITER);
993 if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
994 NULL, (struct ud_inode **)0, cr, ct)) {
996 * ESAME isn't really an error; it indicates that the
997 * operation should not be done because the source and target
998 * are the same file, but that no error should be reported.
1000 if (error == ESAME) {
1001 error = 0;
1003 rw_exit(&tdp->i_rwlock);
1004 goto errout;
1006 rw_exit(&tdp->i_rwlock);
1008 rw_enter(&sdp->i_rwlock, RW_WRITER);
1010 * Unlink the source.
1011 * Remove the source entry. ud_dirremove() checks that the entry
1012 * still reflects sip, and returns an error if it doesn't.
1013 * If the entry has changed just forget about it. Release
1014 * the source inode.
1016 if ((error = ud_dirremove(sdp, snm, sip, NULL,
1017 DR_RENAME, cr, ct)) == ENOENT) {
1018 error = 0;
1020 rw_exit(&sdp->i_rwlock);
1022 if (error == 0) {
1023 vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
1025 * vnevent_rename_dest and vnevent_rename_dest_dir are called
1026 * in ud_direnter().
1030 errout:
1031 ITIMES(sdp);
1032 ITIMES(tdp);
1033 VN_RELE(ITOV(sip));
1034 mutex_exit(&udf_vfsp->udf_rename_lck);
1036 return (error);
1039 /* ARGSUSED */
1040 static int32_t
1041 udf_mkdir(
1042 struct vnode *dvp,
1043 char *dirname,
1044 struct vattr *vap,
1045 struct vnode **vpp,
1046 struct cred *cr,
1047 caller_context_t *ct,
1048 int flags,
1049 vsecattr_t *vsecp)
1051 int32_t error;
1052 struct ud_inode *ip;
1053 struct ud_inode *xip;
1055 ASSERT((vap->va_mask & (VATTR_TYPE|VATTR_MODE)) == (VATTR_TYPE|VATTR_MODE));
1057 ud_printf("udf_mkdir\n");
1059 ip = VTOI(dvp);
1060 rw_enter(&ip->i_rwlock, RW_WRITER);
1061 error = ud_direnter(ip, dirname, DE_MKDIR,
1062 NULL, NULL, vap, &xip, cr, ct);
1063 rw_exit(&ip->i_rwlock);
1064 ITIMES(ip);
1065 if (error == 0) {
1066 ip = xip;
1067 *vpp = ITOV(ip);
1068 ITIMES(ip);
1069 } else if (error == EEXIST) {
1070 ITIMES(xip);
1071 VN_RELE(ITOV(xip));
1074 return (error);
1077 /* ARGSUSED */
1078 static int32_t
1079 udf_rmdir(
1080 struct vnode *vp,
1081 char *nm,
1082 struct vnode *cdir,
1083 struct cred *cr,
1084 caller_context_t *ct,
1085 int flags)
1087 int32_t error;
1088 struct ud_inode *ip = VTOI(vp);
1090 ud_printf("udf_rmdir\n");
1092 rw_enter(&ip->i_rwlock, RW_WRITER);
1093 error = ud_dirremove(ip, nm, NULL, cdir, DR_RMDIR,
1094 cr, ct);
1095 rw_exit(&ip->i_rwlock);
1096 ITIMES(ip);
1098 return (error);
1101 /* ARGSUSED */
1102 static int32_t
1103 udf_readdir(
1104 struct vnode *vp,
1105 struct uio *uiop,
1106 struct cred *cr,
1107 int32_t *eofp,
1108 caller_context_t *ct,
1109 int flags)
1111 struct ud_inode *ip;
1112 struct dirent64 *nd;
1113 struct udf_vfs *udf_vfsp;
1114 int32_t error = 0, len, outcount = 0;
1115 uint32_t dirsiz, offset;
1116 uint32_t bufsize, ndlen, dummy;
1117 caddr_t outbuf;
1118 caddr_t outb, end_outb;
1119 struct iovec *iovp;
1121 uint8_t *dname;
1122 int32_t length;
1124 uint8_t *buf = NULL;
1126 struct fbuf *fbp = NULL;
1127 struct file_id *fid;
1128 uint8_t *name;
1131 ud_printf("udf_readdir\n");
1133 ip = VTOI(vp);
1134 udf_vfsp = ip->i_udf;
1136 dirsiz = ip->i_size;
1137 if ((uiop->uio_offset >= dirsiz) ||
1138 (ip->i_nlink <= 0)) {
1139 if (eofp) {
1140 *eofp = 1;
1142 return (0);
1145 offset = uiop->uio_offset;
1146 iovp = uiop->uio_iov;
1147 bufsize = iovp->iov_len;
1149 outb = outbuf = kmem_alloc((uint32_t)bufsize, KM_SLEEP);
1150 end_outb = outb + bufsize;
1151 nd = (struct dirent64 *)outbuf;
1153 dname = kmem_zalloc(1024, KM_SLEEP);
1154 buf = kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
1156 if (offset == 0) {
1157 len = DIRENT64_RECLEN(1);
1158 if (((caddr_t)nd + len) >= end_outb) {
1159 error = EINVAL;
1160 goto end;
1162 nd->d_ino = ip->i_icb_lbano;
1163 nd->d_reclen = (uint16_t)len;
1164 nd->d_off = 0x10;
1165 nd->d_name[0] = '.';
1166 bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
1167 nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
1168 outcount++;
1169 } else if (offset == 0x10) {
1170 offset = 0;
1173 while (offset < dirsiz) {
1174 error = ud_get_next_fid(ip, &fbp,
1175 offset, &fid, &name, buf);
1176 if (error != 0) {
1177 break;
1180 if ((fid->fid_flags & FID_DELETED) == 0) {
1181 if (fid->fid_flags & FID_PARENT) {
1183 len = DIRENT64_RECLEN(2);
1184 if (((caddr_t)nd + len) >= end_outb) {
1185 error = EINVAL;
1186 break;
1189 nd->d_ino = ip->i_icb_lbano;
1190 nd->d_reclen = (uint16_t)len;
1191 nd->d_off = offset + FID_LEN(fid);
1192 nd->d_name[0] = '.';
1193 nd->d_name[1] = '.';
1194 bzero(&nd->d_name[2],
1195 DIRENT64_NAMELEN(len) - 2);
1196 nd = (struct dirent64 *)
1197 ((char *)nd + nd->d_reclen);
1198 } else {
1199 if ((error = ud_uncompress(fid->fid_idlen,
1200 &length, name, dname)) != 0) {
1201 break;
1203 if (length == 0) {
1204 offset += FID_LEN(fid);
1205 continue;
1207 len = DIRENT64_RECLEN(length);
1208 if (((caddr_t)nd + len) >= end_outb) {
1209 if (!outcount) {
1210 error = EINVAL;
1212 break;
1214 (void) strncpy(nd->d_name,
1215 (caddr_t)dname, length);
1216 bzero(&nd->d_name[length],
1217 DIRENT64_NAMELEN(len) - length);
1218 nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
1219 SWAP_16(fid->fid_icb.lad_ext_prn),
1220 SWAP_32(fid->fid_icb.lad_ext_loc), 1,
1221 &dummy);
1222 nd->d_reclen = (uint16_t)len;
1223 nd->d_off = offset + FID_LEN(fid);
1224 nd = (struct dirent64 *)
1225 ((char *)nd + nd->d_reclen);
1227 outcount++;
1230 offset += FID_LEN(fid);
1233 end:
1234 if (fbp != NULL) {
1235 fbrelse(fbp, S_OTHER);
1237 ndlen = ((char *)nd - outbuf);
1239 * In case of error do not call uiomove.
1240 * Return the error to the caller.
1242 if ((error == 0) && (ndlen != 0)) {
1243 error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
1244 uiop->uio_offset = offset;
1246 kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
1247 kmem_free((caddr_t)dname, 1024);
1248 kmem_free(outbuf, (uint32_t)bufsize);
1249 if (eofp && error == 0) {
1250 *eofp = (uiop->uio_offset >= dirsiz);
1252 return (error);
1255 /* ARGSUSED */
1256 static int32_t
1257 udf_symlink(
1258 struct vnode *dvp,
1259 char *linkname,
1260 struct vattr *vap,
1261 char *target,
1262 struct cred *cr,
1263 caller_context_t *ct,
1264 int flags)
1266 int32_t error = 0, outlen;
1267 uint32_t ioflag = 0;
1268 struct ud_inode *ip, *dip = VTOI(dvp);
1270 struct path_comp *pc;
1271 int8_t *dname = NULL, *uname = NULL, *sp;
1273 ud_printf("udf_symlink\n");
1275 ip = NULL;
1276 vap->va_type = VLNK;
1277 vap->va_rdev = 0;
1279 rw_enter(&dip->i_rwlock, RW_WRITER);
1280 error = ud_direnter(dip, linkname, DE_CREATE,
1281 NULL, NULL, vap, &ip, cr, ct);
1282 rw_exit(&dip->i_rwlock);
1283 if (error == 0) {
1284 dname = kmem_zalloc(1024, KM_SLEEP);
1285 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1287 pc = (struct path_comp *)uname;
1289 * If the first character in target is "/"
1290 * then skip it and create entry for it
1292 if (*target == '/') {
1293 pc->pc_type = 2;
1294 pc->pc_len = 0;
1295 pc = (struct path_comp *)(((char *)pc) + 4);
1296 while (*target == '/') {
1297 target++;
1301 while (*target != '\0') {
1302 sp = target;
1303 while ((*target != '/') && (*target != '\0')) {
1304 target ++;
1307 * We got the next component of the
1308 * path name. Create path_comp of
1309 * appropriate type
1311 if (((target - sp) == 1) && (*sp == '.')) {
1313 * Dot entry.
1315 pc->pc_type = 4;
1316 pc = (struct path_comp *)(((char *)pc) + 4);
1317 } else if (((target - sp) == 2) &&
1318 (*sp == '.') && ((*(sp + 1)) == '.')) {
1320 * DotDot entry.
1322 pc->pc_type = 3;
1323 pc = (struct path_comp *)(((char *)pc) + 4);
1324 } else {
1326 * convert the user given name
1327 * into appropriate form to be put
1328 * on the media
1330 outlen = 1024; /* set to size of dname */
1331 if (error = ud_compress(target - sp, &outlen,
1332 (uint8_t *)sp, (uint8_t *)dname)) {
1333 break;
1335 pc->pc_type = 5;
1336 /* LINTED */
1337 pc->pc_len = outlen;
1338 dname[outlen] = '\0';
1339 (void) strcpy((char *)pc->pc_id, dname);
1340 pc = (struct path_comp *)
1341 (((char *)pc) + 4 + outlen);
1343 while (*target == '/') {
1344 target++;
1346 if (*target == '\0') {
1347 break;
1351 rw_enter(&ip->i_contents, RW_WRITER);
1352 if (error == 0) {
1353 ioflag = FWRITE;
1354 if (curthread->t_flag & T_DONTPEND) {
1355 ioflag |= FDSYNC;
1357 error = ud_rdwri(UIO_WRITE, ioflag, ip,
1358 uname, ((int8_t *)pc) - uname,
1359 0, UIO_SYSSPACE, (int32_t *)0, cr);
1361 if (error) {
1362 ud_idrop(ip);
1363 rw_exit(&ip->i_contents);
1364 rw_enter(&dip->i_rwlock, RW_WRITER);
1365 (void) ud_dirremove(dip, linkname, NULL,
1366 NULL, DR_REMOVE, cr, ct);
1367 rw_exit(&dip->i_rwlock);
1368 goto update_inode;
1370 rw_exit(&ip->i_contents);
1373 if ((error == 0) || (error == EEXIST)) {
1374 VN_RELE(ITOV(ip));
1377 update_inode:
1378 ITIMES(VTOI(dvp));
1379 if (uname != NULL) {
1380 kmem_free(uname, PAGESIZE);
1382 if (dname != NULL) {
1383 kmem_free(dname, 1024);
1386 return (error);
1389 /* ARGSUSED */
1390 static int32_t
1391 udf_readlink(
1392 struct vnode *vp,
1393 struct uio *uiop,
1394 struct cred *cr,
1395 caller_context_t *ct)
1397 int32_t error = 0, off, id_len, size, len;
1398 int8_t *dname = NULL, *uname = NULL;
1399 struct ud_inode *ip;
1400 struct fbuf *fbp = NULL;
1401 struct path_comp *pc;
1403 ud_printf("udf_readlink\n");
1405 if (vp->v_type != VLNK) {
1406 return (EINVAL);
1409 ip = VTOI(vp);
1410 size = ip->i_size;
1411 if (size > PAGESIZE) {
1412 return (EIO);
1415 if (size == 0) {
1416 return (0);
1419 dname = kmem_zalloc(1024, KM_SLEEP);
1420 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1422 rw_enter(&ip->i_contents, RW_READER);
1424 if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1425 goto end;
1428 off = 0;
1430 while (off < size) {
1431 pc = (struct path_comp *)(fbp->fb_addr + off);
1432 switch (pc->pc_type) {
1433 case 1 :
1434 (void) strcpy(uname, ip->i_udf->udf_fsmnt);
1435 (void) strcat(uname, "/");
1436 break;
1437 case 2 :
1438 if (pc->pc_len != 0) {
1439 goto end;
1441 uname[0] = '/';
1442 uname[1] = '\0';
1443 break;
1444 case 3 :
1445 (void) strcat(uname, "../");
1446 break;
1447 case 4 :
1448 (void) strcat(uname, "./");
1449 break;
1450 case 5 :
1451 if ((error = ud_uncompress(pc->pc_len, &id_len,
1452 pc->pc_id, (uint8_t *)dname)) != 0) {
1453 break;
1455 dname[id_len] = '\0';
1456 (void) strcat(uname, dname);
1457 (void) strcat(uname, "/");
1458 break;
1459 default :
1460 error = EINVAL;
1461 goto end;
1463 off += 4 + pc->pc_len;
1465 len = strlen(uname) - 1;
1466 if (uname[len] == '/') {
1467 if (len == 0) {
1469 * special case link to /
1471 len = 1;
1472 } else {
1473 uname[len] = '\0';
1477 error = uiomove(uname, len, UIO_READ, uiop);
1479 ITIMES(ip);
1481 end:
1482 if (fbp != NULL) {
1483 fbrelse(fbp, S_OTHER);
1485 rw_exit(&ip->i_contents);
1486 if (uname != NULL) {
1487 kmem_free(uname, PAGESIZE);
1489 if (dname != NULL) {
1490 kmem_free(dname, 1024);
1492 return (error);
1495 /* ARGSUSED */
1496 static int32_t
1497 udf_fsync(
1498 struct vnode *vp,
1499 int32_t syncflag,
1500 struct cred *cr,
1501 caller_context_t *ct)
1503 int32_t error = 0;
1504 struct ud_inode *ip = VTOI(vp);
1506 ud_printf("udf_fsync\n");
1508 rw_enter(&ip->i_contents, RW_WRITER);
1509 if (!(IS_SWAPVP(vp))) {
1510 error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1512 if (error == 0) {
1513 error = ud_sync_indir(ip);
1515 ITIMES(ip); /* XXX: is this necessary ??? */
1516 rw_exit(&ip->i_contents);
1518 return (error);
1521 /* ARGSUSED */
1522 static void
1523 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1525 ud_printf("udf_iinactive\n");
1527 ud_iinactive(VTOI(vp), cr);
1530 /* ARGSUSED */
1531 static int32_t
1532 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1534 struct udf_fid *udfidp;
1535 struct ud_inode *ip = VTOI(vp);
1537 ud_printf("udf_fid\n");
1539 if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1540 fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1541 return (ENOSPC);
1544 udfidp = (struct udf_fid *)fidp;
1545 bzero((char *)udfidp, sizeof (struct udf_fid));
1546 rw_enter(&ip->i_contents, RW_READER);
1547 udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1548 udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1549 udfidp->udfid_prn = ip->i_icb_prn;
1550 udfidp->udfid_icb_lbn = ip->i_icb_block;
1551 rw_exit(&ip->i_contents);
1553 return (0);
1556 /* ARGSUSED2 */
1557 static int
1558 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1560 struct ud_inode *ip = VTOI(vp);
1562 ud_printf("udf_rwlock\n");
1564 if (write_lock) {
1565 rw_enter(&ip->i_rwlock, RW_WRITER);
1566 } else {
1567 rw_enter(&ip->i_rwlock, RW_READER);
1569 #ifdef __lock_lint
1570 rw_exit(&ip->i_rwlock);
1571 #endif
1572 return (write_lock);
1575 /* ARGSUSED */
1576 static void
1577 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1579 struct ud_inode *ip = VTOI(vp);
1581 ud_printf("udf_rwunlock\n");
1583 #ifdef __lock_lint
1584 rw_enter(&ip->i_rwlock, RW_WRITER);
1585 #endif
1587 rw_exit(&ip->i_rwlock);
1591 /* ARGSUSED */
1592 static int32_t
1593 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1595 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1598 static int32_t
1599 udf_frlock(
1600 struct vnode *vp,
1601 int32_t cmd,
1602 struct flock64 *bfp,
1603 int32_t flag,
1604 offset_t offset,
1605 struct flk_callback *flk_cbp,
1606 cred_t *cr,
1607 caller_context_t *ct)
1609 struct ud_inode *ip = VTOI(vp);
1611 ud_printf("udf_frlock\n");
1614 * If file is being mapped, disallow frlock.
1615 * XXX I am not holding tlock while checking i_mapcnt because the
1616 * current locking strategy drops all locks before calling fs_frlock.
1617 * So, mapcnt could change before we enter fs_frlock making is
1618 * meaningless to have held tlock in the first place.
1620 if ((ip->i_mapcnt > 0) &&
1621 (MANDLOCK(vp, ip->i_char))) {
1622 return (EAGAIN);
1625 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1628 /*ARGSUSED6*/
1629 static int32_t
1630 udf_space(
1631 struct vnode *vp,
1632 int32_t cmd,
1633 struct flock64 *bfp,
1634 int32_t flag,
1635 offset_t offset,
1636 cred_t *cr,
1637 caller_context_t *ct)
1639 int32_t error = 0;
1641 ud_printf("udf_space\n");
1643 if (cmd != F_FREESP) {
1644 error = EINVAL;
1645 } else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1646 error = ud_freesp(vp, bfp, flag, cr);
1648 if (error == 0 && bfp->l_start == 0)
1649 vnevent_truncate(vp, ct);
1652 return (error);
1655 /* ARGSUSED */
1656 static int32_t
1657 udf_getpage(
1658 struct vnode *vp,
1659 offset_t off,
1660 size_t len,
1661 uint32_t *protp,
1662 struct page **plarr,
1663 size_t plsz,
1664 struct seg *seg,
1665 caddr_t addr,
1666 enum seg_rw rw,
1667 struct cred *cr,
1668 caller_context_t *ct)
1670 struct ud_inode *ip = VTOI(vp);
1671 int32_t error, has_holes, beyond_eof, seqmode, dolock;
1672 int32_t pgsize = PAGESIZE;
1673 struct udf_vfs *udf_vfsp = ip->i_udf;
1674 page_t **pl;
1675 uoff_t pgoff, eoff, uoff;
1676 krw_t rwtype;
1677 caddr_t pgaddr;
1679 ud_printf("udf_getpage\n");
1681 uoff = (uoff_t)off; /* type conversion */
1682 if (protp) {
1683 *protp = PROT_ALL;
1685 if (vp->v_flag & VNOMAP) {
1686 return (ENOSYS);
1688 seqmode = ip->i_nextr == uoff && rw != S_CREATE;
1690 rwtype = RW_READER;
1691 dolock = (rw_owner(&ip->i_contents) != curthread);
1692 retrylock:
1693 #ifdef __lock_lint
1694 rw_enter(&ip->i_contents, rwtype);
1695 #else
1696 if (dolock) {
1697 rw_enter(&ip->i_contents, rwtype);
1699 #endif
1702 * We may be getting called as a side effect of a bmap using
1703 * fbread() when the blocks might be being allocated and the
1704 * size has not yet been up'ed. In this case we want to be
1705 * able to return zero pages if we get back UDF_HOLE from
1706 * calling bmap for a non write case here. We also might have
1707 * to read some frags from the disk into a page if we are
1708 * extending the number of frags for a given lbn in bmap().
1710 beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
1711 if (beyond_eof && seg != segkmap) {
1712 #ifdef __lock_lint
1713 rw_exit(&ip->i_contents);
1714 #else
1715 if (dolock) {
1716 rw_exit(&ip->i_contents);
1718 #endif
1719 return (EFAULT);
1723 * Must hold i_contents lock throughout the call to pvn_getpages
1724 * since locked pages are returned from each call to ud_getapage.
1725 * Must *not* return locked pages and then try for contents lock
1726 * due to lock ordering requirements (inode > page)
1729 has_holes = ud_bmap_has_holes(ip);
1731 if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
1732 int32_t blk_size, count;
1733 uoff_t offset;
1736 * We must acquire the RW_WRITER lock in order to
1737 * call bmap_write().
1739 if (dolock && rwtype == RW_READER) {
1740 rwtype = RW_WRITER;
1742 if (!rw_tryupgrade(&ip->i_contents)) {
1744 rw_exit(&ip->i_contents);
1746 goto retrylock;
1751 * May be allocating disk blocks for holes here as
1752 * a result of mmap faults. write(2) does the bmap_write
1753 * in rdip/wrip, not here. We are not dealing with frags
1754 * in this case.
1756 offset = uoff;
1757 while ((offset < uoff + len) &&
1758 (offset < ip->i_size)) {
1760 * the variable "bnp" is to simplify the expression for
1761 * the compiler; * just passing in &bn to bmap_write
1762 * causes a compiler "loop"
1765 blk_size = udf_vfsp->udf_lbsize;
1766 if ((offset + blk_size) > ip->i_size) {
1767 count = ip->i_size - offset;
1768 } else {
1769 count = blk_size;
1771 error = ud_bmap_write(ip, offset, count, 0, cr);
1772 if (error) {
1773 goto update_inode;
1775 offset += count; /* XXX - make this contig */
1780 * Can be a reader from now on.
1782 #ifdef __lock_lint
1783 if (rwtype == RW_WRITER) {
1784 rw_downgrade(&ip->i_contents);
1786 #else
1787 if (dolock && rwtype == RW_WRITER) {
1788 rw_downgrade(&ip->i_contents);
1790 #endif
1793 * We remove PROT_WRITE in cases when the file has UDF holes
1794 * because we don't want to call bmap_read() to check each
1795 * page if it is backed with a disk block.
1797 if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
1798 *protp &= ~PROT_WRITE;
1801 error = 0;
1804 * The loop looks up pages in the range <off, off + len).
1805 * For each page, we first check if we should initiate an asynchronous
1806 * read ahead before we call page_lookup (we may sleep in page_lookup
1807 * for a previously initiated disk read).
1809 eoff = (uoff + len);
1810 for (pgoff = uoff, pgaddr = addr, pl = plarr;
1811 pgoff < eoff; /* empty */) {
1812 page_t *pp;
1813 uoff_t nextrio;
1814 se_t se;
1816 se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
1819 * Handle async getpage (faultahead)
1821 if (plarr == NULL) {
1822 ip->i_nextrio = pgoff;
1823 ud_getpage_ra(vp, pgoff, seg, pgaddr);
1824 pgoff += pgsize;
1825 pgaddr += pgsize;
1826 continue;
1830 * Check if we should initiate read ahead of next cluster.
1831 * We call page_exists only when we need to confirm that
1832 * we have the current page before we initiate the read ahead.
1834 nextrio = ip->i_nextrio;
1835 if (seqmode &&
1836 pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
1837 nextrio < ip->i_size && page_exists(&vp->v_object, pgoff))
1838 ud_getpage_ra(vp, pgoff, seg, pgaddr);
1840 if ((pp = page_lookup(&vp->v_object, pgoff, se)) != NULL) {
1843 * We found the page in the page cache.
1845 *pl++ = pp;
1846 pgoff += pgsize;
1847 pgaddr += pgsize;
1848 len -= pgsize;
1849 plsz -= pgsize;
1850 } else {
1853 * We have to create the page, or read it from disk.
1855 if (error = ud_getpage_miss(vp, pgoff, len,
1856 seg, pgaddr, pl, plsz, rw, seqmode)) {
1857 goto error_out;
1860 while (*pl != NULL) {
1861 pl++;
1862 pgoff += pgsize;
1863 pgaddr += pgsize;
1864 len -= pgsize;
1865 plsz -= pgsize;
1871 * Return pages up to plsz if they are in the page cache.
1872 * We cannot return pages if there is a chance that they are
1873 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1875 if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
1877 ASSERT((protp == NULL) ||
1878 !(has_holes && (*protp & PROT_WRITE)));
1880 eoff = pgoff + plsz;
1881 while (pgoff < eoff) {
1882 page_t *pp;
1884 if ((pp = page_lookup_nowait(&vp->v_object, pgoff, SE_SHARED)) == NULL)
1885 break;
1887 *pl++ = pp;
1888 pgoff += pgsize;
1889 plsz -= pgsize;
1893 if (plarr)
1894 *pl = NULL; /* Terminate page list */
1895 ip->i_nextr = pgoff;
1897 error_out:
1898 if (error && plarr) {
1900 * Release any pages we have locked.
1902 while (pl > &plarr[0])
1903 page_unlock(*--pl);
1905 plarr[0] = NULL;
1908 update_inode:
1909 #ifdef __lock_lint
1910 rw_exit(&ip->i_contents);
1911 #else
1912 if (dolock) {
1913 rw_exit(&ip->i_contents);
1915 #endif
1918 * If the inode is not already marked for IACC (in rwip() for read)
1919 * and the inode is not marked for no access time update (in rwip()
1920 * for write) then update the inode access time and mod time now.
1922 mutex_enter(&ip->i_tlock);
1923 if ((ip->i_flag & (IACC | INOACC)) == 0) {
1924 if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
1925 ip->i_flag |= IACC;
1927 if (rw == S_WRITE) {
1928 ip->i_flag |= IUPD;
1930 ITIMES_NOLOCK(ip);
1932 mutex_exit(&ip->i_tlock);
1934 return (error);
1937 int32_t ud_delay = 1;
1939 /* ARGSUSED */
1940 static int32_t
1941 udf_putpage(
1942 struct vnode *vp,
1943 offset_t off,
1944 size_t len,
1945 int32_t flags,
1946 struct cred *cr,
1947 caller_context_t *ct)
1949 struct ud_inode *ip;
1950 int32_t error = 0;
1952 ud_printf("udf_putpage\n");
1954 ip = VTOI(vp);
1955 #ifdef __lock_lint
1956 rw_enter(&ip->i_contents, RW_WRITER);
1957 #endif
1959 if (vp->v_count == 0) {
1960 cmn_err(CE_WARN, "ud_putpage : bad v_count");
1961 error = EINVAL;
1962 goto out;
1965 if (vp->v_flag & VNOMAP) {
1966 error = ENOSYS;
1967 goto out;
1970 if (flags & B_ASYNC) {
1971 if (ud_delay && len &&
1972 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1973 mutex_enter(&ip->i_tlock);
1976 * If nobody stalled, start a new cluster.
1978 if (ip->i_delaylen == 0) {
1979 ip->i_delayoff = off;
1980 ip->i_delaylen = len;
1981 mutex_exit(&ip->i_tlock);
1982 goto out;
1986 * If we have a full cluster or they are not contig,
1987 * then push last cluster and start over.
1989 if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1990 ip->i_delayoff + ip->i_delaylen != off) {
1991 uoff_t doff;
1992 size_t dlen;
1994 doff = ip->i_delayoff;
1995 dlen = ip->i_delaylen;
1996 ip->i_delayoff = off;
1997 ip->i_delaylen = len;
1998 mutex_exit(&ip->i_tlock);
1999 error = ud_putpages(vp, doff, dlen, flags, cr);
2000 /* LMXXX - flags are new val, not old */
2001 goto out;
2005 * There is something there, it's not full, and
2006 * it is contig.
2008 ip->i_delaylen += len;
2009 mutex_exit(&ip->i_tlock);
2010 goto out;
2014 * Must have weird flags or we are not clustering.
2018 error = ud_putpages(vp, off, len, flags, cr);
2020 out:
2021 #ifdef __lock_lint
2022 rw_exit(&ip->i_contents);
2023 #endif
2024 return (error);
2027 /* ARGSUSED */
2028 static int32_t
2029 udf_map(
2030 struct vnode *vp,
2031 offset_t off,
2032 struct as *as,
2033 caddr_t *addrp,
2034 size_t len,
2035 uint8_t prot,
2036 uint8_t maxprot,
2037 uint32_t flags,
2038 struct cred *cr,
2039 caller_context_t *ct)
2041 struct segvn_crargs vn_a;
2042 int32_t error = 0;
2044 ud_printf("udf_map\n");
2046 if (vp->v_flag & VNOMAP) {
2047 error = ENOSYS;
2048 goto end;
2051 if ((off < 0) ||
2052 ((off + len) < 0)) {
2053 error = EINVAL;
2054 goto end;
2057 if (vp->v_type != VREG) {
2058 error = ENODEV;
2059 goto end;
2063 * If file is being locked, disallow mapping.
2065 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2066 error = EAGAIN;
2067 goto end;
2070 as_rangelock(as);
2071 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2072 if (error != 0) {
2073 as_rangeunlock(as);
2074 goto end;
2077 vn_a.vp = vp;
2078 vn_a.offset = off;
2079 vn_a.type = flags & MAP_TYPE;
2080 vn_a.prot = prot;
2081 vn_a.maxprot = maxprot;
2082 vn_a.cred = cr;
2083 vn_a.amp = NULL;
2084 vn_a.flags = flags & ~MAP_TYPE;
2085 vn_a.szc = 0;
2086 vn_a.lgrp_mem_policy_flags = 0;
2088 error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2089 as_rangeunlock(as);
2091 end:
2092 return (error);
2095 /* ARGSUSED */
2096 static int32_t
2097 udf_addmap(struct vnode *vp,
2098 offset_t off,
2099 struct as *as,
2100 caddr_t addr,
2101 size_t len,
2102 uint8_t prot,
2103 uint8_t maxprot,
2104 uint32_t flags,
2105 struct cred *cr,
2106 caller_context_t *ct)
2108 struct ud_inode *ip = VTOI(vp);
2110 ud_printf("udf_addmap\n");
2112 if (vp->v_flag & VNOMAP) {
2113 return (ENOSYS);
2116 mutex_enter(&ip->i_tlock);
2117 ip->i_mapcnt += btopr(len);
2118 mutex_exit(&ip->i_tlock);
2120 return (0);
2123 /* ARGSUSED */
2124 static int32_t
2125 udf_delmap(
2126 struct vnode *vp, offset_t off,
2127 struct as *as,
2128 caddr_t addr,
2129 size_t len,
2130 uint32_t prot,
2131 uint32_t maxprot,
2132 uint32_t flags,
2133 struct cred *cr,
2134 caller_context_t *ct)
2136 struct ud_inode *ip = VTOI(vp);
2138 ud_printf("udf_delmap\n");
2140 if (vp->v_flag & VNOMAP) {
2141 return (ENOSYS);
2144 mutex_enter(&ip->i_tlock);
2145 ip->i_mapcnt -= btopr(len); /* Count released mappings */
2146 ASSERT(ip->i_mapcnt >= 0);
2147 mutex_exit(&ip->i_tlock);
2149 return (0);
2152 /* ARGSUSED */
2153 static int32_t
2154 udf_l_pathconf(
2155 struct vnode *vp,
2156 int32_t cmd,
2157 ulong_t *valp,
2158 struct cred *cr,
2159 caller_context_t *ct)
2161 int32_t error = 0;
2163 ud_printf("udf_l_pathconf\n");
2165 if (cmd == _PC_FILESIZEBITS) {
2167 * udf supports 64 bits as file size
2168 * but there are several other restrictions
2169 * it only supports 32-bit block numbers and
2170 * daddr32_t is only and int32_t so taking these
2171 * into account we can stay just as where ufs is
2173 *valp = 41;
2174 } else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2175 /* nanosecond timestamp resolution */
2176 *valp = 1L;
2177 } else {
2178 error = fs_pathconf(vp, cmd, valp, cr, ct);
2181 return (error);
2184 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2185 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2186 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2188 * Assumption is that there will not be a pageio request
2189 * to a enbedded file
2191 /* ARGSUSED */
2192 static int32_t
2193 udf_pageio(
2194 struct vnode *vp,
2195 struct page *pp,
2196 uoff_t io_off,
2197 size_t io_len,
2198 int32_t flags,
2199 struct cred *cr,
2200 caller_context_t *ct)
2202 daddr_t bn;
2203 struct buf *bp;
2204 struct ud_inode *ip = VTOI(vp);
2205 int32_t dolock, error = 0, contig, multi_io;
2206 size_t done_len = 0, cur_len = 0;
2207 page_t *npp = NULL, *opp = NULL, *cpp = pp;
2209 if (pp == NULL) {
2210 return (EINVAL);
2213 dolock = (rw_owner(&ip->i_contents) != curthread);
2216 * We need a better check. Ideally, we would use another
2217 * vnodeops so that hlocked and forcibly unmounted file
2218 * systems would return EIO where appropriate and w/o the
2219 * need for these checks.
2221 if (ip->i_udf == NULL) {
2222 return (EIO);
2225 #ifdef __lock_lint
2226 rw_enter(&ip->i_contents, RW_READER);
2227 #else
2228 if (dolock) {
2229 rw_enter(&ip->i_contents, RW_READER);
2231 #endif
2234 * Break the io request into chunks, one for each contiguous
2235 * stretch of disk blocks in the target file.
2237 while (done_len < io_len) {
2238 ASSERT(cpp);
2239 bp = NULL;
2240 contig = 0;
2241 if (error = ud_bmap_read(ip, (uoff_t)(io_off + done_len),
2242 &bn, &contig)) {
2243 break;
2246 if (bn == UDF_HOLE) { /* No holey swapfiles */
2247 cmn_err(CE_WARN, "SWAP file has HOLES");
2248 error = EINVAL;
2249 break;
2252 cur_len = MIN(io_len - done_len, contig);
2255 * Check if more than one I/O is
2256 * required to complete the given
2257 * I/O operation
2259 if (ip->i_udf->udf_lbsize < PAGESIZE) {
2260 if (cur_len >= PAGESIZE) {
2261 multi_io = 0;
2262 cur_len &= PAGEMASK;
2263 } else {
2264 multi_io = 1;
2265 cur_len = MIN(io_len - done_len, PAGESIZE);
2268 page_list_break(&cpp, &npp, btop(cur_len));
2270 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2271 ASSERT(bp != NULL);
2273 bp->b_edev = ip->i_dev;
2274 bp->b_dev = cmpdev(ip->i_dev);
2275 bp->b_blkno = bn;
2276 bp->b_un.b_addr = (caddr_t)0;
2277 bp->b_file = vp;
2278 bp->b_offset = (offset_t)(io_off + done_len);
2281 * ub.ub_pageios.value.ul++;
2283 if (multi_io == 0) {
2284 (void) bdev_strategy(bp);
2285 } else {
2286 error = ud_multi_strat(ip, cpp, bp,
2287 (uoff_t)(io_off + done_len));
2288 if (error != 0) {
2289 pageio_done(bp);
2290 break;
2293 if (flags & B_READ) {
2294 ud_pageio_reads++;
2295 } else {
2296 ud_pageio_writes++;
2300 * If the request is not B_ASYNC, wait for i/o to complete
2301 * and re-assemble the page list to return to the caller.
2302 * If it is B_ASYNC we leave the page list in pieces and
2303 * cleanup() will dispose of them.
2305 if ((flags & B_ASYNC) == 0) {
2306 error = biowait(bp);
2307 pageio_done(bp);
2308 if (error) {
2309 break;
2311 page_list_concat(&opp, &cpp);
2313 cpp = npp;
2314 npp = NULL;
2315 done_len += cur_len;
2318 ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2319 if (error) {
2320 if (flags & B_ASYNC) {
2321 /* Cleanup unprocessed parts of list */
2322 page_list_concat(&cpp, &npp);
2323 if (flags & B_READ) {
2324 pvn_read_done(cpp, B_ERROR);
2325 } else {
2326 pvn_write_done(cpp, B_ERROR);
2328 } else {
2329 /* Re-assemble list and let caller clean up */
2330 page_list_concat(&opp, &cpp);
2331 page_list_concat(&opp, &npp);
2335 #ifdef __lock_lint
2336 rw_exit(&ip->i_contents);
2337 #else
2338 if (dolock) {
2339 rw_exit(&ip->i_contents);
2341 #endif
2342 return (error);
2348 /* -------------------- local functions --------------------------- */
2352 int32_t
2353 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2354 struct ud_inode *ip, caddr_t base, int32_t len,
2355 offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2357 int32_t error;
2358 struct uio auio;
2359 struct iovec aiov;
2361 ud_printf("ud_rdwri\n");
2363 bzero((caddr_t)&auio, sizeof (uio_t));
2364 bzero((caddr_t)&aiov, sizeof (iovec_t));
2366 aiov.iov_base = base;
2367 aiov.iov_len = len;
2368 auio.uio_iov = &aiov;
2369 auio.uio_iovcnt = 1;
2370 auio.uio_loffset = offset;
2371 auio.uio_segflg = (int16_t)seg;
2372 auio.uio_resid = len;
2374 if (rw == UIO_WRITE) {
2375 auio.uio_fmode = FWRITE;
2376 auio.uio_extflg = UIO_COPY_DEFAULT;
2377 auio.uio_llimit = curproc->p_fsz_ctl;
2378 error = ud_wrip(ip, &auio, ioflag, cr);
2379 } else {
2380 auio.uio_fmode = FREAD;
2381 auio.uio_extflg = UIO_COPY_CACHED;
2382 auio.uio_llimit = MAXOFFSET_T;
2383 error = ud_rdip(ip, &auio, ioflag, cr);
2386 if (aresid) {
2387 *aresid = auio.uio_resid;
2388 } else if (auio.uio_resid) {
2389 error = EIO;
2391 return (error);
2395 * Free behind hacks. The pager is busted.
2396 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2397 * or B_FREE_IF_TIGHT_ON_MEMORY.
2399 int32_t ud_freebehind = 1;
2400 int32_t ud_smallfile = 32 * 1024;
2402 /* ARGSUSED */
2403 int32_t
2404 ud_getpage_miss(struct vnode *vp, uoff_t off,
2405 size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2406 size_t plsz, enum seg_rw rw, int32_t seq)
2408 struct ud_inode *ip = VTOI(vp);
2409 int32_t err = 0;
2410 size_t io_len;
2411 uoff_t io_off;
2412 uoff_t pgoff;
2413 page_t *pp;
2415 pl[0] = NULL;
2418 * Figure out whether the page can be created, or must be
2419 * read from the disk
2421 if (rw == S_CREATE) {
2422 if ((pp = page_create_va(&vp->v_object, off,
2423 PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2424 cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2425 return (EINVAL);
2427 io_len = PAGESIZE;
2428 } else {
2429 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2430 &io_len, off, PAGESIZE, 0);
2433 * Some other thread has entered the page.
2434 * ud_getpage will retry page_lookup.
2436 if (pp == NULL) {
2437 return (0);
2441 * Fill the page with as much data as we can from the file.
2443 err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2444 if (err) {
2445 pvn_read_done(pp, B_ERROR);
2446 return (err);
2450 * XXX ??? ufs has io_len instead of pgoff below
2452 ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2455 * If the file access is sequential, initiate read ahead
2456 * of the next cluster.
2458 if (seq && ip->i_nextrio < ip->i_size) {
2459 ud_getpage_ra(vp, off, seg, addr);
2463 outmiss:
2464 pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2465 return (err);
2468 /* ARGSUSED */
2469 void
2470 ud_getpage_ra(struct vnode *vp,
2471 uoff_t off, struct seg *seg, caddr_t addr)
2473 page_t *pp;
2474 size_t io_len;
2475 struct ud_inode *ip = VTOI(vp);
2476 uoff_t io_off = ip->i_nextrio, pgoff;
2477 caddr_t addr2 = addr + (io_off - off);
2478 daddr_t bn;
2479 int32_t contig = 0;
2482 * Is this test needed?
2485 if (addr2 >= seg->s_base + seg->s_size) {
2486 return;
2489 contig = 0;
2490 if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2491 return;
2494 pp = pvn_read_kluster(vp, io_off, seg, addr2,
2495 &io_off, &io_len, io_off, PAGESIZE, 1);
2498 * Some other thread has entered the page.
2499 * So no read head done here (ie we will have to and wait
2500 * for the read when needed).
2503 if (pp == NULL) {
2504 return;
2507 (void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2508 ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2512 ud_page_fill(struct ud_inode *ip, page_t *pp, uoff_t off,
2513 uint32_t bflgs, uoff_t *pg_off)
2515 daddr_t bn;
2516 struct buf *bp;
2517 caddr_t kaddr, caddr;
2518 int32_t error = 0, contig = 0, multi_io = 0;
2519 int32_t lbsize = ip->i_udf->udf_lbsize;
2520 int32_t lbmask = ip->i_udf->udf_lbmask;
2521 uint64_t isize;
2523 isize = (ip->i_size + lbmask) & (~lbmask);
2524 if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2527 * Embedded file read file_entry
2528 * from buffer cache and copy the required
2529 * portions
2531 bp = ud_bread(ip->i_dev,
2532 ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2533 if ((bp->b_error == 0) &&
2534 (bp->b_resid == 0)) {
2536 caddr = bp->b_un.b_addr + ip->i_data_off;
2539 * mapin to kvm
2541 kaddr = (caddr_t)ppmapin(pp,
2542 PROT_READ | PROT_WRITE, (caddr_t)-1);
2543 (void) kcopy(caddr, kaddr, ip->i_size);
2546 * mapout of kvm
2548 ppmapout(kaddr);
2550 brelse(bp);
2551 contig = ip->i_size;
2552 } else {
2555 * Get the continuous size and block number
2556 * at offset "off"
2558 if (error = ud_bmap_read(ip, off, &bn, &contig))
2559 goto out;
2560 contig = MIN(contig, PAGESIZE);
2561 contig = (contig + lbmask) & (~lbmask);
2564 * Zero part of the page which we are not
2565 * going to read from the disk.
2568 if (bn == UDF_HOLE) {
2571 * This is a HOLE. Just zero out
2572 * the page
2574 if (((off + contig) == isize) ||
2575 (contig == PAGESIZE)) {
2576 pagezero(pp->p_prev, 0, PAGESIZE);
2577 goto out;
2581 if (contig < PAGESIZE) {
2582 uint64_t count;
2584 count = isize - off;
2585 if (contig != count) {
2586 multi_io = 1;
2587 contig = (int32_t)(MIN(count, PAGESIZE));
2588 } else {
2589 pagezero(pp->p_prev, contig, PAGESIZE - contig);
2594 * Get a bp and initialize it
2596 bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2597 ASSERT(bp != NULL);
2599 bp->b_edev = ip->i_dev;
2600 bp->b_dev = cmpdev(ip->i_dev);
2601 bp->b_blkno = bn;
2602 bp->b_un.b_addr = 0;
2603 bp->b_file = ip->i_vnode;
2606 * Start I/O
2608 if (multi_io == 0) {
2611 * Single I/O is sufficient for this page
2613 (void) bdev_strategy(bp);
2614 } else {
2617 * We need to do the I/O in
2618 * piece's
2620 error = ud_multi_strat(ip, pp, bp, off);
2621 if (error != 0) {
2622 goto out;
2625 if ((bflgs & B_ASYNC) == 0) {
2628 * Wait for i/o to complete.
2631 error = biowait(bp);
2632 pageio_done(bp);
2633 if (error) {
2634 goto out;
2638 if ((off + contig) >= ip->i_size) {
2639 contig = ip->i_size - off;
2642 out:
2643 *pg_off = contig;
2644 return (error);
2647 int32_t
2648 ud_putpages(struct vnode *vp, offset_t off,
2649 size_t len, int32_t flags, struct cred *cr)
2651 struct ud_inode *ip;
2652 page_t *pp;
2653 uoff_t io_off;
2654 size_t io_len;
2655 uoff_t eoff;
2656 int32_t err = 0;
2657 int32_t dolock;
2659 ud_printf("ud_putpages\n");
2661 if (vp->v_count == 0) {
2662 cmn_err(CE_WARN, "ud_putpages: bad v_count");
2663 return (EINVAL);
2666 ip = VTOI(vp);
2669 * Acquire the readers/write inode lock before locking
2670 * any pages in this inode.
2671 * The inode lock is held during i/o.
2673 if (len == 0) {
2674 mutex_enter(&ip->i_tlock);
2675 ip->i_delayoff = ip->i_delaylen = 0;
2676 mutex_exit(&ip->i_tlock);
2678 #ifdef __lock_lint
2679 rw_enter(&ip->i_contents, RW_READER);
2680 #else
2681 dolock = (rw_owner(&ip->i_contents) != curthread);
2682 if (dolock) {
2683 rw_enter(&ip->i_contents, RW_READER);
2685 #endif
2687 if (!vn_has_cached_data(vp)) {
2688 #ifdef __lock_lint
2689 rw_exit(&ip->i_contents);
2690 #else
2691 if (dolock) {
2692 rw_exit(&ip->i_contents);
2694 #endif
2695 return (0);
2698 if (len == 0) {
2700 * Search the entire vp list for pages >= off.
2702 err = pvn_vplist_dirty(vp, (uoff_t)off, ud_putapage,
2703 flags, cr);
2704 } else {
2706 * Loop over all offsets in the range looking for
2707 * pages to deal with.
2709 if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2710 eoff = MIN(off + len, eoff);
2711 } else {
2712 eoff = off + len;
2715 for (io_off = off; io_off < eoff; io_off += io_len) {
2717 * If we are not invalidating, synchronously
2718 * freeing or writing pages, use the routine
2719 * page_lookup_nowait() to prevent reclaiming
2720 * them from the free list.
2722 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2723 pp = page_lookup(&vp->v_object, io_off,
2724 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
2725 } else {
2726 pp = page_lookup_nowait(&vp->v_object,
2727 io_off,
2728 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2731 if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2732 io_len = PAGESIZE;
2733 } else {
2735 err = ud_putapage(vp, pp,
2736 &io_off, &io_len, flags, cr);
2737 if (err != 0) {
2738 break;
2741 * "io_off" and "io_len" are returned as
2742 * the range of pages we actually wrote.
2743 * This allows us to skip ahead more quickly
2744 * since several pages may've been dealt
2745 * with by this iteration of the loop.
2750 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2752 * We have just sync'ed back all the pages on
2753 * the inode, turn off the IMODTIME flag.
2755 mutex_enter(&ip->i_tlock);
2756 ip->i_flag &= ~IMODTIME;
2757 mutex_exit(&ip->i_tlock);
2759 #ifdef __lock_lint
2760 rw_exit(&ip->i_contents);
2761 #else
2762 if (dolock) {
2763 rw_exit(&ip->i_contents);
2765 #endif
2766 return (err);
2769 /* ARGSUSED */
2770 int32_t
2771 ud_putapage(struct vnode *vp,
2772 page_t *pp, uoff_t *offp,
2773 size_t *lenp, int32_t flags, struct cred *cr)
2775 daddr_t bn;
2776 size_t io_len;
2777 struct ud_inode *ip;
2778 int32_t error = 0, contig, multi_io = 0;
2779 struct udf_vfs *udf_vfsp;
2780 uoff_t off, io_off;
2781 caddr_t kaddr, caddr;
2782 struct buf *bp = NULL;
2783 int32_t lbmask;
2784 uint64_t isize;
2785 uint16_t crc_len;
2786 struct file_entry *fe;
2788 ud_printf("ud_putapage\n");
2790 ip = VTOI(vp);
2791 ASSERT(ip);
2792 ASSERT(RW_LOCK_HELD(&ip->i_contents));
2793 lbmask = ip->i_udf->udf_lbmask;
2794 isize = (ip->i_size + lbmask) & (~lbmask);
2796 udf_vfsp = ip->i_udf;
2797 ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2800 * If the modified time on the inode has not already been
2801 * set elsewhere (e.g. for write/setattr) we set the time now.
2802 * This gives us approximate modified times for mmap'ed files
2803 * which are modified via stores in the user address space.
2805 if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2806 mutex_enter(&ip->i_tlock);
2807 ip->i_flag |= IUPD;
2808 ITIMES_NOLOCK(ip);
2809 mutex_exit(&ip->i_tlock);
2814 * Align the request to a block boundry (for old file systems),
2815 * and go ask bmap() how contiguous things are for this file.
2817 off = pp->p_offset & ~(offset_t)lbmask;
2818 /* block align it */
2821 if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2822 ASSERT(ip->i_size <= ip->i_max_emb);
2824 pp = pvn_write_kluster(vp, pp, &io_off,
2825 &io_len, off, PAGESIZE, flags);
2826 if (io_len == 0) {
2827 io_len = PAGESIZE;
2830 bp = ud_bread(ip->i_dev,
2831 ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2832 udf_vfsp->udf_lbsize);
2833 fe = (struct file_entry *)bp->b_un.b_addr;
2834 if ((bp->b_flags & B_ERROR) ||
2835 (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2836 ip->i_icb_block,
2837 1, udf_vfsp->udf_lbsize) != 0)) {
2838 if (pp != NULL)
2839 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2840 if (bp->b_flags & B_ERROR) {
2841 error = EIO;
2842 } else {
2843 error = EINVAL;
2845 brelse(bp);
2846 return (error);
2848 if ((bp->b_error == 0) &&
2849 (bp->b_resid == 0)) {
2851 caddr = bp->b_un.b_addr + ip->i_data_off;
2852 kaddr = (caddr_t)ppmapin(pp,
2853 PROT_READ | PROT_WRITE, (caddr_t)-1);
2854 (void) kcopy(kaddr, caddr, ip->i_size);
2855 ppmapout(kaddr);
2857 crc_len = offsetof(struct file_entry, fe_spec) +
2858 SWAP_32(fe->fe_len_ear);
2859 crc_len += ip->i_size;
2860 ud_make_tag(ip->i_udf, &fe->fe_tag,
2861 UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2863 bwrite(bp);
2865 if (flags & B_ASYNC) {
2866 pvn_write_done(pp, flags);
2868 contig = ip->i_size;
2869 } else {
2871 if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2872 goto out;
2874 contig = MIN(contig, PAGESIZE);
2875 contig = (contig + lbmask) & (~lbmask);
2877 if (contig < PAGESIZE) {
2878 uint64_t count;
2880 count = isize - off;
2881 if (contig != count) {
2882 multi_io = 1;
2883 contig = (int32_t)(MIN(count, PAGESIZE));
2887 if ((off + contig) > isize) {
2888 contig = isize - off;
2891 if (contig > PAGESIZE) {
2892 if (contig & PAGEOFFSET) {
2893 contig &= PAGEMASK;
2897 pp = pvn_write_kluster(vp, pp, &io_off,
2898 &io_len, off, contig, flags);
2899 if (io_len == 0) {
2900 io_len = PAGESIZE;
2903 bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2904 ASSERT(bp != NULL);
2906 bp->b_edev = ip->i_dev;
2907 bp->b_dev = cmpdev(ip->i_dev);
2908 bp->b_blkno = bn;
2909 bp->b_un.b_addr = 0;
2910 bp->b_file = vp;
2911 bp->b_offset = (offset_t)off;
2915 * write throttle
2917 ASSERT(bp->b_iodone == NULL);
2918 bp->b_iodone = ud_iodone;
2919 mutex_enter(&ip->i_tlock);
2920 ip->i_writes += bp->b_bcount;
2921 mutex_exit(&ip->i_tlock);
2923 if (multi_io == 0) {
2925 (void) bdev_strategy(bp);
2926 } else {
2927 error = ud_multi_strat(ip, pp, bp, off);
2928 if (error != 0) {
2929 goto out;
2933 if ((flags & B_ASYNC) == 0) {
2935 * Wait for i/o to complete.
2937 error = biowait(bp);
2938 pageio_done(bp);
2942 if ((flags & B_ASYNC) == 0) {
2943 pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2946 pp = NULL;
2948 out:
2949 if (error != 0 && pp != NULL) {
2950 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2953 if (offp) {
2954 *offp = io_off;
2956 if (lenp) {
2957 *lenp = io_len;
2960 return (error);
2964 int32_t
2965 ud_iodone(struct buf *bp)
2967 struct ud_inode *ip;
2969 VERIFY(bp->b_pages->p_object != NULL);
2970 ASSERT(bp->b_pages->p_vnode != NULL);
2971 ASSERT(!(bp->b_flags & B_READ));
2973 bp->b_iodone = NULL;
2975 ip = VTOI(bp->b_pages->p_vnode);
2977 mutex_enter(&ip->i_tlock);
2978 if (ip->i_writes >= ud_LW) {
2979 if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2980 if (ud_WRITES) {
2981 cv_broadcast(&ip->i_wrcv); /* wake all up */
2984 } else {
2985 ip->i_writes -= bp->b_bcount;
2987 mutex_exit(&ip->i_tlock);
2988 iodone(bp);
2989 return (0);
2992 /* ARGSUSED3 */
2993 int32_t
2994 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
2996 struct vnode *vp;
2997 struct udf_vfs *udf_vfsp;
2998 krw_t rwtype;
2999 caddr_t base;
3000 uint32_t flags;
3001 int32_t error, n, on, mapon, dofree;
3002 uoff_t off;
3003 long oresid = uio->uio_resid;
3005 ASSERT(RW_LOCK_HELD(&ip->i_contents));
3006 if ((ip->i_type != VREG) &&
3007 (ip->i_type != VDIR) &&
3008 (ip->i_type != VLNK)) {
3009 return (EIO);
3012 if (uio->uio_loffset > MAXOFFSET_T) {
3013 return (0);
3016 if ((uio->uio_loffset < 0) ||
3017 ((uio->uio_loffset + uio->uio_resid) < 0)) {
3018 return (EINVAL);
3020 if (uio->uio_resid == 0) {
3021 return (0);
3024 vp = ITOV(ip);
3025 udf_vfsp = ip->i_udf;
3026 mutex_enter(&ip->i_tlock);
3027 ip->i_flag |= IACC;
3028 mutex_exit(&ip->i_tlock);
3030 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
3032 do {
3033 offset_t diff;
3034 uoff_t uoff = uio->uio_loffset;
3035 off = uoff & (offset_t)MAXBMASK;
3036 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3037 on = (int)blkoff(udf_vfsp, uoff);
3038 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3040 diff = ip->i_size - uoff;
3042 if (diff <= 0) {
3043 error = 0;
3044 goto out;
3046 if (diff < (offset_t)n) {
3047 n = (int)diff;
3049 dofree = ud_freebehind &&
3050 ip->i_nextr == (off & PAGEMASK) &&
3051 off > ud_smallfile;
3053 #ifndef __lock_lint
3054 if (rwtype == RW_READER) {
3055 rw_exit(&ip->i_contents);
3057 #endif
3059 base = segmap_getmapflt(segkmap, vp, (off + mapon),
3060 (uint32_t)n, 1, S_READ);
3061 error = uiomove(base + mapon, (long)n, UIO_READ, uio);
3063 flags = 0;
3064 if (!error) {
3066 * If read a whole block, or read to eof,
3067 * won't need this buffer again soon.
3069 if (n + on == MAXBSIZE && ud_freebehind && dofree &&
3070 freemem < lotsfree + pages_before_pager) {
3071 flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
3074 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3075 * we want to make sure that the page which has
3076 * been read, is written on disk if it is dirty.
3077 * And corresponding indirect blocks should also
3078 * be flushed out.
3080 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3081 flags &= ~SM_ASYNC;
3082 flags |= SM_WRITE;
3084 error = segmap_release(segkmap, base, flags);
3085 } else {
3086 (void) segmap_release(segkmap, base, flags);
3089 #ifndef __lock_lint
3090 if (rwtype == RW_READER) {
3091 rw_enter(&ip->i_contents, rwtype);
3093 #endif
3094 } while (error == 0 && uio->uio_resid > 0 && n != 0);
3095 out:
3097 * Inode is updated according to this table if FRSYNC is set.
3099 * FSYNC FDSYNC(posix.4)
3100 * --------------------------
3101 * always IATTCHG|IBDWRITE
3103 if (ioflag & FRSYNC) {
3104 if ((ioflag & FSYNC) ||
3105 ((ioflag & FDSYNC) &&
3106 (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3107 rw_exit(&ip->i_contents);
3108 rw_enter(&ip->i_contents, RW_WRITER);
3109 ud_iupdat(ip, 1);
3113 * If we've already done a partial read, terminate
3114 * the read but return no error.
3116 if (oresid != uio->uio_resid) {
3117 error = 0;
3119 ITIMES(ip);
3121 return (error);
3124 int32_t
3125 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
3127 caddr_t base;
3128 struct vnode *vp;
3129 struct udf_vfs *udf_vfsp;
3130 uint32_t flags;
3131 int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
3132 int32_t pagecreate, newpage;
3133 uint64_t old_i_size;
3134 uoff_t off;
3135 long start_resid = uio->uio_resid, premove_resid;
3136 rlim64_t limit = uio->uio_limit;
3139 ASSERT(RW_WRITE_HELD(&ip->i_contents));
3140 if ((ip->i_type != VREG) &&
3141 (ip->i_type != VDIR) &&
3142 (ip->i_type != VLNK)) {
3143 return (EIO);
3146 if (uio->uio_loffset >= MAXOFFSET_T) {
3147 return (EFBIG);
3150 * see udf_l_pathconf
3152 if (limit > (((uint64_t)1 << 40) - 1)) {
3153 limit = ((uint64_t)1 << 40) - 1;
3155 if (uio->uio_loffset >= limit) {
3156 proc_t *p = ttoproc(curthread);
3158 mutex_enter(&p->p_lock);
3159 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
3160 p, RCA_UNSAFE_SIGINFO);
3161 mutex_exit(&p->p_lock);
3162 return (EFBIG);
3164 if ((uio->uio_loffset < 0) ||
3165 ((uio->uio_loffset + uio->uio_resid) < 0)) {
3166 return (EINVAL);
3168 if (uio->uio_resid == 0) {
3169 return (0);
3172 mutex_enter(&ip->i_tlock);
3173 ip->i_flag |= INOACC;
3175 if (ioflag & (FSYNC | FDSYNC)) {
3176 ip->i_flag |= ISYNC;
3177 iupdat_flag = 1;
3179 mutex_exit(&ip->i_tlock);
3181 udf_vfsp = ip->i_udf;
3182 vp = ITOV(ip);
3184 do {
3185 uoff_t uoff = uio->uio_loffset;
3186 off = uoff & (offset_t)MAXBMASK;
3187 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3188 on = (int)blkoff(udf_vfsp, uoff);
3189 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3191 if (ip->i_type == VREG && uoff + n >= limit) {
3192 if (uoff >= limit) {
3193 error = EFBIG;
3194 goto out;
3196 n = (int)(limit - (rlim64_t)uoff);
3198 if (uoff + n > ip->i_size) {
3200 * We are extending the length of the file.
3201 * bmap is used so that we are sure that
3202 * if we need to allocate new blocks, that it
3203 * is done here before we up the file size.
3205 error = ud_bmap_write(ip, uoff,
3206 (int)(on + n), mapon == 0, cr);
3207 if (error) {
3208 break;
3210 i_size_changed = 1;
3211 old_i_size = ip->i_size;
3212 ip->i_size = uoff + n;
3214 * If we are writing from the beginning of
3215 * the mapping, we can just create the
3216 * pages without having to read them.
3218 pagecreate = (mapon == 0);
3219 } else if (n == MAXBSIZE) {
3221 * Going to do a whole mappings worth,
3222 * so we can just create the pages w/o
3223 * having to read them in. But before
3224 * we do that, we need to make sure any
3225 * needed blocks are allocated first.
3227 error = ud_bmap_write(ip, uoff,
3228 (int)(on + n), 1, cr);
3229 if (error) {
3230 break;
3232 pagecreate = 1;
3233 } else {
3234 pagecreate = 0;
3237 rw_exit(&ip->i_contents);
3240 * Touch the page and fault it in if it is not in
3241 * core before segmap_getmapflt can lock it. This
3242 * is to avoid the deadlock if the buffer is mapped
3243 * to the same file through mmap which we want to
3244 * write to.
3246 uio_prefaultpages((long)n, uio);
3248 base = segmap_getmapflt(segkmap, vp, (off + mapon),
3249 (uint32_t)n, !pagecreate, S_WRITE);
3252 * segmap_pagecreate() returns 1 if it calls
3253 * page_create_va() to allocate any pages.
3255 newpage = 0;
3256 if (pagecreate) {
3257 newpage = segmap_pagecreate(segkmap, base,
3258 (size_t)n, 0);
3261 premove_resid = uio->uio_resid;
3262 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3264 if (pagecreate &&
3265 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3267 * We created pages w/o initializing them completely,
3268 * thus we need to zero the part that wasn't set up.
3269 * This happens on most EOF write cases and if
3270 * we had some sort of error during the uiomove.
3272 int nzero, nmoved;
3274 nmoved = (int)(uio->uio_loffset - (off + mapon));
3275 ASSERT(nmoved >= 0 && nmoved <= n);
3276 nzero = roundup(on + n, PAGESIZE) - nmoved;
3277 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3278 (void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3282 * Unlock the pages allocated by page_create_va()
3283 * in segmap_pagecreate()
3285 if (newpage) {
3286 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3289 if (error) {
3291 * If we failed on a write, we may have already
3292 * allocated file blocks as well as pages. It's
3293 * hard to undo the block allocation, but we must
3294 * be sure to invalidate any pages that may have
3295 * been allocated.
3297 (void) segmap_release(segkmap, base, SM_INVAL);
3298 } else {
3299 flags = 0;
3301 * Force write back for synchronous write cases.
3303 if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3305 * If the sticky bit is set but the
3306 * execute bit is not set, we do a
3307 * synchronous write back and free
3308 * the page when done. We set up swap
3309 * files to be handled this way to
3310 * prevent servers from keeping around
3311 * the client's swap pages too long.
3312 * XXX - there ought to be a better way.
3314 if (IS_SWAPVP(vp)) {
3315 flags = SM_WRITE | SM_FREE |
3316 SM_DONTNEED;
3317 iupdat_flag = 0;
3318 } else {
3319 flags = SM_WRITE;
3321 } else if (((mapon + n) == MAXBSIZE) ||
3322 IS_SWAPVP(vp)) {
3324 * Have written a whole block.
3325 * Start an asynchronous write and
3326 * mark the buffer to indicate that
3327 * it won't be needed again soon.
3329 flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
3331 error = segmap_release(segkmap, base, flags);
3334 * If the operation failed and is synchronous,
3335 * then we need to unwind what uiomove() last
3336 * did so we can potentially return an error to
3337 * the caller. If this write operation was
3338 * done in two pieces and the first succeeded,
3339 * then we won't return an error for the second
3340 * piece that failed. However, we only want to
3341 * return a resid value that reflects what was
3342 * really done.
3344 * Failures for non-synchronous operations can
3345 * be ignored since the page subsystem will
3346 * retry the operation until it succeeds or the
3347 * file system is unmounted.
3349 if (error) {
3350 if ((ioflag & (FSYNC | FDSYNC)) ||
3351 ip->i_type == VDIR) {
3352 uio->uio_resid = premove_resid;
3353 } else {
3354 error = 0;
3360 * Re-acquire contents lock.
3362 rw_enter(&ip->i_contents, RW_WRITER);
3364 * If the uiomove() failed or if a synchronous
3365 * page push failed, fix up i_size.
3367 if (error) {
3368 if (i_size_changed) {
3370 * The uiomove failed, and we
3371 * allocated blocks,so get rid
3372 * of them.
3374 (void) ud_itrunc(ip, old_i_size, 0, cr);
3376 } else {
3378 * XXX - Can this be out of the loop?
3380 ip->i_flag |= IUPD | ICHG;
3381 if (i_size_changed) {
3382 ip->i_flag |= IATTCHG;
3384 if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3385 (IEXEC >> 10))) != 0 &&
3386 (ip->i_char & (ISUID | ISGID)) != 0 &&
3387 secpolicy_vnode_setid_retain(cr,
3388 (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3390 * Clear Set-UID & Set-GID bits on
3391 * successful write if not privileged
3392 * and at least one of the execute bits
3393 * is set. If we always clear Set-GID,
3394 * mandatory file and record locking is
3395 * unuseable.
3397 ip->i_char &= ~(ISUID | ISGID);
3400 } while (error == 0 && uio->uio_resid > 0 && n != 0);
3402 out:
3404 * Inode is updated according to this table -
3406 * FSYNC FDSYNC(posix.4)
3407 * --------------------------
3408 * always@ IATTCHG|IBDWRITE
3410 * @ - If we are doing synchronous write the only time we should
3411 * not be sync'ing the ip here is if we have the stickyhack
3412 * activated, the file is marked with the sticky bit and
3413 * no exec bit, the file length has not been changed and
3414 * no new blocks have been allocated during this write.
3416 if ((ip->i_flag & ISYNC) != 0) {
3418 * we have eliminated nosync
3420 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3421 ((ioflag & FSYNC) && iupdat_flag)) {
3422 ud_iupdat(ip, 1);
3427 * If we've already done a partial-write, terminate
3428 * the write but return no error.
3430 if (start_resid != uio->uio_resid) {
3431 error = 0;
3433 ip->i_flag &= ~(INOACC | ISYNC);
3434 ITIMES_NOLOCK(ip);
3436 return (error);
3439 int32_t
3440 ud_multi_strat(struct ud_inode *ip,
3441 page_t *pp, struct buf *bp, uoff_t start)
3443 daddr_t bn;
3444 int32_t error = 0, io_count, contig, alloc_sz, i;
3445 uint32_t io_off;
3446 mio_master_t *mm = NULL;
3447 mio_slave_t *ms = NULL;
3448 struct buf *rbp;
3450 ASSERT(!(start & PAGEOFFSET));
3453 * Figure out how many buffers to allocate
3455 io_count = 0;
3456 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3457 contig = 0;
3458 if (error = ud_bmap_read(ip, (uoff_t)(start + io_off),
3459 &bn, &contig)) {
3460 goto end;
3462 if (contig == 0) {
3463 goto end;
3465 contig = MIN(contig, PAGESIZE - io_off);
3466 if (bn != UDF_HOLE) {
3467 io_count ++;
3468 } else {
3470 * HOLE
3472 if (bp->b_flags & B_READ) {
3475 * This is a hole and is read
3476 * it should be filled with 0's
3478 pagezero(pp, io_off, contig);
3484 if (io_count != 0) {
3487 * Allocate memory for all the
3488 * required number of buffers
3490 alloc_sz = sizeof (mio_master_t) +
3491 (sizeof (mio_slave_t) * io_count);
3492 mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3493 if (mm == NULL) {
3494 error = ENOMEM;
3495 goto end;
3499 * initialize master
3501 mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3502 mm->mm_size = alloc_sz;
3503 mm->mm_bp = bp;
3504 mm->mm_resid = 0;
3505 mm->mm_error = 0;
3506 mm->mm_index = master_index++;
3508 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3511 * Initialize buffers
3513 io_count = 0;
3514 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3515 contig = 0;
3516 if (error = ud_bmap_read(ip,
3517 (uoff_t)(start + io_off),
3518 &bn, &contig)) {
3519 goto end;
3521 ASSERT(contig);
3522 if ((io_off + contig) > bp->b_bcount) {
3523 contig = bp->b_bcount - io_off;
3525 if (bn != UDF_HOLE) {
3527 * Clone the buffer
3528 * and prepare to start I/O
3530 ms->ms_ptr = mm;
3531 bioinit(&ms->ms_buf);
3532 rbp = bioclone(bp, io_off, (size_t)contig,
3533 bp->b_edev, bn, ud_slave_done,
3534 &ms->ms_buf, KM_NOSLEEP);
3535 ASSERT(rbp == &ms->ms_buf);
3536 mm->mm_resid += contig;
3537 io_count++;
3538 ms ++;
3543 * Start I/O's
3545 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3546 for (i = 0; i < io_count; i++) {
3547 (void) bdev_strategy(&ms->ms_buf);
3548 ms ++;
3552 end:
3553 if (error != 0) {
3554 bp->b_flags |= B_ERROR;
3555 bp->b_error = error;
3556 if (mm != NULL) {
3557 mutex_destroy(&mm->mm_mutex);
3558 kmem_free(mm, mm->mm_size);
3561 return (error);
3564 int32_t
3565 ud_slave_done(struct buf *bp)
3567 mio_master_t *mm;
3568 int32_t resid;
3570 ASSERT(SEMA_HELD(&bp->b_sem));
3571 ASSERT((bp->b_flags & B_DONE) == 0);
3573 mm = ((mio_slave_t *)bp)->ms_ptr;
3576 * Propagate error and byte count info from slave struct to
3577 * the master struct
3579 mutex_enter(&mm->mm_mutex);
3580 if (bp->b_flags & B_ERROR) {
3583 * If multiple slave buffers get
3584 * error we forget the old errors
3585 * this is ok because we any way
3586 * cannot return multiple errors
3588 mm->mm_error = bp->b_error;
3590 mm->mm_resid -= bp->b_bcount;
3591 resid = mm->mm_resid;
3592 mutex_exit(&mm->mm_mutex);
3595 * free up the resources allocated to cloned buffers.
3597 bp_mapout(bp);
3598 biofini(bp);
3600 if (resid == 0) {
3603 * This is the last I/O operation
3604 * clean up and return the original buffer
3606 if (mm->mm_error) {
3607 mm->mm_bp->b_flags |= B_ERROR;
3608 mm->mm_bp->b_error = mm->mm_error;
3610 biodone(mm->mm_bp);
3611 mutex_destroy(&mm->mm_mutex);
3612 kmem_free(mm, mm->mm_size);
3614 return (0);