kernel/fs/udfs/udf_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25
  26 /*
  27  * Copyright 2015, Joyent, Inc.
  28  */
  29
  30 #include <sys/types.h>
  31 #include <sys/t_lock.h>
  32 #include <sys/param.h>
  33 #include <sys/time.h>
  34 #include <sys/systm.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/resource.h>
  37 #include <sys/signal.h>
  38 #include <sys/cred.h>
  39 #include <sys/user.h>
  40 #include <sys/buf.h>
  41 #include <sys/vfs.h>
  42 #include <sys/stat.h>
  43 #include <sys/vnode.h>
  44 #include <sys/mode.h>
  45 #include <sys/proc.h>
  46 #include <sys/disp.h>
  47 #include <sys/file.h>
  48 #include <sys/fcntl.h>
  49 #include <sys/flock.h>
  50 #include <sys/kmem.h>
  51 #include <sys/uio.h>
  52 #include <sys/dnlc.h>
  53 #include <sys/conf.h>
  54 #include <sys/errno.h>
  55 #include <sys/mman.h>
  56 #include <sys/fbuf.h>
  57 #include <sys/pathname.h>
  58 #include <sys/debug.h>
  59 #include <sys/vmsystm.h>
  60 #include <sys/cmn_err.h>
  61 #include <sys/dirent.h>
  62 #include <sys/errno.h>
  63 #include <sys/modctl.h>
  64 #include <sys/statvfs.h>
  65 #include <sys/mount.h>
  66 #include <sys/sunddi.h>
  67 #include <sys/bootconf.h>
  68 #include <sys/policy.h>
  69
  70 #include <vm/hat.h>
  71 #include <vm/page.h>
  72 #include <vm/pvn.h>
  73 #include <vm/as.h>
  74 #include <vm/seg.h>
  75 #include <vm/seg_map.h>
  76 #include <vm/seg_kmem.h>
  77 #include <vm/seg_vn.h>
  78 #include <vm/rm.h>
  79 #include <vm/page.h>
  80 #include <sys/swap.h>
  81
  82 #include <sys/fs_subr.h>
  83
  84 #include <sys/fs/udf_volume.h>
  85 #include <sys/fs/udf_inode.h>
  86
  87 static int32_t udf_open(struct vnode **,
  88         int32_t, struct cred *, caller_context_t *);
  89 static int32_t udf_close(struct vnode *,
  90         int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
  91 static int32_t udf_read(struct vnode *,
  92         struct uio *, int32_t, struct cred *, caller_context_t *);
  93 static int32_t udf_write(struct vnode *,
  94         struct uio *, int32_t, struct cred *, caller_context_t *);
  95 static int32_t udf_ioctl(struct vnode *,
  96         int32_t, intptr_t, int32_t, struct cred *, int32_t *,
  97         caller_context_t *);
  98 static int32_t udf_getattr(struct vnode *,
  99         struct vattr *, int32_t, struct cred *, caller_context_t *);
 100 static int32_t udf_setattr(struct vnode *,
 101         struct vattr *, int32_t, struct cred *, caller_context_t *);
 102 static int32_t udf_access(struct vnode *,
 103         int32_t, int32_t, struct cred *, caller_context_t *);
 104 static int32_t udf_lookup(struct vnode *,
 105         char *, struct vnode **, struct pathname *,
 106         int32_t, struct vnode *, struct cred *,
 107         caller_context_t *, int *, pathname_t *);
 108 static int32_t udf_create(struct vnode *,
 109         char *, struct vattr *, enum vcexcl,
 110         int32_t, struct vnode **, struct cred *, int32_t,
 111         caller_context_t *, vsecattr_t *);
 112 static int32_t udf_remove(struct vnode *,
 113         char *, struct cred *, caller_context_t *, int);
 114 static int32_t udf_link(struct vnode *,
 115         struct vnode *, char *, struct cred *, caller_context_t *, int);
 116 static int32_t udf_rename(struct vnode *,
 117         char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
 118 static int32_t udf_mkdir(struct vnode *,
 119         char *, struct vattr *, struct vnode **, struct cred *,
 120         caller_context_t *, int, vsecattr_t *);
 121 static int32_t udf_rmdir(struct vnode *,
 122         char *, struct vnode *, struct cred *, caller_context_t *, int);
 123 static int32_t udf_readdir(struct vnode *,
 124         struct uio *, struct cred *, int32_t *, caller_context_t *, int);
 125 static int32_t udf_symlink(struct vnode *,
 126         char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
 127 static int32_t udf_readlink(struct vnode *,
 128         struct uio *, struct cred *, caller_context_t *);
 129 static int32_t udf_fsync(struct vnode *,
 130         int32_t, struct cred *, caller_context_t *);
 131 static void udf_inactive(struct vnode *,
 132         struct cred *, caller_context_t *);
 133 static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
 134 static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
 135 static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
 136 static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
 137         caller_context_t *);
 138 static int32_t udf_frlock(struct vnode *, int32_t,
 139         struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
 140         caller_context_t *);
 141 static int32_t udf_space(struct vnode *, int32_t,
 142         struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
 143 static int32_t udf_getpage(struct vnode *, offset_t,
 144         size_t, uint32_t *, struct page **, size_t,
 145         struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
 146 static int32_t udf_putpage(struct vnode *, offset_t,
 147         size_t, int32_t, struct cred *, caller_context_t *);
 148 static int32_t udf_map(struct vnode *, offset_t, struct as *,
 149         caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
 150         caller_context_t *);
 151 static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
 152         caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
 153         caller_context_t *);
 154 static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
 155         caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
 156         caller_context_t *);
 157 static int32_t udf_l_pathconf(struct vnode *, int32_t,
 158         ulong_t *, struct cred *, caller_context_t *);
 159 static int32_t udf_pageio(struct vnode *, struct page *,
 160         uoff_t, size_t, int32_t, struct cred *, caller_context_t *);
 161
 162 int32_t ud_getpage_miss(struct vnode *, uoff_t,
 163         size_t, struct seg *, caddr_t, page_t *pl[],
 164         size_t, enum seg_rw, int32_t);
 165 void ud_getpage_ra(struct vnode *, uoff_t, struct seg *, caddr_t);
 166 int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
 167 int32_t ud_page_fill(struct ud_inode *, page_t *,
 168         uoff_t, uint32_t, uoff_t *);
 169 int32_t ud_iodone(struct buf *);
 170 int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
 171 int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
 172 int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, uoff_t);
 173 int32_t ud_slave_done(struct buf *);
 174
 175 /*
 176  * Structures to control multiple IO operations to get or put pages
 177  * that are backed by discontiguous blocks. The master struct is
 178  * a dummy that holds the original bp from pageio_setup. The
 179  * slave struct holds the working bp's to do the actual IO. Once
 180  * all the slave IOs complete. The master is processed as if a single
 181  * IO op has completed.
 182  */
 183 uint32_t master_index = 0;
 184 typedef struct mio_master {
 185         kmutex_t        mm_mutex;       /* protect the fields below */
 186         int32_t         mm_size;
 187         buf_t           *mm_bp;         /* original bp */
 188         int32_t         mm_resid;       /* bytes remaining to transfer */
 189         int32_t         mm_error;       /* accumulated error from slaves */
 190         int32_t         mm_index;       /* XXX debugging */
 191 } mio_master_t;
 192
 193 typedef struct mio_slave {
 194         buf_t           ms_buf;         /* working buffer for this IO chunk */
 195         mio_master_t    *ms_ptr;        /* pointer to master */
 196 } mio_slave_t;
 197
 198 const struct vnodeops udf_vnodeops = {
 199         .vnop_name = "udfs",
 200         .vop_open = udf_open,
 201         .vop_close = udf_close,
 202         .vop_read = udf_read,
 203         .vop_write = udf_write,
 204         .vop_ioctl = udf_ioctl,
 205         .vop_getattr = udf_getattr,
 206         .vop_setattr = udf_setattr,
 207         .vop_access = udf_access,
 208         .vop_lookup = udf_lookup,
 209         .vop_create = udf_create,
 210         .vop_remove = udf_remove,
 211         .vop_link = udf_link,
 212         .vop_rename = udf_rename,
 213         .vop_mkdir = udf_mkdir,
 214         .vop_rmdir = udf_rmdir,
 215         .vop_readdir = udf_readdir,
 216         .vop_symlink = udf_symlink,
 217         .vop_readlink = udf_readlink,
 218         .vop_fsync = udf_fsync,
 219         .vop_inactive = udf_inactive,
 220         .vop_fid = udf_fid,
 221         .vop_rwlock = udf_rwlock,
 222         .vop_rwunlock = udf_rwunlock,
 223         .vop_seek = udf_seek,
 224         .vop_frlock = udf_frlock,
 225         .vop_space = udf_space,
 226         .vop_getpage = udf_getpage,
 227         .vop_putpage = udf_putpage,
 228         .vop_map = udf_map,
 229         .vop_addmap = udf_addmap,
 230         .vop_delmap = udf_delmap,
 231         .vop_pathconf = udf_l_pathconf,
 232         .vop_pageio = udf_pageio,
 233         .vop_vnevent = fs_vnevent_support,
 234 };
 235
 236 /* ARGSUSED */
 237 static int32_t
 238 udf_open(
 239         struct vnode **vpp,
 240         int32_t flag,
 241         struct cred *cr,
 242         caller_context_t *ct)
 243 {
 244         ud_printf("udf_open\n");
 245
 246         return (0);
 247 }
 248
 249 /* ARGSUSED */
 250 static int32_t
 251 udf_close(
 252         struct vnode *vp,
 253         int32_t flag,
 254         int32_t count,
 255         offset_t offset,
 256         struct cred *cr,
 257         caller_context_t *ct)
 258 {
 259         struct ud_inode *ip = VTOI(vp);
 260
 261         ud_printf("udf_close\n");
 262
 263         ITIMES(ip);
 264
 265         cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 266         cleanshares(vp, ttoproc(curthread)->p_pid);
 267
 268         /*
 269          * Push partially filled cluster at last close.
 270          * ``last close'' is approximated because the dnlc
 271          * may have a hold on the vnode.
 272          */
 273         if (vp->v_count <= 2 && vp->v_type != VBAD) {
 274                 struct ud_inode *ip = VTOI(vp);
 275                 if (ip->i_delaylen) {
 276                         (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
 277                             B_ASYNC | B_FREE, cr);
 278                         ip->i_delaylen = 0;
 279                 }
 280         }
 281
 282         return (0);
 283 }
 284
 285 /* ARGSUSED */
 286 static int32_t
 287 udf_read(
 288         struct vnode *vp,
 289         struct uio *uiop,
 290         int32_t ioflag,
 291         struct cred *cr,
 292         caller_context_t *ct)
 293 {
 294         struct ud_inode *ip = VTOI(vp);
 295         int32_t error;
 296
 297         ud_printf("udf_read\n");
 298
 299 #ifdef  __lock_lint
 300         rw_enter(&ip->i_rwlock, RW_READER);
 301 #endif
 302
 303         ASSERT(RW_READ_HELD(&ip->i_rwlock));
 304
 305         if (MANDLOCK(vp, ip->i_char)) {
 306                 /*
 307                  * udf_getattr ends up being called by chklock
 308                  */
 309                 error = chklock(vp, FREAD, uiop->uio_loffset,
 310                     uiop->uio_resid, uiop->uio_fmode, ct);
 311                 if (error) {
 312                         goto end;
 313                 }
 314         }
 315
 316         rw_enter(&ip->i_contents, RW_READER);
 317         error = ud_rdip(ip, uiop, ioflag, cr);
 318         rw_exit(&ip->i_contents);
 319
 320 end:
 321 #ifdef  __lock_lint
 322         rw_exit(&ip->i_rwlock);
 323 #endif
 324
 325         return (error);
 326 }
 327
 328
/*
 * Write-throttling knobs consulted by udf_write().
 */
int32_t ud_WRITES = 1;          /* nonzero: udf_write throttles writers */
int32_t ud_HW = 96 * 1024;      /* writers wait while i_writes exceeds this */
int32_t ud_LW = 64 * 1024;      /* presumably the low-water mark; not used in udf_write -- TODO confirm */
int32_t ud_throttles = 0;       /* number of times a writer had to wait */
 333
 334 /* ARGSUSED */
 335 static int32_t
 336 udf_write(
 337         struct vnode *vp,
 338         struct uio *uiop,
 339         int32_t ioflag,
 340         struct cred *cr,
 341         caller_context_t *ct)
 342 {
 343         struct ud_inode *ip = VTOI(vp);
 344         int32_t error = 0;
 345
 346         ud_printf("udf_write\n");
 347
 348 #ifdef  __lock_lint
 349         rw_enter(&ip->i_rwlock, RW_WRITER);
 350 #endif
 351
 352         ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
 353
 354         if (MANDLOCK(vp, ip->i_char)) {
 355                 /*
 356                  * ud_getattr ends up being called by chklock
 357                  */
 358                 error = chklock(vp, FWRITE, uiop->uio_loffset,
 359                     uiop->uio_resid, uiop->uio_fmode, ct);
 360                 if (error) {
 361                         goto end;
 362                 }
 363         }
 364         /*
 365          * Throttle writes.
 366          */
 367         mutex_enter(&ip->i_tlock);
 368         if (ud_WRITES && (ip->i_writes > ud_HW)) {
 369                 while (ip->i_writes > ud_HW) {
 370                         ud_throttles++;
 371                         cv_wait(&ip->i_wrcv, &ip->i_tlock);
 372                 }
 373         }
 374         mutex_exit(&ip->i_tlock);
 375
 376         /*
 377          * Write to the file
 378          */
 379         rw_enter(&ip->i_contents, RW_WRITER);
 380         if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
 381                 /*
 382                  * In append mode start at end of file.
 383                  */
 384                 uiop->uio_loffset = ip->i_size;
 385         }
 386         error = ud_wrip(ip, uiop, ioflag, cr);
 387         rw_exit(&ip->i_contents);
 388
 389 end:
 390 #ifdef  __lock_lint
 391         rw_exit(&ip->i_rwlock);
 392 #endif
 393
 394         return (error);
 395 }
 396
 397 /* ARGSUSED */
 398 static int32_t
 399 udf_ioctl(
 400         struct vnode *vp,
 401         int32_t cmd,
 402         intptr_t arg,
 403         int32_t flag,
 404         struct cred *cr,
 405         int32_t *rvalp,
 406         caller_context_t *ct)
 407 {
 408         return (ENOTTY);
 409 }
 410
 411 /* ARGSUSED */
 412 static int32_t
 413 udf_getattr(
 414         struct vnode *vp,
 415         struct vattr *vap,
 416         int32_t flags,
 417         struct cred *cr,
 418         caller_context_t *ct)
 419 {
 420         struct ud_inode *ip = VTOI(vp);
 421
 422         ud_printf("udf_getattr\n");
 423
 424         if (vap->va_mask == VATTR_SIZE) {
 425                 /*
 426                  * for performance, if only the size is requested don't bother
 427                  * with anything else.
 428                  */
 429                 vap->va_size = ip->i_size;
 430                 return (0);
 431         }
 432
 433         rw_enter(&ip->i_contents, RW_READER);
 434
 435         vap->va_type = vp->v_type;
 436         vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
 437
 438         vap->va_uid = ip->i_uid;
 439         vap->va_gid = ip->i_gid;
 440         vap->va_fsid = ip->i_dev;
 441         vap->va_nodeid = ip->i_icb_lbano;
 442         vap->va_nlink = ip->i_nlink;
 443         vap->va_size = ip->i_size;
 444         vap->va_seq = ip->i_seq;
 445         if (vp->v_type == VCHR || vp->v_type == VBLK) {
 446                 vap->va_rdev = ip->i_rdev;
 447         } else {
 448                 vap->va_rdev = 0;
 449         }
 450
 451         mutex_enter(&ip->i_tlock);
 452         ITIMES_NOLOCK(ip);      /* mark correct time in inode */
 453         vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
 454         vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
 455         vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
 456         vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
 457         vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
 458         vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
 459         mutex_exit(&ip->i_tlock);
 460
 461         switch (ip->i_type) {
 462                 case VBLK:
 463                         vap->va_blksize = MAXBSIZE;
 464                         break;
 465                 case VCHR:
 466                         vap->va_blksize = MAXBSIZE;
 467                         break;
 468                 default:
 469                         vap->va_blksize = ip->i_udf->udf_lbsize;
 470                         break;
 471         }
 472         vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
 473
 474         rw_exit(&ip->i_contents);
 475
 476         return (0);
 477 }
 478
 479 static int
 480 ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
 481 {
 482         return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0));
 483 }
 484
 485 /*ARGSUSED4*/
 486 static int32_t
 487 udf_setattr(
 488         struct vnode *vp,
 489         struct vattr *vap,
 490         int32_t flags,
 491         struct cred *cr,
 492         caller_context_t *ct)
 493 {
 494         int32_t error = 0;
 495         uint32_t mask = vap->va_mask;
 496         struct ud_inode *ip;
 497         timestruc_t now;
 498         struct vattr ovap;
 499
 500         ud_printf("udf_setattr\n");
 501
 502         ip = VTOI(vp);
 503
 504         /*
 505          * not updates allowed to 4096 files
 506          */
 507         if (ip->i_astrat == STRAT_TYPE4096) {
 508                 return (EINVAL);
 509         }
 510
 511         /*
 512          * Cannot set these attributes
 513          */
 514         if (mask & VATTR_NOSET) {
 515                 return (EINVAL);
 516         }
 517
 518         rw_enter(&ip->i_rwlock, RW_WRITER);
 519         rw_enter(&ip->i_contents, RW_WRITER);
 520
 521         ovap.va_uid = ip->i_uid;
 522         ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
 523         error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
 524             ud_iaccess_vmode, ip);
 525         if (error)
 526                 goto update_inode;
 527
 528         mask = vap->va_mask;
 529         /*
 530          * Change file access modes.
 531          */
 532         if (mask & VATTR_MODE) {
 533                 ip->i_perm = VA2UD_PERM(vap->va_mode);
 534                 ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
 535                 mutex_enter(&ip->i_tlock);
 536                 ip->i_flag |= ICHG;
 537                 mutex_exit(&ip->i_tlock);
 538         }
 539         if (mask & (VATTR_UID|VATTR_GID)) {
 540                 if (mask & VATTR_UID) {
 541                         ip->i_uid = vap->va_uid;
 542                 }
 543                 if (mask & VATTR_GID) {
 544                         ip->i_gid = vap->va_gid;
 545                 }
 546                 mutex_enter(&ip->i_tlock);
 547                 ip->i_flag |= ICHG;
 548                 mutex_exit(&ip->i_tlock);
 549         }
 550         /*
 551          * Truncate file.  Must have write permission and not be a directory.
 552          */
 553         if (mask & VATTR_SIZE) {
 554                 if (vp->v_type == VDIR) {
 555                         error = EISDIR;
 556                         goto update_inode;
 557                 }
 558                 if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
 559                         goto update_inode;
 560                 }
 561                 if (vap->va_size > MAXOFFSET_T) {
 562                         error = EFBIG;
 563                         goto update_inode;
 564                 }
 565                 if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
 566                         goto update_inode;
 567                 }
 568
 569                 if (vap->va_size == 0)
 570                         vnevent_truncate(vp, ct);
 571         }
 572         /*
 573          * Change file access or modified times.
 574          */
 575         if (mask & (VATTR_ATIME|VATTR_MTIME)) {
 576                 mutex_enter(&ip->i_tlock);
 577                 if (mask & VATTR_ATIME) {
 578                         ip->i_atime.tv_sec = vap->va_atime.tv_sec;
 579                         ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
 580                         ip->i_flag &= ~IACC;
 581                 }
 582                 if (mask & VATTR_MTIME) {
 583                         ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
 584                         ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
 585                         gethrestime(&now);
 586                         ip->i_ctime.tv_sec = now.tv_sec;
 587                         ip->i_ctime.tv_nsec = now.tv_nsec;
 588                         ip->i_flag &= ~(IUPD|ICHG);
 589                         ip->i_flag |= IMODTIME;
 590                 }
 591                 ip->i_flag |= IMOD;
 592                 mutex_exit(&ip->i_tlock);
 593         }
 594
 595 update_inode:
 596         if (curthread->t_flag & T_DONTPEND) {
 597                 ud_iupdat(ip, 1);
 598         } else {
 599                 ITIMES_NOLOCK(ip);
 600         }
 601         rw_exit(&ip->i_contents);
 602         rw_exit(&ip->i_rwlock);
 603
 604         return (error);
 605 }
 606
 607 /* ARGSUSED */
 608 static int32_t
 609 udf_access(
 610         struct vnode *vp,
 611         int32_t mode,
 612         int32_t flags,
 613         struct cred *cr,
 614         caller_context_t *ct)
 615 {
 616         struct ud_inode *ip = VTOI(vp);
 617
 618         ud_printf("udf_access\n");
 619
 620         if (ip->i_udf == NULL) {
 621                 return (EIO);
 622         }
 623
 624         return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
 625 }
 626
 627 int32_t udfs_stickyhack = 1;
 628
 629 /* ARGSUSED */
 630 static int32_t
 631 udf_lookup(
 632         struct vnode *dvp,
 633         char *nm,
 634         struct vnode **vpp,
 635         struct pathname *pnp,
 636         int32_t flags,
 637         struct vnode *rdir,
 638         struct cred *cr,
 639         caller_context_t *ct,
 640         int *direntflags,
 641         pathname_t *realpnp)
 642 {
 643         int32_t error;
 644         struct vnode *vp;
 645         struct ud_inode *ip, *xip;
 646
 647         ud_printf("udf_lookup\n");
 648         /*
 649          * Null component name is a synonym for directory being searched.
 650          */
 651         if (*nm == '\0') {
 652                 VN_HOLD(dvp);
 653                 *vpp = dvp;
 654                 error = 0;
 655                 goto out;
 656         }
 657
 658         /*
 659          * Fast path: Check the directory name lookup cache.
 660          */
 661         ip = VTOI(dvp);
 662         if (vp = dnlc_lookup(dvp, nm)) {
 663                 /*
 664                  * Check accessibility of directory.
 665                  */
 666                 if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
 667                         VN_RELE(vp);
 668                 }
 669                 xip = VTOI(vp);
 670         } else {
 671                 error = ud_dirlook(ip, nm, &xip, cr, 1);
 672                 ITIMES(ip);
 673         }
 674
 675         if (error == 0) {
 676                 ip = xip;
 677                 *vpp = ITOV(ip);
 678                 if ((ip->i_type != VDIR) &&
 679                     (ip->i_char & ISVTX) &&
 680                     ((ip->i_perm & IEXEC) == 0) &&
 681                     udfs_stickyhack) {
 682                         mutex_enter(&(*vpp)->v_lock);
 683                         (*vpp)->v_flag |= VISSWAP;
 684                         mutex_exit(&(*vpp)->v_lock);
 685                 }
 686                 ITIMES(ip);
 687                 /*
 688                  * If vnode is a device return special vnode instead.
 689                  */
 690                 if (IS_DEVVP(*vpp)) {
 691                         struct vnode *newvp;
 692                         newvp = specvp(*vpp, (*vpp)->v_rdev,
 693                             (*vpp)->v_type, cr);
 694                         VN_RELE(*vpp);
 695                         if (newvp == NULL) {
 696                                 error = ENOSYS;
 697                         } else {
 698                                 *vpp = newvp;
 699                         }
 700                 }
 701         }
 702 out:
 703         return (error);
 704 }
 705
 706 /* ARGSUSED */
 707 static int32_t
 708 udf_create(
 709         struct vnode *dvp,
 710         char *name,
 711         struct vattr *vap,
 712         enum vcexcl excl,
 713         int32_t mode,
 714         struct vnode **vpp,
 715         struct cred *cr,
 716         int32_t flag,
 717         caller_context_t *ct,
 718         vsecattr_t *vsecp)
 719 {
 720         int32_t error;
 721         struct ud_inode *ip = VTOI(dvp), *xip;
 722
 723         ud_printf("udf_create\n");
 724
 725         if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
 726                 vap->va_mode &= ~VSVTX;
 727
 728         if (*name == '\0') {
 729                 /*
 730                  * Null component name refers to the directory itself.
 731                  */
 732                 VN_HOLD(dvp);
 733                 ITIMES(ip);
 734                 error = EEXIST;
 735         } else {
 736                 xip = NULL;
 737                 rw_enter(&ip->i_rwlock, RW_WRITER);
 738                 error = ud_direnter(ip, name, DE_CREATE, NULL, NULL, vap,
 739                     &xip, cr, ct);
 740                 rw_exit(&ip->i_rwlock);
 741                 ITIMES(ip);
 742                 ip = xip;
 743         }
 744 #ifdef  __lock_lint
 745         rw_enter(&ip->i_contents, RW_WRITER);
 746 #else
 747         if (ip != NULL) {
 748                 rw_enter(&ip->i_contents, RW_WRITER);
 749         }
 750 #endif
 751
 752         /*
 753          * If the file already exists and this is a non-exclusive create,
 754          * check permissions and allow access for non-directories.
 755          * Read-only create of an existing directory is also allowed.
 756          * We fail an exclusive create of anything which already exists.
 757          */
 758         if (error == EEXIST) {
 759                 if (excl == NONEXCL) {
 760                         if ((ip->i_type == VDIR) && (mode & VWRITE)) {
 761                                 error = EISDIR;
 762                         } else if (mode) {
 763                                 error = ud_iaccess(ip,
 764                                     UD_UPERM2DPERM(mode), cr, 0);
 765                         } else {
 766                                 error = 0;
 767                         }
 768                 }
 769                 if (error) {
 770                         rw_exit(&ip->i_contents);
 771                         VN_RELE(ITOV(ip));
 772                         goto out;
 773                 } else if ((ip->i_type == VREG) &&
 774                     (vap->va_mask & VATTR_SIZE) && vap->va_size == 0) {
 775                         /*
 776                          * Truncate regular files, if requested by caller.
 777                          * Grab i_rwlock to make sure no one else is
 778                          * currently writing to the file (we promised
 779                          * bmap we would do this).
 780                          * Must get the locks in the correct order.
 781                          */
 782                         if (ip->i_size == 0) {
 783                                 ip->i_flag |= ICHG | IUPD;
 784                         } else {
 785                                 rw_exit(&ip->i_contents);
 786                                 rw_enter(&ip->i_rwlock, RW_WRITER);
 787                                 rw_enter(&ip->i_contents, RW_WRITER);
 788                                 (void) ud_itrunc(ip, 0, 0, cr);
 789                                 rw_exit(&ip->i_rwlock);
 790                         }
 791                         vnevent_create(ITOV(ip), ct);
 792                 }
 793         }
 794
 795         if (error == 0) {
 796                 *vpp = ITOV(ip);
 797                 ITIMES(ip);
 798         }
 799 #ifdef  __lock_lint
 800         rw_exit(&ip->i_contents);
 801 #else
 802         if (ip != NULL) {
 803                 rw_exit(&ip->i_contents);
 804         }
 805 #endif
 806         if (error) {
 807                 goto out;
 808         }
 809
 810         /*
 811          * If vnode is a device return special vnode instead.
 812          */
 813         if (!error && IS_DEVVP(*vpp)) {
 814                 struct vnode *newvp;
 815
 816                 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 817                 VN_RELE(*vpp);
 818                 if (newvp == NULL) {
 819                         error = ENOSYS;
 820                         goto out;
 821                 }
 822                 *vpp = newvp;
 823         }
 824 out:
 825         return (error);
 826 }
 827
 828 /* ARGSUSED */
 829 static int32_t
 830 udf_remove(
 831         struct vnode *vp,
 832         char *nm,
 833         struct cred *cr,
 834         caller_context_t *ct,
 835         int flags)
 836 {
 837         int32_t error;
 838         struct ud_inode *ip = VTOI(vp);
 839
 840         ud_printf("udf_remove\n");
 841
 842         rw_enter(&ip->i_rwlock, RW_WRITER);
 843         error = ud_dirremove(ip, nm,
 844             NULL, NULL, DR_REMOVE, cr, ct);
 845         rw_exit(&ip->i_rwlock);
 846         ITIMES(ip);
 847
 848         return (error);
 849 }
 850
 851 /* ARGSUSED */
 852 static int32_t
 853 udf_link(
 854         struct vnode *tdvp,
 855         struct vnode *svp,
 856         char *tnm,
 857         struct cred *cr,
 858         caller_context_t *ct,
 859         int flags)
 860 {
 861         int32_t error;
 862         struct vnode *realvp;
 863         struct ud_inode *sip;
 864         struct ud_inode *tdp;
 865
 866         ud_printf("udf_link\n");
 867         if (fop_realvp(svp, &realvp, ct) == 0) {
 868                 svp = realvp;
 869         }
 870
 871         /*
 872          * Do not allow links to directories
 873          */
 874         if (svp->v_type == VDIR) {
 875                 return (EPERM);
 876         }
 877
 878         sip = VTOI(svp);
 879
 880         if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
 881                 return (EPERM);
 882
 883         tdp = VTOI(tdvp);
 884
 885         rw_enter(&tdp->i_rwlock, RW_WRITER);
 886         error = ud_direnter(tdp, tnm, DE_LINK, NULL,
 887             sip, NULL, (struct ud_inode **)0, cr, ct);
 888         rw_exit(&tdp->i_rwlock);
 889         ITIMES(sip);
 890         ITIMES(tdp);
 891
 892         if (error == 0) {
 893                 vnevent_link(svp, ct);
 894         }
 895
 896         return (error);
 897 }
 898
 899 /* ARGSUSED */
 900 static int32_t
 901 udf_rename(
 902         struct vnode *sdvp,
 903         char *snm,
 904         struct vnode *tdvp,
 905         char *tnm,
 906         struct cred *cr,
 907         caller_context_t *ct,
 908         int flags)
 909 {
 910         int32_t error = 0;
 911         struct udf_vfs *udf_vfsp;
 912         struct ud_inode *sip;           /* source inode */
 913         struct ud_inode *tip;           /* target inode */
 914         struct ud_inode *sdp, *tdp;     /* source and target parent inode */
 915         struct vnode *realvp;
 916
 917         ud_printf("udf_rename\n");
 918
 919         if (fop_realvp(tdvp, &realvp, ct) == 0) {
 920                 tdvp = realvp;
 921         }
 922
 923         sdp = VTOI(sdvp);
 924         tdp = VTOI(tdvp);
 925
 926         udf_vfsp = sdp->i_udf;
 927
 928         mutex_enter(&udf_vfsp->udf_rename_lck);
 929         /*
 930          * Look up inode of file we're supposed to rename.
 931          */
 932         if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
 933                 mutex_exit(&udf_vfsp->udf_rename_lck);
 934                 return (error);
 935         }
 936         /*
 937          * be sure this is not a directory with another file system mounted
 938          * over it.  If it is just give up the locks, and return with
 939          * EBUSY
 940          */
 941         if (vn_mountedvfs(ITOV(sip)) != NULL) {
 942                 error = EBUSY;
 943                 goto errout;
 944         }
 945         /*
 946          * Make sure we can delete the source entry.  This requires
 947          * write permission on the containing directory.  If that
 948          * directory is "sticky" it further requires (except for
 949          * privileged users) that the user own the directory or the
 950          * source entry, or else have permission to write the source
 951          * entry.
 952          */
 953         rw_enter(&sdp->i_contents, RW_READER);
 954         rw_enter(&sip->i_contents, RW_READER);
 955         if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
 956             (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
 957                 rw_exit(&sip->i_contents);
 958                 rw_exit(&sdp->i_contents);
 959                 ITIMES(sip);
 960                 goto errout;
 961         }
 962
 963         /*
 964          * Check for renaming '.' or '..' or alias of '.'
 965          */
 966         if ((strcmp(snm, ".") == 0) ||
 967             (strcmp(snm, "..") == 0) ||
 968             (sdp == sip)) {
 969                 error = EINVAL;
 970                 rw_exit(&sip->i_contents);
 971                 rw_exit(&sdp->i_contents);
 972                 goto errout;
 973         }
 974
 975         rw_exit(&sip->i_contents);
 976         rw_exit(&sdp->i_contents);
 977
 978         if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
 979                 vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
 980                 VN_RELE(ITOV(tip));
 981         }
 982
 983         /* Notify the target dir. if not the same as the source dir. */
 984         if (sdvp != tdvp)
 985                 vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);
 986
 987         vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);
 988
 989         /*
 990          * Link source to the target.
 991          */
 992         rw_enter(&tdp->i_rwlock, RW_WRITER);
 993         if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
 994             NULL, (struct ud_inode **)0, cr, ct)) {
 995                 /*
 996                  * ESAME isn't really an error; it indicates that the
 997                  * operation should not be done because the source and target
 998                  * are the same file, but that no error should be reported.
 999                  */
1000                 if (error == ESAME) {
1001                         error = 0;
1002                 }
1003                 rw_exit(&tdp->i_rwlock);
1004                 goto errout;
1005         }
1006         rw_exit(&tdp->i_rwlock);
1007
1008         rw_enter(&sdp->i_rwlock, RW_WRITER);
1009         /*
1010          * Unlink the source.
1011          * Remove the source entry.  ud_dirremove() checks that the entry
1012          * still reflects sip, and returns an error if it doesn't.
1013          * If the entry has changed just forget about it.  Release
1014          * the source inode.
1015          */
1016         if ((error = ud_dirremove(sdp, snm, sip, NULL,
1017             DR_RENAME, cr, ct)) == ENOENT) {
1018                 error = 0;
1019         }
1020         rw_exit(&sdp->i_rwlock);
1021
1022         if (error == 0) {
1023                 vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
1024                 /*
1025                  * vnevent_rename_dest and vnevent_rename_dest_dir are called
1026                  * in ud_direnter().
1027                  */
1028         }
1029
1030 errout:
1031         ITIMES(sdp);
1032         ITIMES(tdp);
1033         VN_RELE(ITOV(sip));
1034         mutex_exit(&udf_vfsp->udf_rename_lck);
1035
1036         return (error);
1037 }
1038
1039 /* ARGSUSED */
1040 static int32_t
1041 udf_mkdir(
1042         struct vnode *dvp,
1043         char *dirname,
1044         struct vattr *vap,
1045         struct vnode **vpp,
1046         struct cred *cr,
1047         caller_context_t *ct,
1048         int flags,
1049         vsecattr_t *vsecp)
1050 {
1051         int32_t error;
1052         struct ud_inode *ip;
1053         struct ud_inode *xip;
1054
1055         ASSERT((vap->va_mask & (VATTR_TYPE|VATTR_MODE)) == (VATTR_TYPE|VATTR_MODE));
1056
1057         ud_printf("udf_mkdir\n");
1058
1059         ip = VTOI(dvp);
1060         rw_enter(&ip->i_rwlock, RW_WRITER);
1061         error = ud_direnter(ip, dirname, DE_MKDIR,
1062             NULL, NULL, vap, &xip, cr, ct);
1063         rw_exit(&ip->i_rwlock);
1064         ITIMES(ip);
1065         if (error == 0) {
1066                 ip = xip;
1067                 *vpp = ITOV(ip);
1068                 ITIMES(ip);
1069         } else if (error == EEXIST) {
1070                 ITIMES(xip);
1071                 VN_RELE(ITOV(xip));
1072         }
1073
1074         return (error);
1075 }
1076
1077 /* ARGSUSED */
1078 static int32_t
1079 udf_rmdir(
1080         struct vnode *vp,
1081         char *nm,
1082         struct vnode *cdir,
1083         struct cred *cr,
1084         caller_context_t *ct,
1085         int flags)
1086 {
1087         int32_t error;
1088         struct ud_inode *ip = VTOI(vp);
1089
1090         ud_printf("udf_rmdir\n");
1091
1092         rw_enter(&ip->i_rwlock, RW_WRITER);
1093         error = ud_dirremove(ip, nm, NULL, cdir, DR_RMDIR,
1094             cr, ct);
1095         rw_exit(&ip->i_rwlock);
1096         ITIMES(ip);
1097
1098         return (error);
1099 }
1100
1101 /* ARGSUSED */
1102 static int32_t
1103 udf_readdir(
1104         struct vnode *vp,
1105         struct uio *uiop,
1106         struct cred *cr,
1107         int32_t *eofp,
1108         caller_context_t *ct,
1109         int flags)
1110 {
1111         struct ud_inode *ip;
1112         struct dirent64 *nd;
1113         struct udf_vfs *udf_vfsp;
1114         int32_t error = 0, len, outcount = 0;
1115         uint32_t dirsiz, offset;
1116         uint32_t bufsize, ndlen, dummy;
1117         caddr_t outbuf;
1118         caddr_t outb, end_outb;
1119         struct iovec *iovp;
1120
1121         uint8_t *dname;
1122         int32_t length;
1123
1124         uint8_t *buf = NULL;
1125
1126         struct fbuf *fbp = NULL;
1127         struct file_id *fid;
1128         uint8_t *name;
1129
1130
1131         ud_printf("udf_readdir\n");
1132
1133         ip = VTOI(vp);
1134         udf_vfsp = ip->i_udf;
1135
1136         dirsiz = ip->i_size;
1137         if ((uiop->uio_offset >= dirsiz) ||
1138             (ip->i_nlink <= 0)) {
1139                 if (eofp) {
1140                         *eofp = 1;
1141                 }
1142                 return (0);
1143         }
1144
1145         offset = uiop->uio_offset;
1146         iovp = uiop->uio_iov;
1147         bufsize = iovp->iov_len;
1148
1149         outb = outbuf = kmem_alloc((uint32_t)bufsize, KM_SLEEP);
1150         end_outb = outb + bufsize;
1151         nd = (struct dirent64 *)outbuf;
1152
1153         dname = kmem_zalloc(1024, KM_SLEEP);
1154         buf = kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
1155
1156         if (offset == 0) {
1157                 len = DIRENT64_RECLEN(1);
1158                 if (((caddr_t)nd + len) >= end_outb) {
1159                         error = EINVAL;
1160                         goto end;
1161                 }
1162                 nd->d_ino = ip->i_icb_lbano;
1163                 nd->d_reclen = (uint16_t)len;
1164                 nd->d_off = 0x10;
1165                 nd->d_name[0] = '.';
1166                 bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
1167                 nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
1168                 outcount++;
1169         } else if (offset == 0x10) {
1170                 offset = 0;
1171         }
1172
1173         while (offset < dirsiz) {
1174                 error = ud_get_next_fid(ip, &fbp,
1175                     offset, &fid, &name, buf);
1176                 if (error != 0) {
1177                         break;
1178                 }
1179
1180                 if ((fid->fid_flags & FID_DELETED) == 0) {
1181                         if (fid->fid_flags & FID_PARENT) {
1182
1183                                 len = DIRENT64_RECLEN(2);
1184                                 if (((caddr_t)nd + len) >= end_outb) {
1185                                         error = EINVAL;
1186                                         break;
1187                                 }
1188
1189                                 nd->d_ino = ip->i_icb_lbano;
1190                                 nd->d_reclen = (uint16_t)len;
1191                                 nd->d_off = offset + FID_LEN(fid);
1192                                 nd->d_name[0] = '.';
1193                                 nd->d_name[1] = '.';
1194                                 bzero(&nd->d_name[2],
1195                                     DIRENT64_NAMELEN(len) - 2);
1196                                 nd = (struct dirent64 *)
1197                                     ((char *)nd + nd->d_reclen);
1198                         } else {
1199                                 if ((error = ud_uncompress(fid->fid_idlen,
1200                                     &length, name, dname)) != 0) {
1201                                         break;
1202                                 }
1203                                 if (length == 0) {
1204                                         offset += FID_LEN(fid);
1205                                         continue;
1206                                 }
1207                                 len = DIRENT64_RECLEN(length);
1208                                 if (((caddr_t)nd + len) >= end_outb) {
1209                                         if (!outcount) {
1210                                                 error = EINVAL;
1211                                         }
1212                                         break;
1213                                 }
1214                                 (void) strncpy(nd->d_name,
1215                                     (caddr_t)dname, length);
1216                                 bzero(&nd->d_name[length],
1217                                     DIRENT64_NAMELEN(len) - length);
1218                                 nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
1219                                     SWAP_16(fid->fid_icb.lad_ext_prn),
1220                                     SWAP_32(fid->fid_icb.lad_ext_loc), 1,
1221                                     &dummy);
1222                                 nd->d_reclen = (uint16_t)len;
1223                                 nd->d_off = offset + FID_LEN(fid);
1224                                 nd = (struct dirent64 *)
1225                                     ((char *)nd + nd->d_reclen);
1226                         }
1227                         outcount++;
1228                 }
1229
1230                 offset += FID_LEN(fid);
1231         }
1232
1233 end:
1234         if (fbp != NULL) {
1235                 fbrelse(fbp, S_OTHER);
1236         }
1237         ndlen = ((char *)nd - outbuf);
1238         /*
1239          * In case of error do not call uiomove.
1240          * Return the error to the caller.
1241          */
1242         if ((error == 0) && (ndlen != 0)) {
1243                 error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
1244                 uiop->uio_offset = offset;
1245         }
1246         kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
1247         kmem_free((caddr_t)dname, 1024);
1248         kmem_free(outbuf, (uint32_t)bufsize);
1249         if (eofp && error == 0) {
1250                 *eofp = (uiop->uio_offset >= dirsiz);
1251         }
1252         return (error);
1253 }
1254
1255 /* ARGSUSED */
1256 static int32_t
1257 udf_symlink(
1258         struct vnode *dvp,
1259         char *linkname,
1260         struct vattr *vap,
1261         char *target,
1262         struct cred *cr,
1263         caller_context_t *ct,
1264         int flags)
1265 {
1266         int32_t error = 0, outlen;
1267         uint32_t ioflag = 0;
1268         struct ud_inode *ip, *dip = VTOI(dvp);
1269
1270         struct path_comp *pc;
1271         int8_t *dname = NULL, *uname = NULL, *sp;
1272
1273         ud_printf("udf_symlink\n");
1274
1275         ip = NULL;
1276         vap->va_type = VLNK;
1277         vap->va_rdev = 0;
1278
1279         rw_enter(&dip->i_rwlock, RW_WRITER);
1280         error = ud_direnter(dip, linkname, DE_CREATE,
1281             NULL, NULL, vap, &ip, cr, ct);
1282         rw_exit(&dip->i_rwlock);
1283         if (error == 0) {
1284                 dname = kmem_zalloc(1024, KM_SLEEP);
1285                 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1286
1287                 pc = (struct path_comp *)uname;
1288                 /*
1289                  * If the first character in target is "/"
1290                  * then skip it and create entry for it
1291                  */
1292                 if (*target == '/') {
1293                         pc->pc_type = 2;
1294                         pc->pc_len = 0;
1295                         pc = (struct path_comp *)(((char *)pc) + 4);
1296                         while (*target == '/') {
1297                                 target++;
1298                         }
1299                 }
1300
1301                 while (*target != '\0') {
1302                         sp = target;
1303                         while ((*target != '/') && (*target != '\0')) {
1304                                 target ++;
1305                         }
1306                         /*
1307                          * We got the next component of the
1308                          * path name. Create path_comp of
1309                          * appropriate type
1310                          */
1311                         if (((target - sp) == 1) && (*sp == '.')) {
1312                                 /*
1313                                  * Dot entry.
1314                                  */
1315                                 pc->pc_type = 4;
1316                                 pc = (struct path_comp *)(((char *)pc) + 4);
1317                         } else if (((target - sp) == 2) &&
1318                             (*sp == '.') && ((*(sp + 1)) == '.')) {
1319                                 /*
1320                                  * DotDot entry.
1321                                  */
1322                                 pc->pc_type = 3;
1323                                 pc = (struct path_comp *)(((char *)pc) + 4);
1324                         } else {
1325                                 /*
1326                                  * convert the user given name
1327                                  * into appropriate form to be put
1328                                  * on the media
1329                                  */
1330                                 outlen = 1024;  /* set to size of dname */
1331                                 if (error = ud_compress(target - sp, &outlen,
1332                                     (uint8_t *)sp, (uint8_t *)dname)) {
1333                                         break;
1334                                 }
1335                                 pc->pc_type = 5;
1336                                 /* LINTED */
1337                                 pc->pc_len = outlen;
1338                                 dname[outlen] = '\0';
1339                                 (void) strcpy((char *)pc->pc_id, dname);
1340                                 pc = (struct path_comp *)
1341                                     (((char *)pc) + 4 + outlen);
1342                         }
1343                         while (*target == '/') {
1344                                 target++;
1345                         }
1346                         if (*target == '\0') {
1347                                 break;
1348                         }
1349                 }
1350
1351                 rw_enter(&ip->i_contents, RW_WRITER);
1352                 if (error == 0) {
1353                         ioflag = FWRITE;
1354                         if (curthread->t_flag & T_DONTPEND) {
1355                                 ioflag |= FDSYNC;
1356                         }
1357                         error = ud_rdwri(UIO_WRITE, ioflag, ip,
1358                             uname, ((int8_t *)pc) - uname,
1359                             0, UIO_SYSSPACE, (int32_t *)0, cr);
1360                 }
1361                 if (error) {
1362                         ud_idrop(ip);
1363                         rw_exit(&ip->i_contents);
1364                         rw_enter(&dip->i_rwlock, RW_WRITER);
1365                         (void) ud_dirremove(dip, linkname, NULL,
1366                             NULL, DR_REMOVE, cr, ct);
1367                         rw_exit(&dip->i_rwlock);
1368                         goto update_inode;
1369                 }
1370                 rw_exit(&ip->i_contents);
1371         }
1372
1373         if ((error == 0) || (error == EEXIST)) {
1374                 VN_RELE(ITOV(ip));
1375         }
1376
1377 update_inode:
1378         ITIMES(VTOI(dvp));
1379         if (uname != NULL) {
1380                 kmem_free(uname, PAGESIZE);
1381         }
1382         if (dname != NULL) {
1383                 kmem_free(dname, 1024);
1384         }
1385
1386         return (error);
1387 }
1388
1389 /* ARGSUSED */
1390 static int32_t
1391 udf_readlink(
1392         struct vnode *vp,
1393         struct uio *uiop,
1394         struct cred *cr,
1395         caller_context_t *ct)
1396 {
1397         int32_t error = 0, off, id_len, size, len;
1398         int8_t *dname = NULL, *uname = NULL;
1399         struct ud_inode *ip;
1400         struct fbuf *fbp = NULL;
1401         struct path_comp *pc;
1402
1403         ud_printf("udf_readlink\n");
1404
1405         if (vp->v_type != VLNK) {
1406                 return (EINVAL);
1407         }
1408
1409         ip = VTOI(vp);
1410         size = ip->i_size;
1411         if (size > PAGESIZE) {
1412                 return (EIO);
1413         }
1414
1415         if (size == 0) {
1416                 return (0);
1417         }
1418
1419         dname = kmem_zalloc(1024, KM_SLEEP);
1420         uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1421
1422         rw_enter(&ip->i_contents, RW_READER);
1423
1424         if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1425                 goto end;
1426         }
1427
1428         off = 0;
1429
1430         while (off < size) {
1431                 pc = (struct path_comp *)(fbp->fb_addr + off);
1432                 switch (pc->pc_type) {
1433                         case 1 :
1434                                 (void) strcpy(uname, ip->i_udf->udf_fsmnt);
1435                                 (void) strcat(uname, "/");
1436                                 break;
1437                         case 2 :
1438                                 if (pc->pc_len != 0) {
1439                                         goto end;
1440                                 }
1441                                 uname[0] = '/';
1442                                 uname[1] = '\0';
1443                                 break;
1444                         case 3 :
1445                                 (void) strcat(uname, "../");
1446                                 break;
1447                         case 4 :
1448                                 (void) strcat(uname, "./");
1449                                 break;
1450                         case 5 :
1451                                 if ((error = ud_uncompress(pc->pc_len, &id_len,
1452                                     pc->pc_id, (uint8_t *)dname)) != 0) {
1453                                         break;
1454                                 }
1455                                 dname[id_len] = '\0';
1456                                 (void) strcat(uname, dname);
1457                                 (void) strcat(uname, "/");
1458                                 break;
1459                         default :
1460                                 error = EINVAL;
1461                                 goto end;
1462                 }
1463                 off += 4 + pc->pc_len;
1464         }
1465         len = strlen(uname) - 1;
1466         if (uname[len] == '/') {
1467                 if (len == 0) {
1468                         /*
1469                          * special case link to /
1470                          */
1471                         len = 1;
1472                 } else {
1473                         uname[len] = '\0';
1474                 }
1475         }
1476
1477         error = uiomove(uname, len, UIO_READ, uiop);
1478
1479         ITIMES(ip);
1480
1481 end:
1482         if (fbp != NULL) {
1483                 fbrelse(fbp, S_OTHER);
1484         }
1485         rw_exit(&ip->i_contents);
1486         if (uname != NULL) {
1487                 kmem_free(uname, PAGESIZE);
1488         }
1489         if (dname != NULL) {
1490                 kmem_free(dname, 1024);
1491         }
1492         return (error);
1493 }
1494
1495 /* ARGSUSED */
1496 static int32_t
1497 udf_fsync(
1498         struct vnode *vp,
1499         int32_t syncflag,
1500         struct cred *cr,
1501         caller_context_t *ct)
1502 {
1503         int32_t error = 0;
1504         struct ud_inode *ip = VTOI(vp);
1505
1506         ud_printf("udf_fsync\n");
1507
1508         rw_enter(&ip->i_contents, RW_WRITER);
1509         if (!(IS_SWAPVP(vp))) {
1510                 error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1511         }
1512         if (error == 0) {
1513                 error = ud_sync_indir(ip);
1514         }
1515         ITIMES(ip);             /* XXX: is this necessary ??? */
1516         rw_exit(&ip->i_contents);
1517
1518         return (error);
1519 }
1520
1521 /* ARGSUSED */
1522 static void
1523 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1524 {
1525         ud_printf("udf_iinactive\n");
1526
1527         ud_iinactive(VTOI(vp), cr);
1528 }
1529
1530 /* ARGSUSED */
1531 static int32_t
1532 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1533 {
1534         struct udf_fid *udfidp;
1535         struct ud_inode *ip = VTOI(vp);
1536
1537         ud_printf("udf_fid\n");
1538
1539         if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1540                 fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1541                 return (ENOSPC);
1542         }
1543
1544         udfidp = (struct udf_fid *)fidp;
1545         bzero((char *)udfidp, sizeof (struct udf_fid));
1546         rw_enter(&ip->i_contents, RW_READER);
1547         udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1548         udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1549         udfidp->udfid_prn = ip->i_icb_prn;
1550         udfidp->udfid_icb_lbn = ip->i_icb_block;
1551         rw_exit(&ip->i_contents);
1552
1553         return (0);
1554 }
1555
1556 /* ARGSUSED2 */
1557 static int
1558 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1559 {
1560         struct ud_inode *ip = VTOI(vp);
1561
1562         ud_printf("udf_rwlock\n");
1563
1564         if (write_lock) {
1565                 rw_enter(&ip->i_rwlock, RW_WRITER);
1566         } else {
1567                 rw_enter(&ip->i_rwlock, RW_READER);
1568         }
1569 #ifdef  __lock_lint
1570         rw_exit(&ip->i_rwlock);
1571 #endif
1572         return (write_lock);
1573 }
1574
1575 /* ARGSUSED */
1576 static void
1577 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1578 {
1579         struct ud_inode *ip = VTOI(vp);
1580
1581         ud_printf("udf_rwunlock\n");
1582
1583 #ifdef  __lock_lint
1584         rw_enter(&ip->i_rwlock, RW_WRITER);
1585 #endif
1586
1587         rw_exit(&ip->i_rwlock);
1588
1589 }
1590
1591 /* ARGSUSED */
1592 static int32_t
1593 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1594 {
1595         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1596 }
1597
1598 static int32_t
1599 udf_frlock(
1600         struct vnode *vp,
1601         int32_t cmd,
1602         struct flock64 *bfp,
1603         int32_t flag,
1604         offset_t offset,
1605         struct flk_callback *flk_cbp,
1606         cred_t *cr,
1607         caller_context_t *ct)
1608 {
1609         struct ud_inode *ip = VTOI(vp);
1610
1611         ud_printf("udf_frlock\n");
1612
1613         /*
1614          * If file is being mapped, disallow frlock.
1615          * XXX I am not holding tlock while checking i_mapcnt because the
1616          * current locking strategy drops all locks before calling fs_frlock.
1617          * So, mapcnt could change before we enter fs_frlock making is
1618          * meaningless to have held tlock in the first place.
1619          */
1620         if ((ip->i_mapcnt > 0) &&
1621             (MANDLOCK(vp, ip->i_char))) {
1622                 return (EAGAIN);
1623         }
1624
1625         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1626 }
1627
1628 /*ARGSUSED6*/
1629 static int32_t
1630 udf_space(
1631         struct vnode *vp,
1632         int32_t cmd,
1633         struct flock64 *bfp,
1634         int32_t flag,
1635         offset_t offset,
1636         cred_t *cr,
1637         caller_context_t *ct)
1638 {
1639         int32_t error = 0;
1640
1641         ud_printf("udf_space\n");
1642
1643         if (cmd != F_FREESP) {
1644                 error =  EINVAL;
1645         } else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1646                 error = ud_freesp(vp, bfp, flag, cr);
1647
1648                 if (error == 0 && bfp->l_start == 0)
1649                         vnevent_truncate(vp, ct);
1650         }
1651
1652         return (error);
1653 }
1654
1655 /* ARGSUSED */
1656 static int32_t
1657 udf_getpage(
1658         struct vnode *vp,
1659         offset_t off,
1660         size_t len,
1661         uint32_t *protp,
1662         struct page **plarr,
1663         size_t plsz,
1664         struct seg *seg,
1665         caddr_t addr,
1666         enum seg_rw rw,
1667         struct cred *cr,
1668         caller_context_t *ct)
1669 {
1670         struct ud_inode *ip = VTOI(vp);
1671         int32_t error, has_holes, beyond_eof, seqmode, dolock;
1672         int32_t pgsize = PAGESIZE;
1673         struct udf_vfs *udf_vfsp = ip->i_udf;
1674         page_t **pl;
1675         uoff_t pgoff, eoff, uoff;
1676         krw_t rwtype;
1677         caddr_t pgaddr;
1678
1679         ud_printf("udf_getpage\n");
1680
1681         uoff = (uoff_t)off; /* type conversion */
1682         if (protp) {
1683                 *protp = PROT_ALL;
1684         }
1685         if (vp->v_flag & VNOMAP) {
1686                 return (ENOSYS);
1687         }
1688         seqmode = ip->i_nextr == uoff && rw != S_CREATE;
1689
1690         rwtype = RW_READER;
1691         dolock = (rw_owner(&ip->i_contents) != curthread);
1692 retrylock:
1693 #ifdef  __lock_lint
1694         rw_enter(&ip->i_contents, rwtype);
1695 #else
1696         if (dolock) {
1697                 rw_enter(&ip->i_contents, rwtype);
1698         }
1699 #endif
1700
1701         /*
1702          * We may be getting called as a side effect of a bmap using
1703          * fbread() when the blocks might be being allocated and the
1704          * size has not yet been up'ed.  In this case we want to be
1705          * able to return zero pages if we get back UDF_HOLE from
1706          * calling bmap for a non write case here.  We also might have
1707          * to read some frags from the disk into a page if we are
1708          * extending the number of frags for a given lbn in bmap().
1709          */
1710         beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
1711         if (beyond_eof && seg != segkmap) {
1712 #ifdef  __lock_lint
1713                 rw_exit(&ip->i_contents);
1714 #else
1715                 if (dolock) {
1716                         rw_exit(&ip->i_contents);
1717                 }
1718 #endif
1719                 return (EFAULT);
1720         }
1721
1722         /*
1723          * Must hold i_contents lock throughout the call to pvn_getpages
1724          * since locked pages are returned from each call to ud_getapage.
1725          * Must *not* return locked pages and then try for contents lock
1726          * due to lock ordering requirements (inode > page)
1727          */
1728
1729         has_holes = ud_bmap_has_holes(ip);
1730
1731         if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
1732                 int32_t blk_size, count;
1733                 uoff_t offset;
1734
1735                 /*
1736                  * We must acquire the RW_WRITER lock in order to
1737                  * call bmap_write().
1738                  */
1739                 if (dolock && rwtype == RW_READER) {
1740                         rwtype = RW_WRITER;
1741
1742                         if (!rw_tryupgrade(&ip->i_contents)) {
1743
1744                                 rw_exit(&ip->i_contents);
1745
1746                                 goto retrylock;
1747                         }
1748                 }
1749
1750                 /*
1751                  * May be allocating disk blocks for holes here as
1752                  * a result of mmap faults. write(2) does the bmap_write
1753                  * in rdip/wrip, not here. We are not dealing with frags
1754                  * in this case.
1755                  */
1756                 offset = uoff;
1757                 while ((offset < uoff + len) &&
1758                     (offset < ip->i_size)) {
1759                         /*
1760                          * the variable "bnp" is to simplify the expression for
1761                          * the compiler; * just passing in &bn to bmap_write
1762                          * causes a compiler "loop"
1763                          */
1764
1765                         blk_size = udf_vfsp->udf_lbsize;
1766                         if ((offset + blk_size) > ip->i_size) {
1767                                 count = ip->i_size - offset;
1768                         } else {
1769                                 count = blk_size;
1770                         }
1771                         error = ud_bmap_write(ip, offset, count, 0, cr);
1772                         if (error) {
1773                                 goto update_inode;
1774                         }
1775                         offset += count; /* XXX - make this contig */
1776                 }
1777         }
1778
1779         /*
1780          * Can be a reader from now on.
1781          */
1782 #ifdef  __lock_lint
1783         if (rwtype == RW_WRITER) {
1784                 rw_downgrade(&ip->i_contents);
1785         }
1786 #else
1787         if (dolock && rwtype == RW_WRITER) {
1788                 rw_downgrade(&ip->i_contents);
1789         }
1790 #endif
1791
1792         /*
1793          * We remove PROT_WRITE in cases when the file has UDF holes
1794          * because we don't  want to call bmap_read() to check each
1795          * page if it is backed with a disk block.
1796          */
1797         if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
1798                 *protp &= ~PROT_WRITE;
1799         }
1800
1801         error = 0;
1802
1803         /*
1804          * The loop looks up pages in the range <off, off + len).
1805          * For each page, we first check if we should initiate an asynchronous
1806          * read ahead before we call page_lookup (we may sleep in page_lookup
1807          * for a previously initiated disk read).
1808          */
1809         eoff = (uoff + len);
1810         for (pgoff = uoff, pgaddr = addr, pl = plarr;
1811             pgoff < eoff; /* empty */) {
1812                 page_t  *pp;
1813                 uoff_t  nextrio;
1814                 se_t    se;
1815
1816                 se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
1817
1818                 /*
1819                  * Handle async getpage (faultahead)
1820                  */
1821                 if (plarr == NULL) {
1822                         ip->i_nextrio = pgoff;
1823                         ud_getpage_ra(vp, pgoff, seg, pgaddr);
1824                         pgoff += pgsize;
1825                         pgaddr += pgsize;
1826                         continue;
1827                 }
1828
1829                 /*
1830                  * Check if we should initiate read ahead of next cluster.
1831                  * We call page_exists only when we need to confirm that
1832                  * we have the current page before we initiate the read ahead.
1833                  */
1834                 nextrio = ip->i_nextrio;
1835                 if (seqmode &&
1836                     pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
1837                     nextrio < ip->i_size && page_exists(&vp->v_object, pgoff))
1838                         ud_getpage_ra(vp, pgoff, seg, pgaddr);
1839
1840                 if ((pp = page_lookup(&vp->v_object, pgoff, se)) != NULL) {
1841
1842                         /*
1843                          * We found the page in the page cache.
1844                          */
1845                         *pl++ = pp;
1846                         pgoff += pgsize;
1847                         pgaddr += pgsize;
1848                         len -= pgsize;
1849                         plsz -= pgsize;
1850                 } else  {
1851
1852                         /*
1853                          * We have to create the page, or read it from disk.
1854                          */
1855                         if (error = ud_getpage_miss(vp, pgoff, len,
1856                             seg, pgaddr, pl, plsz, rw, seqmode)) {
1857                                 goto error_out;
1858                         }
1859
1860                         while (*pl != NULL) {
1861                                 pl++;
1862                                 pgoff += pgsize;
1863                                 pgaddr += pgsize;
1864                                 len -= pgsize;
1865                                 plsz -= pgsize;
1866                         }
1867                 }
1868         }
1869
1870         /*
1871          * Return pages up to plsz if they are in the page cache.
1872          * We cannot return pages if there is a chance that they are
1873          * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1874          */
1875         if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
1876
1877                 ASSERT((protp == NULL) ||
1878                     !(has_holes && (*protp & PROT_WRITE)));
1879
1880                 eoff = pgoff + plsz;
1881                 while (pgoff < eoff) {
1882                         page_t          *pp;
1883
1884                         if ((pp = page_lookup_nowait(&vp->v_object, pgoff, SE_SHARED)) == NULL)
1885                                 break;
1886
1887                         *pl++ = pp;
1888                         pgoff += pgsize;
1889                         plsz -= pgsize;
1890                 }
1891         }
1892
1893         if (plarr)
1894                 *pl = NULL;                     /* Terminate page list */
1895         ip->i_nextr = pgoff;
1896
1897 error_out:
1898         if (error && plarr) {
1899                 /*
1900                  * Release any pages we have locked.
1901                  */
1902                 while (pl > &plarr[0])
1903                         page_unlock(*--pl);
1904
1905                 plarr[0] = NULL;
1906         }
1907
1908 update_inode:
1909 #ifdef  __lock_lint
1910         rw_exit(&ip->i_contents);
1911 #else
1912         if (dolock) {
1913                 rw_exit(&ip->i_contents);
1914         }
1915 #endif
1916
1917         /*
1918          * If the inode is not already marked for IACC (in rwip() for read)
1919          * and the inode is not marked for no access time update (in rwip()
1920          * for write) then update the inode access time and mod time now.
1921          */
1922         mutex_enter(&ip->i_tlock);
1923         if ((ip->i_flag & (IACC | INOACC)) == 0) {
1924                 if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
1925                         ip->i_flag |= IACC;
1926                 }
1927                 if (rw == S_WRITE) {
1928                         ip->i_flag |= IUPD;
1929                 }
1930                 ITIMES_NOLOCK(ip);
1931         }
1932         mutex_exit(&ip->i_tlock);
1933
1934         return (error);
1935 }
1936
/*
 * Tunable consulted by udf_putpage(): when non-zero, B_ASYNC requests
 * with a non-zero length and no flags other than B_ASYNC, B_DONTNEED or
 * B_FREE are accumulated into a delayed-write cluster instead of being
 * pushed right away.
 */
int32_t ud_delay = 1;
1938
1939 /* ARGSUSED */
1940 static int32_t
1941 udf_putpage(
1942         struct vnode *vp,
1943         offset_t off,
1944         size_t len,
1945         int32_t flags,
1946         struct cred *cr,
1947         caller_context_t *ct)
1948 {
1949         struct ud_inode *ip;
1950         int32_t error = 0;
1951
1952         ud_printf("udf_putpage\n");
1953
1954         ip = VTOI(vp);
1955 #ifdef  __lock_lint
1956         rw_enter(&ip->i_contents, RW_WRITER);
1957 #endif
1958
1959         if (vp->v_count == 0) {
1960                 cmn_err(CE_WARN, "ud_putpage : bad v_count");
1961                 error = EINVAL;
1962                 goto out;
1963         }
1964
1965         if (vp->v_flag & VNOMAP) {
1966                 error = ENOSYS;
1967                 goto out;
1968         }
1969
1970         if (flags & B_ASYNC) {
1971                 if (ud_delay && len &&
1972                     (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1973                         mutex_enter(&ip->i_tlock);
1974
1975                         /*
1976                          * If nobody stalled, start a new cluster.
1977                          */
1978                         if (ip->i_delaylen == 0) {
1979                                 ip->i_delayoff = off;
1980                                 ip->i_delaylen = len;
1981                                 mutex_exit(&ip->i_tlock);
1982                                 goto out;
1983                         }
1984
1985                         /*
1986                          * If we have a full cluster or they are not contig,
1987                          * then push last cluster and start over.
1988                          */
1989                         if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1990                             ip->i_delayoff + ip->i_delaylen != off) {
1991                                 uoff_t doff;
1992                                 size_t dlen;
1993
1994                                 doff = ip->i_delayoff;
1995                                 dlen = ip->i_delaylen;
1996                                 ip->i_delayoff = off;
1997                                 ip->i_delaylen = len;
1998                                 mutex_exit(&ip->i_tlock);
1999                                 error = ud_putpages(vp, doff, dlen, flags, cr);
2000                                 /* LMXXX - flags are new val, not old */
2001                                 goto out;
2002                         }
2003
2004                         /*
2005                          * There is something there, it's not full, and
2006                          * it is contig.
2007                          */
2008                         ip->i_delaylen += len;
2009                         mutex_exit(&ip->i_tlock);
2010                         goto out;
2011                 }
2012
2013                 /*
2014                  * Must have weird flags or we are not clustering.
2015                  */
2016         }
2017
2018         error = ud_putpages(vp, off, len, flags, cr);
2019
2020 out:
2021 #ifdef  __lock_lint
2022         rw_exit(&ip->i_contents);
2023 #endif
2024         return (error);
2025 }
2026
2027 /* ARGSUSED */
2028 static int32_t
2029 udf_map(
2030         struct vnode *vp,
2031         offset_t off,
2032         struct as *as,
2033         caddr_t *addrp,
2034         size_t len,
2035         uint8_t prot,
2036         uint8_t maxprot,
2037         uint32_t flags,
2038         struct cred *cr,
2039         caller_context_t *ct)
2040 {
2041         struct segvn_crargs vn_a;
2042         int32_t error = 0;
2043
2044         ud_printf("udf_map\n");
2045
2046         if (vp->v_flag & VNOMAP) {
2047                 error = ENOSYS;
2048                 goto end;
2049         }
2050
2051         if ((off < 0) ||
2052             ((off + len) < 0)) {
2053                 error = EINVAL;
2054                 goto end;
2055         }
2056
2057         if (vp->v_type != VREG) {
2058                 error = ENODEV;
2059                 goto end;
2060         }
2061
2062         /*
2063          * If file is being locked, disallow mapping.
2064          */
2065         if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2066                 error = EAGAIN;
2067                 goto end;
2068         }
2069
2070         as_rangelock(as);
2071         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2072         if (error != 0) {
2073                 as_rangeunlock(as);
2074                 goto end;
2075         }
2076
2077         vn_a.vp = vp;
2078         vn_a.offset = off;
2079         vn_a.type = flags & MAP_TYPE;
2080         vn_a.prot = prot;
2081         vn_a.maxprot = maxprot;
2082         vn_a.cred = cr;
2083         vn_a.amp = NULL;
2084         vn_a.flags = flags & ~MAP_TYPE;
2085         vn_a.szc = 0;
2086         vn_a.lgrp_mem_policy_flags = 0;
2087
2088         error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2089         as_rangeunlock(as);
2090
2091 end:
2092         return (error);
2093 }
2094
2095 /* ARGSUSED */
2096 static int32_t
2097 udf_addmap(struct vnode *vp,
2098     offset_t off,
2099     struct as *as,
2100     caddr_t addr,
2101     size_t len,
2102     uint8_t prot,
2103     uint8_t maxprot,
2104     uint32_t flags,
2105     struct cred *cr,
2106     caller_context_t *ct)
2107 {
2108         struct ud_inode *ip = VTOI(vp);
2109
2110         ud_printf("udf_addmap\n");
2111
2112         if (vp->v_flag & VNOMAP) {
2113                 return (ENOSYS);
2114         }
2115
2116         mutex_enter(&ip->i_tlock);
2117         ip->i_mapcnt += btopr(len);
2118         mutex_exit(&ip->i_tlock);
2119
2120         return (0);
2121 }
2122
2123 /* ARGSUSED */
2124 static int32_t
2125 udf_delmap(
2126         struct vnode *vp, offset_t off,
2127         struct as *as,
2128         caddr_t addr,
2129         size_t len,
2130         uint32_t prot,
2131         uint32_t maxprot,
2132         uint32_t flags,
2133         struct cred *cr,
2134         caller_context_t *ct)
2135 {
2136         struct ud_inode *ip = VTOI(vp);
2137
2138         ud_printf("udf_delmap\n");
2139
2140         if (vp->v_flag & VNOMAP) {
2141                 return (ENOSYS);
2142         }
2143
2144         mutex_enter(&ip->i_tlock);
2145         ip->i_mapcnt -= btopr(len);     /* Count released mappings */
2146         ASSERT(ip->i_mapcnt >= 0);
2147         mutex_exit(&ip->i_tlock);
2148
2149         return (0);
2150 }
2151
2152 /* ARGSUSED */
2153 static int32_t
2154 udf_l_pathconf(
2155         struct vnode *vp,
2156         int32_t cmd,
2157         ulong_t *valp,
2158         struct cred *cr,
2159         caller_context_t *ct)
2160 {
2161         int32_t error = 0;
2162
2163         ud_printf("udf_l_pathconf\n");
2164
2165         if (cmd == _PC_FILESIZEBITS) {
2166                 /*
2167                  * udf supports 64 bits as file size
2168                  * but there are several other restrictions
2169                  * it only supports 32-bit block numbers and
2170                  * daddr32_t is only and int32_t so taking these
2171                  * into account we can stay just as where ufs is
2172                  */
2173                 *valp = 41;
2174         } else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2175                 /* nanosecond timestamp resolution */
2176                 *valp = 1L;
2177         } else {
2178                 error = fs_pathconf(vp, cmd, valp, cr, ct);
2179         }
2180
2181         return (error);
2182 }
2183
/*
 * Running counts of the I/O chunks issued by udf_pageio(), one counter
 * per direction (B_READ set in the request flags, or not).  They are
 * incremented without taking a dedicated lock.
 */
uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2187 /*
2188  * Assumption is that there will not be a pageio request
2189  * to a enbedded file
2190  */
2191 /* ARGSUSED */
2192 static int32_t
2193 udf_pageio(
2194         struct vnode *vp,
2195         struct page *pp,
2196         uoff_t io_off,
2197         size_t io_len,
2198         int32_t flags,
2199         struct cred *cr,
2200         caller_context_t *ct)
2201 {
2202         daddr_t bn;
2203         struct buf *bp;
2204         struct ud_inode *ip = VTOI(vp);
2205         int32_t dolock, error = 0, contig, multi_io;
2206         size_t done_len = 0, cur_len = 0;
2207         page_t *npp = NULL, *opp = NULL, *cpp = pp;
2208
2209         if (pp == NULL) {
2210                 return (EINVAL);
2211         }
2212
2213         dolock = (rw_owner(&ip->i_contents) != curthread);
2214
2215         /*
2216          * We need a better check.  Ideally, we would use another
2217          * vnodeops so that hlocked and forcibly unmounted file
2218          * systems would return EIO where appropriate and w/o the
2219          * need for these checks.
2220          */
2221         if (ip->i_udf == NULL) {
2222                 return (EIO);
2223         }
2224
2225 #ifdef  __lock_lint
2226         rw_enter(&ip->i_contents, RW_READER);
2227 #else
2228         if (dolock) {
2229                 rw_enter(&ip->i_contents, RW_READER);
2230         }
2231 #endif
2232
2233         /*
2234          * Break the io request into chunks, one for each contiguous
2235          * stretch of disk blocks in the target file.
2236          */
2237         while (done_len < io_len) {
2238                 ASSERT(cpp);
2239                 bp = NULL;
2240                 contig = 0;
2241                 if (error = ud_bmap_read(ip, (uoff_t)(io_off + done_len),
2242                     &bn, &contig)) {
2243                         break;
2244                 }
2245
2246                 if (bn == UDF_HOLE) {   /* No holey swapfiles */
2247                         cmn_err(CE_WARN, "SWAP file has HOLES");
2248                         error = EINVAL;
2249                         break;
2250                 }
2251
2252                 cur_len = MIN(io_len - done_len, contig);
2253
2254                 /*
2255                  * Check if more than one I/O is
2256                  * required to complete the given
2257                  * I/O operation
2258                  */
2259                 if (ip->i_udf->udf_lbsize < PAGESIZE) {
2260                         if (cur_len >= PAGESIZE) {
2261                                 multi_io = 0;
2262                                 cur_len &= PAGEMASK;
2263                         } else {
2264                                 multi_io = 1;
2265                                 cur_len = MIN(io_len - done_len, PAGESIZE);
2266                         }
2267                 }
2268                 page_list_break(&cpp, &npp, btop(cur_len));
2269
2270                 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2271                 ASSERT(bp != NULL);
2272
2273                 bp->b_edev = ip->i_dev;
2274                 bp->b_dev = cmpdev(ip->i_dev);
2275                 bp->b_blkno = bn;
2276                 bp->b_un.b_addr = (caddr_t)0;
2277                 bp->b_file = vp;
2278                 bp->b_offset = (offset_t)(io_off + done_len);
2279
2280 /*
2281  *              ub.ub_pageios.value.ul++;
2282  */
2283                 if (multi_io == 0) {
2284                         (void) bdev_strategy(bp);
2285                 } else {
2286                         error = ud_multi_strat(ip, cpp, bp,
2287                             (uoff_t)(io_off + done_len));
2288                         if (error != 0) {
2289                                 pageio_done(bp);
2290                                 break;
2291                         }
2292                 }
2293                 if (flags & B_READ) {
2294                         ud_pageio_reads++;
2295                 } else {
2296                         ud_pageio_writes++;
2297                 }
2298
2299                 /*
2300                  * If the request is not B_ASYNC, wait for i/o to complete
2301                  * and re-assemble the page list to return to the caller.
2302                  * If it is B_ASYNC we leave the page list in pieces and
2303                  * cleanup() will dispose of them.
2304                  */
2305                 if ((flags & B_ASYNC) == 0) {
2306                         error = biowait(bp);
2307                         pageio_done(bp);
2308                         if (error) {
2309                                 break;
2310                         }
2311                         page_list_concat(&opp, &cpp);
2312                 }
2313                 cpp = npp;
2314                 npp = NULL;
2315                 done_len += cur_len;
2316         }
2317
2318         ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2319         if (error) {
2320                 if (flags & B_ASYNC) {
2321                         /* Cleanup unprocessed parts of list */
2322                         page_list_concat(&cpp, &npp);
2323                         if (flags & B_READ) {
2324                                 pvn_read_done(cpp, B_ERROR);
2325                         } else {
2326                                 pvn_write_done(cpp, B_ERROR);
2327                         }
2328                 } else {
2329                         /* Re-assemble list and let caller clean up */
2330                         page_list_concat(&opp, &cpp);
2331                         page_list_concat(&opp, &npp);
2332                 }
2333         }
2334
2335 #ifdef  __lock_lint
2336         rw_exit(&ip->i_contents);
2337 #else
2338         if (dolock) {
2339                 rw_exit(&ip->i_contents);
2340         }
2341 #endif
2342         return (error);
2343 }
2344
2345
2346
2347
2348 /* -------------------- local functions --------------------------- */
2349
2350
2351
2352 int32_t
2353 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2354     struct ud_inode *ip, caddr_t base, int32_t len,
2355     offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2356 {
2357         int32_t error;
2358         struct uio auio;
2359         struct iovec aiov;
2360
2361         ud_printf("ud_rdwri\n");
2362
2363         bzero((caddr_t)&auio, sizeof (uio_t));
2364         bzero((caddr_t)&aiov, sizeof (iovec_t));
2365
2366         aiov.iov_base = base;
2367         aiov.iov_len = len;
2368         auio.uio_iov = &aiov;
2369         auio.uio_iovcnt = 1;
2370         auio.uio_loffset = offset;
2371         auio.uio_segflg = (int16_t)seg;
2372         auio.uio_resid = len;
2373
2374         if (rw == UIO_WRITE) {
2375                 auio.uio_fmode = FWRITE;
2376                 auio.uio_extflg = UIO_COPY_DEFAULT;
2377                 auio.uio_llimit = curproc->p_fsz_ctl;
2378                 error = ud_wrip(ip, &auio, ioflag, cr);
2379         } else {
2380                 auio.uio_fmode = FREAD;
2381                 auio.uio_extflg = UIO_COPY_CACHED;
2382                 auio.uio_llimit = MAXOFFSET_T;
2383                 error = ud_rdip(ip, &auio, ioflag, cr);
2384         }
2385
2386         if (aresid) {
2387                 *aresid = auio.uio_resid;
2388         } else if (auio.uio_resid) {
2389                 error = EIO;
2390         }
2391         return (error);
2392 }
2393
/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 *
 * NOTE(review): neither tunable is referenced by the functions nearby.
 * Presumably ud_freebehind switches free-behind on and ud_smallfile is a
 * file-size threshold in bytes (32K) -- confirm against the read path.
 */
int32_t ud_freebehind = 1;
int32_t ud_smallfile = 32 * 1024;
2401
2402 /* ARGSUSED */
2403 int32_t
2404 ud_getpage_miss(struct vnode *vp, uoff_t off,
2405     size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2406     size_t plsz, enum seg_rw rw, int32_t seq)
2407 {
2408         struct ud_inode *ip = VTOI(vp);
2409         int32_t err = 0;
2410         size_t io_len;
2411         uoff_t io_off;
2412         uoff_t pgoff;
2413         page_t *pp;
2414
2415         pl[0] = NULL;
2416
2417         /*
2418          * Figure out whether the page can be created, or must be
2419          * read from the disk
2420          */
2421         if (rw == S_CREATE) {
2422                 if ((pp = page_create_va(&vp->v_object, off,
2423                     PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2424                         cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2425                         return (EINVAL);
2426                 }
2427                 io_len = PAGESIZE;
2428         } else {
2429                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2430                     &io_len, off, PAGESIZE, 0);
2431
2432                 /*
2433                  * Some other thread has entered the page.
2434                  * ud_getpage will retry page_lookup.
2435                  */
2436                 if (pp == NULL) {
2437                         return (0);
2438                 }
2439
2440                 /*
2441                  * Fill the page with as much data as we can from the file.
2442                  */
2443                 err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2444                 if (err) {
2445                         pvn_read_done(pp, B_ERROR);
2446                         return (err);
2447                 }
2448
2449                 /*
2450                  * XXX ??? ufs has io_len instead of pgoff below
2451                  */
2452                 ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2453
2454                 /*
2455                  * If the file access is sequential, initiate read ahead
2456                  * of the next cluster.
2457                  */
2458                 if (seq && ip->i_nextrio < ip->i_size) {
2459                         ud_getpage_ra(vp, off, seg, addr);
2460                 }
2461         }
2462
2463 outmiss:
2464         pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2465         return (err);
2466 }
2467
2468 /* ARGSUSED */
2469 void
2470 ud_getpage_ra(struct vnode *vp,
2471     uoff_t off, struct seg *seg, caddr_t addr)
2472 {
2473         page_t *pp;
2474         size_t io_len;
2475         struct ud_inode *ip = VTOI(vp);
2476         uoff_t io_off = ip->i_nextrio, pgoff;
2477         caddr_t addr2 = addr + (io_off - off);
2478         daddr_t bn;
2479         int32_t contig = 0;
2480
2481         /*
2482          * Is this test needed?
2483          */
2484
2485         if (addr2 >= seg->s_base + seg->s_size) {
2486                 return;
2487         }
2488
2489         contig = 0;
2490         if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2491                 return;
2492         }
2493
2494         pp = pvn_read_kluster(vp, io_off, seg, addr2,
2495             &io_off, &io_len, io_off, PAGESIZE, 1);
2496
2497         /*
2498          * Some other thread has entered the page.
2499          * So no read head done here (ie we will have to and wait
2500          * for the read when needed).
2501          */
2502
2503         if (pp == NULL) {
2504                 return;
2505         }
2506
2507         (void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2508         ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2509 }
2510
2511 int
2512 ud_page_fill(struct ud_inode *ip, page_t *pp, uoff_t off,
2513     uint32_t bflgs, uoff_t *pg_off)
2514 {
2515         daddr_t bn;
2516         struct buf *bp;
2517         caddr_t kaddr, caddr;
2518         int32_t error = 0, contig = 0, multi_io = 0;
2519         int32_t lbsize = ip->i_udf->udf_lbsize;
2520         int32_t lbmask = ip->i_udf->udf_lbmask;
2521         uint64_t isize;
2522
2523         isize = (ip->i_size + lbmask) & (~lbmask);
2524         if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2525
2526                 /*
2527                  * Embedded file read file_entry
2528                  * from buffer cache and copy the required
2529                  * portions
2530                  */
2531                 bp = ud_bread(ip->i_dev,
2532                     ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2533                 if ((bp->b_error == 0) &&
2534                     (bp->b_resid == 0)) {
2535
2536                         caddr = bp->b_un.b_addr + ip->i_data_off;
2537
2538                         /*
2539                          * mapin to kvm
2540                          */
2541                         kaddr = (caddr_t)ppmapin(pp,
2542                             PROT_READ | PROT_WRITE, (caddr_t)-1);
2543                         (void) kcopy(caddr, kaddr, ip->i_size);
2544
2545                         /*
2546                          * mapout of kvm
2547                          */
2548                         ppmapout(kaddr);
2549                 }
2550                 brelse(bp);
2551                 contig = ip->i_size;
2552         } else {
2553
2554                 /*
2555                  * Get the continuous size and block number
2556                  * at offset "off"
2557                  */
2558                 if (error = ud_bmap_read(ip, off, &bn, &contig))
2559                         goto out;
2560                 contig = MIN(contig, PAGESIZE);
2561                 contig = (contig + lbmask) & (~lbmask);
2562
2563                 /*
2564                  * Zero part of the page which we are not
2565                  * going to read from the disk.
2566                  */
2567
2568                 if (bn == UDF_HOLE) {
2569
2570                         /*
2571                          * This is a HOLE. Just zero out
2572                          * the page
2573                          */
2574                         if (((off + contig) == isize) ||
2575                             (contig == PAGESIZE)) {
2576                                 pagezero(pp->p_prev, 0, PAGESIZE);
2577                                 goto out;
2578                         }
2579                 }
2580
2581                 if (contig < PAGESIZE) {
2582                         uint64_t count;
2583
2584                         count = isize - off;
2585                         if (contig != count) {
2586                                 multi_io = 1;
2587                                 contig = (int32_t)(MIN(count, PAGESIZE));
2588                         } else {
2589                                 pagezero(pp->p_prev, contig, PAGESIZE - contig);
2590                         }
2591                 }
2592
2593                 /*
2594                  * Get a bp and initialize it
2595                  */
2596                 bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2597                 ASSERT(bp != NULL);
2598
2599                 bp->b_edev = ip->i_dev;
2600                 bp->b_dev = cmpdev(ip->i_dev);
2601                 bp->b_blkno = bn;
2602                 bp->b_un.b_addr = 0;
2603                 bp->b_file = ip->i_vnode;
2604
2605                 /*
2606                  * Start I/O
2607                  */
2608                 if (multi_io == 0) {
2609
2610                         /*
2611                          * Single I/O is sufficient for this page
2612                          */
2613                         (void) bdev_strategy(bp);
2614                 } else {
2615
2616                         /*
2617                          * We need to do the I/O in
2618                          * piece's
2619                          */
2620                         error = ud_multi_strat(ip, pp, bp, off);
2621                         if (error != 0) {
2622                                 goto out;
2623                         }
2624                 }
2625                 if ((bflgs & B_ASYNC) == 0) {
2626
2627                         /*
2628                          * Wait for i/o to complete.
2629                          */
2630
2631                         error = biowait(bp);
2632                         pageio_done(bp);
2633                         if (error) {
2634                                 goto out;
2635                         }
2636                 }
2637         }
2638         if ((off + contig) >= ip->i_size) {
2639                 contig = ip->i_size - off;
2640         }
2641
2642 out:
2643         *pg_off = contig;
2644         return (error);
2645 }
2646
2647 int32_t
2648 ud_putpages(struct vnode *vp, offset_t off,
2649     size_t len, int32_t flags, struct cred *cr)
2650 {
2651         struct ud_inode *ip;
2652         page_t *pp;
2653         uoff_t io_off;
2654         size_t io_len;
2655         uoff_t eoff;
2656         int32_t err = 0;
2657         int32_t dolock;
2658
2659         ud_printf("ud_putpages\n");
2660
2661         if (vp->v_count == 0) {
2662                 cmn_err(CE_WARN, "ud_putpages: bad v_count");
2663                 return (EINVAL);
2664         }
2665
2666         ip = VTOI(vp);
2667
2668         /*
2669          * Acquire the readers/write inode lock before locking
2670          * any pages in this inode.
2671          * The inode lock is held during i/o.
2672          */
2673         if (len == 0) {
2674                 mutex_enter(&ip->i_tlock);
2675                 ip->i_delayoff = ip->i_delaylen = 0;
2676                 mutex_exit(&ip->i_tlock);
2677         }
2678 #ifdef  __lock_lint
2679         rw_enter(&ip->i_contents, RW_READER);
2680 #else
2681         dolock = (rw_owner(&ip->i_contents) != curthread);
2682         if (dolock) {
2683                 rw_enter(&ip->i_contents, RW_READER);
2684         }
2685 #endif
2686
2687         if (!vn_has_cached_data(vp)) {
2688 #ifdef  __lock_lint
2689                 rw_exit(&ip->i_contents);
2690 #else
2691                 if (dolock) {
2692                         rw_exit(&ip->i_contents);
2693                 }
2694 #endif
2695                 return (0);
2696         }
2697
2698         if (len == 0) {
2699                 /*
2700                  * Search the entire vp list for pages >= off.
2701                  */
2702                 err = pvn_vplist_dirty(vp, (uoff_t)off, ud_putapage,
2703                     flags, cr);
2704         } else {
2705                 /*
2706                  * Loop over all offsets in the range looking for
2707                  * pages to deal with.
2708                  */
2709                 if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2710                         eoff = MIN(off + len, eoff);
2711                 } else {
2712                         eoff = off + len;
2713                 }
2714
2715                 for (io_off = off; io_off < eoff; io_off += io_len) {
2716                         /*
2717                          * If we are not invalidating, synchronously
2718                          * freeing or writing pages, use the routine
2719                          * page_lookup_nowait() to prevent reclaiming
2720                          * them from the free list.
2721                          */
2722                         if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2723                                 pp = page_lookup(&vp->v_object, io_off,
2724                                                  (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
2725                         } else {
2726                                 pp = page_lookup_nowait(&vp->v_object,
2727                                                         io_off,
2728                                                         (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2729                         }
2730
2731                         if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2732                                 io_len = PAGESIZE;
2733                         } else {
2734
2735                                 err = ud_putapage(vp, pp,
2736                                     &io_off, &io_len, flags, cr);
2737                                 if (err != 0) {
2738                                         break;
2739                                 }
2740                                 /*
2741                                  * "io_off" and "io_len" are returned as
2742                                  * the range of pages we actually wrote.
2743                                  * This allows us to skip ahead more quickly
2744                                  * since several pages may've been dealt
2745                                  * with by this iteration of the loop.
2746                                  */
2747                         }
2748                 }
2749         }
2750         if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2751                 /*
2752                  * We have just sync'ed back all the pages on
2753                  * the inode, turn off the IMODTIME flag.
2754                  */
2755                 mutex_enter(&ip->i_tlock);
2756                 ip->i_flag &= ~IMODTIME;
2757                 mutex_exit(&ip->i_tlock);
2758         }
2759 #ifdef  __lock_lint
2760         rw_exit(&ip->i_contents);
2761 #else
2762         if (dolock) {
2763                 rw_exit(&ip->i_contents);
2764         }
2765 #endif
2766         return (err);
2767 }
2768
2769 /* ARGSUSED */
2770 int32_t
2771 ud_putapage(struct vnode *vp,
2772     page_t *pp, uoff_t *offp,
2773     size_t *lenp, int32_t flags, struct cred *cr)
2774 {
2775         daddr_t bn;
2776         size_t io_len;
2777         struct ud_inode *ip;
2778         int32_t error = 0, contig, multi_io = 0;
2779         struct udf_vfs *udf_vfsp;
2780         uoff_t off, io_off;
2781         caddr_t kaddr, caddr;
2782         struct buf *bp = NULL;
2783         int32_t lbmask;
2784         uint64_t isize;
2785         uint16_t crc_len;
2786         struct file_entry *fe;
2787
2788         ud_printf("ud_putapage\n");
2789
2790         ip = VTOI(vp);
2791         ASSERT(ip);
2792         ASSERT(RW_LOCK_HELD(&ip->i_contents));
2793         lbmask = ip->i_udf->udf_lbmask;
2794         isize = (ip->i_size + lbmask) & (~lbmask);
2795
2796         udf_vfsp = ip->i_udf;
2797         ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2798
2799         /*
2800          * If the modified time on the inode has not already been
2801          * set elsewhere (e.g. for write/setattr) we set the time now.
2802          * This gives us approximate modified times for mmap'ed files
2803          * which are modified via stores in the user address space.
2804          */
2805         if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2806                 mutex_enter(&ip->i_tlock);
2807                 ip->i_flag |= IUPD;
2808                 ITIMES_NOLOCK(ip);
2809                 mutex_exit(&ip->i_tlock);
2810         }
2811
2812
2813         /*
2814          * Align the request to a block boundry (for old file systems),
2815          * and go ask bmap() how contiguous things are for this file.
2816          */
2817         off = pp->p_offset & ~(offset_t)lbmask;
2818                                 /* block align it */
2819
2820
2821         if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2822                 ASSERT(ip->i_size <= ip->i_max_emb);
2823
2824                 pp = pvn_write_kluster(vp, pp, &io_off,
2825                     &io_len, off, PAGESIZE, flags);
2826                 if (io_len == 0) {
2827                         io_len = PAGESIZE;
2828                 }
2829
2830                 bp = ud_bread(ip->i_dev,
2831                     ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2832                     udf_vfsp->udf_lbsize);
2833                 fe = (struct file_entry *)bp->b_un.b_addr;
2834                 if ((bp->b_flags & B_ERROR) ||
2835                     (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2836                     ip->i_icb_block,
2837                     1, udf_vfsp->udf_lbsize) != 0)) {
2838                         if (pp != NULL)
2839                                 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2840                         if (bp->b_flags & B_ERROR) {
2841                                 error = EIO;
2842                         } else {
2843                                 error = EINVAL;
2844                         }
2845                         brelse(bp);
2846                         return (error);
2847                 }
2848                 if ((bp->b_error == 0) &&
2849                     (bp->b_resid == 0)) {
2850
2851                         caddr = bp->b_un.b_addr + ip->i_data_off;
2852                         kaddr = (caddr_t)ppmapin(pp,
2853                             PROT_READ | PROT_WRITE, (caddr_t)-1);
2854                         (void) kcopy(kaddr, caddr, ip->i_size);
2855                         ppmapout(kaddr);
2856                 }
2857                 crc_len = offsetof(struct file_entry, fe_spec) +
2858                     SWAP_32(fe->fe_len_ear);
2859                 crc_len += ip->i_size;
2860                 ud_make_tag(ip->i_udf, &fe->fe_tag,
2861                     UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2862
2863                 bwrite(bp);
2864
2865                 if (flags & B_ASYNC) {
2866                         pvn_write_done(pp, flags);
2867                 }
2868                 contig = ip->i_size;
2869         } else {
2870
2871                 if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2872                         goto out;
2873                 }
2874                 contig = MIN(contig, PAGESIZE);
2875                 contig = (contig + lbmask) & (~lbmask);
2876
2877                 if (contig < PAGESIZE) {
2878                         uint64_t count;
2879
2880                         count = isize - off;
2881                         if (contig != count) {
2882                                 multi_io = 1;
2883                                 contig = (int32_t)(MIN(count, PAGESIZE));
2884                         }
2885                 }
2886
2887                 if ((off + contig) > isize) {
2888                         contig = isize - off;
2889                 }
2890
2891                 if (contig > PAGESIZE) {
2892                         if (contig & PAGEOFFSET) {
2893                                 contig &= PAGEMASK;
2894                         }
2895                 }
2896
2897                 pp = pvn_write_kluster(vp, pp, &io_off,
2898                     &io_len, off, contig, flags);
2899                 if (io_len == 0) {
2900                         io_len = PAGESIZE;
2901                 }
2902
2903                 bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2904                 ASSERT(bp != NULL);
2905
2906                 bp->b_edev = ip->i_dev;
2907                 bp->b_dev = cmpdev(ip->i_dev);
2908                 bp->b_blkno = bn;
2909                 bp->b_un.b_addr = 0;
2910                 bp->b_file = vp;
2911                 bp->b_offset = (offset_t)off;
2912
2913
2914                 /*
2915                  * write throttle
2916                  */
2917                 ASSERT(bp->b_iodone == NULL);
2918                 bp->b_iodone = ud_iodone;
2919                 mutex_enter(&ip->i_tlock);
2920                 ip->i_writes += bp->b_bcount;
2921                 mutex_exit(&ip->i_tlock);
2922
2923                 if (multi_io == 0) {
2924
2925                         (void) bdev_strategy(bp);
2926                 } else {
2927                         error = ud_multi_strat(ip, pp, bp, off);
2928                         if (error != 0) {
2929                                 goto out;
2930                         }
2931                 }
2932
2933                 if ((flags & B_ASYNC) == 0) {
2934                         /*
2935                          * Wait for i/o to complete.
2936                          */
2937                         error = biowait(bp);
2938                         pageio_done(bp);
2939                 }
2940         }
2941
2942         if ((flags & B_ASYNC) == 0) {
2943                 pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2944         }
2945
2946         pp = NULL;
2947
2948 out:
2949         if (error != 0 && pp != NULL) {
2950                 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2951         }
2952
2953         if (offp) {
2954                 *offp = io_off;
2955         }
2956         if (lenp) {
2957                 *lenp = io_len;
2958         }
2959
2960         return (error);
2961 }
2962
2963
2964 int32_t
2965 ud_iodone(struct buf *bp)
2966 {
2967         struct ud_inode *ip;
2968
2969         VERIFY(bp->b_pages->p_object != NULL);
2970         ASSERT(bp->b_pages->p_vnode != NULL);
2971         ASSERT(!(bp->b_flags & B_READ));
2972
2973         bp->b_iodone = NULL;
2974
2975         ip = VTOI(bp->b_pages->p_vnode);
2976
2977         mutex_enter(&ip->i_tlock);
2978         if (ip->i_writes >= ud_LW) {
2979                 if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2980                         if (ud_WRITES) {
2981                                 cv_broadcast(&ip->i_wrcv); /* wake all up */
2982                         }
2983                 }
2984         } else {
2985                 ip->i_writes -= bp->b_bcount;
2986         }
2987         mutex_exit(&ip->i_tlock);
2988         iodone(bp);
2989         return (0);
2990 }
2991
2992 /* ARGSUSED3 */
2993 int32_t
2994 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
2995 {
2996         struct vnode *vp;
2997         struct udf_vfs *udf_vfsp;
2998         krw_t rwtype;
2999         caddr_t base;
3000         uint32_t flags;
3001         int32_t error, n, on, mapon, dofree;
3002         uoff_t off;
3003         long oresid = uio->uio_resid;
3004
3005         ASSERT(RW_LOCK_HELD(&ip->i_contents));
3006         if ((ip->i_type != VREG) &&
3007             (ip->i_type != VDIR) &&
3008             (ip->i_type != VLNK)) {
3009                 return (EIO);
3010         }
3011
3012         if (uio->uio_loffset > MAXOFFSET_T) {
3013                 return (0);
3014         }
3015
3016         if ((uio->uio_loffset < 0) ||
3017             ((uio->uio_loffset + uio->uio_resid) < 0)) {
3018                 return (EINVAL);
3019         }
3020         if (uio->uio_resid == 0) {
3021                 return (0);
3022         }
3023
3024         vp = ITOV(ip);
3025         udf_vfsp = ip->i_udf;
3026         mutex_enter(&ip->i_tlock);
3027         ip->i_flag |= IACC;
3028         mutex_exit(&ip->i_tlock);
3029
3030         rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
3031
3032         do {
3033                 offset_t diff;
3034                 uoff_t uoff = uio->uio_loffset;
3035                 off = uoff & (offset_t)MAXBMASK;
3036                 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3037                 on = (int)blkoff(udf_vfsp, uoff);
3038                 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3039
3040                 diff = ip->i_size - uoff;
3041
3042                 if (diff <= 0) {
3043                         error = 0;
3044                         goto out;
3045                 }
3046                 if (diff < (offset_t)n) {
3047                         n = (int)diff;
3048                 }
3049                 dofree = ud_freebehind &&
3050                     ip->i_nextr == (off & PAGEMASK) &&
3051                     off > ud_smallfile;
3052
3053 #ifndef __lock_lint
3054                 if (rwtype == RW_READER) {
3055                         rw_exit(&ip->i_contents);
3056                 }
3057 #endif
3058
3059                 base = segmap_getmapflt(segkmap, vp, (off + mapon),
3060                     (uint32_t)n, 1, S_READ);
3061                 error = uiomove(base + mapon, (long)n, UIO_READ, uio);
3062
3063                 flags = 0;
3064                 if (!error) {
3065                         /*
3066                          * If read a whole block, or read to eof,
3067                          * won't need this buffer again soon.
3068                          */
3069                         if (n + on == MAXBSIZE && ud_freebehind && dofree &&
3070                             freemem < lotsfree + pages_before_pager) {
3071                                 flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
3072                         }
3073                         /*
3074                          * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3075                          * we want to make sure that the page which has
3076                          * been read, is written on disk if it is dirty.
3077                          * And corresponding indirect blocks should also
3078                          * be flushed out.
3079                          */
3080                         if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3081                                 flags &= ~SM_ASYNC;
3082                                 flags |= SM_WRITE;
3083                         }
3084                         error = segmap_release(segkmap, base, flags);
3085                 } else    {
3086                         (void) segmap_release(segkmap, base, flags);
3087                 }
3088
3089 #ifndef __lock_lint
3090                 if (rwtype == RW_READER) {
3091                         rw_enter(&ip->i_contents, rwtype);
3092                 }
3093 #endif
3094         } while (error == 0 && uio->uio_resid > 0 && n != 0);
3095 out:
3096         /*
3097          * Inode is updated according to this table if FRSYNC is set.
3098          *
3099          *      FSYNC   FDSYNC(posix.4)
3100          *      --------------------------
3101          *      always  IATTCHG|IBDWRITE
3102          */
3103         if (ioflag & FRSYNC) {
3104                 if ((ioflag & FSYNC) ||
3105                     ((ioflag & FDSYNC) &&
3106                     (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3107                 rw_exit(&ip->i_contents);
3108                 rw_enter(&ip->i_contents, RW_WRITER);
3109                 ud_iupdat(ip, 1);
3110                 }
3111         }
3112         /*
3113          * If we've already done a partial read, terminate
3114          * the read but return no error.
3115          */
3116         if (oresid != uio->uio_resid) {
3117                 error = 0;
3118         }
3119         ITIMES(ip);
3120
3121         return (error);
3122 }
3123
3124 int32_t
3125 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
3126 {
3127         caddr_t base;
3128         struct vnode *vp;
3129         struct udf_vfs *udf_vfsp;
3130         uint32_t flags;
3131         int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
3132         int32_t pagecreate, newpage;
3133         uint64_t old_i_size;
3134         uoff_t off;
3135         long start_resid = uio->uio_resid, premove_resid;
3136         rlim64_t limit = uio->uio_limit;
3137
3138
3139         ASSERT(RW_WRITE_HELD(&ip->i_contents));
3140         if ((ip->i_type != VREG) &&
3141             (ip->i_type != VDIR) &&
3142             (ip->i_type != VLNK)) {
3143                 return (EIO);
3144         }
3145
3146         if (uio->uio_loffset >= MAXOFFSET_T) {
3147                 return (EFBIG);
3148         }
3149         /*
3150          * see udf_l_pathconf
3151          */
3152         if (limit > (((uint64_t)1 << 40) - 1)) {
3153                 limit = ((uint64_t)1 << 40) - 1;
3154         }
3155         if (uio->uio_loffset >= limit) {
3156                 proc_t *p = ttoproc(curthread);
3157
3158                 mutex_enter(&p->p_lock);
3159                 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
3160                     p, RCA_UNSAFE_SIGINFO);
3161                 mutex_exit(&p->p_lock);
3162                 return (EFBIG);
3163         }
3164         if ((uio->uio_loffset < 0) ||
3165             ((uio->uio_loffset + uio->uio_resid) < 0)) {
3166                 return (EINVAL);
3167         }
3168         if (uio->uio_resid == 0) {
3169                 return (0);
3170         }
3171
3172         mutex_enter(&ip->i_tlock);
3173         ip->i_flag |= INOACC;
3174
3175         if (ioflag & (FSYNC | FDSYNC)) {
3176                 ip->i_flag |= ISYNC;
3177                 iupdat_flag = 1;
3178         }
3179         mutex_exit(&ip->i_tlock);
3180
3181         udf_vfsp = ip->i_udf;
3182         vp = ITOV(ip);
3183
3184         do {
3185                 uoff_t uoff = uio->uio_loffset;
3186                 off = uoff & (offset_t)MAXBMASK;
3187                 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3188                 on = (int)blkoff(udf_vfsp, uoff);
3189                 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3190
3191                 if (ip->i_type == VREG && uoff + n >= limit) {
3192                         if (uoff >= limit) {
3193                                 error = EFBIG;
3194                                 goto out;
3195                         }
3196                         n = (int)(limit - (rlim64_t)uoff);
3197                 }
3198                 if (uoff + n > ip->i_size) {
3199                         /*
3200                          * We are extending the length of the file.
3201                          * bmap is used so that we are sure that
3202                          * if we need to allocate new blocks, that it
3203                          * is done here before we up the file size.
3204                          */
3205                         error = ud_bmap_write(ip, uoff,
3206                             (int)(on + n), mapon == 0, cr);
3207                         if (error) {
3208                                 break;
3209                         }
3210                         i_size_changed = 1;
3211                         old_i_size = ip->i_size;
3212                         ip->i_size = uoff + n;
3213                         /*
3214                          * If we are writing from the beginning of
3215                          * the mapping, we can just create the
3216                          * pages without having to read them.
3217                          */
3218                         pagecreate = (mapon == 0);
3219                 } else if (n == MAXBSIZE) {
3220                         /*
3221                          * Going to do a whole mappings worth,
3222                          * so we can just create the pages w/o
3223                          * having to read them in.  But before
3224                          * we do that, we need to make sure any
3225                          * needed blocks are allocated first.
3226                          */
3227                         error = ud_bmap_write(ip, uoff,
3228                             (int)(on + n), 1, cr);
3229                         if (error) {
3230                                 break;
3231                         }
3232                         pagecreate = 1;
3233                 } else {
3234                         pagecreate = 0;
3235                 }
3236
3237                 rw_exit(&ip->i_contents);
3238
3239                 /*
3240                  * Touch the page and fault it in if it is not in
3241                  * core before segmap_getmapflt can lock it. This
3242                  * is to avoid the deadlock if the buffer is mapped
3243                  * to the same file through mmap which we want to
3244                  * write to.
3245                  */
3246                 uio_prefaultpages((long)n, uio);
3247
3248                 base = segmap_getmapflt(segkmap, vp, (off + mapon),
3249                     (uint32_t)n, !pagecreate, S_WRITE);
3250
3251                 /*
3252                  * segmap_pagecreate() returns 1 if it calls
3253                  * page_create_va() to allocate any pages.
3254                  */
3255                 newpage = 0;
3256                 if (pagecreate) {
3257                         newpage = segmap_pagecreate(segkmap, base,
3258                             (size_t)n, 0);
3259                 }
3260
3261                 premove_resid = uio->uio_resid;
3262                 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3263
3264                 if (pagecreate &&
3265                     uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3266                         /*
3267                          * We created pages w/o initializing them completely,
3268                          * thus we need to zero the part that wasn't set up.
3269                          * This happens on most EOF write cases and if
3270                          * we had some sort of error during the uiomove.
3271                          */
3272                         int nzero, nmoved;
3273
3274                         nmoved = (int)(uio->uio_loffset - (off + mapon));
3275                         ASSERT(nmoved >= 0 && nmoved <= n);
3276                         nzero = roundup(on + n, PAGESIZE) - nmoved;
3277                         ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3278                         (void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3279                 }
3280
3281                 /*
3282                  * Unlock the pages allocated by page_create_va()
3283                  * in segmap_pagecreate()
3284                  */
3285                 if (newpage) {
3286                         segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3287                 }
3288
3289                 if (error) {
3290                         /*
3291                          * If we failed on a write, we may have already
3292                          * allocated file blocks as well as pages.  It's
3293                          * hard to undo the block allocation, but we must
3294                          * be sure to invalidate any pages that may have
3295                          * been allocated.
3296                          */
3297                         (void) segmap_release(segkmap, base, SM_INVAL);
3298                 } else {
3299                         flags = 0;
3300                         /*
3301                          * Force write back for synchronous write cases.
3302                          */
3303                         if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3304                                 /*
3305                                  * If the sticky bit is set but the
3306                                  * execute bit is not set, we do a
3307                                  * synchronous write back and free
3308                                  * the page when done.  We set up swap
3309                                  * files to be handled this way to
3310                                  * prevent servers from keeping around
3311                                  * the client's swap pages too long.
3312                                  * XXX - there ought to be a better way.
3313                                  */
3314                                 if (IS_SWAPVP(vp)) {
3315                                         flags = SM_WRITE | SM_FREE |
3316                                             SM_DONTNEED;
3317                                         iupdat_flag = 0;
3318                                 } else {
3319                                         flags = SM_WRITE;
3320                                 }
3321                         } else if (((mapon + n) == MAXBSIZE) ||
3322                             IS_SWAPVP(vp)) {
3323                                 /*
3324                                  * Have written a whole block.
3325                                  * Start an asynchronous write and
3326                                  * mark the buffer to indicate that
3327                                  * it won't be needed again soon.
3328                                  */
3329                                 flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
3330                         }
3331                         error = segmap_release(segkmap, base, flags);
3332
3333                         /*
3334                          * If the operation failed and is synchronous,
3335                          * then we need to unwind what uiomove() last
3336                          * did so we can potentially return an error to
3337                          * the caller.  If this write operation was
3338                          * done in two pieces and the first succeeded,
3339                          * then we won't return an error for the second
3340                          * piece that failed.  However, we only want to
3341                          * return a resid value that reflects what was
3342                          * really done.
3343                          *
3344                          * Failures for non-synchronous operations can
3345                          * be ignored since the page subsystem will
3346                          * retry the operation until it succeeds or the
3347                          * file system is unmounted.
3348                          */
3349                         if (error) {
3350                                 if ((ioflag & (FSYNC | FDSYNC)) ||
3351                                     ip->i_type == VDIR) {
3352                                         uio->uio_resid = premove_resid;
3353                                 } else {
3354                                         error = 0;
3355                                 }
3356                         }
3357                 }
3358
3359                 /*
3360                  * Re-acquire contents lock.
3361                  */
3362                 rw_enter(&ip->i_contents, RW_WRITER);
3363                 /*
3364                  * If the uiomove() failed or if a synchronous
3365                  * page push failed, fix up i_size.
3366                  */
3367                 if (error) {
3368                         if (i_size_changed) {
3369                                 /*
3370                                  * The uiomove failed, and we
3371                                  * allocated blocks,so get rid
3372                                  * of them.
3373                                  */
3374                                 (void) ud_itrunc(ip, old_i_size, 0, cr);
3375                         }
3376                 } else {
3377                         /*
3378                          * XXX - Can this be out of the loop?
3379                          */
3380                         ip->i_flag |= IUPD | ICHG;
3381                         if (i_size_changed) {
3382                                 ip->i_flag |= IATTCHG;
3383                         }
3384                         if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3385                             (IEXEC >> 10))) != 0 &&
3386                             (ip->i_char & (ISUID | ISGID)) != 0 &&
3387                             secpolicy_vnode_setid_retain(cr,
3388                             (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3389                                 /*
3390                                  * Clear Set-UID & Set-GID bits on
3391                                  * successful write if not privileged
3392                                  * and at least one of the execute bits
3393                                  * is set.  If we always clear Set-GID,
3394                                  * mandatory file and record locking is
3395                                  * unuseable.
3396                                  */
3397                                 ip->i_char &= ~(ISUID | ISGID);
3398                         }
3399                 }
3400         } while (error == 0 && uio->uio_resid > 0 && n != 0);
3401
3402 out:
3403         /*
3404          * Inode is updated according to this table -
3405          *
3406          *      FSYNC   FDSYNC(posix.4)
3407          *      --------------------------
3408          *      always@ IATTCHG|IBDWRITE
3409          *
3410          * @ -  If we are doing synchronous write the only time we should
3411          *      not be sync'ing the ip here is if we have the stickyhack
3412          *      activated, the file is marked with the sticky bit and
3413          *      no exec bit, the file length has not been changed and
3414          *      no new blocks have been allocated during this write.
3415          */
3416         if ((ip->i_flag & ISYNC) != 0) {
3417                 /*
3418                  * we have eliminated nosync
3419                  */
3420                 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3421                     ((ioflag & FSYNC) && iupdat_flag)) {
3422                         ud_iupdat(ip, 1);
3423                 }
3424         }
3425
3426         /*
3427          * If we've already done a partial-write, terminate
3428          * the write but return no error.
3429          */
3430         if (start_resid != uio->uio_resid) {
3431                 error = 0;
3432         }
3433         ip->i_flag &= ~(INOACC | ISYNC);
3434         ITIMES_NOLOCK(ip);
3435
3436         return (error);
3437 }
3438
3439 int32_t
3440 ud_multi_strat(struct ud_inode *ip,
3441     page_t *pp, struct buf *bp, uoff_t start)
3442 {
3443         daddr_t bn;
3444         int32_t error = 0, io_count, contig, alloc_sz, i;
3445         uint32_t io_off;
3446         mio_master_t *mm = NULL;
3447         mio_slave_t *ms = NULL;
3448         struct buf *rbp;
3449
3450         ASSERT(!(start & PAGEOFFSET));
3451
3452         /*
3453          * Figure out how many buffers to allocate
3454          */
3455         io_count = 0;
3456         for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3457                 contig = 0;
3458                 if (error = ud_bmap_read(ip, (uoff_t)(start + io_off),
3459                     &bn, &contig)) {
3460                         goto end;
3461                 }
3462                 if (contig == 0) {
3463                         goto end;
3464                 }
3465                 contig = MIN(contig, PAGESIZE - io_off);
3466                 if (bn != UDF_HOLE) {
3467                         io_count ++;
3468                 } else {
3469                         /*
3470                          * HOLE
3471                          */
3472                         if (bp->b_flags & B_READ) {
3473
3474                                 /*
3475                                  * This is a hole and is read
3476                                  * it should be filled with 0's
3477                                  */
3478                                 pagezero(pp, io_off, contig);
3479                         }
3480                 }
3481         }
3482
3483
3484         if (io_count != 0) {
3485
3486                 /*
3487                  * Allocate memory for all the
3488                  * required number of buffers
3489                  */
3490                 alloc_sz = sizeof (mio_master_t) +
3491                     (sizeof (mio_slave_t) * io_count);
3492                 mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3493                 if (mm == NULL) {
3494                         error = ENOMEM;
3495                         goto end;
3496                 }
3497
3498                 /*
3499                  * initialize master
3500                  */
3501                 mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3502                 mm->mm_size = alloc_sz;
3503                 mm->mm_bp = bp;
3504                 mm->mm_resid = 0;
3505                 mm->mm_error = 0;
3506                 mm->mm_index = master_index++;
3507
3508                 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3509
3510                 /*
3511                  * Initialize buffers
3512                  */
3513                 io_count = 0;
3514                 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3515                         contig = 0;
3516                         if (error = ud_bmap_read(ip,
3517                             (uoff_t)(start + io_off),
3518                             &bn, &contig)) {
3519                                 goto end;
3520                         }
3521                         ASSERT(contig);
3522                         if ((io_off + contig) > bp->b_bcount) {
3523                                 contig = bp->b_bcount - io_off;
3524                         }
3525                         if (bn != UDF_HOLE) {
3526                                 /*
3527                                  * Clone the buffer
3528                                  * and prepare to start I/O
3529                                  */
3530                                 ms->ms_ptr = mm;
3531                                 bioinit(&ms->ms_buf);
3532                                 rbp = bioclone(bp, io_off, (size_t)contig,
3533                                     bp->b_edev, bn, ud_slave_done,
3534                                     &ms->ms_buf, KM_NOSLEEP);
3535                                 ASSERT(rbp == &ms->ms_buf);
3536                                 mm->mm_resid += contig;
3537                                 io_count++;
3538                                 ms ++;
3539                         }
3540                 }
3541
3542                 /*
3543                  * Start I/O's
3544                  */
3545                 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3546                 for (i = 0; i < io_count; i++) {
3547                         (void) bdev_strategy(&ms->ms_buf);
3548                         ms ++;
3549                 }
3550         }
3551
3552 end:
3553         if (error != 0) {
3554                 bp->b_flags |= B_ERROR;
3555                 bp->b_error = error;
3556                 if (mm != NULL) {
3557                         mutex_destroy(&mm->mm_mutex);
3558                         kmem_free(mm, mm->mm_size);
3559                 }
3560         }
3561         return (error);
3562 }
3563
3564 int32_t
3565 ud_slave_done(struct buf *bp)
3566 {
3567         mio_master_t *mm;
3568         int32_t resid;
3569
3570         ASSERT(SEMA_HELD(&bp->b_sem));
3571         ASSERT((bp->b_flags & B_DONE) == 0);
3572
3573         mm = ((mio_slave_t *)bp)->ms_ptr;
3574
3575         /*
3576          * Propagate error and byte count info from slave struct to
3577          * the master struct
3578          */
3579         mutex_enter(&mm->mm_mutex);
3580         if (bp->b_flags & B_ERROR) {
3581
3582                 /*
3583                  * If multiple slave buffers get
3584                  * error we forget the old errors
3585                  * this is ok because we any way
3586                  * cannot return multiple errors
3587                  */
3588                 mm->mm_error = bp->b_error;
3589         }
3590         mm->mm_resid -= bp->b_bcount;
3591         resid = mm->mm_resid;
3592         mutex_exit(&mm->mm_mutex);
3593
3594         /*
3595          * free up the resources allocated to cloned buffers.
3596          */
3597         bp_mapout(bp);
3598         biofini(bp);
3599
3600         if (resid == 0) {
3601
3602                 /*
3603                  * This is the last I/O operation
3604                  * clean up and return the original buffer
3605                  */
3606                 if (mm->mm_error) {
3607                         mm->mm_bp->b_flags |= B_ERROR;
3608                         mm->mm_bp->b_error = mm->mm_error;
3609                 }
3610                 biodone(mm->mm_bp);
3611                 mutex_destroy(&mm->mm_mutex);
3612                 kmem_free(mm, mm->mm_size);
3613         }
3614         return (0);
3615 }