usr/src/uts/common/fs/ufs/ufs_bmap.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  26 /*        All Rights Reserved   */
  27
  28 /*
  29  * University Copyright- Copyright (c) 1982, 1986, 1988
  30  * The Regents of the University of California
  31  * All Rights Reserved
  32  *
  33  * University Acknowledgment- Portions of this document are derived from
  34  * software developed by the University of California, Berkeley, and its
  35  * contributors.
  36  */
  37
  38
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/systm.h>
  43 #include <sys/signal.h>
  44 #include <sys/user.h>
  45 #include <sys/vnode.h>
  46 #include <sys/buf.h>
  47 #include <sys/disp.h>
  48 #include <sys/proc.h>
  49 #include <sys/conf.h>
  50 #include <sys/fs/ufs_inode.h>
  51 #include <sys/fs/ufs_fs.h>
  52 #include <sys/fs/ufs_quota.h>
  53 #include <sys/fs/ufs_trans.h>
  54 #include <sys/fs/ufs_bio.h>
  55 #include <vm/seg.h>
  56 #include <sys/errno.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/vfs.h>
  59 #include <sys/debug.h>
  60 #include <sys/kmem.h>
  61 #include <sys/cmn_err.h>
  62
  63 /*
  64  * This structure is used to track blocks as we allocate them, so that
  65  * we can free them if we encounter an error during allocation.  We
  66  * keep track of five pieces of information for each allocated block:
  67  *   - The number of the newly allocated block
  68  *   - The size of the block (lets us deal with fragments if we want)
  69  *   - The number of the block containing a pointer to it; or whether
  70  *     the pointer is in the inode
  71  *   - The offset within the block (or inode) containing a pointer to it.
  72  *   - A flag indicating the usage of the block.  (Logging needs to know
  73  *     this to avoid overwriting a data block if it was previously used
  74  *     for metadata.)
  75  */
  76
  77 enum ufs_owner_type {
             /*
              * Says where the pointer to a tracked block is stored; kept in
              * the `owner' member of struct ufs_allocated_block.  bmap_write
              * records ufs_no_owner when a block is first allocated and
              * switches to ufs_inode_indirect once the block has been
              * stored in ip->i_ib[].
              */
  78         ufs_no_owner,           /* Owner has not yet been updated */
  79         ufs_inode_direct,       /* Listed in inode's direct block table */
  80         ufs_inode_indirect,     /* Listed in inode's indirect block table */
  81         ufs_indirect_block      /* Listed in an indirect block */
  82 };
  83
  84 struct ufs_allocated_block {
             /*
              * One entry per block allocated during a single bmap_write
              * call.  bmap_write keeps NIADDR+1 of these in a local
              * undo_table[] and hands the table to ufs_undo_allocation()
              * when a later step fails.
              *
              * For a block linked from the inode's indirect table,
              * bmap_write sets owner_offset to the i_ib[] index and does
              * not set owner_block.
              * NOTE(review): owner_block is presumably meaningful only when
              * owner == ufs_indirect_block -- confirm in
              * ufs_undo_allocation().
              */
  85         daddr_t this_block;         /* Number of this block */
  86         off_t block_size;           /* Size of this block, in bytes */
  87         enum ufs_owner_type owner;  /* Who points to this block? */
  88         daddr_t owner_block;        /* Number of the owning block */
  89         uint_t owner_offset;        /* Offset within that block or inode */
  90         int usage_flags;            /* Usage flags, as expected by free() */
  91 };
  92
  93
  94 static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
  95                 int maxtrans);
  96
  97 static void ufs_undo_allocation(inode_t *ip, int block_count,
  98         struct ufs_allocated_block table[], int inode_sector_adjust);
  99
 100 /*
 101  * Find the extent and the matching block number.
 102  *
 103  * bsize > PAGESIZE
 104  *      boff indicates that we want a page in the middle
 105  *      min expression is supposed to make sure no extra page[s] after EOF
 106  * PAGESIZE >= bsize
 107  *      we assume that a page is a multiple of bsize, i.e.,
 108  *      boff always == 0
 109  *
 110  * We always return a length that is suitable for a disk transfer.
      *
      * Arguments:
      *      fs        file system (struct fs *).  Evaluated several times
      *                and used unparenthesized in one place, so pass a
      *                plain identifier.
      *      lbn       logical block number being mapped; only used when
      *                chkfrag is nonzero
      *      boff      byte offset into that block
      *      bnp       out: disk block number, or UFS_HOLE
      *      lenp      out: extent length in bytes; also handed to
      *                findextent()
      *      size      file size in bytes (bmap_read passes ip->i_size);
      *                only used when chkfrag is nonzero
      *      tblp      address of the daddr32_t block pointer for lbn
      *      n         limit handed to findextent() (presumably the number
      *                of consecutive pointers it may look at -- confirm
      *                against findextent())
      *      chkfrag   nonzero to clip the length to
      *                fragroundup(fs, size) - (lbn << fs_bshift)
      *      maxtrans  passed straight through to findextent()
      *
      * The value returned by findextent() is shifted left by fs_bshift, i.e.
      * it is treated as a count of file system blocks.
      *
      * *bnp is set to UFS_HOLE when *tblp is 0, or when nothing is left once
      * boff has been subtracted from the length.  The macro itself stores
      * *lenp only in the non-hole case.
      *
      * The expansion is a bare { } block that declares its own locals (dp,
      * _chkfrag, len, tmp); it is not wrapped in do { } while (0).
 111  */
 112 #define DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
 113         register daddr32_t *dp = (tblp);                                \
 114         register int _chkfrag = chkfrag; /* for lint. sigh */           \
 115                                                                         \
 116         if (*dp == 0) {                                                 \
 117                 *(bnp) = UFS_HOLE;                                      \
 118         } else {                                                        \
 119                 register int len;                                       \
 120                                                                         \
 121                 len = findextent(fs, dp, (int)(n), lenp, maxtrans) <<   \
 122                         (fs)->fs_bshift;                                \
 123                 if (_chkfrag) {                                         \
 124                         register u_offset_t tmp;                        \
 125                                                                         \
 126                         tmp = fragroundup((fs), size) -                 \
 127                             (((u_offset_t)lbn) << fs->fs_bshift);       \
 128                         len = (int)MIN(tmp, len);                       \
 129                 }                                                       \
 130                 len -= (boff);                                          \
 131                 if (len <= 0) {                                         \
 132                         *(bnp) = UFS_HOLE;                              \
 133                 } else {                                                \
 134                         *(bnp) = fsbtodb(fs, *dp) + btodb(boff);        \
 135                         *(lenp) = len;                                  \
 136                 }                                                       \
 137         }                                                               \
 138 }
 139
 140 /*
 141  * The maximum supported file size is actually somewhat less than 1
 142  * terabyte.  This is because the total number of blocks used for the
 143  * file and its metadata must fit into the ic_blocks field of the
 144  * inode, which is a signed 32-bit quantity.  The metadata allocated
 145  * for a file (that is, the single, double, and triple indirect blocks
 146  * used to reference the file blocks) is actually quite small,
 147  * but just to make sure, we check for overflow in the ic_blocks
 148  * field for all files whose total block count is
 149  * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 150  * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 151  * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 152  * field if the number of blocks currently allocated to the file is
 153  * greater than VERYLARGEFILESIZE.
 154  *
 155  * Note that file "size" is not the same as file "length".  A
 156  * file's "size" is the number of blocks allocated to it.  A file's
 157  * "length" is the maximum offset in the file.  A UFS FILE can have a
 158  * length of a terabyte, but the size is limited to somewhat less than
 159  * a terabyte, as described above.
      *
      * 2^31 - 2^21 == 0x80000000 - 0x00200000 == 0x7FE00000.
      *
      * bmap_write compares ip->i_blocks against this value with ">=" (not
      * a strict ">") and, when the test is true, rejects any allocation
      * that would push i_blocks past INT_MAX with EFBIG.
 160  */
 161 #define VERYLARGEFILESIZE       0x7FE00000
 162
 163 /*
 164  * bmap{read,write} define the structure of file system storage by mapping
 165  * a logical offset in a file to a physical block number on the device.
 166  * It should be called with a locked inode when allocation is to be
 167  * done (bmap_write).  Note this strangeness: bmap_write is always called from
 168  * getpage(), not putpage(), since getpage() is where all the allocation
 169  * is done.
 170  *
 171  * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
 172  *
 173  * NOTICE: the block number returned is the disk block number, not the
 174  * file system block number.  All the worries about block offsets and
 175  * page/block sizes are hidden inside of bmap.  Well, not quite,
 176  * unfortunately.  It's impossible to find one place to hide all this
 177  * mess.  There are 3 cases:
 178  *
 179  * PAGESIZE < bsize
 180  *      In this case, the {get,put}page routines will attempt to align to
 181  *      a file system block boundary (XXX - maybe this is a mistake?).  Since
 182  *      the kluster routines may be out of memory, we don't always get all
 183  *      the pages we wanted.  If we called bmap first, to find out how much
 184  *      to kluster, we handed in the block aligned offset.  If we didn't get
 185  *      all the pages, we have to chop off the amount we didn't get from the
 186  *      amount handed back by bmap.
 187  *
 188  * PAGESIZE == bsize
 189  *      Life is quite pleasant here, no extra work needed, mainly because we
 190  *      (probably?) won't kluster backwards, just forwards.
 191  *
 192  * PAGESIZE > bsize
 193  *      This one has a different set of problems, specifically, we may have to
 194  *      do N reads to fill one page.  Let us hope that Sun will stay with small
 195  *      pages.
 196  *
 197  * Returns 0 on success, or a non-zero errno if an error occurs.
 198  *
 199  * TODO
 200  *      LMXXX - add a bmap cache.  This could be a couple of extents in the
 201  *      inode.  Two is nice for PAGESIZE > bsize.
 202  */
 203
     /*
      * bmap_read: map byte offset `off' of inode `ip' to a disk address
      * without allocating anything.
      *
      * The caller must hold ip->i_contents; this is asserted with
      * RW_LOCK_HELD.
      *
      *      off     byte offset within the file
      *      bnp     out: disk block number (fsbtodb() of the fs block plus
      *              btodb() of the offset within the block), or UFS_HOLE if
      *              nothing is allocated there
      *      lenp    out: length in bytes of the extent starting at *bnp, as
      *              computed by DOEXTENT.  Not written on the paths below
      *              that store UFS_HOLE directly because an indirect pointer
      *              is zero.
      *
      * Returns 0 (holes included), EFBIG if the logical block number is
      * negative or cannot be reached through NIADDR levels of indirection,
      * or EIO if reading an indirect block fails.
      */
 204 int
 205 bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
 206 {
 207         daddr_t lbn;
 208         ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
 209         struct  fs *fs = ufsvfsp->vfs_fs;
 210         struct  buf *bp;
 211         int     i, j, boff;
 212         int     shft;                   /* we maintain sh = 1 << shft */
 213         daddr_t ob, nb, tbn;
 214         daddr32_t *bap;
 215         int     nindirshift, nindiroffset;
 216
 217         ASSERT(RW_LOCK_HELD(&ip->i_contents));
 218         lbn = (daddr_t)lblkno(fs, off);
 219         boff = (int)blkoff(fs, off);
 220         if (lbn < 0)
 221                 return (EFBIG);
 222
 223         /*
 224          * The first NDADDR blocks are direct blocks.
 225          */
 226         if (lbn < NDADDR) {
                     /*
                      * Look no further than the remaining direct pointers.
                      * chkfrag is 1, so DOEXTENT clips the length using
                      * ip->i_size.
                      */
 227                 DOEXTENT(fs, lbn, boff, bnp, lenp,
 228                     ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
 229                     ufsvfsp->vfs_iotransz);
 230                 return (0);
 231         }
 232
 233         nindirshift = ufsvfsp->vfs_nindirshift;
 234         nindiroffset = ufsvfsp->vfs_nindiroffset;
 235         /*
 236          * Determine how many levels of indirection.
 237          */
 238         shft = 0;                               /* sh = 1 */
 239         tbn = lbn - NDADDR;
 240         for (j = NIADDR; j > 0; j--) {
 241                 longlong_t      sh;
 242
 243                 shft += nindirshift;            /* sh *= nindir */
 244                 sh = 1LL << shft;
 245                 if (tbn < sh)
 246                         break;
 247                 tbn -= sh;
 248         }
             /*
              * If j > 0 here, the block is reached through
              * ip->i_ib[NIADDR - j] (j == NIADDR selects i_ib[0]), tbn is
              * its index relative to the first block covered by that entry,
              * and the walk below reads NIADDR - j + 1 indirect blocks.
              * j == 0 means no indirection level covers lbn.
              */
 249         if (j == 0)
 250                 return (EFBIG);
 251
 252         /*
 253          * Fetch the first indirect block.
 254          */
 255         nb = ip->i_ib[NIADDR - j];
 256         if (nb == 0) {
 257                 *bnp = UFS_HOLE;
 258                 return (0);
 259         }
 260
 261         /*
 262          * Fetch through the indirect blocks.
 263          */
 264         for (; j <= NIADDR; j++) {
 265                 ob = nb;
 266                 bp = UFS_BREAD(ufsvfsp,
 267                     ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
 268                 if (bp->b_flags & B_ERROR) {
 269                         brelse(bp);
 270                         return (EIO);
 271                 }
 272                 bap = bp->b_un.b_daddr;
 273
 274                 ASSERT(!ufs_indir_badblock(ip, bap));
 275
 276                 shft -= nindirshift;            /* sh / nindir */
 277                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
 278                 nb = bap[i];
 279                 if (nb == 0) {
 280                         *bnp = UFS_HOLE;
 281                         brelse(bp);
 282                         return (0);
 283                 }
                     /*
                      * bap comes from bp (b_un.b_daddr), so bp is released
                      * only after bap[i] has been read.  The buffer from the
                      * final pass stays held because DOEXTENT below still
                      * reads through &bap[i].
                      */
 284                 if (j != NIADDR)
 285                         brelse(bp);
 286         }
             /*
              * Limit the search to the entries left in this indirect block
              * and to the blocks up to the one holding the last byte of the
              * file.  chkfrag is 0 here, so no clipping against i_size is
              * done inside DOEXTENT.
              */
 287         DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
 288             MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
 289             0, ufsvfsp->vfs_iotransz);
 290         brelse(bp);
 291         return (0);
 292 }
 293
 294 /*
 295  * See bmap_read for general notes.
 296  *
 297  * The block must be at least size bytes and will be extended or
 298  * allocated as needed.  If alloc_type is of type BI_ALLOC_ONLY, then bmap
 299  * will not create any in-core pages that correspond to the new disk allocation.
 300  * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
 301  * and security is maintained b/c upon reading a negative block number pages
 302  * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
 303  * be created and initialized as needed.
 304  *
 305  * Returns 0 on success, or a non-zero errno if an error occurs.
 306  */
 307 int
 308 bmap_write(struct inode *ip, u_offset_t off, int size,
 309     enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
 310 {
 311         struct  fs *fs;
 312         struct  buf *bp;
 313         int     i;
 314         struct  buf *nbp;
 315         int     j;
 316         int     shft;                           /* we maintain sh = 1 << shft */
 317         daddr_t ob, nb, pref, lbn, llbn, tbn;
 318         daddr32_t *bap;
 319         struct  vnode *vp = ITOV(ip);
 320         long    bsize = VBSIZE(vp);
 321         long    osize, nsize;
 322         int     issync, metaflag, isdirquota;
 323         int     err;
 324         dev_t   dev;
 325         struct  fbuf *fbp;
 326         int     nindirshift;
 327         int     nindiroffset;
 328         struct  ufsvfs  *ufsvfsp;
 329         int     added_sectors;          /* sectors added to this inode */
 330         int     alloced_blocks;         /* fs blocks newly allocated */
 331         struct  ufs_allocated_block undo_table[NIADDR+1];
 332         int     verylargefile = 0;
 333
 334         ASSERT(RW_WRITE_HELD(&ip->i_contents));
 335
 336         if (allocblk)
 337                 *allocblk = 0;
 338
 339         ufsvfsp = ip->i_ufsvfs;
 340         fs = ufsvfsp->vfs_bufp->b_un.b_fs;
 341         lbn = (daddr_t)lblkno(fs, off);
 342         if (lbn < 0)
 343                 return (EFBIG);
 344         if (ip->i_blocks >= VERYLARGEFILESIZE)
 345                 verylargefile = 1;
 346         llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
 347         metaflag = isdirquota = 0;
 348         if (((ip->i_mode & IFMT) == IFDIR) ||
 349             ((ip->i_mode & IFMT) == IFATTRDIR))
 350                 isdirquota = metaflag = I_DIR;
 351         else if ((ip->i_mode & IFMT) == IFSHAD)
 352                 metaflag = I_SHAD;
 353         else if (ip->i_ufsvfs->vfs_qinod == ip)
 354                 isdirquota = metaflag = I_QUOTA;
 355
 356         issync = ((ip->i_flag & ISYNC) != 0);
 357
 358         if (isdirquota || issync) {
 359                 alloc_type = BI_NORMAL; /* make sure */
 360         }
 361
 362         /*
 363          * If the next write will extend the file into a new block,
 364          * and the file is currently composed of a fragment
 365          * this fragment has to be extended to be a full block.
 366          */
 367         if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
 368                 osize = blksize(fs, ip, llbn);
 369                 if (osize < bsize && osize > 0) {
 370                         /*
 371                          * Check to see if doing this will make the file too
 372                          * big.  Only check if we are dealing with a very
 373                          * large file.
 374                          */
 375                         if (verylargefile == 1) {
 376                                 if (((unsigned)ip->i_blocks +
 377                                     btodb(bsize - osize)) > INT_MAX) {
 378                                         return (EFBIG);
 379                                 }
 380                         }
 381                         /*
 382                          * Make sure we have all needed pages setup correctly.
 383                          *
 384                          * We pass S_OTHER to fbread here because we want
 385                          * an exclusive lock on the page in question
 386                          * (see ufs_getpage). I/O to the old block location
 387                          * may still be in progress and we are about to free
 388                          * the old block. We don't want anyone else to get
 389                          * a hold of the old block once we free it until
 390                          * the I/O is complete.
 391                          */
 392                         err =
 393                             fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
 394                             (uint_t)bsize, S_OTHER, &fbp);
 395                         if (err)
 396                                 return (err);
 397                         pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
 398                         err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
 399                             &nb, cr);
 400                         if (err) {
 401                                 if (fbp)
 402                                         fbrelse(fbp, S_OTHER);
 403                                 return (err);
 404                         }
 405                         ASSERT(!ufs_badblock(ip, nb));
 406
 407                         /*
 408                          * Update the inode before releasing the
 409                          * lock on the page. If we released the page
 410                          * lock first, the data could be written to its
 411                          * old address and then destroyed.
 412                          */
 413                         TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
 414                         ip->i_db[llbn] = nb;
 415                         UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
 416                             ip);
 417                         ip->i_blocks += btodb(bsize - osize);
 418                         ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 419                         TRANS_INODE(ufsvfsp, ip);
 420                         ip->i_flag |= IUPD | ICHG | IATTCHG;
 421
 422                         /* Caller is responsible for updating i_seq */
 423                         /*
 424                          * Don't check metaflag here, directories won't do this
 425                          *
 426                          */
 427                         if (issync) {
 428                                 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
 429                         } else {
 430                                 ASSERT(fbp);
 431                                 fbrelse(fbp, S_WRITE);
 432                         }
 433
 434                         if (nb != ob) {
 435                                 (void) free(ip, ob, (off_t)osize, metaflag);
 436                         }
 437                 }
 438         }
 439
 440         /*
 441          * The first NDADDR blocks are direct blocks.
 442          */
 443         if (lbn < NDADDR) {
 444                 nb = ip->i_db[lbn];
 445                 if (nb == 0 ||
 446                     ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
 447                         if (nb != 0) {
 448                                 /* consider need to reallocate a frag */
 449                                 osize = fragroundup(fs, blkoff(fs, ip->i_size));
 450                                 nsize = fragroundup(fs, size);
 451                                 if (nsize <= osize)
 452                                         goto gotit;
 453                                 /*
 454                                  * Check to see if doing this will make the
 455                                  * file too big.  Only check if we are dealing
 456                                  * with a very large file.
 457                                  */
 458                                 if (verylargefile == 1) {
 459                                         if (((unsigned)ip->i_blocks +
 460                                             btodb(nsize - osize)) > INT_MAX) {
 461                                                 return (EFBIG);
 462                                         }
 463                                 }
 464                                 /*
 465                                  * need to re-allocate a block or frag
 466                                  */
 467                                 ob = nb;
 468                                 pref = blkpref(ip, lbn, (int)lbn,
 469                                     &ip->i_db[0]);
 470                                 err = realloccg(ip, ob, pref, (int)osize,
 471                                     (int)nsize, &nb, cr);
 472                                 if (err)
 473                                         return (err);
 474                                 if (allocblk)
 475                                         *allocblk = nb;
 476                                 ASSERT(!ufs_badblock(ip, nb));
 477
 478                         } else {
 479                                 /*
 480                                  * need to allocate a block or frag
 481                                  */
 482                                 osize = 0;
 483                                 if (ip->i_size <
 484                                     ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
 485                                         nsize = fragroundup(fs, size);
 486                                 else
 487                                         nsize = bsize;
 488                                 /*
 489                                  * Check to see if doing this will make the
 490                                  * file too big.  Only check if we are dealing
 491                                  * with a very large file.
 492                                  */
 493                                 if (verylargefile == 1) {
 494                                         if (((unsigned)ip->i_blocks +
 495                                             btodb(nsize - osize)) > INT_MAX) {
 496                                                 return (EFBIG);
 497                                         }
 498                                 }
 499                                 pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
 500                                 err = alloc(ip, pref, (int)nsize, &nb, cr);
 501                                 if (err)
 502                                         return (err);
 503                                 if (allocblk)
 504                                         *allocblk = nb;
 505                                 ASSERT(!ufs_badblock(ip, nb));
 506                                 ob = nb;
 507                         }
 508
 509                         /*
 510                          * Read old/create new zero pages
 511                          */
 512                         fbp = NULL;
 513                         if (osize == 0) {
 514                                 /*
 515                                  * mmap S_WRITE faults always enter here
 516                                  */
 517                                 /*
 518                                  * We zero it if its also BI_FALLOCATE, but
 519                                  * only for direct blocks!
 520                                  */
 521                                 if (alloc_type == BI_NORMAL ||
 522                                     alloc_type == BI_FALLOCATE ||
 523                                     P2ROUNDUP_TYPED(size,
 524                                     PAGESIZE, u_offset_t) < nsize) {
 525                                         /* fbzero doesn't cause a pagefault */
 526                                         fbzero(ITOV(ip),
 527                                             ((offset_t)lbn << fs->fs_bshift),
 528                                             (uint_t)nsize, &fbp);
 529                                 }
 530                         } else {
 531                                 err = fbread(vp,
 532                                     ((offset_t)lbn << fs->fs_bshift),
 533                                     (uint_t)nsize, S_OTHER, &fbp);
 534                                 if (err) {
 535                                         if (nb != ob) {
 536                                                 (void) free(ip, nb,
 537                                                     (off_t)nsize, metaflag);
 538                                         } else {
 539                                                 (void) free(ip,
 540                                                     ob + numfrags(fs, osize),
 541                                                     (off_t)(nsize - osize),
 542                                                     metaflag);
 543                                         }
 544                                         ASSERT(nsize >= osize);
 545                                         (void) chkdq(ip,
 546                                             -(long)btodb(nsize - osize),
 547                                             0, cr, (char **)NULL,
 548                                             (size_t *)NULL);
 549                                         return (err);
 550                                 }
 551                         }
 552                         TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
 553                         ip->i_db[lbn] = nb;
 554                         ip->i_blocks += btodb(nsize - osize);
 555                         ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 556                         TRANS_INODE(ufsvfsp, ip);
 557                         ip->i_flag |= IUPD | ICHG | IATTCHG;
 558
 559                         /* Caller is responsible for updating i_seq */
 560
 561                         /*
 562                          * Write directory and shadow blocks synchronously so
 563                          * that they never appear with garbage in them on the
 564                          * disk.
 565                          *
 566                          */
 567                         if (isdirquota && (ip->i_size ||
 568                             TRANS_ISTRANS(ufsvfsp))) {
 569                         /*
 570                          * XXX may not be necessary with harpy trans
 571                          * bug id 1130055
 572                          */
 573                                 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
 574                         } else if (fbp) {
 575                                 fbrelse(fbp, S_WRITE);
 576                         }
 577
 578                         if (nb != ob)
 579                                 (void) free(ip, ob, (off_t)osize, metaflag);
 580                 }
 581 gotit:
 582                 return (0);
 583         }
 584
 585         added_sectors = alloced_blocks = 0;     /* No blocks alloced yet */
 586
 587         /*
 588          * Determine how many levels of indirection.
 589          */
 590         nindirshift = ip->i_ufsvfs->vfs_nindirshift;
 591         nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
 592         pref = 0;
 593         shft = 0;                               /* sh = 1 */
 594         tbn = lbn - NDADDR;
 595         for (j = NIADDR; j > 0; j--) {
 596                 longlong_t      sh;
 597
 598                 shft += nindirshift;            /* sh *= nindir */
 599                 sh = 1LL << shft;
 600                 if (tbn < sh)
 601                         break;
 602                 tbn -= sh;
 603         }
 604
 605         if (j == 0)
 606                 return (EFBIG);
 607
 608         /*
 609          * Fetch the first indirect block.
 610          */
 611         dev = ip->i_dev;
 612         nb = ip->i_ib[NIADDR - j];
 613         if (nb == 0) {
 614                 /*
 615                  * Check to see if doing this will make the
 616                  * file too big.  Only check if we are dealing
 617                  * with a very large file.
 618                  */
 619                 if (verylargefile == 1) {
 620                         if (((unsigned)ip->i_blocks + btodb(bsize))
 621                             > INT_MAX) {
 622                                 return (EFBIG);
 623                         }
 624                 }
 625                 /*
 626                  * Need to allocate an indirect block.
 627                  */
 628                 pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
 629                 err = alloc(ip, pref, (int)bsize, &nb, cr);
 630                 if (err)
 631                         return (err);
 632                 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
 633                 ASSERT(!ufs_badblock(ip, nb));
 634
 635                 /*
 636                  * Keep track of this allocation so we can undo it if we
 637                  * get an error later.
 638                  */
 639
 640                 ASSERT(alloced_blocks <= NIADDR);
 641
 642                 undo_table[alloced_blocks].this_block = nb;
 643                 undo_table[alloced_blocks].block_size = bsize;
 644                 undo_table[alloced_blocks].owner = ufs_no_owner;
 645                 undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
 646
 647                 alloced_blocks++;
 648
 649                 /*
 650                  * Write zero block synchronously so that
 651                  * indirect blocks never point at garbage.
 652                  */
 653                 bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);
 654
 655                 clrbuf(bp);
 656                 /* XXX Maybe special-case this? */
 657                 TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
 658                 UFS_BWRITE2(ufsvfsp, bp);
 659                 if (bp->b_flags & B_ERROR) {
 660                         err = geterror(bp);
 661                         brelse(bp);
 662                         ufs_undo_allocation(ip, alloced_blocks,
 663                             undo_table, added_sectors);
 664                         return (err);
 665                 }
 666                 brelse(bp);
 667
 668                 ip->i_ib[NIADDR - j] = nb;
 669                 added_sectors += btodb(bsize);
 670                 ip->i_blocks += btodb(bsize);
 671                 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 672                 TRANS_INODE(ufsvfsp, ip);
 673                 ip->i_flag |= IUPD | ICHG | IATTCHG;
 674                 /* Caller is responsible for updating i_seq */
 675
 676                 /*
 677                  * Update the 'undo table' now that we've linked this block
 678                  * to an inode.
 679                  */
 680
 681                 undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
 682                 undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
 683
 684                 /*
 685                  * In the ISYNC case, wrip will notice that the block
 686                  * count on the inode has changed and will be sure to
 687                  * ufs_iupdat the inode at the end of wrip.
 688                  */
 689         }
 690
 691         /*
 692          * Fetch through the indirect blocks.
 693          */
 694         for (; j <= NIADDR; j++) {
 695                 ob = nb;
 696                 bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
 697
 698                 if (bp->b_flags & B_ERROR) {
 699                         err = geterror(bp);
 700                         brelse(bp);
 701                         /*
 702                          * Return any partial allocations.
 703                          *
 704                          * It is possible that we have not yet made any
 705                          * allocations at this point (if this is the first
 706                          * pass through the loop and we didn't have to
 707                          * allocate the first indirect block, above).
 708                          * In this case, alloced_blocks and added_sectors will
 709                          * be zero, and ufs_undo_allocation will do nothing.
 710                          */
 711                         ufs_undo_allocation(ip, alloced_blocks,
 712                             undo_table, added_sectors);
 713                         return (err);
 714                 }
 715                 bap = bp->b_un.b_daddr;
 716                 shft -= nindirshift;            /* sh /= nindir */
 717                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
 718                 nb = bap[i];
 719
 720                 if (nb == 0) {
 721                         /*
 722                          * Check to see if doing this will make the
 723                          * file too big.  Only check if we are dealing
 724                          * with a very large file.
 725                          */
 726                         if (verylargefile == 1) {
 727                                 if (((unsigned)ip->i_blocks + btodb(bsize))
 728                                     > INT_MAX) {
 729                                         brelse(bp);
 730                                         ufs_undo_allocation(ip, alloced_blocks,
 731                                             undo_table, added_sectors);
 732                                         return (EFBIG);
 733                                 }
 734                         }
 735                         if (pref == 0) {
 736                                 if (j < NIADDR) {
 737                                         /* Indirect block */
 738                                         pref = blkpref(ip, lbn, 0,
 739                                             (daddr32_t *)0);
 740                                 } else {
 741                                         /* Data block */
 742                                         pref = blkpref(ip, lbn, i, &bap[0]);
 743                                 }
 744                         }
 745
 746                         /*
 747                          * release "bp" buf to avoid deadlock (re-bread later)
 748                          */
 749                         brelse(bp);
 750
 751                         err = alloc(ip, pref, (int)bsize, &nb, cr);
 752                         if (err) {
 753                                 /*
 754                                  * Return any partial allocations.
 755                                  */
 756                                 ufs_undo_allocation(ip, alloced_blocks,
 757                                     undo_table, added_sectors);
 758                                 return (err);
 759                         }
 760
 761                         ASSERT(!ufs_badblock(ip, nb));
 762                         ASSERT(alloced_blocks <= NIADDR);
 763
 764                         if (allocblk)
 765                                 *allocblk = nb;
 766
 767                         undo_table[alloced_blocks].this_block = nb;
 768                         undo_table[alloced_blocks].block_size = bsize;
 769                         undo_table[alloced_blocks].owner = ufs_no_owner;
 770                         undo_table[alloced_blocks].usage_flags = metaflag |
 771                             ((j < NIADDR) ? I_IBLK : 0);
 772
 773                         alloced_blocks++;
 774
 775                         if (j < NIADDR) {
 776                                 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
 777                                 /*
 778                                  * Write synchronously so indirect
 779                                  * blocks never point at garbage.
 780                                  */
 781                                 nbp = UFS_GETBLK(
 782                                     ufsvfsp, dev, fsbtodb(fs, nb), bsize);
 783
 784                                 clrbuf(nbp);
 785                                 /* XXX Maybe special-case this? */
 786                                 TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
 787                                 UFS_BWRITE2(ufsvfsp, nbp);
 788                                 if (nbp->b_flags & B_ERROR) {
 789                                         err = geterror(nbp);
 790                                         brelse(nbp);
 791                                         /*
 792                                          * Return any partial
 793                                          * allocations.
 794                                          */
 795                                         ufs_undo_allocation(ip,
 796                                             alloced_blocks,
 797                                             undo_table, added_sectors);
 798                                         return (err);
 799                                 }
 800                                 brelse(nbp);
 801                         } else if (alloc_type == BI_NORMAL ||
 802                             P2ROUNDUP_TYPED(size,
 803                             PAGESIZE, u_offset_t) < bsize) {
 804                                 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
 805                                 fbzero(ITOV(ip),
 806                                     ((offset_t)lbn << fs->fs_bshift),
 807                                     (uint_t)bsize, &fbp);
 808
 809                                 /*
 810                                  * Cases which we need to do a synchronous
 811                                  * write of the zeroed data pages:
 812                                  *
 813                                  * 1) If we are writing a directory then we
 814                                  * want to write synchronously so blocks in
 815                                  * directories never contain garbage.
 816                                  *
 817                                  * 2) If we are filling in a hole and the
 818                                  * indirect block is going to be synchronously
 819                                  * written back below we need to make sure
 820                                  * that the zeroes are written here before
 821                                  * the indirect block is updated so that if
 822                                  * we crash before the real data is pushed
 823                                  * we will not end up with random data in
 824                                  * the middle of the file.
 825                                  *
 826                                  * 3) If the size of the request rounded up
 827                                  * to the system page size is smaller than
 828                                  * the file system block size, we want to
 829                                  * write out all the pages now so that
 830                                  * they are not aborted before they actually
 831                                  * make it to ufs_putpage since the length
 832                                  * of the inode will not include the pages.
 833                                  */
 834
 835                                 if (isdirquota || (issync &&
 836                                     lbn < llbn))
 837                                         (void) ufs_fbiwrite(fbp, ip, nb,
 838                                             fs->fs_fsize);
 839                                 else
 840                                         fbrelse(fbp, S_WRITE);
 841                         }
 842
 843                         /*
 844                          * re-acquire "bp" buf
 845                          */
 846                         bp = UFS_BREAD(ufsvfsp,
 847                             ip->i_dev, fsbtodb(fs, ob), bsize);
 848                         if (bp->b_flags & B_ERROR) {
 849                                 err = geterror(bp);
 850                                 brelse(bp);
 851                                 /*
 852                                  * Return any partial allocations.
 853                                  */
 854                                 ufs_undo_allocation(ip,
 855                                     alloced_blocks,
 856                                     undo_table, added_sectors);
 857                                 return (err);
 858                         }
 859                         bap = bp->b_un.b_daddr;
 860                         bap[i] = nb;
 861
 862                         /*
 863                          * The magic explained: j will be equal to NIADDR
 864                          * when we are at the lowest level, this is where the
 865                          * array entries point directly to data blocks. Since
 866                          * we will be 'fallocate'ing we will go ahead and negate
 867                          * the addresses.
 868                          */
 869                         if (alloc_type == BI_FALLOCATE && j == NIADDR)
 870                                 bap[i] = -bap[i];
 871
 872                         TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
 873                         added_sectors += btodb(bsize);
 874                         ip->i_blocks += btodb(bsize);
 875                         ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 876                         TRANS_INODE(ufsvfsp, ip);
 877                         ip->i_flag |= IUPD | ICHG | IATTCHG;
 878
 879                         /* Caller is responsible for updating i_seq */
 880
 881                         undo_table[alloced_blocks-1].owner =
 882                             ufs_indirect_block;
 883                         undo_table[alloced_blocks-1].owner_block = ob;
 884                         undo_table[alloced_blocks-1].owner_offset = i;
 885
 886                         if (issync) {
 887                                 UFS_BWRITE2(ufsvfsp, bp);
 888                                 if (bp->b_flags & B_ERROR) {
 889                                         err = geterror(bp);
 890                                         brelse(bp);
 891                                         /*
 892                                          * Return any partial
 893                                          * allocations.
 894                                          */
 895                                         ufs_undo_allocation(ip,
 896                                             alloced_blocks,
 897                                             undo_table, added_sectors);
 898                                         return (err);
 899                                 }
 900                                 brelse(bp);
 901                         } else {
 902                                 bdrwrite(bp);
 903                         }
 904                 } else {
 905                         brelse(bp);
 906                 }
 907         }
 908         return (0);
 909 }
 910
 911 /*
 912  * Return 1 if inode has unmapped blocks (UFS holes) or if another thread
 913  * is in the critical region of wrip().
 914  */
 915 int
 916 bmap_has_holes(struct inode *ip)
 917 {
 918         struct fs *fs = ip->i_fs;
 919         uint_t  dblks;                  /* # of data blocks */
 920         uint_t  mblks;                  /* # of data + metadata blocks */
 921         int     nindirshift;
 922         int     nindiroffset;
 923         uint_t  cnt;
 924         int     n, j, shft;
 925         uint_t nindirblks;
 926
 927         int     fsbshift = fs->fs_bshift;
 928         int     fsboffset = (1 << fsbshift) - 1;
 929
 930         /*
 931          * Check for writer in critical region, if found then we
 932          * cannot trust the values of i_size and i_blocks
 933          * simply return true.
 934          */
 935         if (ip->i_writer != NULL && ip->i_writer != curthread) {
 936                 return (1);
 937         }
 938
 939         dblks = (ip->i_size + fsboffset) >> fsbshift;
 940         mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
 941
 942         /*
 943          * File has only direct blocks.
 944          */
 945         if (dblks <= NDADDR)
 946                 return (mblks < dblks);
 947         nindirshift = ip->i_ufsvfs->vfs_nindirshift;
 948
 949         nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
 950         nindirblks = nindiroffset + 1;
 951
 952         dblks -= NDADDR;
 953         shft = 0;
 954         /*
 955          * Determine how many levels of indirection.
 956          */
 957         for (j = NIADDR; j > 0; j--) {
 958                 longlong_t      sh;
 959
 960                 shft += nindirshift;    /* sh *= nindir */
 961                 sh = 1LL << shft;
 962                 if (dblks <= sh)
 963                         break;
 964                 dblks -= sh;
 965         }
 966         /* LINTED: warning: logical expression always true: op "||" */
 967         ASSERT(NIADDR <= 3);
 968         ASSERT(j <= NIADDR);
 969         if (j == NIADDR)        /* single level indirection */
 970                 cnt = NDADDR + 1 + dblks;
 971         else if (j == NIADDR-1) /* double indirection */
 972                 cnt = NDADDR + 1 + nindirblks +
 973                     1 + (dblks + nindiroffset)/nindirblks + dblks;
 974         else if (j == NIADDR-2) { /* triple indirection */
 975                 n = (dblks + nindiroffset)/nindirblks;
 976                 cnt = NDADDR + 1 + nindirblks +
 977                     1 + nindirblks + nindirblks*nindirblks +
 978                     1 + (n + nindiroffset)/nindirblks + n + dblks;
 979         }
 980
 981         return (mblks < cnt);
 982 }
 983
 984 /*
 985  * find some contig blocks starting at *sbp and going for min(n, max_contig)
 986  * return the number of blocks (not frags) found.
 987  * The array passed in must be at least [0..n-1].
 988  */
 989 static int
 990 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
 991 {
 992         register daddr_t bn, nextbn;
 993         register daddr32_t *bp;
 994         register int diff;
 995         int maxtransblk;
 996
 997         if (n <= 0)
 998                 return (0);
 999         bn = *sbp;
1000         if (bn == 0)
1001                 return (0);
1002
1003         diff = fs->fs_frag;
1004         if (*lenp) {
1005                 n = MIN(n, lblkno(fs, *lenp));
1006         } else {
1007                 /*
1008                  * If the user has set the value for maxcontig lower than
1009                  * the drive transfer size, then assume they want this
1010                  * to be the maximum value for the size of the data transfer.
1011                  */
1012                 maxtransblk = maxtransfer >> DEV_BSHIFT;
1013                 if (fs->fs_maxcontig < maxtransblk) {
1014                         n = MIN(n, fs->fs_maxcontig);
1015                 } else {
1016                         n = MIN(n, maxtransblk);
1017                 }
1018         }
1019         bp = sbp;
1020         while (--n > 0) {
1021                 nextbn = *(bp + 1);
1022                 if (nextbn == 0 || bn + diff != nextbn)
1023                         break;
1024                 bn = nextbn;
1025                 bp++;
1026         }
1027         return ((int)(bp - sbp) + 1);
1028 }
1029
1030 /*
1031  * Free any blocks which had been successfully allocated.  Always called
1032  * as a result of an error, so we don't bother returning an error code
1033  * from here.
1034  *
1035  * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1036  * Thus it is safe to call this as part of error handling, whether or not
1037  * any blocks have been allocated.
1038  *
1039  * The ufs_inode_direct case is currently unused.
1040  */
1041
1042 static void
1043 ufs_undo_allocation(
1044         inode_t *ip,
1045         int block_count,
1046         struct ufs_allocated_block table[],
1047         int inode_sector_adjust)
1048 {
1049         int i;
1050         int inode_changed;
1051         int error_updating_pointers;
1052         struct ufsvfs *ufsvfsp;
1053
1054         inode_changed = 0;
1055         error_updating_pointers = 0;
1056
1057         ufsvfsp = ip->i_ufsvfs;
1058
1059         /*
1060          * Update pointers on disk before freeing blocks.  If we fail,
1061          * some blocks may remain busy; but they will be reclaimed by
1062          * an fsck.  (This is better than letting a block wind up with
1063          * two owners if we successfully freed it but could not remove
1064          * the pointer to it.)
1065          */
1066
1067         for (i = 0; i < block_count; i++) {
1068                 switch (table[i].owner) {
1069                 case ufs_no_owner:
1070                         /* Nothing to do here, nobody points to us */
1071                         break;
1072                 case ufs_inode_direct:
1073                         ASSERT(table[i].owner_offset < NDADDR);
1074                         ip->i_db[table[i].owner_offset] = 0;
1075                         inode_changed = 1;
1076                         break;
1077                 case ufs_inode_indirect:
1078                         ASSERT(table[i].owner_offset < NIADDR);
1079                         ip->i_ib[table[i].owner_offset] = 0;
1080                         inode_changed = 1;
1081                         break;
1082                 case ufs_indirect_block: {
1083                         buf_t *bp;
1084                         daddr32_t *block_data;
1085
1086                         /* Read/modify/log/write. */
1087
1088                         ASSERT(table[i].owner_offset <
1089                             (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
1090
1091                         bp = UFS_BREAD(ufsvfsp, ip->i_dev,
1092                             fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
1093                             VBSIZE(ITOV(ip)));
1094
1095                         if (bp->b_flags & B_ERROR) {
1096                                 /* Couldn't read this block; give up. */
1097                                 error_updating_pointers = 1;
1098                                 brelse(bp);
1099                                 break;          /* out of SWITCH */
1100                         }
1101
1102                         block_data = bp->b_un.b_daddr;
1103                         block_data[table[i].owner_offset] = 0;
1104
1105                         /* Write a log entry which includes the zero. */
1106                         /* It might be possible to optimize this by using */
1107                         /* TRANS_BUF directly and zeroing only the four */
1108                         /* bytes involved, but an attempt to do that led */
1109                         /* to panics in the logging code.  The attempt was */
1110                         /* TRANS_BUF(ufsvfsp,                             */
1111                         /*    table[i].owner_offset * sizeof (daddr32_t), */
1112                         /*    sizeof (daddr32_t),                         */
1113                         /*    bp,                                         */
1114                         /*    DT_ABZERO);                                 */
1115
1116                         TRANS_BUF_ITEM_128(ufsvfsp,
1117                             block_data[table[i].owner_offset],
1118                             block_data, bp, DT_AB);
1119
1120                         /* Now we can write the buffer itself. */
1121
1122                         UFS_BWRITE2(ufsvfsp, bp);
1123
1124                         if (bp->b_flags & B_ERROR) {
1125                                 error_updating_pointers = 1;
1126                         }
1127
1128                         brelse(bp);
1129                         break;
1130                 }
1131                 default:
1132                         (void) ufs_fault(ITOV(ip),
1133                             "ufs_undo_allocation failure\n");
1134                         break;
1135                 }
1136         }
1137
1138         /*
1139          * If the inode changed, or if we need to update its block count,
1140          * then do that now.  We update the inode synchronously on disk
1141          * to ensure that it won't transiently point at a block we've
1142          * freed (only necessary if we're not logging).
1143          *
1144          * NOTE: Currently ufs_iupdat() does not check for errors.  When
1145          * it is fixed, we should verify that we successfully updated the
1146          * inode before freeing blocks below.
1147          */
1148
1149         if (inode_changed || (inode_sector_adjust != 0)) {
1150                 ip->i_blocks -= inode_sector_adjust;
1151                 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
1152                 TRANS_INODE(ufsvfsp, ip);
1153                 ip->i_flag |= IUPD | ICHG | IATTCHG;
1154                 ip->i_seq++;
1155                 if (!TRANS_ISTRANS(ufsvfsp))
1156                         ufs_iupdat(ip, I_SYNC);
1157         }
1158
1159         /*
1160          * Now we go through and actually free the blocks, but only if we
1161          * successfully removed the pointers to them.
1162          */
1163
1164         if (!error_updating_pointers) {
1165                 for (i = 0; i < block_count; i++) {
1166                         free(ip, table[i].this_block, table[i].block_size,
1167                             table[i].usage_flags);
1168                 }
1169         }
1170 }
1171
1172 /*
1173  * Find the next hole or data block in file starting at *off
1174  * Return found offset in *off, which can be less than the
1175  * starting offset if not block aligned.
1176  * This code is based on bmap_read().
1177  * Errors: ENXIO for end of file
1178  *         EIO for block read error.
1179  */
1180 int
1181 bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
1182 {
1183         ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
1184         struct fs *fs = ufsvfsp->vfs_fs;
1185         buf_t *bp[NIADDR];
1186         int i, j;
1187         int shft;                       /* we maintain sh = 1 << shft */
1188         int nindirshift, nindiroffset;
1189         daddr_t ob, nb, tbn, lbn, skip;
1190         daddr32_t *bap;
1191         u_offset_t isz = (offset_t)ip->i_size;
1192         int32_t bs = fs->fs_bsize; /* file system block size */
1193         int32_t nindir = fs->fs_nindir;
1194         dev_t dev;
1195         int error = 0;
1196         daddr_t limits[NIADDR];
1197
1198         ASSERT(*off < isz);
1199         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1200         lbn = (daddr_t)lblkno(fs, *off);
1201         ASSERT(lbn >= 0);
1202
1203         for (i = 0; i < NIADDR; i++)
1204                 bp[i] = NULL;
1205
1206         /*
1207          * The first NDADDR blocks are direct blocks.
1208          */
1209         if (lbn < NDADDR) {
1210                 for (; lbn < NDADDR; lbn++) {
1211                         if ((hole && (ip->i_db[lbn] == 0)) ||
1212                             (!hole && (ip->i_db[lbn] != 0))) {
1213                                 goto out;
1214                         }
1215                 }
1216                 if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1217                         goto out;
1218         }
1219
1220         nindir = fs->fs_nindir;
1221         nindirshift = ufsvfsp->vfs_nindirshift;
1222         nindiroffset = ufsvfsp->vfs_nindiroffset;
1223         dev = ip->i_dev;
1224
1225         /* Set up limits array */
1226         for (limits[0] = NDADDR, j = 1; j  < NIADDR; j++)
1227                 limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
1228
1229 loop:
1230         /*
1231          * Determine how many levels of indirection.
1232          */
1233         shft = 0;                               /* sh = 1 */
1234         tbn = lbn - NDADDR;
1235         for (j = NIADDR; j > 0; j--) {
1236                 longlong_t sh;
1237
1238                 shft += nindirshift;            /* sh *= nindir */
1239                 sh = 1LL << shft;
1240                 if (tbn < sh)
1241                         break;
1242                 tbn -= sh;
1243         }
1244         if (j == 0) {
1245                 /* must have passed end of file */
1246                 ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
1247                 goto out;
1248         }
1249
1250         /*
1251          * Fetch the first indirect block.
1252          */
1253         nb = ip->i_ib[NIADDR - j];
1254         if (nb == 0) {
1255                 if (hole) {
1256                         lbn = limits[NIADDR - j];
1257                         goto out;
1258                 } else {
1259                         lbn = limits[NIADDR - j + 1];
1260                         if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1261                                 goto out;
1262                         goto loop;
1263                 }
1264         }
1265
1266         /*
1267          * Fetch through the indirect blocks.
1268          */
1269         for (; ((j <= NIADDR) && (nb != 0)); j++) {
1270                 ob = nb;
1271                 /*
1272                  * if there's a different block at this level then release
1273                  * the old one and in with the new.
1274                  */
1275                 if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
1276                         if (bp[j-1] != NULL)
1277                                 brelse(bp[j-1]);
1278                         bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
1279                         if (bp[j-1]->b_flags & B_ERROR) {
1280                                 error = EIO;
1281                                 goto out;
1282                         }
1283                 }
1284                 bap = bp[j-1]->b_un.b_daddr;
1285
1286                 shft -= nindirshift;            /* sh / nindir */
1287                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1288                 nb = bap[i];
1289                 skip = 1LL << (nindirshift * (NIADDR - j));
1290         }
1291
1292         /*
1293          * Scan through the blocks in this array.
1294          */
1295         for (; i < nindir; i++, lbn += skip) {
1296                 if (hole && (bap[i] == 0))
1297                         goto out;
1298                 if (!hole && (bap[i] != 0)) {
1299                         if (skip == 1) {
1300                                 /* we're at the lowest level */
1301                                 goto out;
1302                         } else {
1303                                 goto loop;
1304                         }
1305                 }
1306         }
1307         if (((u_offset_t)lbn << fs->fs_bshift) < isz)
1308                 goto loop;
1309 out:
1310         for (i = 0; i < NIADDR; i++) {
1311                 if (bp[i])
1312                         brelse(bp[i]);
1313         }
1314         if (error == 0) {
1315                 if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
1316                         error = ENXIO;
1317                 } else {
1318                         /* success */
1319                         *off = (u_offset_t)lbn << fs->fs_bshift;
1320                 }
1321         }
1322         return (error);
1323 }
1324
1325 /*
1326  * Set a particular offset in the inode list to be a certain block.
1327  * User is responsible for calling TRANS* functions
1328  */
1329 int
1330 bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
1331 {
1332         daddr_t lbn;
1333         struct inode *ip;
1334         ufsvfs_t *ufsvfsp;
1335         struct  fs *fs;
1336         struct  buf *bp;
1337         int     i, j;
1338         int     shft;                   /* we maintain sh = 1 << shft */
1339         int err;
1340         daddr_t ob, nb, tbn;
1341         daddr32_t *bap;
1342         int     nindirshift, nindiroffset;
1343
1344         ip = VTOI(vp);
1345         ufsvfsp = ip->i_ufsvfs;
1346         fs = ufsvfsp->vfs_fs;
1347         lbn = (daddr_t)lblkno(fs, off);
1348
1349         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1350
1351         if (lbn < 0)
1352                 return (EFBIG);
1353
1354         /*
1355          * Take care of direct block assignment
1356          */
1357         if (lbn < NDADDR) {
1358                 ip->i_db[lbn] = bn;
1359                 return (0);
1360         }
1361
1362         nindirshift = ip->i_ufsvfs->vfs_nindirshift;
1363         nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
1364         /*
1365          * Determine how many levels of indirection.
1366          */
1367         shft = 0;                               /* sh = 1 */
1368         tbn = lbn - NDADDR;
1369         for (j = NIADDR; j > 0; j--) {
1370                 longlong_t      sh;
1371
1372                 shft += nindirshift;            /* sh *= nindir */
1373                 sh = 1LL << shft;
1374                 if (tbn < sh)
1375                         break;
1376                 tbn -= sh;
1377         }
1378         if (j == 0)
1379                 return (EFBIG);
1380
1381         /*
1382          * Fetch the first indirect block.
1383          */
1384         nb = ip->i_ib[NIADDR - j];
1385         if (nb == 0) {
1386                 err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1387                 return (err);
1388         }
1389
1390         /*
1391          * Fetch through the indirect blocks.
1392          */
1393         for (; j <= NIADDR; j++) {
1394                 ob = nb;
1395                 bp = UFS_BREAD(ufsvfsp,
1396                     ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
1397                 if (bp->b_flags & B_ERROR) {
1398                         err = geterror(bp);
1399                         brelse(bp);
1400                         return (err);
1401                 }
1402                 bap = bp->b_un.b_daddr;
1403
1404                 ASSERT(!ufs_indir_badblock(ip, bap));
1405
1406                 shft -= nindirshift;            /* sh / nindir */
1407                 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1408
1409                 nb = bap[i];
1410                 if (nb == 0) {
1411                         err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1412                         return (err);
1413                 }
1414
1415                 if (j == NIADDR) {
1416                         bap[i] = bn;
1417                         bdrwrite(bp);
1418                         return (0);
1419                 }
1420
1421                 brelse(bp);
1422         }
1423         return (0);
1424 }