kernel/fs/ufs/ufs_alloc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 #include <sys/condvar_impl.h>
  40 #include <sys/types.h>
  41 #include <sys/t_lock.h>
  42 #include <sys/debug.h>
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/signal.h>
  46 #include <sys/cred.h>
  47 #include <sys/proc.h>
  48 #include <sys/disp.h>
  49 #include <sys/user.h>
  50 #include <sys/buf.h>
  51 #include <sys/vfs.h>
  52 #include <sys/vnode.h>
  53 #include <sys/acl.h>
  54 #include <sys/fs/ufs_fs.h>
  55 #include <sys/fs/ufs_inode.h>
  56 #include <sys/fs/ufs_acl.h>
  57 #include <sys/fs/ufs_bio.h>
  58 #include <sys/fs/ufs_quota.h>
  59 #include <sys/kmem.h>
  60 #include <sys/fs/ufs_trans.h>
  61 #include <sys/fs/ufs_panic.h>
  62 #include <sys/errno.h>
  63 #include <sys/time.h>
  64 #include <sys/sysmacros.h>
  65 #include <sys/file.h>
  66 #include <sys/fcntl.h>
  67 #include <sys/flock.h>
  68 #include <sys/fs_subr.h>
  69 #include <sys/cmn_err.h>
  70 #include <sys/policy.h>
  71 #include <sys/fs/ufs_log.h>
  72
  73 static ino_t    hashalloc();
  74 static daddr_t  fragextend();
  75 static daddr_t  alloccg();
  76 static daddr_t  alloccgblk();
  77 static ino_t    ialloccg();
  78 static daddr_t  mapsearch();
  79 static int      findlogstartcg();
  80
  81 extern int      inside[], around[];
  82 extern uchar_t  *fragtbl[];
  83 void delay();
  84
  85 /*
  86  * Allocate a block in the file system.
  87  *
  88  * The size of the requested block is given, which must be some
  89  * multiple of fs_fsize and <= fs_bsize.
  90  * A preference may be optionally specified. If a preference is given
  91  * the following hierarchy is used to allocate a block:
  92  *   1) allocate the requested block.
  93  *   2) allocate a rotationally optimal block in the same cylinder.
  94  *   3) allocate a block in the same cylinder group.
  95  *   4) quadratically rehash into other cylinder groups, until an
  96  *      available block is located.
  97  * If no block preference is given the following hierarchy is used
  98  * to allocate a block:
  99  *   1) allocate a block in the cylinder group that contains the
 100  *      inode for the file.
 101  *   2) quadratically rehash into other cylinder groups, until an
 102  *      available block is located.
 103  */
 104 int
 105 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
 106 {
 107         struct fs *fs;
 108         struct ufsvfs *ufsvfsp;
 109         daddr_t bno;
 110         int cg;
 111         int err;
 112         char *errmsg = NULL;
 113         size_t len;
 114         clock_t now;
 115
 116         ufsvfsp = ip->i_ufsvfs;
 117         fs = ufsvfsp->vfs_fs;
 118         if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 119                 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
 120                     " bsize = %d, size = %d, fs = %s\n",
 121                     ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
 122                 return (err);
 123         }
 124         if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 125                 goto nospace;
 126         if (freespace(fs, ufsvfsp) <= 0 &&
 127             secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
 128                 goto nospace;
 129         err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
 130         /* Note that may not have err, but may have errmsg */
 131         if (errmsg != NULL) {
 132                 uprintf(errmsg);
 133                 kmem_free(errmsg, len);
 134                 errmsg = NULL;
 135         }
 136         if (err)
 137                 return (err);
 138         if (bpref >= fs->fs_size)
 139                 bpref = 0;
 140         if (bpref == 0)
 141                 cg = (int)itog(fs, ip->i_number);
 142         else
 143                 cg = dtog(fs, bpref);
 144
 145         bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
 146             (ulong_t (*)())alloccg);
 147         if (bno > 0) {
 148                 *bnp = bno;
 149                 return (0);
 150         }
 151
 152         /*
 153          * hashalloc() failed because some other thread grabbed
 154          * the last block so unwind the quota operation.  We can
 155          * ignore the return because subtractions don't fail and
 156          * size is guaranteed to be >= zero by our caller.
 157          */
 158         (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL, NULL);
 159
 160 nospace:
 161         now = ddi_get_lbolt();
 162         mutex_enter(&ufsvfsp->vfs_lock);
 163         if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
 164             (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
 165                 ufsvfsp->vfs_lastwhinetime = now;
 166                 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
 167         }
 168         mutex_exit(&ufsvfsp->vfs_lock);
 169         return (ENOSPC);
 170 }
 171
 172 /*
 173  * Reallocate a fragment to a bigger size
 174  *
  175  * The number and size of the old block are given, and a preference
  176  * and new size are also specified.  The allocator attempts to extend
 177  * the original block.  Failing that, the regular block allocator is
 178  * invoked to get an appropriate block.
 179  */
 180 int
 181 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
 182     int nsize, daddr_t *bnp, cred_t *cr)
 183 {
 184         daddr_t bno;
 185         struct fs *fs;
 186         struct ufsvfs *ufsvfsp;
 187         int cg, request;
 188         int err;
 189         char *errmsg = NULL;
 190         size_t len;
 191         clock_t now;
 192
 193         ufsvfsp = ip->i_ufsvfs;
 194         fs = ufsvfsp->vfs_fs;
 195         if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
 196             (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
 197                 err = ufs_fault(ITOV(ip),
 198                     "realloccg: bad size, dev=0x%lx, bsize=%d, "
 199                     "osize=%d, nsize=%d, fs=%s\n",
 200                     ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
 201                 return (err);
 202         }
 203         if (freespace(fs, ufsvfsp) <= 0 &&
 204             secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
 205                 goto nospace;
 206         if (bprev == 0) {
 207                 err = ufs_fault(ITOV(ip),
 208                     "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
 209                     " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
 210                     fs->fs_fsmnt);
 211                 return (err);
 212         }
 213         err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
 214         /* Note that may not have err, but may have errmsg */
 215         if (errmsg != NULL) {
 216                 uprintf(errmsg);
 217                 kmem_free(errmsg, len);
 218                 errmsg = NULL;
 219         }
 220         if (err)
 221                 return (err);
 222         cg = dtog(fs, bprev);
 223         bno = fragextend(ip, cg, (long)bprev, osize, nsize);
 224         if (bno != 0) {
 225                 *bnp = bno;
 226                 return (0);
 227         }
 228         if (bpref >= fs->fs_size)
 229                 bpref = 0;
 230
 231         /*
 232          * When optimizing for time we allocate a full block and
 233          * then only use the upper portion for this request. When
 234          * this file grows again it will grow into the unused portion
 235          * of the block (See fragextend() above).  This saves time
 236          * because an extra disk write would be needed if the frags
 237          * following the current allocation were not free. The extra
 238          * disk write is needed to move the data from its current
 239          * location into the newly allocated position.
 240          *
 241          * When optimizing for space we allocate a run of frags
 242          * that is just the right size for this request.
 243          */
 244         request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
 245         bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
 246             (ulong_t (*)())alloccg);
 247         if (bno > 0) {
 248                 *bnp = bno;
 249                 if (nsize < request)
 250                         (void) free(ip, bno + numfrags(fs, nsize),
 251                             (off_t)(request - nsize), I_NOCANCEL);
 252                 return (0);
 253         }
 254
 255         /*
 256          * hashalloc() failed because some other thread grabbed
 257          * the last block so unwind the quota operation.  We can
 258          * ignore the return because subtractions don't fail, and
 259          * our caller guarantees nsize >= osize.
 260          */
 261         (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
 262             NULL);
 263
 264 nospace:
 265         now = ddi_get_lbolt();
 266         mutex_enter(&ufsvfsp->vfs_lock);
 267         if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
 268             (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
 269                 ufsvfsp->vfs_lastwhinetime = now;
 270                 cmn_err(CE_NOTE,
 271                     "realloccg %s: file system full", fs->fs_fsmnt);
 272         }
 273         mutex_exit(&ufsvfsp->vfs_lock);
 274         return (ENOSPC);
 275 }
 276
 277 /*
 278  * Allocate an inode in the file system.
 279  *
 280  * A preference may be optionally specified. If a preference is given
 281  * the following hierarchy is used to allocate an inode:
 282  *   1) allocate the requested inode.
 283  *   2) allocate an inode in the same cylinder group.
 284  *   3) quadratically rehash into other cylinder groups, until an
 285  *      available inode is located.
 286  * If no inode preference is given the following hierarchy is used
 287  * to allocate an inode:
 288  *   1) allocate an inode in cylinder group 0.
 289  *   2) quadratically rehash into other cylinder groups, until an
 290  *      available inode is located.
 291  */
 292 int
 293 ufs_ialloc(struct inode *pip,
 294     ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
 295 {
 296         struct inode *ip;
 297         struct fs *fs;
 298         int cg;
 299         ino_t ino;
 300         int err;
 301         int nifree;
 302         struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
 303         char *errmsg = NULL;
 304         size_t len;
 305
 306         ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
 307         fs = pip->i_fs;
 308 loop:
 309         nifree = fs->fs_cstotal.cs_nifree;
 310
 311         if (nifree == 0)
 312                 goto noinodes;
 313         /*
 314          * Shadow inodes don't count against a user's inode allocation.
 315          * They are an implementation method and not a resource.
 316          */
 317         if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
 318                 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
 319                     /* change */ 1, NULL, crgetuid(cr), 0,
 320                     cr, &errmsg, &len);
 321                 /*
 322                  * As we haven't acquired any locks yet, dump the message
 323                  * now.
 324                  */
 325                 if (errmsg != NULL) {
 326                         uprintf(errmsg);
 327                         kmem_free(errmsg, len);
 328                         errmsg = NULL;
 329                 }
 330                 if (err)
 331                         return (err);
 332         }
 333
 334         if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
 335                 ipref = 0;
 336         cg = (int)itog(fs, ipref);
 337         ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
 338             (ulong_t (*)())ialloccg);
 339         if (ino == 0) {
 340                 if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
 341                         /*
 342                          * We can safely ignore the return from chkiq()
 343                          * because deallocations can only fail if we
 344                          * can't get the user's quota info record off
 345                          * the disk due to an I/O error.  In that case,
 346                          * the quota subsystem is already messed up.
 347                          */
 348                         (void) chkiq(ufsvfsp, /* change */ -1, NULL,
 349                             crgetuid(cr), 0, cr, (char **)NULL, NULL);
 350                 }
 351                 goto noinodes;
 352         }
 353         err = ufs_iget(pip->i_vfs, ino, ipp, cr);
 354         if (err) {
 355                 if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
 356                         /*
 357                          * See above comment about why it is safe to ignore an
 358                          * error return here.
 359                          */
 360                         (void) chkiq(ufsvfsp, /* change */ -1, NULL,
 361                             crgetuid(cr), 0, cr, (char **)NULL, NULL);
 362                 }
 363                 ufs_ifree(pip, ino, 0);
 364                 return (err);
 365         }
 366         ip = *ipp;
 367         ASSERT(!ip->i_ufs_acl);
 368         ASSERT(!ip->i_dquot);
 369         rw_enter(&ip->i_contents, RW_WRITER);
 370
 371         /*
 372          * Check if we really got a free inode, if not then complain
 373          * and mark the inode ISTALE so that it will be freed by the
 374          * ufs idle thread eventually and will not be sent to ufs_delete().
 375          */
 376         if (ip->i_mode || (ip->i_nlink > 0)) {
 377                 ip->i_flag |= ISTALE;
 378                 rw_exit(&ip->i_contents);
 379                 VN_RELE(ITOV(ip));
 380                 cmn_err(CE_WARN,
 381                     "%s: unexpected allocated inode %d, run fsck(8)%s",
 382                     fs->fs_fsmnt, (int)ino,
 383                     (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
 384                 goto loop;
 385         }
 386
 387         /*
 388          * Check the inode has no size or data blocks.
 389          * This could have happened if the truncation failed when
 390          * deleting the inode. It used to be possible for this to occur
 391          * if a block allocation failed when iteratively truncating a
 392          * large file using logging and with a full file system.
 393          * This was fixed with bug fix 4348738. However, truncation may
 394          * still fail on an IO error. So in all cases for safety and
 395          * security we clear out the size; the blocks allocated; and
 396          * pointers to the blocks. This will ultimately cause a fsck
 397          * error of un-accounted for blocks, but its a fairly benign error,
 398          * and possibly the correct thing to do anyway as accesssing those
 399          * blocks agains may lead to more IO errors.
 400          */
 401         if (ip->i_size || ip->i_blocks) {
 402                 int i;
 403
 404                 if (ip->i_size) {
 405                         cmn_err(CE_WARN,
 406                             "%s: free inode %d had size 0x%llx, run fsck(8)%s",
 407                             fs->fs_fsmnt, (int)ino, ip->i_size,
 408                             (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
 409                 }
 410                 /*
 411                  * Clear any garbage left behind.
 412                  */
 413                 ip->i_size = 0;
 414                 ip->i_blocks = 0;
 415                 for (i = 0; i < NDADDR; i++)
 416                         ip->i_db[i] = 0;
 417                 for (i = 0; i < NIADDR; i++)
 418                         ip->i_ib[i] = 0;
 419         }
 420
 421         /*
 422          * Initialize the link count
 423          */
 424         ip->i_nlink = 0;
 425
 426         /*
 427          * Clear the old flags
 428          */
 429         ip->i_flag &= IREF;
 430
 431         /*
 432          * Access times are not really defined if the fs is mounted
 433          * with 'noatime'. But it can cause nfs clients to fail
 434          * open() if the atime is not a legal value. Set a legal value
 435          * here when the inode is allocated.
 436          */
 437         if (ufsvfsp->vfs_noatime) {
 438                 mutex_enter(&ufs_iuniqtime_lock);
 439                 ip->i_atime = iuniqtime;
 440                 mutex_exit(&ufs_iuniqtime_lock);
 441         }
 442         rw_exit(&ip->i_contents);
 443         return (0);
 444 noinodes:
 445         if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
 446                 cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
 447         return (ENOSPC);
 448 }
 449
 450 /*
 451  * Find a cylinder group to place a directory.
 452  * Returns an inumber within the selected cylinder group.
 453  * Note, the vfs_lock is not needed as we don't require exact cg summary info.
 454  *
 455  * If the switch ufs_close_dirs is set, then the policy is to use
 456  * the current cg if it has more than 25% free inodes and more
 457  * than 25% free blocks. Otherwise the cgs are searched from
 458  * the beginning and the first cg with the same criteria is
  459  * used. If no cg meets the criteria then we revert to the old algorithm.
 460  * This tends to cluster files at the beginning of the disk
 461  * until the disk gets full.
 462  *
 463  * Otherwise if ufs_close_dirs is not set then the original policy is
 464  * used which is to select from among those cylinder groups with
 465  * above the average number of free inodes, the one with the smallest
 466  * number of directories.
 467  */
 468
 469 int ufs_close_dirs = 1; /* allocate directories close as possible */
 470
 471 ino_t
 472 dirpref(inode_t *dp)
 473 {
 474         int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
 475         struct fs *fs = dp->i_fs;
 476
 477         cg = itog(fs, dp->i_number);
 478         mininode = fs->fs_ipg >> 2;
 479         minbpg = fs->fs_maxbpg >> 2;
 480         if (ufs_close_dirs &&
 481             (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
 482             (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
 483                 return (dp->i_number);
 484         }
 485
 486         avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
 487         minndir = fs->fs_ipg;
 488         mincg = 0;
 489         for (cg = 0; cg < fs->fs_ncg; cg++) {
 490                 ifree = fs->fs_cs(fs, cg).cs_nifree;
 491                 if (ufs_close_dirs &&
 492                     (ifree > mininode) &&
 493                     (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
 494                         return ((ino_t)(fs->fs_ipg * cg));
 495                 }
 496                 if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
 497                     (ifree >= avgifree)) {
 498                         mincg = cg;
 499                         minndir = fs->fs_cs(fs, cg).cs_ndir;
 500                 }
 501         }
 502         return ((ino_t)(fs->fs_ipg * mincg));
 503 }
 504
 505 /*
 506  * Select the desired position for the next block in a file.  The file is
 507  * logically divided into sections. The first section is composed of the
 508  * direct blocks. Each additional section contains fs_maxbpg blocks.
 509  *
 510  * If no blocks have been allocated in the first section, the policy is to
 511  * request a block in the same cylinder group as the inode that describes
 512  * the file. If no blocks have been allocated in any other section, the
 513  * policy is to place the section in a cylinder group with a greater than
 514  * average number of free blocks.  An appropriate cylinder group is found
 515  * by using a rotor that sweeps the cylinder groups. When a new group of
 516  * blocks is needed, the sweep begins in the cylinder group following the
 517  * cylinder group from which the previous allocation was made. The sweep
 518  * continues until a cylinder group with greater than the average number
 519  * of free blocks is found. If the allocation is for the first block in an
 520  * indirect block, the information on the previous allocation is unavailable;
 521  * here a best guess is made based upon the logical block number being
 522  * allocated.
 523  *
 524  * If a section is already partially allocated, the policy is to
 525  * contiguously allocate fs_maxcontig blocks.  The end of one of these
 526  * contiguous blocks and the beginning of the next is physically separated
 527  * so that the disk head will be in transit between them for at least
 528  * fs_rotdelay milliseconds.  This is to allow time for the processor to
 529  * schedule another I/O transfer.
 530  */
 531 daddr_t
 532 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
 533 {
 534         struct fs *fs;
 535         struct ufsvfs *ufsvfsp;
 536         int cg;
 537         int avgbfree, startcg;
 538         daddr_t nextblk;
 539
 540         ufsvfsp = ip->i_ufsvfs;
 541         fs = ip->i_fs;
 542         if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
 543                 if (lbn < NDADDR) {
 544                         cg = itog(fs, ip->i_number);
 545                         return (fs->fs_fpg * cg + fs->fs_frag);
 546                 }
 547                 /*
 548                  * Find a cylinder with greater than average
 549                  * number of unused data blocks.
 550                  */
 551                 if (indx == 0 || bap[indx - 1] == 0)
 552                         startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
 553                 else
 554                         startcg = dtog(fs, bap[indx - 1]) + 1;
 555                 startcg %= fs->fs_ncg;
 556
 557                 mutex_enter(&ufsvfsp->vfs_lock);
 558                 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 559                 /*
 560                  * used for computing log space for writes/truncs
 561                  */
 562                 ufsvfsp->vfs_avgbfree = avgbfree;
 563                 for (cg = startcg; cg < fs->fs_ncg; cg++)
 564                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 565                                 fs->fs_cgrotor = cg;
 566                                 mutex_exit(&ufsvfsp->vfs_lock);
 567                                 return (fs->fs_fpg * cg + fs->fs_frag);
 568                         }
 569                 for (cg = 0; cg <= startcg; cg++)
 570                         if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 571                                 fs->fs_cgrotor = cg;
 572                                 mutex_exit(&ufsvfsp->vfs_lock);
 573                                 return (fs->fs_fpg * cg + fs->fs_frag);
 574                         }
 575                 mutex_exit(&ufsvfsp->vfs_lock);
 576                 return (0);
 577         }
 578         /*
 579          * One or more previous blocks have been laid out. If less
 580          * than fs_maxcontig previous blocks are contiguous, the
 581          * next block is requested contiguously, otherwise it is
 582          * requested rotationally delayed by fs_rotdelay milliseconds.
 583          */
 584
 585         nextblk = bap[indx - 1];
 586         /*
 587          * Provision for fallocate to return positive
 588          * blk preference based on last allocation
 589          */
 590         if (nextblk < 0 && nextblk != UFS_HOLE) {
 591                 nextblk = (-bap[indx - 1]) + fs->fs_frag;
 592         } else {
 593                 nextblk = bap[indx - 1] + fs->fs_frag;
 594         }
 595
 596         if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
 597             blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
 598                 return (nextblk);
 599         }
 600         if (fs->fs_rotdelay != 0)
 601                 /*
 602                  * Here we convert ms of delay to frags as:
 603                  * (frags) = (ms) * (rev/sec) * (sect/rev) /
 604                  *      ((sect/frag) * (ms/sec))
 605                  * then round up to the next block.
 606                  */
 607                 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
 608                     (NSPF(fs) * 1000), fs->fs_frag);
 609         return (nextblk);
 610 }
 611
 612 /*
 613  * Free a block or fragment.
 614  *
 615  * The specified block or fragment is placed back in the
 616  * free map. If a fragment is deallocated, a possible
 617  * block reassembly is checked.
 618  */
 619 void
 620 free(struct inode *ip, daddr_t bno, off_t size, int flags)
 621 {
 622         struct fs *fs = ip->i_fs;
 623         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 624         struct ufs_q *delq = &ufsvfsp->vfs_delete;
 625         struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
 626         struct cg *cgp;
 627         struct buf *bp;
 628         int cg, bmap, bbase;
 629         int i;
 630         uchar_t *blksfree;
 631         int *blktot;
 632         short *blks;
 633         daddr_t blkno, cylno, rpos;
 634
 635         /*
 636          * fallocate'd files will have negative block address.
 637          * So negate it again to get original block address.
 638          */
 639         if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
 640                 bno = -bno;
 641         }
 642
 643         if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 644                 (void) ufs_fault(ITOV(ip),
 645                     "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
 646                     "fs = %s\n", ip->i_dev, fs->fs_bsize,
 647                     (int)size, fs->fs_fsmnt);
 648                 return;
 649         }
 650         cg = dtog(fs, bno);
 651         ASSERT(!ufs_badblock(ip, bno));
 652         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
 653             (int)fs->fs_cgsize);
 654
 655         cgp = bp->b_un.b_cg;
 656         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 657                 brelse(bp);
 658                 return;
 659         }
 660
 661         if (!(flags & I_NOCANCEL))
 662                 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
 663         if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
 664                 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
 665         }
 666         blksfree = cg_blksfree(cgp);
 667         blktot = cg_blktot(cgp);
 668         mutex_enter(&ufsvfsp->vfs_lock);
 669         cgp->cg_time = gethrestime_sec();
 670         bno = dtogd(fs, bno);
 671         if (size == fs->fs_bsize) {
 672                 blkno = fragstoblks(fs, bno);
 673                 cylno = cbtocylno(fs, bno);
 674                 rpos = cbtorpos(ufsvfsp, bno);
 675                 blks = cg_blks(ufsvfsp, cgp, cylno);
 676                 if (!isclrblock(fs, blksfree, blkno)) {
 677                         mutex_exit(&ufsvfsp->vfs_lock);
 678                         brelse(bp);
 679                         (void) ufs_fault(ITOV(ip), "free: freeing free block, "
 680                             "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
 681                             ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
 682                         return;
 683                 }
 684                 setblock(fs, blksfree, blkno);
 685                 blks[rpos]++;
 686                 blktot[cylno]++;
 687                 cgp->cg_cs.cs_nbfree++;         /* Log below */
 688                 fs->fs_cstotal.cs_nbfree++;
 689                 fs->fs_cs(fs, cg).cs_nbfree++;
 690                 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
 691                         mutex_enter(&delq->uq_mutex);
 692                         delq_info->delq_unreclaimed_blocks -=
 693                             btodb(fs->fs_bsize);
 694                         mutex_exit(&delq->uq_mutex);
 695                 }
 696         } else {
 697                 bbase = bno - fragnum(fs, bno);
 698                 /*
 699                  * Decrement the counts associated with the old frags
 700                  */
 701                 bmap = blkmap(fs, blksfree, bbase);
 702                 fragacct(fs, bmap, cgp->cg_frsum, -1);
 703                 /*
 704                  * Deallocate the fragment
 705                  */
 706                 for (i = 0; i < numfrags(fs, size); i++) {
 707                         if (isset(blksfree, bno + i)) {
 708                                 brelse(bp);
 709                                 mutex_exit(&ufsvfsp->vfs_lock);
 710                                 (void) ufs_fault(ITOV(ip),
 711                                     "free: freeing free frag, "
 712                                     "dev:0x%lx, blk:%ld, cg:%d, "
 713                                     "ino:%lu, fs:%s",
 714                                     ip->i_dev,
 715                                     bno + i,
 716                                     cgp->cg_cgx,
 717                                     ip->i_number,
 718                                     fs->fs_fsmnt);
 719                                 return;
 720                         }
 721                         setbit(blksfree, bno + i);
 722                 }
 723                 cgp->cg_cs.cs_nffree += i;
 724                 fs->fs_cstotal.cs_nffree += i;
 725                 fs->fs_cs(fs, cg).cs_nffree += i;
 726                 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
 727                         mutex_enter(&delq->uq_mutex);
 728                         delq_info->delq_unreclaimed_blocks -=
 729                             btodb(i * fs->fs_fsize);
 730                         mutex_exit(&delq->uq_mutex);
 731                 }
 732                 /*
 733                  * Add back in counts associated with the new frags
 734                  */
 735                 bmap = blkmap(fs, blksfree, bbase);
 736                 fragacct(fs, bmap, cgp->cg_frsum, 1);
 737                 /*
 738                  * If a complete block has been reassembled, account for it
 739                  */
 740                 blkno = fragstoblks(fs, bbase);
 741                 if (isblock(fs, blksfree, blkno)) {
 742                         cylno = cbtocylno(fs, bbase);
 743                         rpos = cbtorpos(ufsvfsp, bbase);
 744                         blks = cg_blks(ufsvfsp, cgp, cylno);
 745                         blks[rpos]++;
 746                         blktot[cylno]++;
 747                         cgp->cg_cs.cs_nffree -= fs->fs_frag;
 748                         fs->fs_cstotal.cs_nffree -= fs->fs_frag;
 749                         fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
 750                         cgp->cg_cs.cs_nbfree++;
 751                         fs->fs_cstotal.cs_nbfree++;
 752                         fs->fs_cs(fs, cg).cs_nbfree++;
 753                 }
 754         }
 755         fs->fs_fmod = 1;
 756         ufs_notclean(ufsvfsp);
 757         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
 758         TRANS_SI(ufsvfsp, fs, cg);
 759         bdrwrite(bp);
 760 }
 761
 762 /*
 763  * Free an inode.
 764  *
 765  * The specified inode is placed back in the free map.
 766  */
 767 void
 768 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
 769 {
 770         struct fs *fs = ip->i_fs;
 771         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 772         struct cg *cgp;
 773         struct buf *bp;
 774         unsigned int inot;
 775         int cg;
 776         char *iused;
 777
 778         if (ip->i_number == ino && ip->i_mode != 0) {
 779                 (void) ufs_fault(ITOV(ip),
 780                     "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
 781                     "fs = %s\n",
 782                     ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
 783                 return;
 784         }
 785         if (ino >= fs->fs_ipg * fs->fs_ncg) {
 786                 (void) ufs_fault(ITOV(ip),
 787                     "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
 788                     (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
 789                 return;
 790         }
 791         cg = (int)itog(fs, ino);
 792         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
 793             (int)fs->fs_cgsize);
 794
 795         cgp = bp->b_un.b_cg;
 796         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 797                 brelse(bp);
 798                 return;
 799         }
 800         mutex_enter(&ufsvfsp->vfs_lock);
 801         cgp->cg_time = gethrestime_sec();
 802         iused = cg_inosused(cgp);
 803         inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
 804         if (isclr(iused, inot)) {
 805                 mutex_exit(&ufsvfsp->vfs_lock);
 806                 brelse(bp);
 807                 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
 808                     "mode: (imode) %o, (omode) %o, ino:%d, "
 809                     "fs:%s",
 810                     ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
 811                 return;
 812         }
 813         clrbit(iused, inot);
 814
 815         if (inot < (ulong_t)cgp->cg_irotor)
 816                 cgp->cg_irotor = inot;
 817         cgp->cg_cs.cs_nifree++;
 818         fs->fs_cstotal.cs_nifree++;
 819         fs->fs_cs(fs, cg).cs_nifree++;
 820         if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
 821                 cgp->cg_cs.cs_ndir--;
 822                 fs->fs_cstotal.cs_ndir--;
 823                 fs->fs_cs(fs, cg).cs_ndir--;
 824         }
 825         fs->fs_fmod = 1;
 826         ufs_notclean(ufsvfsp);
 827         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
 828         TRANS_SI(ufsvfsp, fs, cg);
 829         bdrwrite(bp);
 830 }
 831
 832 /*
 833  * Implement the cylinder overflow algorithm.
 834  *
 835  * The policy implemented by this algorithm is:
 836  *   1) allocate the block in its requested cylinder group.
 837  *   2) quadratically rehash on the cylinder group number.
 838  *   3) brute force search for a free block.
 839  * The size parameter means size for data blocks, mode for inodes.
 840  */
 841 static ino_t
 842 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
 843 {
 844         struct fs *fs;
 845         int i;
 846         long result;
 847         int icg = cg;
 848
 849         fs = ip->i_fs;
 850         /*
 851          * 1: preferred cylinder group
 852          */
 853         result = (*allocator)(ip, cg, pref, size);
 854         if (result)
 855                 return (result);
 856         /*
 857          * 2: quadratic rehash
 858          */
 859         for (i = 1; i < fs->fs_ncg; i *= 2) {
 860                 cg += i;
 861                 if (cg >= fs->fs_ncg)
 862                         cg -= fs->fs_ncg;
 863                 result = (*allocator)(ip, cg, 0, size);
 864                 if (result)
 865                         return (result);
 866         }
 867         /*
 868          * 3: brute force search
 869          * Note that we start at i == 2, since 0 was checked initially,
 870          * and 1 is always checked in the quadratic rehash.
 871          */
 872         cg = (icg + 2) % fs->fs_ncg;
 873         for (i = 2; i < fs->fs_ncg; i++) {
 874                 result = (*allocator)(ip, cg, 0, size);
 875                 if (result)
 876                         return (result);
 877                 cg++;
 878                 if (cg == fs->fs_ncg)
 879                         cg = 0;
 880         }
 881         return (0);
 882 }
 883
 884 /*
 885  * Determine whether a fragment can be extended.
 886  *
 887  * Check to see if the necessary fragments are available, and
 888  * if they are, allocate them.
 889  */
 890 static daddr_t
 891 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
 892 {
 893         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 894         struct fs *fs = ip->i_fs;
 895         struct buf *bp;
 896         struct cg *cgp;
 897         uchar_t *blksfree;
 898         long bno;
 899         int frags, bbase;
 900         int i, j;
 901
 902         if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
 903                 return (0);
 904         frags = numfrags(fs, nsize);
 905         bbase = (int)fragnum(fs, bprev);
 906         if (bbase > fragnum(fs, (bprev + frags - 1))) {
 907                 /* cannot extend across a block boundary */
 908                 return (0);
 909         }
 910
 911         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
 912             (int)fs->fs_cgsize);
 913         cgp = bp->b_un.b_cg;
 914         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 915                 brelse(bp);
 916                 return (0);
 917         }
 918
 919         blksfree = cg_blksfree(cgp);
 920         mutex_enter(&ufsvfsp->vfs_lock);
 921         bno = dtogd(fs, bprev);
 922         for (i = numfrags(fs, osize); i < frags; i++) {
 923                 if (isclr(blksfree, bno + i)) {
 924                         mutex_exit(&ufsvfsp->vfs_lock);
 925                         brelse(bp);
 926                         return (0);
 927                 }
 928                 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
 929                     fs->fs_fsize))) {
 930                         mutex_exit(&ufsvfsp->vfs_lock);
 931                         brelse(bp);
 932                         return (0);
 933                 }
 934         }
 935
 936         cgp->cg_time = gethrestime_sec();
 937         /*
 938          * The current fragment can be extended,
 939          * deduct the count on fragment being extended into
 940          * increase the count on the remaining fragment (if any)
 941          * allocate the extended piece.
 942          */
 943         for (i = frags; i < fs->fs_frag - bbase; i++)
 944                 if (isclr(blksfree, bno + i))
 945                         break;
 946         j = i - numfrags(fs, osize);
 947         cgp->cg_frsum[j]--;
 948         ASSERT(cgp->cg_frsum[j] >= 0);
 949         if (i != frags)
 950                 cgp->cg_frsum[i - frags]++;
 951         for (i = numfrags(fs, osize); i < frags; i++) {
 952                 clrbit(blksfree, bno + i);
 953                 cgp->cg_cs.cs_nffree--;
 954                 fs->fs_cs(fs, cg).cs_nffree--;
 955                 fs->fs_cstotal.cs_nffree--;
 956         }
 957         fs->fs_fmod = 1;
 958         ufs_notclean(ufsvfsp);
 959         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
 960         TRANS_SI(ufsvfsp, fs, cg);
 961         bdrwrite(bp);
 962         return ((daddr_t)bprev);
 963 }
 964
 965 /*
 966  * Determine whether a block can be allocated.
 967  *
 968  * Check to see if a block of the apprpriate size
 969  * is available, and if it is, allocate it.
 970  */
 971 static daddr_t
 972 alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
 973 {
 974         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
 975         struct fs *fs = ip->i_fs;
 976         struct buf *bp;
 977         struct cg *cgp;
 978         uchar_t *blksfree;
 979         int bno, frags;
 980         int allocsiz;
 981         int i;
 982
 983         /*
 984          * Searching for space could be time expensive so do some
 985          * up front checking to verify that there is actually space
 986          * available (free blocks or free frags).
 987          */
 988         if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
 989                 if (size == fs->fs_bsize)
 990                         return (0);
 991
 992                 /*
 993                  * If there are not enough free frags then return.
 994                  */
 995                 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
 996                         return (0);
 997         }
 998
 999         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1000             (int)fs->fs_cgsize);
1001
1002         cgp = bp->b_un.b_cg;
1003         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1004             (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
1005                 brelse(bp);
1006                 return (0);
1007         }
1008         blksfree = cg_blksfree(cgp);
1009         mutex_enter(&ufsvfsp->vfs_lock);
1010         cgp->cg_time = gethrestime_sec();
1011         if (size == fs->fs_bsize) {
1012                 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1013                         goto errout;
1014                 fs->fs_fmod = 1;
1015                 ufs_notclean(ufsvfsp);
1016                 TRANS_SI(ufsvfsp, fs, cg);
1017                 bdrwrite(bp);
1018                 return (bno);
1019         }
1020         /*
1021          * Check fragment bitmap to see if any fragments are already available.
1022          * mapsearch() may fail because the fragment that fits this request
1023          * might still be on the cancel list and not available for re-use yet.
1024          * Look for a bigger sized fragment to allocate first before we have
1025          * to give up and fragment a whole new block eventually.
1026          */
1027         frags = numfrags(fs, size);
1028         allocsiz = frags;
1029 next_size:
1030         for (; allocsiz < fs->fs_frag; allocsiz++)
1031                 if (cgp->cg_frsum[allocsiz] != 0)
1032                         break;
1033
1034         if (allocsiz != fs->fs_frag) {
1035                 bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
1036                 if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
1037                         allocsiz++;
1038                         goto next_size;
1039                 }
1040         }
1041
1042         if (allocsiz == fs->fs_frag || bno < 0) {
1043                 /*
1044                  * No fragments were available, so a block
1045                  * will be allocated and hacked up.
1046                  */
1047                 if (cgp->cg_cs.cs_nbfree == 0)
1048                         goto errout;
1049                 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
1050                         goto errout;
1051                 bpref = dtogd(fs, bno);
1052                 for (i = frags; i < fs->fs_frag; i++)
1053                         setbit(blksfree, bpref + i);
1054                 i = fs->fs_frag - frags;
1055                 cgp->cg_cs.cs_nffree += i;
1056                 fs->fs_cstotal.cs_nffree += i;
1057                 fs->fs_cs(fs, cg).cs_nffree += i;
1058                 cgp->cg_frsum[i]++;
1059                 fs->fs_fmod = 1;
1060                 ufs_notclean(ufsvfsp);
1061                 TRANS_SI(ufsvfsp, fs, cg);
1062                 bdrwrite(bp);
1063                 return (bno);
1064         }
1065
1066         for (i = 0; i < frags; i++)
1067                 clrbit(blksfree, bno + i);
1068         cgp->cg_cs.cs_nffree -= frags;
1069         fs->fs_cstotal.cs_nffree -= frags;
1070         fs->fs_cs(fs, cg).cs_nffree -= frags;
1071         cgp->cg_frsum[allocsiz]--;
1072         ASSERT(cgp->cg_frsum[allocsiz] >= 0);
1073         if (frags != allocsiz) {
1074                 cgp->cg_frsum[allocsiz - frags]++;
1075         }
1076         fs->fs_fmod = 1;
1077         ufs_notclean(ufsvfsp);
1078         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1079         TRANS_SI(ufsvfsp, fs, cg);
1080         bdrwrite(bp);
1081         return (cg * fs->fs_fpg + bno);
1082 errout:
1083         mutex_exit(&ufsvfsp->vfs_lock);
1084         brelse(bp);
1085         return (0);
1086 }
1087
/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *      specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 *
 * Arguments:
 *   ufsvfsp    file system instance; its vfs_lock must already be held
 *              (asserted below).  The lock is neither taken nor dropped
 *              here, on success or on failure.
 *   cgp        cylinder group to allocate from.
 *   bpref      preferred block, or 0 to start from the group's block
 *              rotor (cg_rotor).
 *   bp         buffer handed to TRANS_BUF() once the block is claimed.
 *
 * Returns the address of the first fragment of the allocated block,
 * computed as cg_cgx * fs_fpg plus the group-relative fragment number,
 * or 0 if no block could be allocated or an inconsistency was reported
 * through ufs_fault().
 */
static daddr_t
alloccgblk(
        struct ufsvfs *ufsvfsp,
        struct cg *cgp,
        daddr_t bpref,
        struct buf *bp)
{
        daddr_t bno;
        int cylno, pos, delta, rotbl_size;
        short *cylbp;
        int i;
        struct fs *fs;
        uchar_t *blksfree;
        daddr_t blkno, rpos, frag;
        short *blks;
        int32_t *blktot;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
        fs = ufsvfsp->vfs_fs;
        blksfree = cg_blksfree(cgp);
        if (bpref == 0) {
                /* No preference: search onward from the block rotor. */
                bpref = cgp->cg_rotor;
                goto norot;
        }
        /*
         * Reduce the preference to the start of its block and make it
         * relative to this cylinder group.
         */
        bpref = blknum(fs, bpref);
        bpref = dtogd(fs, bpref);
        /*
         * If the requested block is available, use it.
         */
        if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
                bno = bpref;
                goto gotit;
        }
        /*
         * Check for a block available on the same cylinder.
         */
        cylno = cbtocylno(fs, bpref);
        if (cg_blktot(cgp)[cylno] == 0)
                goto norot;
        if (fs->fs_cpc == 0) {
                /*
                 * Block layout info is not available, so just
                 * have to take any block in this cylinder.
                 */
                bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
                goto norot;
        }
        /*
         * Check the summary information to see if a block is
         * available in the requested cylinder starting at the
         * requested rotational position and proceeding around.
         *
         * If neither scan below finds a position with a positive count,
         * i finishes equal to pos, whose count has already been seen to
         * be non-positive, so the following test fails and control falls
         * through to norot.
         */
        cylbp = cg_blks(ufsvfsp, cgp, cylno);
        pos = cbtorpos(ufsvfsp, bpref);
        for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
                if (cylbp[i] > 0)
                        break;
        if (i == ufsvfsp->vfs_nrpos)
                for (i = 0; i < pos; i++)
                        if (cylbp[i] > 0)
                                break;
        if (cylbp[i] > 0) {
                /*
                 * Found a rotational position, now find the actual
                 * block.  A "panic" if none is actually there.
                 */

                /*
                 * Up to this point, "pos" has referred to the rotational
                 * position of the desired block.  From now on, it holds
                 * the offset of the current cylinder within a cylinder
                 * cycle.  (A cylinder cycle refers to a set of cylinders
                 * which are described by a single rotational table; the
                 * size of the cycle is fs_cpc.)
                 *
                 * bno is set to the block number of the first block within
                 * the current cylinder cycle.
                 */

                pos = cylno % fs->fs_cpc;
                bno = (cylno - pos) * fs->fs_spc / NSPB(fs);

                /*
                 * The blocks within a cylinder are grouped into equivalence
                 * classes according to their "rotational position."  There
                 * are two tables used to determine these classes.
                 *
                 * The positional offset table (fs_postbl) has an entry for
                 * each rotational position of each cylinder in a cylinder
                 * cycle.  This entry contains the relative block number
                 * (counting from the start of the cylinder cycle) of the
                 * first block in the equivalence class for that position
                 * and that cylinder.  Positions for which no blocks exist
                 * are indicated by a -1.
                 *
                 * The rotational delta table (fs_rotbl) has an entry for
                 * each block in a cylinder cycle.  This entry contains
                 * the offset from that block to the next block in the
                 * same equivalence class.  The last block in the class
                 * is indicated by a zero in the table.
                 *
                 * The following code, then, walks through all of the blocks
                 * in the cylinder (cylno) which we're allocating within
                 * which are in the equivalence class for the rotational
                 * position (i) which we're allocating within.
                 */

                if (fs_postbl(ufsvfsp, pos)[i] == -1) {
                        /*
                         * The summary claimed a free block at this
                         * position, but the position table says no blocks
                         * exist there.
                         */
                        (void) ufs_fault(ufsvfsp->vfs_root,
                            "alloccgblk: cyl groups corrupted, pos = %d, "
                            "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
                        return (0);
                }

                /*
                 * There is one entry in the rotational table for each block
                 * in the cylinder cycle.  These are whole blocks, not frags.
                 */

                rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
                    (fs->fs_fragshift + fs->fs_fsbtodb);

                /*
                 * As we start, "i" is the rotational position within which
                 * we're searching.  After the next line, it will be a block
                 * number (relative to the start of the cylinder cycle)
                 * within the equivalence class of that rotational position.
                 */

                i = fs_postbl(ufsvfsp, pos)[i];

                for (;;) {
                        if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
                                /* Convert the block number to fragments. */
                                bno = blkstofrags(fs, (bno + i));
                                goto gotit;
                        }
                        delta = fs_rotbl(fs)[i];
                        if (delta <= 0 ||               /* End of chain, or */
                            delta + i > rotbl_size)     /* end of table? */
                                break;                  /* If so, panic. */
                        i += delta;
                }
                /*
                 * The chain for this rotational position was exhausted
                 * without finding a free block.
                 */
                (void) ufs_fault(ufsvfsp->vfs_root,
                    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
                    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
                return (0);
        }
norot:
        /*
         * No blocks in the requested cylinder, so take
         * next available one in this cylinder group.
         */
        bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
        if (bno < 0)
                return (0);
        cgp->cg_rotor = bno;
gotit:
        /*
         * bno is the group-relative fragment number of the chosen block.
         * If TRANS_ISCANCEL() reports the block, it is not taken and the
         * search is repeated through mapsearch() at norot.
         */
        blkno = fragstoblks(fs, bno);
        frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
        if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
                goto norot;
        clrblock(fs, blksfree, (long)blkno);
        /*
         * the other cg/sb/si fields are TRANS'ed by the caller
         */
        cgp->cg_cs.cs_nbfree--;
        fs->fs_cstotal.cs_nbfree--;
        fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
        /*
         * Debit the per-cylinder and per-rotational-position block counts
         * for the block just taken.
         */
        cylno = cbtocylno(fs, bno);
        blks = cg_blks(ufsvfsp, cgp, cylno);
        rpos = cbtorpos(ufsvfsp, bno);
        blktot = cg_blktot(cgp);
        blks[rpos]--;
        blktot[cylno]--;
        TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
        fs->fs_fmod = 1;
        return (frag);
}
1277
1278 /*
1279  * Determine whether an inode can be allocated.
1280  *
1281  * Check to see if an inode is available, and if it is,
1282  * allocate it using the following policy:
1283  *   1) allocate the requested inode.
1284  *   2) allocate the next available inode after the requested
1285  *      inode in the specified cylinder group.
1286  */
1287 static ino_t
1288 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
1289 {
1290         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
1291         struct fs *fs = ip->i_fs;
1292         struct cg *cgp;
1293         struct buf *bp;
1294         int start, len, loc, map, i;
1295         char *iused;
1296
1297         if (fs->fs_cs(fs, cg).cs_nifree == 0)
1298                 return (0);
1299         bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1300             (int)fs->fs_cgsize);
1301
1302         cgp = bp->b_un.b_cg;
1303         if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
1304             cgp->cg_cs.cs_nifree == 0) {
1305                 brelse(bp);
1306                 return (0);
1307         }
1308         iused = cg_inosused(cgp);
1309         mutex_enter(&ufsvfsp->vfs_lock);
1310         /*
1311          * While we are waiting for the mutex, someone may have taken
1312          * the last available inode.  Need to recheck.
1313          */
1314         if (cgp->cg_cs.cs_nifree == 0) {
1315                 mutex_exit(&ufsvfsp->vfs_lock);
1316                 brelse(bp);
1317                 return (0);
1318         }
1319
1320         cgp->cg_time = gethrestime_sec();
1321         if (ipref) {
1322                 ipref %= fs->fs_ipg;
1323                 if (isclr(iused, ipref))
1324                         goto gotit;
1325         }
1326         start = cgp->cg_irotor / NBBY;
1327         len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
1328         loc = skpc(0xff, (uint_t)len, &iused[start]);
1329         if (loc == 0) {
1330                 len = start + 1;
1331                 start = 0;
1332                 loc = skpc(0xff, (uint_t)len, &iused[0]);
1333                 if (loc == 0) {
1334                         mutex_exit(&ufsvfsp->vfs_lock);
1335                         (void) ufs_fault(ITOV(ip),
1336                             "ialloccg: map corrupted, cg = %d, irotor = %d, "
1337                             "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
1338                         return (0);
1339                 }
1340         }
1341         i = start + len - loc;
1342         map = iused[i];
1343         ipref = i * NBBY;
1344         for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1345                 if ((map & i) == 0) {
1346                         cgp->cg_irotor = ipref;
1347                         goto gotit;
1348                 }
1349         }
1350
1351         mutex_exit(&ufsvfsp->vfs_lock);
1352         (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
1353             fs->fs_fsmnt);
1354         return (0);
1355 gotit:
1356         setbit(iused, ipref);
1357         cgp->cg_cs.cs_nifree--;
1358         fs->fs_cstotal.cs_nifree--;
1359         fs->fs_cs(fs, cg).cs_nifree--;
1360         if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
1361                 cgp->cg_cs.cs_ndir++;
1362                 fs->fs_cstotal.cs_ndir++;
1363                 fs->fs_cs(fs, cg).cs_ndir++;
1364         }
1365         fs->fs_fmod = 1;
1366         ufs_notclean(ufsvfsp);
1367         TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
1368         TRANS_SI(ufsvfsp, fs, cg);
1369         bdrwrite(bp);
1370         return (cg * fs->fs_ipg + ipref);
1371 }
1372
/*
 * Find a free fragment run of the specified size in the specified
 * cylinder group.
 *
 * Arguments:
 *   ufsvfsp   file system instance (vfs_lock is held by the caller).
 *   cgp       cylinder group whose free map is searched.
 *   bpref     preferred starting point; when 0 the search starts at the
 *             group's fragment rotor (cg_frotor) instead.
 *   allocsiz  length of the wanted run, in fragments.
 *
 * Returns the fragment number of the run relative to the start of the
 * cylinder group, or (daddr_t)-1 when no usable run exists.  Runs that
 * TRANS_ISCANCEL() reports are skipped.  cg_frotor is updated to the
 * byte-aligned position of each candidate examined, including
 * candidates that are subsequently skipped.
 */
static daddr_t
mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref,
    int allocsiz)
{
        struct fs *fs   = ufsvfsp->vfs_fs;
        daddr_t bno, cfrag;
        int start, len, loc, i, last, first, secondtime;
        int blk, field, subfield, pos;
        int gotit;

        /*
         * ufsvfs->vfs_lock is held when calling this.
         */
        /*
         * Find the fragment by searching through the
         * free block map for an appropriate bit pattern.
         */
        if (bpref)
                start = dtogd(fs, bpref) / NBBY;
        else
                start = cgp->cg_frotor / NBBY;
        /*
         * the following loop performs two scans -- the first scan
         * covers map bytes [start, howmany(fs_fpg, NBBY)) and the
         * second scan covers map bytes [0, start + 1).  The loops
         * have been merged just to make things difficult.
         */
        first = start;
        last = howmany(fs->fs_fpg, NBBY);
        secondtime = 0;
        /* fragment address of the start of this cylinder group */
        cfrag = cgp->cg_cgx * fs->fs_fpg;
        while (first < last) {
                len = last - first;
                /*
                 * search the array for a match; a non-zero result is
                 * used below as the distance from the matching byte to
                 * the end of the scanned range
                 */
                loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
                    (uchar_t *)fragtbl[fs->fs_frag],
                    (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
                /*
                 * match found
                 */
                if (loc) {
                        bno = (last - loc) * NBBY;

                        /*
                         * Found the byte in the map, sift
                         * through the bits to find the selected frag
                         */
                        cgp->cg_frotor = bno;
                        gotit = 0;
                        for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
                                blk = blkmap(fs, cg_blksfree(cgp), bno);
                                blk <<= 1;
                                field = around[allocsiz];
                                subfield = inside[allocsiz];
                                /*
                                 * slide the pattern across the block's
                                 * map one fragment position at a time
                                 */
                                for (pos = 0;
                                    pos <= fs->fs_frag - allocsiz;
                                    pos++) {
                                        if ((blk & field) == subfield) {
                                                gotit++;
                                                break;
                                        }
                                        field <<= 1;
                                        subfield <<= 1;
                                }
                                if (gotit)
                                        break;
                        }
                        bno += pos;

                        /*
                         * success if block is *not* being converted from
                         * metadata into userdata (harpy).  If so, ignore.
                         */
                        if (!TRANS_ISCANCEL(ufsvfsp,
                            ldbtob(fsbtodb(fs, (cfrag+bno))),
                            allocsiz * fs->fs_fsize))
                                return (bno);

                        /*
                         * keep looking -- this block is being converted;
                         * resume with the byte after the one just tried
                         */
                        first = (last - loc) + 1;
                        loc = 0;
                        if (first < last)
                                continue;
                }
                /*
                 * no usable matches in the first range -- now search
                 * the second range
                 */
                if (secondtime)
                        /*
                         * no usable matches in the second range either --
                         * all done
                         */
                        break;
                secondtime = 1;
                last = start + 1;
                first = 0;
        }
        /*
         * no usable matches
         */
        return ((daddr_t)-1);
}
1484
/*
 * Block-pointer indexing helpers.  UFSNADDR is the sum of NDADDR and
 * NIADDR; IB(i) turns an indirection level (SINGLE, DOUBLE or TRIPLE)
 * into the index NDADDR + level.
 */
#define UFSNADDR (NDADDR + NIADDR)      /* NADDR applies to (obsolete) S5FS */
#define IB(i)   (NDADDR + (i))  /* index of i'th indirect block ptr */
#define SINGLE  0               /* single indirect block ptr */
#define DOUBLE  1               /* double indirect block ptr */
#define TRIPLE  2               /* triple indirect block ptr */
1490
1491 /*
1492  * Acquire a write lock, and keep trying till we get it
1493  */
1494 static int
1495 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
1496 {
1497         int err = 0;
1498
1499 lockagain:
1500         do {
1501                 err = ufs_fiolfss(vp, lf);
1502                 if (err)
1503                         return (err);
1504         } while (!LOCKFS_IS_ULOCK(lf));
1505
1506         lf->lf_lock = LOCKFS_WLOCK;
1507         lf->lf_flags = 0;
1508         lf->lf_comment = NULL;
1509         err = ufs__fiolfs(vp, lf, 1, 0);
1510
1511         if (err == EBUSY || err == EINVAL)
1512                 goto lockagain;
1513
1514         return (err);
1515 }
1516
1517 /*
1518  * Release the write lock
1519  */
1520 static int
1521 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
1522 {
1523         int err = 0;
1524
1525         lf->lf_lock = LOCKFS_ULOCK;
1526         lf->lf_flags = 0;
1527         err = ufs__fiolfs(vp, lf, 1, 0);
1528         return (err);
1529 }
1530
/*
 * Undo record, chained into a singly linked list through `next'.
 *
 * NOTE(review): ufs_allocsp() keeps lists of these for undoing indirect
 * block work; `offset' and `blk' presumably record the file offset and
 * the block involved -- confirm against the code that fills them in.
 */
struct allocsp_undo {
        daddr_t offset;
        daddr_t blk;
        struct allocsp_undo *next;      /* next record in the list */
};
1536
1537 /*
1538  * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
1539  * file system. For direct blocks, the blocks are allocated from the offset
1540  * requested to the block boundary, then any full blocks are allocated,
1541  * and finally any remainder.
1542  * For indirect blocks the blocks are not initialized and are
1543  * only marked as allocated. These addresses are then stored as negative
1544  * block numbers in the inode to imply special handling. UFS has been modified
1545  * where necessary to understand this new notion.
1546  * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
1547  */
1548 int
1549 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
1550 {
1551         struct lockfs lf;
1552         int berr, err, resv, issync;
1553         off_t istart, len; /* istart, special for idb */
1554         struct inode *ip;
1555         struct fs *fs;
1556         struct ufsvfs *ufsvfsp;
1557         uoff_t resid, i, uoff;
1558         daddr32_t db_undo[NDADDR];      /* old direct blocks */
1559         struct allocsp_undo *ib_undo = NULL;    /* ib undo */
1560         struct allocsp_undo *undo = NULL;
1561         uoff_t osz;                     /* old file size */
1562         int chunkblks = 0;              /* # of blocks in 1 allocation */
1563         int cnt = 0;
1564         daddr_t allocblk;
1565         daddr_t totblks = 0;
1566         struct ulockfs  *ulp;
1567         size_t done_len;
1568         int nbytes, offsetn;
1569
1570
1571         ASSERT(vp->v_type == VREG);
1572
1573         ip = VTOI(vp);
1574         fs = ip->i_fs;
1575         if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
1576                 err = EIO;
1577                 goto out_allocsp;
1578         }
1579
1580         istart = blkroundup(fs, (lp->l_start));
1581         len = blkroundup(fs, (lp->l_len));
1582         chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
1583         ulp = &ufsvfsp->vfs_ulockfs;
1584
1585         if (lp->l_start < 0 || lp->l_len <= 0)
1586                 return (EINVAL);
1587
1588         /* Quickly check to make sure we have space before we proceed */
1589         if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
1590                 if (TRANS_ISTRANS(ufsvfsp)) {
1591                         ufs_delete_drain_wait(ufsvfsp, 1);
1592                         if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
1593                                 return (ENOSPC);
1594                 } else
1595                         return (ENOSPC);
1596         }
1597
1598         /*
1599          * We will keep i_rwlock locked as WRITER through out the function
1600          * since we don't want anyone else reading or writing to the inode
1601          * while we are in the middle of fallocating the file.
1602          */
1603         rw_enter(&ip->i_rwlock, RW_WRITER);
1604
1605         /* Back up the direct block list, used for undo later if necessary */
1606         rw_enter(&ip->i_contents, RW_READER);
1607         for (i = 0; i < NDADDR; i++)
1608                 db_undo[i] = ip->i_db[i];
1609         osz = ip->i_size;
1610         rw_exit(&ip->i_contents);
1611
1612         /* Write lock the file system */
1613         if (err = allocsp_wlockfs(vp, &lf))
1614                 goto exit;
1615
1616         /*
1617          * Allocate any direct blocks now.
1618          * Blocks are allocated from the offset requested to the block
1619          * boundary, then any full blocks are allocated, and finally any
1620          * remainder.
1621          */
1622         if (lblkno(fs, lp->l_start) < NDADDR) {
1623                 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
1624                     &resv, &resid);
1625                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_ALLOCSP, resv);
1626
1627                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1628                 rw_enter(&ip->i_contents, RW_WRITER);
1629
1630                 done_len = 0;
1631                 while ((done_len < lp->l_len) &&
1632                     (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
1633                         uoff = (offset_t)(lp->l_start + done_len);
1634                         offsetn = (int)blkoff(fs, uoff);
1635                         nbytes = (int)MIN(fs->fs_bsize - offsetn,
1636                             lp->l_len - done_len);
1637
1638                         berr = bmap_write(ip, uoff, offsetn + nbytes,
1639                             BI_FALLOCATE, &allocblk, cr);
1640                         /* Yikes error, quit */
1641                         if (berr) {
1642                                 TRANS_INODE(ufsvfsp, ip);
1643                                 rw_exit(&ip->i_contents);
1644                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1645                                 TRANS_END_CSYNC(ufsvfsp, &err, issync,
1646                                                 TOP_ALLOCSP, resv);
1647                                 err = allocsp_unlockfs(vp, &lf);
1648                                 goto exit;
1649                         }
1650
1651                         if (allocblk) {
1652                                 totblks++;
1653                                 if ((uoff + nbytes) > ip->i_size)
1654                                         ip->i_size = (uoff + nbytes);
1655                         }
1656                         done_len += nbytes;
1657                 }
1658
1659                 TRANS_INODE(ufsvfsp, ip);
1660                 rw_exit(&ip->i_contents);
1661                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1662                 TRANS_END_CSYNC(ufsvfsp, &err, issync, TOP_ALLOCSP, resv);
1663
1664                 /* start offset for indirect allocation */
1665                 istart =  (uoff + nbytes);
1666         }
1667
1668         /* Break the transactions into vfs_iotransz units */
1669         ufs_trans_trunc_resv(ip, ip->i_size +
1670             blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
1671         TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_ALLOCSP, resv);
1672
1673         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1674         rw_enter(&ip->i_contents, RW_WRITER);
1675
1676         /* Now go about fallocating necessary indirect blocks */
1677         for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
1678                 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
1679                     &allocblk, cr);
1680                 if (berr) {
1681                         TRANS_INODE(ufsvfsp, ip);
1682                         rw_exit(&ip->i_contents);
1683                         rw_exit(&ufsvfsp->vfs_dqrwlock);
1684                         TRANS_END_CSYNC(ufsvfsp, &err, issync, TOP_ALLOCSP,
1685                                         resv);
1686                         err = allocsp_unlockfs(vp, &lf);
1687                         goto exit;
1688                 }
1689
1690                 /* Update the blk counter only if new block was added */
1691                 if (allocblk) {
1692                         /* Save undo information */
1693                         undo = kmem_alloc(sizeof (struct allocsp_undo),
1694                             KM_SLEEP);
1695                         undo->offset = i;
1696                         undo->blk = allocblk;
1697                         undo->next = ib_undo;
1698                         ib_undo = undo;
1699                         totblks++;
1700
1701                         if (i >= ip->i_size)
1702                                 ip->i_size += fs->fs_bsize;
1703                 }
1704                 cnt++;
1705
1706                 /* Being a good UFS citizen, let others get a share */
1707                 if (cnt == chunkblks) {
1708                         /*
1709                          * If there are waiters or the fs is hard locked,
1710                          * error locked, or read-only error locked,
1711                          * quit with EIO
1712                          */
1713                         if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
1714                             ULOCKFS_IS_ROELOCK(ulp)) {
1715                                 ip->i_cflags |= IFALLOCATE;
1716                                 TRANS_INODE(ufsvfsp, ip);
1717                                 rw_exit(&ip->i_contents);
1718                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1719
1720                                 TRANS_END_CSYNC(ufsvfsp, &err, issync,
1721                                                 TOP_ALLOCSP, resv);
1722                                 rw_exit(&ip->i_rwlock);
1723                                 (void) allocsp_unlockfs(vp, &lf);
1724                                 return (EIO);
1725                         }
1726
1727                         TRANS_INODE(ufsvfsp, ip);
1728                         rw_exit(&ip->i_contents);
1729                         rw_exit(&ufsvfsp->vfs_dqrwlock);
1730
1731                         /* End the current transaction */
1732                         TRANS_END_CSYNC(ufsvfsp, &err, issync, TOP_ALLOCSP,
1733                                         resv);
1734
1735                         if (CV_HAS_WAITERS(&ulp->ul_cv)) {
1736                                 /* Release the write lock */
1737                                 if (err = allocsp_unlockfs(vp, &lf))
1738                                         goto exit;
1739
1740                                 /* Wake up others waiting to do operations */
1741                                 mutex_enter(&ulp->ul_lock);
1742                                 cv_broadcast(&ulp->ul_cv);
1743                                 mutex_exit(&ulp->ul_lock);
1744
1745                                 /* Grab the write lock again */
1746                                 if (err = allocsp_wlockfs(vp, &lf))
1747                                         goto exit;
1748                         } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
1749
1750                         /* Reserve more space in log for this file */
1751                         ufs_trans_trunc_resv(ip,
1752                             ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
1753                             &resv, &resid);
1754                         TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_ALLOCSP, resv);
1755
1756                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1757                         rw_enter(&ip->i_contents, RW_WRITER);
1758
1759                         cnt = 0;        /* reset cnt b/c of new transaction */
1760                 }
1761         }
1762
1763         if (!err && !berr)
1764                 ip->i_cflags |= IFALLOCATE;
1765
1766         /* If the file has grown then correct the file size */
1767         if (osz < (lp->l_start + lp->l_len))
1768                 ip->i_size = (lp->l_start + lp->l_len);
1769
1770         /* Release locks, end log transaction and unlock fs */
1771         TRANS_INODE(ufsvfsp, ip);
1772         rw_exit(&ip->i_contents);
1773         rw_exit(&ufsvfsp->vfs_dqrwlock);
1774
1775         TRANS_END_CSYNC(ufsvfsp, &err, issync, TOP_ALLOCSP, resv);
1776         err = allocsp_unlockfs(vp, &lf);
1777
1778         /*
1779          * @ exit label, we should no longer be holding the fs write lock, and
1780          * all logging transactions should have been ended. We still hold
1781          * ip->i_rwlock.
1782          */
1783 exit:
1784         /*
1785          * File has grown larger than 2GB. Set flag
1786          * in superblock to indicate this, if it
1787          * is not already set.
1788          */
1789         if ((ip->i_size > MAXOFF32_T) &&
1790             !(fs->fs_flags & FSLARGEFILES)) {
1791                 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1792                 mutex_enter(&ufsvfsp->vfs_lock);
1793                 fs->fs_flags |= FSLARGEFILES;
1794                 ufs_sbwrite(ufsvfsp);
1795                 mutex_exit(&ufsvfsp->vfs_lock);
1796         }
1797
1798         /*
1799          * Since we couldn't allocate completely, we will undo the allocations.
1800          */
1801         if (berr) {
1802                 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
1803                 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_ALLOCSP, resv);
1804
1805                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1806                 rw_enter(&ip->i_contents, RW_WRITER);
1807
1808                 /* Direct blocks */
1809                 for (i = 0; i < NDADDR; i++) {
1810                         /*
1811                          * Only free the block if they are not same, and
1812                          * the old one isn't zero (the fragment was
1813                          * re-allocated).
1814                          */
1815                         if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
1816                                 free(ip, ip->i_db[i], fs->fs_bsize, 0);
1817                                 ip->i_db[i] = 0;
1818                         }
1819                 }
1820
1821                 /* Undo the indirect blocks */
1822                 while (ib_undo != NULL) {
1823                         undo = ib_undo;
1824                         err = bmap_set_bn(vp, undo->offset, 0);
1825                         if (err)
1826                                 cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
1827                                     "undo allocation of block %ld",
1828                                     undo->offset);
1829                         free(ip, undo->blk, fs->fs_bsize, I_IBLK);
1830                         ib_undo = undo->next;
1831                         kmem_free(undo, sizeof (struct allocsp_undo));
1832                 }
1833
1834                 ip->i_size = osz;
1835                 TRANS_INODE(ufsvfsp, ip);
1836
1837                 rw_exit(&ip->i_contents);
1838                 rw_exit(&ufsvfsp->vfs_dqrwlock);
1839
1840                 TRANS_END_CSYNC(ufsvfsp, &err, issync, TOP_ALLOCSP, resv);
1841
1842                 rw_exit(&ip->i_rwlock);
1843                 return (berr);
1844         }
1845
1846         /*
1847          * Don't forget to free the undo chain :)
1848          */
1849         while (ib_undo != NULL) {
1850                 undo = ib_undo;
1851                 ib_undo = undo->next;
1852                 kmem_free(undo, sizeof (struct allocsp_undo));
1853         }
1854
1855         rw_exit(&ip->i_rwlock);
1856
1857 out_allocsp:
1858         return (err);
1859 }
1860
1861 /*
1862  * Free storage space associated with the specified inode.  The portion
1863  * to be freed is specified by lp->l_start and lp->l_len (already
1864  * normalized to a "whence" of 0).
1865  *
1866  * This is an experimental facility whose continued existence is not
1867  * guaranteed.  Currently, we only support the special case
1868  * of l_len == 0, meaning free to end of file.
1869  *
1870  * Blocks are freed in reverse order.  This FILO algorithm will tend to
1871  * maintain a contiguous free list much longer than FIFO.
1872  * See also ufs_itrunc() in ufs_inode.c.
1873  *
1874  * Bug: unused bytes in the last retained block are not cleared.
1875  * This may result in a "hole" in the file that does not read as zeroes.
1876  */
1877 /* ARGSUSED */
1878 int
1879 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
1880 {
1881         int i;
1882         struct inode *ip = VTOI(vp);
1883         int error;
1884
1885         ASSERT(vp->v_type == VREG);
1886         ASSERT(lp->l_start >= 0);       /* checked by convoff */
1887
1888         if (lp->l_len != 0)
1889                 return (EINVAL);
1890
1891         rw_enter(&ip->i_contents, RW_READER);
1892         if (ip->i_size == (uoff_t)lp->l_start) {
1893                 rw_exit(&ip->i_contents);
1894                 return (0);
1895         }
1896
1897         /*
1898          * Check if there is any active mandatory lock on the
1899          * range that will be truncated/expanded.
1900          */
1901         if (MANDLOCK(vp, ip->i_mode)) {
1902                 offset_t save_start;
1903
1904                 save_start = lp->l_start;
1905
1906                 if (ip->i_size < lp->l_start) {
1907                         /*
1908                          * "Truncate up" case: need to make sure there
1909                          * is no lock beyond current end-of-file. To
1910                          * do so, we need to set l_start to the size
1911                          * of the file temporarily.
1912                          */
1913                         lp->l_start = ip->i_size;
1914                 }
1915                 lp->l_type = F_WRLCK;
1916                 lp->l_sysid = 0;
1917                 lp->l_pid = ttoproc(curthread)->p_pid;
1918                 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
1919                 rw_exit(&ip->i_contents);
1920                 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
1921                     lp->l_type != F_UNLCK) {
1922                         return (i ? i : EAGAIN);
1923                 }
1924                 rw_enter(&ip->i_contents, RW_READER);
1925
1926                 lp->l_start = save_start;
1927         }
1928
1929         /*
1930          * Make sure a write isn't in progress (allocating blocks)
1931          * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
1932          * truncate while it was allocating blocks).
1933          * Grab the locks in the right order.
1934          */
1935         rw_exit(&ip->i_contents);
1936         rw_enter(&ip->i_rwlock, RW_WRITER);
1937         error = TRANS_ITRUNC(ip, (uoff_t)lp->l_start, 0, cr);
1938         rw_exit(&ip->i_rwlock);
1939         return (error);
1940 }
1941
1942 /*
1943  * Find a cg with as close to nb contiguous bytes as possible
1944  *      THIS MAY TAKE MANY DISK READS!
1945  *
1946  * Implemented in an attempt to allocate contiguous blocks for
1947  * writing the ufs log file to, minimizing future disk head seeking
1948  */
1949 daddr_t
1950 contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb)
1951 {
1952         struct fs       *fs     = ufsvfsp->vfs_fs;
1953         daddr_t         nblk    = lblkno(fs, blkroundup(fs, nb));
1954         daddr_t         minblk  = lblkno(fs, blkroundup(fs, minb));
1955         daddr_t         savebno, curbno, cgbno;
1956         int             cg, cgblks, savecg, savenblk, curnblk, startcg;
1957         uchar_t         *blksfree;
1958         buf_t           *bp;
1959         struct cg       *cgp;
1960
1961         savenblk = 0;
1962         savecg = 0;
1963         savebno = 0;
1964
1965         if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1)
1966                 cg = 0; /* Nothing suitable found */
1967         else
1968                 cg = startcg;
1969
1970         for (; cg < fs->fs_ncg; ++cg) {
1971                 /*
1972                  * find the largest contiguous range in this cg
1973                  */
1974                 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
1975                     (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
1976                     (int)fs->fs_cgsize);
1977                 cgp = bp->b_un.b_cg;
1978                 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
1979                         brelse(bp);
1980                         continue;
1981                 }
1982                 blksfree = cg_blksfree(cgp);        /* free array */
1983                 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
1984                 cgbno = 0;
1985                 while (cgbno < cgblks && savenblk < nblk) {
1986                         /* find a free block */
1987                         for (; cgbno < cgblks; ++cgbno) {
1988                                 if (isblock(fs, blksfree, cgbno)) {
1989                                         if (startcg != -1) {
1990                                                 brelse(bp);
1991                                                 savecg = startcg;
1992                                                 savebno = cgbno;
1993                                                 goto done;
1994                                         } else
1995                                                 break;
1996                                 }
1997                         }
1998                         curbno = cgbno;
1999                         /* count the number of free blocks */
2000                         for (curnblk = 0; cgbno < cgblks; ++cgbno) {
2001                                 if (!isblock(fs, blksfree, cgbno))
2002                                         break;
2003                                 if (++curnblk >= nblk)
2004                                         break;
2005                         }
2006                         if (curnblk > savenblk) {
2007                                 savecg = cg;
2008                                 savenblk = curnblk;
2009                                 savebno = curbno;
2010                         }
2011                 }
2012                 brelse(bp);
2013                 if (savenblk >= nblk)
2014                         break;
2015         }
2016
2017 done:
2018
2019         /* convert block offset in cg to frag offset in cg */
2020         savebno = blkstofrags(fs, savebno);
2021
2022         /* convert frag offset in cg to frag offset in fs */
2023         savebno += (savecg * fs->fs_fpg);
2024
2025         return (savebno);
2026 }
2027
2028 /*
2029  * The object of this routine is to find a start point for the UFS log.
2030  * Ideally the space should be allocated from the smallest possible number
2031  * of contiguous cylinder groups. This is found by using a sliding window
2032  * technique. The smallest window of contiguous cylinder groups, which is
2033  * still able to accommodate the target, is found by moving the window
2034  * through the cylinder groups in a single pass. The end of the window is
2035  * advanced until the space is accommodated, then the start is advanced until
2036  * it no longer fits, the end is then advanced again and so on until the
2037  * final cylinder group is reached. The first suitable instance is recorded
2038  * and its starting cg number is returned.
2039  *
2040  * If we are not able to find a minimum amount of space, represented by
2041  * minblk, or to do so uses more than the available extents, then return -1.
2042  */
2043
2044 int
2045 findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk)
2046 {
2047         int      ncgs;           /* number of cylinder groups */
2048         daddr_t target;          /* amount of space sought */
2049         int      cwidth, ctotal; /* current window width and total */
2050         int      bwidth, btotal; /* best window width and total so far */
2051         int      s;     /* index of the first element in the current window */
2052         int      e;     /* index of the first element + the width */
2053                         /*  (i.e. 1 + index of last element) */
2054         int      bs; /* index of the first element in the best window so far */
2055         int      header, max_extents;
2056
2057         target = requested;
2058         ncgs = fs->fs_ncg;
2059
2060         header = sizeof (extent_block_t) - sizeof (extent_t);
2061         max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t);
2062         cwidth = ctotal = 0;
2063         btotal = -1;
2064         bwidth = ncgs;
2065         s = e = 0;
2066         while (e < ncgs) {
2067         /* Advance the end of the window until it accommodates the target. */
2068                 while (ctotal < target && e < ncgs) {
2069                         ctotal += fs->fs_cs(fs, e).cs_nbfree;
2070                         e++;
2071                 }
2072
2073                 /*
2074                  * Advance the start of the window until it no longer
2075                  * accommodates the target.
2076                  */
2077                 while (ctotal >= target && s < e) {
2078                         /* See if this is the smallest window so far. */
2079                         cwidth = e - s;
2080                         if (cwidth <= bwidth) {
2081                                 if (cwidth == bwidth && ctotal <= btotal)
2082                                         goto more;
2083                                 bwidth = cwidth;
2084                                 btotal = ctotal;
2085                                 bs = s;
2086                         }
2087 more:
2088                         ctotal -= fs->fs_cs(fs, s).cs_nbfree;
2089                         s++;
2090                 }
2091         }
2092
2093         /*
2094          * If we cannot allocate the minimum required or we use too many
2095          * extents to do so, return -1.
2096          */
2097         if (btotal < minblk || bwidth > max_extents)
2098                 bs = -1;
2099
2100         return (bs);
2101 }