sys/vfs/ufs/ffs_inode.c

   1 /*
   2  * Copyright (c) 1982, 1986, 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. All advertising materials mentioning features or use of this software
  14  *    must display the following acknowledgement:
  15  *      This product includes software developed by the University of
  16  *      California, Berkeley and its contributors.
  17  * 4. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
  34  * $FreeBSD: src/sys/ufs/ffs/ffs_inode.c,v 1.56.2.5 2002/02/05 18:35:03 dillon Exp $
  35  * $DragonFly: src/sys/vfs/ufs/ffs_inode.c,v 1.24 2007/06/14 02:55:25 dillon Exp $
  36  */
  37
  38 #include "opt_quota.h"
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/mount.h>
  43 #include <sys/proc.h>
  44 #include <sys/buf.h>
  45 #include <sys/vnode.h>
  46 #include <sys/kernel.h>
  47 #include <sys/malloc.h>
  48 #include <sys/resourcevar.h>
  49 #include <sys/vmmeter.h>
  50
  51 #include <vm/vm.h>
  52 #include <vm/vm_extern.h>
  53
  54 #include "quota.h"
  55 #include "ufsmount.h"
  56 #include "inode.h"
  57 #include "ufs_extern.h"
  58
  59 #include "fs.h"
  60 #include "ffs_extern.h"
  61
  62 #include <vm/vm_page2.h>
  63
  64 static int ffs_indirtrunc (struct inode *, ufs_daddr_t, ufs_daddr_t,
  65             ufs_daddr_t, int, long *);
  66
  67 /*
  68  * Update the access, modified, and inode change times as specified by the
  69  * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.  Write the inode
  70  * to disk if the IN_MODIFIED flag is set (it may be set initially, or by
  71  * the timestamp update).  The IN_LAZYMOD flag is set to force a write
  72  * later if not now.  If we write now, then clear both IN_MODIFIED and
  73  * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is
  74  * set, then wait for the write to complete.
  75  */
  76 int
  77 ffs_update(struct vnode *vp, int waitfor)
  78 {
  79         struct fs *fs;
  80         struct buf *bp;
  81         struct inode *ip;
  82         int error;
  83
  84         ufs_itimes(vp);
  85         ip = VTOI(vp);
  86         if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0)
  87                 return (0);
  88         ip->i_flag &= ~(IN_LAZYMOD | IN_MODIFIED);
  89         fs = ip->i_fs;
  90         if (fs->fs_ronly)
  91                 return (0);
  92
  93         /*
  94          * The vnode type is usually set to VBAD if an unrecoverable I/O
  95          * error has occured (such as when reading the inode).  Clear the
  96          * modified bits but do not write anything out in this case.
  97          */
  98         if (vp->v_type == VBAD)
  99                 return (0);
 100         /*
 101          * Ensure that uid and gid are correct. This is a temporary
 102          * fix until fsck has been changed to do the update.
 103          */
 104         if (fs->fs_inodefmt < FS_44INODEFMT) {          /* XXX */
 105                 ip->i_din.di_ouid = ip->i_uid;          /* XXX */
 106                 ip->i_din.di_ogid = ip->i_gid;          /* XXX */
 107         }                                               /* XXX */
 108         error = bread(ip->i_devvp,
 109                       fsbtodoff(fs, ino_to_fsba(fs, ip->i_number)),
 110                       (int)fs->fs_bsize, &bp);
 111         if (error) {
 112                 brelse(bp);
 113                 return (error);
 114         }
 115         if (DOINGSOFTDEP(vp))
 116                 softdep_update_inodeblock(ip, bp, waitfor);
 117         else if (ip->i_effnlink != ip->i_nlink)
 118                 panic("ffs_update: bad link cnt");
 119         *((struct ufs1_dinode *)bp->b_data +
 120             ino_to_fsbo(fs, ip->i_number)) = ip->i_din;
 121         if (waitfor && !DOINGASYNC(vp)) {
 122                 return (bwrite(bp));
 123         } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
 124                 return (bwrite(bp));
 125         } else {
 126                 if (bp->b_bufsize == fs->fs_bsize)
 127                         bp->b_flags |= B_CLUSTEROK;
 128                 bdwrite(bp);
 129                 return (0);
 130         }
 131 }
 132
 133 #define SINGLE  0       /* index of single indirect block */
 134 #define DOUBLE  1       /* index of double indirect block */
 135 #define TRIPLE  2       /* index of triple indirect block */
 136 /*
 137  * Truncate the inode oip to at most length size, freeing the
 138  * disk blocks.
 139  */
 140 int
 141 ffs_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred)
 142 {
 143         struct vnode *ovp = vp;
 144         ufs_daddr_t lastblock;
 145         struct inode *oip;
 146         ufs_daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR];
 147         ufs_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
 148         struct fs *fs;
 149         struct buf *bp;
 150         int offset, size, level;
 151         long count, nblocks, blocksreleased = 0;
 152         int i;
 153         int aflags, error, allerror;
 154         off_t osize;
 155
 156         oip = VTOI(ovp);
 157         fs = oip->i_fs;
 158         if (length < 0)
 159                 return (EINVAL);
 160         if (length > fs->fs_maxfilesize)
 161                 return (EFBIG);
 162         if (ovp->v_type == VLNK &&
 163             (oip->i_size < ovp->v_mount->mnt_maxsymlinklen || oip->i_din.di_blocks == 0)) {
 164 #ifdef DIAGNOSTIC
 165                 if (length != 0)
 166                         panic("ffs_truncate: partial truncate of symlink");
 167 #endif /* DIAGNOSTIC */
 168                 bzero((char *)&oip->i_shortlink, (uint)oip->i_size);
 169                 oip->i_size = 0;
 170                 oip->i_flag |= IN_CHANGE | IN_UPDATE;
 171                 return (ffs_update(ovp, 1));
 172         }
 173         if (oip->i_size == length) {
 174                 oip->i_flag |= IN_CHANGE | IN_UPDATE;
 175                 return (ffs_update(ovp, 0));
 176         }
 177         if (fs->fs_ronly)
 178                 panic("ffs_truncate: read-only filesystem");
 179 #ifdef QUOTA
 180         error = ufs_getinoquota(oip);
 181         if (error)
 182                 return (error);
 183 #endif
 184         ovp->v_lasta = ovp->v_clen = ovp->v_cstart = ovp->v_lastw = 0;
 185         if (DOINGSOFTDEP(ovp)) {
 186                 if (length > 0 || softdep_slowdown(ovp)) {
 187                         /*
 188                          * If a file is only partially truncated, then
 189                          * we have to clean up the data structures
 190                          * describing the allocation past the truncation
 191                          * point. Finding and deallocating those structures
 192                          * is a lot of work. Since partial truncation occurs
 193                          * rarely, we solve the problem by syncing the file
 194                          * so that it will have no data structures left.
 195                          */
 196                         if ((error = VOP_FSYNC(ovp, MNT_WAIT)) != 0)
 197                                 return (error);
 198                 } else {
 199 #ifdef QUOTA
 200                         (void) ufs_chkdq(oip, -oip->i_blocks, NOCRED, 0);
 201 #endif
 202                         softdep_setup_freeblocks(oip, length);
 203                         vinvalbuf(ovp, 0, 0, 0);
 204                         vnode_pager_setsize(ovp, 0);
 205                         oip->i_flag |= IN_CHANGE | IN_UPDATE;
 206                         return (ffs_update(ovp, 0));
 207                 }
 208         }
 209         osize = oip->i_size;
 210         /*
 211          * Lengthen the size of the file. We must ensure that the
 212          * last byte of the file is allocated. Since the smallest
 213          * value of osize is 0, length will be at least 1.
 214          */
 215         if (osize < length) {
 216                 vnode_pager_setsize(ovp, length);
 217                 aflags = B_CLRBUF;
 218                 if (flags & IO_SYNC)
 219                         aflags |= B_SYNC;
 220                 error = VOP_BALLOC(ovp, length - 1, 1,
 221                     cred, aflags, &bp);
 222                 if (error)
 223                         return (error);
 224                 oip->i_size = length;
 225                 if (bp->b_bufsize == fs->fs_bsize)
 226                         bp->b_flags |= B_CLUSTEROK;
 227                 if (aflags & B_SYNC)
 228                         bwrite(bp);
 229                 else
 230                         bawrite(bp);
 231                 oip->i_flag |= IN_CHANGE | IN_UPDATE;
 232                 return (ffs_update(ovp, 1));
 233         }
 234         /*
 235          * Shorten the size of the file. If the file is not being
 236          * truncated to a block boundary, the contents of the
 237          * partial block following the end of the file must be
 238          * zero'ed in case it ever becomes accessible again because
 239          * of subsequent file growth. Directories however are not
 240          * zero'ed as they should grow back initialized to empty.
 241          */
 242         offset = blkoff(fs, length);
 243         if (offset == 0) {
 244                 oip->i_size = length;
 245         } else {
 246                 lbn = lblkno(fs, length);
 247                 aflags = B_CLRBUF;
 248                 if (flags & IO_SYNC)
 249                         aflags |= B_SYNC;
 250                 error = VOP_BALLOC(ovp, length - 1, 1, cred, aflags, &bp);
 251                 if (error) {
 252                         return (error);
 253                 }
 254                 /*
 255                  * When we are doing soft updates and the UFS_BALLOC
 256                  * above fills in a direct block hole with a full sized
 257                  * block that will be truncated down to a fragment below,
 258                  * we must flush out the block dependency with an FSYNC
 259                  * so that we do not get a soft updates inconsistency
 260                  * when we create the fragment below.
 261                  */
 262                 if (DOINGSOFTDEP(ovp) && lbn < NDADDR &&
 263                     fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
 264                     (error = VOP_FSYNC(ovp, MNT_WAIT)) != 0) {
 265                                 return (error);
 266                 }
 267                 oip->i_size = length;
 268                 size = blksize(fs, oip, lbn);
 269                 if (ovp->v_type != VDIR)
 270                         bzero((char *)bp->b_data + offset,
 271                             (uint)(size - offset));
 272                 /* Kirk's code has reallocbuf(bp, size, 1) here */
 273                 allocbuf(bp, size);
 274                 if (bp->b_bufsize == fs->fs_bsize)
 275                         bp->b_flags |= B_CLUSTEROK;
 276                 if (aflags & B_SYNC)
 277                         bwrite(bp);
 278                 else
 279                         bawrite(bp);
 280         }
 281         /*
 282          * Calculate index into inode's block list of
 283          * last direct and indirect blocks (if any)
 284          * which we want to keep.  Lastblock is -1 when
 285          * the file is truncated to 0.
 286          */
 287         lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
 288         lastiblock[SINGLE] = lastblock - NDADDR;
 289         lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
 290         lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
 291         nblocks = btodb(fs->fs_bsize);
 292
 293         /*
 294          * Update file and block pointers on disk before we start freeing
 295          * blocks.  If we crash before free'ing blocks below, the blocks
 296          * will be returned to the free list.  lastiblock values are also
 297          * normalized to -1 for calls to ffs_indirtrunc below.
 298          */
 299         bcopy((caddr_t)&oip->i_db[0], (caddr_t)oldblks, sizeof oldblks);
 300         for (level = TRIPLE; level >= SINGLE; level--)
 301                 if (lastiblock[level] < 0) {
 302                         oip->i_ib[level] = 0;
 303                         lastiblock[level] = -1;
 304                 }
 305         for (i = NDADDR - 1; i > lastblock; i--)
 306                 oip->i_db[i] = 0;
 307         oip->i_flag |= IN_CHANGE | IN_UPDATE;
 308         allerror = ffs_update(ovp, 1);
 309
 310         /*
 311          * Having written the new inode to disk, save its new configuration
 312          * and put back the old block pointers long enough to process them.
 313          * Note that we save the new block configuration so we can check it
 314          * when we are done.
 315          */
 316         bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks);
 317         bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks);
 318         oip->i_size = osize;
 319
 320         error = vtruncbuf(ovp, length, fs->fs_bsize);
 321         if (error && (allerror == 0))
 322                 allerror = error;
 323
 324         /*
 325          * Indirect blocks first.
 326          */
 327         indir_lbn[SINGLE] = -NDADDR;
 328         indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1;
 329         indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
 330         for (level = TRIPLE; level >= SINGLE; level--) {
 331                 bn = oip->i_ib[level];
 332                 if (bn != 0) {
 333                         error = ffs_indirtrunc(oip, indir_lbn[level],
 334                             fsbtodb(fs, bn), lastiblock[level], level, &count);
 335                         if (error)
 336                                 allerror = error;
 337                         blocksreleased += count;
 338                         if (lastiblock[level] < 0) {
 339                                 oip->i_ib[level] = 0;
 340                                 ffs_blkfree(oip, bn, fs->fs_bsize);
 341                                 blocksreleased += nblocks;
 342                         }
 343                 }
 344                 if (lastiblock[level] >= 0)
 345                         goto done;
 346         }
 347
 348         /*
 349          * All whole direct blocks or frags.
 350          */
 351         for (i = NDADDR - 1; i > lastblock; i--) {
 352                 long bsize;
 353
 354                 bn = oip->i_db[i];
 355                 if (bn == 0)
 356                         continue;
 357                 oip->i_db[i] = 0;
 358                 bsize = blksize(fs, oip, i);
 359                 ffs_blkfree(oip, bn, bsize);
 360                 blocksreleased += btodb(bsize);
 361         }
 362         if (lastblock < 0)
 363                 goto done;
 364
 365         /*
 366          * Finally, look for a change in size of the
 367          * last direct block; release any frags.
 368          */
 369         bn = oip->i_db[lastblock];
 370         if (bn != 0) {
 371                 long oldspace, newspace;
 372
 373                 /*
 374                  * Calculate amount of space we're giving
 375                  * back as old block size minus new block size.
 376                  */
 377                 oldspace = blksize(fs, oip, lastblock);
 378                 oip->i_size = length;
 379                 newspace = blksize(fs, oip, lastblock);
 380                 if (newspace == 0)
 381                         panic("ffs_truncate: newspace");
 382                 if (oldspace - newspace > 0) {
 383                         /*
 384                          * Block number of space to be free'd is
 385                          * the old block # plus the number of frags
 386                          * required for the storage we're keeping.
 387                          */
 388                         bn += numfrags(fs, newspace);
 389                         ffs_blkfree(oip, bn, oldspace - newspace);
 390                         blocksreleased += btodb(oldspace - newspace);
 391                 }
 392         }
 393 done:
 394 #ifdef DIAGNOSTIC
 395         for (level = SINGLE; level <= TRIPLE; level++)
 396                 if (newblks[NDADDR + level] != oip->i_ib[level])
 397                         panic("ffs_truncate1");
 398         for (i = 0; i < NDADDR; i++)
 399                 if (newblks[i] != oip->i_db[i])
 400                         panic("ffs_truncate2");
 401         if (length == 0 &&
 402             (!RB_EMPTY(&ovp->v_rbdirty_tree) ||
 403              !RB_EMPTY(&ovp->v_rbclean_tree)))
 404                 panic("ffs_truncate3");
 405 #endif /* DIAGNOSTIC */
 406         /*
 407          * Put back the real size.
 408          */
 409         oip->i_size = length;
 410         oip->i_blocks -= blocksreleased;
 411
 412         if (oip->i_blocks < 0)                  /* sanity */
 413                 oip->i_blocks = 0;
 414         oip->i_flag |= IN_CHANGE;
 415 #ifdef QUOTA
 416         (void) ufs_chkdq(oip, -blocksreleased, NOCRED, 0);
 417 #endif
 418         return (allerror);
 419 }
 420
 421 /*
 422  * Release blocks associated with the inode ip and stored in the indirect
 423  * block bn.  Blocks are free'd in LIFO order up to (but not including)
 424  * lastbn.  If level is greater than SINGLE, the block is an indirect block
 425  * and recursive calls to indirtrunc must be used to cleanse other indirect
 426  * blocks.
 427  *
 428  * NB: triple indirect blocks are untested.
 429  */
 430 static int
 431 ffs_indirtrunc(struct inode *ip, ufs_daddr_t lbn, ufs_daddr_t dbn,
 432                ufs_daddr_t lastbn, int level, long *countp)
 433 {
 434         int i;
 435         struct buf *bp;
 436         struct fs *fs = ip->i_fs;
 437         ufs_daddr_t *bap;
 438         struct vnode *vp;
 439         ufs_daddr_t *copy = NULL, nb, nlbn, last;
 440         long blkcount, factor;
 441         int nblocks, blocksreleased = 0;
 442         int error = 0, allerror = 0;
 443
 444         /*
 445          * Calculate index in current block of last
 446          * block to be kept.  -1 indicates the entire
 447          * block so we need not calculate the index.
 448          */
 449         factor = 1;
 450         for (i = SINGLE; i < level; i++)
 451                 factor *= NINDIR(fs);
 452         last = lastbn;
 453         if (lastbn > 0)
 454                 last /= factor;
 455         nblocks = btodb(fs->fs_bsize);
 456         /*
 457          * Get buffer of block pointers, zero those entries corresponding
 458          * to blocks to be free'd, and update on disk copy first.  Since
 459          * double(triple) indirect before single(double) indirect, calls
 460          * to bmap on these blocks will fail.  However, we already have
 461          * the on disk address, so we have to set the bio_offset field
 462          * explicitly instead of letting bread do everything for us.
 463          */
 464         vp = ITOV(ip);
 465         bp = getblk(vp, lblktodoff(fs, lbn), (int)fs->fs_bsize, 0, 0);
 466         if ((bp->b_flags & B_CACHE) == 0) {
 467                 bp->b_flags &= ~(B_ERROR|B_INVAL);
 468                 bp->b_cmd = BUF_CMD_READ;
 469                 if (bp->b_bcount > bp->b_bufsize)
 470                         panic("ffs_indirtrunc: bad buffer size");
 471                 bp->b_bio2.bio_offset = dbtodoff(fs, dbn);
 472                 vfs_busy_pages(vp, bp);
 473                 /*
 474                  * Access the block device layer using the device vnode
 475                  * and the translated block number (bio2) instead of the
 476                  * file vnode (vp) and logical block number (bio1).
 477                  *
 478                  * Even though we are bypassing the vnode layer, we still
 479                  * want the vnode state to indicate that an I/O on its behalf
 480                  * is in progress.
 481                  */
 482                 bio_start_transaction(&bp->b_bio1, &vp->v_track_read);
 483                 vn_strategy(ip->i_devvp, &bp->b_bio2);
 484                 error = biowait(bp);
 485         }
 486         if (error) {
 487                 brelse(bp);
 488                 *countp = 0;
 489                 return (error);
 490         }
 491
 492         bap = (ufs_daddr_t *)bp->b_data;
 493         if (lastbn != -1) {
 494                 MALLOC(copy, ufs_daddr_t *, fs->fs_bsize, M_TEMP, M_WAITOK);
 495                 bcopy((caddr_t)bap, (caddr_t)copy, (uint)fs->fs_bsize);
 496                 bzero((caddr_t)&bap[last + 1],
 497                     (uint)(NINDIR(fs) - (last + 1)) * sizeof (ufs_daddr_t));
 498                 if (DOINGASYNC(vp)) {
 499                         bawrite(bp);
 500                 } else {
 501                         error = bwrite(bp);
 502                         if (error)
 503                                 allerror = error;
 504                 }
 505                 bap = copy;
 506         }
 507
 508         /*
 509          * Recursively free totally unused blocks.
 510          */
 511         for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
 512             i--, nlbn += factor) {
 513                 nb = bap[i];
 514                 if (nb == 0)
 515                         continue;
 516                 if (level > SINGLE) {
 517                         if ((error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
 518                             (ufs_daddr_t)-1, level - 1, &blkcount)) != 0)
 519                                 allerror = error;
 520                         blocksreleased += blkcount;
 521                 }
 522                 ffs_blkfree(ip, nb, fs->fs_bsize);
 523                 blocksreleased += nblocks;
 524         }
 525
 526         /*
 527          * Recursively free last partial block.
 528          */
 529         if (level > SINGLE && lastbn >= 0) {
 530                 last = lastbn % factor;
 531                 nb = bap[i];
 532                 if (nb != 0) {
 533                         error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
 534                             last, level - 1, &blkcount);
 535                         if (error)
 536                                 allerror = error;
 537                         blocksreleased += blkcount;
 538                 }
 539         }
 540         if (copy != NULL) {
 541                 FREE(copy, M_TEMP);
 542         } else {
 543                 bp->b_flags |= B_INVAL | B_NOCACHE;
 544                 brelse(bp);
 545         }
 546
 547         *countp = blocksreleased;
 548         return (allerror);
 549 }