6334 Cannot unlink files when over quota
[illumos-gate.git] / usr / src / uts / common / fs / ufs / ufs_bmap.c
blobaa224862f559d26b87f780f7c8d1c59a39e5bde0
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/signal.h>
44 #include <sys/user.h>
45 #include <sys/vnode.h>
46 #include <sys/buf.h>
47 #include <sys/disp.h>
48 #include <sys/proc.h>
49 #include <sys/conf.h>
50 #include <sys/fs/ufs_inode.h>
51 #include <sys/fs/ufs_fs.h>
52 #include <sys/fs/ufs_quota.h>
53 #include <sys/fs/ufs_trans.h>
54 #include <sys/fs/ufs_bio.h>
55 #include <vm/seg.h>
56 #include <sys/errno.h>
57 #include <sys/sysmacros.h>
58 #include <sys/vfs.h>
59 #include <sys/debug.h>
60 #include <sys/kmem.h>
61 #include <sys/cmn_err.h>
/*
 * This structure is used to track blocks as we allocate them, so that
 * we can free them if we encounter an error during allocation.  We
 * keep track of five pieces of information for each allocated block:
 *   - The number of the newly allocated block
 *   - The size of the block (lets us deal with fragments if we want)
 *   - The number of the block containing a pointer to it; or whether
 *     the pointer is in the inode
 *   - The offset within the block (or inode) containing a pointer to it.
 *   - A flag indicating the usage of the block.  (Logging needs to know
 *     this to avoid overwriting a data block if it was previously used
 *     for metadata.)
 */

/* Identifies what currently holds the pointer to a tracked block. */
enum ufs_owner_type {
	ufs_no_owner,		/* Owner has not yet been updated */
	ufs_inode_direct,	/* Listed in inode's direct block table */
	ufs_inode_indirect,	/* Listed in inode's indirect block table */
	ufs_indirect_block	/* Listed in an indirect block */
};
84 struct ufs_allocated_block {
85 daddr_t this_block; /* Number of this block */
86 off_t block_size; /* Size of this block, in bytes */
87 enum ufs_owner_type owner; /* Who points to this block? */
88 daddr_t owner_block; /* Number of the owning block */
89 uint_t owner_offset; /* Offset within that block or inode */
90 int usage_flags; /* Usage flags, as expected by free() */
94 static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
95 int maxtrans);
97 static void ufs_undo_allocation(inode_t *ip, int block_count,
98 struct ufs_allocated_block table[], int inode_sector_adjust);
/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 *
 * On exit *(bnp) is either UFS_HOLE or a disk block number; *(lenp) is
 * written only in the latter case.  The macro expands to a compound
 * statement and evaluates several of its arguments more than once, so
 * arguments must be free of side effects.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	daddr32_t *dp = (tblp);						\
	int _chkfrag = chkfrag; /* for lint. sigh */			\
									\
	if (*dp == 0) {							\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		int len;						\
									\
		len = findextent(fs, dp, (int)(n), lenp, maxtrans) <<	\
			(fs)->fs_bshift;				\
		if (_chkfrag) {						\
			u_offset_t tmp;					\
									\
			tmp = fragroundup((fs), size) -			\
			    (((u_offset_t)(lbn)) << (fs)->fs_bshift);	\
			len = (int)MIN(tmp, len);			\
		}							\
		len -= (boff);						\
		if (len <= 0) {						\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb(fs, *dp) + btodb(boff);	\
			*(lenp) = len;					\
		}							\
	}								\
}
/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
 * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * greater than VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS FILE can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000
164 * bmap{read,write} define the structure of file system storage by mapping
165 * a logical offset in a file to a physical block number on the device.
166 * It should be called with a locked inode when allocation is to be
167 * done (bmap_write). Note this strangeness: bmap_write is always called from
168 * getpage(), not putpage(), since getpage() is where all the allocation
169 * is done.
171 * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
173 * NOTICE: the block number returned is the disk block number, not the
174 * file system block number. All the worries about block offsets and
175 * page/block sizes are hidden inside of bmap. Well, not quite,
176 * unfortunately. It's impossible to find one place to hide all this
177 * mess. There are 3 cases:
179 * PAGESIZE < bsize
180 * In this case, the {get,put}page routines will attempt to align to
181 * a file system block boundry (XXX - maybe this is a mistake?). Since
182 * the kluster routines may be out of memory, we don't always get all
183 * the pages we wanted. If we called bmap first, to find out how much
184 * to kluster, we handed in the block aligned offset. If we didn't get
185 * all the pages, we have to chop off the amount we didn't get from the
186 * amount handed back by bmap.
188 * PAGESIZE == bsize
189 * Life is quite pleasant here, no extra work needed, mainly because we
190 * (probably?) won't kluster backwards, just forwards.
192 * PAGESIZE > bsize
193 * This one has a different set of problems, specifically, we may have to
194 * do N reads to fill one page. Let us hope that Sun will stay with small
195 * pages.
197 * Returns 0 on success, or a non-zero errno if an error occurs.
199 * TODO
200 * LMXXX - add a bmap cache. This could be a couple of extents in the
201 * inode. Two is nice for PAGESIZE > bsize.
205 bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
207 daddr_t lbn;
208 ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
209 struct fs *fs = ufsvfsp->vfs_fs;
210 struct buf *bp;
211 int i, j, boff;
212 int shft; /* we maintain sh = 1 << shft */
213 daddr_t ob, nb, tbn;
214 daddr32_t *bap;
215 int nindirshift, nindiroffset;
217 ASSERT(RW_LOCK_HELD(&ip->i_contents));
218 lbn = (daddr_t)lblkno(fs, off);
219 boff = (int)blkoff(fs, off);
220 if (lbn < 0)
221 return (EFBIG);
224 * The first NDADDR blocks are direct blocks.
226 if (lbn < NDADDR) {
227 DOEXTENT(fs, lbn, boff, bnp, lenp,
228 ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
229 ufsvfsp->vfs_iotransz);
230 return (0);
233 nindirshift = ufsvfsp->vfs_nindirshift;
234 nindiroffset = ufsvfsp->vfs_nindiroffset;
236 * Determine how many levels of indirection.
238 shft = 0; /* sh = 1 */
239 tbn = lbn - NDADDR;
240 for (j = NIADDR; j > 0; j--) {
241 longlong_t sh;
243 shft += nindirshift; /* sh *= nindir */
244 sh = 1LL << shft;
245 if (tbn < sh)
246 break;
247 tbn -= sh;
249 if (j == 0)
250 return (EFBIG);
253 * Fetch the first indirect block.
255 nb = ip->i_ib[NIADDR - j];
256 if (nb == 0) {
257 *bnp = UFS_HOLE;
258 return (0);
262 * Fetch through the indirect blocks.
264 for (; j <= NIADDR; j++) {
265 ob = nb;
266 bp = UFS_BREAD(ufsvfsp,
267 ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
268 if (bp->b_flags & B_ERROR) {
269 brelse(bp);
270 return (EIO);
272 bap = bp->b_un.b_daddr;
274 ASSERT(!ufs_indir_badblock(ip, bap));
276 shft -= nindirshift; /* sh / nindir */
277 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
278 nb = bap[i];
279 if (nb == 0) {
280 *bnp = UFS_HOLE;
281 brelse(bp);
282 return (0);
284 if (j != NIADDR)
285 brelse(bp);
287 DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
288 MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
289 0, ufsvfsp->vfs_iotransz);
290 brelse(bp);
291 return (0);
295 * See bmap_read for general notes.
297 * The block must be at least size bytes and will be extended or
298 * allocated as needed. If alloc_type is of type BI_ALLOC_ONLY, then bmap
299 * will not create any in-core pages that correspond to the new disk allocation.
300 * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
301 * and security is maintained b/c upon reading a negative block number pages
302 * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
303 * be created and initialized as needed.
305 * Returns 0 on success, or a non-zero errno if an error occurs.
308 bmap_write(struct inode *ip, u_offset_t off, int size,
309 enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
311 struct fs *fs;
312 struct buf *bp;
313 int i;
314 struct buf *nbp;
315 int j;
316 int shft; /* we maintain sh = 1 << shft */
317 daddr_t ob, nb, pref, lbn, llbn, tbn;
318 daddr32_t *bap;
319 struct vnode *vp = ITOV(ip);
320 long bsize = VBSIZE(vp);
321 long osize, nsize;
322 int issync, metaflag, isdirquota;
323 int err;
324 dev_t dev;
325 struct fbuf *fbp;
326 int nindirshift;
327 int nindiroffset;
328 struct ufsvfs *ufsvfsp;
329 int added_sectors; /* sectors added to this inode */
330 int alloced_blocks; /* fs blocks newly allocated */
331 struct ufs_allocated_block undo_table[NIADDR+1];
332 int verylargefile = 0;
334 ASSERT(RW_WRITE_HELD(&ip->i_contents));
336 if (allocblk)
337 *allocblk = 0;
339 ufsvfsp = ip->i_ufsvfs;
340 fs = ufsvfsp->vfs_bufp->b_un.b_fs;
341 lbn = (daddr_t)lblkno(fs, off);
342 if (lbn < 0)
343 return (EFBIG);
344 if (ip->i_blocks >= VERYLARGEFILESIZE)
345 verylargefile = 1;
346 llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
347 metaflag = isdirquota = 0;
348 if (((ip->i_mode & IFMT) == IFDIR) ||
349 ((ip->i_mode & IFMT) == IFATTRDIR))
350 isdirquota = metaflag = I_DIR;
351 else if ((ip->i_mode & IFMT) == IFSHAD)
352 metaflag = I_SHAD;
353 else if (ip->i_ufsvfs->vfs_qinod == ip)
354 isdirquota = metaflag = I_QUOTA;
356 issync = ((ip->i_flag & ISYNC) != 0);
358 if (isdirquota || issync) {
359 alloc_type = BI_NORMAL; /* make sure */
363 * If the next write will extend the file into a new block,
364 * and the file is currently composed of a fragment
365 * this fragment has to be extended to be a full block.
367 if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
368 osize = blksize(fs, ip, llbn);
369 if (osize < bsize && osize > 0) {
371 * Check to see if doing this will make the file too
372 * big. Only check if we are dealing with a very
373 * large file.
375 if (verylargefile == 1) {
376 if (((unsigned)ip->i_blocks +
377 btodb(bsize - osize)) > INT_MAX) {
378 return (EFBIG);
382 * Make sure we have all needed pages setup correctly.
384 * We pass S_OTHER to fbread here because we want
385 * an exclusive lock on the page in question
386 * (see ufs_getpage). I/O to the old block location
387 * may still be in progress and we are about to free
388 * the old block. We don't want anyone else to get
389 * a hold of the old block once we free it until
390 * the I/O is complete.
392 err =
393 fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
394 (uint_t)bsize, S_OTHER, &fbp);
395 if (err)
396 return (err);
397 pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
398 err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
399 &nb, cr);
400 if (err) {
401 if (fbp)
402 fbrelse(fbp, S_OTHER);
403 return (err);
405 ASSERT(!ufs_badblock(ip, nb));
408 * Update the inode before releasing the
409 * lock on the page. If we released the page
410 * lock first, the data could be written to it's
411 * old address and then destroyed.
413 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
414 ip->i_db[llbn] = nb;
415 UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
416 ip);
417 ip->i_blocks += btodb(bsize - osize);
418 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
419 TRANS_INODE(ufsvfsp, ip);
420 ip->i_flag |= IUPD | ICHG | IATTCHG;
422 /* Caller is responsible for updating i_seq */
424 * Don't check metaflag here, directories won't do this
427 if (issync) {
428 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
429 } else {
430 ASSERT(fbp);
431 fbrelse(fbp, S_WRITE);
434 if (nb != ob) {
435 (void) free(ip, ob, (off_t)osize, metaflag);
441 * The first NDADDR blocks are direct blocks.
443 if (lbn < NDADDR) {
444 nb = ip->i_db[lbn];
445 if (nb == 0 ||
446 ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
447 if (nb != 0) {
448 /* consider need to reallocate a frag */
449 osize = fragroundup(fs, blkoff(fs, ip->i_size));
450 nsize = fragroundup(fs, size);
451 if (nsize <= osize)
452 goto gotit;
454 * Check to see if doing this will make the
455 * file too big. Only check if we are dealing
456 * with a very large file.
458 if (verylargefile == 1) {
459 if (((unsigned)ip->i_blocks +
460 btodb(nsize - osize)) > INT_MAX) {
461 return (EFBIG);
465 * need to re-allocate a block or frag
467 ob = nb;
468 pref = blkpref(ip, lbn, (int)lbn,
469 &ip->i_db[0]);
470 err = realloccg(ip, ob, pref, (int)osize,
471 (int)nsize, &nb, cr);
472 if (err)
473 return (err);
474 if (allocblk)
475 *allocblk = nb;
476 ASSERT(!ufs_badblock(ip, nb));
478 } else {
480 * need to allocate a block or frag
482 osize = 0;
483 if (ip->i_size <
484 ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
485 nsize = fragroundup(fs, size);
486 else
487 nsize = bsize;
489 * Check to see if doing this will make the
490 * file too big. Only check if we are dealing
491 * with a very large file.
493 if (verylargefile == 1) {
494 if (((unsigned)ip->i_blocks +
495 btodb(nsize - osize)) > INT_MAX) {
496 return (EFBIG);
499 pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
500 err = alloc(ip, pref, (int)nsize, &nb, cr);
501 if (err)
502 return (err);
503 if (allocblk)
504 *allocblk = nb;
505 ASSERT(!ufs_badblock(ip, nb));
506 ob = nb;
510 * Read old/create new zero pages
512 fbp = NULL;
513 if (osize == 0) {
515 * mmap S_WRITE faults always enter here
518 * We zero it if its also BI_FALLOCATE, but
519 * only for direct blocks!
521 if (alloc_type == BI_NORMAL ||
522 alloc_type == BI_FALLOCATE ||
523 P2ROUNDUP_TYPED(size,
524 PAGESIZE, u_offset_t) < nsize) {
525 /* fbzero doesn't cause a pagefault */
526 fbzero(ITOV(ip),
527 ((offset_t)lbn << fs->fs_bshift),
528 (uint_t)nsize, &fbp);
530 } else {
531 err = fbread(vp,
532 ((offset_t)lbn << fs->fs_bshift),
533 (uint_t)nsize, S_OTHER, &fbp);
534 if (err) {
535 if (nb != ob) {
536 (void) free(ip, nb,
537 (off_t)nsize, metaflag);
538 } else {
539 (void) free(ip,
540 ob + numfrags(fs, osize),
541 (off_t)(nsize - osize),
542 metaflag);
544 ASSERT(nsize >= osize);
545 (void) chkdq(ip,
546 -(long)btodb(nsize - osize),
547 0, cr, (char **)NULL,
548 (size_t *)NULL);
549 return (err);
552 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
553 ip->i_db[lbn] = nb;
554 ip->i_blocks += btodb(nsize - osize);
555 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
556 TRANS_INODE(ufsvfsp, ip);
557 ip->i_flag |= IUPD | ICHG | IATTCHG;
559 /* Caller is responsible for updating i_seq */
562 * Write directory and shadow blocks synchronously so
563 * that they never appear with garbage in them on the
564 * disk.
567 if (isdirquota && (ip->i_size ||
568 TRANS_ISTRANS(ufsvfsp))) {
570 * XXX man not be necessary with harpy trans
571 * bug id 1130055
573 (void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
574 } else if (fbp) {
575 fbrelse(fbp, S_WRITE);
578 if (nb != ob)
579 (void) free(ip, ob, (off_t)osize, metaflag);
581 gotit:
582 return (0);
585 added_sectors = alloced_blocks = 0; /* No blocks alloced yet */
588 * Determine how many levels of indirection.
590 nindirshift = ip->i_ufsvfs->vfs_nindirshift;
591 nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
592 pref = 0;
593 shft = 0; /* sh = 1 */
594 tbn = lbn - NDADDR;
595 for (j = NIADDR; j > 0; j--) {
596 longlong_t sh;
598 shft += nindirshift; /* sh *= nindir */
599 sh = 1LL << shft;
600 if (tbn < sh)
601 break;
602 tbn -= sh;
605 if (j == 0)
606 return (EFBIG);
609 * Fetch the first indirect block.
611 dev = ip->i_dev;
612 nb = ip->i_ib[NIADDR - j];
613 if (nb == 0) {
615 * Check to see if doing this will make the
616 * file too big. Only check if we are dealing
617 * with a very large file.
619 if (verylargefile == 1) {
620 if (((unsigned)ip->i_blocks + btodb(bsize))
621 > INT_MAX) {
622 return (EFBIG);
626 * Need to allocate an indirect block.
628 pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
629 err = alloc(ip, pref, (int)bsize, &nb, cr);
630 if (err)
631 return (err);
632 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
633 ASSERT(!ufs_badblock(ip, nb));
636 * Keep track of this allocation so we can undo it if we
637 * get an error later.
640 ASSERT(alloced_blocks <= NIADDR);
642 undo_table[alloced_blocks].this_block = nb;
643 undo_table[alloced_blocks].block_size = bsize;
644 undo_table[alloced_blocks].owner = ufs_no_owner;
645 undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
647 alloced_blocks++;
650 * Write zero block synchronously so that
651 * indirect blocks never point at garbage.
653 bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);
655 clrbuf(bp);
656 /* XXX Maybe special-case this? */
657 TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
658 UFS_BWRITE2(ufsvfsp, bp);
659 if (bp->b_flags & B_ERROR) {
660 err = geterror(bp);
661 brelse(bp);
662 ufs_undo_allocation(ip, alloced_blocks,
663 undo_table, added_sectors);
664 return (err);
666 brelse(bp);
668 ip->i_ib[NIADDR - j] = nb;
669 added_sectors += btodb(bsize);
670 ip->i_blocks += btodb(bsize);
671 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
672 TRANS_INODE(ufsvfsp, ip);
673 ip->i_flag |= IUPD | ICHG | IATTCHG;
674 /* Caller is responsible for updating i_seq */
677 * Update the 'undo table' now that we've linked this block
678 * to an inode.
681 undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
682 undo_table[alloced_blocks-1].owner_offset = NIADDR - j;
685 * In the ISYNC case, wrip will notice that the block
686 * count on the inode has changed and will be sure to
687 * ufs_iupdat the inode at the end of wrip.
692 * Fetch through the indirect blocks.
694 for (; j <= NIADDR; j++) {
695 ob = nb;
696 bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);
698 if (bp->b_flags & B_ERROR) {
699 err = geterror(bp);
700 brelse(bp);
702 * Return any partial allocations.
704 * It is possible that we have not yet made any
705 * allocations at this point (if this is the first
706 * pass through the loop and we didn't have to
707 * allocate the first indirect block, above).
708 * In this case, alloced_blocks and added_sectors will
709 * be zero, and ufs_undo_allocation will do nothing.
711 ufs_undo_allocation(ip, alloced_blocks,
712 undo_table, added_sectors);
713 return (err);
715 bap = bp->b_un.b_daddr;
716 shft -= nindirshift; /* sh /= nindir */
717 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
718 nb = bap[i];
720 if (nb == 0) {
722 * Check to see if doing this will make the
723 * file too big. Only check if we are dealing
724 * with a very large file.
726 if (verylargefile == 1) {
727 if (((unsigned)ip->i_blocks + btodb(bsize))
728 > INT_MAX) {
729 brelse(bp);
730 ufs_undo_allocation(ip, alloced_blocks,
731 undo_table, added_sectors);
732 return (EFBIG);
735 if (pref == 0) {
736 if (j < NIADDR) {
737 /* Indirect block */
738 pref = blkpref(ip, lbn, 0,
739 (daddr32_t *)0);
740 } else {
741 /* Data block */
742 pref = blkpref(ip, lbn, i, &bap[0]);
747 * release "bp" buf to avoid deadlock (re-bread later)
749 brelse(bp);
751 err = alloc(ip, pref, (int)bsize, &nb, cr);
752 if (err) {
754 * Return any partial allocations.
756 ufs_undo_allocation(ip, alloced_blocks,
757 undo_table, added_sectors);
758 return (err);
761 ASSERT(!ufs_badblock(ip, nb));
762 ASSERT(alloced_blocks <= NIADDR);
764 if (allocblk)
765 *allocblk = nb;
767 undo_table[alloced_blocks].this_block = nb;
768 undo_table[alloced_blocks].block_size = bsize;
769 undo_table[alloced_blocks].owner = ufs_no_owner;
770 undo_table[alloced_blocks].usage_flags = metaflag |
771 ((j < NIADDR) ? I_IBLK : 0);
773 alloced_blocks++;
775 if (j < NIADDR) {
776 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
778 * Write synchronously so indirect
779 * blocks never point at garbage.
781 nbp = UFS_GETBLK(
782 ufsvfsp, dev, fsbtodb(fs, nb), bsize);
784 clrbuf(nbp);
785 /* XXX Maybe special-case this? */
786 TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
787 UFS_BWRITE2(ufsvfsp, nbp);
788 if (nbp->b_flags & B_ERROR) {
789 err = geterror(nbp);
790 brelse(nbp);
792 * Return any partial
793 * allocations.
795 ufs_undo_allocation(ip,
796 alloced_blocks,
797 undo_table, added_sectors);
798 return (err);
800 brelse(nbp);
801 } else if (alloc_type == BI_NORMAL ||
802 P2ROUNDUP_TYPED(size,
803 PAGESIZE, u_offset_t) < bsize) {
804 TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
805 fbzero(ITOV(ip),
806 ((offset_t)lbn << fs->fs_bshift),
807 (uint_t)bsize, &fbp);
810 * Cases which we need to do a synchronous
811 * write of the zeroed data pages:
813 * 1) If we are writing a directory then we
814 * want to write synchronously so blocks in
815 * directories never contain garbage.
817 * 2) If we are filling in a hole and the
818 * indirect block is going to be synchronously
819 * written back below we need to make sure
820 * that the zeroes are written here before
821 * the indirect block is updated so that if
822 * we crash before the real data is pushed
823 * we will not end up with random data is
824 * the middle of the file.
826 * 3) If the size of the request rounded up
827 * to the system page size is smaller than
828 * the file system block size, we want to
829 * write out all the pages now so that
830 * they are not aborted before they actually
831 * make it to ufs_putpage since the length
832 * of the inode will not include the pages.
835 if (isdirquota || (issync &&
836 lbn < llbn))
837 (void) ufs_fbiwrite(fbp, ip, nb,
838 fs->fs_fsize);
839 else
840 fbrelse(fbp, S_WRITE);
844 * re-acquire "bp" buf
846 bp = UFS_BREAD(ufsvfsp,
847 ip->i_dev, fsbtodb(fs, ob), bsize);
848 if (bp->b_flags & B_ERROR) {
849 err = geterror(bp);
850 brelse(bp);
852 * Return any partial allocations.
854 ufs_undo_allocation(ip,
855 alloced_blocks,
856 undo_table, added_sectors);
857 return (err);
859 bap = bp->b_un.b_daddr;
860 bap[i] = nb;
863 * The magic explained: j will be equal to NIADDR
864 * when we are at the lowest level, this is where the
865 * array entries point directly to data blocks. Since
866 * we will be 'fallocate'ing we will go ahead and negate
867 * the addresses.
869 if (alloc_type == BI_FALLOCATE && j == NIADDR)
870 bap[i] = -bap[i];
872 TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
873 added_sectors += btodb(bsize);
874 ip->i_blocks += btodb(bsize);
875 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
876 TRANS_INODE(ufsvfsp, ip);
877 ip->i_flag |= IUPD | ICHG | IATTCHG;
879 /* Caller is responsible for updating i_seq */
881 undo_table[alloced_blocks-1].owner =
882 ufs_indirect_block;
883 undo_table[alloced_blocks-1].owner_block = ob;
884 undo_table[alloced_blocks-1].owner_offset = i;
886 if (issync) {
887 UFS_BWRITE2(ufsvfsp, bp);
888 if (bp->b_flags & B_ERROR) {
889 err = geterror(bp);
890 brelse(bp);
892 * Return any partial
893 * allocations.
895 ufs_undo_allocation(ip,
896 alloced_blocks,
897 undo_table, added_sectors);
898 return (err);
900 brelse(bp);
901 } else {
902 bdrwrite(bp);
904 } else {
905 brelse(bp);
908 return (0);
912 * Return 1 if inode has unmapped blocks (UFS holes) or if another thread
913 * is in the critical region of wrip().
916 bmap_has_holes(struct inode *ip)
918 struct fs *fs = ip->i_fs;
919 uint_t dblks; /* # of data blocks */
920 uint_t mblks; /* # of data + metadata blocks */
921 int nindirshift;
922 int nindiroffset;
923 uint_t cnt;
924 int n, j, shft;
925 uint_t nindirblks;
927 int fsbshift = fs->fs_bshift;
928 int fsboffset = (1 << fsbshift) - 1;
931 * Check for writer in critical region, if found then we
932 * cannot trust the values of i_size and i_blocks
933 * simply return true.
935 if (ip->i_writer != NULL && ip->i_writer != curthread) {
936 return (1);
939 dblks = (ip->i_size + fsboffset) >> fsbshift;
940 mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
943 * File has only direct blocks.
945 if (dblks <= NDADDR)
946 return (mblks < dblks);
947 nindirshift = ip->i_ufsvfs->vfs_nindirshift;
949 nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
950 nindirblks = nindiroffset + 1;
952 dblks -= NDADDR;
953 shft = 0;
955 * Determine how many levels of indirection.
957 for (j = NIADDR; j > 0; j--) {
958 longlong_t sh;
960 shft += nindirshift; /* sh *= nindir */
961 sh = 1LL << shft;
962 if (dblks <= sh)
963 break;
964 dblks -= sh;
966 /* LINTED: warning: logical expression always true: op "||" */
967 ASSERT(NIADDR <= 3);
968 ASSERT(j <= NIADDR);
969 if (j == NIADDR) /* single level indirection */
970 cnt = NDADDR + 1 + dblks;
971 else if (j == NIADDR-1) /* double indirection */
972 cnt = NDADDR + 1 + nindirblks +
973 1 + (dblks + nindiroffset)/nindirblks + dblks;
974 else if (j == NIADDR-2) { /* triple indirection */
975 n = (dblks + nindiroffset)/nindirblks;
976 cnt = NDADDR + 1 + nindirblks +
977 1 + nindirblks + nindirblks*nindirblks +
978 1 + (n + nindiroffset)/nindirblks + n + dblks;
981 return (mblks < cnt);
985 * find some contig blocks starting at *sbp and going for min(n, max_contig)
986 * return the number of blocks (not frags) found.
987 * The array passed in must be at least [0..n-1].
989 static int
990 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
992 register daddr_t bn, nextbn;
993 register daddr32_t *bp;
994 register int diff;
995 int maxtransblk;
997 if (n <= 0)
998 return (0);
999 bn = *sbp;
1000 if (bn == 0)
1001 return (0);
1003 diff = fs->fs_frag;
1004 if (*lenp) {
1005 n = MIN(n, lblkno(fs, *lenp));
1006 } else {
1008 * If the user has set the value for maxcontig lower than
1009 * the drive transfer size, then assume they want this
1010 * to be the maximum value for the size of the data transfer.
1012 maxtransblk = maxtransfer >> DEV_BSHIFT;
1013 if (fs->fs_maxcontig < maxtransblk) {
1014 n = MIN(n, fs->fs_maxcontig);
1015 } else {
1016 n = MIN(n, maxtransblk);
1019 bp = sbp;
1020 while (--n > 0) {
1021 nextbn = *(bp + 1);
1022 if (nextbn == 0 || bn + diff != nextbn)
1023 break;
1024 bn = nextbn;
1025 bp++;
1027 return ((int)(bp - sbp) + 1);
1031 * Free any blocks which had been successfully allocated. Always called
1032 * as a result of an error, so we don't bother returning an error code
1033 * from here.
1035 * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1036 * Thus it is safe to call this as part of error handling, whether or not
1037 * any blocks have been allocated.
1039 * The ufs_inode_direct case is currently unused.
1042 static void
1043 ufs_undo_allocation(
1044 inode_t *ip,
1045 int block_count,
1046 struct ufs_allocated_block table[],
1047 int inode_sector_adjust)
1049 int i;
1050 int inode_changed;
1051 int error_updating_pointers;
1052 struct ufsvfs *ufsvfsp;
1054 inode_changed = 0;
1055 error_updating_pointers = 0;
1057 ufsvfsp = ip->i_ufsvfs;
1060 * Update pointers on disk before freeing blocks. If we fail,
1061 * some blocks may remain busy; but they will be reclaimed by
1062 * an fsck. (This is better than letting a block wind up with
1063 * two owners if we successfully freed it but could not remove
1064 * the pointer to it.)
1067 for (i = 0; i < block_count; i++) {
1068 switch (table[i].owner) {
1069 case ufs_no_owner:
1070 /* Nothing to do here, nobody points to us */
1071 break;
1072 case ufs_inode_direct:
1073 ASSERT(table[i].owner_offset < NDADDR);
1074 ip->i_db[table[i].owner_offset] = 0;
1075 inode_changed = 1;
1076 break;
1077 case ufs_inode_indirect:
1078 ASSERT(table[i].owner_offset < NIADDR);
1079 ip->i_ib[table[i].owner_offset] = 0;
1080 inode_changed = 1;
1081 break;
1082 case ufs_indirect_block: {
1083 buf_t *bp;
1084 daddr32_t *block_data;
1086 /* Read/modify/log/write. */
1088 ASSERT(table[i].owner_offset <
1089 (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
1091 bp = UFS_BREAD(ufsvfsp, ip->i_dev,
1092 fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
1093 VBSIZE(ITOV(ip)));
1095 if (bp->b_flags & B_ERROR) {
1096 /* Couldn't read this block; give up. */
1097 error_updating_pointers = 1;
1098 brelse(bp);
1099 break; /* out of SWITCH */
1102 block_data = bp->b_un.b_daddr;
1103 block_data[table[i].owner_offset] = 0;
1105 /* Write a log entry which includes the zero. */
1106 /* It might be possible to optimize this by using */
1107 /* TRANS_BUF directly and zeroing only the four */
1108 /* bytes involved, but an attempt to do that led */
1109 /* to panics in the logging code. The attempt was */
1110 /* TRANS_BUF(ufsvfsp, */
1111 /* table[i].owner_offset * sizeof (daddr32_t), */
1112 /* sizeof (daddr32_t), */
1113 /* bp, */
1114 /* DT_ABZERO); */
1116 TRANS_BUF_ITEM_128(ufsvfsp,
1117 block_data[table[i].owner_offset],
1118 block_data, bp, DT_AB);
1120 /* Now we can write the buffer itself. */
1122 UFS_BWRITE2(ufsvfsp, bp);
1124 if (bp->b_flags & B_ERROR) {
1125 error_updating_pointers = 1;
1128 brelse(bp);
1129 break;
1131 default:
1132 (void) ufs_fault(ITOV(ip),
1133 "ufs_undo_allocation failure\n");
1134 break;
1139 * If the inode changed, or if we need to update its block count,
1140 * then do that now. We update the inode synchronously on disk
1141 * to ensure that it won't transiently point at a block we've
1142 * freed (only necessary if we're not logging).
1144 * NOTE: Currently ufs_iupdat() does not check for errors. When
1145 * it is fixed, we should verify that we successfully updated the
1146 * inode before freeing blocks below.
1149 if (inode_changed || (inode_sector_adjust != 0)) {
1150 ip->i_blocks -= inode_sector_adjust;
1151 ASSERT((unsigned)ip->i_blocks <= INT_MAX);
1152 TRANS_INODE(ufsvfsp, ip);
1153 ip->i_flag |= IUPD | ICHG | IATTCHG;
1154 ip->i_seq++;
1155 if (!TRANS_ISTRANS(ufsvfsp))
1156 ufs_iupdat(ip, I_SYNC);
1160 * Now we go through and actually free the blocks, but only if we
1161 * successfully removed the pointers to them.
1164 if (!error_updating_pointers) {
1165 for (i = 0; i < block_count; i++) {
1166 free(ip, table[i].this_block, table[i].block_size,
1167 table[i].usage_flags);
1173 * Find the next hole or data block in file starting at *off
1174 * Return found offset in *off, which can be less than the
1175 * starting offset if not block aligned.
1176 * This code is based on bmap_read().
1177 * Errors: ENXIO for end of file
1178 * EIO for block read error.
1181 bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
1183 ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
1184 struct fs *fs = ufsvfsp->vfs_fs;
1185 buf_t *bp[NIADDR];
1186 int i, j;
1187 int shft; /* we maintain sh = 1 << shft */
1188 int nindirshift, nindiroffset;
1189 daddr_t ob, nb, tbn, lbn, skip;
1190 daddr32_t *bap;
1191 u_offset_t isz = (offset_t)ip->i_size;
1192 int32_t bs = fs->fs_bsize; /* file system block size */
1193 int32_t nindir = fs->fs_nindir;
1194 dev_t dev;
1195 int error = 0;
1196 daddr_t limits[NIADDR];
1198 ASSERT(*off < isz);
1199 ASSERT(RW_LOCK_HELD(&ip->i_contents));
1200 lbn = (daddr_t)lblkno(fs, *off);
1201 ASSERT(lbn >= 0);
1203 for (i = 0; i < NIADDR; i++)
1204 bp[i] = NULL;
1207 * The first NDADDR blocks are direct blocks.
1209 if (lbn < NDADDR) {
1210 for (; lbn < NDADDR; lbn++) {
1211 if ((hole && (ip->i_db[lbn] == 0)) ||
1212 (!hole && (ip->i_db[lbn] != 0))) {
1213 goto out;
1216 if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1217 goto out;
1220 nindir = fs->fs_nindir;
1221 nindirshift = ufsvfsp->vfs_nindirshift;
1222 nindiroffset = ufsvfsp->vfs_nindiroffset;
1223 dev = ip->i_dev;
1225 /* Set up limits array */
1226 for (limits[0] = NDADDR, j = 1; j < NIADDR; j++)
1227 limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
1229 loop:
1231 * Determine how many levels of indirection.
1233 shft = 0; /* sh = 1 */
1234 tbn = lbn - NDADDR;
1235 for (j = NIADDR; j > 0; j--) {
1236 longlong_t sh;
1238 shft += nindirshift; /* sh *= nindir */
1239 sh = 1LL << shft;
1240 if (tbn < sh)
1241 break;
1242 tbn -= sh;
1244 if (j == 0) {
1245 /* must have passed end of file */
1246 ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
1247 goto out;
1251 * Fetch the first indirect block.
1253 nb = ip->i_ib[NIADDR - j];
1254 if (nb == 0) {
1255 if (hole) {
1256 lbn = limits[NIADDR - j];
1257 goto out;
1258 } else {
1259 lbn = limits[NIADDR - j + 1];
1260 if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1261 goto out;
1262 goto loop;
1267 * Fetch through the indirect blocks.
1269 for (; ((j <= NIADDR) && (nb != 0)); j++) {
1270 ob = nb;
1272 * if there's a different block at this level then release
1273 * the old one and in with the new.
1275 if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
1276 if (bp[j-1] != NULL)
1277 brelse(bp[j-1]);
1278 bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
1279 if (bp[j-1]->b_flags & B_ERROR) {
1280 error = EIO;
1281 goto out;
1284 bap = bp[j-1]->b_un.b_daddr;
1286 shft -= nindirshift; /* sh / nindir */
1287 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1288 nb = bap[i];
1289 skip = 1LL << (nindirshift * (NIADDR - j));
1293 * Scan through the blocks in this array.
1295 for (; i < nindir; i++, lbn += skip) {
1296 if (hole && (bap[i] == 0))
1297 goto out;
1298 if (!hole && (bap[i] != 0)) {
1299 if (skip == 1) {
1300 /* we're at the lowest level */
1301 goto out;
1302 } else {
1303 goto loop;
1307 if (((u_offset_t)lbn << fs->fs_bshift) < isz)
1308 goto loop;
1309 out:
1310 for (i = 0; i < NIADDR; i++) {
1311 if (bp[i])
1312 brelse(bp[i]);
1314 if (error == 0) {
1315 if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
1316 error = ENXIO;
1317 } else {
1318 /* success */
1319 *off = (u_offset_t)lbn << fs->fs_bshift;
1322 return (error);
1326 * Set a particular offset in the inode list to be a certain block.
1327 * User is responsible for calling TRANS* functions
1330 bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
1332 daddr_t lbn;
1333 struct inode *ip;
1334 ufsvfs_t *ufsvfsp;
1335 struct fs *fs;
1336 struct buf *bp;
1337 int i, j;
1338 int shft; /* we maintain sh = 1 << shft */
1339 int err;
1340 daddr_t ob, nb, tbn;
1341 daddr32_t *bap;
1342 int nindirshift, nindiroffset;
1344 ip = VTOI(vp);
1345 ufsvfsp = ip->i_ufsvfs;
1346 fs = ufsvfsp->vfs_fs;
1347 lbn = (daddr_t)lblkno(fs, off);
1349 ASSERT(RW_LOCK_HELD(&ip->i_contents));
1351 if (lbn < 0)
1352 return (EFBIG);
1355 * Take care of direct block assignment
1357 if (lbn < NDADDR) {
1358 ip->i_db[lbn] = bn;
1359 return (0);
1362 nindirshift = ip->i_ufsvfs->vfs_nindirshift;
1363 nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
1365 * Determine how many levels of indirection.
1367 shft = 0; /* sh = 1 */
1368 tbn = lbn - NDADDR;
1369 for (j = NIADDR; j > 0; j--) {
1370 longlong_t sh;
1372 shft += nindirshift; /* sh *= nindir */
1373 sh = 1LL << shft;
1374 if (tbn < sh)
1375 break;
1376 tbn -= sh;
1378 if (j == 0)
1379 return (EFBIG);
1382 * Fetch the first indirect block.
1384 nb = ip->i_ib[NIADDR - j];
1385 if (nb == 0) {
1386 err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1387 return (err);
1391 * Fetch through the indirect blocks.
1393 for (; j <= NIADDR; j++) {
1394 ob = nb;
1395 bp = UFS_BREAD(ufsvfsp,
1396 ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
1397 if (bp->b_flags & B_ERROR) {
1398 err = geterror(bp);
1399 brelse(bp);
1400 return (err);
1402 bap = bp->b_un.b_daddr;
1404 ASSERT(!ufs_indir_badblock(ip, bap));
1406 shft -= nindirshift; /* sh / nindir */
1407 i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1409 nb = bap[i];
1410 if (nb == 0) {
1411 err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1412 return (err);
1415 if (j == NIADDR) {
1416 bap[i] = bn;
1417 bdrwrite(bp);
1418 return (0);
1421 brelse(bp);
1423 return (0);