2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95
30 * $FreeBSD: src/sys/ufs/ffs/ffs_alloc.c,v 1.64.2.2 2001/09/21 19:15:21 dillon Exp $
33 #include "opt_quota.h"
35 #include <sys/param.h>
36 #include <sys/systm.h>
39 #include <sys/malloc.h>
41 #include <sys/vnode.h>
42 #include <sys/mount.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/syslog.h>
47 #include <sys/taskqueue.h>
48 #include <machine/inttypes.h>
54 #include "ufs_extern.h"
58 #include "ffs_extern.h"
60 typedef ufs_daddr_t
allocfcn_t (struct inode
*ip
, int cg
, ufs_daddr_t bpref
,
63 static ufs_daddr_t
ffs_alloccg (struct inode
*, int, ufs_daddr_t
, int);
65 ffs_alloccgblk (struct inode
*, struct buf
*, ufs_daddr_t
);
66 static void ffs_blkfree_cg(struct fs
*, struct vnode
*, cdev_t
, ino_t
,
67 uint32_t , ufs_daddr_t
, long );
69 static int ffs_checkblk (struct inode
*, ufs_daddr_t
, long);
71 static void ffs_clusteracct (struct fs
*, struct cg
*, ufs_daddr_t
,
73 static ufs_daddr_t
ffs_clusteralloc (struct inode
*, int, ufs_daddr_t
,
75 static ino_t
ffs_dirpref (struct inode
*);
76 static ufs_daddr_t
ffs_fragextend (struct inode
*, int, long, int, int);
77 static void ffs_fserr (struct fs
*, uint
, char *);
78 static u_long ffs_hashalloc
79 (struct inode
*, int, long, int, allocfcn_t
*);
80 static ino_t
ffs_nodealloccg (struct inode
*, int, ufs_daddr_t
, int);
81 static ufs_daddr_t
ffs_mapsearch (struct fs
*, struct cg
*, ufs_daddr_t
,
85 * Allocate a block in the filesystem.
87 * The size of the requested block is given, which must be some
88 * multiple of fs_fsize and <= fs_bsize.
89 * A preference may be optionally specified. If a preference is given
90 * the following hierarchy is used to allocate a block:
91 * 1) allocate the requested block.
92 * 2) allocate a rotationally optimal block in the same cylinder.
93 * 3) allocate a block in the same cylinder group.
94 * 4) quadratically rehash into other cylinder groups, until an
95 * available block is located.
96 * If no block preference is given the following hierarchy is used
97 * to allocate a block:
98 * 1) allocate a block in the cylinder group that contains the
100 * 2) quadratically rehash into other cylinder groups, until an
101 * available block is located.
104 ffs_alloc(struct inode
*ip
, ufs_daddr_t lbn
, ufs_daddr_t bpref
, int size
,
105 struct ucred
*cred
, ufs_daddr_t
*bnp
)
117 if ((uint
)size
> fs
->fs_bsize
|| fragoff(fs
, size
) != 0) {
118 kprintf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
119 devtoname(ip
->i_dev
), (long)fs
->fs_bsize
, size
,
121 panic("ffs_alloc: bad size");
124 panic("ffs_alloc: missing credential");
125 #endif /* DIAGNOSTIC */
126 if (size
== fs
->fs_bsize
&& fs
->fs_cstotal
.cs_nbfree
== 0)
128 if (cred
->cr_uid
!= 0 &&
129 freespace(fs
, fs
->fs_minfree
) - numfrags(fs
, size
) < 0)
132 error
= ufs_chkdq(ip
, (long)btodb(size
), cred
, 0);
136 if (bpref
>= fs
->fs_size
)
139 cg
= ino_to_cg(fs
, ip
->i_number
);
141 cg
= dtog(fs
, bpref
);
142 bno
= (ufs_daddr_t
)ffs_hashalloc(ip
, cg
, (long)bpref
, size
,
145 ip
->i_blocks
+= btodb(size
);
146 ip
->i_flag
|= IN_CHANGE
| IN_UPDATE
;
152 * Restore user's disk quota because allocation failed.
154 (void) ufs_chkdq(ip
, (long)-btodb(size
), cred
, FORCE
);
157 ffs_fserr(fs
, cred
->cr_uid
, "filesystem full");
158 uprintf("\n%s: write failed, filesystem is full\n", fs
->fs_fsmnt
);
163 * Reallocate a fragment to a bigger size
165 * The number and size of the old block is given, and a preference
166 * and new size is also specified. The allocator attempts to extend
167 * the original block. Failing that, the regular block allocator is
168 * invoked to get an appropriate block.
171 ffs_realloccg(struct inode
*ip
, ufs_daddr_t lbprev
, ufs_daddr_t bpref
,
172 int osize
, int nsize
, struct ucred
*cred
, struct buf
**bpp
)
176 int cg
, request
, error
;
177 ufs_daddr_t bprev
, bno
;
182 if ((uint
)osize
> fs
->fs_bsize
|| fragoff(fs
, osize
) != 0 ||
183 (uint
)nsize
> fs
->fs_bsize
|| fragoff(fs
, nsize
) != 0) {
185 "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
186 devtoname(ip
->i_dev
), (long)fs
->fs_bsize
, osize
,
187 nsize
, fs
->fs_fsmnt
);
188 panic("ffs_realloccg: bad size");
191 panic("ffs_realloccg: missing credential");
192 #endif /* DIAGNOSTIC */
193 if (cred
->cr_uid
!= 0 &&
194 freespace(fs
, fs
->fs_minfree
) - numfrags(fs
, nsize
- osize
) < 0)
196 if ((bprev
= ip
->i_db
[lbprev
]) == 0) {
197 kprintf("dev = %s, bsize = %ld, bprev = %ld, fs = %s\n",
198 devtoname(ip
->i_dev
), (long)fs
->fs_bsize
, (long)bprev
,
200 panic("ffs_realloccg: bad bprev");
203 * Allocate the extra space in the buffer.
205 error
= bread(ITOV(ip
), lblktodoff(fs
, lbprev
), osize
, &bp
);
211 if(bp
->b_bio2
.bio_offset
== NOOFFSET
) {
212 if (lbprev
>= UFS_NDADDR
)
213 panic("ffs_realloccg: lbprev out of range");
214 bp
->b_bio2
.bio_offset
= fsbtodoff(fs
, bprev
);
218 error
= ufs_chkdq(ip
, (long)btodb(nsize
- osize
), cred
, 0);
225 * Check for extension in the existing location.
227 cg
= dtog(fs
, bprev
);
228 bno
= ffs_fragextend(ip
, cg
, (long)bprev
, osize
, nsize
);
230 if (bp
->b_bio2
.bio_offset
!= fsbtodoff(fs
, bno
))
231 panic("ffs_realloccg: bad blockno");
232 ip
->i_blocks
+= btodb(nsize
- osize
);
233 ip
->i_flag
|= IN_CHANGE
| IN_UPDATE
;
235 bzero((char *)bp
->b_data
+ osize
, (uint
)nsize
- osize
);
240 * Allocate a new disk location.
242 if (bpref
>= fs
->fs_size
)
244 switch ((int)fs
->fs_optim
) {
247 * Allocate an exact sized fragment. Although this makes
248 * best use of space, we will waste time relocating it if
249 * the file continues to grow. If the fragmentation is
250 * less than half of the minimum free reserve, we choose
251 * to begin optimizing for time.
254 if (fs
->fs_minfree
<= 5 ||
255 fs
->fs_cstotal
.cs_nffree
>
256 (off_t
)fs
->fs_dsize
* fs
->fs_minfree
/ (2 * 100))
258 log(LOG_NOTICE
, "%s: optimization changed from SPACE to TIME\n",
260 fs
->fs_optim
= FS_OPTTIME
;
264 * At this point we have discovered a file that is trying to
265 * grow a small fragment to a larger fragment. To save time,
266 * we allocate a full sized block, then free the unused portion.
267 * If the file continues to grow, the `ffs_fragextend' call
268 * above will be able to grow it in place without further
269 * copying. If aberrant programs cause disk fragmentation to
270 * grow within 2% of the free reserve, we choose to begin
271 * optimizing for space.
273 request
= fs
->fs_bsize
;
274 if (fs
->fs_cstotal
.cs_nffree
<
275 (off_t
)fs
->fs_dsize
* (fs
->fs_minfree
- 2) / 100)
277 log(LOG_NOTICE
, "%s: optimization changed from TIME to SPACE\n",
279 fs
->fs_optim
= FS_OPTSPACE
;
282 kprintf("dev = %s, optim = %ld, fs = %s\n",
283 devtoname(ip
->i_dev
), (long)fs
->fs_optim
, fs
->fs_fsmnt
);
284 panic("ffs_realloccg: bad optim");
287 bno
= (ufs_daddr_t
)ffs_hashalloc(ip
, cg
, (long)bpref
, request
,
290 bp
->b_bio2
.bio_offset
= fsbtodoff(fs
, bno
);
291 if (!DOINGSOFTDEP(ITOV(ip
)))
292 ffs_blkfree(ip
, bprev
, (long)osize
);
294 ffs_blkfree(ip
, bno
+ numfrags(fs
, nsize
),
295 (long)(request
- nsize
));
296 ip
->i_blocks
+= btodb(nsize
- osize
);
297 ip
->i_flag
|= IN_CHANGE
| IN_UPDATE
;
299 bzero((char *)bp
->b_data
+ osize
, (uint
)nsize
- osize
);
305 * Restore user's disk quota because allocation failed.
307 (void) ufs_chkdq(ip
, (long)-btodb(nsize
- osize
), cred
, FORCE
);
314 ffs_fserr(fs
, cred
->cr_uid
, "filesystem full");
315 uprintf("\n%s: write failed, filesystem is full\n", fs
->fs_fsmnt
);
319 SYSCTL_NODE(_vfs
, OID_AUTO
, ffs
, CTLFLAG_RW
, 0, "FFS filesystem");
322 * Reallocate a sequence of blocks into a contiguous sequence of blocks.
324 * The vnode and an array of buffer pointers for a range of sequential
325 * logical blocks to be made contiguous is given. The allocator attempts
326 * to find a range of sequential blocks starting as close as possible to
327 * an fs_rotdelay offset from the end of the allocation for the logical
328 * block immediately preceding the current range. If successful, the
329 * physical block numbers in the buffer pointers and in the inode are
330 * changed to reflect the new allocation. If unsuccessful, the allocation
331 * is left unchanged. The success in doing the reallocation is returned.
332 * Note that the error return is not reflected back to the user. Rather
333 * the previous block allocation will be used.
335 static int doasyncfree
= 1;
336 SYSCTL_INT(_vfs_ffs
, FFS_ASYNCFREE
, doasyncfree
, CTLFLAG_RW
, &doasyncfree
, 0, "");
338 static int doreallocblks
= 1;
339 SYSCTL_INT(_vfs_ffs
, FFS_REALLOCBLKS
, doreallocblks
, CTLFLAG_RW
, &doreallocblks
, 0, "");
342 static volatile int prtrealloc
= 0;
346 * ffs_reallocblks(struct vnode *a_vp, struct cluster_save *a_buflist)
349 ffs_reallocblks(struct vop_reallocblks_args
*ap
)
354 struct buf
*sbp
, *ebp
;
355 ufs_daddr_t
*bap
, *sbap
, *ebap
= NULL
;
356 struct cluster_save
*buflist
;
357 ufs_daddr_t start_lbn
, end_lbn
, soff
, newblk
, blkno
;
361 struct indir start_ap
[UFS_NIADDR
+ 1], end_ap
[UFS_NIADDR
+ 1], *idp
;
362 int i
, len
, slen
, start_lvl
, end_lvl
, pref
, ssize
;
364 if (doreallocblks
== 0)
369 if (fs
->fs_contigsumsize
<= 0)
371 buflist
= ap
->a_buflist
;
372 len
= buflist
->bs_nchildren
;
373 start_lbn
= lblkno(fs
, buflist
->bs_children
[0]->b_loffset
);
374 end_lbn
= start_lbn
+ len
- 1;
376 for (i
= 0; i
< len
; i
++)
377 if (!ffs_checkblk(ip
,
378 dofftofsb(fs
, buflist
->bs_children
[i
]->b_bio2
.bio_offset
), fs
->fs_bsize
))
379 panic("ffs_reallocblks: unallocated block 1");
380 for (i
= 1; i
< len
; i
++) {
381 if (buflist
->bs_children
[i
]->b_loffset
!= lblktodoff(fs
, start_lbn
) + lblktodoff(fs
, i
))
382 panic("ffs_reallocblks: non-logical cluster");
384 boffset
= buflist
->bs_children
[0]->b_bio2
.bio_offset
;
385 ssize
= (int)fsbtodoff(fs
, fs
->fs_frag
);
386 for (i
= 1; i
< len
- 1; i
++)
387 if (buflist
->bs_children
[i
]->b_bio2
.bio_offset
!= boffset
+ (i
* ssize
))
388 panic("ffs_reallocblks: non-physical cluster %d", i
);
391 * If the latest allocation is in a new cylinder group, assume that
392 * the filesystem has decided to move and do not force it back to
393 * the previous cylinder group.
395 if (dtog(fs
, dofftofsb(fs
, buflist
->bs_children
[0]->b_bio2
.bio_offset
)) !=
396 dtog(fs
, dofftofsb(fs
, buflist
->bs_children
[len
- 1]->b_bio2
.bio_offset
)))
398 if (ufs_getlbns(vp
, start_lbn
, start_ap
, &start_lvl
) ||
399 ufs_getlbns(vp
, end_lbn
, end_ap
, &end_lvl
))
402 * Get the starting offset and block map for the first block and
403 * the number of blocks that will fit into sbap starting at soff.
405 if (start_lvl
== 0) {
408 slen
= UFS_NDADDR
- soff
;
410 idp
= &start_ap
[start_lvl
- 1];
411 if (bread(vp
, lblktodoff(fs
, idp
->in_lbn
), (int)fs
->fs_bsize
, &sbp
)) {
415 sbap
= (ufs_daddr_t
*)sbp
->b_data
;
417 slen
= fs
->fs_nindir
- soff
;
420 * Find the preferred location for the cluster.
422 pref
= ffs_blkpref(ip
, start_lbn
, soff
, sbap
);
425 * If the block range spans two block maps, get the second map.
427 if (end_lvl
== 0 || (idp
= &end_ap
[end_lvl
- 1])->in_off
+ 1 >= len
) {
431 if (start_ap
[start_lvl
-1].in_lbn
== idp
->in_lbn
)
432 panic("ffs_reallocblk: start == end");
434 ssize
= len
- (idp
->in_off
+ 1);
435 if (bread(vp
, lblktodoff(fs
, idp
->in_lbn
), (int)fs
->fs_bsize
, &ebp
))
437 ebap
= (ufs_daddr_t
*)ebp
->b_data
;
441 * Make sure we aren't spanning more than two blockmaps. ssize is
442 * our calculation of the span we have to scan in the first blockmap,
443 * while slen is our calculation of the number of entries available
444 * in the first blockmap (from soff).
447 panic("ffs_reallocblks: range spans more than two blockmaps!"
448 " start_lbn %ld len %d (%d/%d)",
449 (long)start_lbn
, len
, slen
, ssize
);
452 * Search the block map looking for an allocation of the desired size.
454 if ((newblk
= (ufs_daddr_t
)ffs_hashalloc(ip
, dtog(fs
, pref
), (long)pref
,
455 len
, ffs_clusteralloc
)) == 0)
458 * We have found a new contiguous block.
460 * First we have to replace the old block pointers with the new
461 * block pointers in the inode and indirect blocks associated
466 kprintf("realloc: ino %ju, lbns %d-%d\n\told:",
467 (uintmax_t)ip
->i_number
, start_lbn
, end_lbn
);
470 for (bap
= &sbap
[soff
], i
= 0; i
< len
; i
++, blkno
+= fs
->fs_frag
) {
476 if (!ffs_checkblk(ip
,
477 dofftofsb(fs
, buflist
->bs_children
[i
]->b_bio2
.bio_offset
), fs
->fs_bsize
))
478 panic("ffs_reallocblks: unallocated block 2");
479 if (dofftofsb(fs
, buflist
->bs_children
[i
]->b_bio2
.bio_offset
) != *bap
)
480 panic("ffs_reallocblks: alloc mismatch");
484 kprintf(" %d,", *bap
);
486 if (DOINGSOFTDEP(vp
)) {
487 if (sbap
== &ip
->i_db
[0] && i
< ssize
)
488 softdep_setup_allocdirect(ip
, start_lbn
+ i
,
489 blkno
, *bap
, fs
->fs_bsize
, fs
->fs_bsize
,
490 buflist
->bs_children
[i
]);
492 softdep_setup_allocindir_page(ip
, start_lbn
+ i
,
493 i
< ssize
? sbp
: ebp
, soff
+ i
, blkno
,
494 *bap
, buflist
->bs_children
[i
]);
499 * Next we must write out the modified inode and indirect blocks.
500 * For strict correctness, the writes should be synchronous since
501 * the old block values may have been written to disk. In practice
502 * they are almost never written, but if we are concerned about
503 * strict correctness, the `doasyncfree' flag should be set to zero.
505 * The test on `doasyncfree' should be changed to test a flag
506 * that shows whether the associated buffers and inodes have
507 * been written. The flag should be set when the cluster is
508 * started and cleared whenever the buffer or inode is flushed.
509 * We can then check below to see if it is set, and do the
510 * synchronous write only when it has been cleared.
512 if (sbap
!= &ip
->i_db
[0]) {
518 ip
->i_flag
|= IN_CHANGE
| IN_UPDATE
;
529 * Last, free the old blocks and assign the new blocks to the buffers.
535 for (blkno
= newblk
, i
= 0; i
< len
; i
++, blkno
+= fs
->fs_frag
) {
536 if (!DOINGSOFTDEP(vp
) &&
537 buflist
->bs_children
[i
]->b_bio2
.bio_offset
!= NOOFFSET
) {
539 dofftofsb(fs
, buflist
->bs_children
[i
]->b_bio2
.bio_offset
),
542 buflist
->bs_children
[i
]->b_bio2
.bio_offset
= fsbtodoff(fs
, blkno
);
544 if (!ffs_checkblk(ip
,
545 dofftofsb(fs
, buflist
->bs_children
[i
]->b_bio2
.bio_offset
), fs
->fs_bsize
))
546 panic("ffs_reallocblks: unallocated block 3");
550 kprintf(" %d,", blkno
);
564 if (sbap
!= &ip
->i_db
[0])
570 * Allocate an inode in the filesystem.
572 * If allocating a directory, use ffs_dirpref to select the inode.
573 * If allocating in a directory, the following hierarchy is followed:
574 * 1) allocate the preferred inode.
575 * 2) allocate an inode in the same cylinder group.
576 * 3) quadratically rehash into other cylinder groups, until an
577 * available inode is located.
578 * If no inode preference is given the following hierarchy is used
579 * to allocate an inode:
580 * 1) allocate an inode in cylinder group 0.
581 * 2) quadratically rehash into other cylinder groups, until an
582 * available inode is located.
585 ffs_valloc(struct vnode
*pvp
, int mode
, struct ucred
*cred
, struct vnode
**vpp
)
596 if (fs
->fs_cstotal
.cs_nifree
== 0)
599 if ((mode
& IFMT
) == IFDIR
)
600 ipref
= ffs_dirpref(pip
);
602 ipref
= pip
->i_number
;
603 if (ipref
>= fs
->fs_ncg
* fs
->fs_ipg
)
605 cg
= ino_to_cg(fs
, ipref
);
607 * Track number of dirs created one after another
608 * in a same cg without intervening by files.
610 if ((mode
& IFMT
) == IFDIR
) {
611 if (fs
->fs_contigdirs
[cg
] < 255)
612 fs
->fs_contigdirs
[cg
]++;
614 if (fs
->fs_contigdirs
[cg
] > 0)
615 fs
->fs_contigdirs
[cg
]--;
617 ino
= (ino_t
)ffs_hashalloc(pip
, cg
, (long)ipref
, mode
,
618 (allocfcn_t
*)ffs_nodealloccg
);
621 error
= VFS_VGET(pvp
->v_mount
, NULL
, ino
, vpp
);
623 ffs_vfree(pvp
, ino
, mode
);
628 kprintf("mode = 0%o, inum = %lu, fs = %s\n",
629 ip
->i_mode
, (u_long
)ip
->i_number
, fs
->fs_fsmnt
);
630 panic("ffs_valloc: dup alloc");
632 if (ip
->i_blocks
) { /* XXX */
633 kprintf("free inode %s/%lu had %ld blocks\n",
634 fs
->fs_fsmnt
, (u_long
)ino
, (long)ip
->i_blocks
);
639 * Set up a new generation number for this inode.
641 if (ip
->i_gen
== 0 || ++ip
->i_gen
== 0)
642 ip
->i_gen
= krandom() / 2 + 1;
645 ffs_fserr(fs
, cred
->cr_uid
, "out of inodes");
646 uprintf("\n%s: create/symlink failed, no inodes free\n", fs
->fs_fsmnt
);
651 * Find a cylinder group to place a directory.
653 * The policy implemented by this algorithm is to allocate a
654 * directory inode in the same cylinder group as its parent
655 * directory, but also to reserve space for its files inodes
656 * and data. Restrict the number of directories which may be
657 * allocated one after another in the same cylinder group
658 * without intervening allocation of files.
660 * If we allocate a first level directory then force allocation
661 * in another cylinder group.
664 ffs_dirpref(struct inode
*pip
)
667 int cg
, prefcg
, dirsize
, cgsize
;
669 int avgifree
, avgbfree
, avgndir
, curdirsize
;
670 int minifree
, minbfree
, maxndir
;
676 avgifree
= fs
->fs_cstotal
.cs_nifree
/ fs
->fs_ncg
;
677 avgbfree
= fs
->fs_cstotal
.cs_nbfree
/ fs
->fs_ncg
;
678 avgndir
= fs
->fs_cstotal
.cs_ndir
/ fs
->fs_ncg
;
681 * Force allocation in another cg if creating a first level dir.
683 if (ITOV(pip
)->v_flag
& VROOT
) {
684 prefcg
= karc4random() % fs
->fs_ncg
;
686 minndir
= fs
->fs_ipg
;
687 for (cg
= prefcg
; cg
< fs
->fs_ncg
; cg
++)
688 if (fs
->fs_cs(fs
, cg
).cs_ndir
< minndir
&&
689 fs
->fs_cs(fs
, cg
).cs_nifree
>= avgifree
&&
690 fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
692 minndir
= fs
->fs_cs(fs
, cg
).cs_ndir
;
694 for (cg
= 0; cg
< prefcg
; cg
++)
695 if (fs
->fs_cs(fs
, cg
).cs_ndir
< minndir
&&
696 fs
->fs_cs(fs
, cg
).cs_nifree
>= avgifree
&&
697 fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
699 minndir
= fs
->fs_cs(fs
, cg
).cs_ndir
;
701 return ((ino_t
)(fs
->fs_ipg
* mincg
));
705 * Count various limits which are used for
706 * optimal allocation of a directory inode.
708 maxndir
= min(avgndir
+ fs
->fs_ipg
/ 16, fs
->fs_ipg
);
709 minifree
= avgifree
- avgifree
/ 4;
712 minbfree
= avgbfree
- avgbfree
/ 4;
715 cgsize
= fs
->fs_fsize
* fs
->fs_fpg
;
718 * fs_avgfilesize and fs_avgfpdir are user-settable entities and
719 * multiplying them may overflow a 32 bit integer.
721 dirsize64
= fs
->fs_avgfilesize
* (int64_t)fs
->fs_avgfpdir
;
722 if (dirsize64
> 0x7fffffff) {
725 dirsize
= (int)dirsize64
;
726 curdirsize
= avgndir
?
727 (cgsize
- avgbfree
* fs
->fs_bsize
) / avgndir
: 0;
728 if (dirsize
< curdirsize
)
729 dirsize
= curdirsize
;
730 maxcontigdirs
= min((avgbfree
* fs
->fs_bsize
) / dirsize
, 255);
731 if (fs
->fs_avgfpdir
> 0)
732 maxcontigdirs
= min(maxcontigdirs
,
733 fs
->fs_ipg
/ fs
->fs_avgfpdir
);
734 if (maxcontigdirs
== 0)
739 * Limit number of dirs in one cg and reserve space for
740 * regular files, but only if we have no deficit in
743 prefcg
= ino_to_cg(fs
, pip
->i_number
);
744 for (cg
= prefcg
; cg
< fs
->fs_ncg
; cg
++)
745 if (fs
->fs_cs(fs
, cg
).cs_ndir
< maxndir
&&
746 fs
->fs_cs(fs
, cg
).cs_nifree
>= minifree
&&
747 fs
->fs_cs(fs
, cg
).cs_nbfree
>= minbfree
) {
748 if (fs
->fs_contigdirs
[cg
] < maxcontigdirs
)
749 return ((ino_t
)(fs
->fs_ipg
* cg
));
751 for (cg
= 0; cg
< prefcg
; cg
++)
752 if (fs
->fs_cs(fs
, cg
).cs_ndir
< maxndir
&&
753 fs
->fs_cs(fs
, cg
).cs_nifree
>= minifree
&&
754 fs
->fs_cs(fs
, cg
).cs_nbfree
>= minbfree
) {
755 if (fs
->fs_contigdirs
[cg
] < maxcontigdirs
)
756 return ((ino_t
)(fs
->fs_ipg
* cg
));
759 * This is a backstop when we have deficit in space.
761 for (cg
= prefcg
; cg
< fs
->fs_ncg
; cg
++)
762 if (fs
->fs_cs(fs
, cg
).cs_nifree
>= avgifree
)
763 return ((ino_t
)(fs
->fs_ipg
* cg
));
764 for (cg
= 0; cg
< prefcg
; cg
++)
765 if (fs
->fs_cs(fs
, cg
).cs_nifree
>= avgifree
)
767 return ((ino_t
)(fs
->fs_ipg
* cg
));
771 * Select the desired position for the next block in a file. The file is
772 * logically divided into sections. The first section is composed of the
773 * direct blocks. Each additional section contains fs_maxbpg blocks.
775 * If no blocks have been allocated in the first section, the policy is to
776 * request a block in the same cylinder group as the inode that describes
777 * the file. If no blocks have been allocated in any other section, the
778 * policy is to place the section in a cylinder group with a greater than
779 * average number of free blocks. An appropriate cylinder group is found
780 * by using a rotor that sweeps the cylinder groups. When a new group of
781 * blocks is needed, the sweep begins in the cylinder group following the
782 * cylinder group from which the previous allocation was made. The sweep
783 * continues until a cylinder group with greater than the average number
784 * of free blocks is found. If the allocation is for the first block in an
785 * indirect block, the information on the previous allocation is unavailable;
786 * here a best guess is made based upon the logical block number being
789 * If a section is already partially allocated, the policy is to
790 * contiguously allocate fs_maxcontig blocks. The end of one of these
791 * contiguous blocks and the beginning of the next is physically separated
792 * so that the disk head will be in transit between them for at least
793 * fs_rotdelay milliseconds. This is to allow time for the processor to
794 * schedule another I/O transfer.
797 ffs_blkpref(struct inode
*ip
, ufs_daddr_t lbn
, int indx
, ufs_daddr_t
*bap
)
801 int avgbfree
, startcg
;
805 if (indx
% fs
->fs_maxbpg
== 0 || bap
[indx
- 1] == 0) {
806 if (lbn
< UFS_NDADDR
+ NINDIR(fs
)) {
807 cg
= ino_to_cg(fs
, ip
->i_number
);
808 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
811 * Find a cylinder with greater than average number of
812 * unused data blocks.
814 if (indx
== 0 || bap
[indx
- 1] == 0)
816 ino_to_cg(fs
, ip
->i_number
) + lbn
/ fs
->fs_maxbpg
;
818 startcg
= dtog(fs
, bap
[indx
- 1]) + 1;
819 startcg
%= fs
->fs_ncg
;
820 avgbfree
= fs
->fs_cstotal
.cs_nbfree
/ fs
->fs_ncg
;
821 for (cg
= startcg
; cg
< fs
->fs_ncg
; cg
++)
822 if (fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
824 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
826 for (cg
= 0; cg
<= startcg
; cg
++)
827 if (fs
->fs_cs(fs
, cg
).cs_nbfree
>= avgbfree
) {
829 return (fs
->fs_fpg
* cg
+ fs
->fs_frag
);
834 * One or more previous blocks have been laid out. If less
835 * than fs_maxcontig previous blocks are contiguous, the
836 * next block is requested contiguously, otherwise it is
837 * requested rotationally delayed by fs_rotdelay milliseconds.
839 nextblk
= bap
[indx
- 1] + fs
->fs_frag
;
840 if (fs
->fs_rotdelay
== 0 || indx
< fs
->fs_maxcontig
||
841 bap
[indx
- fs
->fs_maxcontig
] +
842 blkstofrags(fs
, fs
->fs_maxcontig
) != nextblk
)
845 * Here we convert ms of delay to frags as:
846 * (frags) = (ms) * (rev/sec) * (sect/rev) /
847 * ((sect/frag) * (ms/sec))
848 * then round up to the next block.
850 nextblk
+= roundup(fs
->fs_rotdelay
* fs
->fs_rps
* fs
->fs_nsect
/
851 (NSPF(fs
) * 1000), fs
->fs_frag
);
856 * Implement the cylinder overflow algorithm.
858 * The policy implemented by this algorithm is:
859 * 1) allocate the block in its requested cylinder group.
860 * 2) quadratically rehash on the cylinder group number.
861 * 3) brute force search for a free block.
865 ffs_hashalloc(struct inode
*ip
, int cg
, long pref
,
866 int size
, /* size for data blocks, mode for inodes */
867 allocfcn_t
*allocator
)
870 long result
; /* XXX why not same type as we return? */
875 * 1: preferred cylinder group
877 result
= (*allocator
)(ip
, cg
, pref
, size
);
881 * 2: quadratic rehash
883 for (i
= 1; i
< fs
->fs_ncg
; i
*= 2) {
885 if (cg
>= fs
->fs_ncg
)
887 result
= (*allocator
)(ip
, cg
, 0, size
);
892 * 3: brute force search
893 * Note that we start at i == 2, since 0 was checked initially,
894 * and 1 is always checked in the quadratic rehash.
896 cg
= (icg
+ 2) % fs
->fs_ncg
;
897 for (i
= 2; i
< fs
->fs_ncg
; i
++) {
898 result
= (*allocator
)(ip
, cg
, 0, size
);
902 if (cg
== fs
->fs_ncg
)
909 * Determine whether a fragment can be extended.
911 * Check to see if the necessary fragments are available, and
912 * if they are, allocate them.
915 ffs_fragextend(struct inode
*ip
, int cg
, long bprev
, int osize
, int nsize
)
926 if (fs
->fs_cs(fs
, cg
).cs_nffree
< numfrags(fs
, nsize
- osize
))
928 frags
= numfrags(fs
, nsize
);
929 bbase
= fragnum(fs
, bprev
);
930 if (bbase
> fragnum(fs
, (bprev
+ frags
- 1))) {
931 /* cannot extend across a block boundary */
934 KKASSERT(blknum(fs
, bprev
) == blknum(fs
, bprev
+ frags
- 1));
935 error
= bread(ip
->i_devvp
, fsbtodoff(fs
, cgtod(fs
, cg
)),
936 (int)fs
->fs_cgsize
, &bp
);
941 cgp
= (struct cg
*)bp
->b_data
;
942 if (!cg_chkmagic(cgp
)) {
946 cgp
->cg_time
= time_second
;
947 bno
= dtogd(fs
, bprev
);
948 blksfree
= cg_blksfree(cgp
);
949 for (i
= numfrags(fs
, osize
); i
< frags
; i
++) {
950 if (isclr(blksfree
, bno
+ i
)) {
957 * the current fragment can be extended
958 * deduct the count on fragment being extended into
959 * increase the count on the remaining fragment (if any)
960 * allocate the extended piece
962 * ---oooooooooonnnnnnn111----
967 for (i
= frags
; i
< fs
->fs_frag
- bbase
; i
++) {
968 if (isclr(blksfree
, bno
+ i
))
973 * Size of original free frag is [i - numfrags(fs, osize)]
974 * Size of remaining free frag is [i - frags]
976 cgp
->cg_frsum
[i
- numfrags(fs
, osize
)]--;
978 cgp
->cg_frsum
[i
- frags
]++;
979 for (i
= numfrags(fs
, osize
); i
< frags
; i
++) {
980 clrbit(blksfree
, bno
+ i
);
981 cgp
->cg_cs
.cs_nffree
--;
982 fs
->fs_cstotal
.cs_nffree
--;
983 fs
->fs_cs(fs
, cg
).cs_nffree
--;
986 if (DOINGSOFTDEP(ITOV(ip
)))
987 softdep_setup_blkmapdep(bp
, fs
, bprev
);
993 * Determine whether a block can be allocated.
995 * Check to see if a block of the appropriate size is available,
996 * and if it is, allocate it.
999 ffs_alloccg(struct inode
*ip
, int cg
, ufs_daddr_t bpref
, int size
)
1005 ufs_daddr_t bno
, blkno
;
1006 int allocsiz
, error
, frags
;
1010 if (fs
->fs_cs(fs
, cg
).cs_nbfree
== 0 && size
== fs
->fs_bsize
)
1012 error
= bread(ip
->i_devvp
, fsbtodoff(fs
, cgtod(fs
, cg
)),
1013 (int)fs
->fs_cgsize
, &bp
);
1018 cgp
= (struct cg
*)bp
->b_data
;
1019 if (!cg_chkmagic(cgp
) ||
1020 (cgp
->cg_cs
.cs_nbfree
== 0 && size
== fs
->fs_bsize
)) {
1024 cgp
->cg_time
= time_second
;
1025 if (size
== fs
->fs_bsize
) {
1026 bno
= ffs_alloccgblk(ip
, bp
, bpref
);
1031 * Check to see if any fragments of sufficient size are already
1032 * available. Fit the data into a larger fragment if necessary,
1033 * before allocating a whole new block.
1035 blksfree
= cg_blksfree(cgp
);
1036 frags
= numfrags(fs
, size
);
1037 for (allocsiz
= frags
; allocsiz
< fs
->fs_frag
; allocsiz
++) {
1038 if (cgp
->cg_frsum
[allocsiz
] != 0)
1041 if (allocsiz
== fs
->fs_frag
) {
1043 * No fragments were available, allocate a whole block and
1044 * cut the requested fragment (of size frags) out of it.
1046 if (cgp
->cg_cs
.cs_nbfree
== 0) {
1050 bno
= ffs_alloccgblk(ip
, bp
, bpref
);
1051 bpref
= dtogd(fs
, bno
);
1052 for (i
= frags
; i
< fs
->fs_frag
; i
++)
1053 setbit(blksfree
, bpref
+ i
);
1056 * Calculate the number of free frags still remaining after
1057 * we have cut out the requested allocation. Indicate that
1058 * a fragment of that size is now available for future
1061 i
= fs
->fs_frag
- frags
;
1062 cgp
->cg_cs
.cs_nffree
+= i
;
1063 fs
->fs_cstotal
.cs_nffree
+= i
;
1064 fs
->fs_cs(fs
, cg
).cs_nffree
+= i
;
1072 * cg_frsum[] has told us that a free fragment of allocsiz size is
1073 * available. Find it, then clear the bitmap bits associated with
1076 bno
= ffs_mapsearch(fs
, cgp
, bpref
, allocsiz
);
1081 for (i
= 0; i
< frags
; i
++)
1082 clrbit(blksfree
, bno
+ i
);
1083 cgp
->cg_cs
.cs_nffree
-= frags
;
1084 fs
->fs_cstotal
.cs_nffree
-= frags
;
1085 fs
->fs_cs(fs
, cg
).cs_nffree
-= frags
;
1089 * Account for the allocation. The original searched size that we
1090 * found is no longer available. If we cut out a smaller piece then
1091 * a smaller fragment is now available.
1093 cgp
->cg_frsum
[allocsiz
]--;
1094 if (frags
!= allocsiz
)
1095 cgp
->cg_frsum
[allocsiz
- frags
]++;
1096 blkno
= cg
* fs
->fs_fpg
+ bno
;
1097 if (DOINGSOFTDEP(ITOV(ip
)))
1098 softdep_setup_blkmapdep(bp
, fs
, blkno
);
1100 return ((u_long
)blkno
);
1104 * Allocate a block in a cylinder group.
1106 * This algorithm implements the following policy:
1107 * 1) allocate the requested block.
1108 * 2) allocate a rotationally optimal block in the same cylinder.
1109 * 3) allocate the next available block on the block rotor for the
1110 * specified cylinder group.
1111 * Note that this routine only allocates fs_bsize blocks; these
1112 * blocks may be fragmented by the routine that allocates them.
/*
 * ffs_alloccgblk() -- allocate a full fs_bsize block in the cylinder
 * group whose on-disk map is held in bp, preferring block bpref.
 * The allocation policy is described in the comment block above.
 *
 * NOTE(review): this extracted view is missing source lines (the
 * embedded numbering jumps, e.g. 1120 -> 1126), so the declarations
 * of fs/cgp/blksfree/cylbp/i, several braces, and the final return
 * are not visible here.  Do not treat this chunk as compilable.
 */
1115 ffs_alloccgblk(struct inode
*ip
, struct buf
*bp
, ufs_daddr_t bpref
)
1119 ufs_daddr_t bno
, blkno
;
1120 int cylno
, pos
, delta
;
1126 cgp
= (struct cg
*)bp
->b_data
;
1127 blksfree
= cg_blksfree(cgp
);
/* no usable preference (zero or outside this cg): use the cg's rotor */
1128 if (bpref
== 0 || dtog(fs
, bpref
) != cgp
->cg_cgx
) {
1129 bpref
= cgp
->cg_rotor
;
/* round the preference down to a block boundary, cg-relative */
1132 bpref
= blknum(fs
, bpref
);
1133 bpref
= dtogd(fs
, bpref
);
1135 * if the requested block is available, use it
1137 if (ffs_isblock(fs
, blksfree
, fragstoblks(fs
, bpref
))) {
1141 if (fs
->fs_nrpos
<= 1 || fs
->fs_cpc
== 0) {
1143 * Block layout information is not available.
1144 * Leaving bpref unchanged means we take the
1145 * next available free block following the one
1146 * we just allocated. Hopefully this will at
1147 * least hit a track cache on drives of unknown
1148 * geometry (e.g. SCSI).
1153 * check for a block available on the same cylinder
1155 cylno
= cbtocylno(fs
, bpref
);
1156 if (cg_blktot(cgp
)[cylno
] == 0)
1159 * check the summary information to see if a block is
1160 * available in the requested cylinder starting at the
1161 * requested rotational position and proceeding around.
1163 cylbp
= cg_blks(fs
, cgp
, cylno
);
1164 pos
= cbtorpos(fs
, bpref
);
/* scan rotational positions pos..nrpos-1 first, then wrap to 0..pos-1 */
1165 for (i
= pos
; i
< fs
->fs_nrpos
; i
++)
1168 if (i
== fs
->fs_nrpos
)
1169 for (i
= 0; i
< pos
; i
++)
1174 * found a rotational position, now find the actual
1175 * block. A panic if none is actually there.
1177 pos
= cylno
% fs
->fs_cpc
;
1178 bno
= (cylno
- pos
) * fs
->fs_spc
/ NSPB(fs
);
1179 if (fs_postbl(fs
, pos
)[i
] == -1) {
1180 kprintf("pos = %d, i = %d, fs = %s\n",
1181 pos
, i
, fs
->fs_fsmnt
);
1182 panic("ffs_alloccgblk: cyl groups corrupted");
/* walk the rotational-layout chain (fs_rotbl) until a free block is hit */
1184 for (i
= fs_postbl(fs
, pos
)[i
];; ) {
1185 if (ffs_isblock(fs
, blksfree
, bno
+ i
)) {
1186 bno
= blkstofrags(fs
, (bno
+ i
));
1189 delta
= fs_rotbl(fs
)[i
];
1191 delta
+ i
> fragstoblks(fs
, fs
->fs_fpg
))
1195 kprintf("pos = %d, i = %d, fs = %s\n", pos
, i
, fs
->fs_fsmnt
);
1196 panic("ffs_alloccgblk: can't find blk in cyl");
1200 * no blocks in the requested cylinder, so take next
1201 * available one in this cylinder group.
1203 bno
= ffs_mapsearch(fs
, cgp
, bpref
, (int)fs
->fs_frag
);
/* remember where the scan stopped for the next allocation */
1206 cgp
->cg_rotor
= bno
;
/* mark the block in-use and update all levels of free-block accounting */
1208 blkno
= fragstoblks(fs
, bno
);
1209 ffs_clrblock(fs
, blksfree
, (long)blkno
);
1210 ffs_clusteracct(fs
, cgp
, blkno
, -1);
1211 cgp
->cg_cs
.cs_nbfree
--;
1212 fs
->fs_cstotal
.cs_nbfree
--;
1213 fs
->fs_cs(fs
, cgp
->cg_cgx
).cs_nbfree
--;
1214 cylno
= cbtocylno(fs
, bno
);
1215 cg_blks(fs
, cgp
, cylno
)[cbtorpos(fs
, bno
)]--;
1216 cg_blktot(cgp
)[cylno
]--;
/* convert from cg-relative to filesystem-relative block number */
1218 blkno
= cgp
->cg_cgx
* fs
->fs_fpg
+ bno
;
/* record the bitmap dependency when soft updates are enabled */
1219 if (DOINGSOFTDEP(ITOV(ip
)))
1220 softdep_setup_blkmapdep(bp
, fs
, blkno
);
1225 * Determine whether a cluster can be allocated.
1227 * We do not currently check for optimal rotational layout if there
1228 * are multiple choices in the same cylinder group. Instead we just
1229 * take the first one that we find following bpref.
/*
 * ffs_clusteralloc() -- try to allocate a contiguous cluster of len
 * blocks in cylinder group cg at or after bpref (see the comment
 * block above: first fit following bpref, no rotational optimization).
 *
 * NOTE(review): lines are missing from this extracted view (embedded
 * numbering gaps); declarations of fs/cgp/bp/lp/mapp/blksfree, the
 * scan body, and the return/error paths are not visible here.
 */
1232 ffs_clusteralloc(struct inode
*ip
, int cg
, ufs_daddr_t bpref
, int len
)
1237 int i
, got
, run
, bno
, bit
, map
;
/* quick reject using the cached per-cg maximum cluster size */
1243 if (fs
->fs_maxcluster
[cg
] < len
)
1245 if (bread(ip
->i_devvp
, fsbtodoff(fs
, cgtod(fs
, cg
)),
1246 (int)fs
->fs_cgsize
, &bp
)) {
1249 cgp
= (struct cg
*)bp
->b_data
;
1250 if (!cg_chkmagic(cgp
))
1254 * Check to see if a cluster of the needed size (or bigger) is
1255 * available in this cylinder group.
1257 lp
= &cg_clustersum(cgp
)[len
];
1258 for (i
= len
; i
<= fs
->fs_contigsumsize
; i
++)
1261 if (i
> fs
->fs_contigsumsize
) {
1263 * This is the first time looking for a cluster in this
1264 * cylinder group. Update the cluster summary information
1265 * to reflect the true maximum sized cluster so that
1266 * future cluster allocation requests can avoid reading
1267 * the cylinder group map only to find no clusters.
1269 lp
= &cg_clustersum(cgp
)[len
- 1];
1270 for (i
= len
- 1; i
> 0; i
--)
1273 fs
->fs_maxcluster
[cg
] = i
;
1277 * Search the cluster map to find a big enough cluster.
1278 * We take the first one that we find, even if it is larger
1279 * than we need as we prefer to get one close to the previous
1280 * block allocation. We do not search before the current
1281 * preference point as we do not want to allocate a block
1282 * that is allocated before the previous one (as we will
1283 * then have to wait for another pass of the elevator
1284 * algorithm before it will be read). We prefer to fail and
1285 * be recalled to try an allocation in the next cylinder group.
1287 if (dtog(fs
, bpref
) != cg
)
/* convert bpref to a cg-relative cluster (block) index */
1290 bpref
= fragstoblks(fs
, dtogd(fs
, blknum(fs
, bpref
)));
1291 mapp
= &cg_clustersfree(cgp
)[bpref
/ NBBY
];
1293 bit
= 1 << (bpref
% NBBY
);
/* run counts consecutive free blocks ending at 'got' */
1294 for (run
= 0, got
= bpref
; got
< cgp
->cg_nclusterblks
; got
++) {
1295 if ((map
& bit
) == 0) {
1302 if ((got
& (NBBY
- 1)) != (NBBY
- 1)) {
1309 if (got
>= cgp
->cg_nclusterblks
)
1312 * Allocate the cluster that we have found.
1314 blksfree
= cg_blksfree(cgp
);
/* sanity: every block of the found run must still be free in the map */
1315 for (i
= 1; i
<= len
; i
++) {
1316 if (!ffs_isblock(fs
, blksfree
, got
- run
+ i
))
1317 panic("ffs_clusteralloc: map mismatch");
1319 bno
= cg
* fs
->fs_fpg
+ blkstofrags(fs
, got
- run
+ 1);
1320 if (dtog(fs
, bno
) != cg
)
1321 panic("ffs_clusteralloc: allocated out of group");
/* claim each block of the cluster via the normal block allocator */
1322 len
= blkstofrags(fs
, len
);
1323 for (i
= 0; i
< len
; i
+= fs
->fs_frag
) {
1324 if ((got
= ffs_alloccgblk(ip
, bp
, bno
+ i
)) != bno
+ i
)
1325 panic("ffs_clusteralloc: lost block");
1336 * Determine whether an inode can be allocated.
1338 * Check to see if an inode is available, and if it is,
1339 * allocate it using the following policy:
1340 * 1) allocate the requested inode.
1341 * 2) allocate the next available inode after the requested
1342 * inode in the specified cylinder group.
1343 * 3) the inode must not already be in the inode hash table,
1344 * otherwise it may be in the process of being deallocated.
1346 * We can encounter such a case because the vnode reclamation
1347 * sequence updates the bitmap before the inode is removed from
1348 * the hash. If we were to reallocate the inode the caller could
1349 * wind up returning a vnode/inode combination which is in an
1350 * indeterminate
/*
 * ffs_nodealloccg() -- allocate an inode in cylinder group cg,
 * preferring ipref, per the policy described in the comment above.
 * Skips inodes still present in the inode hash (possibly mid-
 * deallocation) and returns the filesystem-relative inode number.
 *
 * NOTE(review): this extracted view is missing lines (embedded
 * numbering gaps); declarations of fs/cgp/bp/vp/inosused/ibase/map/
 * icheckmiss, the 'gotit' label, and several braces/returns are not
 * visible here.
 */
1354 ffs_nodealloccg(struct inode
*ip
, int cg
, ufs_daddr_t ipref
, int mode
)
1356 struct ufsmount
*ump
;
1362 int error
, len
, arraysize
, i
;
1368 ump
= VFSTOUFS(vp
->v_mount
);
/* nothing to do if the per-cg summary says no inodes are free */
1370 if (fs
->fs_cs(fs
, cg
).cs_nifree
== 0)
1372 error
= bread(ip
->i_devvp
, fsbtodoff(fs
, cgtod(fs
, cg
)),
1373 (int)fs
->fs_cgsize
, &bp
);
1378 cgp
= (struct cg
*)bp
->b_data
;
1379 if (!cg_chkmagic(cgp
) || cgp
->cg_cs
.cs_nifree
== 0) {
1383 inosused
= cg_inosused(cgp
);
1387 * Quick check, reuse the most recently free inode or continue
1388 * a scan from where we left off the last time.
1390 ibase
= cg
* fs
->fs_ipg
;
1392 ipref
%= fs
->fs_ipg
;
1393 if (isclr(inosused
, ipref
)) {
/* free in the bitmap AND absent from the inode hash: safe to take */
1394 if (ufs_ihashcheck(ump
, ip
->i_dev
, ibase
+ ipref
) == 0)
1400 * Scan the inode bitmap starting at irotor, be sure to handle
1401 * the edge case by going back to the beginning of the array.
1403 * If the number of inodes is not byte-aligned, the unused bits
1404 * should be set to 1. This will be sanity checked in gotit. Note
1405 * that we have to be sure not to overlap the beginning and end
1406 * when irotor is in the middle of a byte as this will cause the
1407 * same bitmap byte to be checked twice. To solve this problem we
1408 * just convert everything to a byte index for the loop.
1410 ipref
= (cgp
->cg_irotor
% fs
->fs_ipg
) >> 3; /* byte index */
1411 len
= (fs
->fs_ipg
+ 7) >> 3; /* byte size */
1415 map
= inosused
[ipref
];
1417 for (i
= 0; i
< NBBY
; ++i
) {
1419 * If we find a free bit we have to make sure
1420 * that the inode is not in the middle of
1421 * being destroyed. The inode should not exist
1422 * in the inode hash.
1424 * Adjust the rotor to try to hit the
1425 * quick-check up above.
1427 if ((map
& (1 << i
)) == 0) {
1428 if (ufs_ihashcheck(ump
, ip
->i_dev
, ibase
+ (ipref
<< 3) + i
) == 0) {
/* convert byte index + bit offset back to a bit index */
1429 ipref
= (ipref
<< 3) + i
;
1430 cgp
->cg_irotor
= (ipref
+ 1) % fs
->fs_ipg
;
1439 * Setup for the next byte, start at the beginning again if
1440 * we hit the end of the array.
1442 if (++ipref
== arraysize
)
/* every free inode was vetoed by the hash check: maps are corrupt */
1446 if (icheckmiss
== cgp
->cg_cs
.cs_nifree
) {
1450 kprintf("fs = %s\n", fs
->fs_fsmnt
);
1451 panic("ffs_nodealloccg: block not in map, icheckmiss/nfree %d/%d",
1452 icheckmiss
, cgp
->cg_cs
.cs_nifree
);
1456 * ipref is a bit index as of the gotit label.
1459 KKASSERT(ipref
>= 0 && ipref
< fs
->fs_ipg
);
1460 cgp
->cg_time
= time_second
;
/* record the bitmap dependency when soft updates are enabled */
1461 if (DOINGSOFTDEP(ITOV(ip
)))
1462 softdep_setup_inomapdep(bp
, ip
, ibase
+ ipref
);
/* mark the inode allocated and update all levels of accounting */
1463 setbit(inosused
, ipref
);
1464 cgp
->cg_cs
.cs_nifree
--;
1465 fs
->fs_cstotal
.cs_nifree
--;
1466 fs
->fs_cs(fs
, cg
).cs_nifree
--;
/* directories are counted separately for the block-placement policy */
1468 if ((mode
& IFMT
) == IFDIR
) {
1469 cgp
->cg_cs
.cs_ndir
++;
1470 fs
->fs_cstotal
.cs_ndir
++;
1471 fs
->fs_cs(fs
, cg
).cs_ndir
++;
1474 return (ibase
+ ipref
);
1478 * Free a block or fragment.
1480 * The specified block or fragment is placed back in the
1481 * free map. If a fragment is deallocated, a possible
1482 * block reassembly is checked.
/*
 * ffs_blkfree_cg() -- return a block or fragment to the free map of
 * its cylinder group and update the frag/block/cluster accounting.
 * Takes raw device/fs handles rather than an inode so it can also run
 * from the deferred TRIM task (see ffs_blkfree_trim_params below).
 *
 * NOTE(review): lines are missing from this extracted view (embedded
 * numbering gaps); declarations of cgp/bp/blksfree/blkno/cg, the cg
 * computation before the bread, and error/return paths are not
 * visible here.
 */
1485 ffs_blkfree_cg(struct fs
* fs
, struct vnode
* i_devvp
, cdev_t i_dev
, ino_t i_number
,
1486 uint32_t i_din_uid
, ufs_daddr_t bno
, long size
)
1491 int i
, error
, cg
, blk
, frags
, bbase
;
1496 * ffs_blkfree() handles TRIM if UFS is mounted with the 'trim'
1497 * option, do not issue an unconditional duplicate here!
1498 * VOP_FREEBLKS(i_devvp, fsbtodoff(fs, bno), size);
/* sanity: size must be a legal frag multiple and not cross a block */
1501 if ((uint
)size
> fs
->fs_bsize
|| fragoff(fs
, size
) != 0 ||
1502 fragnum(fs
, bno
) + numfrags(fs
, size
) > fs
->fs_frag
) {
1503 kprintf("dev=%s, bno = %ld, bsize = %ld, size = %ld, fs = %s\n",
1504 devtoname(i_dev
), (long)bno
, (long)fs
->fs_bsize
, size
,
1506 panic("ffs_blkfree: bad size");
1509 if ((uint
)bno
>= fs
->fs_size
) {
1510 kprintf("bad block %ld, ino %lu\n",
1511 (long)bno
, (u_long
)i_number
);
1512 ffs_fserr(fs
, i_din_uid
, "bad block");
1517 * Load the cylinder group
1519 error
= bread(i_devvp
, fsbtodoff(fs
, cgtod(fs
, cg
)),
1520 (int)fs
->fs_cgsize
, &bp
);
1525 cgp
= (struct cg
*)bp
->b_data
;
1526 if (!cg_chkmagic(cgp
)) {
1530 cgp
->cg_time
= time_second
;
1531 bno
= dtogd(fs
, bno
);
1532 blksfree
= cg_blksfree(cgp
);
1534 if (size
== fs
->fs_bsize
) {
1536 * Free a whole block
1538 blkno
= fragstoblks(fs
, bno
);
1539 if (!ffs_isfreeblock(fs
, blksfree
, blkno
)) {
1540 kprintf("dev = %s, block = %ld, fs = %s\n",
1541 devtoname(i_dev
), (long)bno
, fs
->fs_fsmnt
);
1542 panic("ffs_blkfree: freeing free block");
/* set the block free and bump all levels of free-block accounting */
1544 ffs_setblock(fs
, blksfree
, blkno
);
1545 ffs_clusteracct(fs
, cgp
, blkno
, 1);
1546 cgp
->cg_cs
.cs_nbfree
++;
1547 fs
->fs_cstotal
.cs_nbfree
++;
1548 fs
->fs_cs(fs
, cg
).cs_nbfree
++;
1549 i
= cbtocylno(fs
, bno
);
1550 cg_blks(fs
, cgp
, i
)[cbtorpos(fs
, bno
)]++;
1551 cg_blktot(cgp
)[i
]++;
1554 * Free a fragment within a block.
1556 * bno is the starting block number of the fragment being
1559 * bbase is the starting block number for the filesystem
1560 * block containing the fragment.
1562 * blk is the current bitmap for the fragments within the
1563 * filesystem block containing the fragment.
1565 * frags is the number of fragments being freed
1567 * Call ffs_fragacct() to account for the removal of all
1568 * current fragments, then adjust the bitmap to free the
1569 * requested fragment, and finally call ffs_fragacct() again
1570 * to regenerate the accounting.
1572 bbase
= bno
- fragnum(fs
, bno
);
1573 blk
= blkmap(fs
, blksfree
, bbase
);
1574 ffs_fragacct(fs
, blk
, cgp
->cg_frsum
, -1);
1575 frags
= numfrags(fs
, size
);
/* a frag already free in the map means a double free */
1576 for (i
= 0; i
< frags
; i
++) {
1577 if (isset(blksfree
, bno
+ i
)) {
1578 kprintf("dev = %s, block = %ld, fs = %s\n",
1579 devtoname(i_dev
), (long)(bno
+ i
),
1581 panic("ffs_blkfree: freeing free frag");
1583 setbit(blksfree
, bno
+ i
);
1585 cgp
->cg_cs
.cs_nffree
+= i
;
1586 fs
->fs_cstotal
.cs_nffree
+= i
;
1587 fs
->fs_cs(fs
, cg
).cs_nffree
+= i
;
1590 * Add back in counts associated with the new frags
1592 blk
= blkmap(fs
, blksfree
, bbase
);
1593 ffs_fragacct(fs
, blk
, cgp
->cg_frsum
, 1);
1596 * If a complete block has been reassembled, account for it
1598 blkno
= fragstoblks(fs
, bbase
);
1599 if (ffs_isblock(fs
, blksfree
, blkno
)) {
/* the frags now count as one whole free block, not loose frags */
1600 cgp
->cg_cs
.cs_nffree
-= fs
->fs_frag
;
1601 fs
->fs_cstotal
.cs_nffree
-= fs
->fs_frag
;
1602 fs
->fs_cs(fs
, cg
).cs_nffree
-= fs
->fs_frag
;
1603 ffs_clusteracct(fs
, cgp
, blkno
, 1);
1604 cgp
->cg_cs
.cs_nbfree
++;
1605 fs
->fs_cstotal
.cs_nbfree
++;
1606 fs
->fs_cs(fs
, cg
).cs_nbfree
++;
1607 i
= cbtocylno(fs
, bbase
);
1608 cg_blks(fs
, cgp
, i
)[cbtorpos(fs
, bbase
)]++;
1609 cg_blktot(cgp
)[i
]++;
/*
 * Context carried from ffs_blkfree() through the TRIM bio completion
 * to the taskqueue callback; holds everything ffs_blkfree_cg() needs
 * once the originating inode pointer is no longer available.
 *
 * NOTE(review): only the i_devvp member is visible in this extracted
 * view; the remaining members (used as tp->i_fs, tp->i_dev,
 * tp->i_number, tp->i_din_uid, tp->bno, tp->size, tp->task elsewhere
 * in this file) are missing from the extraction.
 */
1616 struct ffs_blkfree_trim_params
{
1622 * With TRIM, inode pointer is gone in the callback but we still need
1623 * the following fields for ffs_blkfree_cg()
1625 struct vnode
*i_devvp
;
/*
 * ffs_blkfree_trim_task() -- taskqueue callback run after a TRIM bio
 * completes: performs the deferred ffs_blkfree_cg() using the saved
 * parameters in ctx (a struct ffs_blkfree_trim_params *).
 *
 * NOTE(review): the trailing cleanup (presumably freeing tp and the
 * buf) is missing from this extracted view -- confirm in the full
 * source.  'pending' is the standard taskqueue argument.
 */
1634 ffs_blkfree_trim_task(void *ctx
, int pending
)
1636 struct ffs_blkfree_trim_params
*tp
;
1639 ffs_blkfree_cg(tp
->i_fs
, tp
->i_devvp
, tp
->i_dev
, tp
->i_number
,
1640 tp
->i_din_uid
, tp
->bno
, tp
->size
);
/*
 * ffs_blkfree_trim_completed() -- bio done handler for the
 * BUF_CMD_FREEBLKS (TRIM) request issued by ffs_blkfree(): recover
 * the saved parameters and queue ffs_blkfree_trim_task() on the
 * software-interrupt taskqueue to do the actual bitmap free.
 */
1647 ffs_blkfree_trim_completed(struct bio
*biop
)
1649 struct buf
*bp
= biop
->bio_buf
;
1650 struct ffs_blkfree_trim_params
*tp
;
1652 tp
= bp
->b_bio1
.bio_caller_info1
.ptr
;
1653 TASK_INIT(&tp
->task
, 0, ffs_blkfree_trim_task
, tp
);
/*
 * NOTE(review): this second load looks redundant with the one above
 * if biop is &bp->b_bio1 (ffs_blkfree stored tp in b_bio1) --
 * confirm against the full source before simplifying.
 */
1654 tp
= biop
->bio_caller_info1
.ptr
;
1655 taskqueue_enqueue(taskqueue_swi
, &tp
->task
);
1661 * If TRIM is enabled, we TRIM the blocks first then free them. We do this
1662 * after TRIM is finished and the callback handler is called. The logic here
1663 * is that we free the blocks before updating the bitmap so that we don't
1664 * reuse a block before we actually trim it, which would result in trimming
/*
 * ffs_blkfree() -- public entry point for freeing a block/fragment.
 * Without MNT_TRIM it frees synchronously via ffs_blkfree_cg().
 * With MNT_TRIM it first issues a BUF_CMD_FREEBLKS (TRIM) request to
 * the device and defers the bitmap update to the bio completion path
 * (see the comment above), so the block cannot be reallocated and
 * rewritten before the device has trimmed it.
 *
 * NOTE(review): lines are missing from this extracted view (embedded
 * numbering gaps); the bp declaration, tp->i_fs/bno/size assignments
 * and the returns are not visible here.
 */
1668 ffs_blkfree(struct inode
*ip
, ufs_daddr_t bno
, long size
)
1670 struct mount
*mp
= ip
->i_devvp
->v_mount
;
1671 struct ffs_blkfree_trim_params
*tp
;
/* no TRIM requested: free synchronously and return */
1673 if (!(mp
->mnt_flag
& MNT_TRIM
)) {
1674 ffs_blkfree_cg(ip
->i_fs
, ip
->i_devvp
,ip
->i_dev
,ip
->i_number
,
1675 ip
->i_uid
, bno
, size
);
/* save what the deferred ffs_blkfree_cg() call will need */
1681 tp
= kmalloc(sizeof(struct ffs_blkfree_trim_params
), M_TEMP
, M_WAITOK
);
1684 tp
->i_devvp
= ip
->i_devvp
;
1685 tp
->i_dev
= ip
->i_dev
;
1686 tp
->i_din_uid
= ip
->i_uid
;
1687 tp
->i_number
= ip
->i_number
;
/* build and dispatch the TRIM request; completion does the real free */
1690 bp
= getnewbuf(0, 0, 0, 1);
1692 bp
->b_cmd
= BUF_CMD_FREEBLKS
;
1693 bp
->b_bio1
.bio_offset
= fsbtodoff(ip
->i_fs
, bno
);
1694 bp
->b_bcount
= size
;
1695 bp
->b_bio1
.bio_caller_info1
.ptr
= tp
;
1696 bp
->b_bio1
.bio_done
= ffs_blkfree_trim_completed
;
1697 vn_strategy(ip
->i_devvp
, &bp
->b_bio1
);
1702 * Verify allocation of a block or fragment. Returns true if block or
1703 * fragment is allocated, false if it is free.
/*
 * ffs_checkblk() -- DIAGNOSTIC-only consistency check: returns
 * whether the given block/fragment is allocated (see the comment
 * above), panicking on bad arguments or a partially-free fragment.
 *
 * NOTE(review): lines are missing from this extracted view (embedded
 * numbering gaps); declarations of fs/cgp/bp/blksfree, the free++ in
 * the frag loop, brelse and the return are not visible here.
 */
1706 ffs_checkblk(struct inode
*ip
, ufs_daddr_t bno
, long size
)
1711 int i
, error
, frags
, free
;
1715 if ((uint
)size
> fs
->fs_bsize
|| fragoff(fs
, size
) != 0) {
1716 kprintf("bsize = %ld, size = %ld, fs = %s\n",
1717 (long)fs
->fs_bsize
, size
, fs
->fs_fsmnt
);
1718 panic("ffs_checkblk: bad size");
1720 if ((uint
)bno
>= fs
->fs_size
)
1721 panic("ffs_checkblk: bad block %d", bno
);
1722 error
= bread(ip
->i_devvp
, fsbtodoff(fs
, cgtod(fs
, dtog(fs
, bno
))),
1723 (int)fs
->fs_cgsize
, &bp
);
1725 panic("ffs_checkblk: cg bread failed");
1726 cgp
= (struct cg
*)bp
->b_data
;
1727 if (!cg_chkmagic(cgp
))
1728 panic("ffs_checkblk: cg magic mismatch");
1729 blksfree
= cg_blksfree(cgp
);
1730 bno
= dtogd(fs
, bno
);
/* whole block: a single bitmap test decides it */
1731 if (size
== fs
->fs_bsize
) {
1732 free
= ffs_isblock(fs
, blksfree
, fragstoblks(fs
, bno
));
/* fragment: count free frags; must be all-free or all-allocated */
1734 frags
= numfrags(fs
, size
);
1735 for (free
= 0, i
= 0; i
< frags
; i
++)
1736 if (isset(blksfree
, bno
+ i
))
1738 if (free
!= 0 && free
!= frags
)
1739 panic("ffs_checkblk: partially free fragment");
1744 #endif /* DIAGNOSTIC */
/*
 * ffs_vfree() -- free an inode: hand it to the soft-updates code when
 * soft dependencies are active on pvp, otherwise free it immediately
 * via ffs_freefile().
 *
 * NOTE(review): the softdep branch's return is not visible in this
 * extracted view.
 */
1750 ffs_vfree(struct vnode
*pvp
, ino_t ino
, int mode
)
1752 if (DOINGSOFTDEP(pvp
)) {
1753 softdep_freefile(pvp
, ino
, mode
);
1756 return (ffs_freefile(pvp
, ino
, mode
));
1760 * Do the actual free operation.
1761 * The specified inode is placed back in the free map.
/*
 * ffs_freefile() -- do the actual inode free: clear its bit in the
 * cylinder group's inosused map and update inode (and, for
 * directories, ndir) accounting at all levels.
 *
 * NOTE(review): lines are missing from this extracted view (embedded
 * numbering gaps); declarations of pip/fs/cgp/bp/inosused/cg/error,
 * the ino %= fs_ipg adjustment, bdwrite and the return are not
 * visible here.
 */
1764 ffs_freefile(struct vnode
*pvp
, ino_t ino
, int mode
)
1775 if ((uint
)ino
>= fs
->fs_ipg
* fs
->fs_ncg
)
1776 panic("ffs_vfree: range: dev = (%d,%d), ino = %"PRId64
", fs = %s",
1777 major(pip
->i_dev
), minor(pip
->i_dev
), ino
, fs
->fs_fsmnt
);
1778 cg
= ino_to_cg(fs
, ino
);
1779 error
= bread(pip
->i_devvp
, fsbtodoff(fs
, cgtod(fs
, cg
)),
1780 (int)fs
->fs_cgsize
, &bp
);
1785 cgp
= (struct cg
*)bp
->b_data
;
1786 if (!cg_chkmagic(cgp
)) {
1790 cgp
->cg_time
= time_second
;
1791 inosused
= cg_inosused(cgp
);
/* freeing an already-free inode is fatal on a writable filesystem */
1793 if (isclr(inosused
, ino
)) {
1794 kprintf("dev = %s, ino = %lu, fs = %s\n",
1795 devtoname(pip
->i_dev
), (u_long
)ino
, fs
->fs_fsmnt
);
1796 if (fs
->fs_ronly
== 0)
1797 panic("ffs_vfree: freeing free inode");
1799 clrbit(inosused
, ino
);
/* pull the rotor back so the freed inode is found quickly */
1800 if (ino
< cgp
->cg_irotor
)
1801 cgp
->cg_irotor
= ino
;
1802 cgp
->cg_cs
.cs_nifree
++;
1803 fs
->fs_cstotal
.cs_nifree
++;
1804 fs
->fs_cs(fs
, cg
).cs_nifree
++;
1805 if ((mode
& IFMT
) == IFDIR
) {
1806 cgp
->cg_cs
.cs_ndir
--;
1807 fs
->fs_cstotal
.cs_ndir
--;
1808 fs
->fs_cs(fs
, cg
).cs_ndir
--;
1816 * Find a block of the specified size in the specified cylinder group.
1818 * It is a panic if a request is made to find a block if none are
/*
 * ffs_mapsearch() -- find allocsiz consecutive free fragments in the
 * cylinder group's free map, starting at bpref (or the frag rotor),
 * and return the cg-relative fragment number of the run.  Panics if
 * the map contradicts the summary information (see comment above).
 *
 * NOTE(review): lines are missing from this extracted view (embedded
 * numbering gaps); declarations of bno/blksfree, the bpref test
 * before line 1834, the loc==0 checks and the returns are not
 * visible here.
 */
1822 ffs_mapsearch(struct fs
*fs
, struct cg
*cgp
, ufs_daddr_t bpref
, int allocsiz
)
1825 int start
, len
, loc
, i
;
1826 int blk
, field
, subfield
, pos
;
1830 * find the fragment by searching through the free block
1831 * map for an appropriate bit pattern.
1834 start
= dtogd(fs
, bpref
) / NBBY
;
1836 start
= cgp
->cg_frotor
/ NBBY
;
1837 blksfree
= cg_blksfree(cgp
);
/* first pass: scan from start to the end of the map */
1838 len
= howmany(fs
->fs_fpg
, NBBY
) - start
;
1839 loc
= scanc((uint
)len
, (u_char
*)&blksfree
[start
],
1840 (u_char
*)fragtbl
[fs
->fs_frag
],
1841 (u_char
)(1 << (allocsiz
- 1 + (fs
->fs_frag
% NBBY
))));
/* second pass: wrap around and scan the beginning */
1843 len
= start
+ 1; /* XXX why overlap here? */
1845 loc
= scanc((uint
)len
, (u_char
*)&blksfree
[0],
1846 (u_char
*)fragtbl
[fs
->fs_frag
],
1847 (u_char
)(1 << (allocsiz
- 1 + (fs
->fs_frag
% NBBY
))));
1849 kprintf("start = %d, len = %d, fs = %s\n",
1850 start
, len
, fs
->fs_fsmnt
);
1851 panic("ffs_alloccg: map corrupted");
1855 bno
= (start
+ len
- loc
) * NBBY
;
1856 cgp
->cg_frotor
= bno
;
1858 * found the byte in the map
1859 * sift through the bits to find the selected frag
1861 for (i
= bno
+ NBBY
; bno
< i
; bno
+= fs
->fs_frag
) {
1862 blk
= blkmap(fs
, blksfree
, bno
);
/* around/inside masks select runs of allocsiz frags within the block */
1864 field
= around
[allocsiz
];
1865 subfield
= inside
[allocsiz
];
1866 for (pos
= 0; pos
<= fs
->fs_frag
- allocsiz
; pos
++) {
1867 if ((blk
& field
) == subfield
)
1873 kprintf("bno = %lu, fs = %s\n", (u_long
)bno
, fs
->fs_fsmnt
);
1874 panic("ffs_alloccg: block not in map");
1879 * Update the cluster map because of an allocation or free.
1881 * Cnt == 1 means free; cnt == -1 means allocating.
/*
 * ffs_clusteracct() -- keep the cluster free map and cluster summary
 * counts in sync after a block allocation or free (cnt == 1 frees,
 * cnt == -1 allocates -- see comment above).  Measures the free runs
 * adjacent to blkno in both directions and adjusts the per-size
 * summary counts and the cached per-cg maximum.
 *
 * NOTE(review): lines are missing from this extracted view (embedded
 * numbering gaps); declarations of sump/lp, the start computations,
 * the forw/back assignments and the summary-count adjustments between
 * lines 1950 and 1957 are not visible here.
 */
1884 ffs_clusteracct(struct fs
*fs
, struct cg
*cgp
, ufs_daddr_t blkno
, int cnt
)
1888 u_char
*freemapp
, *mapp
;
1889 int i
, start
, end
, forw
, back
, map
, bit
;
/* clustering disabled on this filesystem */
1891 if (fs
->fs_contigsumsize
<= 0)
1893 freemapp
= cg_clustersfree(cgp
);
1894 sump
= cg_clustersum(cgp
);
1896 * Allocate or clear the actual block.
1899 setbit(freemapp
, blkno
);
1901 clrbit(freemapp
, blkno
);
1903 * Find the size of the cluster going forward.
1906 end
= start
+ fs
->fs_contigsumsize
;
1907 if (end
>= cgp
->cg_nclusterblks
)
1908 end
= cgp
->cg_nclusterblks
;
1909 mapp
= &freemapp
[start
/ NBBY
];
1911 bit
= 1 << (start
% NBBY
);
1912 for (i
= start
; i
< end
; i
++) {
1913 if ((map
& bit
) == 0)
/* advance the bit; refetch the map byte at each byte boundary */
1915 if ((i
& (NBBY
- 1)) != (NBBY
- 1)) {
1924 * Find the size of the cluster going backward.
1927 end
= start
- fs
->fs_contigsumsize
;
1930 mapp
= &freemapp
[start
/ NBBY
];
1932 bit
= 1 << (start
% NBBY
);
1933 for (i
= start
; i
> end
; i
--) {
1934 if ((map
& bit
) == 0)
1936 if ((i
& (NBBY
- 1)) != 0) {
1940 bit
= 1 << (NBBY
- 1);
1945 * Account for old cluster and the possibly new forward and
1948 i
= back
+ forw
+ 1;
1949 if (i
> fs
->fs_contigsumsize
)
1950 i
= fs
->fs_contigsumsize
;
1957 * Update cluster summary information.
1959 lp
= &sump
[fs
->fs_contigsumsize
];
/* recompute the largest cluster size with a nonzero count */
1960 for (i
= fs
->fs_contigsumsize
; i
> 0; i
--)
1963 fs
->fs_maxcluster
[cgp
->cg_cgx
] = i
;
1967 * Fserr prints the name of a filesystem with an error diagnostic.
1969 * The form of the error message is:
1973 ffs_fserr(struct fs
*fs
, uint uid
, char *cp
)
1975 struct thread
*td
= curthread
;
1978 if ((p
= td
->td_proc
) != NULL
) {
1979 log(LOG_ERR
, "pid %d (%s), uid %d on %s: %s\n", p
? p
->p_pid
: -1,
1980 p
? p
->p_comm
: "-", uid
, fs
->fs_fsmnt
, cp
);
1982 log(LOG_ERR
, "system thread %p, uid %d on %s: %s\n",
1983 td
, uid
, fs
->fs_fsmnt
, cp
);