2 * linux/fs/ext2/ialloc.c
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@dcs.ed.ac.uk), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
15 #include <linux/config.h>
16 #include <linux/quotaops.h>
17 #include <linux/sched.h>
18 #include <linux/backing-dev.h>
19 #include <linux/buffer_head.h>
20 #include <linux/random.h>
26 * ialloc.c contains the inodes allocation and deallocation routines
30 * The free inodes are managed by bitmaps. A file system contains several
31 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
32 * block for inodes, N blocks for the inode table and data blocks.
34 * The file system contains group descriptors which are located after the
35 * super block. Each descriptor contains the number of the bitmap block and
36 * the free blocks count in the block.
41 * Read the inode allocation bitmap for a given block_group, reading
42 * into the specified slot in the superblock's bitmap cache.
44 * Return buffer_head of bitmap on success or NULL.
/*
 * read_inode_bitmap - fetch the inode bitmap block of one block group.
 *
 * Visible steps: look up the group descriptor with ext2_get_group_desc(),
 * sb_bread() the block whose number is stored little-endian in
 * desc->bg_inode_bitmap, and report "Cannot read inode bitmap" through
 * ext2_error().  bh starts out as NULL.
 *
 * NOTE(review): the numeric prefixes jump (47 -> 49, 52 -> 56, 56 -> 58),
 * so the braces, whatever checks guard the error report, and the return
 * statement are absent from this copy.  Restore them from a pristine file
 * before relying on this block.
 */
46 static struct buffer_head
*
47 read_inode_bitmap(struct super_block
* sb
, unsigned long block_group
)
49 struct ext2_group_desc
*desc
;
50 struct buffer_head
*bh
= NULL
;
52 desc
= ext2_get_group_desc(sb
, block_group
, NULL
);
56 bh
= sb_bread(sb
, le32_to_cpu(desc
->bg_inode_bitmap
));
58 ext2_error(sb
, "read_inode_bitmap",
59 "Cannot read inode bitmap - "
60 "block_group = %lu, inode_bitmap = %u",
61 block_group
, le32_to_cpu(desc
->bg_inode_bitmap
));
/*
 * ext2_release_inode - give one inode back to the accounting of a group.
 *
 * Visible steps: fetch the descriptor of `group` together with its buffer
 * (bh); while holding the per-group lock from sb_bgl_lock(), add one to
 * bg_free_inodes_count and subtract one from bg_used_dirs_count (both kept
 * little-endian on disk); decrement the s_dirs_counter per-cpu counter;
 * finally mark the descriptor buffer dirty.
 *
 * NOTE(review): source lines 72, 81 and 85 are missing from this copy.
 * Presumably they hold the NULL-descriptor guard for the error report and
 * the `if (dir)` tests that restrict the directory accounting to
 * directories -- confirm against the original file.
 */
66 static void ext2_release_inode(struct super_block
*sb
, int group
, int dir
)
68 struct ext2_group_desc
* desc
;
69 struct buffer_head
*bh
;
71 desc
= ext2_get_group_desc(sb
, group
, &bh
);
73 ext2_error(sb
, "ext2_release_inode",
74 "can't get descriptor for group %d", group
);
78 spin_lock(sb_bgl_lock(EXT2_SB(sb
), group
));
79 desc
->bg_free_inodes_count
=
80 cpu_to_le16(le16_to_cpu(desc
->bg_free_inodes_count
) + 1);
82 desc
->bg_used_dirs_count
=
83 cpu_to_le16(le16_to_cpu(desc
->bg_used_dirs_count
) - 1);
84 spin_unlock(sb_bgl_lock(EXT2_SB(sb
), group
));
86 percpu_counter_dec(&EXT2_SB(sb
)->s_dirs_counter
);
88 mark_buffer_dirty(bh
);
92 * NOTE! When we get the inode, we're the only people
93 * that have access to it, and as such there are no
94 * race conditions we have to worry about. The inode
95 * is not on the hash-lists, and it cannot be reached
96 * through the filesystem because the directory entry
97 * has been deleted earlier.
99 * HOWEVER: we must make sure that we get no aliases,
100 * which means that we have to call "clear_inode()"
101 * _before_ we mark the inode not in use in the inode
102 * bitmaps. Otherwise a newly created file might use
103 * the same inode number (not actually the same pointer
104 * though), and then we'd have two inodes sharing the
105 * same inode number and space on the harddisk.
/*
 * ext2_free_inode - release the on-disk inode number held by `inode`.
 *
 * Visible steps:
 *   1. For inodes that are not bad, delete extended attributes and release
 *      the quota charge (ext2_xattr_delete_inode, DQUOT_FREE_INODE).
 *   2. Remember whether the inode is a directory (S_ISDIR on i_mode).
 *   3. Reject inode numbers below EXT2_FIRST_INO(sb) or above
 *      es->s_inodes_count with an ext2_error().
 *   4. Derive the block group and the bit index from (ino - 1).
 *   5. Read the group bitmap, atomically clear the bit (complaining if it
 *      was already clear), update the group counters through
 *      ext2_release_inode(), mark the bitmap dirty, and write it out
 *      synchronously when the superblock has MS_SYNCHRONOUS set.
 *
 * NOTE(review): the declarations and initialisation of `ino`,
 * `is_directory` and `bit`, the statement announced by the "Do this
 * BEFORE" comment (source line 135), the braces and all exit paths are
 * missing from this copy -- confirm against the original file.
 */
107 void ext2_free_inode (struct inode
* inode
)
109 struct super_block
* sb
= inode
->i_sb
;
112 struct buffer_head
*bitmap_bh
= NULL
;
113 unsigned long block_group
;
115 struct ext2_super_block
* es
;
118 ext2_debug ("freeing inode %lu\n", ino
);
121 * Note: we must free any quota before locking the superblock,
122 * as writing the quota to disk may need the lock as well.
124 if (!is_bad_inode(inode
)) {
125 /* Quota is already initialized in iput() */
126 ext2_xattr_delete_inode(inode
);
127 DQUOT_FREE_INODE(inode
);
131 es
= EXT2_SB(sb
)->s_es
;
132 is_directory
= S_ISDIR(inode
->i_mode
);
134 /* Do this BEFORE marking the inode not in use or returning an error */
137 if (ino
< EXT2_FIRST_INO(sb
) ||
138 ino
> le32_to_cpu(es
->s_inodes_count
)) {
139 ext2_error (sb
, "ext2_free_inode",
140 "reserved or nonexistent inode %lu", ino
);
/* Inode numbers are 1-based, hence the (ino - 1) before div/mod. */
143 block_group
= (ino
- 1) / EXT2_INODES_PER_GROUP(sb
);
144 bit
= (ino
- 1) % EXT2_INODES_PER_GROUP(sb
);
146 bitmap_bh
= read_inode_bitmap(sb
, block_group
);
150 /* Ok, now we can actually update the inode bitmaps.. */
151 if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb
), block_group
),
152 bit
, (void *) bitmap_bh
->b_data
))
153 ext2_error (sb
, "ext2_free_inode",
154 "bit already cleared for inode %lu", ino
);
156 ext2_release_inode(sb
, block_group
, is_directory
);
157 mark_buffer_dirty(bitmap_bh
);
158 if (sb
->s_flags
& MS_SYNCHRONOUS
)
159 sync_dirty_buffer(bitmap_bh
);
165 * We perform asynchronous prereading of the new inode's inode block when
166 * we create the inode, in the expectation that the inode will be written
167 * back soon. There are two reasons:
169 * - When creating a large number of files, the async prereads will be
170 * nicely merged into large reads
171 * - When writing out a large number of inodes, we don't need to keep on
172 * stalling the writes while we read the inode block.
174 * FIXME: ext2_get_group_desc() needs to be simplified.
/*
 * ext2_preread_inode - start read-ahead of the block holding a new inode.
 *
 * Visible steps: take the backing_dev_info from inode->i_mapping and test
 * it with bdi_read_congested() and bdi_write_congested(); work out the
 * block group from (i_ino - 1); fetch the group descriptor; compute the
 * byte offset of the inode inside the group inode table
 * (index-in-group * EXT2_INODE_SIZE); convert that to a block number
 * relative to gdp->bg_inode_table; and hand the block to sb_breadahead().
 *
 * NOTE(review): the statements executed when the device is congested
 * (source lines 187 and 189), the declaration of `block`, and any check
 * of gdp are missing from this copy.  Presumably the congestion tests
 * return early -- confirm against the original file.
 */
176 static void ext2_preread_inode(struct inode
*inode
)
178 unsigned long block_group
;
179 unsigned long offset
;
181 struct buffer_head
*bh
;
182 struct ext2_group_desc
* gdp
;
183 struct backing_dev_info
*bdi
;
185 bdi
= inode
->i_mapping
->backing_dev_info
;
186 if (bdi_read_congested(bdi
))
188 if (bdi_write_congested(bdi
))
191 block_group
= (inode
->i_ino
- 1) / EXT2_INODES_PER_GROUP(inode
->i_sb
);
192 gdp
= ext2_get_group_desc(inode
->i_sb
, block_group
, &bh
);
197 * Figure out the offset within the block group inode table
199 offset
= ((inode
->i_ino
- 1) % EXT2_INODES_PER_GROUP(inode
->i_sb
)) *
200 EXT2_INODE_SIZE(inode
->i_sb
);
/* Shifting the byte offset by the block-size bits yields whole blocks. */
201 block
= le32_to_cpu(gdp
->bg_inode_table
) +
202 (offset
>> EXT2_BLOCK_SIZE_BITS(inode
->i_sb
));
203 sb_breadahead(inode
->i_sb
, block
);
207 * There are two policies for allocating an inode. If the new inode is
208 * a directory, then a forward search is made for a block group with both
209 * free space and a low directory-to-inode ratio; if that fails, then of
210 * the groups with above-average free space, that group with the fewest
211 * directories already is chosen.
213 * For other inodes, search forward from the parent directory's block
214 * group to find a free inode.
/*
 * find_group_dir - pick a block group for a new directory.
 *
 * Visible steps: compute the average number of free inodes per group
 * (ext2_count_free_inodes(sb) / ngroups), then walk every group from 0 to
 * ngroups - 1.  Each iteration tests for a missing descriptor or a zero
 * bg_free_inodes_count, compares the free-inode count with the average,
 * and compares bg_free_blocks_count with that of the best candidate so
 * far (best_desc).
 *
 * NOTE(review): the statements run by each test (presumably `continue`),
 * the first half of the best-candidate condition, the update of
 * best_group / best_desc / best_bh and the return statement are missing
 * from this copy -- confirm against the original file.  `parent` is not
 * referenced in the visible lines.
 */
216 static int find_group_dir(struct super_block
*sb
, struct inode
*parent
)
218 int ngroups
= EXT2_SB(sb
)->s_groups_count
;
219 int avefreei
= ext2_count_free_inodes(sb
) / ngroups
;
220 struct ext2_group_desc
*desc
, *best_desc
= NULL
;
221 struct buffer_head
*bh
, *best_bh
= NULL
;
222 int group
, best_group
= -1;
224 for (group
= 0; group
< ngroups
; group
++) {
225 desc
= ext2_get_group_desc (sb
, group
, &bh
);
226 if (!desc
|| !desc
->bg_free_inodes_count
)
228 if (le16_to_cpu(desc
->bg_free_inodes_count
) < avefreei
)
231 (le16_to_cpu(desc
->bg_free_blocks_count
) >
232 le16_to_cpu(best_desc
->bg_free_blocks_count
))) {
245 * Orlov's allocator for directories.
247 * We always try to spread first-level directories.
249 * If there are blockgroups with both free inodes and free blocks counts
250 * not worse than average we return one with smallest directory count.
251 * Otherwise we simply return a random group.
253 * For all other directories the rules are as follows:
255 * It's OK to put directory into a group unless
256 * it has too many directories already (max_dirs) or
257 * it has too few free inodes left (min_inodes) or
258 * it has too few free blocks left (min_blocks) or
259 * it's already running too large debt (max_debt).
260 * Parent's group is preferred; if it doesn't satisfy these
261 * conditions we search cyclically through the rest. If none
262 * of the groups look good we just look for a group with more
263 * free inodes than average (starting at parent's group).
265 * Debt is incremented each time we allocate a directory and decremented
266 * when we allocate an inode, within 0--255.
/*
 * Tuning constants for find_group_orlov(): INODE_COST caps max_debt
 * relative to inodes_per_group, and BLOCK_COST is the lower bound used
 * with blocks_per_dir when max_debt is derived from the blocks per group.
 */
269 #define INODE_COST 64
270 #define BLOCK_COST 256
/*
 * find_group_orlov - Orlov-style choice of a block group for a directory.
 *
 * Visible steps:
 *   - Read the approximate per-cpu totals of free inodes, free blocks and
 *     directories, and derive per-group averages (avefreei, avefreeb).
 *   - If the parent is the root directory or carries EXT2_TOPDIR_FL, start
 *     at a random group and scan all groups, tracking the lowest
 *     bg_used_dirs_count (best_ndir) among groups that pass the
 *     free-inode and free-block average tests.
 *   - Otherwise compute the limits max_dirs, min_inodes, min_blocks and
 *     max_debt, and scan cyclically from the parent group, testing
 *     s_debts[group] and the descriptor counters against those limits.
 *   - A second cyclic scan tests only for free inodes >= avefreei.
 *
 * NOTE(review): declarations of freei, avefreei, free_blocks, avefreeb,
 * ndirs, blocks_per_dir, group, best_group and i, the statements run by
 * every test (presumably `continue` / `goto found`), the final fallback
 * announced by the trailing comment, and all return statements are
 * missing from this copy -- confirm against the original file.
 */
272 static int find_group_orlov(struct super_block
*sb
, struct inode
*parent
)
274 int parent_group
= EXT2_I(parent
)->i_block_group
;
275 struct ext2_sb_info
*sbi
= EXT2_SB(sb
);
276 struct ext2_super_block
*es
= sbi
->s_es
;
277 int ngroups
= sbi
->s_groups_count
;
278 int inodes_per_group
= EXT2_INODES_PER_GROUP(sb
);
285 int max_debt
, max_dirs
, min_blocks
, min_inodes
;
287 struct ext2_group_desc
*desc
;
288 struct buffer_head
*bh
;
290 freei
= percpu_counter_read_positive(&sbi
->s_freeinodes_counter
);
291 avefreei
= freei
/ ngroups
;
292 free_blocks
= percpu_counter_read_positive(&sbi
->s_freeblocks_counter
);
293 avefreeb
= free_blocks
/ ngroups
;
294 ndirs
= percpu_counter_read_positive(&sbi
->s_dirs_counter
);
/* Branch for directories created directly under the root or a TOPDIR. */
296 if ((parent
== sb
->s_root
->d_inode
) ||
297 (EXT2_I(parent
)->i_flags
& EXT2_TOPDIR_FL
)) {
298 struct ext2_group_desc
*best_desc
= NULL
;
299 struct buffer_head
*best_bh
= NULL
;
300 int best_ndir
= inodes_per_group
;
/* The random start replaces parent_group for the scan below. */
303 get_random_bytes(&group
, sizeof(group
));
304 parent_group
= (unsigned)group
% ngroups
;
305 for (i
= 0; i
< ngroups
; i
++) {
306 group
= (parent_group
+ i
) % ngroups
;
307 desc
= ext2_get_group_desc (sb
, group
, &bh
);
308 if (!desc
|| !desc
->bg_free_inodes_count
)
310 if (le16_to_cpu(desc
->bg_used_dirs_count
) >= best_ndir
)
312 if (le16_to_cpu(desc
->bg_free_inodes_count
) < avefreei
)
314 if (le16_to_cpu(desc
->bg_free_blocks_count
) < avefreeb
)
317 best_ndir
= le16_to_cpu(desc
->bg_used_dirs_count
);
321 if (best_group
>= 0) {
331 ndirs
= 1; /* percpu_counters are approximate... */
333 blocks_per_dir
= (le32_to_cpu(es
->s_blocks_count
)-free_blocks
) / ndirs
;
/* Acceptance limits applied to each candidate group in the scan below. */
335 max_dirs
= ndirs
/ ngroups
+ inodes_per_group
/ 16;
336 min_inodes
= avefreei
- inodes_per_group
/ 4;
337 min_blocks
= avefreeb
- EXT2_BLOCKS_PER_GROUP(sb
) / 4;
339 max_debt
= EXT2_BLOCKS_PER_GROUP(sb
) / max(blocks_per_dir
, BLOCK_COST
);
340 if (max_debt
* INODE_COST
> inodes_per_group
)
341 max_debt
= inodes_per_group
/ INODE_COST
;
/* First cyclic scan: debt, directory, inode and block limits. */
347 for (i
= 0; i
< ngroups
; i
++) {
348 group
= (parent_group
+ i
) % ngroups
;
349 desc
= ext2_get_group_desc (sb
, group
, &bh
);
350 if (!desc
|| !desc
->bg_free_inodes_count
)
352 if (sbi
->s_debts
[group
] >= max_debt
)
354 if (le16_to_cpu(desc
->bg_used_dirs_count
) >= max_dirs
)
356 if (le16_to_cpu(desc
->bg_free_inodes_count
) < min_inodes
)
358 if (le16_to_cpu(desc
->bg_free_blocks_count
) < min_blocks
)
/* Second cyclic scan: only the free-inode average is tested. */
364 for (i
= 0; i
< ngroups
; i
++) {
365 group
= (parent_group
+ i
) % ngroups
;
366 desc
= ext2_get_group_desc (sb
, group
, &bh
);
367 if (!desc
|| !desc
->bg_free_inodes_count
)
369 if (le16_to_cpu(desc
->bg_free_inodes_count
) >= avefreei
)
375 * The free-inodes counter is approximate, and for really small
376 * filesystems the above test can fail to find any blockgroups
/*
 * find_group_other - pick a block group for a non-directory inode.
 *
 * Visible steps:
 *   1. Test the parent directory group for both free inodes and free
 *      blocks.
 *   2. Otherwise add parent->i_ino to the group number (modulo ngroups)
 *      and probe groups in a loop whose step i doubles each round
 *      (i <<= 1), again requiring free inodes and free blocks.
 *   3. Otherwise walk linearly from the group after the parent, accepting
 *      any group with free inodes.
 *
 * NOTE(review): the declarations of `group` and `i`, the `group += i`
 * step (source line 421), the wrap-around statements after each
 * `group >= ngroups` test, the statements run on success and the return
 * statements are missing from this copy -- confirm against the original.
 */
388 static int find_group_other(struct super_block
*sb
, struct inode
*parent
)
390 int parent_group
= EXT2_I(parent
)->i_block_group
;
391 int ngroups
= EXT2_SB(sb
)->s_groups_count
;
392 struct ext2_group_desc
*desc
;
393 struct buffer_head
*bh
;
397 * Try to place the inode in its parent directory
399 group
= parent_group
;
400 desc
= ext2_get_group_desc (sb
, group
, &bh
);
401 if (desc
&& le16_to_cpu(desc
->bg_free_inodes_count
) &&
402 le16_to_cpu(desc
->bg_free_blocks_count
))
406 * We're going to place this inode in a different blockgroup from its
407 * parent. We want to cause files in a common directory to all land in
408 * the same blockgroup. But we want files which are in a different
409 * directory which shares a blockgroup with our parent to land in a
410 * different blockgroup.
412 * So add our directory's i_ino into the starting point for the hash.
414 group
= (group
+ parent
->i_ino
) % ngroups
;
417 * Use a quadratic hash to find a group with a free inode and some
420 for (i
= 1; i
< ngroups
; i
<<= 1) {
422 if (group
>= ngroups
)
424 desc
= ext2_get_group_desc (sb
, group
, &bh
);
425 if (desc
&& le16_to_cpu(desc
->bg_free_inodes_count
) &&
426 le16_to_cpu(desc
->bg_free_blocks_count
))
431 * That failed: try linear search for a free inode, even if that group
432 * has no free blocks.
434 group
= parent_group
;
435 for (i
= 0; i
< ngroups
; i
++) {
436 if (++group
>= ngroups
)
438 desc
= ext2_get_group_desc (sb
, group
, &bh
);
439 if (desc
&& le16_to_cpu(desc
->bg_free_inodes_count
))
/*
 * ext2_new_inode - allocate and initialise a new inode under `dir`.
 *
 * Visible steps:
 *   1. Obtain an in-core inode with new_inode(); -ENOMEM is returned as an
 *      ERR_PTR on the failure path.
 *   2. Choose a starting group with find_group_dir() (OLDALLOC mount
 *      option), find_group_orlov() or find_group_other().
 *   3. Visit up to s_groups_count groups: read the inode bitmap, search
 *      for a zero bit, and claim it with ext2_set_bit_atomic() under the
 *      per-group lock.  A lost race retries in the same group; an
 *      exhausted group advances to the next one.
 *   4. Mark the bitmap dirty (and sync it for MS_SYNCHRONOUS mounts),
 *      turn the bit index into a 1-based inode number and range-check it.
 *   5. Update the per-cpu counters, the group descriptor counters and
 *      s_debts[group] under the per-group lock; mark the descriptor
 *      buffer (bh2) dirty.
 *   6. Fill in ownership, mode, timestamps, ext2_inode_info fields and the
 *      generation number, then hash the inode.
 *   7. Charge quota, initialise ACLs and security attributes, mark the
 *      inode dirty and start read-ahead of its inode table block.
 *   The closing lines (DQUOT_FREE_INODE, S_NOQUOTA, make_bad_inode) belong
 *   to the error unwinding.
 *
 * NOTE(review): the declarations of group, i, ino and err, the
 * assignments of sb, sbi, es and ei, the S_ISDIR tests selecting the
 * directory paths, every label, goto, brace and return are missing from
 * this copy, so the exact control flow cannot be confirmed from here.
 */
449 struct inode
*ext2_new_inode(struct inode
*dir
, int mode
)
451 struct super_block
*sb
;
452 struct buffer_head
*bitmap_bh
= NULL
;
453 struct buffer_head
*bh2
;
456 struct inode
* inode
;
457 struct ext2_group_desc
*gdp
;
458 struct ext2_super_block
*es
;
459 struct ext2_inode_info
*ei
;
460 struct ext2_sb_info
*sbi
;
464 inode
= new_inode(sb
);
466 return ERR_PTR(-ENOMEM
);
472 if (test_opt(sb
, OLDALLOC
))
473 group
= find_group_dir(sb
, dir
);
475 group
= find_group_orlov(sb
, dir
);
477 group
= find_group_other(sb
, dir
);
484 for (i
= 0; i
< sbi
->s_groups_count
; i
++) {
485 gdp
= ext2_get_group_desc(sb
, group
, &bh2
);
487 bitmap_bh
= read_inode_bitmap(sb
, group
);
494 repeat_in_this_group
:
495 ino
= ext2_find_next_zero_bit((unsigned long *)bitmap_bh
->b_data
,
496 EXT2_INODES_PER_GROUP(sb
), ino
);
497 if (ino
>= EXT2_INODES_PER_GROUP(sb
)) {
499 * Rare race: find_group_xx() decided that there were
500 * free inodes in this group, but by the time we tried
501 * to allocate one, they're all gone. This can also
502 * occur because the counters which find_group_orlov()
503 * uses are approximate. So just go and search the
506 if (++group
== sbi
->s_groups_count
)
510 if (ext2_set_bit_atomic(sb_bgl_lock(sbi
, group
),
511 ino
, bitmap_bh
->b_data
)) {
512 /* we lost this inode */
513 if (++ino
>= EXT2_INODES_PER_GROUP(sb
)) {
514 /* this group is exhausted, try next group */
515 if (++group
== sbi
->s_groups_count
)
519 /* try to find free inode in the same group */
520 goto repeat_in_this_group
;
526 * Scanned all blockgroups.
531 mark_buffer_dirty(bitmap_bh
);
532 if (sb
->s_flags
& MS_SYNCHRONOUS
)
533 sync_dirty_buffer(bitmap_bh
);
536 ino
+= group
* EXT2_INODES_PER_GROUP(sb
) + 1;
537 if (ino
< EXT2_FIRST_INO(sb
) || ino
> le32_to_cpu(es
->s_inodes_count
)) {
538 ext2_error (sb
, "ext2_new_inode",
539 "reserved inode or inode > inodes count - "
540 "block_group = %d,inode=%lu", group
,
541 (unsigned long) ino
);
546 percpu_counter_mod(&sbi
->s_freeinodes_counter
, -1);
548 percpu_counter_inc(&sbi
->s_dirs_counter
);
550 spin_lock(sb_bgl_lock(sbi
, group
));
551 gdp
->bg_free_inodes_count
=
552 cpu_to_le16(le16_to_cpu(gdp
->bg_free_inodes_count
) - 1);
554 if (sbi
->s_debts
[group
] < 255)
555 sbi
->s_debts
[group
]++;
556 gdp
->bg_used_dirs_count
=
557 cpu_to_le16(le16_to_cpu(gdp
->bg_used_dirs_count
) + 1);
559 if (sbi
->s_debts
[group
])
560 sbi
->s_debts
[group
]--;
562 spin_unlock(sb_bgl_lock(sbi
, group
));
565 mark_buffer_dirty(bh2
);
566 inode
->i_uid
= current
->fsuid
;
567 if (test_opt (sb
, GRPID
))
568 inode
->i_gid
= dir
->i_gid
;
569 else if (dir
->i_mode
& S_ISGID
) {
570 inode
->i_gid
= dir
->i_gid
;
574 inode
->i_gid
= current
->fsgid
;
575 inode
->i_mode
= mode
;
578 inode
->i_blksize
= PAGE_SIZE
; /* This is the optimal IO size (for stat), not the fs block size */
580 inode
->i_mtime
= inode
->i_atime
= inode
->i_ctime
= CURRENT_TIME_SEC
;
581 memset(ei
->i_data
, 0, sizeof(ei
->i_data
));
582 ei
->i_flags
= EXT2_I(dir
)->i_flags
& ~EXT2_BTREE_FL
;
584 ei
->i_flags
&= ~(EXT2_IMMUTABLE_FL
|EXT2_APPEND_FL
);
585 /* dirsync is only applied to directories */
587 ei
->i_flags
&= ~EXT2_DIRSYNC_FL
;
594 ei
->i_block_group
= group
;
595 ei
->i_next_alloc_block
= 0;
596 ei
->i_next_alloc_goal
= 0;
597 ei
->i_prealloc_block
= 0;
598 ei
->i_prealloc_count
= 0;
599 ei
->i_dir_start_lookup
= 0;
600 ei
->i_state
= EXT2_STATE_NEW
;
601 ext2_set_inode_flags(inode
);
602 spin_lock(&sbi
->s_next_gen_lock
);
603 inode
->i_generation
= sbi
->s_next_generation
++;
604 spin_unlock(&sbi
->s_next_gen_lock
);
605 insert_inode_hash(inode
);
607 if (DQUOT_ALLOC_INODE(inode
)) {
612 err
= ext2_init_acl(inode
, dir
);
616 err
= ext2_init_security(inode
,dir
);
620 mark_inode_dirty(inode
);
621 ext2_debug("allocating inode %lu\n", inode
->i_ino
);
622 ext2_preread_inode(inode
);
626 DQUOT_FREE_INODE(inode
);
630 inode
->i_flags
|= S_NOQUOTA
;
636 make_bad_inode(inode
);
/*
 * ext2_count_free_inodes - total the free-inode counts of all groups.
 *
 * Two loops over every block group are visible:
 *   - The first adds each descriptor bg_free_inodes_count to desc_count,
 *     reads the group inode bitmap, counts its free bits with
 *     ext2_count_free() and prints stored versus counted values; a
 *     summary printk follows, comparing the per-cpu counter, desc_count
 *     and bitmap_count.
 *   - The second only sums bg_free_inodes_count into desc_count.
 *
 * NOTE(review): presumably the two loops are alternative branches of a
 * debug preprocessor conditional, but the directives, the declarations
 * of `i` and `x`, the accumulation into bitmap_count, the NULL checks,
 * the buffer release and the return statements are missing from this
 * copy -- confirm against the original file.
 */
641 unsigned long ext2_count_free_inodes (struct super_block
* sb
)
643 struct ext2_group_desc
*desc
;
644 unsigned long desc_count
= 0;
648 struct ext2_super_block
*es
;
649 unsigned long bitmap_count
= 0;
650 struct buffer_head
*bitmap_bh
= NULL
;
653 es
= EXT2_SB(sb
)->s_es
;
654 for (i
= 0; i
< EXT2_SB(sb
)->s_groups_count
; i
++) {
657 desc
= ext2_get_group_desc (sb
, i
, NULL
);
660 desc_count
+= le16_to_cpu(desc
->bg_free_inodes_count
);
662 bitmap_bh
= read_inode_bitmap(sb
, i
);
/* The length argument is in bytes: one bit per inode, eight per byte. */
666 x
= ext2_count_free(bitmap_bh
, EXT2_INODES_PER_GROUP(sb
) / 8);
667 printk("group %d: stored = %d, counted = %u\n",
668 i
, le16_to_cpu(desc
->bg_free_inodes_count
), x
);
672 printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
673 percpu_counter_read(&EXT2_SB(sb
)->s_freeinodes_counter
),
674 desc_count
, bitmap_count
);
/* Plain variant: descriptor counts only, no bitmap reads. */
678 for (i
= 0; i
< EXT2_SB(sb
)->s_groups_count
; i
++) {
679 desc
= ext2_get_group_desc (sb
, i
, NULL
);
682 desc_count
+= le16_to_cpu(desc
->bg_free_inodes_count
);
688 /* Called at mount-time, super-block is locked */
/*
 * ext2_count_dirs - sum bg_used_dirs_count over every block group.
 *
 * Visible steps: iterate i over s_groups_count groups, fetch each group
 * descriptor with ext2_get_group_desc() and add its little-endian
 * bg_used_dirs_count to `count`.
 *
 * NOTE(review): the declaration of `i`, any NULL check on gdp, the
 * closing braces and the return statement are missing from this copy.
 * Presumably `count` is the return value -- confirm against the original.
 */
689 unsigned long ext2_count_dirs (struct super_block
* sb
)
691 unsigned long count
= 0;
694 for (i
= 0; i
< EXT2_SB(sb
)->s_groups_count
; i
++) {
695 struct ext2_group_desc
*gdp
= ext2_get_group_desc (sb
, i
, NULL
);
698 count
+= le16_to_cpu(gdp
->bg_used_dirs_count
);