1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
38 #include "blockcheck.h"
42 #include "localalloc.h"
48 #include "buffer_head_io.h"
50 #define NOT_ALLOC_NEW_GROUP 0
51 #define ALLOC_NEW_GROUP 0x1
52 #define ALLOC_GROUPS_FROM_GLOBAL 0x2
54 #define OCFS2_MAX_TO_STEAL 1024
56 struct ocfs2_suballoc_result
{
57 u64 sr_bg_blkno
; /* The bg we allocated from. Set
58 to 0 when a block group is
60 u64 sr_blkno
; /* The first allocated block */
61 unsigned int sr_bit_offset
; /* The bit in the bg */
62 unsigned int sr_bits
; /* How many bits we claimed */
65 static u64
ocfs2_group_from_res(struct ocfs2_suballoc_result
*res
)
67 if (res
->sr_blkno
== 0)
71 return res
->sr_bg_blkno
;
73 return ocfs2_which_suballoc_group(res
->sr_blkno
, res
->sr_bit_offset
);
76 static inline void ocfs2_debug_bg(struct ocfs2_group_desc
*bg
);
77 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode
*fe
);
78 static inline u16
ocfs2_find_victim_chain(struct ocfs2_chain_list
*cl
);
79 static int ocfs2_block_group_fill(handle_t
*handle
,
80 struct inode
*alloc_inode
,
81 struct buffer_head
*bg_bh
,
83 unsigned int group_clusters
,
85 struct ocfs2_chain_list
*cl
);
86 static int ocfs2_block_group_alloc(struct ocfs2_super
*osb
,
87 struct inode
*alloc_inode
,
88 struct buffer_head
*bh
,
90 u64
*last_alloc_group
,
93 static int ocfs2_cluster_group_search(struct inode
*inode
,
94 struct buffer_head
*group_bh
,
95 u32 bits_wanted
, u32 min_bits
,
97 struct ocfs2_suballoc_result
*res
);
98 static int ocfs2_block_group_search(struct inode
*inode
,
99 struct buffer_head
*group_bh
,
100 u32 bits_wanted
, u32 min_bits
,
102 struct ocfs2_suballoc_result
*res
);
103 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context
*ac
,
107 struct ocfs2_suballoc_result
*res
);
108 static int ocfs2_test_bg_bit_allocatable(struct buffer_head
*bg_bh
,
110 static inline int ocfs2_block_group_set_bits(handle_t
*handle
,
111 struct inode
*alloc_inode
,
112 struct ocfs2_group_desc
*bg
,
113 struct buffer_head
*group_bh
,
114 unsigned int bit_off
,
115 unsigned int num_bits
);
116 static int ocfs2_relink_block_group(handle_t
*handle
,
117 struct inode
*alloc_inode
,
118 struct buffer_head
*fe_bh
,
119 struct buffer_head
*bg_bh
,
120 struct buffer_head
*prev_bg_bh
,
122 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc
*bg
,
124 static inline u32
ocfs2_desc_bitmap_to_cluster_off(struct inode
*inode
,
127 static inline void ocfs2_block_to_cluster_group(struct inode
*inode
,
131 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super
*osb
,
132 u32 bits_wanted
, u64 max_block
,
134 struct ocfs2_alloc_context
**ac
);
136 void ocfs2_free_ac_resource(struct ocfs2_alloc_context
*ac
)
138 struct inode
*inode
= ac
->ac_inode
;
141 if (ac
->ac_which
!= OCFS2_AC_USE_LOCAL
)
142 ocfs2_inode_unlock(inode
, 1);
144 mutex_unlock(&inode
->i_mutex
);
154 void ocfs2_free_alloc_context(struct ocfs2_alloc_context
*ac
)
156 ocfs2_free_ac_resource(ac
);
160 static u32
ocfs2_bits_per_group(struct ocfs2_chain_list
*cl
)
162 return (u32
)le16_to_cpu(cl
->cl_cpg
) * (u32
)le16_to_cpu(cl
->cl_bpc
);
165 #define do_error(fmt, ...) \
168 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
170 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
173 static int ocfs2_validate_gd_self(struct super_block
*sb
,
174 struct buffer_head
*bh
,
177 struct ocfs2_group_desc
*gd
= (struct ocfs2_group_desc
*)bh
->b_data
;
179 if (!OCFS2_IS_VALID_GROUP_DESC(gd
)) {
180 do_error("Group descriptor #%llu has bad signature %.*s",
181 (unsigned long long)bh
->b_blocknr
, 7,
186 if (le64_to_cpu(gd
->bg_blkno
) != bh
->b_blocknr
) {
187 do_error("Group descriptor #%llu has an invalid bg_blkno "
189 (unsigned long long)bh
->b_blocknr
,
190 (unsigned long long)le64_to_cpu(gd
->bg_blkno
));
194 if (le32_to_cpu(gd
->bg_generation
) != OCFS2_SB(sb
)->fs_generation
) {
195 do_error("Group descriptor #%llu has an invalid "
196 "fs_generation of #%u",
197 (unsigned long long)bh
->b_blocknr
,
198 le32_to_cpu(gd
->bg_generation
));
202 if (le16_to_cpu(gd
->bg_free_bits_count
) > le16_to_cpu(gd
->bg_bits
)) {
203 do_error("Group descriptor #%llu has bit count %u but "
204 "claims that %u are free",
205 (unsigned long long)bh
->b_blocknr
,
206 le16_to_cpu(gd
->bg_bits
),
207 le16_to_cpu(gd
->bg_free_bits_count
));
211 if (le16_to_cpu(gd
->bg_bits
) > (8 * le16_to_cpu(gd
->bg_size
))) {
212 do_error("Group descriptor #%llu has bit count %u but "
213 "max bitmap bits of %u",
214 (unsigned long long)bh
->b_blocknr
,
215 le16_to_cpu(gd
->bg_bits
),
216 8 * le16_to_cpu(gd
->bg_size
));
223 static int ocfs2_validate_gd_parent(struct super_block
*sb
,
224 struct ocfs2_dinode
*di
,
225 struct buffer_head
*bh
,
228 unsigned int max_bits
;
229 struct ocfs2_group_desc
*gd
= (struct ocfs2_group_desc
*)bh
->b_data
;
231 if (di
->i_blkno
!= gd
->bg_parent_dinode
) {
232 do_error("Group descriptor #%llu has bad parent "
233 "pointer (%llu, expected %llu)",
234 (unsigned long long)bh
->b_blocknr
,
235 (unsigned long long)le64_to_cpu(gd
->bg_parent_dinode
),
236 (unsigned long long)le64_to_cpu(di
->i_blkno
));
240 max_bits
= le16_to_cpu(di
->id2
.i_chain
.cl_cpg
) * le16_to_cpu(di
->id2
.i_chain
.cl_bpc
);
241 if (le16_to_cpu(gd
->bg_bits
) > max_bits
) {
242 do_error("Group descriptor #%llu has bit count of %u",
243 (unsigned long long)bh
->b_blocknr
,
244 le16_to_cpu(gd
->bg_bits
));
248 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
249 if ((le16_to_cpu(gd
->bg_chain
) >
250 le16_to_cpu(di
->id2
.i_chain
.cl_next_free_rec
)) ||
251 ((le16_to_cpu(gd
->bg_chain
) ==
252 le16_to_cpu(di
->id2
.i_chain
.cl_next_free_rec
)) && !resize
)) {
253 do_error("Group descriptor #%llu has bad chain %u",
254 (unsigned long long)bh
->b_blocknr
,
255 le16_to_cpu(gd
->bg_chain
));
265 * This version only prints errors. It does not fail the filesystem, and
266 * exists only for resize.
268 int ocfs2_check_group_descriptor(struct super_block
*sb
,
269 struct ocfs2_dinode
*di
,
270 struct buffer_head
*bh
)
273 struct ocfs2_group_desc
*gd
= (struct ocfs2_group_desc
*)bh
->b_data
;
275 BUG_ON(!buffer_uptodate(bh
));
278 * If the ecc fails, we return the error but otherwise
279 * leave the filesystem running. We know any error is
280 * local to this block.
282 rc
= ocfs2_validate_meta_ecc(sb
, bh
->b_data
, &gd
->bg_check
);
285 "Checksum failed for group descriptor %llu\n",
286 (unsigned long long)bh
->b_blocknr
);
288 rc
= ocfs2_validate_gd_self(sb
, bh
, 1);
290 rc
= ocfs2_validate_gd_parent(sb
, di
, bh
, 1);
295 static int ocfs2_validate_group_descriptor(struct super_block
*sb
,
296 struct buffer_head
*bh
)
299 struct ocfs2_group_desc
*gd
= (struct ocfs2_group_desc
*)bh
->b_data
;
301 mlog(0, "Validating group descriptor %llu\n",
302 (unsigned long long)bh
->b_blocknr
);
304 BUG_ON(!buffer_uptodate(bh
));
307 * If the ecc fails, we return the error but otherwise
308 * leave the filesystem running. We know any error is
309 * local to this block.
311 rc
= ocfs2_validate_meta_ecc(sb
, bh
->b_data
, &gd
->bg_check
);
316 * Errors after here are fatal.
319 return ocfs2_validate_gd_self(sb
, bh
, 0);
322 int ocfs2_read_group_descriptor(struct inode
*inode
, struct ocfs2_dinode
*di
,
323 u64 gd_blkno
, struct buffer_head
**bh
)
326 struct buffer_head
*tmp
= *bh
;
328 rc
= ocfs2_read_block(INODE_CACHE(inode
), gd_blkno
, &tmp
,
329 ocfs2_validate_group_descriptor
);
333 rc
= ocfs2_validate_gd_parent(inode
->i_sb
, di
, tmp
, 0);
339 /* If ocfs2_read_block() got us a new bh, pass it up. */
347 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super
*osb
,
348 struct ocfs2_group_desc
*bg
,
349 struct ocfs2_chain_list
*cl
,
350 u64 p_blkno
, u32 clusters
)
352 struct ocfs2_extent_list
*el
= &bg
->bg_list
;
353 struct ocfs2_extent_rec
*rec
;
355 BUG_ON(!ocfs2_supports_discontig_bg(osb
));
356 if (!el
->l_next_free_rec
)
357 el
->l_count
= cpu_to_le16(ocfs2_extent_recs_per_gd(osb
->sb
));
358 rec
= &el
->l_recs
[le16_to_cpu(el
->l_next_free_rec
)];
359 rec
->e_blkno
= cpu_to_le64(p_blkno
);
360 rec
->e_cpos
= cpu_to_le32(le16_to_cpu(bg
->bg_bits
) /
361 le16_to_cpu(cl
->cl_bpc
));
362 rec
->e_leaf_clusters
= cpu_to_le32(clusters
);
363 le16_add_cpu(&bg
->bg_bits
, clusters
* le16_to_cpu(cl
->cl_bpc
));
364 le16_add_cpu(&bg
->bg_free_bits_count
,
365 clusters
* le16_to_cpu(cl
->cl_bpc
));
366 le16_add_cpu(&el
->l_next_free_rec
, 1);
369 static int ocfs2_block_group_fill(handle_t
*handle
,
370 struct inode
*alloc_inode
,
371 struct buffer_head
*bg_bh
,
373 unsigned int group_clusters
,
375 struct ocfs2_chain_list
*cl
)
378 struct ocfs2_super
*osb
= OCFS2_SB(alloc_inode
->i_sb
);
379 struct ocfs2_group_desc
*bg
= (struct ocfs2_group_desc
*) bg_bh
->b_data
;
380 struct super_block
* sb
= alloc_inode
->i_sb
;
384 if (((unsigned long long) bg_bh
->b_blocknr
) != group_blkno
) {
385 ocfs2_error(alloc_inode
->i_sb
, "group block (%llu) != "
387 (unsigned long long)group_blkno
,
388 (unsigned long long) bg_bh
->b_blocknr
);
393 status
= ocfs2_journal_access_gd(handle
,
394 INODE_CACHE(alloc_inode
),
396 OCFS2_JOURNAL_ACCESS_CREATE
);
402 memset(bg
, 0, sb
->s_blocksize
);
403 strcpy(bg
->bg_signature
, OCFS2_GROUP_DESC_SIGNATURE
);
404 bg
->bg_generation
= cpu_to_le32(OCFS2_SB(sb
)->fs_generation
);
405 bg
->bg_size
= cpu_to_le16(ocfs2_group_bitmap_size(sb
, 1,
406 osb
->s_feature_incompat
));
407 bg
->bg_chain
= cpu_to_le16(my_chain
);
408 bg
->bg_next_group
= cl
->cl_recs
[my_chain
].c_blkno
;
409 bg
->bg_parent_dinode
= cpu_to_le64(OCFS2_I(alloc_inode
)->ip_blkno
);
410 bg
->bg_blkno
= cpu_to_le64(group_blkno
);
411 if (group_clusters
== le16_to_cpu(cl
->cl_cpg
))
412 bg
->bg_bits
= cpu_to_le16(ocfs2_bits_per_group(cl
));
414 ocfs2_bg_discontig_add_extent(osb
, bg
, cl
, group_blkno
,
417 /* set the 1st bit in the bitmap to account for the descriptor block */
418 ocfs2_set_bit(0, (unsigned long *)bg
->bg_bitmap
);
419 bg
->bg_free_bits_count
= cpu_to_le16(le16_to_cpu(bg
->bg_bits
) - 1);
421 ocfs2_journal_dirty(handle
, bg_bh
);
423 /* There is no need to zero out or otherwise initialize the
424 * other blocks in a group - All valid FS metadata in a block
425 * group stores the superblock fs_generation value at
426 * allocation time. */
433 static inline u16
ocfs2_find_smallest_chain(struct ocfs2_chain_list
*cl
)
438 while (curr
< le16_to_cpu(cl
->cl_count
)) {
439 if (le32_to_cpu(cl
->cl_recs
[best
].c_total
) >
440 le32_to_cpu(cl
->cl_recs
[curr
].c_total
))
447 static struct buffer_head
*
448 ocfs2_block_group_alloc_contig(struct ocfs2_super
*osb
, handle_t
*handle
,
449 struct inode
*alloc_inode
,
450 struct ocfs2_alloc_context
*ac
,
451 struct ocfs2_chain_list
*cl
)
454 u32 bit_off
, num_bits
;
456 struct buffer_head
*bg_bh
;
457 unsigned int alloc_rec
= ocfs2_find_smallest_chain(cl
);
459 status
= ocfs2_claim_clusters(handle
, ac
,
460 le16_to_cpu(cl
->cl_cpg
), &bit_off
,
463 if (status
!= -ENOSPC
)
468 /* setup the group */
469 bg_blkno
= ocfs2_clusters_to_blocks(osb
->sb
, bit_off
);
470 mlog(0, "new descriptor, record %u, at block %llu\n",
471 alloc_rec
, (unsigned long long)bg_blkno
);
473 bg_bh
= sb_getblk(osb
->sb
, bg_blkno
);
479 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode
), bg_bh
);
481 status
= ocfs2_block_group_fill(handle
, alloc_inode
, bg_bh
,
482 bg_blkno
, num_bits
, alloc_rec
, cl
);
489 return status
? ERR_PTR(status
) : bg_bh
;
492 static int ocfs2_block_group_claim_bits(struct ocfs2_super
*osb
,
494 struct ocfs2_alloc_context
*ac
,
495 unsigned int min_bits
,
496 u32
*bit_off
, u32
*num_bits
)
501 status
= ocfs2_claim_clusters(handle
, ac
, min_bits
,
503 if (status
!= -ENOSPC
)
512 static int ocfs2_block_group_grow_discontig(handle_t
*handle
,
513 struct inode
*alloc_inode
,
514 struct buffer_head
*bg_bh
,
515 struct ocfs2_alloc_context
*ac
,
516 struct ocfs2_chain_list
*cl
,
517 unsigned int min_bits
)
520 struct ocfs2_super
*osb
= OCFS2_SB(alloc_inode
->i_sb
);
521 struct ocfs2_group_desc
*bg
=
522 (struct ocfs2_group_desc
*)bg_bh
->b_data
;
523 unsigned int needed
= le16_to_cpu(cl
->cl_cpg
) -
524 le16_to_cpu(bg
->bg_bits
) / le16_to_cpu(cl
->cl_bpc
);
525 u32 p_cpos
, clusters
;
527 struct ocfs2_extent_list
*el
= &bg
->bg_list
;
529 status
= ocfs2_journal_access_gd(handle
,
530 INODE_CACHE(alloc_inode
),
532 OCFS2_JOURNAL_ACCESS_CREATE
);
538 while ((needed
> 0) && (le16_to_cpu(el
->l_next_free_rec
) <
539 le16_to_cpu(el
->l_count
))) {
540 if (min_bits
> needed
)
542 status
= ocfs2_block_group_claim_bits(osb
, handle
, ac
,
546 if (status
!= -ENOSPC
)
550 p_blkno
= ocfs2_clusters_to_blocks(osb
->sb
, p_cpos
);
551 ocfs2_bg_discontig_add_extent(osb
, bg
, cl
, p_blkno
,
555 needed
= le16_to_cpu(cl
->cl_cpg
) -
556 le16_to_cpu(bg
->bg_bits
) / le16_to_cpu(cl
->cl_bpc
);
561 * We have used up all the extent rec but can't fill up
562 * the cpg. So bail out.
568 ocfs2_journal_dirty(handle
, bg_bh
);
574 static void ocfs2_bg_alloc_cleanup(handle_t
*handle
,
575 struct ocfs2_alloc_context
*cluster_ac
,
576 struct inode
*alloc_inode
,
577 struct buffer_head
*bg_bh
)
580 struct ocfs2_group_desc
*bg
;
581 struct ocfs2_extent_list
*el
;
582 struct ocfs2_extent_rec
*rec
;
587 bg
= (struct ocfs2_group_desc
*)bg_bh
->b_data
;
589 for (i
= 0; i
< le16_to_cpu(el
->l_next_free_rec
); i
++) {
590 rec
= &el
->l_recs
[i
];
591 ret
= ocfs2_free_clusters(handle
, cluster_ac
->ac_inode
,
593 le64_to_cpu(rec
->e_blkno
),
594 le32_to_cpu(rec
->e_leaf_clusters
));
597 /* Try all the clusters to free */
600 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode
), bg_bh
);
604 static struct buffer_head
*
605 ocfs2_block_group_alloc_discontig(handle_t
*handle
,
606 struct inode
*alloc_inode
,
607 struct ocfs2_alloc_context
*ac
,
608 struct ocfs2_chain_list
*cl
)
611 u32 bit_off
, num_bits
;
613 unsigned int min_bits
= le16_to_cpu(cl
->cl_cpg
) >> 1;
614 struct buffer_head
*bg_bh
= NULL
;
615 unsigned int alloc_rec
= ocfs2_find_smallest_chain(cl
);
616 struct ocfs2_super
*osb
= OCFS2_SB(alloc_inode
->i_sb
);
618 if (!ocfs2_supports_discontig_bg(osb
)) {
623 status
= ocfs2_extend_trans(handle
,
624 ocfs2_calc_bg_discontig_credits(osb
->sb
));
631 * We're going to be grabbing from multiple cluster groups.
632 * We don't have enough credits to relink them all, and the
633 * cluster groups will be staying in cache for the duration of
636 ac
->ac_allow_chain_relink
= 0;
638 /* Claim the first region */
639 status
= ocfs2_block_group_claim_bits(osb
, handle
, ac
, min_bits
,
640 &bit_off
, &num_bits
);
642 if (status
!= -ENOSPC
)
648 /* setup the group */
649 bg_blkno
= ocfs2_clusters_to_blocks(osb
->sb
, bit_off
);
650 mlog(0, "new descriptor, record %u, at block %llu\n",
651 alloc_rec
, (unsigned long long)bg_blkno
);
653 bg_bh
= sb_getblk(osb
->sb
, bg_blkno
);
659 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode
), bg_bh
);
661 status
= ocfs2_block_group_fill(handle
, alloc_inode
, bg_bh
,
662 bg_blkno
, num_bits
, alloc_rec
, cl
);
668 status
= ocfs2_block_group_grow_discontig(handle
, alloc_inode
,
669 bg_bh
, ac
, cl
, min_bits
);
675 ocfs2_bg_alloc_cleanup(handle
, ac
, alloc_inode
, bg_bh
);
676 return status
? ERR_PTR(status
) : bg_bh
;
680 * We expect the block group allocator to already be locked.
682 static int ocfs2_block_group_alloc(struct ocfs2_super
*osb
,
683 struct inode
*alloc_inode
,
684 struct buffer_head
*bh
,
686 u64
*last_alloc_group
,
690 struct ocfs2_dinode
*fe
= (struct ocfs2_dinode
*) bh
->b_data
;
691 struct ocfs2_chain_list
*cl
;
692 struct ocfs2_alloc_context
*ac
= NULL
;
693 handle_t
*handle
= NULL
;
695 struct buffer_head
*bg_bh
= NULL
;
696 struct ocfs2_group_desc
*bg
;
698 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode
));
702 cl
= &fe
->id2
.i_chain
;
703 status
= ocfs2_reserve_clusters_with_limit(osb
,
704 le16_to_cpu(cl
->cl_cpg
),
705 max_block
, flags
, &ac
);
707 if (status
!= -ENOSPC
)
712 credits
= ocfs2_calc_group_alloc_credits(osb
->sb
,
713 le16_to_cpu(cl
->cl_cpg
));
714 handle
= ocfs2_start_trans(osb
, credits
);
715 if (IS_ERR(handle
)) {
716 status
= PTR_ERR(handle
);
722 if (last_alloc_group
&& *last_alloc_group
!= 0) {
723 mlog(0, "use old allocation group %llu for block group alloc\n",
724 (unsigned long long)*last_alloc_group
);
725 ac
->ac_last_group
= *last_alloc_group
;
728 bg_bh
= ocfs2_block_group_alloc_contig(osb
, handle
, alloc_inode
,
730 if (IS_ERR(bg_bh
) && (PTR_ERR(bg_bh
) == -ENOSPC
))
731 bg_bh
= ocfs2_block_group_alloc_discontig(handle
,
735 status
= PTR_ERR(bg_bh
);
737 if (status
!= -ENOSPC
)
741 bg
= (struct ocfs2_group_desc
*) bg_bh
->b_data
;
743 status
= ocfs2_journal_access_di(handle
, INODE_CACHE(alloc_inode
),
744 bh
, OCFS2_JOURNAL_ACCESS_WRITE
);
750 alloc_rec
= le16_to_cpu(bg
->bg_chain
);
751 le32_add_cpu(&cl
->cl_recs
[alloc_rec
].c_free
,
752 le16_to_cpu(bg
->bg_free_bits_count
));
753 le32_add_cpu(&cl
->cl_recs
[alloc_rec
].c_total
,
754 le16_to_cpu(bg
->bg_bits
));
755 cl
->cl_recs
[alloc_rec
].c_blkno
= bg
->bg_blkno
;
756 if (le16_to_cpu(cl
->cl_next_free_rec
) < le16_to_cpu(cl
->cl_count
))
757 le16_add_cpu(&cl
->cl_next_free_rec
, 1);
759 le32_add_cpu(&fe
->id1
.bitmap1
.i_used
, le16_to_cpu(bg
->bg_bits
) -
760 le16_to_cpu(bg
->bg_free_bits_count
));
761 le32_add_cpu(&fe
->id1
.bitmap1
.i_total
, le16_to_cpu(bg
->bg_bits
));
762 le32_add_cpu(&fe
->i_clusters
, le16_to_cpu(cl
->cl_cpg
));
764 ocfs2_journal_dirty(handle
, bh
);
766 spin_lock(&OCFS2_I(alloc_inode
)->ip_lock
);
767 OCFS2_I(alloc_inode
)->ip_clusters
= le32_to_cpu(fe
->i_clusters
);
768 fe
->i_size
= cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode
->i_sb
,
769 le32_to_cpu(fe
->i_clusters
)));
770 spin_unlock(&OCFS2_I(alloc_inode
)->ip_lock
);
771 i_size_write(alloc_inode
, le64_to_cpu(fe
->i_size
));
772 alloc_inode
->i_blocks
= ocfs2_inode_sector_count(alloc_inode
);
776 /* save the new last alloc group so that the caller can cache it. */
777 if (last_alloc_group
)
778 *last_alloc_group
= ac
->ac_last_group
;
782 ocfs2_commit_trans(osb
, handle
);
785 ocfs2_free_alloc_context(ac
);
793 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super
*osb
,
794 struct ocfs2_alloc_context
*ac
,
797 u64
*last_alloc_group
,
801 u32 bits_wanted
= ac
->ac_bits_wanted
;
802 struct inode
*alloc_inode
;
803 struct buffer_head
*bh
= NULL
;
804 struct ocfs2_dinode
*fe
;
809 alloc_inode
= ocfs2_get_system_file_inode(osb
, type
, slot
);
815 mutex_lock(&alloc_inode
->i_mutex
);
817 status
= ocfs2_inode_lock(alloc_inode
, &bh
, 1);
819 mutex_unlock(&alloc_inode
->i_mutex
);
826 ac
->ac_inode
= alloc_inode
;
827 ac
->ac_alloc_slot
= slot
;
829 fe
= (struct ocfs2_dinode
*) bh
->b_data
;
831 /* The bh was validated by the inode read inside
832 * ocfs2_inode_lock(). Any corruption is a code bug. */
833 BUG_ON(!OCFS2_IS_VALID_DINODE(fe
));
835 if (!(fe
->i_flags
& cpu_to_le32(OCFS2_CHAIN_FL
))) {
836 ocfs2_error(alloc_inode
->i_sb
, "Invalid chain allocator %llu",
837 (unsigned long long)le64_to_cpu(fe
->i_blkno
));
842 free_bits
= le32_to_cpu(fe
->id1
.bitmap1
.i_total
) -
843 le32_to_cpu(fe
->id1
.bitmap1
.i_used
);
845 if (bits_wanted
> free_bits
) {
846 /* cluster bitmap never grows */
847 if (ocfs2_is_cluster_bitmap(alloc_inode
)) {
848 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
849 bits_wanted
, free_bits
);
854 if (!(flags
& ALLOC_NEW_GROUP
)) {
855 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
856 "and we don't alloc a new group for it.\n",
857 slot
, bits_wanted
, free_bits
);
862 status
= ocfs2_block_group_alloc(osb
, alloc_inode
, bh
,
864 last_alloc_group
, flags
);
866 if (status
!= -ENOSPC
)
870 atomic_inc(&osb
->alloc_stats
.bg_extends
);
872 /* You should never ask for this much metadata */
874 (le32_to_cpu(fe
->id1
.bitmap1
.i_total
)
875 - le32_to_cpu(fe
->id1
.bitmap1
.i_used
)));
887 static void ocfs2_init_inode_steal_slot(struct ocfs2_super
*osb
)
889 spin_lock(&osb
->osb_lock
);
890 osb
->s_inode_steal_slot
= OCFS2_INVALID_SLOT
;
891 spin_unlock(&osb
->osb_lock
);
892 atomic_set(&osb
->s_num_inodes_stolen
, 0);
895 static void ocfs2_init_meta_steal_slot(struct ocfs2_super
*osb
)
897 spin_lock(&osb
->osb_lock
);
898 osb
->s_meta_steal_slot
= OCFS2_INVALID_SLOT
;
899 spin_unlock(&osb
->osb_lock
);
900 atomic_set(&osb
->s_num_meta_stolen
, 0);
903 void ocfs2_init_steal_slots(struct ocfs2_super
*osb
)
905 ocfs2_init_inode_steal_slot(osb
);
906 ocfs2_init_meta_steal_slot(osb
);
909 static void __ocfs2_set_steal_slot(struct ocfs2_super
*osb
, int slot
, int type
)
911 spin_lock(&osb
->osb_lock
);
912 if (type
== INODE_ALLOC_SYSTEM_INODE
)
913 osb
->s_inode_steal_slot
= slot
;
914 else if (type
== EXTENT_ALLOC_SYSTEM_INODE
)
915 osb
->s_meta_steal_slot
= slot
;
916 spin_unlock(&osb
->osb_lock
);
919 static int __ocfs2_get_steal_slot(struct ocfs2_super
*osb
, int type
)
921 int slot
= OCFS2_INVALID_SLOT
;
923 spin_lock(&osb
->osb_lock
);
924 if (type
== INODE_ALLOC_SYSTEM_INODE
)
925 slot
= osb
->s_inode_steal_slot
;
926 else if (type
== EXTENT_ALLOC_SYSTEM_INODE
)
927 slot
= osb
->s_meta_steal_slot
;
928 spin_unlock(&osb
->osb_lock
);
933 static int ocfs2_get_inode_steal_slot(struct ocfs2_super
*osb
)
935 return __ocfs2_get_steal_slot(osb
, INODE_ALLOC_SYSTEM_INODE
);
938 static int ocfs2_get_meta_steal_slot(struct ocfs2_super
*osb
)
940 return __ocfs2_get_steal_slot(osb
, EXTENT_ALLOC_SYSTEM_INODE
);
943 static int ocfs2_steal_resource(struct ocfs2_super
*osb
,
944 struct ocfs2_alloc_context
*ac
,
947 int i
, status
= -ENOSPC
;
948 int slot
= __ocfs2_get_steal_slot(osb
, type
);
950 /* Start to steal resource from the first slot after ours. */
951 if (slot
== OCFS2_INVALID_SLOT
)
952 slot
= osb
->slot_num
+ 1;
954 for (i
= 0; i
< osb
->max_slots
; i
++, slot
++) {
955 if (slot
== osb
->max_slots
)
958 if (slot
== osb
->slot_num
)
961 status
= ocfs2_reserve_suballoc_bits(osb
, ac
,
964 NOT_ALLOC_NEW_GROUP
);
966 __ocfs2_set_steal_slot(osb
, slot
, type
);
970 ocfs2_free_ac_resource(ac
);
976 static int ocfs2_steal_inode(struct ocfs2_super
*osb
,
977 struct ocfs2_alloc_context
*ac
)
979 return ocfs2_steal_resource(osb
, ac
, INODE_ALLOC_SYSTEM_INODE
);
982 static int ocfs2_steal_meta(struct ocfs2_super
*osb
,
983 struct ocfs2_alloc_context
*ac
)
985 return ocfs2_steal_resource(osb
, ac
, EXTENT_ALLOC_SYSTEM_INODE
);
988 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super
*osb
,
990 struct ocfs2_alloc_context
**ac
)
993 int slot
= ocfs2_get_meta_steal_slot(osb
);
995 *ac
= kzalloc(sizeof(struct ocfs2_alloc_context
), GFP_KERNEL
);
1002 (*ac
)->ac_bits_wanted
= blocks
;
1003 (*ac
)->ac_which
= OCFS2_AC_USE_META
;
1004 (*ac
)->ac_group_search
= ocfs2_block_group_search
;
1006 if (slot
!= OCFS2_INVALID_SLOT
&&
1007 atomic_read(&osb
->s_num_meta_stolen
) < OCFS2_MAX_TO_STEAL
)
1010 atomic_set(&osb
->s_num_meta_stolen
, 0);
1011 status
= ocfs2_reserve_suballoc_bits(osb
, (*ac
),
1012 EXTENT_ALLOC_SYSTEM_INODE
,
1013 (u32
)osb
->slot_num
, NULL
,
1014 ALLOC_GROUPS_FROM_GLOBAL
|ALLOC_NEW_GROUP
);
1019 if (slot
!= OCFS2_INVALID_SLOT
)
1020 ocfs2_init_meta_steal_slot(osb
);
1022 } else if (status
< 0 && status
!= -ENOSPC
) {
1027 ocfs2_free_ac_resource(*ac
);
1030 status
= ocfs2_steal_meta(osb
, *ac
);
1031 atomic_inc(&osb
->s_num_meta_stolen
);
1033 if (status
!= -ENOSPC
)
1040 if ((status
< 0) && *ac
) {
1041 ocfs2_free_alloc_context(*ac
);
1049 int ocfs2_reserve_new_metadata(struct ocfs2_super
*osb
,
1050 struct ocfs2_extent_list
*root_el
,
1051 struct ocfs2_alloc_context
**ac
)
1053 return ocfs2_reserve_new_metadata_blocks(osb
,
1054 ocfs2_extend_meta_needed(root_el
),
1058 int ocfs2_reserve_new_inode(struct ocfs2_super
*osb
,
1059 struct ocfs2_alloc_context
**ac
)
1062 int slot
= ocfs2_get_inode_steal_slot(osb
);
1065 *ac
= kzalloc(sizeof(struct ocfs2_alloc_context
), GFP_KERNEL
);
1072 (*ac
)->ac_bits_wanted
= 1;
1073 (*ac
)->ac_which
= OCFS2_AC_USE_INODE
;
1075 (*ac
)->ac_group_search
= ocfs2_block_group_search
;
1078 * stat(2) can't handle i_ino > 32bits, so we tell the
1079 * lower levels not to allocate us a block group past that
1080 * limit. The 'inode64' mount option avoids this behavior.
1082 if (!(osb
->s_mount_opt
& OCFS2_MOUNT_INODE64
))
1083 (*ac
)->ac_max_block
= (u32
)~0U;
1086 * slot is set when we successfully steal inode from other nodes.
1087 * It is reset in 3 places:
1088 * 1. when we flush the truncate log
1089 * 2. when we complete local alloc recovery.
1090 * 3. when we successfully allocate from our own slot.
1091 * After it is set, we will go on stealing inodes until we find the
1092 * need to check our slots to see whether there is some space for us.
1094 if (slot
!= OCFS2_INVALID_SLOT
&&
1095 atomic_read(&osb
->s_num_inodes_stolen
) < OCFS2_MAX_TO_STEAL
)
1098 atomic_set(&osb
->s_num_inodes_stolen
, 0);
1099 alloc_group
= osb
->osb_inode_alloc_group
;
1100 status
= ocfs2_reserve_suballoc_bits(osb
, *ac
,
1101 INODE_ALLOC_SYSTEM_INODE
,
1105 ALLOC_GROUPS_FROM_GLOBAL
);
1109 spin_lock(&osb
->osb_lock
);
1110 osb
->osb_inode_alloc_group
= alloc_group
;
1111 spin_unlock(&osb
->osb_lock
);
1112 mlog(0, "after reservation, new allocation group is "
1113 "%llu\n", (unsigned long long)alloc_group
);
1116 * Some inodes must be freed by us, so try to allocate
1117 * from our own next time.
1119 if (slot
!= OCFS2_INVALID_SLOT
)
1120 ocfs2_init_inode_steal_slot(osb
);
1122 } else if (status
< 0 && status
!= -ENOSPC
) {
1127 ocfs2_free_ac_resource(*ac
);
1130 status
= ocfs2_steal_inode(osb
, *ac
);
1131 atomic_inc(&osb
->s_num_inodes_stolen
);
1133 if (status
!= -ENOSPC
)
1140 if ((status
< 0) && *ac
) {
1141 ocfs2_free_alloc_context(*ac
);
1149 /* local alloc code has to do the same thing, so rather than do this
1151 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super
*osb
,
1152 struct ocfs2_alloc_context
*ac
)
1156 ac
->ac_which
= OCFS2_AC_USE_MAIN
;
1157 ac
->ac_group_search
= ocfs2_cluster_group_search
;
1159 status
= ocfs2_reserve_suballoc_bits(osb
, ac
,
1160 GLOBAL_BITMAP_SYSTEM_INODE
,
1161 OCFS2_INVALID_SLOT
, NULL
,
1163 if (status
< 0 && status
!= -ENOSPC
) {
1172 /* Callers don't need to care which bitmap (local alloc or main) to
1173 * use so we figure it out for them, but unfortunately this clutters
1175 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super
*osb
,
1176 u32 bits_wanted
, u64 max_block
,
1178 struct ocfs2_alloc_context
**ac
)
1184 *ac
= kzalloc(sizeof(struct ocfs2_alloc_context
), GFP_KERNEL
);
1191 (*ac
)->ac_bits_wanted
= bits_wanted
;
1192 (*ac
)->ac_max_block
= max_block
;
1195 if (!(flags
& ALLOC_GROUPS_FROM_GLOBAL
) &&
1196 ocfs2_alloc_should_use_local(osb
, bits_wanted
)) {
1197 status
= ocfs2_reserve_local_alloc_bits(osb
,
1200 if ((status
< 0) && (status
!= -ENOSPC
)) {
1206 if (status
== -ENOSPC
) {
1207 status
= ocfs2_reserve_cluster_bitmap_bits(osb
, *ac
);
1209 if (status
!= -ENOSPC
)
1217 if ((status
< 0) && *ac
) {
1218 ocfs2_free_alloc_context(*ac
);
1226 int ocfs2_reserve_clusters(struct ocfs2_super
*osb
,
1228 struct ocfs2_alloc_context
**ac
)
1230 return ocfs2_reserve_clusters_with_limit(osb
, bits_wanted
, 0,
1231 ALLOC_NEW_GROUP
, ac
);
1235 * More or less lifted from ext3. I'll leave their description below:
1237 * "For ext3 allocations, we must not reuse any blocks which are
1238 * allocated in the bitmap buffer's "last committed data" copy. This
1239 * prevents deletes from freeing up the page for reuse until we have
1240 * committed the delete transaction.
1242 * If we didn't do this, then deleting something and reallocating it as
1243 * data would allow the old block to be overwritten before the
1244 * transaction committed (because we force data to disk before commit).
1245 * This would lead to corruption if we crashed between overwriting the
1246 * data and committing the delete.
1248 * @@@ We may want to make this allocation behaviour conditional on
1249 * data-writes at some point, and disable it for metadata allocations or
1250 * sync-data inodes."
1252 * Note: OCFS2 already does this differently for metadata vs data
1253 * allocations, as those bitmaps are separate and undo access is never
1254 * called on a metadata group descriptor.
1256 static int ocfs2_test_bg_bit_allocatable(struct buffer_head
*bg_bh
,
1259 struct ocfs2_group_desc
*bg
= (struct ocfs2_group_desc
*) bg_bh
->b_data
;
1262 if (ocfs2_test_bit(nr
, (unsigned long *)bg
->bg_bitmap
))
1265 if (!buffer_jbd(bg_bh
))
1268 jbd_lock_bh_state(bg_bh
);
1269 bg
= (struct ocfs2_group_desc
*) bh2jh(bg_bh
)->b_committed_data
;
1271 ret
= !ocfs2_test_bit(nr
, (unsigned long *)bg
->bg_bitmap
);
1274 jbd_unlock_bh_state(bg_bh
);
1279 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super
*osb
,
1280 struct buffer_head
*bg_bh
,
1281 unsigned int bits_wanted
,
1282 unsigned int total_bits
,
1283 struct ocfs2_suballoc_result
*res
)
1286 u16 best_offset
, best_size
;
1287 int offset
, start
, found
, status
= 0;
1288 struct ocfs2_group_desc
*bg
= (struct ocfs2_group_desc
*) bg_bh
->b_data
;
1290 /* Callers got this descriptor from
1291 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1292 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg
));
1294 found
= start
= best_offset
= best_size
= 0;
1295 bitmap
= bg
->bg_bitmap
;
1297 while((offset
= ocfs2_find_next_zero_bit(bitmap
, total_bits
, start
)) != -1) {
1298 if (offset
== total_bits
)
1301 if (!ocfs2_test_bg_bit_allocatable(bg_bh
, offset
)) {
1302 /* We found a zero, but we can't use it as it
1303 * hasn't been put to disk yet! */
1306 } else if (offset
== start
) {
1307 /* we found a zero */
1309 /* move start to the next bit to test */
1312 /* got a zero after some ones */
1316 if (found
> best_size
) {
1318 best_offset
= start
- found
;
1320 /* we got everything we needed */
1321 if (found
== bits_wanted
) {
1322 /* mlog(0, "Found it all!\n"); */
1328 res
->sr_bit_offset
= best_offset
;
1329 res
->sr_bits
= best_size
;
1332 /* No error log here -- see the comment above
1333 * ocfs2_test_bg_bit_allocatable */
1339 static inline int ocfs2_block_group_set_bits(handle_t
*handle
,
1340 struct inode
*alloc_inode
,
1341 struct ocfs2_group_desc
*bg
,
1342 struct buffer_head
*group_bh
,
1343 unsigned int bit_off
,
1344 unsigned int num_bits
)
1347 void *bitmap
= bg
->bg_bitmap
;
1348 int journal_type
= OCFS2_JOURNAL_ACCESS_WRITE
;
1352 /* All callers get the descriptor via
1353 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1354 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg
));
1355 BUG_ON(le16_to_cpu(bg
->bg_free_bits_count
) < num_bits
);
1357 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off
,
1360 if (ocfs2_is_cluster_bitmap(alloc_inode
))
1361 journal_type
= OCFS2_JOURNAL_ACCESS_UNDO
;
1363 status
= ocfs2_journal_access_gd(handle
,
1364 INODE_CACHE(alloc_inode
),
1372 le16_add_cpu(&bg
->bg_free_bits_count
, -num_bits
);
1374 ocfs2_set_bit(bit_off
++, bitmap
);
1376 ocfs2_journal_dirty(handle
, group_bh
);
1383 /* find the one with the most empty bits */
1384 static inline u16
ocfs2_find_victim_chain(struct ocfs2_chain_list
*cl
)
1388 BUG_ON(!cl
->cl_next_free_rec
);
1391 while (curr
< le16_to_cpu(cl
->cl_next_free_rec
)) {
1392 if (le32_to_cpu(cl
->cl_recs
[curr
].c_free
) >
1393 le32_to_cpu(cl
->cl_recs
[best
].c_free
))
1398 BUG_ON(best
>= le16_to_cpu(cl
->cl_next_free_rec
));
1402 static int ocfs2_relink_block_group(handle_t
*handle
,
1403 struct inode
*alloc_inode
,
1404 struct buffer_head
*fe_bh
,
1405 struct buffer_head
*bg_bh
,
1406 struct buffer_head
*prev_bg_bh
,
1410 /* there is a really tiny chance the journal calls could fail,
1411 * but we wouldn't want inconsistent blocks in *any* case. */
1412 u64 fe_ptr
, bg_ptr
, prev_bg_ptr
;
1413 struct ocfs2_dinode
*fe
= (struct ocfs2_dinode
*) fe_bh
->b_data
;
1414 struct ocfs2_group_desc
*bg
= (struct ocfs2_group_desc
*) bg_bh
->b_data
;
1415 struct ocfs2_group_desc
*prev_bg
= (struct ocfs2_group_desc
*) prev_bg_bh
->b_data
;
1417 /* The caller got these descriptors from
1418 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1419 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg
));
1420 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg
));
1422 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1423 (unsigned long long)le64_to_cpu(fe
->i_blkno
), chain
,
1424 (unsigned long long)le64_to_cpu(bg
->bg_blkno
),
1425 (unsigned long long)le64_to_cpu(prev_bg
->bg_blkno
));
1427 fe_ptr
= le64_to_cpu(fe
->id2
.i_chain
.cl_recs
[chain
].c_blkno
);
1428 bg_ptr
= le64_to_cpu(bg
->bg_next_group
);
1429 prev_bg_ptr
= le64_to_cpu(prev_bg
->bg_next_group
);
1431 status
= ocfs2_journal_access_gd(handle
, INODE_CACHE(alloc_inode
),
1433 OCFS2_JOURNAL_ACCESS_WRITE
);
1439 prev_bg
->bg_next_group
= bg
->bg_next_group
;
1440 ocfs2_journal_dirty(handle
, prev_bg_bh
);
1442 status
= ocfs2_journal_access_gd(handle
, INODE_CACHE(alloc_inode
),
1443 bg_bh
, OCFS2_JOURNAL_ACCESS_WRITE
);
1449 bg
->bg_next_group
= fe
->id2
.i_chain
.cl_recs
[chain
].c_blkno
;
1450 ocfs2_journal_dirty(handle
, bg_bh
);
1452 status
= ocfs2_journal_access_di(handle
, INODE_CACHE(alloc_inode
),
1453 fe_bh
, OCFS2_JOURNAL_ACCESS_WRITE
);
1459 fe
->id2
.i_chain
.cl_recs
[chain
].c_blkno
= bg
->bg_blkno
;
1460 ocfs2_journal_dirty(handle
, fe_bh
);
1464 fe
->id2
.i_chain
.cl_recs
[chain
].c_blkno
= cpu_to_le64(fe_ptr
);
1465 bg
->bg_next_group
= cpu_to_le64(bg_ptr
);
1466 prev_bg
->bg_next_group
= cpu_to_le64(prev_bg_ptr
);
1473 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc
*bg
,
1476 return le16_to_cpu(bg
->bg_free_bits_count
) > wanted
;
1479 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1480 * value on error. */
1481 static int ocfs2_cluster_group_search(struct inode
*inode
,
1482 struct buffer_head
*group_bh
,
1483 u32 bits_wanted
, u32 min_bits
,
1485 struct ocfs2_suballoc_result
*res
)
1487 int search
= -ENOSPC
;
1490 struct ocfs2_group_desc
*gd
= (struct ocfs2_group_desc
*) group_bh
->b_data
;
1491 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
1492 unsigned int max_bits
, gd_cluster_off
;
1494 BUG_ON(!ocfs2_is_cluster_bitmap(inode
));
1496 if (gd
->bg_free_bits_count
) {
1497 max_bits
= le16_to_cpu(gd
->bg_bits
);
1499 /* Tail groups in cluster bitmaps which aren't cpg
1500 * aligned are prone to partial extention by a failed
1501 * fs resize. If the file system resize never got to
1502 * update the dinode cluster count, then we don't want
1503 * to trust any clusters past it, regardless of what
1504 * the group descriptor says. */
1505 gd_cluster_off
= ocfs2_blocks_to_clusters(inode
->i_sb
,
1506 le64_to_cpu(gd
->bg_blkno
));
1507 if ((gd_cluster_off
+ max_bits
) >
1508 OCFS2_I(inode
)->ip_clusters
) {
1509 max_bits
= OCFS2_I(inode
)->ip_clusters
- gd_cluster_off
;
1510 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1511 (unsigned long long)le64_to_cpu(gd
->bg_blkno
),
1512 le16_to_cpu(gd
->bg_bits
),
1513 OCFS2_I(inode
)->ip_clusters
, max_bits
);
1516 ret
= ocfs2_block_group_find_clear_bits(OCFS2_SB(inode
->i_sb
),
1517 group_bh
, bits_wanted
,
1523 blkoff
= ocfs2_clusters_to_blocks(inode
->i_sb
,
1525 res
->sr_bit_offset
+
1527 mlog(0, "Checking %llu against %llu\n",
1528 (unsigned long long)blkoff
,
1529 (unsigned long long)max_block
);
1530 if (blkoff
> max_block
)
1534 /* ocfs2_block_group_find_clear_bits() might
1535 * return success, but we still want to return
1536 * -ENOSPC unless it found the minimum number
1538 if (min_bits
<= res
->sr_bits
)
1539 search
= 0; /* success */
1540 else if (res
->sr_bits
) {
1542 * Don't show bits which we'll be returning
1543 * for allocation to the local alloc bitmap.
1545 ocfs2_local_alloc_seen_free_bits(osb
, res
->sr_bits
);
1552 static int ocfs2_block_group_search(struct inode
*inode
,
1553 struct buffer_head
*group_bh
,
1554 u32 bits_wanted
, u32 min_bits
,
1556 struct ocfs2_suballoc_result
*res
)
1560 struct ocfs2_group_desc
*bg
= (struct ocfs2_group_desc
*) group_bh
->b_data
;
1562 BUG_ON(min_bits
!= 1);
1563 BUG_ON(ocfs2_is_cluster_bitmap(inode
));
1565 if (bg
->bg_free_bits_count
) {
1566 ret
= ocfs2_block_group_find_clear_bits(OCFS2_SB(inode
->i_sb
),
1567 group_bh
, bits_wanted
,
1568 le16_to_cpu(bg
->bg_bits
),
1570 if (!ret
&& max_block
) {
1571 blkoff
= le64_to_cpu(bg
->bg_blkno
) +
1572 res
->sr_bit_offset
+ res
->sr_bits
;
1573 mlog(0, "Checking %llu against %llu\n",
1574 (unsigned long long)blkoff
,
1575 (unsigned long long)max_block
);
1576 if (blkoff
> max_block
)
1584 static int ocfs2_alloc_dinode_update_counts(struct inode
*inode
,
1586 struct buffer_head
*di_bh
,
1592 struct ocfs2_dinode
*di
= (struct ocfs2_dinode
*) di_bh
->b_data
;
1593 struct ocfs2_chain_list
*cl
= (struct ocfs2_chain_list
*) &di
->id2
.i_chain
;
1595 ret
= ocfs2_journal_access_di(handle
, INODE_CACHE(inode
), di_bh
,
1596 OCFS2_JOURNAL_ACCESS_WRITE
);
1602 tmp_used
= le32_to_cpu(di
->id1
.bitmap1
.i_used
);
1603 di
->id1
.bitmap1
.i_used
= cpu_to_le32(num_bits
+ tmp_used
);
1604 le32_add_cpu(&cl
->cl_recs
[chain
].c_free
, -num_bits
);
1605 ocfs2_journal_dirty(handle
, di_bh
);
1611 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result
*res
,
1612 struct ocfs2_extent_rec
*rec
,
1613 struct ocfs2_chain_list
*cl
)
1615 unsigned int bpc
= le16_to_cpu(cl
->cl_bpc
);
1616 unsigned int bitoff
= le32_to_cpu(rec
->e_cpos
) * bpc
;
1617 unsigned int bitcount
= le32_to_cpu(rec
->e_leaf_clusters
) * bpc
;
1619 if (res
->sr_bit_offset
< bitoff
)
1621 if (res
->sr_bit_offset
>= (bitoff
+ bitcount
))
1623 res
->sr_blkno
= le64_to_cpu(rec
->e_blkno
) +
1624 (res
->sr_bit_offset
- bitoff
);
1625 if ((res
->sr_bit_offset
+ res
->sr_bits
) > (bitoff
+ bitcount
))
1626 res
->sr_bits
= (bitoff
+ bitcount
) - res
->sr_bit_offset
;
1630 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context
*ac
,
1631 struct ocfs2_group_desc
*bg
,
1632 struct ocfs2_suballoc_result
*res
)
1635 u64 bg_blkno
= res
->sr_bg_blkno
; /* Save off */
1636 struct ocfs2_extent_rec
*rec
;
1637 struct ocfs2_dinode
*di
= (struct ocfs2_dinode
*)ac
->ac_bh
->b_data
;
1638 struct ocfs2_chain_list
*cl
= &di
->id2
.i_chain
;
1640 if (ocfs2_is_cluster_bitmap(ac
->ac_inode
)) {
1645 res
->sr_blkno
= res
->sr_bg_blkno
+ res
->sr_bit_offset
;
1646 res
->sr_bg_blkno
= 0; /* Clear it for contig block groups */
1647 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac
->ac_inode
->i_sb
)) ||
1648 !bg
->bg_list
.l_next_free_rec
)
1651 for (i
= 0; i
< le16_to_cpu(bg
->bg_list
.l_next_free_rec
); i
++) {
1652 rec
= &bg
->bg_list
.l_recs
[i
];
1653 if (ocfs2_bg_discontig_fix_by_rec(res
, rec
, cl
)) {
1654 res
->sr_bg_blkno
= bg_blkno
; /* Restore */
1660 static int ocfs2_search_one_group(struct ocfs2_alloc_context
*ac
,
1664 struct ocfs2_suballoc_result
*res
,
1668 struct buffer_head
*group_bh
= NULL
;
1669 struct ocfs2_group_desc
*gd
;
1670 struct ocfs2_dinode
*di
= (struct ocfs2_dinode
*)ac
->ac_bh
->b_data
;
1671 struct inode
*alloc_inode
= ac
->ac_inode
;
1673 ret
= ocfs2_read_group_descriptor(alloc_inode
, di
,
1674 res
->sr_bg_blkno
, &group_bh
);
1680 gd
= (struct ocfs2_group_desc
*) group_bh
->b_data
;
1681 ret
= ac
->ac_group_search(alloc_inode
, group_bh
, bits_wanted
, min_bits
,
1682 ac
->ac_max_block
, res
);
1690 ocfs2_bg_discontig_fix_result(ac
, gd
, res
);
1692 ret
= ocfs2_alloc_dinode_update_counts(alloc_inode
, handle
, ac
->ac_bh
,
1694 le16_to_cpu(gd
->bg_chain
));
1700 ret
= ocfs2_block_group_set_bits(handle
, alloc_inode
, gd
, group_bh
,
1701 res
->sr_bit_offset
, res
->sr_bits
);
1705 *bits_left
= le16_to_cpu(gd
->bg_free_bits_count
);
1713 static int ocfs2_search_chain(struct ocfs2_alloc_context
*ac
,
1717 struct ocfs2_suballoc_result
*res
,
1724 struct inode
*alloc_inode
= ac
->ac_inode
;
1725 struct buffer_head
*group_bh
= NULL
;
1726 struct buffer_head
*prev_group_bh
= NULL
;
1727 struct ocfs2_dinode
*fe
= (struct ocfs2_dinode
*) ac
->ac_bh
->b_data
;
1728 struct ocfs2_chain_list
*cl
= (struct ocfs2_chain_list
*) &fe
->id2
.i_chain
;
1729 struct ocfs2_group_desc
*bg
;
1731 chain
= ac
->ac_chain
;
1732 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1734 (unsigned long long)OCFS2_I(alloc_inode
)->ip_blkno
);
1736 status
= ocfs2_read_group_descriptor(alloc_inode
, fe
,
1737 le64_to_cpu(cl
->cl_recs
[chain
].c_blkno
),
1743 bg
= (struct ocfs2_group_desc
*) group_bh
->b_data
;
1746 /* for now, the chain search is a bit simplistic. We just use
1747 * the 1st group with any empty bits. */
1748 while ((status
= ac
->ac_group_search(alloc_inode
, group_bh
,
1749 bits_wanted
, min_bits
,
1752 if (!bg
->bg_next_group
)
1755 brelse(prev_group_bh
);
1756 prev_group_bh
= NULL
;
1758 next_group
= le64_to_cpu(bg
->bg_next_group
);
1759 prev_group_bh
= group_bh
;
1761 status
= ocfs2_read_group_descriptor(alloc_inode
, fe
,
1762 next_group
, &group_bh
);
1767 bg
= (struct ocfs2_group_desc
*) group_bh
->b_data
;
1770 if (status
!= -ENOSPC
)
1775 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1776 res
->sr_bits
, (unsigned long long)le64_to_cpu(bg
->bg_blkno
));
1778 res
->sr_bg_blkno
= le64_to_cpu(bg
->bg_blkno
);
1780 BUG_ON(res
->sr_bits
== 0);
1782 ocfs2_bg_discontig_fix_result(ac
, bg
, res
);
1786 * Keep track of previous block descriptor read. When
1787 * we find a target, if we have read more than X
1788 * number of descriptors, and the target is reasonably
1789 * empty, relink him to top of his chain.
1791 * We've read 0 extra blocks and only send one more to
1792 * the transaction, yet the next guy to search has a
1795 * Do this *after* figuring out how many bits we're taking out
1796 * of our target group.
1798 if (ac
->ac_allow_chain_relink
&&
1800 (ocfs2_block_group_reasonably_empty(bg
, res
->sr_bits
))) {
1801 status
= ocfs2_relink_block_group(handle
, alloc_inode
,
1802 ac
->ac_bh
, group_bh
,
1803 prev_group_bh
, chain
);
1810 /* Ok, claim our bits now: set the info on dinode, chainlist
1811 * and then the group */
1812 status
= ocfs2_journal_access_di(handle
,
1813 INODE_CACHE(alloc_inode
),
1815 OCFS2_JOURNAL_ACCESS_WRITE
);
1821 tmp_used
= le32_to_cpu(fe
->id1
.bitmap1
.i_used
);
1822 fe
->id1
.bitmap1
.i_used
= cpu_to_le32(res
->sr_bits
+ tmp_used
);
1823 le32_add_cpu(&cl
->cl_recs
[chain
].c_free
, -res
->sr_bits
);
1824 ocfs2_journal_dirty(handle
, ac
->ac_bh
);
1826 status
= ocfs2_block_group_set_bits(handle
,
1837 mlog(0, "Allocated %u bits from suballocator %llu\n", res
->sr_bits
,
1838 (unsigned long long)le64_to_cpu(fe
->i_blkno
));
1840 *bits_left
= le16_to_cpu(bg
->bg_free_bits_count
);
1843 brelse(prev_group_bh
);
1849 /* will give out up to bits_wanted contiguous bits. */
1850 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context
*ac
,
1854 struct ocfs2_suballoc_result
*res
)
1859 u64 hint
= ac
->ac_last_group
;
1860 struct ocfs2_chain_list
*cl
;
1861 struct ocfs2_dinode
*fe
;
1865 BUG_ON(ac
->ac_bits_given
>= ac
->ac_bits_wanted
);
1866 BUG_ON(bits_wanted
> (ac
->ac_bits_wanted
- ac
->ac_bits_given
));
1869 fe
= (struct ocfs2_dinode
*) ac
->ac_bh
->b_data
;
1871 /* The bh was validated by the inode read during
1872 * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
1873 BUG_ON(!OCFS2_IS_VALID_DINODE(fe
));
1875 if (le32_to_cpu(fe
->id1
.bitmap1
.i_used
) >=
1876 le32_to_cpu(fe
->id1
.bitmap1
.i_total
)) {
1877 ocfs2_error(ac
->ac_inode
->i_sb
,
1878 "Chain allocator dinode %llu has %u used "
1879 "bits but only %u total.",
1880 (unsigned long long)le64_to_cpu(fe
->i_blkno
),
1881 le32_to_cpu(fe
->id1
.bitmap1
.i_used
),
1882 le32_to_cpu(fe
->id1
.bitmap1
.i_total
));
1887 res
->sr_bg_blkno
= hint
;
1888 if (res
->sr_bg_blkno
) {
1889 /* Attempt to short-circuit the usual search mechanism
1890 * by jumping straight to the most recently used
1891 * allocation group. This helps us mantain some
1892 * contiguousness across allocations. */
1893 status
= ocfs2_search_one_group(ac
, handle
, bits_wanted
,
1894 min_bits
, res
, &bits_left
);
1897 if (status
< 0 && status
!= -ENOSPC
) {
1903 cl
= (struct ocfs2_chain_list
*) &fe
->id2
.i_chain
;
1905 victim
= ocfs2_find_victim_chain(cl
);
1906 ac
->ac_chain
= victim
;
1907 ac
->ac_allow_chain_relink
= 1;
1909 status
= ocfs2_search_chain(ac
, handle
, bits_wanted
, min_bits
,
1912 hint
= ocfs2_group_from_res(res
);
1915 if (status
< 0 && status
!= -ENOSPC
) {
1920 mlog(0, "Search of victim chain %u came up with nothing, "
1921 "trying all chains now.\n", victim
);
1923 /* If we didn't pick a good victim, then just default to
1924 * searching each chain in order. Don't allow chain relinking
1925 * because we only calculate enough journal credits for one
1926 * relink per alloc. */
1927 ac
->ac_allow_chain_relink
= 0;
1928 for (i
= 0; i
< le16_to_cpu(cl
->cl_next_free_rec
); i
++) {
1931 if (!cl
->cl_recs
[i
].c_free
)
1935 status
= ocfs2_search_chain(ac
, handle
, bits_wanted
, min_bits
,
1938 hint
= ocfs2_group_from_res(res
);
1941 if (status
< 0 && status
!= -ENOSPC
) {
1948 if (status
!= -ENOSPC
) {
1949 /* If the next search of this group is not likely to
1950 * yield a suitable extent, then we reset the last
1951 * group hint so as to not waste a disk read */
1952 if (bits_left
< min_bits
)
1953 ac
->ac_last_group
= 0;
1955 ac
->ac_last_group
= hint
;
1963 int ocfs2_claim_metadata(handle_t
*handle
,
1964 struct ocfs2_alloc_context
*ac
,
1967 u16
*suballoc_bit_start
,
1968 unsigned int *num_bits
,
1972 struct ocfs2_suballoc_result res
= { .sr_blkno
= 0, };
1975 BUG_ON(ac
->ac_bits_wanted
< (ac
->ac_bits_given
+ bits_wanted
));
1976 BUG_ON(ac
->ac_which
!= OCFS2_AC_USE_META
);
1978 status
= ocfs2_claim_suballoc_bits(ac
,
1987 atomic_inc(&OCFS2_SB(ac
->ac_inode
->i_sb
)->alloc_stats
.bg_allocs
);
1989 *suballoc_loc
= res
.sr_bg_blkno
;
1990 *suballoc_bit_start
= res
.sr_bit_offset
;
1991 *blkno_start
= res
.sr_blkno
;
1992 ac
->ac_bits_given
+= res
.sr_bits
;
1993 *num_bits
= res
.sr_bits
;
2000 static void ocfs2_init_inode_ac_group(struct inode
*dir
,
2001 struct buffer_head
*parent_di_bh
,
2002 struct ocfs2_alloc_context
*ac
)
2004 struct ocfs2_dinode
*di
= (struct ocfs2_dinode
*)parent_di_bh
->b_data
;
2006 * Try to allocate inodes from some specific group.
2008 * If the parent dir has recorded the last group used in allocation,
2009 * cool, use it. Otherwise if we try to allocate new inode from the
2010 * same slot the parent dir belongs to, use the same chunk.
2012 * We are very careful here to avoid the mistake of setting
2013 * ac_last_group to a group descriptor from a different (unlocked) slot.
2015 if (OCFS2_I(dir
)->ip_last_used_group
&&
2016 OCFS2_I(dir
)->ip_last_used_slot
== ac
->ac_alloc_slot
)
2017 ac
->ac_last_group
= OCFS2_I(dir
)->ip_last_used_group
;
2018 else if (le16_to_cpu(di
->i_suballoc_slot
) == ac
->ac_alloc_slot
) {
2019 if (di
->i_suballoc_loc
)
2020 ac
->ac_last_group
= le64_to_cpu(di
->i_suballoc_loc
);
2022 ac
->ac_last_group
= ocfs2_which_suballoc_group(
2023 le64_to_cpu(di
->i_blkno
),
2024 le16_to_cpu(di
->i_suballoc_bit
));
2028 static inline void ocfs2_save_inode_ac_group(struct inode
*dir
,
2029 struct ocfs2_alloc_context
*ac
)
2031 OCFS2_I(dir
)->ip_last_used_group
= ac
->ac_last_group
;
2032 OCFS2_I(dir
)->ip_last_used_slot
= ac
->ac_alloc_slot
;
2035 int ocfs2_claim_new_inode(handle_t
*handle
,
2037 struct buffer_head
*parent_fe_bh
,
2038 struct ocfs2_alloc_context
*ac
,
2044 struct ocfs2_suballoc_result res
;
2049 BUG_ON(ac
->ac_bits_given
!= 0);
2050 BUG_ON(ac
->ac_bits_wanted
!= 1);
2051 BUG_ON(ac
->ac_which
!= OCFS2_AC_USE_INODE
);
2053 ocfs2_init_inode_ac_group(dir
, parent_fe_bh
, ac
);
2055 status
= ocfs2_claim_suballoc_bits(ac
,
2064 atomic_inc(&OCFS2_SB(ac
->ac_inode
->i_sb
)->alloc_stats
.bg_allocs
);
2066 BUG_ON(res
.sr_bits
!= 1);
2068 *suballoc_loc
= res
.sr_bg_blkno
;
2069 *suballoc_bit
= res
.sr_bit_offset
;
2070 *fe_blkno
= res
.sr_blkno
;
2071 ac
->ac_bits_given
++;
2072 ocfs2_save_inode_ac_group(dir
, ac
);
2079 /* translate a group desc. blkno and it's bitmap offset into
2080 * disk cluster offset. */
2081 static inline u32
ocfs2_desc_bitmap_to_cluster_off(struct inode
*inode
,
2085 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
2088 BUG_ON(!ocfs2_is_cluster_bitmap(inode
));
2090 if (bg_blkno
!= osb
->first_cluster_group_blkno
)
2091 cluster
= ocfs2_blocks_to_clusters(inode
->i_sb
, bg_blkno
);
2092 cluster
+= (u32
) bg_bit_off
;
2096 /* given a cluster offset, calculate which block group it belongs to
2097 * and return that block offset. */
2098 u64
ocfs2_which_cluster_group(struct inode
*inode
, u32 cluster
)
2100 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
2103 BUG_ON(!ocfs2_is_cluster_bitmap(inode
));
2105 group_no
= cluster
/ osb
->bitmap_cpg
;
2107 return osb
->first_cluster_group_blkno
;
2108 return ocfs2_clusters_to_blocks(inode
->i_sb
,
2109 group_no
* osb
->bitmap_cpg
);
2112 /* given the block number of a cluster start, calculate which cluster
2113 * group and descriptor bitmap offset that corresponds to. */
2114 static inline void ocfs2_block_to_cluster_group(struct inode
*inode
,
2119 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
2120 u32 data_cluster
= ocfs2_blocks_to_clusters(osb
->sb
, data_blkno
);
2122 BUG_ON(!ocfs2_is_cluster_bitmap(inode
));
2124 *bg_blkno
= ocfs2_which_cluster_group(inode
,
2127 if (*bg_blkno
== osb
->first_cluster_group_blkno
)
2128 *bg_bit_off
= (u16
) data_cluster
;
2130 *bg_bit_off
= (u16
) ocfs2_blocks_to_clusters(osb
->sb
,
2131 data_blkno
- *bg_blkno
);
2135 * min_bits - minimum contiguous chunk from this total allocation we
2136 * can handle. set to what we asked for originally for a full
2137 * contig. allocation, set to '1' to indicate we can deal with extents
2140 int __ocfs2_claim_clusters(handle_t
*handle
,
2141 struct ocfs2_alloc_context
*ac
,
2148 unsigned int bits_wanted
= max_clusters
;
2149 struct ocfs2_suballoc_result res
= { .sr_blkno
= 0, };
2150 struct ocfs2_super
*osb
= OCFS2_SB(ac
->ac_inode
->i_sb
);
2154 BUG_ON(ac
->ac_bits_given
>= ac
->ac_bits_wanted
);
2156 BUG_ON(ac
->ac_which
!= OCFS2_AC_USE_LOCAL
2157 && ac
->ac_which
!= OCFS2_AC_USE_MAIN
);
2159 if (ac
->ac_which
== OCFS2_AC_USE_LOCAL
) {
2160 WARN_ON(min_clusters
> 1);
2162 status
= ocfs2_claim_local_alloc_bits(osb
,
2169 atomic_inc(&osb
->alloc_stats
.local_data
);
2171 if (min_clusters
> (osb
->bitmap_cpg
- 1)) {
2172 /* The only paths asking for contiguousness
2173 * should know about this already. */
2174 mlog(ML_ERROR
, "minimum allocation requested %u exceeds "
2175 "group bitmap size %u!\n", min_clusters
,
2180 /* clamp the current request down to a realistic size. */
2181 if (bits_wanted
> (osb
->bitmap_cpg
- 1))
2182 bits_wanted
= osb
->bitmap_cpg
- 1;
2184 status
= ocfs2_claim_suballoc_bits(ac
,
2190 BUG_ON(res
.sr_blkno
); /* cluster alloc can't set */
2192 ocfs2_desc_bitmap_to_cluster_off(ac
->ac_inode
,
2195 atomic_inc(&osb
->alloc_stats
.bitmap_data
);
2196 *num_clusters
= res
.sr_bits
;
2200 if (status
!= -ENOSPC
)
2205 ac
->ac_bits_given
+= *num_clusters
;
2212 int ocfs2_claim_clusters(handle_t
*handle
,
2213 struct ocfs2_alloc_context
*ac
,
2218 unsigned int bits_wanted
= ac
->ac_bits_wanted
- ac
->ac_bits_given
;
2220 return __ocfs2_claim_clusters(handle
, ac
, min_clusters
,
2221 bits_wanted
, cluster_start
, num_clusters
);
2224 static int ocfs2_block_group_clear_bits(handle_t
*handle
,
2225 struct inode
*alloc_inode
,
2226 struct ocfs2_group_desc
*bg
,
2227 struct buffer_head
*group_bh
,
2228 unsigned int bit_off
,
2229 unsigned int num_bits
,
2230 void (*undo_fn
)(unsigned int bit
,
2231 unsigned long *bmap
))
2235 struct ocfs2_group_desc
*undo_bg
= NULL
;
2239 /* The caller got this descriptor from
2240 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
2241 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg
));
2243 mlog(0, "off = %u, num = %u\n", bit_off
, num_bits
);
2245 BUG_ON(undo_fn
&& !ocfs2_is_cluster_bitmap(alloc_inode
));
2246 status
= ocfs2_journal_access_gd(handle
, INODE_CACHE(alloc_inode
),
2249 OCFS2_JOURNAL_ACCESS_UNDO
:
2250 OCFS2_JOURNAL_ACCESS_WRITE
);
2257 jbd_lock_bh_state(group_bh
);
2258 undo_bg
= (struct ocfs2_group_desc
*)
2259 bh2jh(group_bh
)->b_committed_data
;
2265 ocfs2_clear_bit((bit_off
+ tmp
),
2266 (unsigned long *) bg
->bg_bitmap
);
2268 undo_fn(bit_off
+ tmp
,
2269 (unsigned long *) undo_bg
->bg_bitmap
);
2271 le16_add_cpu(&bg
->bg_free_bits_count
, num_bits
);
2274 jbd_unlock_bh_state(group_bh
);
2276 ocfs2_journal_dirty(handle
, group_bh
);
2282 * expects the suballoc inode to already be locked.
2284 static int _ocfs2_free_suballoc_bits(handle_t
*handle
,
2285 struct inode
*alloc_inode
,
2286 struct buffer_head
*alloc_bh
,
2287 unsigned int start_bit
,
2290 void (*undo_fn
)(unsigned int bit
,
2291 unsigned long *bitmap
))
2295 struct ocfs2_dinode
*fe
= (struct ocfs2_dinode
*) alloc_bh
->b_data
;
2296 struct ocfs2_chain_list
*cl
= &fe
->id2
.i_chain
;
2297 struct buffer_head
*group_bh
= NULL
;
2298 struct ocfs2_group_desc
*group
;
2302 /* The alloc_bh comes from ocfs2_free_dinode() or
2303 * ocfs2_free_clusters(). The callers have all locked the
2304 * allocator and gotten alloc_bh from the lock call. This
2305 * validates the dinode buffer. Any corruption that has happended
2307 BUG_ON(!OCFS2_IS_VALID_DINODE(fe
));
2308 BUG_ON((count
+ start_bit
) > ocfs2_bits_per_group(cl
));
2310 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2311 (unsigned long long)OCFS2_I(alloc_inode
)->ip_blkno
, count
,
2312 (unsigned long long)bg_blkno
, start_bit
);
2314 status
= ocfs2_read_group_descriptor(alloc_inode
, fe
, bg_blkno
,
2320 group
= (struct ocfs2_group_desc
*) group_bh
->b_data
;
2322 BUG_ON((count
+ start_bit
) > le16_to_cpu(group
->bg_bits
));
2324 status
= ocfs2_block_group_clear_bits(handle
, alloc_inode
,
2326 start_bit
, count
, undo_fn
);
2332 status
= ocfs2_journal_access_di(handle
, INODE_CACHE(alloc_inode
),
2333 alloc_bh
, OCFS2_JOURNAL_ACCESS_WRITE
);
2339 le32_add_cpu(&cl
->cl_recs
[le16_to_cpu(group
->bg_chain
)].c_free
,
2341 tmp_used
= le32_to_cpu(fe
->id1
.bitmap1
.i_used
);
2342 fe
->id1
.bitmap1
.i_used
= cpu_to_le32(tmp_used
- count
);
2343 ocfs2_journal_dirty(handle
, alloc_bh
);
2352 int ocfs2_free_suballoc_bits(handle_t
*handle
,
2353 struct inode
*alloc_inode
,
2354 struct buffer_head
*alloc_bh
,
2355 unsigned int start_bit
,
2359 return _ocfs2_free_suballoc_bits(handle
, alloc_inode
, alloc_bh
,
2360 start_bit
, bg_blkno
, count
, NULL
);
2363 int ocfs2_free_dinode(handle_t
*handle
,
2364 struct inode
*inode_alloc_inode
,
2365 struct buffer_head
*inode_alloc_bh
,
2366 struct ocfs2_dinode
*di
)
2368 u64 blk
= le64_to_cpu(di
->i_blkno
);
2369 u16 bit
= le16_to_cpu(di
->i_suballoc_bit
);
2370 u64 bg_blkno
= ocfs2_which_suballoc_group(blk
, bit
);
2372 if (di
->i_suballoc_loc
)
2373 bg_blkno
= le64_to_cpu(di
->i_suballoc_loc
);
2374 return ocfs2_free_suballoc_bits(handle
, inode_alloc_inode
,
2375 inode_alloc_bh
, bit
, bg_blkno
, 1);
2378 static int _ocfs2_free_clusters(handle_t
*handle
,
2379 struct inode
*bitmap_inode
,
2380 struct buffer_head
*bitmap_bh
,
2382 unsigned int num_clusters
,
2383 void (*undo_fn
)(unsigned int bit
,
2384 unsigned long *bitmap
))
2389 struct ocfs2_dinode
*fe
;
2391 /* You can't ever have a contiguous set of clusters
2392 * bigger than a block group bitmap so we never have to worry
2393 * about looping on them. */
2397 /* This is expensive. We can safely remove once this stuff has
2398 * gotten tested really well. */
2399 BUG_ON(start_blk
!= ocfs2_clusters_to_blocks(bitmap_inode
->i_sb
, ocfs2_blocks_to_clusters(bitmap_inode
->i_sb
, start_blk
)));
2401 fe
= (struct ocfs2_dinode
*) bitmap_bh
->b_data
;
2403 ocfs2_block_to_cluster_group(bitmap_inode
, start_blk
, &bg_blkno
,
2406 mlog(0, "want to free %u clusters starting at block %llu\n",
2407 num_clusters
, (unsigned long long)start_blk
);
2408 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2409 (unsigned long long)bg_blkno
, bg_start_bit
);
2411 status
= _ocfs2_free_suballoc_bits(handle
, bitmap_inode
, bitmap_bh
,
2412 bg_start_bit
, bg_blkno
,
2413 num_clusters
, undo_fn
);
2419 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode
->i_sb
),
2427 int ocfs2_free_clusters(handle_t
*handle
,
2428 struct inode
*bitmap_inode
,
2429 struct buffer_head
*bitmap_bh
,
2431 unsigned int num_clusters
)
2433 return _ocfs2_free_clusters(handle
, bitmap_inode
, bitmap_bh
,
2434 start_blk
, num_clusters
,
2439 * Give never-used clusters back to the global bitmap. We don't need
2440 * to protect these bits in the undo buffer.
2442 int ocfs2_release_clusters(handle_t
*handle
,
2443 struct inode
*bitmap_inode
,
2444 struct buffer_head
*bitmap_bh
,
2446 unsigned int num_clusters
)
2448 return _ocfs2_free_clusters(handle
, bitmap_inode
, bitmap_bh
,
2449 start_blk
, num_clusters
,
2453 static inline void ocfs2_debug_bg(struct ocfs2_group_desc
*bg
)
2455 printk("Block Group:\n");
2456 printk("bg_signature: %s\n", bg
->bg_signature
);
2457 printk("bg_size: %u\n", bg
->bg_size
);
2458 printk("bg_bits: %u\n", bg
->bg_bits
);
2459 printk("bg_free_bits_count: %u\n", bg
->bg_free_bits_count
);
2460 printk("bg_chain: %u\n", bg
->bg_chain
);
2461 printk("bg_generation: %u\n", le32_to_cpu(bg
->bg_generation
));
2462 printk("bg_next_group: %llu\n",
2463 (unsigned long long)bg
->bg_next_group
);
2464 printk("bg_parent_dinode: %llu\n",
2465 (unsigned long long)bg
->bg_parent_dinode
);
2466 printk("bg_blkno: %llu\n",
2467 (unsigned long long)bg
->bg_blkno
);
2470 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode
*fe
)
2474 printk("Suballoc Inode %llu:\n", (unsigned long long)fe
->i_blkno
);
2475 printk("i_signature: %s\n", fe
->i_signature
);
2476 printk("i_size: %llu\n",
2477 (unsigned long long)fe
->i_size
);
2478 printk("i_clusters: %u\n", fe
->i_clusters
);
2479 printk("i_generation: %u\n",
2480 le32_to_cpu(fe
->i_generation
));
2481 printk("id1.bitmap1.i_used: %u\n",
2482 le32_to_cpu(fe
->id1
.bitmap1
.i_used
));
2483 printk("id1.bitmap1.i_total: %u\n",
2484 le32_to_cpu(fe
->id1
.bitmap1
.i_total
));
2485 printk("id2.i_chain.cl_cpg: %u\n", fe
->id2
.i_chain
.cl_cpg
);
2486 printk("id2.i_chain.cl_bpc: %u\n", fe
->id2
.i_chain
.cl_bpc
);
2487 printk("id2.i_chain.cl_count: %u\n", fe
->id2
.i_chain
.cl_count
);
2488 printk("id2.i_chain.cl_next_free_rec: %u\n",
2489 fe
->id2
.i_chain
.cl_next_free_rec
);
2490 for(i
= 0; i
< fe
->id2
.i_chain
.cl_next_free_rec
; i
++) {
2491 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i
,
2492 fe
->id2
.i_chain
.cl_recs
[i
].c_free
);
2493 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i
,
2494 fe
->id2
.i_chain
.cl_recs
[i
].c_total
);
2495 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i
,
2496 (unsigned long long)fe
->id2
.i_chain
.cl_recs
[i
].c_blkno
);
2501 * For a given allocation, determine which allocators will need to be
2502 * accessed, and lock them, reserving the appropriate number of bits.
2504 * Sparse file systems call this from ocfs2_write_begin_nolock()
2505 * and ocfs2_allocate_unwritten_extents().
2507 * File systems which don't support holes call this from
2508 * ocfs2_extend_allocation().
2510 int ocfs2_lock_allocators(struct inode
*inode
,
2511 struct ocfs2_extent_tree
*et
,
2512 u32 clusters_to_add
, u32 extents_to_split
,
2513 struct ocfs2_alloc_context
**data_ac
,
2514 struct ocfs2_alloc_context
**meta_ac
)
2516 int ret
= 0, num_free_extents
;
2517 unsigned int max_recs_needed
= clusters_to_add
+ 2 * extents_to_split
;
2518 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
2524 BUG_ON(clusters_to_add
!= 0 && data_ac
== NULL
);
2526 num_free_extents
= ocfs2_num_free_extents(osb
, et
);
2527 if (num_free_extents
< 0) {
2528 ret
= num_free_extents
;
2534 * Sparse allocation file systems need to be more conservative
2535 * with reserving room for expansion - the actual allocation
2536 * happens while we've got a journal handle open so re-taking
2537 * a cluster lock (because we ran out of room for another
2538 * extent) will violate ordering rules.
2540 * Most of the time we'll only be seeing this 1 cluster at a time
2543 * Always lock for any unwritten extents - we might want to
2544 * add blocks during a split.
2546 if (!num_free_extents
||
2547 (ocfs2_sparse_alloc(osb
) && num_free_extents
< max_recs_needed
)) {
2548 ret
= ocfs2_reserve_new_metadata(osb
, et
->et_root_el
, meta_ac
);
2556 if (clusters_to_add
== 0)
2559 ret
= ocfs2_reserve_clusters(osb
, clusters_to_add
, data_ac
);
2569 ocfs2_free_alloc_context(*meta_ac
);
2574 * We cannot have an error and a non null *data_ac.
2582 * Read the inode specified by blkno to get suballoc_slot and
2585 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super
*osb
, u64 blkno
,
2586 u16
*suballoc_slot
, u64
*group_blkno
,
2590 struct buffer_head
*inode_bh
= NULL
;
2591 struct ocfs2_dinode
*inode_fe
;
2593 mlog_entry("blkno: %llu\n", (unsigned long long)blkno
);
2595 /* dirty read disk */
2596 status
= ocfs2_read_blocks_sync(osb
, blkno
, 1, &inode_bh
);
2598 mlog(ML_ERROR
, "read block %llu failed %d\n",
2599 (unsigned long long)blkno
, status
);
2603 inode_fe
= (struct ocfs2_dinode
*) inode_bh
->b_data
;
2604 if (!OCFS2_IS_VALID_DINODE(inode_fe
)) {
2605 mlog(ML_ERROR
, "invalid inode %llu requested\n",
2606 (unsigned long long)blkno
);
2611 if (le16_to_cpu(inode_fe
->i_suballoc_slot
) != (u16
)OCFS2_INVALID_SLOT
&&
2612 (u32
)le16_to_cpu(inode_fe
->i_suballoc_slot
) > osb
->max_slots
- 1) {
2613 mlog(ML_ERROR
, "inode %llu has invalid suballoc slot %u\n",
2614 (unsigned long long)blkno
,
2615 (u32
)le16_to_cpu(inode_fe
->i_suballoc_slot
));
2621 *suballoc_slot
= le16_to_cpu(inode_fe
->i_suballoc_slot
);
2623 *suballoc_bit
= le16_to_cpu(inode_fe
->i_suballoc_bit
);
2625 *group_blkno
= le64_to_cpu(inode_fe
->i_suballoc_loc
);
2635 * test whether bit is SET in allocator bitmap or not. on success, 0
2636 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno
2637 * is returned and *res is meaningless. Call this after you have
2638 * cluster locked against suballoc, or you may get a result based on
2639 * non-up2date contents
2641 static int ocfs2_test_suballoc_bit(struct ocfs2_super
*osb
,
2642 struct inode
*suballoc
,
2643 struct buffer_head
*alloc_bh
,
2644 u64 group_blkno
, u64 blkno
,
2647 struct ocfs2_dinode
*alloc_di
;
2648 struct ocfs2_group_desc
*group
;
2649 struct buffer_head
*group_bh
= NULL
;
2653 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno
,
2656 alloc_di
= (struct ocfs2_dinode
*)alloc_bh
->b_data
;
2657 if ((bit
+ 1) > ocfs2_bits_per_group(&alloc_di
->id2
.i_chain
)) {
2658 mlog(ML_ERROR
, "suballoc bit %u out of range of %u\n",
2660 ocfs2_bits_per_group(&alloc_di
->id2
.i_chain
));
2665 bg_blkno
= group_blkno
? group_blkno
:
2666 ocfs2_which_suballoc_group(blkno
, bit
);
2667 status
= ocfs2_read_group_descriptor(suballoc
, alloc_di
, bg_blkno
,
2670 mlog(ML_ERROR
, "read group %llu failed %d\n",
2671 (unsigned long long)bg_blkno
, status
);
2675 group
= (struct ocfs2_group_desc
*) group_bh
->b_data
;
2676 *res
= ocfs2_test_bit(bit
, (unsigned long *)group
->bg_bitmap
);
2686 * Test if the bit representing this inode (blkno) is set in the
2689 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2691 * In the event of failure, a negative value is returned and *res is
2694 * Callers must make sure to hold nfs_sync_lock to prevent
2695 * ocfs2_delete_inode() on another node from accessing the same
2696 * suballocator concurrently.
2698 int ocfs2_test_inode_bit(struct ocfs2_super
*osb
, u64 blkno
, int *res
)
2701 u64 group_blkno
= 0;
2702 u16 suballoc_bit
= 0, suballoc_slot
= 0;
2703 struct inode
*inode_alloc_inode
;
2704 struct buffer_head
*alloc_bh
= NULL
;
2706 mlog_entry("blkno: %llu", (unsigned long long)blkno
);
2708 status
= ocfs2_get_suballoc_slot_bit(osb
, blkno
, &suballoc_slot
,
2709 &group_blkno
, &suballoc_bit
);
2711 mlog(ML_ERROR
, "get alloc slot and bit failed %d\n", status
);
2716 ocfs2_get_system_file_inode(osb
, INODE_ALLOC_SYSTEM_INODE
,
2718 if (!inode_alloc_inode
) {
2719 /* the error code could be inaccurate, but we are not able to
2720 * get the correct one. */
2722 mlog(ML_ERROR
, "unable to get alloc inode in slot %u\n",
2723 (u32
)suballoc_slot
);
2727 mutex_lock(&inode_alloc_inode
->i_mutex
);
2728 status
= ocfs2_inode_lock(inode_alloc_inode
, &alloc_bh
, 0);
2730 mutex_unlock(&inode_alloc_inode
->i_mutex
);
2731 mlog(ML_ERROR
, "lock on alloc inode on slot %u failed %d\n",
2732 (u32
)suballoc_slot
, status
);
2736 status
= ocfs2_test_suballoc_bit(osb
, inode_alloc_inode
, alloc_bh
,
2737 group_blkno
, blkno
, suballoc_bit
, res
);
2739 mlog(ML_ERROR
, "test suballoc bit failed %d\n", status
);
2741 ocfs2_inode_unlock(inode_alloc_inode
, 0);
2742 mutex_unlock(&inode_alloc_inode
->i_mutex
);
2744 iput(inode_alloc_inode
);