NFSv2/v3: Fix a memory leak when using -onolock
[linux-2.6.22.y-op.git] / fs / ocfs2 / suballoc.c
blobe3437626d183a63c54e46ff6226598f0aa373f67
1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
4 * suballoc.c
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
35 #include "ocfs2.h"
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
47 #include "buffer_head_io.h"
49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52 static int ocfs2_block_group_fill(handle_t *handle,
53 struct inode *alloc_inode,
54 struct buffer_head *bg_bh,
55 u64 group_blkno,
56 u16 my_chain,
57 struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 struct inode *alloc_inode,
60 struct buffer_head *bh);
62 static int ocfs2_cluster_group_search(struct inode *inode,
63 struct buffer_head *group_bh,
64 u32 bits_wanted, u32 min_bits,
65 u16 *bit_off, u16 *bits_found);
66 static int ocfs2_block_group_search(struct inode *inode,
67 struct buffer_head *group_bh,
68 u32 bits_wanted, u32 min_bits,
69 u16 *bit_off, u16 *bits_found);
70 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
71 struct ocfs2_alloc_context *ac,
72 handle_t *handle,
73 u32 bits_wanted,
74 u32 min_bits,
75 u16 *bit_off,
76 unsigned int *num_bits,
77 u64 *bg_blkno);
78 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
79 int nr);
80 static inline int ocfs2_block_group_set_bits(handle_t *handle,
81 struct inode *alloc_inode,
82 struct ocfs2_group_desc *bg,
83 struct buffer_head *group_bh,
84 unsigned int bit_off,
85 unsigned int num_bits);
86 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
87 struct inode *alloc_inode,
88 struct ocfs2_group_desc *bg,
89 struct buffer_head *group_bh,
90 unsigned int bit_off,
91 unsigned int num_bits);
93 static int ocfs2_relink_block_group(handle_t *handle,
94 struct inode *alloc_inode,
95 struct buffer_head *fe_bh,
96 struct buffer_head *bg_bh,
97 struct buffer_head *prev_bg_bh,
98 u16 chain);
99 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 u32 wanted);
101 static int ocfs2_free_suballoc_bits(handle_t *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *alloc_bh,
104 unsigned int start_bit,
105 u64 bg_blkno,
106 unsigned int count);
107 static inline u64 ocfs2_which_suballoc_group(u64 block,
108 unsigned int bit);
109 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 u64 bg_blkno,
111 u16 bg_bit_off);
112 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
113 u32 cluster);
114 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
115 u64 data_blkno,
116 u64 *bg_blkno,
117 u16 *bg_bit_off);
119 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
121 struct inode *inode = ac->ac_inode;
123 if (inode) {
124 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
125 ocfs2_meta_unlock(inode, 1);
127 mutex_unlock(&inode->i_mutex);
129 iput(inode);
131 if (ac->ac_bh)
132 brelse(ac->ac_bh);
133 kfree(ac);
136 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
138 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
141 /* somewhat more expensive than our other checks, so use sparingly. */
142 static int ocfs2_check_group_descriptor(struct super_block *sb,
143 struct ocfs2_dinode *di,
144 struct ocfs2_group_desc *gd)
146 unsigned int max_bits;
148 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
149 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
150 return -EIO;
153 if (di->i_blkno != gd->bg_parent_dinode) {
154 ocfs2_error(sb, "Group descriptor # %llu has bad parent "
155 "pointer (%llu, expected %llu)",
156 (unsigned long long)le64_to_cpu(gd->bg_blkno),
157 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
158 (unsigned long long)le64_to_cpu(di->i_blkno));
159 return -EIO;
162 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
163 if (le16_to_cpu(gd->bg_bits) > max_bits) {
164 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
165 (unsigned long long)le64_to_cpu(gd->bg_blkno),
166 le16_to_cpu(gd->bg_bits));
167 return -EIO;
170 if (le16_to_cpu(gd->bg_chain) >=
171 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
172 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
173 (unsigned long long)le64_to_cpu(gd->bg_blkno),
174 le16_to_cpu(gd->bg_chain));
175 return -EIO;
178 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
179 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
180 "claims that %u are free",
181 (unsigned long long)le64_to_cpu(gd->bg_blkno),
182 le16_to_cpu(gd->bg_bits),
183 le16_to_cpu(gd->bg_free_bits_count));
184 return -EIO;
187 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
188 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
189 "max bitmap bits of %u",
190 (unsigned long long)le64_to_cpu(gd->bg_blkno),
191 le16_to_cpu(gd->bg_bits),
192 8 * le16_to_cpu(gd->bg_size));
193 return -EIO;
196 return 0;
199 static int ocfs2_block_group_fill(handle_t *handle,
200 struct inode *alloc_inode,
201 struct buffer_head *bg_bh,
202 u64 group_blkno,
203 u16 my_chain,
204 struct ocfs2_chain_list *cl)
206 int status = 0;
207 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
208 struct super_block * sb = alloc_inode->i_sb;
210 mlog_entry_void();
212 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
213 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
214 "b_blocknr (%llu)",
215 (unsigned long long)group_blkno,
216 (unsigned long long) bg_bh->b_blocknr);
217 status = -EIO;
218 goto bail;
221 status = ocfs2_journal_access(handle,
222 alloc_inode,
223 bg_bh,
224 OCFS2_JOURNAL_ACCESS_CREATE);
225 if (status < 0) {
226 mlog_errno(status);
227 goto bail;
230 memset(bg, 0, sb->s_blocksize);
231 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
232 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
233 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
234 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
235 bg->bg_chain = cpu_to_le16(my_chain);
236 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
237 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
238 bg->bg_blkno = cpu_to_le64(group_blkno);
239 /* set the 1st bit in the bitmap to account for the descriptor block */
240 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
241 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
243 status = ocfs2_journal_dirty(handle, bg_bh);
244 if (status < 0)
245 mlog_errno(status);
247 /* There is no need to zero out or otherwise initialize the
248 * other blocks in a group - All valid FS metadata in a block
249 * group stores the superblock fs_generation value at
250 * allocation time. */
252 bail:
253 mlog_exit(status);
254 return status;
257 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
259 u16 curr, best;
261 best = curr = 0;
262 while (curr < le16_to_cpu(cl->cl_count)) {
263 if (le32_to_cpu(cl->cl_recs[best].c_total) >
264 le32_to_cpu(cl->cl_recs[curr].c_total))
265 best = curr;
266 curr++;
268 return best;
272 * We expect the block group allocator to already be locked.
274 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
275 struct inode *alloc_inode,
276 struct buffer_head *bh)
278 int status, credits;
279 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
280 struct ocfs2_chain_list *cl;
281 struct ocfs2_alloc_context *ac = NULL;
282 handle_t *handle = NULL;
283 u32 bit_off, num_bits;
284 u16 alloc_rec;
285 u64 bg_blkno;
286 struct buffer_head *bg_bh = NULL;
287 struct ocfs2_group_desc *bg;
289 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
291 mlog_entry_void();
293 cl = &fe->id2.i_chain;
294 status = ocfs2_reserve_clusters(osb,
295 le16_to_cpu(cl->cl_cpg),
296 &ac);
297 if (status < 0) {
298 if (status != -ENOSPC)
299 mlog_errno(status);
300 goto bail;
303 credits = ocfs2_calc_group_alloc_credits(osb->sb,
304 le16_to_cpu(cl->cl_cpg));
305 handle = ocfs2_start_trans(osb, credits);
306 if (IS_ERR(handle)) {
307 status = PTR_ERR(handle);
308 handle = NULL;
309 mlog_errno(status);
310 goto bail;
313 status = ocfs2_claim_clusters(osb,
314 handle,
316 le16_to_cpu(cl->cl_cpg),
317 &bit_off,
318 &num_bits);
319 if (status < 0) {
320 if (status != -ENOSPC)
321 mlog_errno(status);
322 goto bail;
325 alloc_rec = ocfs2_find_smallest_chain(cl);
327 /* setup the group */
328 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
329 mlog(0, "new descriptor, record %u, at block %llu\n",
330 alloc_rec, (unsigned long long)bg_blkno);
332 bg_bh = sb_getblk(osb->sb, bg_blkno);
333 if (!bg_bh) {
334 status = -EIO;
335 mlog_errno(status);
336 goto bail;
338 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
340 status = ocfs2_block_group_fill(handle,
341 alloc_inode,
342 bg_bh,
343 bg_blkno,
344 alloc_rec,
345 cl);
346 if (status < 0) {
347 mlog_errno(status);
348 goto bail;
351 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
353 status = ocfs2_journal_access(handle, alloc_inode,
354 bh, OCFS2_JOURNAL_ACCESS_WRITE);
355 if (status < 0) {
356 mlog_errno(status);
357 goto bail;
360 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
361 le16_to_cpu(bg->bg_free_bits_count));
362 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
363 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
364 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
365 le16_add_cpu(&cl->cl_next_free_rec, 1);
367 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
368 le16_to_cpu(bg->bg_free_bits_count));
369 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
370 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
372 status = ocfs2_journal_dirty(handle, bh);
373 if (status < 0) {
374 mlog_errno(status);
375 goto bail;
378 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
379 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
380 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
381 le32_to_cpu(fe->i_clusters)));
382 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
383 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
384 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
386 status = 0;
387 bail:
388 if (handle)
389 ocfs2_commit_trans(osb, handle);
391 if (ac)
392 ocfs2_free_alloc_context(ac);
394 if (bg_bh)
395 brelse(bg_bh);
397 mlog_exit(status);
398 return status;
401 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
402 struct ocfs2_alloc_context *ac,
403 int type,
404 u32 slot)
406 int status;
407 u32 bits_wanted = ac->ac_bits_wanted;
408 struct inode *alloc_inode;
409 struct buffer_head *bh = NULL;
410 struct ocfs2_dinode *fe;
411 u32 free_bits;
413 mlog_entry_void();
415 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
416 if (!alloc_inode) {
417 mlog_errno(-EINVAL);
418 return -EINVAL;
421 mutex_lock(&alloc_inode->i_mutex);
423 status = ocfs2_meta_lock(alloc_inode, &bh, 1);
424 if (status < 0) {
425 mutex_unlock(&alloc_inode->i_mutex);
426 iput(alloc_inode);
428 mlog_errno(status);
429 return status;
432 ac->ac_inode = alloc_inode;
434 fe = (struct ocfs2_dinode *) bh->b_data;
435 if (!OCFS2_IS_VALID_DINODE(fe)) {
436 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
437 status = -EIO;
438 goto bail;
440 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
441 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
442 (unsigned long long)le64_to_cpu(fe->i_blkno));
443 status = -EIO;
444 goto bail;
447 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
448 le32_to_cpu(fe->id1.bitmap1.i_used);
450 if (bits_wanted > free_bits) {
451 /* cluster bitmap never grows */
452 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
453 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
454 bits_wanted, free_bits);
455 status = -ENOSPC;
456 goto bail;
459 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
460 if (status < 0) {
461 if (status != -ENOSPC)
462 mlog_errno(status);
463 goto bail;
465 atomic_inc(&osb->alloc_stats.bg_extends);
467 /* You should never ask for this much metadata */
468 BUG_ON(bits_wanted >
469 (le32_to_cpu(fe->id1.bitmap1.i_total)
470 - le32_to_cpu(fe->id1.bitmap1.i_used)));
473 get_bh(bh);
474 ac->ac_bh = bh;
475 bail:
476 if (bh)
477 brelse(bh);
479 mlog_exit(status);
480 return status;
483 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
484 struct ocfs2_dinode *fe,
485 struct ocfs2_alloc_context **ac)
487 int status;
488 u32 slot;
490 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
491 if (!(*ac)) {
492 status = -ENOMEM;
493 mlog_errno(status);
494 goto bail;
497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
498 (*ac)->ac_which = OCFS2_AC_USE_META;
500 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 slot = 0;
502 #else
503 slot = osb->slot_num;
504 #endif
506 (*ac)->ac_group_search = ocfs2_block_group_search;
508 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
509 EXTENT_ALLOC_SYSTEM_INODE, slot);
510 if (status < 0) {
511 if (status != -ENOSPC)
512 mlog_errno(status);
513 goto bail;
516 status = 0;
517 bail:
518 if ((status < 0) && *ac) {
519 ocfs2_free_alloc_context(*ac);
520 *ac = NULL;
523 mlog_exit(status);
524 return status;
527 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
528 struct ocfs2_alloc_context **ac)
530 int status;
532 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
533 if (!(*ac)) {
534 status = -ENOMEM;
535 mlog_errno(status);
536 goto bail;
539 (*ac)->ac_bits_wanted = 1;
540 (*ac)->ac_which = OCFS2_AC_USE_INODE;
542 (*ac)->ac_group_search = ocfs2_block_group_search;
544 status = ocfs2_reserve_suballoc_bits(osb, *ac,
545 INODE_ALLOC_SYSTEM_INODE,
546 osb->slot_num);
547 if (status < 0) {
548 if (status != -ENOSPC)
549 mlog_errno(status);
550 goto bail;
553 status = 0;
554 bail:
555 if ((status < 0) && *ac) {
556 ocfs2_free_alloc_context(*ac);
557 *ac = NULL;
560 mlog_exit(status);
561 return status;
564 /* local alloc code has to do the same thing, so rather than do this
565 * twice.. */
566 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
567 struct ocfs2_alloc_context *ac)
569 int status;
571 ac->ac_which = OCFS2_AC_USE_MAIN;
572 ac->ac_group_search = ocfs2_cluster_group_search;
574 status = ocfs2_reserve_suballoc_bits(osb, ac,
575 GLOBAL_BITMAP_SYSTEM_INODE,
576 OCFS2_INVALID_SLOT);
577 if (status < 0 && status != -ENOSPC) {
578 mlog_errno(status);
579 goto bail;
582 bail:
583 return status;
586 /* Callers don't need to care which bitmap (local alloc or main) to
587 * use so we figure it out for them, but unfortunately this clutters
588 * things a bit. */
589 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
590 u32 bits_wanted,
591 struct ocfs2_alloc_context **ac)
593 int status;
595 mlog_entry_void();
597 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
598 if (!(*ac)) {
599 status = -ENOMEM;
600 mlog_errno(status);
601 goto bail;
604 (*ac)->ac_bits_wanted = bits_wanted;
606 status = -ENOSPC;
607 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
608 status = ocfs2_reserve_local_alloc_bits(osb,
609 bits_wanted,
610 *ac);
611 if ((status < 0) && (status != -ENOSPC)) {
612 mlog_errno(status);
613 goto bail;
614 } else if (status == -ENOSPC) {
615 /* reserve_local_bits will return enospc with
616 * the local alloc inode still locked, so we
617 * can change this safely here. */
618 mlog(0, "Disabling local alloc\n");
619 /* We set to OCFS2_LA_DISABLED so that umount
620 * can clean up what's left of the local
621 * allocation */
622 osb->local_alloc_state = OCFS2_LA_DISABLED;
626 if (status == -ENOSPC) {
627 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
628 if (status < 0) {
629 if (status != -ENOSPC)
630 mlog_errno(status);
631 goto bail;
635 status = 0;
636 bail:
637 if ((status < 0) && *ac) {
638 ocfs2_free_alloc_context(*ac);
639 *ac = NULL;
642 mlog_exit(status);
643 return status;
647 * More or less lifted from ext3. I'll leave their description below:
649 * "For ext3 allocations, we must not reuse any blocks which are
650 * allocated in the bitmap buffer's "last committed data" copy. This
651 * prevents deletes from freeing up the page for reuse until we have
652 * committed the delete transaction.
654 * If we didn't do this, then deleting something and reallocating it as
655 * data would allow the old block to be overwritten before the
656 * transaction committed (because we force data to disk before commit).
657 * This would lead to corruption if we crashed between overwriting the
658 * data and committing the delete.
660 * @@@ We may want to make this allocation behaviour conditional on
661 * data-writes at some point, and disable it for metadata allocations or
662 * sync-data inodes."
664 * Note: OCFS2 already does this differently for metadata vs data
665 * allocations, as those bitmaps are seperate and undo access is never
666 * called on a metadata group descriptor.
668 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
669 int nr)
671 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
673 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
674 return 0;
675 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
676 return 1;
678 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
679 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
682 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
683 struct buffer_head *bg_bh,
684 unsigned int bits_wanted,
685 unsigned int total_bits,
686 u16 *bit_off,
687 u16 *bits_found)
689 void *bitmap;
690 u16 best_offset, best_size;
691 int offset, start, found, status = 0;
692 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
694 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
695 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
696 return -EIO;
699 found = start = best_offset = best_size = 0;
700 bitmap = bg->bg_bitmap;
702 while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
703 if (offset == total_bits)
704 break;
706 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
707 /* We found a zero, but we can't use it as it
708 * hasn't been put to disk yet! */
709 found = 0;
710 start = offset + 1;
711 } else if (offset == start) {
712 /* we found a zero */
713 found++;
714 /* move start to the next bit to test */
715 start++;
716 } else {
717 /* got a zero after some ones */
718 found = 1;
719 start = offset + 1;
721 if (found > best_size) {
722 best_size = found;
723 best_offset = start - found;
725 /* we got everything we needed */
726 if (found == bits_wanted) {
727 /* mlog(0, "Found it all!\n"); */
728 break;
732 /* XXX: I think the first clause is equivalent to the second
733 * - jlbec */
734 if (found == bits_wanted) {
735 *bit_off = start - found;
736 *bits_found = found;
737 } else if (best_size) {
738 *bit_off = best_offset;
739 *bits_found = best_size;
740 } else {
741 status = -ENOSPC;
742 /* No error log here -- see the comment above
743 * ocfs2_test_bg_bit_allocatable */
746 return status;
749 static inline int ocfs2_block_group_set_bits(handle_t *handle,
750 struct inode *alloc_inode,
751 struct ocfs2_group_desc *bg,
752 struct buffer_head *group_bh,
753 unsigned int bit_off,
754 unsigned int num_bits)
756 int status;
757 void *bitmap = bg->bg_bitmap;
758 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
760 mlog_entry_void();
762 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
763 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
764 status = -EIO;
765 goto bail;
767 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
769 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
770 num_bits);
772 if (ocfs2_is_cluster_bitmap(alloc_inode))
773 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
775 status = ocfs2_journal_access(handle,
776 alloc_inode,
777 group_bh,
778 journal_type);
779 if (status < 0) {
780 mlog_errno(status);
781 goto bail;
784 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
786 while(num_bits--)
787 ocfs2_set_bit(bit_off++, bitmap);
789 status = ocfs2_journal_dirty(handle,
790 group_bh);
791 if (status < 0) {
792 mlog_errno(status);
793 goto bail;
796 bail:
797 mlog_exit(status);
798 return status;
801 /* find the one with the most empty bits */
802 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
804 u16 curr, best;
806 BUG_ON(!cl->cl_next_free_rec);
808 best = curr = 0;
809 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
810 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
811 le32_to_cpu(cl->cl_recs[best].c_free))
812 best = curr;
813 curr++;
816 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
817 return best;
820 static int ocfs2_relink_block_group(handle_t *handle,
821 struct inode *alloc_inode,
822 struct buffer_head *fe_bh,
823 struct buffer_head *bg_bh,
824 struct buffer_head *prev_bg_bh,
825 u16 chain)
827 int status;
828 /* there is a really tiny chance the journal calls could fail,
829 * but we wouldn't want inconsistent blocks in *any* case. */
830 u64 fe_ptr, bg_ptr, prev_bg_ptr;
831 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
832 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
833 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
835 if (!OCFS2_IS_VALID_DINODE(fe)) {
836 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
837 status = -EIO;
838 goto out;
840 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
841 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
842 status = -EIO;
843 goto out;
845 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
846 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
847 status = -EIO;
848 goto out;
851 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
852 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
853 (unsigned long long)le64_to_cpu(bg->bg_blkno),
854 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
856 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
857 bg_ptr = le64_to_cpu(bg->bg_next_group);
858 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
860 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
861 OCFS2_JOURNAL_ACCESS_WRITE);
862 if (status < 0) {
863 mlog_errno(status);
864 goto out_rollback;
867 prev_bg->bg_next_group = bg->bg_next_group;
869 status = ocfs2_journal_dirty(handle, prev_bg_bh);
870 if (status < 0) {
871 mlog_errno(status);
872 goto out_rollback;
875 status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
876 OCFS2_JOURNAL_ACCESS_WRITE);
877 if (status < 0) {
878 mlog_errno(status);
879 goto out_rollback;
882 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
884 status = ocfs2_journal_dirty(handle, bg_bh);
885 if (status < 0) {
886 mlog_errno(status);
887 goto out_rollback;
890 status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
891 OCFS2_JOURNAL_ACCESS_WRITE);
892 if (status < 0) {
893 mlog_errno(status);
894 goto out_rollback;
897 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
899 status = ocfs2_journal_dirty(handle, fe_bh);
900 if (status < 0) {
901 mlog_errno(status);
902 goto out_rollback;
905 status = 0;
906 out_rollback:
907 if (status < 0) {
908 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
909 bg->bg_next_group = cpu_to_le64(bg_ptr);
910 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
912 out:
913 mlog_exit(status);
914 return status;
917 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
918 u32 wanted)
920 return le16_to_cpu(bg->bg_free_bits_count) > wanted;
923 /* return 0 on success, -ENOSPC to keep searching and any other < 0
924 * value on error. */
925 static int ocfs2_cluster_group_search(struct inode *inode,
926 struct buffer_head *group_bh,
927 u32 bits_wanted, u32 min_bits,
928 u16 *bit_off, u16 *bits_found)
930 int search = -ENOSPC;
931 int ret;
932 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
933 u16 tmp_off, tmp_found;
934 unsigned int max_bits, gd_cluster_off;
936 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
938 if (gd->bg_free_bits_count) {
939 max_bits = le16_to_cpu(gd->bg_bits);
941 /* Tail groups in cluster bitmaps which aren't cpg
942 * aligned are prone to partial extention by a failed
943 * fs resize. If the file system resize never got to
944 * update the dinode cluster count, then we don't want
945 * to trust any clusters past it, regardless of what
946 * the group descriptor says. */
947 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
948 le64_to_cpu(gd->bg_blkno));
949 if ((gd_cluster_off + max_bits) >
950 OCFS2_I(inode)->ip_clusters) {
951 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
952 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
953 (unsigned long long)le64_to_cpu(gd->bg_blkno),
954 le16_to_cpu(gd->bg_bits),
955 OCFS2_I(inode)->ip_clusters, max_bits);
958 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
959 group_bh, bits_wanted,
960 max_bits,
961 &tmp_off, &tmp_found);
962 if (ret)
963 return ret;
965 /* ocfs2_block_group_find_clear_bits() might
966 * return success, but we still want to return
967 * -ENOSPC unless it found the minimum number
968 * of bits. */
969 if (min_bits <= tmp_found) {
970 *bit_off = tmp_off;
971 *bits_found = tmp_found;
972 search = 0; /* success */
976 return search;
979 static int ocfs2_block_group_search(struct inode *inode,
980 struct buffer_head *group_bh,
981 u32 bits_wanted, u32 min_bits,
982 u16 *bit_off, u16 *bits_found)
984 int ret = -ENOSPC;
985 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
987 BUG_ON(min_bits != 1);
988 BUG_ON(ocfs2_is_cluster_bitmap(inode));
990 if (bg->bg_free_bits_count)
991 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
992 group_bh, bits_wanted,
993 le16_to_cpu(bg->bg_bits),
994 bit_off, bits_found);
996 return ret;
999 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1000 handle_t *handle,
1001 struct buffer_head *di_bh,
1002 u32 num_bits,
1003 u16 chain)
1005 int ret;
1006 u32 tmp_used;
1007 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1008 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1010 ret = ocfs2_journal_access(handle, inode, di_bh,
1011 OCFS2_JOURNAL_ACCESS_WRITE);
1012 if (ret < 0) {
1013 mlog_errno(ret);
1014 goto out;
1017 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1018 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1019 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1021 ret = ocfs2_journal_dirty(handle, di_bh);
1022 if (ret < 0)
1023 mlog_errno(ret);
1025 out:
1026 return ret;
1029 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1030 handle_t *handle,
1031 u32 bits_wanted,
1032 u32 min_bits,
1033 u16 *bit_off,
1034 unsigned int *num_bits,
1035 u64 gd_blkno,
1036 u16 *bits_left)
1038 int ret;
1039 u16 found;
1040 struct buffer_head *group_bh = NULL;
1041 struct ocfs2_group_desc *gd;
1042 struct inode *alloc_inode = ac->ac_inode;
1044 ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1045 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1046 if (ret < 0) {
1047 mlog_errno(ret);
1048 return ret;
1051 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1052 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1053 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1054 ret = -EIO;
1055 goto out;
1058 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1059 bit_off, &found);
1060 if (ret < 0) {
1061 if (ret != -ENOSPC)
1062 mlog_errno(ret);
1063 goto out;
1066 *num_bits = found;
1068 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1069 *num_bits,
1070 le16_to_cpu(gd->bg_chain));
1071 if (ret < 0) {
1072 mlog_errno(ret);
1073 goto out;
1076 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1077 *bit_off, *num_bits);
1078 if (ret < 0)
1079 mlog_errno(ret);
1081 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1083 out:
1084 brelse(group_bh);
1086 return ret;
1089 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1090 handle_t *handle,
1091 u32 bits_wanted,
1092 u32 min_bits,
1093 u16 *bit_off,
1094 unsigned int *num_bits,
1095 u64 *bg_blkno,
1096 u16 *bits_left)
1098 int status;
1099 u16 chain, tmp_bits;
1100 u32 tmp_used;
1101 u64 next_group;
1102 struct inode *alloc_inode = ac->ac_inode;
1103 struct buffer_head *group_bh = NULL;
1104 struct buffer_head *prev_group_bh = NULL;
1105 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1106 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1107 struct ocfs2_group_desc *bg;
1109 chain = ac->ac_chain;
1110 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1111 bits_wanted, chain,
1112 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1114 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1115 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1116 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1117 if (status < 0) {
1118 mlog_errno(status);
1119 goto bail;
1121 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1122 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1123 if (status) {
1124 mlog_errno(status);
1125 goto bail;
1128 status = -ENOSPC;
1129 /* for now, the chain search is a bit simplistic. We just use
1130 * the 1st group with any empty bits. */
1131 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1132 bits_wanted, min_bits, bit_off,
1133 &tmp_bits)) == -ENOSPC) {
1134 if (!bg->bg_next_group)
1135 break;
1137 if (prev_group_bh) {
1138 brelse(prev_group_bh);
1139 prev_group_bh = NULL;
1141 next_group = le64_to_cpu(bg->bg_next_group);
1142 prev_group_bh = group_bh;
1143 group_bh = NULL;
1144 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1145 next_group, &group_bh,
1146 OCFS2_BH_CACHED, alloc_inode);
1147 if (status < 0) {
1148 mlog_errno(status);
1149 goto bail;
1151 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1152 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1153 if (status) {
1154 mlog_errno(status);
1155 goto bail;
1158 if (status < 0) {
1159 if (status != -ENOSPC)
1160 mlog_errno(status);
1161 goto bail;
1164 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1165 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1167 *num_bits = tmp_bits;
1169 BUG_ON(*num_bits == 0);
1172 * Keep track of previous block descriptor read. When
1173 * we find a target, if we have read more than X
1174 * number of descriptors, and the target is reasonably
1175 * empty, relink him to top of his chain.
1177 * We've read 0 extra blocks and only send one more to
1178 * the transaction, yet the next guy to search has a
1179 * much easier time.
1181 * Do this *after* figuring out how many bits we're taking out
1182 * of our target group.
1184 if (ac->ac_allow_chain_relink &&
1185 (prev_group_bh) &&
1186 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1187 status = ocfs2_relink_block_group(handle, alloc_inode,
1188 ac->ac_bh, group_bh,
1189 prev_group_bh, chain);
1190 if (status < 0) {
1191 mlog_errno(status);
1192 goto bail;
1196 /* Ok, claim our bits now: set the info on dinode, chainlist
1197 * and then the group */
1198 status = ocfs2_journal_access(handle,
1199 alloc_inode,
1200 ac->ac_bh,
1201 OCFS2_JOURNAL_ACCESS_WRITE);
1202 if (status < 0) {
1203 mlog_errno(status);
1204 goto bail;
1207 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1208 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1209 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1211 status = ocfs2_journal_dirty(handle,
1212 ac->ac_bh);
1213 if (status < 0) {
1214 mlog_errno(status);
1215 goto bail;
1218 status = ocfs2_block_group_set_bits(handle,
1219 alloc_inode,
1221 group_bh,
1222 *bit_off,
1223 *num_bits);
1224 if (status < 0) {
1225 mlog_errno(status);
1226 goto bail;
1229 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1230 (unsigned long long)le64_to_cpu(fe->i_blkno));
1232 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1233 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1234 bail:
1235 if (group_bh)
1236 brelse(group_bh);
1237 if (prev_group_bh)
1238 brelse(prev_group_bh);
1240 mlog_exit(status);
1241 return status;
1244 /* will give out up to bits_wanted contiguous bits. */
1245 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1246 struct ocfs2_alloc_context *ac,
1247 handle_t *handle,
1248 u32 bits_wanted,
1249 u32 min_bits,
1250 u16 *bit_off,
1251 unsigned int *num_bits,
1252 u64 *bg_blkno)
1254 int status;
1255 u16 victim, i;
1256 u16 bits_left = 0;
1257 u64 hint_blkno = ac->ac_last_group;
1258 struct ocfs2_chain_list *cl;
1259 struct ocfs2_dinode *fe;
1261 mlog_entry_void();
1263 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1264 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1265 BUG_ON(!ac->ac_bh);
1267 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1268 if (!OCFS2_IS_VALID_DINODE(fe)) {
1269 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1270 status = -EIO;
1271 goto bail;
1273 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1274 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1275 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1276 "bits but only %u total.",
1277 (unsigned long long)le64_to_cpu(fe->i_blkno),
1278 le32_to_cpu(fe->id1.bitmap1.i_used),
1279 le32_to_cpu(fe->id1.bitmap1.i_total));
1280 status = -EIO;
1281 goto bail;
1284 if (hint_blkno) {
1285 /* Attempt to short-circuit the usual search mechanism
1286 * by jumping straight to the most recently used
1287 * allocation group. This helps us mantain some
1288 * contiguousness across allocations. */
1289 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1290 min_bits, bit_off, num_bits,
1291 hint_blkno, &bits_left);
1292 if (!status) {
1293 /* Be careful to update *bg_blkno here as the
1294 * caller is expecting it to be filled in, and
1295 * ocfs2_search_one_group() won't do that for
1296 * us. */
1297 *bg_blkno = hint_blkno;
1298 goto set_hint;
1300 if (status < 0 && status != -ENOSPC) {
1301 mlog_errno(status);
1302 goto bail;
1306 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1308 victim = ocfs2_find_victim_chain(cl);
1309 ac->ac_chain = victim;
1310 ac->ac_allow_chain_relink = 1;
1312 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1313 num_bits, bg_blkno, &bits_left);
1314 if (!status)
1315 goto set_hint;
1316 if (status < 0 && status != -ENOSPC) {
1317 mlog_errno(status);
1318 goto bail;
1321 mlog(0, "Search of victim chain %u came up with nothing, "
1322 "trying all chains now.\n", victim);
1324 /* If we didn't pick a good victim, then just default to
1325 * searching each chain in order. Don't allow chain relinking
1326 * because we only calculate enough journal credits for one
1327 * relink per alloc. */
1328 ac->ac_allow_chain_relink = 0;
1329 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1330 if (i == victim)
1331 continue;
1332 if (!cl->cl_recs[i].c_free)
1333 continue;
1335 ac->ac_chain = i;
1336 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1337 bit_off, num_bits, bg_blkno,
1338 &bits_left);
1339 if (!status)
1340 break;
1341 if (status < 0 && status != -ENOSPC) {
1342 mlog_errno(status);
1343 goto bail;
1347 set_hint:
1348 if (status != -ENOSPC) {
1349 /* If the next search of this group is not likely to
1350 * yield a suitable extent, then we reset the last
1351 * group hint so as to not waste a disk read */
1352 if (bits_left < min_bits)
1353 ac->ac_last_group = 0;
1354 else
1355 ac->ac_last_group = *bg_blkno;
1358 bail:
1359 mlog_exit(status);
1360 return status;
1363 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1364 handle_t *handle,
1365 struct ocfs2_alloc_context *ac,
1366 u32 bits_wanted,
1367 u16 *suballoc_bit_start,
1368 unsigned int *num_bits,
1369 u64 *blkno_start)
1371 int status;
1372 u64 bg_blkno;
1374 BUG_ON(!ac);
1375 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1376 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1378 status = ocfs2_claim_suballoc_bits(osb,
1380 handle,
1381 bits_wanted,
1383 suballoc_bit_start,
1384 num_bits,
1385 &bg_blkno);
1386 if (status < 0) {
1387 mlog_errno(status);
1388 goto bail;
1390 atomic_inc(&osb->alloc_stats.bg_allocs);
1392 *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1393 ac->ac_bits_given += (*num_bits);
1394 status = 0;
1395 bail:
1396 mlog_exit(status);
1397 return status;
1400 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1401 handle_t *handle,
1402 struct ocfs2_alloc_context *ac,
1403 u16 *suballoc_bit,
1404 u64 *fe_blkno)
1406 int status;
1407 unsigned int num_bits;
1408 u64 bg_blkno;
1410 mlog_entry_void();
1412 BUG_ON(!ac);
1413 BUG_ON(ac->ac_bits_given != 0);
1414 BUG_ON(ac->ac_bits_wanted != 1);
1415 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1417 status = ocfs2_claim_suballoc_bits(osb,
1419 handle,
1422 suballoc_bit,
1423 &num_bits,
1424 &bg_blkno);
1425 if (status < 0) {
1426 mlog_errno(status);
1427 goto bail;
1429 atomic_inc(&osb->alloc_stats.bg_allocs);
1431 BUG_ON(num_bits != 1);
1433 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1434 ac->ac_bits_given++;
1435 status = 0;
1436 bail:
1437 mlog_exit(status);
1438 return status;
1441 /* translate a group desc. blkno and it's bitmap offset into
1442 * disk cluster offset. */
1443 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1444 u64 bg_blkno,
1445 u16 bg_bit_off)
1447 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1448 u32 cluster = 0;
1450 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1452 if (bg_blkno != osb->first_cluster_group_blkno)
1453 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1454 cluster += (u32) bg_bit_off;
1455 return cluster;
1458 /* given a cluster offset, calculate which block group it belongs to
1459 * and return that block offset. */
1460 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1461 u32 cluster)
1463 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1464 u32 group_no;
1466 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1468 group_no = cluster / osb->bitmap_cpg;
1469 if (!group_no)
1470 return osb->first_cluster_group_blkno;
1471 return ocfs2_clusters_to_blocks(inode->i_sb,
1472 group_no * osb->bitmap_cpg);
1475 /* given the block number of a cluster start, calculate which cluster
1476 * group and descriptor bitmap offset that corresponds to. */
1477 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1478 u64 data_blkno,
1479 u64 *bg_blkno,
1480 u16 *bg_bit_off)
1482 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1483 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1485 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1487 *bg_blkno = ocfs2_which_cluster_group(inode,
1488 data_cluster);
1490 if (*bg_blkno == osb->first_cluster_group_blkno)
1491 *bg_bit_off = (u16) data_cluster;
1492 else
1493 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1494 data_blkno - *bg_blkno);
1498 * min_bits - minimum contiguous chunk from this total allocation we
1499 * can handle. set to what we asked for originally for a full
1500 * contig. allocation, set to '1' to indicate we can deal with extents
1501 * of any size.
1503 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1504 handle_t *handle,
1505 struct ocfs2_alloc_context *ac,
1506 u32 min_clusters,
1507 u32 *cluster_start,
1508 u32 *num_clusters)
1510 int status;
1511 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1512 u64 bg_blkno = 0;
1513 u16 bg_bit_off;
1515 mlog_entry_void();
1517 BUG_ON(!ac);
1518 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1520 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1521 && ac->ac_which != OCFS2_AC_USE_MAIN);
1523 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1524 status = ocfs2_claim_local_alloc_bits(osb,
1525 handle,
1527 bits_wanted,
1528 cluster_start,
1529 num_clusters);
1530 if (!status)
1531 atomic_inc(&osb->alloc_stats.local_data);
1532 } else {
1533 if (min_clusters > (osb->bitmap_cpg - 1)) {
1534 /* The only paths asking for contiguousness
1535 * should know about this already. */
1536 mlog(ML_ERROR, "minimum allocation requested exceeds "
1537 "group bitmap size!");
1538 status = -ENOSPC;
1539 goto bail;
1541 /* clamp the current request down to a realistic size. */
1542 if (bits_wanted > (osb->bitmap_cpg - 1))
1543 bits_wanted = osb->bitmap_cpg - 1;
1545 status = ocfs2_claim_suballoc_bits(osb,
1547 handle,
1548 bits_wanted,
1549 min_clusters,
1550 &bg_bit_off,
1551 num_clusters,
1552 &bg_blkno);
1553 if (!status) {
1554 *cluster_start =
1555 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1556 bg_blkno,
1557 bg_bit_off);
1558 atomic_inc(&osb->alloc_stats.bitmap_data);
1561 if (status < 0) {
1562 if (status != -ENOSPC)
1563 mlog_errno(status);
1564 goto bail;
1567 ac->ac_bits_given += *num_clusters;
1569 bail:
1570 mlog_exit(status);
1571 return status;
1574 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1575 struct inode *alloc_inode,
1576 struct ocfs2_group_desc *bg,
1577 struct buffer_head *group_bh,
1578 unsigned int bit_off,
1579 unsigned int num_bits)
1581 int status;
1582 unsigned int tmp;
1583 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1584 struct ocfs2_group_desc *undo_bg = NULL;
1586 mlog_entry_void();
1588 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1589 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1590 status = -EIO;
1591 goto bail;
1594 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1596 if (ocfs2_is_cluster_bitmap(alloc_inode))
1597 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1599 status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1600 journal_type);
1601 if (status < 0) {
1602 mlog_errno(status);
1603 goto bail;
1606 if (ocfs2_is_cluster_bitmap(alloc_inode))
1607 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1609 tmp = num_bits;
1610 while(tmp--) {
1611 ocfs2_clear_bit((bit_off + tmp),
1612 (unsigned long *) bg->bg_bitmap);
1613 if (ocfs2_is_cluster_bitmap(alloc_inode))
1614 ocfs2_set_bit(bit_off + tmp,
1615 (unsigned long *) undo_bg->bg_bitmap);
1617 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1619 status = ocfs2_journal_dirty(handle, group_bh);
1620 if (status < 0)
1621 mlog_errno(status);
1622 bail:
1623 return status;
1627 * expects the suballoc inode to already be locked.
1629 static int ocfs2_free_suballoc_bits(handle_t *handle,
1630 struct inode *alloc_inode,
1631 struct buffer_head *alloc_bh,
1632 unsigned int start_bit,
1633 u64 bg_blkno,
1634 unsigned int count)
1636 int status = 0;
1637 u32 tmp_used;
1638 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1639 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1640 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1641 struct buffer_head *group_bh = NULL;
1642 struct ocfs2_group_desc *group;
1644 mlog_entry_void();
1646 if (!OCFS2_IS_VALID_DINODE(fe)) {
1647 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1648 status = -EIO;
1649 goto bail;
1651 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1653 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1654 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1655 (unsigned long long)bg_blkno, start_bit);
1657 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1658 alloc_inode);
1659 if (status < 0) {
1660 mlog_errno(status);
1661 goto bail;
1664 group = (struct ocfs2_group_desc *) group_bh->b_data;
1665 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
1666 if (status) {
1667 mlog_errno(status);
1668 goto bail;
1670 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1672 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1673 group, group_bh,
1674 start_bit, count);
1675 if (status < 0) {
1676 mlog_errno(status);
1677 goto bail;
1680 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1681 OCFS2_JOURNAL_ACCESS_WRITE);
1682 if (status < 0) {
1683 mlog_errno(status);
1684 goto bail;
1687 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1688 count);
1689 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1690 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1692 status = ocfs2_journal_dirty(handle, alloc_bh);
1693 if (status < 0) {
1694 mlog_errno(status);
1695 goto bail;
1698 bail:
1699 if (group_bh)
1700 brelse(group_bh);
1702 mlog_exit(status);
1703 return status;
1706 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1708 u64 group = block - (u64) bit;
1710 return group;
1713 int ocfs2_free_dinode(handle_t *handle,
1714 struct inode *inode_alloc_inode,
1715 struct buffer_head *inode_alloc_bh,
1716 struct ocfs2_dinode *di)
1718 u64 blk = le64_to_cpu(di->i_blkno);
1719 u16 bit = le16_to_cpu(di->i_suballoc_bit);
1720 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1722 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1723 inode_alloc_bh, bit, bg_blkno, 1);
1726 int ocfs2_free_extent_block(handle_t *handle,
1727 struct inode *eb_alloc_inode,
1728 struct buffer_head *eb_alloc_bh,
1729 struct ocfs2_extent_block *eb)
1731 u64 blk = le64_to_cpu(eb->h_blkno);
1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1736 bit, bg_blkno, 1);
1739 int ocfs2_free_clusters(handle_t *handle,
1740 struct inode *bitmap_inode,
1741 struct buffer_head *bitmap_bh,
1742 u64 start_blk,
1743 unsigned int num_clusters)
1745 int status;
1746 u16 bg_start_bit;
1747 u64 bg_blkno;
1748 struct ocfs2_dinode *fe;
1750 /* You can't ever have a contiguous set of clusters
1751 * bigger than a block group bitmap so we never have to worry
1752 * about looping on them. */
1754 mlog_entry_void();
1756 /* This is expensive. We can safely remove once this stuff has
1757 * gotten tested really well. */
1758 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1760 fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1762 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1763 &bg_start_bit);
1765 mlog(0, "want to free %u clusters starting at block %llu\n",
1766 num_clusters, (unsigned long long)start_blk);
1767 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1768 (unsigned long long)bg_blkno, bg_start_bit);
1770 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1771 bg_start_bit, bg_blkno,
1772 num_clusters);
1773 if (status < 0)
1774 mlog_errno(status);
1776 mlog_exit(status);
1777 return status;
1780 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1782 printk("Block Group:\n");
1783 printk("bg_signature: %s\n", bg->bg_signature);
1784 printk("bg_size: %u\n", bg->bg_size);
1785 printk("bg_bits: %u\n", bg->bg_bits);
1786 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1787 printk("bg_chain: %u\n", bg->bg_chain);
1788 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
1789 printk("bg_next_group: %llu\n",
1790 (unsigned long long)bg->bg_next_group);
1791 printk("bg_parent_dinode: %llu\n",
1792 (unsigned long long)bg->bg_parent_dinode);
1793 printk("bg_blkno: %llu\n",
1794 (unsigned long long)bg->bg_blkno);
1797 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1799 int i;
1801 printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
1802 printk("i_signature: %s\n", fe->i_signature);
1803 printk("i_size: %llu\n",
1804 (unsigned long long)fe->i_size);
1805 printk("i_clusters: %u\n", fe->i_clusters);
1806 printk("i_generation: %u\n",
1807 le32_to_cpu(fe->i_generation));
1808 printk("id1.bitmap1.i_used: %u\n",
1809 le32_to_cpu(fe->id1.bitmap1.i_used));
1810 printk("id1.bitmap1.i_total: %u\n",
1811 le32_to_cpu(fe->id1.bitmap1.i_total));
1812 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
1813 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
1814 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
1815 printk("id2.i_chain.cl_next_free_rec: %u\n",
1816 fe->id2.i_chain.cl_next_free_rec);
1817 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1818 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
1819 fe->id2.i_chain.cl_recs[i].c_free);
1820 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1821 fe->id2.i_chain.cl_recs[i].c_total);
1822 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
1823 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);