1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
6 * Block/Cluster mapping functions
8 * Copyright (C) 2004 Oracle. All rights reserved.
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 02111-1307, USA.
26 #include <linux/init.h>
27 #include <linux/types.h>
28 #include <linux/fiemap.h>
30 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
31 #include <cluster/masklog.h>
37 #include "extent_map.h"
41 #include "buffer_head_io.h"
44 * The extent caching implementation is intentionally trivial.
46 * We only cache a small number of extents stored directly on the
47 * inode, so linear order operations are acceptable. If we ever want
48 * to increase the size of the extent map, then these algorithms must
52 void ocfs2_extent_map_init(struct inode
*inode
)
54 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
56 oi
->ip_extent_map
.em_num_items
= 0;
57 INIT_LIST_HEAD(&oi
->ip_extent_map
.em_list
);
/*
 * Scan em->em_list for a cached item whose logical cluster range
 * [ei_cpos, ei_cpos + ei_clusters) contains cpos. A matching item is
 * moved to the head of em->em_list with list_move().
 *
 * NOTE(review): lines of this function are missing from the reviewed
 * text (the cpos parameter declaration, the declaration of range, any
 * store through ret_emi, the loop exit and the closing braces). How the
 * result is reported through ret_emi must be confirmed against the
 * complete file.
 */
60 static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map
*em
,
62 struct ocfs2_extent_map_item
**ret_emi
)
65 struct ocfs2_extent_map_item
*emi
;
69 list_for_each_entry(emi
, &em
->em_list
, ei_list
) {
/* range is the first logical cluster past the end of this item. */
70 range
= emi
->ei_cpos
+ emi
->ei_clusters
;
72 if (cpos
>= emi
->ei_cpos
&& cpos
< range
) {
73 list_move(&emi
->ei_list
, &em
->em_list
);
/*
 * Look up logical cluster cpos in the inode's cached extent map while
 * holding oi->ip_lock. From the item handed back by
 * __ocfs2_extent_map_lookup(), the offset of cpos inside the item
 * (coff = cpos - ei_cpos) is used to fill in:
 *   *phys  = ei_phys + coff
 *   *len   = ei_clusters - coff
 *   *flags = ei_flags
 *
 * NOTE(review): lines are missing from the reviewed text (the flags
 * parameter and coff declarations, whatever guards the emi dereferences
 * and the optional outputs, and the return statements). Confirm the
 * cache-miss behaviour and return values against the complete file.
 */
81 static int ocfs2_extent_map_lookup(struct inode
*inode
, unsigned int cpos
,
82 unsigned int *phys
, unsigned int *len
,
86 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
87 struct ocfs2_extent_map_item
*emi
;
89 spin_lock(&oi
->ip_lock
);
91 __ocfs2_extent_map_lookup(&oi
->ip_extent_map
, cpos
, &emi
);
93 coff
= cpos
- emi
->ei_cpos
;
94 *phys
= emi
->ei_phys
+ coff
;
96 *len
= emi
->ei_clusters
- coff
;
98 *flags
= emi
->ei_flags
;
101 spin_unlock(&oi
->ip_lock
);
/*
 * Drop cached extent information at or beyond logical cluster cpos.
 *
 * Under oi->ip_lock, every item that starts at or after cpos is moved
 * from em->em_list onto tmp_list. For the other items the visible code
 * computes the item's end (range) and shortens ei_clusters to
 * cpos - ei_cpos ("Partial truncate"). After the lock is released, the
 * items collected on tmp_list are unlinked with list_del().
 *
 * NOTE(review): lines are missing from the reviewed text (declarations
 * of tmp_list and range, the condition guarding the partial truncate,
 * the em_num_items update, and anything that releases the unlinked
 * items). Verify against the complete file before relying on this
 * summary.
 */
110 * Forget about all clusters equal to or greater than cpos.
112 void ocfs2_extent_map_trunc(struct inode
*inode
, unsigned int cpos
)
114 struct ocfs2_extent_map_item
*emi
, *n
;
115 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
116 struct ocfs2_extent_map
*em
= &oi
->ip_extent_map
;
120 spin_lock(&oi
->ip_lock
);
121 list_for_each_entry_safe(emi
, n
, &em
->em_list
, ei_list
) {
122 if (emi
->ei_cpos
>= cpos
) {
123 /* Full truncate of this record. */
124 list_move(&emi
->ei_list
, &tmp_list
);
125 BUG_ON(em
->em_num_items
== 0);
130 range
= emi
->ei_cpos
+ emi
->ei_clusters
;
132 /* Partial truncate */
133 emi
->ei_clusters
= cpos
- emi
->ei_cpos
;
136 spin_unlock(&oi
->ip_lock
);
/* The removed items are processed only after ip_lock has been dropped. */
138 list_for_each_entry_safe(emi
, n
, &tmp_list
, ei_list
) {
139 list_del(&emi
->ei_list
);
145 * Is any part of emi2 contained within emi1
147 static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item
*emi1
,
148 struct ocfs2_extent_map_item
*emi2
)
150 unsigned int range1
, range2
;
153 * Check if logical start of emi2 is inside emi1
155 range1
= emi1
->ei_cpos
+ emi1
->ei_clusters
;
156 if (emi2
->ei_cpos
>= emi1
->ei_cpos
&& emi2
->ei_cpos
< range1
)
160 * Check if logical end of emi2 is inside emi1
162 range2
= emi2
->ei_cpos
+ emi2
->ei_clusters
;
163 if (range2
> emi1
->ei_cpos
&& range2
<= range1
)
169 static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item
*dest
,
170 struct ocfs2_extent_map_item
*src
)
172 dest
->ei_cpos
= src
->ei_cpos
;
173 dest
->ei_phys
= src
->ei_phys
;
174 dest
->ei_clusters
= src
->ei_clusters
;
175 dest
->ei_flags
= src
->ei_flags
;
179 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
182 static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item
*emi
,
183 struct ocfs2_extent_map_item
*ins
)
186 * Handle contiguousness
188 if (ins
->ei_phys
== (emi
->ei_phys
+ emi
->ei_clusters
) &&
189 ins
->ei_cpos
== (emi
->ei_cpos
+ emi
->ei_clusters
) &&
190 ins
->ei_flags
== emi
->ei_flags
) {
191 emi
->ei_clusters
+= ins
->ei_clusters
;
193 } else if ((ins
->ei_phys
+ ins
->ei_clusters
) == emi
->ei_phys
&&
194 (ins
->ei_cpos
+ ins
->ei_clusters
) == emi
->ei_phys
&&
195 ins
->ei_flags
== emi
->ei_flags
) {
196 emi
->ei_phys
= ins
->ei_phys
;
197 emi
->ei_cpos
= ins
->ei_cpos
;
198 emi
->ei_clusters
+= ins
->ei_clusters
;
203 * Overlapping extents - this shouldn't happen unless we've
204 * split an extent to change it's flags. That is exceedingly
205 * rare, so there's no sense in trying to optimize it yet.
207 if (ocfs2_ei_is_contained(emi
, ins
) ||
208 ocfs2_ei_is_contained(ins
, emi
)) {
209 ocfs2_copy_emi_fields(emi
, ins
);
213 /* No merge was possible. */
/*
 * Cache the extent described by the on-disk record rec in the inode's
 * extent map.
 *
 * The little-endian fields of rec are first converted into a stack item
 * (ins): e_cpos -> ei_cpos, e_blkno converted from blocks to clusters ->
 * ei_phys, e_leaf_clusters -> ei_clusters, e_flags -> ei_flags.
 *
 * Under oi->ip_lock each cached item is offered to
 * ocfs2_try_to_merge_extent_map(); on success that item is moved to the
 * head of em->em_list and the lock is dropped. If nothing merged and
 * fewer than OCFS2_MAX_EXTENT_MAP_ITEMS items are cached, a new item is
 * allocated with kmalloc(GFP_NOFS) after the lock has been released,
 * filled from ins and added at the head of the list. The remaining
 * visible code takes the item at the tail of the list (em_list.prev),
 * moves it to the head and overwrites it with ins.
 *
 * NOTE(review): control-flow lines are missing from the reviewed text
 * (labels and gotos, the else keyword, allocation-failure handling,
 * em_num_items updates and the final cleanup). Presumably the tail
 * overwrite is the "map is full" branch - confirm against the complete
 * file.
 */
218 * In order to reduce complexity on the caller, this insert function
219 * is intentionally liberal in what it will accept.
221 * The only rule is that the truncate call *must* be used whenever
222 * records have been deleted. This avoids inserting overlapping
223 * records with different physical mappings.
225 void ocfs2_extent_map_insert_rec(struct inode
*inode
,
226 struct ocfs2_extent_rec
*rec
)
228 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
229 struct ocfs2_extent_map
*em
= &oi
->ip_extent_map
;
230 struct ocfs2_extent_map_item
*emi
, *new_emi
= NULL
;
231 struct ocfs2_extent_map_item ins
;
233 ins
.ei_cpos
= le32_to_cpu(rec
->e_cpos
);
234 ins
.ei_phys
= ocfs2_blocks_to_clusters(inode
->i_sb
,
235 le64_to_cpu(rec
->e_blkno
));
236 ins
.ei_clusters
= le16_to_cpu(rec
->e_leaf_clusters
);
237 ins
.ei_flags
= rec
->e_flags
;
240 spin_lock(&oi
->ip_lock
);
242 list_for_each_entry(emi
, &em
->em_list
, ei_list
) {
243 if (ocfs2_try_to_merge_extent_map(emi
, &ins
)) {
244 list_move(&emi
->ei_list
, &em
->em_list
);
245 spin_unlock(&oi
->ip_lock
);
251 * No item could be merged.
253 * Either allocate and add a new item, or overwrite the last recently
257 if (em
->em_num_items
< OCFS2_MAX_EXTENT_MAP_ITEMS
) {
258 if (new_emi
== NULL
) {
/* The lock is released before the allocation call below. */
259 spin_unlock(&oi
->ip_lock
);
261 new_emi
= kmalloc(sizeof(*new_emi
), GFP_NOFS
);
268 ocfs2_copy_emi_fields(new_emi
, &ins
);
269 list_add(&new_emi
->ei_list
, &em
->em_list
);
273 BUG_ON(list_empty(&em
->em_list
) || em
->em_num_items
== 0);
274 emi
= list_entry(em
->em_list
.prev
,
275 struct ocfs2_extent_map_item
, ei_list
);
276 list_move(&emi
->ei_list
, &em
->em_list
);
277 ocfs2_copy_emi_fields(emi
, &ins
);
280 spin_unlock(&oi
->ip_lock
);
/*
 * Read the extent block named by di->i_last_eb_blk and examine its
 * extent list. The visible code:
 *   - reads the block with ocfs2_read_block();
 *   - checks it with OCFS2_IS_VALID_EXTENT_BLOCK() and invokes
 *     OCFS2_RO_ON_INVALID_EXTENT_BLOCK() when the check fails;
 *   - reports via ocfs2_error() when the list has a non-zero
 *     l_tree_depth (a leaf is expected);
 *   - tests whether l_next_free_rec is 0, or is 1 with an empty first
 *     record as judged by ocfs2_is_empty_extent().
 *
 * NOTE(review): lines are missing from the reviewed text (declarations
 * of ret and next_free, the assignment of el, the error paths, the
 * return value and any release of eb_bh). Presumably a positive return
 * means "empty" - confirm against the complete file.
 */
287 static int ocfs2_last_eb_is_empty(struct inode
*inode
,
288 struct ocfs2_dinode
*di
)
291 u64 last_eb_blk
= le64_to_cpu(di
->i_last_eb_blk
);
292 struct buffer_head
*eb_bh
= NULL
;
293 struct ocfs2_extent_block
*eb
;
294 struct ocfs2_extent_list
*el
;
296 ret
= ocfs2_read_block(inode
, last_eb_blk
, &eb_bh
);
302 eb
= (struct ocfs2_extent_block
*) eb_bh
->b_data
;
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb
)) {
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, eb
);
311 if (el
->l_tree_depth
) {
312 ocfs2_error(inode
->i_sb
,
313 "Inode %lu has non zero tree depth in "
314 "leaf block %llu\n", inode
->i_ino
,
315 (unsigned long long)eb_bh
->b_blocknr
);
320 next_free
= le16_to_cpu(el
->l_next_free_rec
);
322 if (next_free
== 0 ||
323 (next_free
== 1 && ocfs2_is_empty_extent(&el
->l_recs
[0])))
/*
 * Walk the used records of el (indices 0 up to l_next_free_rec) in
 * order, comparing v_cluster with each record's starting logical
 * cluster (e_cpos).
 *
 * NOTE(review): the second parameter's declaration, the declaration of
 * i, and the statements that leave the loop and return are missing from
 * the reviewed text. Callers in this file compare the result with
 * l_next_free_rec, which suggests that value means "no such record" -
 * confirm against the complete file.
 */
332 * Return the 1st index within el which contains an extent start
333 * larger than v_cluster.
335 static int ocfs2_search_for_hole_index(struct ocfs2_extent_list
*el
,
339 struct ocfs2_extent_rec
*rec
;
341 for(i
= 0; i
< le16_to_cpu(el
->l_next_free_rec
); i
++) {
342 rec
= &el
->l_recs
[i
];
344 if (v_cluster
< le32_to_cpu(rec
->e_cpos
))
/*
 * Compute, into *num_clusters, the length of the hole that starts at
 * v_cluster.
 *
 * The visible code searches el with ocfs2_search_for_hole_index(). If
 * the search runs off the end of the list and an extent block (eb_bh)
 * was supplied, the next leaf (h_next_leaf_blk) is read, validated with
 * OCFS2_IS_VALID_EXTENT_BLOCK(), and searched in turn; a zero
 * h_next_leaf_blk jumps to no_more_extents. When no record past
 * v_cluster exists the result is UINT_MAX - v_cluster; otherwise it is
 * the start (e_cpos) of the record found minus v_cluster.
 *
 * NOTE(review): lines are missing from the reviewed text (the trailing
 * parameters, declarations of ret and i, error handling, the
 * no_more_extents label, the else keyword, the return value and any
 * release of next_eb_bh). Confirm against the complete file.
 */
352 * Figure out the size of a hole which starts at v_cluster within the given
355 * If there is no more allocation past v_cluster, we return the maximum
356 * cluster size minus v_cluster.
358 * If we have in-inode extents, then el points to the dinode list and
359 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
362 static int ocfs2_figure_hole_clusters(struct inode
*inode
,
363 struct ocfs2_extent_list
*el
,
364 struct buffer_head
*eb_bh
,
369 struct buffer_head
*next_eb_bh
= NULL
;
370 struct ocfs2_extent_block
*eb
, *next_eb
;
372 i
= ocfs2_search_for_hole_index(el
, v_cluster
);
374 if (i
== le16_to_cpu(el
->l_next_free_rec
) && eb_bh
) {
375 eb
= (struct ocfs2_extent_block
*)eb_bh
->b_data
;
378 * Check the next leaf for any extents.
381 if (le64_to_cpu(eb
->h_next_leaf_blk
) == 0ULL)
382 goto no_more_extents
;
384 ret
= ocfs2_read_block(inode
,
385 le64_to_cpu(eb
->h_next_leaf_blk
),
391 next_eb
= (struct ocfs2_extent_block
*)next_eb_bh
->b_data
;
393 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb
)) {
395 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode
->i_sb
, next_eb
);
/* From here on the search continues in the next leaf's extent list. */
399 el
= &next_eb
->h_list
;
401 i
= ocfs2_search_for_hole_index(el
, v_cluster
);
405 if (i
== le16_to_cpu(el
->l_next_free_rec
)) {
407 * We're at the end of our existing allocation. Just
408 * return the maximum number of clusters we could
411 *num_clusters
= UINT_MAX
- v_cluster
;
413 *num_clusters
= le32_to_cpu(el
->l_recs
[i
].e_cpos
) - v_cluster
;
/*
 * Find the extent record covering logical cluster v_cluster by reading
 * the extent tree directly from the inode buffer di_bh, without using
 * the extent map cache.
 *
 * The visible code:
 *   - zeroes *ret_rec up front;
 *   - starts from the in-inode extent list (di->id2.i_list) and, when
 *     the tree depth is greater than zero, locates the leaf block with
 *     ocfs2_find_leaf();
 *   - reports via ocfs2_error() if a leaf list has a non-zero
 *     l_tree_depth;
 *   - looks up the record index with ocfs2_search_extent_list();
 *   - calls ocfs2_figure_hole_clusters() for the hole case (hole lengths
 *     are returned separately because they can exceed the maximum
 *     extent size);
 *   - for a record, asserts v_cluster >= e_cpos and reports a "bad
 *     extent record" via ocfs2_error();
 *   - performs the "is this the last extent" check described in the
 *     comment below, calling ocfs2_last_eb_is_empty() when the leaf is
 *     second to rightmost.
 *
 * NOTE(review): many lines are missing from the reviewed text (branch
 * conditions, error paths, how ret_rec/hole_len/is_last are stored, the
 * return value and any release of eb_bh). Treat this summary as an
 * outline only and confirm the details against the complete file.
 */
422 static int ocfs2_get_clusters_nocache(struct inode
*inode
,
423 struct buffer_head
*di_bh
,
424 u32 v_cluster
, unsigned int *hole_len
,
425 struct ocfs2_extent_rec
*ret_rec
,
426 unsigned int *is_last
)
428 int i
, ret
, tree_height
, len
;
429 struct ocfs2_dinode
*di
;
430 struct ocfs2_extent_block
*uninitialized_var(eb
);
431 struct ocfs2_extent_list
*el
;
432 struct ocfs2_extent_rec
*rec
;
433 struct buffer_head
*eb_bh
= NULL
;
435 memset(ret_rec
, 0, sizeof(*ret_rec
));
439 di
= (struct ocfs2_dinode
*) di_bh
->b_data
;
440 el
= &di
->id2
.i_list
;
441 tree_height
= le16_to_cpu(el
->l_tree_depth
);
443 if (tree_height
> 0) {
444 ret
= ocfs2_find_leaf(inode
, el
, v_cluster
, &eb_bh
);
450 eb
= (struct ocfs2_extent_block
*) eb_bh
->b_data
;
453 if (el
->l_tree_depth
) {
454 ocfs2_error(inode
->i_sb
,
455 "Inode %lu has non zero tree depth in "
456 "leaf block %llu\n", inode
->i_ino
,
457 (unsigned long long)eb_bh
->b_blocknr
);
463 i
= ocfs2_search_extent_list(el
, v_cluster
);
466 * Holes can be larger than the maximum size of an
467 * extent, so we return their lengths in a seperate
471 ret
= ocfs2_figure_hole_clusters(inode
, el
, eb_bh
,
483 rec
= &el
->l_recs
[i
];
485 BUG_ON(v_cluster
< le32_to_cpu(rec
->e_cpos
));
488 ocfs2_error(inode
->i_sb
, "Inode %lu has bad extent "
489 "record (%u, %u, 0)", inode
->i_ino
,
490 le32_to_cpu(rec
->e_cpos
),
491 ocfs2_rec_clusters(el
, rec
));
499 * Checking for last extent is potentially expensive - we
500 * might have to look at the next leaf over to see if it's
503 * The first two checks are to see whether the caller even
504 * cares for this information, and if the extent is at least
505 * the last in it's list.
507 * If those hold true, then the extent is last if any of the
508 * additional conditions hold true:
509 * - Extent list is in-inode
510 * - Extent list is right-most
511 * - Extent list is 2nd to rightmost, with empty right-most
514 if (i
== (le16_to_cpu(el
->l_next_free_rec
) - 1)) {
515 if (tree_height
== 0)
517 else if (eb
->h_blkno
== di
->i_last_eb_blk
)
519 else if (eb
->h_next_leaf_blk
== di
->i_last_eb_blk
) {
520 ret
= ocfs2_last_eb_is_empty(inode
, di
);
/*
 * Translate a logical cluster that lies inside extent record rec into a
 * physical cluster and a remaining length.
 *
 * coff is the offset of v_cluster from the record's logical start
 * (e_cpos). *p_cluster is the record's starting block (e_blkno)
 * converted to clusters plus coff; *num_clusters is the record length
 * (e_leaf_clusters) minus coff, i.e. the clusters left in the record
 * from v_cluster onwards.
 *
 * NOTE(review): the v_cluster parameter declaration and one line just
 * before the *num_clusters store are missing from the reviewed text;
 * whether num_clusters may be NULL cannot be confirmed from here.
 */
538 static void ocfs2_relative_extent_offsets(struct super_block
*sb
,
540 struct ocfs2_extent_rec
*rec
,
541 u32
*p_cluster
, u32
*num_clusters
)
544 u32 coff
= v_cluster
- le32_to_cpu(rec
->e_cpos
);
546 *p_cluster
= ocfs2_blocks_to_clusters(sb
, le64_to_cpu(rec
->e_blkno
));
547 *p_cluster
= *p_cluster
+ coff
;
550 *num_clusters
= le16_to_cpu(rec
->e_leaf_clusters
) - coff
;
/*
 * Map logical cluster v_cluster to a physical cluster using the extent
 * list el supplied by the caller (the error messages identify it as an
 * xattr tree).
 *
 * The visible code:
 *   - when el has a non-zero tree depth, locates the leaf block with
 *     ocfs2_find_leaf() and reports via ocfs2_error() if a leaf list
 *     still has a non-zero l_tree_depth;
 *   - finds the record index with ocfs2_search_extent_list();
 *   - asserts v_cluster >= e_cpos for the record found and reports a
 *     "bad extent record" via ocfs2_error();
 *   - stores *p_cluster = clusters(e_blkno) + coff and
 *     *num_clusters = ocfs2_rec_clusters(el, rec) - coff, where
 *     coff = v_cluster - e_cpos.
 *
 * NOTE(review): lines are missing from the reviewed text (declarations
 * of ret, i and coff, branch conditions, error paths, the return value
 * and any release of eb_bh). Confirm against the complete file.
 */
553 int ocfs2_xattr_get_clusters(struct inode
*inode
, u32 v_cluster
,
554 u32
*p_cluster
, u32
*num_clusters
,
555 struct ocfs2_extent_list
*el
)
558 struct buffer_head
*eb_bh
= NULL
;
559 struct ocfs2_extent_block
*eb
;
560 struct ocfs2_extent_rec
*rec
;
563 if (el
->l_tree_depth
) {
564 ret
= ocfs2_find_leaf(inode
, el
, v_cluster
, &eb_bh
);
570 eb
= (struct ocfs2_extent_block
*) eb_bh
->b_data
;
573 if (el
->l_tree_depth
) {
574 ocfs2_error(inode
->i_sb
,
575 "Inode %lu has non zero tree depth in "
576 "xattr leaf block %llu\n", inode
->i_ino
,
577 (unsigned long long)eb_bh
->b_blocknr
);
583 i
= ocfs2_search_extent_list(el
, v_cluster
);
589 rec
= &el
->l_recs
[i
];
590 BUG_ON(v_cluster
< le32_to_cpu(rec
->e_cpos
));
593 ocfs2_error(inode
->i_sb
, "Inode %lu has bad extent "
594 "record (%u, %u, 0) in xattr", inode
->i_ino
,
595 le32_to_cpu(rec
->e_cpos
),
596 ocfs2_rec_clusters(el
, rec
));
600 coff
= v_cluster
- le32_to_cpu(rec
->e_cpos
);
601 *p_cluster
= ocfs2_blocks_to_clusters(inode
->i_sb
,
602 le64_to_cpu(rec
->e_blkno
));
603 *p_cluster
= *p_cluster
+ coff
;
605 *num_clusters
= ocfs2_rec_clusters(el
, rec
) - coff
;
/*
 * Map logical cluster v_cluster of inode to a physical cluster.
 *
 * The visible code:
 *   - tests the inode for OCFS2_INLINE_DATA_FL first;
 *   - tries the extent map cache via ocfs2_extent_map_lookup();
 *   - otherwise reads the inode block (ip_blkno) and calls
 *     ocfs2_get_clusters_nocache();
 *   - if the returned record has e_blkno == 0 a hole was found and
 *     *num_clusters receives the hole length;
 *   - otherwise computes *p_cluster / *num_clusters with
 *     ocfs2_relative_extent_offsets() and inserts the record into the
 *     cache with ocfs2_extent_map_insert_rec();
 *   - stores the extent flags into *extent_flags.
 *
 * NOTE(review): lines are missing from the reviewed text (the inline
 * data handling, checks on return codes and optional output pointers,
 * the hole value stored in *p_cluster, how flags is derived, the return
 * value and any release of di_bh). Confirm against the complete file.
 */
613 int ocfs2_get_clusters(struct inode
*inode
, u32 v_cluster
,
614 u32
*p_cluster
, u32
*num_clusters
,
615 unsigned int *extent_flags
)
618 unsigned int uninitialized_var(hole_len
), flags
= 0;
619 struct buffer_head
*di_bh
= NULL
;
620 struct ocfs2_extent_rec rec
;
622 if (OCFS2_I(inode
)->ip_dyn_features
& OCFS2_INLINE_DATA_FL
) {
628 ret
= ocfs2_extent_map_lookup(inode
, v_cluster
, p_cluster
,
629 num_clusters
, extent_flags
);
633 ret
= ocfs2_read_block(inode
, OCFS2_I(inode
)->ip_blkno
, &di_bh
);
639 ret
= ocfs2_get_clusters_nocache(inode
, di_bh
, v_cluster
, &hole_len
,
646 if (rec
.e_blkno
== 0ULL) {
648 * A hole was found. Return some canned values that
649 * callers can key on. If asked for, num_clusters will
650 * be populated with the size of the hole.
654 *num_clusters
= hole_len
;
657 ocfs2_relative_extent_offsets(inode
->i_sb
, v_cluster
, &rec
,
658 p_cluster
, num_clusters
);
661 ocfs2_extent_map_insert_rec(inode
, &rec
);
665 *extent_flags
= flags
;
/*
 * Block-granular wrapper around ocfs2_get_clusters(): translate virtual
 * block v_blkno into a physical block number.
 *
 * bpc is the number of blocks in one cluster. The visible code converts
 * v_blkno to its cluster (cpos), maps it with ocfs2_get_clusters(), then
 * converts the physical cluster back to blocks and adds the block's
 * offset within its cluster (v_blkno & (bpc - 1)) to form boff.
 * *ret_count is the cluster run length converted to blocks, minus that
 * same in-cluster offset.
 *
 * NOTE(review): lines are missing from the reviewed text (declarations
 * of ret and boff, the error path, the hole test on p_cluster, the
 * store to *p_blkno, any NULL check on ret_count, and the return
 * value). Confirm against the complete file.
 */
673 * This expects alloc_sem to be held. The allocation cannot change at
674 * all while the map is in the process of being updated.
676 int ocfs2_extent_map_get_blocks(struct inode
*inode
, u64 v_blkno
, u64
*p_blkno
,
677 u64
*ret_count
, unsigned int *extent_flags
)
680 int bpc
= ocfs2_clusters_to_blocks(inode
->i_sb
, 1);
681 u32 cpos
, num_clusters
, p_cluster
;
684 cpos
= ocfs2_blocks_to_clusters(inode
->i_sb
, v_blkno
);
686 ret
= ocfs2_get_clusters(inode
, cpos
, &p_cluster
, &num_clusters
,
694 * p_cluster == 0 indicates a hole.
697 boff
= ocfs2_clusters_to_blocks(inode
->i_sb
, p_cluster
);
698 boff
+= (v_blkno
& (u64
)(bpc
- 1));
704 *ret_count
= ocfs2_clusters_to_blocks(inode
->i_sb
, num_clusters
);
705 *ret_count
-= v_blkno
& (u64
)(bpc
- 1);
/*
 * Report the FIEMAP extent for an inode whose data is stored inline in
 * the inode block.
 *
 * The extent is flagged FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST.
 * When map_start lies below the inline data size field (id_count), one
 * extent is passed to fiemap_fill_next_extent() with logical offset 0,
 * length id_count and a physical byte address equal to the inode block
 * number shifted by the block size bits plus the offset of id_data
 * within struct ocfs2_dinode.
 *
 * NOTE(review): lines are missing from the reviewed text (the map_start
 * parameter declaration, declarations of ret and phys, the final
 * argument of fiemap_fill_next_extent(), and the return handling).
 * Confirm against the complete file.
 */
712 static int ocfs2_fiemap_inline(struct inode
*inode
, struct buffer_head
*di_bh
,
713 struct fiemap_extent_info
*fieinfo
,
717 unsigned int id_count
;
718 struct ocfs2_dinode
*di
;
720 u32 flags
= FIEMAP_EXTENT_DATA_INLINE
|FIEMAP_EXTENT_LAST
;
721 struct ocfs2_inode_info
*oi
= OCFS2_I(inode
);
723 di
= (struct ocfs2_dinode
*)di_bh
->b_data
;
724 id_count
= le16_to_cpu(di
->id2
.i_data
.id_count
);
726 if (map_start
< id_count
) {
727 phys
= oi
->ip_blkno
<< inode
->i_sb
->s_blocksize_bits
;
728 phys
+= offsetof(struct ocfs2_dinode
, id2
.i_data
.id_data
);
730 ret
= fiemap_fill_next_extent(fieinfo
, 0, phys
, id_count
,
/*
 * FIEMAP implementation for ocfs2. OCFS2_FIEMAP_FLAGS is the set of
 * request flags passed to fiemap_check_flags() (only FIEMAP_FLAG_SYNC).
 *
 * The visible code:
 *   - validates the request flags with fiemap_check_flags();
 *   - takes the inode lock with ocfs2_inode_lock() (which also supplies
 *     di_bh) and ip_alloc_sem for reading;
 *   - hands inline-data inodes to ocfs2_fiemap_inline();
 *   - otherwise converts map_start to a starting cluster (cpos) and the
 *     end of the range to mapping_end, then loops while
 *     cpos < mapping_end and the last extent has not been seen, fetching
 *     each record with ocfs2_get_clusters_nocache();
 *   - treats records with e_blkno == 0 as holes;
 *   - for real extents, sets FIEMAP_EXTENT_UNWRITTEN for
 *     OCFS2_EXT_UNWRITTEN records and FIEMAP_EXTENT_LAST, converts the
 *     record's length, physical start and logical start to bytes and
 *     reports them with fiemap_fill_next_extent(), then advances cpos
 *     to the end of the record;
 *   - finally drops ip_alloc_sem and the inode lock.
 *
 * NOTE(review): lines are missing from the reviewed text (declarations
 * of ret, is_last and fe_flags, return-code checks, how holes advance
 * cpos, the condition guarding FIEMAP_EXTENT_LAST, any adjustment of
 * mapping_end, labels, release of di_bh and the return value). Confirm
 * against the complete file.
 */
739 #define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
741 int ocfs2_fiemap(struct inode
*inode
, struct fiemap_extent_info
*fieinfo
,
742 u64 map_start
, u64 map_len
)
745 u32 mapping_end
, cpos
;
746 unsigned int hole_size
;
747 struct ocfs2_super
*osb
= OCFS2_SB(inode
->i_sb
);
748 u64 len_bytes
, phys_bytes
, virt_bytes
;
749 struct buffer_head
*di_bh
= NULL
;
750 struct ocfs2_extent_rec rec
;
752 ret
= fiemap_check_flags(fieinfo
, OCFS2_FIEMAP_FLAGS
);
756 ret
= ocfs2_inode_lock(inode
, &di_bh
, 0);
762 down_read(&OCFS2_I(inode
)->ip_alloc_sem
);
765 * Handle inline-data separately.
767 if (OCFS2_I(inode
)->ip_dyn_features
& OCFS2_INLINE_DATA_FL
) {
768 ret
= ocfs2_fiemap_inline(inode
, di_bh
, fieinfo
, map_start
);
772 cpos
= map_start
>> osb
->s_clustersize_bits
;
773 mapping_end
= ocfs2_clusters_for_bytes(inode
->i_sb
,
774 map_start
+ map_len
);
777 while (cpos
< mapping_end
&& !is_last
) {
780 ret
= ocfs2_get_clusters_nocache(inode
, di_bh
, cpos
,
781 &hole_size
, &rec
, &is_last
);
787 if (rec
.e_blkno
== 0ULL) {
793 if (rec
.e_flags
& OCFS2_EXT_UNWRITTEN
)
794 fe_flags
|= FIEMAP_EXTENT_UNWRITTEN
;
796 fe_flags
|= FIEMAP_EXTENT_LAST
;
/* Convert the record from cluster/block units to byte units for FIEMAP. */
797 len_bytes
= (u64
)le16_to_cpu(rec
.e_leaf_clusters
) << osb
->s_clustersize_bits
;
798 phys_bytes
= le64_to_cpu(rec
.e_blkno
) << osb
->sb
->s_blocksize_bits
;
799 virt_bytes
= (u64
)le32_to_cpu(rec
.e_cpos
) << osb
->s_clustersize_bits
;
801 ret
= fiemap_fill_next_extent(fieinfo
, virt_bytes
, phys_bytes
,
802 len_bytes
, fe_flags
);
/* Continue the scan from the first cluster after this record. */
806 cpos
= le32_to_cpu(rec
.e_cpos
)+ le16_to_cpu(rec
.e_leaf_clusters
);
815 up_read(&OCFS2_I(inode
)->ip_alloc_sem
);
817 ocfs2_inode_unlock(inode
, 0);