2 * Copyright (C) 2008 Oracle. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/blkdev.h>
22 #include <linux/list_sort.h>
24 #include "transaction.h"
27 #include "print-tree.h"
33 /* magic values for the inode_only field in btrfs_log_inode:
35 * LOG_INODE_ALL means to log everything
36 * LOG_INODE_EXISTS means to log just enough to recreate the inode
39 #define LOG_INODE_ALL 0
40 #define LOG_INODE_EXISTS 1
43 * directory trouble cases
45 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
46 * log, we must force a full commit before doing an fsync of the directory
47 * where the unlink was done.
48 * ---> record transid of last unlink/rename per directory
52 * rename foo/some_dir foo2/some_dir
54 * fsync foo/some_dir/some_file
56 * The fsync above will unlink the original some_dir without recording
57 * it in its new location (foo2). After a crash, some_dir will be gone
58 * unless the fsync of some_file forces a full commit
60 * 2) we must log any new names for any file or dir that is in the fsync
61 * log. ---> check inode while renaming/linking.
63 * 2a) we must log any new names for any file or dir during rename
64 * when the directory they are being removed from was logged.
65 * ---> check inode and old parent dir during rename
67 * 2a is actually the more important variant. With the extra logging
68 * a crash might unlink the old name without recreating the new one
70 * 3) after a crash, we must go through any directories with a link count
71 * of zero and redo the rm -rf
78 * The directory f1 was fully removed from the FS, but fsync was never
79 * called on f1, only its parent dir. After a crash the rm -rf must
80 * be replayed. This must be able to recurse down the entire
81 * directory tree. The inode link count fixup code takes care of the
86 * stages for the tree walking. The first
87 * stage (0) is to only pin down the blocks we find
88 * the second stage (1) is to make sure that all the inodes
89 * we find in the log are created in the subvolume.
91 * The last stage is to deal with directories and links and extents
92 * and all the other fun semantics
94 #define LOG_WALK_PIN_ONLY 0
95 #define LOG_WALK_REPLAY_INODES 1
96 #define LOG_WALK_REPLAY_ALL 2
98 static int btrfs_log_inode(struct btrfs_trans_handle
*trans
,
99 struct btrfs_root
*root
, struct inode
*inode
,
101 static int link_to_fixup_dir(struct btrfs_trans_handle
*trans
,
102 struct btrfs_root
*root
,
103 struct btrfs_path
*path
, u64 objectid
);
104 static noinline
int replay_dir_deletes(struct btrfs_trans_handle
*trans
,
105 struct btrfs_root
*root
,
106 struct btrfs_root
*log
,
107 struct btrfs_path
*path
,
108 u64 dirid
, int del_all
);
111 * tree logging is a special write ahead log used to make sure that
112 * fsyncs and O_SYNCs can happen without doing full tree commits.
114 * Full tree commits are expensive because they require commonly
115 * modified blocks to be recowed, creating many dirty pages in the
116 * extent tree and a 4x-6x higher write load than ext3.
118 * Instead of doing a tree commit on every fsync, we use the
119 * key ranges and transaction ids to find items for a given file or directory
120 * that have changed in this transaction. Those items are copied into
121 * a special tree (one per subvolume root), that tree is written to disk
122 * and then the fsync is considered complete.
124 * After a crash, items are copied out of the log-tree back into the
125 * subvolume tree. Any file data extents found are recorded in the extent
126 * allocation tree, and the log-tree freed.
128 * The log tree is read three times, once to pin down all the extents it is
129 * using in ram, once to create all the inodes logged in the tree
130 * and once to do all the other items.
134 * start a sub transaction and setup the log tree
135 * this increments the log tree writer count to make the people
136 * syncing the tree wait for us to finish
138 static int start_log_trans(struct btrfs_trans_handle
*trans
,
139 struct btrfs_root
*root
)
144 mutex_lock(&root
->log_mutex
);
145 if (root
->log_root
) {
146 if (!root
->log_start_pid
) {
147 root
->log_start_pid
= current
->pid
;
148 root
->log_multiple_pids
= false;
149 } else if (root
->log_start_pid
!= current
->pid
) {
150 root
->log_multiple_pids
= true;
153 atomic_inc(&root
->log_batch
);
154 atomic_inc(&root
->log_writers
);
155 mutex_unlock(&root
->log_mutex
);
158 root
->log_multiple_pids
= false;
159 root
->log_start_pid
= current
->pid
;
160 mutex_lock(&root
->fs_info
->tree_log_mutex
);
161 if (!root
->fs_info
->log_root_tree
) {
162 ret
= btrfs_init_log_root_tree(trans
, root
->fs_info
);
166 if (err
== 0 && !root
->log_root
) {
167 ret
= btrfs_add_log_tree(trans
, root
);
171 mutex_unlock(&root
->fs_info
->tree_log_mutex
);
172 atomic_inc(&root
->log_batch
);
173 atomic_inc(&root
->log_writers
);
174 mutex_unlock(&root
->log_mutex
);
179 * returns 0 if there was a log transaction running and we were able
180 * to join, or returns -ENOENT if there were no transactions
183 static int join_running_log_trans(struct btrfs_root
*root
)
191 mutex_lock(&root
->log_mutex
);
192 if (root
->log_root
) {
194 atomic_inc(&root
->log_writers
);
196 mutex_unlock(&root
->log_mutex
);
201 * This either makes the current running log transaction wait
202 * until you call btrfs_end_log_trans() or it makes any future
203 * log transactions wait until you call btrfs_end_log_trans()
205 int btrfs_pin_log_trans(struct btrfs_root
*root
)
209 mutex_lock(&root
->log_mutex
);
210 atomic_inc(&root
->log_writers
);
211 mutex_unlock(&root
->log_mutex
);
216 * indicate we're done making changes to the log tree
217 * and wake up anyone waiting to do a sync
219 void btrfs_end_log_trans(struct btrfs_root
*root
)
221 if (atomic_dec_and_test(&root
->log_writers
)) {
223 if (waitqueue_active(&root
->log_writer_wait
))
224 wake_up(&root
->log_writer_wait
);
230 * the walk control struct is used to pass state down the chain when
231 * processing the log tree. The stage field tells us which part
232 * of the log tree processing we are currently doing. The others
233 * are state fields used for that specific part
235 struct walk_control
{
236 /* should we free the extent on disk when done? This is used
237 * at transaction commit time while freeing a log tree
241 /* should we write out the extent buffer? This is used
242 * while flushing the log tree to disk during a sync
246 /* should we wait for the extent buffer io to finish? Also used
247 * while flushing the log tree to disk for a sync
251 /* pin only walk, we record which extents on disk belong to the
256 /* what stage of the replay code we're currently in */
259 /* the root we are currently replaying */
260 struct btrfs_root
*replay_dest
;
262 /* the trans handle for the current replay */
263 struct btrfs_trans_handle
*trans
;
265 /* the function that gets used to process blocks we find in the
266 * tree. Note the extent_buffer might not be up to date when it is
267 * passed in, and it must be checked or read if you need the data
270 int (*process_func
)(struct btrfs_root
*log
, struct extent_buffer
*eb
,
271 struct walk_control
*wc
, u64 gen
);
275 * process_func used to pin down extents, write them or wait on them
277 static int process_one_buffer(struct btrfs_root
*log
,
278 struct extent_buffer
*eb
,
279 struct walk_control
*wc
, u64 gen
)
284 * If this fs is mixed then we need to be able to process the leaves to
285 * pin down any logged extents, so we have to read the block.
287 if (btrfs_fs_incompat(log
->fs_info
, MIXED_GROUPS
)) {
288 ret
= btrfs_read_buffer(eb
, gen
);
294 ret
= btrfs_pin_extent_for_log_replay(log
->fs_info
->extent_root
,
297 if (!ret
&& btrfs_buffer_uptodate(eb
, gen
, 0)) {
298 if (wc
->pin
&& btrfs_header_level(eb
) == 0)
299 ret
= btrfs_exclude_logged_extents(log
, eb
);
301 btrfs_write_tree_block(eb
);
303 btrfs_wait_tree_block_writeback(eb
);
309 * Item overwrite used by replay and tree logging. eb, slot and key all refer
310 * to the src data we are copying out.
312 * root is the tree we are copying into, and path is a scratch
313 * path for use in this function (it should be released on entry and
314 * will be released on exit).
316 * If the key is already in the destination tree the existing item is
317 * overwritten. If the existing item isn't big enough, it is extended.
318 * If it is too large, it is truncated.
320 * If the key isn't in the destination yet, a new item is inserted.
322 static noinline
int overwrite_item(struct btrfs_trans_handle
*trans
,
323 struct btrfs_root
*root
,
324 struct btrfs_path
*path
,
325 struct extent_buffer
*eb
, int slot
,
326 struct btrfs_key
*key
)
330 u64 saved_i_size
= 0;
331 int save_old_i_size
= 0;
332 unsigned long src_ptr
;
333 unsigned long dst_ptr
;
334 int overwrite_root
= 0;
335 bool inode_item
= key
->type
== BTRFS_INODE_ITEM_KEY
;
337 if (root
->root_key
.objectid
!= BTRFS_TREE_LOG_OBJECTID
)
340 item_size
= btrfs_item_size_nr(eb
, slot
);
341 src_ptr
= btrfs_item_ptr_offset(eb
, slot
);
343 /* look for the key in the destination tree */
344 ret
= btrfs_search_slot(NULL
, root
, key
, path
, 0, 0);
351 u32 dst_size
= btrfs_item_size_nr(path
->nodes
[0],
353 if (dst_size
!= item_size
)
356 if (item_size
== 0) {
357 btrfs_release_path(path
);
360 dst_copy
= kmalloc(item_size
, GFP_NOFS
);
361 src_copy
= kmalloc(item_size
, GFP_NOFS
);
362 if (!dst_copy
|| !src_copy
) {
363 btrfs_release_path(path
);
369 read_extent_buffer(eb
, src_copy
, src_ptr
, item_size
);
371 dst_ptr
= btrfs_item_ptr_offset(path
->nodes
[0], path
->slots
[0]);
372 read_extent_buffer(path
->nodes
[0], dst_copy
, dst_ptr
,
374 ret
= memcmp(dst_copy
, src_copy
, item_size
);
379 * they have the same contents, just return, this saves
380 * us from cowing blocks in the destination tree and doing
381 * extra writes that may not have been done by a previous
385 btrfs_release_path(path
);
390 * We need to load the old nbytes into the inode so when we
391 * replay the extents we've logged we get the right nbytes.
394 struct btrfs_inode_item
*item
;
397 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
398 struct btrfs_inode_item
);
399 nbytes
= btrfs_inode_nbytes(path
->nodes
[0], item
);
400 item
= btrfs_item_ptr(eb
, slot
,
401 struct btrfs_inode_item
);
402 btrfs_set_inode_nbytes(eb
, item
, nbytes
);
404 } else if (inode_item
) {
405 struct btrfs_inode_item
*item
;
408 * New inode, set nbytes to 0 so that the nbytes comes out
409 * properly when we replay the extents.
411 item
= btrfs_item_ptr(eb
, slot
, struct btrfs_inode_item
);
412 btrfs_set_inode_nbytes(eb
, item
, 0);
415 btrfs_release_path(path
);
416 /* try to insert the key into the destination tree */
417 ret
= btrfs_insert_empty_item(trans
, root
, path
,
420 /* make sure any existing item is the correct size */
421 if (ret
== -EEXIST
) {
423 found_size
= btrfs_item_size_nr(path
->nodes
[0],
425 if (found_size
> item_size
)
426 btrfs_truncate_item(root
, path
, item_size
, 1);
427 else if (found_size
< item_size
)
428 btrfs_extend_item(root
, path
,
429 item_size
- found_size
);
433 dst_ptr
= btrfs_item_ptr_offset(path
->nodes
[0],
436 /* don't overwrite an existing inode if the generation number
437 * was logged as zero. This is done when the tree logging code
438 * is just logging an inode to make sure it exists after recovery.
440 * Also, don't overwrite i_size on directories during replay.
441 * log replay inserts and removes directory items based on the
442 * state of the tree found in the subvolume, and i_size is modified
445 if (key
->type
== BTRFS_INODE_ITEM_KEY
&& ret
== -EEXIST
) {
446 struct btrfs_inode_item
*src_item
;
447 struct btrfs_inode_item
*dst_item
;
449 src_item
= (struct btrfs_inode_item
*)src_ptr
;
450 dst_item
= (struct btrfs_inode_item
*)dst_ptr
;
452 if (btrfs_inode_generation(eb
, src_item
) == 0)
455 if (overwrite_root
&&
456 S_ISDIR(btrfs_inode_mode(eb
, src_item
)) &&
457 S_ISDIR(btrfs_inode_mode(path
->nodes
[0], dst_item
))) {
459 saved_i_size
= btrfs_inode_size(path
->nodes
[0],
464 copy_extent_buffer(path
->nodes
[0], eb
, dst_ptr
,
467 if (save_old_i_size
) {
468 struct btrfs_inode_item
*dst_item
;
469 dst_item
= (struct btrfs_inode_item
*)dst_ptr
;
470 btrfs_set_inode_size(path
->nodes
[0], dst_item
, saved_i_size
);
473 /* make sure the generation is filled in */
474 if (key
->type
== BTRFS_INODE_ITEM_KEY
) {
475 struct btrfs_inode_item
*dst_item
;
476 dst_item
= (struct btrfs_inode_item
*)dst_ptr
;
477 if (btrfs_inode_generation(path
->nodes
[0], dst_item
) == 0) {
478 btrfs_set_inode_generation(path
->nodes
[0], dst_item
,
483 btrfs_mark_buffer_dirty(path
->nodes
[0]);
484 btrfs_release_path(path
);
489 * simple helper to read an inode off the disk from a given root
490 * This can only be called for subvolume roots and not for the log
492 static noinline
struct inode
*read_one_inode(struct btrfs_root
*root
,
495 struct btrfs_key key
;
498 key
.objectid
= objectid
;
499 key
.type
= BTRFS_INODE_ITEM_KEY
;
501 inode
= btrfs_iget(root
->fs_info
->sb
, &key
, root
, NULL
);
504 } else if (is_bad_inode(inode
)) {
511 /* replays a single extent in 'eb' at 'slot' with 'key' into the
512 * subvolume 'root'. path is released on entry and should be released
515 * extents in the log tree have not been allocated out of the extent
516 * tree yet. So, this completes the allocation, taking a reference
517 * as required if the extent already exists or creating a new extent
518 * if it isn't in the extent allocation tree yet.
520 * The extent is inserted into the file, dropping any existing extents
521 * from the file that overlap the new one.
523 static noinline
int replay_one_extent(struct btrfs_trans_handle
*trans
,
524 struct btrfs_root
*root
,
525 struct btrfs_path
*path
,
526 struct extent_buffer
*eb
, int slot
,
527 struct btrfs_key
*key
)
531 u64 start
= key
->offset
;
533 struct btrfs_file_extent_item
*item
;
534 struct inode
*inode
= NULL
;
538 item
= btrfs_item_ptr(eb
, slot
, struct btrfs_file_extent_item
);
539 found_type
= btrfs_file_extent_type(eb
, item
);
541 if (found_type
== BTRFS_FILE_EXTENT_REG
||
542 found_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
543 nbytes
= btrfs_file_extent_num_bytes(eb
, item
);
544 extent_end
= start
+ nbytes
;
547 * We don't add to the inode's nbytes if we are prealloc or a
550 if (btrfs_file_extent_disk_bytenr(eb
, item
) == 0)
552 } else if (found_type
== BTRFS_FILE_EXTENT_INLINE
) {
553 size
= btrfs_file_extent_inline_len(eb
, item
);
554 nbytes
= btrfs_file_extent_ram_bytes(eb
, item
);
555 extent_end
= ALIGN(start
+ size
, root
->sectorsize
);
561 inode
= read_one_inode(root
, key
->objectid
);
568 * first check to see if we already have this extent in the
569 * file. This must be done before the btrfs_drop_extents run
570 * so we don't try to drop this extent.
572 ret
= btrfs_lookup_file_extent(trans
, root
, path
, btrfs_ino(inode
),
576 (found_type
== BTRFS_FILE_EXTENT_REG
||
577 found_type
== BTRFS_FILE_EXTENT_PREALLOC
)) {
578 struct btrfs_file_extent_item cmp1
;
579 struct btrfs_file_extent_item cmp2
;
580 struct btrfs_file_extent_item
*existing
;
581 struct extent_buffer
*leaf
;
583 leaf
= path
->nodes
[0];
584 existing
= btrfs_item_ptr(leaf
, path
->slots
[0],
585 struct btrfs_file_extent_item
);
587 read_extent_buffer(eb
, &cmp1
, (unsigned long)item
,
589 read_extent_buffer(leaf
, &cmp2
, (unsigned long)existing
,
593 * we already have a pointer to this exact extent,
594 * we don't have to do anything
596 if (memcmp(&cmp1
, &cmp2
, sizeof(cmp1
)) == 0) {
597 btrfs_release_path(path
);
601 btrfs_release_path(path
);
603 /* drop any overlapping extents */
604 ret
= btrfs_drop_extents(trans
, root
, inode
, start
, extent_end
, 1);
608 if (found_type
== BTRFS_FILE_EXTENT_REG
||
609 found_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
611 unsigned long dest_offset
;
612 struct btrfs_key ins
;
614 ret
= btrfs_insert_empty_item(trans
, root
, path
, key
,
618 dest_offset
= btrfs_item_ptr_offset(path
->nodes
[0],
620 copy_extent_buffer(path
->nodes
[0], eb
, dest_offset
,
621 (unsigned long)item
, sizeof(*item
));
623 ins
.objectid
= btrfs_file_extent_disk_bytenr(eb
, item
);
624 ins
.offset
= btrfs_file_extent_disk_num_bytes(eb
, item
);
625 ins
.type
= BTRFS_EXTENT_ITEM_KEY
;
626 offset
= key
->offset
- btrfs_file_extent_offset(eb
, item
);
628 if (ins
.objectid
> 0) {
631 LIST_HEAD(ordered_sums
);
633 * is this extent already allocated in the extent
634 * allocation tree? If so, just add a reference
636 ret
= btrfs_lookup_extent(root
, ins
.objectid
,
639 ret
= btrfs_inc_extent_ref(trans
, root
,
640 ins
.objectid
, ins
.offset
,
641 0, root
->root_key
.objectid
,
642 key
->objectid
, offset
, 0);
647 * insert the extent pointer in the extent
650 ret
= btrfs_alloc_logged_file_extent(trans
,
651 root
, root
->root_key
.objectid
,
652 key
->objectid
, offset
, &ins
);
656 btrfs_release_path(path
);
658 if (btrfs_file_extent_compression(eb
, item
)) {
659 csum_start
= ins
.objectid
;
660 csum_end
= csum_start
+ ins
.offset
;
662 csum_start
= ins
.objectid
+
663 btrfs_file_extent_offset(eb
, item
);
664 csum_end
= csum_start
+
665 btrfs_file_extent_num_bytes(eb
, item
);
668 ret
= btrfs_lookup_csums_range(root
->log_root
,
669 csum_start
, csum_end
- 1,
673 while (!list_empty(&ordered_sums
)) {
674 struct btrfs_ordered_sum
*sums
;
675 sums
= list_entry(ordered_sums
.next
,
676 struct btrfs_ordered_sum
,
679 ret
= btrfs_csum_file_blocks(trans
,
680 root
->fs_info
->csum_root
,
682 list_del(&sums
->list
);
688 btrfs_release_path(path
);
690 } else if (found_type
== BTRFS_FILE_EXTENT_INLINE
) {
691 /* inline extents are easy, we just overwrite them */
692 ret
= overwrite_item(trans
, root
, path
, eb
, slot
, key
);
697 inode_add_bytes(inode
, nbytes
);
698 ret
= btrfs_update_inode(trans
, root
, inode
);
706 * when cleaning up conflicts between the directory names in the
707 * subvolume, directory names in the log and directory names in the
708 * inode back references, we may have to unlink inodes from directories.
710 * This is a helper function to do the unlink of a specific directory
713 static noinline
int drop_one_dir_item(struct btrfs_trans_handle
*trans
,
714 struct btrfs_root
*root
,
715 struct btrfs_path
*path
,
717 struct btrfs_dir_item
*di
)
722 struct extent_buffer
*leaf
;
723 struct btrfs_key location
;
726 leaf
= path
->nodes
[0];
728 btrfs_dir_item_key_to_cpu(leaf
, di
, &location
);
729 name_len
= btrfs_dir_name_len(leaf
, di
);
730 name
= kmalloc(name_len
, GFP_NOFS
);
734 read_extent_buffer(leaf
, name
, (unsigned long)(di
+ 1), name_len
);
735 btrfs_release_path(path
);
737 inode
= read_one_inode(root
, location
.objectid
);
743 ret
= link_to_fixup_dir(trans
, root
, path
, location
.objectid
);
747 ret
= btrfs_unlink_inode(trans
, root
, dir
, inode
, name
, name_len
);
750 btrfs_run_delayed_items(trans
, root
);
758 * helper function to see if a given name and sequence number found
759 * in an inode back reference are already in a directory and correctly
760 * point to this inode
762 static noinline
int inode_in_dir(struct btrfs_root
*root
,
763 struct btrfs_path
*path
,
764 u64 dirid
, u64 objectid
, u64 index
,
765 const char *name
, int name_len
)
767 struct btrfs_dir_item
*di
;
768 struct btrfs_key location
;
771 di
= btrfs_lookup_dir_index_item(NULL
, root
, path
, dirid
,
772 index
, name
, name_len
, 0);
773 if (di
&& !IS_ERR(di
)) {
774 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &location
);
775 if (location
.objectid
!= objectid
)
779 btrfs_release_path(path
);
781 di
= btrfs_lookup_dir_item(NULL
, root
, path
, dirid
, name
, name_len
, 0);
782 if (di
&& !IS_ERR(di
)) {
783 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, &location
);
784 if (location
.objectid
!= objectid
)
790 btrfs_release_path(path
);
795 * helper function to check a log tree for a named back reference in
796 * an inode. This is used to decide if a back reference that is
797 * found in the subvolume conflicts with what we find in the log.
799 * inode backreferences may have multiple refs in a single item,
800 * during replay we process one reference at a time, and we don't
801 * want to delete valid links to a file from the subvolume if that
802 * link is also in the log.
804 static noinline
int backref_in_log(struct btrfs_root
*log
,
805 struct btrfs_key
*key
,
807 char *name
, int namelen
)
809 struct btrfs_path
*path
;
810 struct btrfs_inode_ref
*ref
;
812 unsigned long ptr_end
;
813 unsigned long name_ptr
;
819 path
= btrfs_alloc_path();
823 ret
= btrfs_search_slot(NULL
, log
, key
, path
, 0, 0);
827 ptr
= btrfs_item_ptr_offset(path
->nodes
[0], path
->slots
[0]);
829 if (key
->type
== BTRFS_INODE_EXTREF_KEY
) {
830 if (btrfs_find_name_in_ext_backref(path
, ref_objectid
,
831 name
, namelen
, NULL
))
837 item_size
= btrfs_item_size_nr(path
->nodes
[0], path
->slots
[0]);
838 ptr_end
= ptr
+ item_size
;
839 while (ptr
< ptr_end
) {
840 ref
= (struct btrfs_inode_ref
*)ptr
;
841 found_name_len
= btrfs_inode_ref_name_len(path
->nodes
[0], ref
);
842 if (found_name_len
== namelen
) {
843 name_ptr
= (unsigned long)(ref
+ 1);
844 ret
= memcmp_extent_buffer(path
->nodes
[0], name
,
851 ptr
= (unsigned long)(ref
+ 1) + found_name_len
;
854 btrfs_free_path(path
);
/*
 * Helper for add_inode_ref(): clear out anything in the subvolume 'root'
 * that conflicts with the back reference (name/namelen, ref_index) being
 * replayed for inode_objectid under parent_objectid.
 *
 * It works in three passes over the subvolume tree:
 *  - old style INODE_REF item keyed (inode_objectid, parent_objectid):
 *    every name stored in it that backref_in_log() does not find in
 *    log_root is unlinked from 'dir';
 *  - extended refs (INODE_EXTREF): the same check for every entry whose
 *    parent matches parent_objectid, unlinking from the parent inode that
 *    is read with read_one_inode();
 *  - directory items: an entry in 'dir' using the same index (ref_index)
 *    or the same name is removed with drop_one_dir_item().
 *
 * 'path' is scratch space and is released between the passes.
 */
858 static inline int __add_inode_ref(struct btrfs_trans_handle
*trans
,
859 struct btrfs_root
*root
,
860 struct btrfs_path
*path
,
861 struct btrfs_root
*log_root
,
862 struct inode
*dir
, struct inode
*inode
,
863 struct extent_buffer
*eb
,
864 u64 inode_objectid
, u64 parent_objectid
,
865 u64 ref_index
, char *name
, int namelen
,
871 struct extent_buffer
*leaf
;
872 struct btrfs_dir_item
*di
;
873 struct btrfs_key search_key
;
874 struct btrfs_inode_extref
*extref
;
877 /* Search old style refs */
878 search_key
.objectid
= inode_objectid
;
879 search_key
.type
= BTRFS_INODE_REF_KEY
;
880 search_key
.offset
= parent_objectid
;
881 ret
= btrfs_search_slot(NULL
, root
, &search_key
, path
, 0, 0);
883 struct btrfs_inode_ref
*victim_ref
;
885 unsigned long ptr_end
;
887 leaf
= path
->nodes
[0];
889 /* are we trying to overwrite a back ref for the root directory
890 * if so, just jump out, we're done
892 if (search_key
.objectid
== search_key
.offset
)
895 /* check all the names in this back reference to see
896 * if they are in the log. if so, we allow them to stay
897 * otherwise they must be unlinked as a conflict
899 ptr
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
900 ptr_end
= ptr
+ btrfs_item_size_nr(leaf
, path
->slots
[0]);
901 while (ptr
< ptr_end
) {
902 victim_ref
= (struct btrfs_inode_ref
*)ptr
;
903 victim_name_len
= btrfs_inode_ref_name_len(leaf
,
905 victim_name
= kmalloc(victim_name_len
, GFP_NOFS
);
909 read_extent_buffer(leaf
, victim_name
,
910 (unsigned long)(victim_ref
+ 1),
913 if (!backref_in_log(log_root
, &search_key
,
917 btrfs_inc_nlink(inode
);
918 btrfs_release_path(path
);
920 ret
= btrfs_unlink_inode(trans
, root
, dir
,
926 btrfs_run_delayed_items(trans
, root
);
932 ptr
= (unsigned long)(victim_ref
+ 1) + victim_name_len
;
936 * NOTE: we have searched root tree and checked the
937 * corresponding ref, it does not need to check again.
941 btrfs_release_path(path
);
943 /* Same search but for extended refs */
944 extref
= btrfs_lookup_inode_extref(NULL
, root
, path
, name
, namelen
,
945 inode_objectid
, parent_objectid
, 0,
947 if (!IS_ERR_OR_NULL(extref
)) {
951 struct inode
*victim_parent
;
953 leaf
= path
->nodes
[0];
955 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
956 base
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
958 while (cur_offset
< item_size
) {
/*
 * base is the item's byte offset (from btrfs_item_ptr_offset()) and
 * cur_offset is a byte count: it advances by name length plus
 * sizeof(*extref) at the bottom of the loop.  Add the two first and
 * cast the sum.  Casting base before the addition would scale
 * cur_offset by sizeof(struct btrfs_inode_extref) and point at the
 * wrong place for every entry after the first one.  This matches the
 * "(ptr + cur_offset)" form used by count_inode_extrefs().
 */
959 extref
= (struct btrfs_inode_extref
*)(base
+ cur_offset)
;
961 victim_name_len
= btrfs_inode_extref_name_len(leaf
, extref
);
963 if (btrfs_inode_extref_parent(leaf
, extref
) != parent_objectid
)
966 victim_name
= kmalloc(victim_name_len
, GFP_NOFS
);
969 read_extent_buffer(leaf
, victim_name
, (unsigned long)&extref
->name
,
972 search_key
.objectid
= inode_objectid
;
973 search_key
.type
= BTRFS_INODE_EXTREF_KEY
;
974 search_key
.offset
= btrfs_extref_hash(parent_objectid
,
978 if (!backref_in_log(log_root
, &search_key
,
979 parent_objectid
, victim_name
,
982 victim_parent
= read_one_inode(root
,
985 btrfs_inc_nlink(inode
);
986 btrfs_release_path(path
);
988 ret
= btrfs_unlink_inode(trans
, root
,
993 btrfs_run_delayed_items(trans
, root
);
1006 cur_offset
+= victim_name_len
+ sizeof(*extref
);
1010 btrfs_release_path(path
);
1012 /* look for a conflicting sequence number */
1013 di
= btrfs_lookup_dir_index_item(trans
, root
, path
, btrfs_ino(dir
),
1014 ref_index
, name
, namelen
, 0);
1015 if (di
&& !IS_ERR(di
)) {
1016 ret
= drop_one_dir_item(trans
, root
, path
, dir
, di
);
1020 btrfs_release_path(path
);
1022 /* look for a conflicting name */
1023 di
= btrfs_lookup_dir_item(trans
, root
, path
, btrfs_ino(dir
),
1025 if (di
&& !IS_ERR(di
)) {
1026 ret
= drop_one_dir_item(trans
, root
, path
, dir
, di
);
1030 btrfs_release_path(path
);
1035 static int extref_get_fields(struct extent_buffer
*eb
, unsigned long ref_ptr
,
1036 u32
*namelen
, char **name
, u64
*index
,
1037 u64
*parent_objectid
)
1039 struct btrfs_inode_extref
*extref
;
1041 extref
= (struct btrfs_inode_extref
*)ref_ptr
;
1043 *namelen
= btrfs_inode_extref_name_len(eb
, extref
);
1044 *name
= kmalloc(*namelen
, GFP_NOFS
);
1048 read_extent_buffer(eb
, *name
, (unsigned long)&extref
->name
,
1051 *index
= btrfs_inode_extref_index(eb
, extref
);
1052 if (parent_objectid
)
1053 *parent_objectid
= btrfs_inode_extref_parent(eb
, extref
);
1058 static int ref_get_fields(struct extent_buffer
*eb
, unsigned long ref_ptr
,
1059 u32
*namelen
, char **name
, u64
*index
)
1061 struct btrfs_inode_ref
*ref
;
1063 ref
= (struct btrfs_inode_ref
*)ref_ptr
;
1065 *namelen
= btrfs_inode_ref_name_len(eb
, ref
);
1066 *name
= kmalloc(*namelen
, GFP_NOFS
);
1070 read_extent_buffer(eb
, *name
, (unsigned long)(ref
+ 1), *namelen
);
1072 *index
= btrfs_inode_ref_index(eb
, ref
);
1078 * replay one inode back reference item found in the log tree.
1079 * eb, slot and key refer to the buffer and key found in the log tree.
1080 * root is the destination we are replaying into, and path is for temp
1081 * use by this function. (it should be released on return).
1083 static noinline
int add_inode_ref(struct btrfs_trans_handle
*trans
,
1084 struct btrfs_root
*root
,
1085 struct btrfs_root
*log
,
1086 struct btrfs_path
*path
,
1087 struct extent_buffer
*eb
, int slot
,
1088 struct btrfs_key
*key
)
1091 struct inode
*inode
;
1092 unsigned long ref_ptr
;
1093 unsigned long ref_end
;
1097 int search_done
= 0;
1098 int log_ref_ver
= 0;
1099 u64 parent_objectid
;
1102 int ref_struct_size
;
1104 ref_ptr
= btrfs_item_ptr_offset(eb
, slot
);
1105 ref_end
= ref_ptr
+ btrfs_item_size_nr(eb
, slot
);
1107 if (key
->type
== BTRFS_INODE_EXTREF_KEY
) {
1108 struct btrfs_inode_extref
*r
;
1110 ref_struct_size
= sizeof(struct btrfs_inode_extref
);
1112 r
= (struct btrfs_inode_extref
*)ref_ptr
;
1113 parent_objectid
= btrfs_inode_extref_parent(eb
, r
);
1115 ref_struct_size
= sizeof(struct btrfs_inode_ref
);
1116 parent_objectid
= key
->offset
;
1118 inode_objectid
= key
->objectid
;
1121 * it is possible that we didn't log all the parent directories
1122 * for a given inode. If we don't find the dir, just don't
1123 * copy the back ref in. The link count fixup code will take
1126 dir
= read_one_inode(root
, parent_objectid
);
1130 inode
= read_one_inode(root
, inode_objectid
);
1136 while (ref_ptr
< ref_end
) {
1138 ret
= extref_get_fields(eb
, ref_ptr
, &namelen
, &name
,
1139 &ref_index
, &parent_objectid
);
1141 * parent object can change from one array
1145 dir
= read_one_inode(root
, parent_objectid
);
1149 ret
= ref_get_fields(eb
, ref_ptr
, &namelen
, &name
,
1155 /* if we already have a perfect match, we're done */
1156 if (!inode_in_dir(root
, path
, btrfs_ino(dir
), btrfs_ino(inode
),
1157 ref_index
, name
, namelen
)) {
1159 * look for a conflicting back reference in the
1160 * metadata. if we find one we have to unlink that name
1161 * of the file before we add our new link. Later on, we
1162 * overwrite any existing back reference, and we don't
1163 * want to create dangling pointers in the directory.
1167 ret
= __add_inode_ref(trans
, root
, path
, log
,
1171 ref_index
, name
, namelen
,
1181 /* insert our name */
1182 ret
= btrfs_add_link(trans
, dir
, inode
, name
, namelen
,
1187 btrfs_update_inode(trans
, root
, inode
);
1190 ref_ptr
= (unsigned long)(ref_ptr
+ ref_struct_size
) + namelen
;
1198 /* finally write the back reference in the inode */
1199 ret
= overwrite_item(trans
, root
, path
, eb
, slot
, key
);
1201 btrfs_release_path(path
);
1207 static int insert_orphan_item(struct btrfs_trans_handle
*trans
,
1208 struct btrfs_root
*root
, u64 offset
)
1211 ret
= btrfs_find_orphan_item(root
, offset
);
1213 ret
= btrfs_insert_orphan_item(trans
, root
, offset
);
1217 static int count_inode_extrefs(struct btrfs_root
*root
,
1218 struct inode
*inode
, struct btrfs_path
*path
)
1222 unsigned int nlink
= 0;
1225 u64 inode_objectid
= btrfs_ino(inode
);
1228 struct btrfs_inode_extref
*extref
;
1229 struct extent_buffer
*leaf
;
1232 ret
= btrfs_find_one_extref(root
, inode_objectid
, offset
, path
,
1237 leaf
= path
->nodes
[0];
1238 item_size
= btrfs_item_size_nr(leaf
, path
->slots
[0]);
1239 ptr
= btrfs_item_ptr_offset(leaf
, path
->slots
[0]);
1241 while (cur_offset
< item_size
) {
1242 extref
= (struct btrfs_inode_extref
*) (ptr
+ cur_offset
);
1243 name_len
= btrfs_inode_extref_name_len(leaf
, extref
);
1247 cur_offset
+= name_len
+ sizeof(*extref
);
1251 btrfs_release_path(path
);
1253 btrfs_release_path(path
);
1260 static int count_inode_refs(struct btrfs_root
*root
,
1261 struct inode
*inode
, struct btrfs_path
*path
)
1264 struct btrfs_key key
;
1265 unsigned int nlink
= 0;
1267 unsigned long ptr_end
;
1269 u64 ino
= btrfs_ino(inode
);
1272 key
.type
= BTRFS_INODE_REF_KEY
;
1273 key
.offset
= (u64
)-1;
1276 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1280 if (path
->slots
[0] == 0)
1284 btrfs_item_key_to_cpu(path
->nodes
[0], &key
,
1286 if (key
.objectid
!= ino
||
1287 key
.type
!= BTRFS_INODE_REF_KEY
)
1289 ptr
= btrfs_item_ptr_offset(path
->nodes
[0], path
->slots
[0]);
1290 ptr_end
= ptr
+ btrfs_item_size_nr(path
->nodes
[0],
1292 while (ptr
< ptr_end
) {
1293 struct btrfs_inode_ref
*ref
;
1295 ref
= (struct btrfs_inode_ref
*)ptr
;
1296 name_len
= btrfs_inode_ref_name_len(path
->nodes
[0],
1298 ptr
= (unsigned long)(ref
+ 1) + name_len
;
1302 if (key
.offset
== 0)
1305 btrfs_release_path(path
);
1307 btrfs_release_path(path
);
1313 * There are a few corners where the link count of the file can't
1314 * be properly maintained during replay. So, instead of adding
1315 * lots of complexity to the log code, we just scan the backrefs
1316 * for any file that has been through replay.
1318 * The scan will update the link count on the inode to reflect the
1319 * number of back refs found. If it goes down to zero, the iput
1320 * will free the inode.
/*
 * Recompute the link count of @inode from its back references.
 *
 * Calls count_inode_refs() and count_inode_extrefs() on @root; when the
 * resulting nlink differs from inode->i_nlink the inode is updated with
 * set_nlink() and written back through btrfs_update_inode().  The
 * index_cnt field of the btrfs inode is reset to (u64)-1.  For an inode
 * whose i_nlink is 0, replay_dir_deletes() is called when it is a
 * directory and insert_orphan_item() is called for its inode number.
 * The path allocated here is freed with btrfs_free_path().
 *
 * NOTE(review): the return value of btrfs_update_inode() is not captured
 * at the call site below, so a failed inode update is not reported to
 * the caller -- worth confirming whether that is intentional.
 */
1322 static noinline
int fixup_inode_link_count(struct btrfs_trans_handle
*trans
,
1323 struct btrfs_root
*root
,
1324 struct inode
*inode
)
1326 struct btrfs_path
*path
;
1329 u64 ino
= btrfs_ino(inode
);
1331 path
= btrfs_alloc_path();
1335 ret
= count_inode_refs(root
, inode
, path
);
1341 ret
= count_inode_extrefs(root
, inode
, path
);
1352 if (nlink
!= inode
->i_nlink
) {
1353 set_nlink(inode
, nlink
);
1354 btrfs_update_inode(trans
, root
, inode
);
1356 BTRFS_I(inode
)->index_cnt
= (u64
)-1;
1358 if (inode
->i_nlink
== 0) {
1359 if (S_ISDIR(inode
->i_mode
)) {
1360 ret
= replay_dir_deletes(trans
, root
, NULL
, path
,
1365 ret
= insert_orphan_item(trans
, root
, ino
);
1369 btrfs_free_path(path
);
/*
 * Process the inodes recorded under BTRFS_TREE_LOG_FIXUP_OBJECTID.
 *
 * Searches @root for the BTRFS_ORPHAN_ITEM_KEY entry of the fixup
 * objectid starting from offset (u64)-1, deletes the item found with
 * btrfs_del_item(), reads the inode whose number is stored in the key
 * offset and hands it to fixup_inode_link_count().  The search offset is
 * then reset to (u64)-1 so the highest remaining entry is looked up
 * again, and @path is released with btrfs_release_path().
 *
 * NOTE(review): the loop construct, exit conditions and error handling
 * are not visible in this listing; repetition is inferred from the
 * offset reset.
 */
1373 static noinline
int fixup_inode_link_counts(struct btrfs_trans_handle
*trans
,
1374 struct btrfs_root
*root
,
1375 struct btrfs_path
*path
)
1378 struct btrfs_key key
;
1379 struct inode
*inode
;
1381 key
.objectid
= BTRFS_TREE_LOG_FIXUP_OBJECTID
;
1382 key
.type
= BTRFS_ORPHAN_ITEM_KEY
;
1383 key
.offset
= (u64
)-1;
1385 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
1390 if (path
->slots
[0] == 0)
1395 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1396 if (key
.objectid
!= BTRFS_TREE_LOG_FIXUP_OBJECTID
||
1397 key
.type
!= BTRFS_ORPHAN_ITEM_KEY
)
1400 ret
= btrfs_del_item(trans
, root
, path
);
1404 btrfs_release_path(path
);
1405 inode
= read_one_inode(root
, key
.offset
);
1409 ret
= fixup_inode_link_count(trans
, root
, inode
);
1415 * fixup on a directory may create new entries,
1416 * make sure we always look for the highset possible
1419 key
.offset
= (u64
)-1;
1423 btrfs_release_path(path
);
1429 * record a given inode in the fixup dir so we can check its link
1430 * count when replay is done. The link count is incremented here
1431 * so the inode won't go away until we check it
/*
 * Record an inode number under BTRFS_TREE_LOG_FIXUP_OBJECTID.
 *
 * Reads the inode with read_one_inode(), then inserts an empty item
 * keyed (BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, objectid)
 * into @root and releases @path.  The link count is raised --
 * set_nlink(inode, 1) when it was zero, btrfs_inc_nlink() presumably on
 * the else branch -- and the inode is written back with
 * btrfs_update_inode().  A result of -EEXIST has its own branch; the
 * remaining branch reaches BUG().
 *
 * NOTE(review): BUG() on an unexpected insert result halts the machine
 * during log replay; propagating the error may be preferable.
 */
1433 static noinline
int link_to_fixup_dir(struct btrfs_trans_handle
*trans
,
1434 struct btrfs_root
*root
,
1435 struct btrfs_path
*path
,
1438 struct btrfs_key key
;
1440 struct inode
*inode
;
1442 inode
= read_one_inode(root
, objectid
);
1446 key
.objectid
= BTRFS_TREE_LOG_FIXUP_OBJECTID
;
1447 btrfs_set_key_type(&key
, BTRFS_ORPHAN_ITEM_KEY
);
1448 key
.offset
= objectid
;
1450 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
, 0);
1452 btrfs_release_path(path
);
1454 if (!inode
->i_nlink
)
1455 set_nlink(inode
, 1);
1457 btrfs_inc_nlink(inode
);
1458 ret
= btrfs_update_inode(trans
, root
, inode
);
1459 } else if (ret
== -EEXIST
) {
1462 BUG(); /* Logic Error */
1470 * when replaying the log for a directory, we only insert names
1471 * for inodes that actually exist. This means an fsync on a directory
1472 * does not implicitly fsync all the new files in it
/*
 * Add one directory entry linking an inode into directory @dirid.
 *
 * Looks up the target inode (location->objectid) and the parent
 * directory (@dirid) with read_one_inode(), then links them with
 * btrfs_add_link(trans, dir, inode, name, name_len, 1, index).
 *
 * NOTE(review): @path and @type are not referenced in the visible lines,
 * and the handling of a missing inode/dir as well as the dropping of the
 * inode references are not visible either -- confirm against the full
 * function.
 */
1474 static noinline
int insert_one_name(struct btrfs_trans_handle
*trans
,
1475 struct btrfs_root
*root
,
1476 struct btrfs_path
*path
,
1477 u64 dirid
, u64 index
,
1478 char *name
, int name_len
, u8 type
,
1479 struct btrfs_key
*location
)
1481 struct inode
*inode
;
1485 inode
= read_one_inode(root
, location
->objectid
);
1489 dir
= read_one_inode(root
, dirid
);
1494 ret
= btrfs_add_link(trans
, dir
, inode
, name
, name_len
, 1, index
);
1496 /* FIXME, put inode into FIXUP list */
1504 * take a single entry in a log directory item and replay it into
1507 * if a conflicting item exists in the subdirectory already,
1508 * the inode it points to is unlinked and put into the link count
1511 * If a name from the log points to a file or directory that does
1512 * not exist in the FS, it is skipped. fsyncs on directories
1513 * do not force down inodes inside that directory, just changes to the
1514 * names or unlinks in a directory.
/*
 * Replay a single logged directory entry (@di inside @eb) into @root.
 *
 * The parent directory is read from key->objectid and the entry name is
 * copied out of @eb into a kmalloc(GFP_NOFS) buffer.  The result of
 * btrfs_lookup_inode() on the entry's location key is kept in "exists".
 * The destination tree is then probed with btrfs_lookup_dir_item() or
 * btrfs_lookup_dir_index_item() depending on key->type.  When nothing is
 * found (IS_ERR_OR_NULL), key types other than BTRFS_DIR_INDEX_KEY are
 * filtered out before insertion.  When an entry is found and its
 * location key and dir type equal the logged ones it is treated as
 * matching; a conflicting entry is removed with drop_one_dir_item().
 * Insertion goes through insert_one_name(), and -ENOENT is excluded from
 * the error check that follows it.
 *
 * NOTE(review): branch targets, the freeing of "name", the release of
 * "dir" and the exact use of "exists" are not visible in this listing.
 */
1516 static noinline
int replay_one_name(struct btrfs_trans_handle
*trans
,
1517 struct btrfs_root
*root
,
1518 struct btrfs_path
*path
,
1519 struct extent_buffer
*eb
,
1520 struct btrfs_dir_item
*di
,
1521 struct btrfs_key
*key
)
1525 struct btrfs_dir_item
*dst_di
;
1526 struct btrfs_key found_key
;
1527 struct btrfs_key log_key
;
1533 dir
= read_one_inode(root
, key
->objectid
);
1537 name_len
= btrfs_dir_name_len(eb
, di
);
1538 name
= kmalloc(name_len
, GFP_NOFS
);
1542 log_type
= btrfs_dir_type(eb
, di
);
1543 read_extent_buffer(eb
, name
, (unsigned long)(di
+ 1),
1546 btrfs_dir_item_key_to_cpu(eb
, di
, &log_key
);
1547 exists
= btrfs_lookup_inode(trans
, root
, path
, &log_key
, 0);
1552 btrfs_release_path(path
);
1554 if (key
->type
== BTRFS_DIR_ITEM_KEY
) {
1555 dst_di
= btrfs_lookup_dir_item(trans
, root
, path
, key
->objectid
,
1557 } else if (key
->type
== BTRFS_DIR_INDEX_KEY
) {
1558 dst_di
= btrfs_lookup_dir_index_item(trans
, root
, path
,
1567 if (IS_ERR_OR_NULL(dst_di
)) {
1568 /* we need a sequence number to insert, so we only
1569 * do inserts for the BTRFS_DIR_INDEX_KEY types
1571 if (key
->type
!= BTRFS_DIR_INDEX_KEY
)
1576 btrfs_dir_item_key_to_cpu(path
->nodes
[0], dst_di
, &found_key
);
1577 /* the existing item matches the logged item */
1578 if (found_key
.objectid
== log_key
.objectid
&&
1579 found_key
.type
== log_key
.type
&&
1580 found_key
.offset
== log_key
.offset
&&
1581 btrfs_dir_type(path
->nodes
[0], dst_di
) == log_type
) {
1586 * don't drop the conflicting directory entry if the inode
1587 * for the new entry doesn't exist
1592 ret
= drop_one_dir_item(trans
, root
, path
, dir
, dst_di
);
1596 if (key
->type
== BTRFS_DIR_INDEX_KEY
)
1599 btrfs_release_path(path
);
1605 btrfs_release_path(path
);
1606 ret
= insert_one_name(trans
, root
, path
, key
->objectid
, key
->offset
,
1607 name
, name_len
, log_type
, &log_key
);
1608 if (ret
&& ret
!= -ENOENT
)
1615 * find all the names in a directory item and reconcile them into
1616 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1617 * one name in a directory item, but the same code gets used for
1618 * both directory index types
/*
 * Replay every name packed into the directory item at @slot of @eb.
 *
 * Iterates from the item's start offset up to start + item size,
 * treating each position as a struct btrfs_dir_item, checking it with
 * verify_dir_item() and passing it to replay_one_name() together with
 * @key.
 *
 * NOTE(review): the visible advance is ptr = (unsigned long)(di + 1);
 * skipping the name bytes (name_len is read above) is presumably done on
 * a line not shown in this listing.
 */
1620 static noinline
int replay_one_dir_item(struct btrfs_trans_handle
*trans
,
1621 struct btrfs_root
*root
,
1622 struct btrfs_path
*path
,
1623 struct extent_buffer
*eb
, int slot
,
1624 struct btrfs_key
*key
)
1627 u32 item_size
= btrfs_item_size_nr(eb
, slot
);
1628 struct btrfs_dir_item
*di
;
1631 unsigned long ptr_end
;
1633 ptr
= btrfs_item_ptr_offset(eb
, slot
);
1634 ptr_end
= ptr
+ item_size
;
1635 while (ptr
< ptr_end
) {
1636 di
= (struct btrfs_dir_item
*)ptr
;
1637 if (verify_dir_item(root
, eb
, di
))
1639 name_len
= btrfs_dir_name_len(eb
, di
);
1640 ret
= replay_one_name(trans
, root
, path
, eb
, di
, key
);
1643 ptr
= (unsigned long)(di
+ 1);
1650 * directory replay has two parts. There are the standard directory
1651 * items in the log copied from the subvolume, and range items
1652 * created in the log while the subvolume was logged.
1654 * The range items tell us which parts of the key space the log
1655 * is authoritative for. During replay, if a key in the subvolume
1656 * directory is in a logged range item, but not actually in the log
1657 * that means it was deleted from the directory before the fsync
1658 * and should be removed.
/*
 * Find the logged key range that covers or follows *start_ret.
 *
 * Searches @root read-only for (dirid, key_type, *start_ret).  If the
 * item examined has the requested type and objectid and the range
 * [key.offset, btrfs_dir_log_end()] contains *start_ret, that range is
 * returned through @start_ret / @end_ret.  Otherwise the next slot is
 * checked (moving on with btrfs_next_leaf() when the slot is past the
 * last item of the leaf) and, if it matches type and objectid, its range
 * is returned instead.  @path is released with btrfs_release_path().
 *
 * A *start_ret of (u64)-1 is special-cased at entry.
 *
 * NOTE(review): return codes and slot adjustments are not visible in
 * this listing.
 */
1660 static noinline
int find_dir_range(struct btrfs_root
*root
,
1661 struct btrfs_path
*path
,
1662 u64 dirid
, int key_type
,
1663 u64
*start_ret
, u64
*end_ret
)
1665 struct btrfs_key key
;
1667 struct btrfs_dir_log_item
*item
;
1671 if (*start_ret
== (u64
)-1)
1674 key
.objectid
= dirid
;
1675 key
.type
= key_type
;
1676 key
.offset
= *start_ret
;
1678 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
1682 if (path
->slots
[0] == 0)
1687 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1689 if (key
.type
!= key_type
|| key
.objectid
!= dirid
) {
1693 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
1694 struct btrfs_dir_log_item
);
1695 found_end
= btrfs_dir_log_end(path
->nodes
[0], item
);
1697 if (*start_ret
>= key
.offset
&& *start_ret
<= found_end
) {
1699 *start_ret
= key
.offset
;
1700 *end_ret
= found_end
;
1705 /* check the next slot in the tree to see if it is a valid item */
1706 nritems
= btrfs_header_nritems(path
->nodes
[0]);
1707 if (path
->slots
[0] >= nritems
) {
1708 ret
= btrfs_next_leaf(root
, path
);
1715 btrfs_item_key_to_cpu(path
->nodes
[0], &key
, path
->slots
[0]);
1717 if (key
.type
!= key_type
|| key
.objectid
!= dirid
) {
1721 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
1722 struct btrfs_dir_log_item
);
1723 found_end
= btrfs_dir_log_end(path
->nodes
[0], item
);
1724 *start_ret
= key
.offset
;
1725 *end_ret
= found_end
;
1728 btrfs_release_path(path
);
1733 * this looks for a given directory item in the log. If the directory
1734 * item is not in the log, the item is removed and the inode it points
/*
 * Compare the directory item @path points at with the contents of @log.
 *
 * Each name packed in the item is checked with verify_dir_item(), copied
 * into a kmalloc(GFP_NOFS) buffer and looked up in @log, by dir item or
 * by dir index depending on dir_key->type (both lookups are guarded by
 * @log being non-NULL).  When the lookup yields nothing
 * (IS_ERR_OR_NULL), both paths are released, the target inode is read,
 * recorded with link_to_fixup_dir(), given an extra link with
 * btrfs_inc_nlink() and removed from the directory with
 * btrfs_unlink_inode(), followed by btrfs_run_delayed_items().  The
 * directory key is then searched again because more names may remain
 * under the same key.
 *
 * NOTE(review): the return value of btrfs_run_delayed_items() is not
 * captured at the call below; freeing of the name buffer and release of
 * the inode reference are not visible in this listing.
 */
1737 static noinline
int check_item_in_log(struct btrfs_trans_handle
*trans
,
1738 struct btrfs_root
*root
,
1739 struct btrfs_root
*log
,
1740 struct btrfs_path
*path
,
1741 struct btrfs_path
*log_path
,
1743 struct btrfs_key
*dir_key
)
1746 struct extent_buffer
*eb
;
1749 struct btrfs_dir_item
*di
;
1750 struct btrfs_dir_item
*log_di
;
1753 unsigned long ptr_end
;
1755 struct inode
*inode
;
1756 struct btrfs_key location
;
1759 eb
= path
->nodes
[0];
1760 slot
= path
->slots
[0];
1761 item_size
= btrfs_item_size_nr(eb
, slot
);
1762 ptr
= btrfs_item_ptr_offset(eb
, slot
);
1763 ptr_end
= ptr
+ item_size
;
1764 while (ptr
< ptr_end
) {
1765 di
= (struct btrfs_dir_item
*)ptr
;
1766 if (verify_dir_item(root
, eb
, di
)) {
1771 name_len
= btrfs_dir_name_len(eb
, di
);
1772 name
= kmalloc(name_len
, GFP_NOFS
);
1777 read_extent_buffer(eb
, name
, (unsigned long)(di
+ 1),
1780 if (log
&& dir_key
->type
== BTRFS_DIR_ITEM_KEY
) {
1781 log_di
= btrfs_lookup_dir_item(trans
, log
, log_path
,
1784 } else if (log
&& dir_key
->type
== BTRFS_DIR_INDEX_KEY
) {
1785 log_di
= btrfs_lookup_dir_index_item(trans
, log
,
1791 if (IS_ERR_OR_NULL(log_di
)) {
1792 btrfs_dir_item_key_to_cpu(eb
, di
, &location
);
1793 btrfs_release_path(path
);
1794 btrfs_release_path(log_path
);
1795 inode
= read_one_inode(root
, location
.objectid
);
1801 ret
= link_to_fixup_dir(trans
, root
,
1802 path
, location
.objectid
);
1809 btrfs_inc_nlink(inode
);
1810 ret
= btrfs_unlink_inode(trans
, root
, dir
, inode
,
1813 btrfs_run_delayed_items(trans
, root
);
1819 /* there might still be more names under this key
1820 * check and repeat if required
1822 ret
= btrfs_search_slot(NULL
, root
, dir_key
, path
,
1829 btrfs_release_path(log_path
);
1832 ptr
= (unsigned long)(di
+ 1);
1837 btrfs_release_path(path
);
1838 btrfs_release_path(log_path
);
1843 * deletion replay happens before we copy any new directory items
1844 * out of the log or out of backreferences from inodes. It
1845 * scans the log to find ranges of keys that log is authoritative for,
1846 * and then scans the directory to find items in those ranges that are
1847 * not present in the log.
1849 * Anything we don't find in the log is unlinked and removed from the
/*
 * Remove entries of directory @dirid that are absent from the log.
 *
 * Ranges are obtained with find_dir_range() on @log.  For a range, the
 * keys of @root are walked with btrfs_search_slot() / btrfs_next_leaf();
 * an item that still belongs to @dirid with the current key type and
 * whose offset does not exceed range_end is passed to
 * check_item_in_log().  The pass starts with BTRFS_DIR_LOG_ITEM_KEY /
 * BTRFS_DIR_ITEM_KEY and is followed by a switch to
 * BTRFS_DIR_LOG_INDEX_KEY / BTRFS_DIR_INDEX_KEY.  A private log_path is
 * allocated for the log lookups and freed with btrfs_free_path().
 *
 * NOTE(review): how @del_all alters the range selection is not visible
 * in this listing (range_end = (u64)-1 is assigned on one branch), nor
 * are the loop constructs and labels.
 */
1852 static noinline
int replay_dir_deletes(struct btrfs_trans_handle
*trans
,
1853 struct btrfs_root
*root
,
1854 struct btrfs_root
*log
,
1855 struct btrfs_path
*path
,
1856 u64 dirid
, int del_all
)
1860 int key_type
= BTRFS_DIR_LOG_ITEM_KEY
;
1862 struct btrfs_key dir_key
;
1863 struct btrfs_key found_key
;
1864 struct btrfs_path
*log_path
;
1867 dir_key
.objectid
= dirid
;
1868 dir_key
.type
= BTRFS_DIR_ITEM_KEY
;
1869 log_path
= btrfs_alloc_path();
1873 dir
= read_one_inode(root
, dirid
);
1874 /* it isn't an error if the inode isn't there, that can happen
1875 * because we replay the deletes before we copy in the inode item
1879 btrfs_free_path(log_path
);
1887 range_end
= (u64
)-1;
1889 ret
= find_dir_range(log
, path
, dirid
, key_type
,
1890 &range_start
, &range_end
);
1895 dir_key
.offset
= range_start
;
1898 ret
= btrfs_search_slot(NULL
, root
, &dir_key
, path
,
1903 nritems
= btrfs_header_nritems(path
->nodes
[0]);
1904 if (path
->slots
[0] >= nritems
) {
1905 ret
= btrfs_next_leaf(root
, path
);
1909 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
1911 if (found_key
.objectid
!= dirid
||
1912 found_key
.type
!= dir_key
.type
)
1915 if (found_key
.offset
> range_end
)
1918 ret
= check_item_in_log(trans
, root
, log
, path
,
1923 if (found_key
.offset
== (u64
)-1)
1925 dir_key
.offset
= found_key
.offset
+ 1;
1927 btrfs_release_path(path
);
1928 if (range_end
== (u64
)-1)
1930 range_start
= range_end
+ 1;
1935 if (key_type
== BTRFS_DIR_LOG_ITEM_KEY
) {
1936 key_type
= BTRFS_DIR_LOG_INDEX_KEY
;
1937 dir_key
.type
= BTRFS_DIR_INDEX_KEY
;
1938 btrfs_release_path(path
);
1942 btrfs_release_path(path
);
1943 btrfs_free_path(log_path
);
1949 * the process_func used to replay items from the log tree. This
1950 * gets called in two different stages. The first stage just looks
1951 * for inodes and makes sure they are all copied into the subvolume.
1953 * The second stage copies all the other item types from the log into
1954 * the subvolume. The two stage approach is slower, but gets rid of
1955 * lots of complexity around inodes referencing other inodes that exist
1956 * only in the log (references come from either directory items or inode
/*
 * Replay the items of one log tree buffer @eb into wc->replay_dest.
 *
 * The buffer is read with btrfs_read_buffer(eb, gen) and its items are
 * visited in slot order.  In the LOG_WALK_REPLAY_INODES stage,
 * BTRFS_INODE_ITEM_KEY items are handled: directories get
 * replay_dir_deletes(), the item is copied with overwrite_item(),
 * regular files get insert_orphan_item(), and link_to_fixup_dir() is
 * called for the inode number.  A stage below LOG_WALK_REPLAY_ALL is
 * checked before the remaining key types: xattr items go through
 * overwrite_item(), inode refs/extrefs through add_inode_ref() (where
 * -ENOENT is excluded from the error check), extent data through
 * replay_one_extent() and dir items / dir indexes through
 * replay_one_dir_item().  The path allocated here is freed with
 * btrfs_free_path().
 *
 * NOTE(review): the check applied to the header level and the per-call
 * error handling are not visible in this listing.
 */
1959 static int replay_one_buffer(struct btrfs_root
*log
, struct extent_buffer
*eb
,
1960 struct walk_control
*wc
, u64 gen
)
1963 struct btrfs_path
*path
;
1964 struct btrfs_root
*root
= wc
->replay_dest
;
1965 struct btrfs_key key
;
1970 ret
= btrfs_read_buffer(eb
, gen
);
1974 level
= btrfs_header_level(eb
);
1979 path
= btrfs_alloc_path();
1983 nritems
= btrfs_header_nritems(eb
);
1984 for (i
= 0; i
< nritems
; i
++) {
1985 btrfs_item_key_to_cpu(eb
, &key
, i
);
1987 /* inode keys are done during the first stage */
1988 if (key
.type
== BTRFS_INODE_ITEM_KEY
&&
1989 wc
->stage
== LOG_WALK_REPLAY_INODES
) {
1990 struct btrfs_inode_item
*inode_item
;
1993 inode_item
= btrfs_item_ptr(eb
, i
,
1994 struct btrfs_inode_item
);
1995 mode
= btrfs_inode_mode(eb
, inode_item
);
1996 if (S_ISDIR(mode
)) {
1997 ret
= replay_dir_deletes(wc
->trans
,
1998 root
, log
, path
, key
.objectid
, 0);
2002 ret
= overwrite_item(wc
->trans
, root
, path
,
2007 /* for regular files, make sure corresponding
2008 * orhpan item exist. extents past the new EOF
2009 * will be truncated later by orphan cleanup.
2011 if (S_ISREG(mode
)) {
2012 ret
= insert_orphan_item(wc
->trans
, root
,
2018 ret
= link_to_fixup_dir(wc
->trans
, root
,
2019 path
, key
.objectid
);
2023 if (wc
->stage
< LOG_WALK_REPLAY_ALL
)
2026 /* these keys are simply copied */
2027 if (key
.type
== BTRFS_XATTR_ITEM_KEY
) {
2028 ret
= overwrite_item(wc
->trans
, root
, path
,
2032 } else if (key
.type
== BTRFS_INODE_REF_KEY
||
2033 key
.type
== BTRFS_INODE_EXTREF_KEY
) {
2034 ret
= add_inode_ref(wc
->trans
, root
, log
, path
,
2036 if (ret
&& ret
!= -ENOENT
)
2039 } else if (key
.type
== BTRFS_EXTENT_DATA_KEY
) {
2040 ret
= replay_one_extent(wc
->trans
, root
, path
,
2044 } else if (key
.type
== BTRFS_DIR_ITEM_KEY
||
2045 key
.type
== BTRFS_DIR_INDEX_KEY
) {
2046 ret
= replay_one_dir_item(wc
->trans
, root
, path
,
2052 btrfs_free_path(path
);
/*
 * Descend from path->nodes[*level] towards the leaves of a log tree.
 *
 * While *level > 0 the child at the current slot is located from its
 * block pointer and pointer generation and obtained with
 * btrfs_find_create_tree_block().  On one path wc->process_func() is
 * invoked on the child and the slot is advanced; a child being discarded
 * is read, locked, cleaned with clean_tree_block(), waited on for
 * writeback, unlocked and released through
 * btrfs_free_and_pin_reserved_extent().  On the other path the child is
 * read, stored in path->nodes[*level - 1] (dropping any buffer already
 * there), *level becomes the child's header level and its slot is reset
 * to 0.  After the loop the slot at *level is set to the item count of
 * the node at that level.
 *
 * NOTE(review): the conditions selecting between these paths and the
 * return statements are not visible in this listing.
 */
2056 static noinline
int walk_down_log_tree(struct btrfs_trans_handle
*trans
,
2057 struct btrfs_root
*root
,
2058 struct btrfs_path
*path
, int *level
,
2059 struct walk_control
*wc
)
2064 struct extent_buffer
*next
;
2065 struct extent_buffer
*cur
;
2066 struct extent_buffer
*parent
;
2070 WARN_ON(*level
< 0);
2071 WARN_ON(*level
>= BTRFS_MAX_LEVEL
);
2073 while (*level
> 0) {
2074 WARN_ON(*level
< 0);
2075 WARN_ON(*level
>= BTRFS_MAX_LEVEL
);
2076 cur
= path
->nodes
[*level
];
2078 if (btrfs_header_level(cur
) != *level
)
2081 if (path
->slots
[*level
] >=
2082 btrfs_header_nritems(cur
))
2085 bytenr
= btrfs_node_blockptr(cur
, path
->slots
[*level
]);
2086 ptr_gen
= btrfs_node_ptr_generation(cur
, path
->slots
[*level
]);
2087 blocksize
= btrfs_level_size(root
, *level
- 1);
2089 parent
= path
->nodes
[*level
];
2090 root_owner
= btrfs_header_owner(parent
);
2092 next
= btrfs_find_create_tree_block(root
, bytenr
, blocksize
);
2097 ret
= wc
->process_func(root
, next
, wc
, ptr_gen
);
2099 free_extent_buffer(next
);
2103 path
->slots
[*level
]++;
2105 ret
= btrfs_read_buffer(next
, ptr_gen
);
2107 free_extent_buffer(next
);
2111 btrfs_tree_lock(next
);
2112 btrfs_set_lock_blocking(next
);
2113 clean_tree_block(trans
, root
, next
);
2114 btrfs_wait_tree_block_writeback(next
);
2115 btrfs_tree_unlock(next
);
2117 WARN_ON(root_owner
!=
2118 BTRFS_TREE_LOG_OBJECTID
);
2119 ret
= btrfs_free_and_pin_reserved_extent(root
,
2122 free_extent_buffer(next
);
2126 free_extent_buffer(next
);
2129 ret
= btrfs_read_buffer(next
, ptr_gen
);
2131 free_extent_buffer(next
);
2135 WARN_ON(*level
<= 0);
2136 if (path
->nodes
[*level
-1])
2137 free_extent_buffer(path
->nodes
[*level
-1]);
2138 path
->nodes
[*level
-1] = next
;
2139 *level
= btrfs_header_level(next
);
2140 path
->slots
[*level
] = 0;
2143 WARN_ON(*level
< 0);
2144 WARN_ON(*level
>= BTRFS_MAX_LEVEL
);
2146 path
->slots
[*level
] = btrfs_header_nritems(path
->nodes
[*level
]);
/*
 * Climb back up the log tree after a subtree has been walked.
 *
 * Starting at *level, scans the populated levels of @path (below
 * BTRFS_MAX_LEVEL - 1) and tests whether the current slot is followed by
 * another item in its node.  For a level that is finished, the node is
 * handed to wc->process_func() with its own header generation; on the
 * freeing path it is locked, cleaned with clean_tree_block(), waited on
 * for writeback, unlocked and released through
 * btrfs_free_and_pin_reserved_extent().  The node is then dropped from
 * the path (free_extent_buffer() and path->nodes[*level] = NULL).
 *
 * NOTE(review): slot increments, *level updates, the conditions guarding
 * the freeing path and the return values are not visible in this
 * listing.
 */
2152 static noinline
int walk_up_log_tree(struct btrfs_trans_handle
*trans
,
2153 struct btrfs_root
*root
,
2154 struct btrfs_path
*path
, int *level
,
2155 struct walk_control
*wc
)
2162 for (i
= *level
; i
< BTRFS_MAX_LEVEL
- 1 && path
->nodes
[i
]; i
++) {
2163 slot
= path
->slots
[i
];
2164 if (slot
+ 1 < btrfs_header_nritems(path
->nodes
[i
])) {
2167 WARN_ON(*level
== 0);
2170 struct extent_buffer
*parent
;
2171 if (path
->nodes
[*level
] == root
->node
)
2172 parent
= path
->nodes
[*level
];
2174 parent
= path
->nodes
[*level
+ 1];
2176 root_owner
= btrfs_header_owner(parent
);
2177 ret
= wc
->process_func(root
, path
->nodes
[*level
], wc
,
2178 btrfs_header_generation(path
->nodes
[*level
]));
2183 struct extent_buffer
*next
;
2185 next
= path
->nodes
[*level
];
2187 btrfs_tree_lock(next
);
2188 btrfs_set_lock_blocking(next
);
2189 clean_tree_block(trans
, root
, next
);
2190 btrfs_wait_tree_block_writeback(next
);
2191 btrfs_tree_unlock(next
);
2193 WARN_ON(root_owner
!= BTRFS_TREE_LOG_OBJECTID
);
2194 ret
= btrfs_free_and_pin_reserved_extent(root
,
2195 path
->nodes
[*level
]->start
,
2196 path
->nodes
[*level
]->len
);
2200 free_extent_buffer(path
->nodes
[*level
]);
2201 path
->nodes
[*level
] = NULL
;
2209 * drop the reference count on the tree rooted at 'snap'. This traverses
2210 * the tree freeing any blocks that have a ref count of zero after being
/*
 * Walk the log tree @log, calling wc->process_func() on its blocks.
 *
 * A path is allocated and seeded with log->node at the root's header
 * level (an extra reference is taken with extent_buffer_get()), after
 * which walk_down_log_tree() and walk_up_log_tree() are called.  If the
 * root node is still present in the path afterwards it is processed
 * here, including the lock / clean_tree_block() / writeback wait /
 * btrfs_free_and_pin_reserved_extent() sequence on the freeing path.
 * The path is freed with btrfs_free_path().
 *
 * NOTE(review): the loop structure and the conditions guarding the
 * freeing path are not visible in this listing.
 */
2213 static int walk_log_tree(struct btrfs_trans_handle
*trans
,
2214 struct btrfs_root
*log
, struct walk_control
*wc
)
2219 struct btrfs_path
*path
;
2222 path
= btrfs_alloc_path();
2226 level
= btrfs_header_level(log
->node
);
2228 path
->nodes
[level
] = log
->node
;
2229 extent_buffer_get(log
->node
);
2230 path
->slots
[level
] = 0;
2233 wret
= walk_down_log_tree(trans
, log
, path
, &level
, wc
);
2241 wret
= walk_up_log_tree(trans
, log
, path
, &level
, wc
);
2250 /* was the root node processed? if not, catch it here */
2251 if (path
->nodes
[orig_level
]) {
2252 ret
= wc
->process_func(log
, path
->nodes
[orig_level
], wc
,
2253 btrfs_header_generation(path
->nodes
[orig_level
]));
2257 struct extent_buffer
*next
;
2259 next
= path
->nodes
[orig_level
];
2261 btrfs_tree_lock(next
);
2262 btrfs_set_lock_blocking(next
);
2263 clean_tree_block(trans
, log
, next
);
2264 btrfs_wait_tree_block_writeback(next
);
2265 btrfs_tree_unlock(next
);
2267 WARN_ON(log
->root_key
.objectid
!=
2268 BTRFS_TREE_LOG_OBJECTID
);
2269 ret
= btrfs_free_and_pin_reserved_extent(log
, next
->start
,
2277 btrfs_free_path(path
);
2282 * helper function to update the item for a given subvolumes log root
2283 * in the tree of log roots
/*
 * Write the root item of @log into fs_info->log_root_tree.
 *
 * On the first sync (log->log_transid == 1) the item is inserted with
 * btrfs_insert_root(); the other call updates an existing item with
 * btrfs_update_root().  Both use log->root_key and log->root_item.
 */
2285 static int update_log_root(struct btrfs_trans_handle
*trans
,
2286 struct btrfs_root
*log
)
2290 if (log
->log_transid
== 1) {
2291 /* insert root item on the first sync */
2292 ret
= btrfs_insert_root(trans
, log
->fs_info
->log_root_tree
,
2293 &log
->root_key
, &log
->root_item
);
2295 ret
= btrfs_update_root(trans
, log
->fs_info
->log_root_tree
,
2296 &log
->root_key
, &log
->root_item
);
/*
 * Wait for the log commit identified by @transid on @root.
 *
 * Sleeps in TASK_UNINTERRUPTIBLE on root->log_commit_wait[transid % 2],
 * dropping root->log_mutex around the wait and taking it again
 * afterwards.  Waiting continues while no full commit has been requested
 * for this transaction (last_trans_log_full_commit != trans->transid),
 * root->log_transid is still below transid + 2 and the log_commit flag
 * for that index is set.
 *
 * NOTE(review): the caller is presumably expected to hold
 * root->log_mutex, judging by the unlock/lock pairing -- confirm at the
 * call sites.
 */
2301 static int wait_log_commit(struct btrfs_trans_handle
*trans
,
2302 struct btrfs_root
*root
, unsigned long transid
)
2305 int index
= transid
% 2;
2308 * we only allow two pending log transactions at a time,
2309 * so we know that if ours is more than 2 older than the
2310 * current transaction, we're done
2313 prepare_to_wait(&root
->log_commit_wait
[index
],
2314 &wait
, TASK_UNINTERRUPTIBLE
);
2315 mutex_unlock(&root
->log_mutex
);
2317 if (root
->fs_info
->last_trans_log_full_commit
!=
2318 trans
->transid
&& root
->log_transid
< transid
+ 2 &&
2319 atomic_read(&root
->log_commit
[index
]))
2322 finish_wait(&root
->log_commit_wait
[index
], &wait
);
2323 mutex_lock(&root
->log_mutex
);
2324 } while (root
->fs_info
->last_trans_log_full_commit
!=
2325 trans
->transid
&& root
->log_transid
< transid
+ 2 &&
2326 atomic_read(&root
->log_commit
[index
]));
/*
 * Wait until @root has no more active log writers.
 *
 * Loops while no full commit has been requested for this transaction
 * (last_trans_log_full_commit != trans->transid) and root->log_writers
 * is non-zero.  Each iteration prepares an uninterruptible wait on
 * root->log_writer_wait, drops root->log_mutex, re-checks the condition,
 * then takes the mutex again and finishes the wait.
 */
2330 static void wait_for_writer(struct btrfs_trans_handle
*trans
,
2331 struct btrfs_root
*root
)
2334 while (root
->fs_info
->last_trans_log_full_commit
!=
2335 trans
->transid
&& atomic_read(&root
->log_writers
)) {
2336 prepare_to_wait(&root
->log_writer_wait
,
2337 &wait
, TASK_UNINTERRUPTIBLE
);
2338 mutex_unlock(&root
->log_mutex
);
2339 if (root
->fs_info
->last_trans_log_full_commit
!=
2340 trans
->transid
&& atomic_read(&root
->log_writers
))
2342 mutex_lock(&root
->log_mutex
);
2343 finish_wait(&root
->log_writer_wait
, &wait
);
2348 * btrfs_sync_log sends a given tree log down to the disk and
2349 * updates the super blocks to record it. When this call is done,
2350 * you know that any inodes previously logged are safely on disk only
2353 * Any other return value means you need to call btrfs_commit_transaction.
2354 * Some of the edge cases for fsyncing directories that have had unlinks
2355 * or renames done in the past mean that sometimes the only safe
2356 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
2357 * that has happened.
/*
 * Commit the log tree of @root and record it in the super block.
 *
 * Visible sequence:
 *  - under root->log_mutex, wait for an in-flight commit of the same or
 *    the previous log transaction, set log_commit[index1], and wait for
 *    writers (sleeping one tick first when the SSD option is off and
 *    root->log_multiple_pids is set);
 *  - check whether a full commit has been requested for this
 *    transaction;
 *  - start writeback of the log's dirty pages under a block plug, store
 *    log->node in the log's root item, bump root->log_transid and drop
 *    the mutex so new log writers may proceed;
 *  - register as a writer on log_root_tree, call update_log_root(), and
 *    repeat the commit/wait handshake on log_root_tree;
 *  - write log_root_tree's dirty pages, wait for both sets of pages and
 *    for the logged extents, store the log root's start and level in
 *    super_for_commit and write it with write_ctree_super();
 *  - raise root->last_log_commit to log_transid and clear/wake both
 *    log_commit slots.
 *
 * Failure paths visible below call btrfs_abort_transaction() and
 * btrfs_free_logged_extents(); after update_log_root(), results other
 * than -ENOSPC abort the transaction, and last_trans_log_full_commit is
 * set to the current transid on the remaining path.
 *
 * NOTE(review): labels, return values and several conditionals are not
 * visible in this listing.
 */
2359 int btrfs_sync_log(struct btrfs_trans_handle
*trans
,
2360 struct btrfs_root
*root
)
2366 struct btrfs_root
*log
= root
->log_root
;
2367 struct btrfs_root
*log_root_tree
= root
->fs_info
->log_root_tree
;
2368 unsigned long log_transid
= 0;
2369 struct blk_plug plug
;
2371 mutex_lock(&root
->log_mutex
);
2372 log_transid
= root
->log_transid
;
2373 index1
= root
->log_transid
% 2;
2374 if (atomic_read(&root
->log_commit
[index1
])) {
2375 wait_log_commit(trans
, root
, root
->log_transid
);
2376 mutex_unlock(&root
->log_mutex
);
2379 atomic_set(&root
->log_commit
[index1
], 1);
2381 /* wait for previous tree log sync to complete */
2382 if (atomic_read(&root
->log_commit
[(index1
+ 1) % 2]))
2383 wait_log_commit(trans
, root
, root
->log_transid
- 1);
2385 int batch
= atomic_read(&root
->log_batch
);
2386 /* when we're on an ssd, just kick the log commit out */
2387 if (!btrfs_test_opt(root
, SSD
) && root
->log_multiple_pids
) {
2388 mutex_unlock(&root
->log_mutex
);
2389 schedule_timeout_uninterruptible(1);
2390 mutex_lock(&root
->log_mutex
);
2392 wait_for_writer(trans
, root
);
2393 if (batch
== atomic_read(&root
->log_batch
))
2397 /* bail out if we need to do a full commit */
2398 if (root
->fs_info
->last_trans_log_full_commit
== trans
->transid
) {
2400 btrfs_free_logged_extents(log
, log_transid
);
2401 mutex_unlock(&root
->log_mutex
);
2405 if (log_transid
% 2 == 0)
2406 mark
= EXTENT_DIRTY
;
2410 /* we start IO on all the marked extents here, but we don't actually
2411 * wait for them until later.
2413 blk_start_plug(&plug
);
2414 ret
= btrfs_write_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2416 blk_finish_plug(&plug
);
2417 btrfs_abort_transaction(trans
, root
, ret
);
2418 btrfs_free_logged_extents(log
, log_transid
);
2419 mutex_unlock(&root
->log_mutex
);
2423 btrfs_set_root_node(&log
->root_item
, log
->node
);
2425 root
->log_transid
++;
2426 log
->log_transid
= root
->log_transid
;
2427 root
->log_start_pid
= 0;
2430 * IO has been started, blocks of the log tree have WRITTEN flag set
2431 * in their headers. new modifications of the log will be written to
2432 * new positions. so it's safe to allow log writers to go in.
2434 mutex_unlock(&root
->log_mutex
);
2436 mutex_lock(&log_root_tree
->log_mutex
);
2437 atomic_inc(&log_root_tree
->log_batch
);
2438 atomic_inc(&log_root_tree
->log_writers
);
2439 mutex_unlock(&log_root_tree
->log_mutex
);
2441 ret
= update_log_root(trans
, log
);
2443 mutex_lock(&log_root_tree
->log_mutex
);
2444 if (atomic_dec_and_test(&log_root_tree
->log_writers
)) {
2446 if (waitqueue_active(&log_root_tree
->log_writer_wait
))
2447 wake_up(&log_root_tree
->log_writer_wait
);
2451 blk_finish_plug(&plug
);
2452 if (ret
!= -ENOSPC
) {
2453 btrfs_abort_transaction(trans
, root
, ret
);
2454 mutex_unlock(&log_root_tree
->log_mutex
);
2457 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
2458 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2459 btrfs_free_logged_extents(log
, log_transid
);
2460 mutex_unlock(&log_root_tree
->log_mutex
);
2465 index2
= log_root_tree
->log_transid
% 2;
2466 if (atomic_read(&log_root_tree
->log_commit
[index2
])) {
2467 blk_finish_plug(&plug
);
2468 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2469 wait_log_commit(trans
, log_root_tree
,
2470 log_root_tree
->log_transid
);
2471 btrfs_free_logged_extents(log
, log_transid
);
2472 mutex_unlock(&log_root_tree
->log_mutex
);
2476 atomic_set(&log_root_tree
->log_commit
[index2
], 1);
2478 if (atomic_read(&log_root_tree
->log_commit
[(index2
+ 1) % 2])) {
2479 wait_log_commit(trans
, log_root_tree
,
2480 log_root_tree
->log_transid
- 1);
2483 wait_for_writer(trans
, log_root_tree
);
2486 * now that we've moved on to the tree of log tree roots,
2487 * check the full commit flag again
2489 if (root
->fs_info
->last_trans_log_full_commit
== trans
->transid
) {
2490 blk_finish_plug(&plug
);
2491 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2492 btrfs_free_logged_extents(log
, log_transid
);
2493 mutex_unlock(&log_root_tree
->log_mutex
);
2495 goto out_wake_log_root
;
2498 ret
= btrfs_write_marked_extents(log_root_tree
,
2499 &log_root_tree
->dirty_log_pages
,
2500 EXTENT_DIRTY
| EXTENT_NEW
);
2501 blk_finish_plug(&plug
);
2503 btrfs_abort_transaction(trans
, root
, ret
);
2504 btrfs_free_logged_extents(log
, log_transid
);
2505 mutex_unlock(&log_root_tree
->log_mutex
);
2506 goto out_wake_log_root
;
2508 btrfs_wait_marked_extents(log
, &log
->dirty_log_pages
, mark
);
2509 btrfs_wait_marked_extents(log_root_tree
,
2510 &log_root_tree
->dirty_log_pages
,
2511 EXTENT_NEW
| EXTENT_DIRTY
);
2512 btrfs_wait_logged_extents(log
, log_transid
);
2514 btrfs_set_super_log_root(root
->fs_info
->super_for_commit
,
2515 log_root_tree
->node
->start
);
2516 btrfs_set_super_log_root_level(root
->fs_info
->super_for_commit
,
2517 btrfs_header_level(log_root_tree
->node
));
2519 log_root_tree
->log_transid
++;
2522 mutex_unlock(&log_root_tree
->log_mutex
);
2525 * nobody else is going to jump in and write the the ctree
2526 * super here because the log_commit atomic below is protecting
2527 * us. We must be called with a transaction handle pinning
2528 * the running transaction open, so a full commit can't hop
2529 * in and cause problems either.
2531 btrfs_scrub_pause_super(root
);
2532 ret
= write_ctree_super(trans
, root
->fs_info
->tree_root
, 1);
2533 btrfs_scrub_continue_super(root
);
2535 btrfs_abort_transaction(trans
, root
, ret
);
2536 goto out_wake_log_root
;
2539 mutex_lock(&root
->log_mutex
);
2540 if (root
->last_log_commit
< log_transid
)
2541 root
->last_log_commit
= log_transid
;
2542 mutex_unlock(&root
->log_mutex
);
2545 atomic_set(&log_root_tree
->log_commit
[index2
], 0);
2547 if (waitqueue_active(&log_root_tree
->log_commit_wait
[index2
]))
2548 wake_up(&log_root_tree
->log_commit_wait
[index2
]);
2550 atomic_set(&root
->log_commit
[index1
], 0);
2552 if (waitqueue_active(&root
->log_commit_wait
[index1
]))
2553 wake_up(&root
->log_commit_wait
[index1
]);
/*
 * Release the blocks and extent state belonging to the log tree @log.
 *
 * Walks the tree with walk_log_tree() using process_one_buffer as the
 * walk_control callback (btrfs_abort_transaction() is called on the
 * failure path), then looks up EXTENT_DIRTY | EXTENT_NEW ranges in
 * log->dirty_log_pages with find_first_extent_bit() and clears them,
 * frees logged extents for slots 0 and 1, and drops the reference on
 * log->node with free_extent_buffer().
 *
 * NOTE(review): the remaining walk_control fields, the loop around the
 * extent-bit lookup and any final release of @log itself are not visible
 * in this listing.
 */
2557 static void free_log_tree(struct btrfs_trans_handle
*trans
,
2558 struct btrfs_root
*log
)
2563 struct walk_control wc
= {
2565 .process_func
= process_one_buffer
2569 ret
= walk_log_tree(trans
, log
, &wc
);
2571 /* I don't think this can happen but just in case */
2573 btrfs_abort_transaction(trans
, log
, ret
);
2577 ret
= find_first_extent_bit(&log
->dirty_log_pages
,
2578 0, &start
, &end
, EXTENT_DIRTY
| EXTENT_NEW
,
2583 clear_extent_bits(&log
->dirty_log_pages
, start
, end
,
2584 EXTENT_DIRTY
| EXTENT_NEW
, GFP_NOFS
);
2588 * We may have short-circuited the log tree with the full commit logic
2589 * and left ordered extents on our list, so clear these out to keep us
2590 * from leaking inodes and memory.
2592 btrfs_free_logged_extents(log
, 0);
2593 btrfs_free_logged_extents(log
, 1);
2595 free_extent_buffer(log
->node
);
2600 * free all the extents used by the tree log. This should be called
2601 * at commit time of the full transaction
/*
 * Free the log tree attached to @root, if there is one, via
 * free_log_tree() and reset root->log_root to NULL.
 */
2603 int btrfs_free_log(struct btrfs_trans_handle
*trans
, struct btrfs_root
*root
)
2605 if (root
->log_root
) {
2606 free_log_tree(trans
, root
->log_root
);
2607 root
->log_root
= NULL
;
/*
 * Free fs_info->log_root_tree, if there is one, via free_log_tree() and
 * reset the pointer to NULL.
 */
2612 int btrfs_free_log_root_tree(struct btrfs_trans_handle
*trans
,
2613 struct btrfs_fs_info
*fs_info
)
2615 if (fs_info
->log_root_tree
) {
2616 free_log_tree(trans
, fs_info
->log_root_tree
);
2617 fs_info
->log_root_tree
= NULL
;
2623 * If both a file and directory are logged, and unlinks or renames are
2624 * mixed in, we have a few interesting corners:
2626 * create file X in dir Y
2627 * link file X to X.link in dir Y
2629 * unlink file X but leave X.link
2632 * After a crash we would expect only X.link to exist. But file X
2633 * didn't get fsync'd again so the log has back refs for X and X.link.
2635 * We solve this by removing directory entries and inode backrefs from the
2636 * log when a file that was logged in the current transaction is
2637 * unlinked. Any later fsync will include the updated log entries, and
2638 * we'll be able to reconstruct the proper directory items from backrefs.
2640 * This optimization allows us to avoid relogging the entire inode
2641 * or the entire directory.
/*
 * Remove a name from the logged copy of directory @dir.
 *
 * The directory's logged_trans is compared against trans->transid first,
 * and join_running_log_trans() is called before any work is done.  Under
 * the directory's log_mutex, the dir item (by @name) and the dir index
 * item (by @index and @name) are looked up in root->log_root, with -1 as
 * the final lookup argument, and removed with
 * btrfs_delete_one_dir_name(); name_len is added to bytes_del for each.
 * The logged BTRFS_INODE_ITEM_KEY item of the directory is then searched
 * and its size reduced by bytes_del when the size is larger than
 * bytes_del.  A result of -ENOSPC sets last_trans_log_full_commit to the
 * current transid; btrfs_abort_transaction() is called on another
 * branch.  btrfs_end_log_trans() ends the log transaction.
 *
 * NOTE(review): the conditions around the size update and the abort are
 * only partly visible in this listing.
 */
2643 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle
*trans
,
2644 struct btrfs_root
*root
,
2645 const char *name
, int name_len
,
2646 struct inode
*dir
, u64 index
)
2648 struct btrfs_root
*log
;
2649 struct btrfs_dir_item
*di
;
2650 struct btrfs_path
*path
;
2654 u64 dir_ino
= btrfs_ino(dir
);
2656 if (BTRFS_I(dir
)->logged_trans
< trans
->transid
)
2659 ret
= join_running_log_trans(root
);
2663 mutex_lock(&BTRFS_I(dir
)->log_mutex
);
2665 log
= root
->log_root
;
2666 path
= btrfs_alloc_path();
2672 di
= btrfs_lookup_dir_item(trans
, log
, path
, dir_ino
,
2673 name
, name_len
, -1);
2679 ret
= btrfs_delete_one_dir_name(trans
, log
, path
, di
);
2680 bytes_del
+= name_len
;
2686 btrfs_release_path(path
);
2687 di
= btrfs_lookup_dir_index_item(trans
, log
, path
, dir_ino
,
2688 index
, name
, name_len
, -1);
2694 ret
= btrfs_delete_one_dir_name(trans
, log
, path
, di
);
2695 bytes_del
+= name_len
;
2702 /* update the directory size in the log to reflect the names
2706 struct btrfs_key key
;
2708 key
.objectid
= dir_ino
;
2710 key
.type
= BTRFS_INODE_ITEM_KEY
;
2711 btrfs_release_path(path
);
2713 ret
= btrfs_search_slot(trans
, log
, &key
, path
, 0, 1);
2719 struct btrfs_inode_item
*item
;
2722 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2723 struct btrfs_inode_item
);
2724 i_size
= btrfs_inode_size(path
->nodes
[0], item
);
2725 if (i_size
> bytes_del
)
2726 i_size
-= bytes_del
;
2729 btrfs_set_inode_size(path
->nodes
[0], item
, i_size
);
2730 btrfs_mark_buffer_dirty(path
->nodes
[0]);
2733 btrfs_release_path(path
);
2736 btrfs_free_path(path
);
2738 mutex_unlock(&BTRFS_I(dir
)->log_mutex
);
2739 if (ret
== -ENOSPC
) {
2740 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
2743 btrfs_abort_transaction(trans
, root
, ret
);
2745 btrfs_end_log_trans(root
);
2750 /* see comments for btrfs_del_dir_entries_in_log */
/*
 * Remove one inode back reference (@name / @name_len) from the log.
 *
 * The inode's logged_trans is compared against trans->transid first, and
 * join_running_log_trans() is called before any work is done.  The
 * reference is deleted from root->log_root with btrfs_del_inode_ref()
 * while holding the inode's log_mutex.  A result of -ENOSPC sets
 * last_trans_log_full_commit to the current transid; other negative
 * results except -ENOENT abort the transaction.  btrfs_end_log_trans()
 * ends the log transaction.
 */
2751 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle
*trans
,
2752 struct btrfs_root
*root
,
2753 const char *name
, int name_len
,
2754 struct inode
*inode
, u64 dirid
)
2756 struct btrfs_root
*log
;
2760 if (BTRFS_I(inode
)->logged_trans
< trans
->transid
)
2763 ret
= join_running_log_trans(root
);
2766 log
= root
->log_root
;
2767 mutex_lock(&BTRFS_I(inode
)->log_mutex
);
2769 ret
= btrfs_del_inode_ref(trans
, log
, name
, name_len
, btrfs_ino(inode
),
2771 mutex_unlock(&BTRFS_I(inode
)->log_mutex
);
2772 if (ret
== -ENOSPC
) {
2773 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
2775 } else if (ret
< 0 && ret
!= -ENOENT
)
2776 btrfs_abort_transaction(trans
, root
, ret
);
2777 btrfs_end_log_trans(root
);
2783 * creates a range item in the log for 'dirid'. first_offset and
2784 * last_offset tell us which parts of the key space the log should
2785 * be considered authoritative for.
/*
 * Insert a range item for directory @dirid into @log.
 *
 * The item key is (dirid, type, first_offset), where the type is
 * BTRFS_DIR_LOG_ITEM_KEY when @key_type is BTRFS_DIR_ITEM_KEY and
 * BTRFS_DIR_LOG_INDEX_KEY on the other branch.  The item body is a
 * struct btrfs_dir_log_item whose end field is set to @last_offset;
 * the leaf is then marked dirty and @path is released.
 *
 * NOTE(review): handling of a failed btrfs_insert_empty_item() is not
 * visible in this listing.
 */
2787 static noinline
int insert_dir_log_key(struct btrfs_trans_handle
*trans
,
2788 struct btrfs_root
*log
,
2789 struct btrfs_path
*path
,
2790 int key_type
, u64 dirid
,
2791 u64 first_offset
, u64 last_offset
)
2794 struct btrfs_key key
;
2795 struct btrfs_dir_log_item
*item
;
2797 key
.objectid
= dirid
;
2798 key
.offset
= first_offset
;
2799 if (key_type
== BTRFS_DIR_ITEM_KEY
)
2800 key
.type
= BTRFS_DIR_LOG_ITEM_KEY
;
2802 key
.type
= BTRFS_DIR_LOG_INDEX_KEY
;
2803 ret
= btrfs_insert_empty_item(trans
, log
, path
, &key
, sizeof(*item
));
2807 item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2808 struct btrfs_dir_log_item
);
2809 btrfs_set_dir_log_end(path
->nodes
[0], item
, last_offset
);
2810 btrfs_mark_buffer_dirty(path
->nodes
[0]);
2811 btrfs_release_path(path
);
2816 * log all the items included in the current transaction for a given
2817 * directory. This also creates the range items in the log tree required
2818 * to replay anything deleted before the fsync
2820 static noinline
int log_dir_items(struct btrfs_trans_handle
*trans
,
2821 struct btrfs_root
*root
, struct inode
*inode
,
2822 struct btrfs_path
*path
,
2823 struct btrfs_path
*dst_path
, int key_type
,
2824 u64 min_offset
, u64
*last_offset_ret
)
2826 struct btrfs_key min_key
;
2827 struct btrfs_key max_key
;
2828 struct btrfs_root
*log
= root
->log_root
;
2829 struct extent_buffer
*src
;
2834 u64 first_offset
= min_offset
;
2835 u64 last_offset
= (u64
)-1;
2836 u64 ino
= btrfs_ino(inode
);
2838 log
= root
->log_root
;
2839 max_key
.objectid
= ino
;
2840 max_key
.offset
= (u64
)-1;
2841 max_key
.type
= key_type
;
2843 min_key
.objectid
= ino
;
2844 min_key
.type
= key_type
;
2845 min_key
.offset
= min_offset
;
2847 path
->keep_locks
= 1;
2849 ret
= btrfs_search_forward(root
, &min_key
, &max_key
,
2850 path
, trans
->transid
);
2853 * we didn't find anything from this transaction, see if there
2854 * is anything at all
2856 if (ret
!= 0 || min_key
.objectid
!= ino
|| min_key
.type
!= key_type
) {
2857 min_key
.objectid
= ino
;
2858 min_key
.type
= key_type
;
2859 min_key
.offset
= (u64
)-1;
2860 btrfs_release_path(path
);
2861 ret
= btrfs_search_slot(NULL
, root
, &min_key
, path
, 0, 0);
2863 btrfs_release_path(path
);
2866 ret
= btrfs_previous_item(root
, path
, ino
, key_type
);
2868 /* if ret == 0 there are items for this type,
2869 * create a range to tell us the last key of this type.
2870 * otherwise, there are no items in this directory after
2871 * *min_offset, and we create a range to indicate that.
2874 struct btrfs_key tmp
;
2875 btrfs_item_key_to_cpu(path
->nodes
[0], &tmp
,
2877 if (key_type
== tmp
.type
)
2878 first_offset
= max(min_offset
, tmp
.offset
) + 1;
2883 /* go backward to find any previous key */
2884 ret
= btrfs_previous_item(root
, path
, ino
, key_type
);
2886 struct btrfs_key tmp
;
2887 btrfs_item_key_to_cpu(path
->nodes
[0], &tmp
, path
->slots
[0]);
2888 if (key_type
== tmp
.type
) {
2889 first_offset
= tmp
.offset
;
2890 ret
= overwrite_item(trans
, log
, dst_path
,
2891 path
->nodes
[0], path
->slots
[0],
2899 btrfs_release_path(path
);
2901 /* find the first key from this transaction again */
2902 ret
= btrfs_search_slot(NULL
, root
, &min_key
, path
, 0, 0);
2909 * we have a block from this transaction, log every item in it
2910 * from our directory
2913 struct btrfs_key tmp
;
2914 src
= path
->nodes
[0];
2915 nritems
= btrfs_header_nritems(src
);
2916 for (i
= path
->slots
[0]; i
< nritems
; i
++) {
2917 btrfs_item_key_to_cpu(src
, &min_key
, i
);
2919 if (min_key
.objectid
!= ino
|| min_key
.type
!= key_type
)
2921 ret
= overwrite_item(trans
, log
, dst_path
, src
, i
,
2928 path
->slots
[0] = nritems
;
2931 * look ahead to the next item and see if it is also
2932 * from this directory and from this transaction
2934 ret
= btrfs_next_leaf(root
, path
);
2936 last_offset
= (u64
)-1;
2939 btrfs_item_key_to_cpu(path
->nodes
[0], &tmp
, path
->slots
[0]);
2940 if (tmp
.objectid
!= ino
|| tmp
.type
!= key_type
) {
2941 last_offset
= (u64
)-1;
2944 if (btrfs_header_generation(path
->nodes
[0]) != trans
->transid
) {
2945 ret
= overwrite_item(trans
, log
, dst_path
,
2946 path
->nodes
[0], path
->slots
[0],
2951 last_offset
= tmp
.offset
;
2956 btrfs_release_path(path
);
2957 btrfs_release_path(dst_path
);
2960 *last_offset_ret
= last_offset
;
2962 * insert the log range keys to indicate where the log
2965 ret
= insert_dir_log_key(trans
, log
, path
, key_type
,
2966 ino
, first_offset
, last_offset
);
2974 * logging directories is very similar to logging inodes, We find all the items
2975 * from the current transaction and write them to the log.
2977 * The recovery code scans the directory in the subvolume, and if it finds a
2978 * key in the range logged that is not present in the log tree, then it means
2979 * that dir entry was unlinked during the transaction.
2981 * In order for that scan to work, we must include one key smaller than
2982 * the smallest logged by this transaction and one key larger than the largest
2983 * key logged by this transaction.
2985 static noinline
int log_directory_changes(struct btrfs_trans_handle
*trans
,
2986 struct btrfs_root
*root
, struct inode
*inode
,
2987 struct btrfs_path
*path
,
2988 struct btrfs_path
*dst_path
)
2993 int key_type
= BTRFS_DIR_ITEM_KEY
;
2999 ret
= log_dir_items(trans
, root
, inode
, path
,
3000 dst_path
, key_type
, min_key
,
3004 if (max_key
== (u64
)-1)
3006 min_key
= max_key
+ 1;
3009 if (key_type
== BTRFS_DIR_ITEM_KEY
) {
3010 key_type
= BTRFS_DIR_INDEX_KEY
;
3017 * a helper function to drop items from the log before we relog an
3018 * inode. max_key_type indicates the highest item type to remove.
3019 * This cannot be run for file data extents because it does not
3020 * free the extents they point to.
3022 static int drop_objectid_items(struct btrfs_trans_handle
*trans
,
3023 struct btrfs_root
*log
,
3024 struct btrfs_path
*path
,
3025 u64 objectid
, int max_key_type
)
3028 struct btrfs_key key
;
3029 struct btrfs_key found_key
;
3032 key
.objectid
= objectid
;
3033 key
.type
= max_key_type
;
3034 key
.offset
= (u64
)-1;
3037 ret
= btrfs_search_slot(trans
, log
, &key
, path
, -1, 1);
3038 BUG_ON(ret
== 0); /* Logic error */
3042 if (path
->slots
[0] == 0)
3046 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
3049 if (found_key
.objectid
!= objectid
)
3052 found_key
.offset
= 0;
3054 ret
= btrfs_bin_search(path
->nodes
[0], &found_key
, 0,
3057 ret
= btrfs_del_items(trans
, log
, path
, start_slot
,
3058 path
->slots
[0] - start_slot
+ 1);
3060 * If start slot isn't 0 then we don't need to re-search, we've
3061 * found the last guy with the objectid in this tree.
3063 if (ret
|| start_slot
!= 0)
3065 btrfs_release_path(path
);
3067 btrfs_release_path(path
);
3073 static void fill_inode_item(struct btrfs_trans_handle
*trans
,
3074 struct extent_buffer
*leaf
,
3075 struct btrfs_inode_item
*item
,
3076 struct inode
*inode
, int log_inode_only
)
3078 struct btrfs_map_token token
;
3080 btrfs_init_map_token(&token
);
3082 if (log_inode_only
) {
3083 /* set the generation to zero so the recover code
3084 * can tell the difference between an logging
3085 * just to say 'this inode exists' and a logging
3086 * to say 'update this inode with these values'
3088 btrfs_set_token_inode_generation(leaf
, item
, 0, &token
);
3089 btrfs_set_token_inode_size(leaf
, item
, 0, &token
);
3091 btrfs_set_token_inode_generation(leaf
, item
,
3092 BTRFS_I(inode
)->generation
,
3094 btrfs_set_token_inode_size(leaf
, item
, inode
->i_size
, &token
);
3097 btrfs_set_token_inode_uid(leaf
, item
, i_uid_read(inode
), &token
);
3098 btrfs_set_token_inode_gid(leaf
, item
, i_gid_read(inode
), &token
);
3099 btrfs_set_token_inode_mode(leaf
, item
, inode
->i_mode
, &token
);
3100 btrfs_set_token_inode_nlink(leaf
, item
, inode
->i_nlink
, &token
);
3102 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_atime(item
),
3103 inode
->i_atime
.tv_sec
, &token
);
3104 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_atime(item
),
3105 inode
->i_atime
.tv_nsec
, &token
);
3107 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_mtime(item
),
3108 inode
->i_mtime
.tv_sec
, &token
);
3109 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_mtime(item
),
3110 inode
->i_mtime
.tv_nsec
, &token
);
3112 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_ctime(item
),
3113 inode
->i_ctime
.tv_sec
, &token
);
3114 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_ctime(item
),
3115 inode
->i_ctime
.tv_nsec
, &token
);
3117 btrfs_set_token_inode_nbytes(leaf
, item
, inode_get_bytes(inode
),
3120 btrfs_set_token_inode_sequence(leaf
, item
, inode
->i_version
, &token
);
3121 btrfs_set_token_inode_transid(leaf
, item
, trans
->transid
, &token
);
3122 btrfs_set_token_inode_rdev(leaf
, item
, inode
->i_rdev
, &token
);
3123 btrfs_set_token_inode_flags(leaf
, item
, BTRFS_I(inode
)->flags
, &token
);
3124 btrfs_set_token_inode_block_group(leaf
, item
, 0, &token
);
3127 static int log_inode_item(struct btrfs_trans_handle
*trans
,
3128 struct btrfs_root
*log
, struct btrfs_path
*path
,
3129 struct inode
*inode
)
3131 struct btrfs_inode_item
*inode_item
;
3132 struct btrfs_key key
;
3135 memcpy(&key
, &BTRFS_I(inode
)->location
, sizeof(key
));
3136 ret
= btrfs_insert_empty_item(trans
, log
, path
, &key
,
3137 sizeof(*inode_item
));
3138 if (ret
&& ret
!= -EEXIST
)
3140 inode_item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
3141 struct btrfs_inode_item
);
3142 fill_inode_item(trans
, path
->nodes
[0], inode_item
, inode
, 0);
3143 btrfs_release_path(path
);
3147 static noinline
int copy_items(struct btrfs_trans_handle
*trans
,
3148 struct inode
*inode
,
3149 struct btrfs_path
*dst_path
,
3150 struct extent_buffer
*src
,
3151 int start_slot
, int nr
, int inode_only
)
3153 unsigned long src_offset
;
3154 unsigned long dst_offset
;
3155 struct btrfs_root
*log
= BTRFS_I(inode
)->root
->log_root
;
3156 struct btrfs_file_extent_item
*extent
;
3157 struct btrfs_inode_item
*inode_item
;
3159 struct btrfs_key
*ins_keys
;
3163 struct list_head ordered_sums
;
3164 int skip_csum
= BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
;
3166 INIT_LIST_HEAD(&ordered_sums
);
3168 ins_data
= kmalloc(nr
* sizeof(struct btrfs_key
) +
3169 nr
* sizeof(u32
), GFP_NOFS
);
3173 ins_sizes
= (u32
*)ins_data
;
3174 ins_keys
= (struct btrfs_key
*)(ins_data
+ nr
* sizeof(u32
));
3176 for (i
= 0; i
< nr
; i
++) {
3177 ins_sizes
[i
] = btrfs_item_size_nr(src
, i
+ start_slot
);
3178 btrfs_item_key_to_cpu(src
, ins_keys
+ i
, i
+ start_slot
);
3180 ret
= btrfs_insert_empty_items(trans
, log
, dst_path
,
3181 ins_keys
, ins_sizes
, nr
);
3187 for (i
= 0; i
< nr
; i
++, dst_path
->slots
[0]++) {
3188 dst_offset
= btrfs_item_ptr_offset(dst_path
->nodes
[0],
3189 dst_path
->slots
[0]);
3191 src_offset
= btrfs_item_ptr_offset(src
, start_slot
+ i
);
3193 if (ins_keys
[i
].type
== BTRFS_INODE_ITEM_KEY
) {
3194 inode_item
= btrfs_item_ptr(dst_path
->nodes
[0],
3196 struct btrfs_inode_item
);
3197 fill_inode_item(trans
, dst_path
->nodes
[0], inode_item
,
3198 inode
, inode_only
== LOG_INODE_EXISTS
);
3200 copy_extent_buffer(dst_path
->nodes
[0], src
, dst_offset
,
3201 src_offset
, ins_sizes
[i
]);
3204 /* take a reference on file data extents so that truncates
3205 * or deletes of this inode don't have to relog the inode
3208 if (btrfs_key_type(ins_keys
+ i
) == BTRFS_EXTENT_DATA_KEY
&&
3211 extent
= btrfs_item_ptr(src
, start_slot
+ i
,
3212 struct btrfs_file_extent_item
);
3214 if (btrfs_file_extent_generation(src
, extent
) < trans
->transid
)
3217 found_type
= btrfs_file_extent_type(src
, extent
);
3218 if (found_type
== BTRFS_FILE_EXTENT_REG
) {
3220 ds
= btrfs_file_extent_disk_bytenr(src
,
3222 /* ds == 0 is a hole */
3226 dl
= btrfs_file_extent_disk_num_bytes(src
,
3228 cs
= btrfs_file_extent_offset(src
, extent
);
3229 cl
= btrfs_file_extent_num_bytes(src
,
3231 if (btrfs_file_extent_compression(src
,
3237 ret
= btrfs_lookup_csums_range(
3238 log
->fs_info
->csum_root
,
3239 ds
+ cs
, ds
+ cs
+ cl
- 1,
3242 btrfs_release_path(dst_path
);
3250 btrfs_mark_buffer_dirty(dst_path
->nodes
[0]);
3251 btrfs_release_path(dst_path
);
3255 * we have to do this after the loop above to avoid changing the
3256 * log tree while trying to change the log tree.
3259 while (!list_empty(&ordered_sums
)) {
3260 struct btrfs_ordered_sum
*sums
= list_entry(ordered_sums
.next
,
3261 struct btrfs_ordered_sum
,
3264 ret
= btrfs_csum_file_blocks(trans
, log
, sums
);
3265 list_del(&sums
->list
);
3271 static int extent_cmp(void *priv
, struct list_head
*a
, struct list_head
*b
)
3273 struct extent_map
*em1
, *em2
;
3275 em1
= list_entry(a
, struct extent_map
, list
);
3276 em2
= list_entry(b
, struct extent_map
, list
);
3278 if (em1
->start
< em2
->start
)
3280 else if (em1
->start
> em2
->start
)
3285 static int log_one_extent(struct btrfs_trans_handle
*trans
,
3286 struct inode
*inode
, struct btrfs_root
*root
,
3287 struct extent_map
*em
, struct btrfs_path
*path
)
3289 struct btrfs_root
*log
= root
->log_root
;
3290 struct btrfs_file_extent_item
*fi
;
3291 struct extent_buffer
*leaf
;
3292 struct btrfs_ordered_extent
*ordered
;
3293 struct list_head ordered_sums
;
3294 struct btrfs_map_token token
;
3295 struct btrfs_key key
;
3296 u64 mod_start
= em
->mod_start
;
3297 u64 mod_len
= em
->mod_len
;
3300 u64 extent_offset
= em
->start
- em
->orig_start
;
3303 int index
= log
->log_transid
% 2;
3304 bool skip_csum
= BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
;
3306 ret
= __btrfs_drop_extents(trans
, log
, inode
, path
, em
->start
,
3307 em
->start
+ em
->len
, NULL
, 0);
3311 INIT_LIST_HEAD(&ordered_sums
);
3312 btrfs_init_map_token(&token
);
3313 key
.objectid
= btrfs_ino(inode
);
3314 key
.type
= BTRFS_EXTENT_DATA_KEY
;
3315 key
.offset
= em
->start
;
3317 ret
= btrfs_insert_empty_item(trans
, log
, path
, &key
, sizeof(*fi
));
3320 leaf
= path
->nodes
[0];
3321 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
3322 struct btrfs_file_extent_item
);
3324 btrfs_set_token_file_extent_generation(leaf
, fi
, em
->generation
,
3326 if (test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
)) {
3328 btrfs_set_token_file_extent_type(leaf
, fi
,
3329 BTRFS_FILE_EXTENT_PREALLOC
,
3332 btrfs_set_token_file_extent_type(leaf
, fi
,
3333 BTRFS_FILE_EXTENT_REG
,
3335 if (em
->block_start
== 0)
3339 block_len
= max(em
->block_len
, em
->orig_block_len
);
3340 if (em
->compress_type
!= BTRFS_COMPRESS_NONE
) {
3341 btrfs_set_token_file_extent_disk_bytenr(leaf
, fi
,
3344 btrfs_set_token_file_extent_disk_num_bytes(leaf
, fi
, block_len
,
3346 } else if (em
->block_start
< EXTENT_MAP_LAST_BYTE
) {
3347 btrfs_set_token_file_extent_disk_bytenr(leaf
, fi
,
3349 extent_offset
, &token
);
3350 btrfs_set_token_file_extent_disk_num_bytes(leaf
, fi
, block_len
,
3353 btrfs_set_token_file_extent_disk_bytenr(leaf
, fi
, 0, &token
);
3354 btrfs_set_token_file_extent_disk_num_bytes(leaf
, fi
, 0,
3358 btrfs_set_token_file_extent_offset(leaf
, fi
,
3359 em
->start
- em
->orig_start
,
3361 btrfs_set_token_file_extent_num_bytes(leaf
, fi
, em
->len
, &token
);
3362 btrfs_set_token_file_extent_ram_bytes(leaf
, fi
, em
->ram_bytes
, &token
);
3363 btrfs_set_token_file_extent_compression(leaf
, fi
, em
->compress_type
,
3365 btrfs_set_token_file_extent_encryption(leaf
, fi
, 0, &token
);
3366 btrfs_set_token_file_extent_other_encoding(leaf
, fi
, 0, &token
);
3367 btrfs_mark_buffer_dirty(leaf
);
3369 btrfs_release_path(path
);
3377 if (em
->compress_type
) {
3379 csum_len
= block_len
;
3383 * First check and see if our csums are on our outstanding ordered
3387 spin_lock_irq(&log
->log_extents_lock
[index
]);
3388 list_for_each_entry(ordered
, &log
->logged_list
[index
], log_list
) {
3389 struct btrfs_ordered_sum
*sum
;
3394 if (ordered
->inode
!= inode
)
3397 if (ordered
->file_offset
+ ordered
->len
<= mod_start
||
3398 mod_start
+ mod_len
<= ordered
->file_offset
)
3402 * We are going to copy all the csums on this ordered extent, so
3403 * go ahead and adjust mod_start and mod_len in case this
3404 * ordered extent has already been logged.
3406 if (ordered
->file_offset
> mod_start
) {
3407 if (ordered
->file_offset
+ ordered
->len
>=
3408 mod_start
+ mod_len
)
3409 mod_len
= ordered
->file_offset
- mod_start
;
3411 * If we have this case
3413 * |--------- logged extent ---------|
3414 * |----- ordered extent ----|
3416 * Just don't mess with mod_start and mod_len, we'll
3417 * just end up logging more csums than we need and it
3421 if (ordered
->file_offset
+ ordered
->len
<
3422 mod_start
+ mod_len
) {
3423 mod_len
= (mod_start
+ mod_len
) -
3424 (ordered
->file_offset
+ ordered
->len
);
3425 mod_start
= ordered
->file_offset
+
3433 * To keep us from looping for the above case of an ordered
3434 * extent that falls inside of the logged extent.
3436 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM
,
3439 atomic_inc(&ordered
->refs
);
3440 spin_unlock_irq(&log
->log_extents_lock
[index
]);
3442 * we've dropped the lock, we must either break or
3443 * start over after this.
3446 wait_event(ordered
->wait
, ordered
->csum_bytes_left
== 0);
3448 list_for_each_entry(sum
, &ordered
->list
, list
) {
3449 ret
= btrfs_csum_file_blocks(trans
, log
, sum
);
3451 btrfs_put_ordered_extent(ordered
);
3455 btrfs_put_ordered_extent(ordered
);
3459 spin_unlock_irq(&log
->log_extents_lock
[index
]);
3462 if (!mod_len
|| ret
)
3465 csum_offset
= mod_start
- em
->start
;
3468 /* block start is already adjusted for the file extent offset. */
3469 ret
= btrfs_lookup_csums_range(log
->fs_info
->csum_root
,
3470 em
->block_start
+ csum_offset
,
3471 em
->block_start
+ csum_offset
+
3472 csum_len
- 1, &ordered_sums
, 0);
3476 while (!list_empty(&ordered_sums
)) {
3477 struct btrfs_ordered_sum
*sums
= list_entry(ordered_sums
.next
,
3478 struct btrfs_ordered_sum
,
3481 ret
= btrfs_csum_file_blocks(trans
, log
, sums
);
3482 list_del(&sums
->list
);
3489 static int btrfs_log_changed_extents(struct btrfs_trans_handle
*trans
,
3490 struct btrfs_root
*root
,
3491 struct inode
*inode
,
3492 struct btrfs_path
*path
)
3494 struct extent_map
*em
, *n
;
3495 struct list_head extents
;
3496 struct extent_map_tree
*tree
= &BTRFS_I(inode
)->extent_tree
;
3501 INIT_LIST_HEAD(&extents
);
3503 write_lock(&tree
->lock
);
3504 test_gen
= root
->fs_info
->last_trans_committed
;
3506 list_for_each_entry_safe(em
, n
, &tree
->modified_extents
, list
) {
3507 list_del_init(&em
->list
);
3510 * Just an arbitrary number, this can be really CPU intensive
3511 * once we start getting a lot of extents, and really once we
3512 * have a bunch of extents we just want to commit since it will
3515 if (++num
> 32768) {
3516 list_del_init(&tree
->modified_extents
);
3521 if (em
->generation
<= test_gen
)
3523 /* Need a ref to keep it from getting evicted from cache */
3524 atomic_inc(&em
->refs
);
3525 set_bit(EXTENT_FLAG_LOGGING
, &em
->flags
);
3526 list_add_tail(&em
->list
, &extents
);
3530 list_sort(NULL
, &extents
, extent_cmp
);
3533 while (!list_empty(&extents
)) {
3534 em
= list_entry(extents
.next
, struct extent_map
, list
);
3536 list_del_init(&em
->list
);
3539 * If we had an error we just need to delete everybody from our
3543 clear_em_logging(tree
, em
);
3544 free_extent_map(em
);
3548 write_unlock(&tree
->lock
);
3550 ret
= log_one_extent(trans
, inode
, root
, em
, path
);
3551 write_lock(&tree
->lock
);
3552 clear_em_logging(tree
, em
);
3553 free_extent_map(em
);
3555 WARN_ON(!list_empty(&extents
));
3556 write_unlock(&tree
->lock
);
3558 btrfs_release_path(path
);
3562 /* log a single inode in the tree log.
3563 * At least one parent directory for this inode must exist in the tree
3564 * or be logged already.
3566 * Any items from this inode changed by the current transaction are copied
3567 * to the log tree. An extra reference is taken on any extents in this
3568 * file, allowing us to avoid a whole pile of corner cases around logging
3569 * blocks that have been removed from the tree.
3571 * See LOG_INODE_ALL and related defines for a description of what inode_only
3574 * This handles both files and directories.
3576 static int btrfs_log_inode(struct btrfs_trans_handle
*trans
,
3577 struct btrfs_root
*root
, struct inode
*inode
,
3580 struct btrfs_path
*path
;
3581 struct btrfs_path
*dst_path
;
3582 struct btrfs_key min_key
;
3583 struct btrfs_key max_key
;
3584 struct btrfs_root
*log
= root
->log_root
;
3585 struct extent_buffer
*src
= NULL
;
3589 int ins_start_slot
= 0;
3591 bool fast_search
= false;
3592 u64 ino
= btrfs_ino(inode
);
3594 path
= btrfs_alloc_path();
3597 dst_path
= btrfs_alloc_path();
3599 btrfs_free_path(path
);
3603 min_key
.objectid
= ino
;
3604 min_key
.type
= BTRFS_INODE_ITEM_KEY
;
3607 max_key
.objectid
= ino
;
3610 /* today the code can only do partial logging of directories */
3611 if (S_ISDIR(inode
->i_mode
) ||
3612 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC
,
3613 &BTRFS_I(inode
)->runtime_flags
) &&
3614 inode_only
== LOG_INODE_EXISTS
))
3615 max_key
.type
= BTRFS_XATTR_ITEM_KEY
;
3617 max_key
.type
= (u8
)-1;
3618 max_key
.offset
= (u64
)-1;
3620 /* Only run delayed items if we are a dir or a new file */
3621 if (S_ISDIR(inode
->i_mode
) ||
3622 BTRFS_I(inode
)->generation
> root
->fs_info
->last_trans_committed
) {
3623 ret
= btrfs_commit_inode_delayed_items(trans
, inode
);
3625 btrfs_free_path(path
);
3626 btrfs_free_path(dst_path
);
3631 mutex_lock(&BTRFS_I(inode
)->log_mutex
);
3633 btrfs_get_logged_extents(log
, inode
);
3636 * a brute force approach to making sure we get the most uptodate
3637 * copies of everything.
3639 if (S_ISDIR(inode
->i_mode
)) {
3640 int max_key_type
= BTRFS_DIR_LOG_INDEX_KEY
;
3642 if (inode_only
== LOG_INODE_EXISTS
)
3643 max_key_type
= BTRFS_XATTR_ITEM_KEY
;
3644 ret
= drop_objectid_items(trans
, log
, path
, ino
, max_key_type
);
3646 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC
,
3647 &BTRFS_I(inode
)->runtime_flags
)) {
3648 clear_bit(BTRFS_INODE_COPY_EVERYTHING
,
3649 &BTRFS_I(inode
)->runtime_flags
);
3650 ret
= btrfs_truncate_inode_items(trans
, log
,
3652 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING
,
3653 &BTRFS_I(inode
)->runtime_flags
)) {
3654 if (inode_only
== LOG_INODE_ALL
)
3656 max_key
.type
= BTRFS_XATTR_ITEM_KEY
;
3657 ret
= drop_objectid_items(trans
, log
, path
, ino
,
3660 if (inode_only
== LOG_INODE_ALL
)
3662 ret
= log_inode_item(trans
, log
, dst_path
, inode
);
3675 path
->keep_locks
= 1;
3679 ret
= btrfs_search_forward(root
, &min_key
, &max_key
,
3680 path
, trans
->transid
);
3684 /* note, ins_nr might be > 0 here, cleanup outside the loop */
3685 if (min_key
.objectid
!= ino
)
3687 if (min_key
.type
> max_key
.type
)
3690 src
= path
->nodes
[0];
3691 if (ins_nr
&& ins_start_slot
+ ins_nr
== path
->slots
[0]) {
3694 } else if (!ins_nr
) {
3695 ins_start_slot
= path
->slots
[0];
3700 ret
= copy_items(trans
, inode
, dst_path
, src
, ins_start_slot
,
3701 ins_nr
, inode_only
);
3707 ins_start_slot
= path
->slots
[0];
3710 nritems
= btrfs_header_nritems(path
->nodes
[0]);
3712 if (path
->slots
[0] < nritems
) {
3713 btrfs_item_key_to_cpu(path
->nodes
[0], &min_key
,
3718 ret
= copy_items(trans
, inode
, dst_path
, src
,
3720 ins_nr
, inode_only
);
3727 btrfs_release_path(path
);
3729 if (min_key
.offset
< (u64
)-1)
3731 else if (min_key
.type
< (u8
)-1)
3733 else if (min_key
.objectid
< (u64
)-1)
3739 ret
= copy_items(trans
, inode
, dst_path
, src
, ins_start_slot
,
3740 ins_nr
, inode_only
);
3750 btrfs_release_path(dst_path
);
3751 ret
= btrfs_log_changed_extents(trans
, root
, inode
, dst_path
);
3757 struct extent_map_tree
*tree
= &BTRFS_I(inode
)->extent_tree
;
3758 struct extent_map
*em
, *n
;
3760 write_lock(&tree
->lock
);
3761 list_for_each_entry_safe(em
, n
, &tree
->modified_extents
, list
)
3762 list_del_init(&em
->list
);
3763 write_unlock(&tree
->lock
);
3766 if (inode_only
== LOG_INODE_ALL
&& S_ISDIR(inode
->i_mode
)) {
3767 btrfs_release_path(path
);
3768 btrfs_release_path(dst_path
);
3769 ret
= log_directory_changes(trans
, root
, inode
, path
, dst_path
);
3775 BTRFS_I(inode
)->logged_trans
= trans
->transid
;
3776 BTRFS_I(inode
)->last_log_commit
= BTRFS_I(inode
)->last_sub_trans
;
3779 btrfs_free_logged_extents(log
, log
->log_transid
);
3780 mutex_unlock(&BTRFS_I(inode
)->log_mutex
);
3782 btrfs_free_path(path
);
3783 btrfs_free_path(dst_path
);
3788 * follow the dentry parent pointers up the chain and see if any
3789 * of the directories in it require a full commit before they can
3790 * be logged. Returns zero if nothing special needs to be done or 1 if
3791 * a full commit is required.
3793 static noinline
int check_parent_dirs_for_sync(struct btrfs_trans_handle
*trans
,
3794 struct inode
*inode
,
3795 struct dentry
*parent
,
3796 struct super_block
*sb
,
3800 struct btrfs_root
*root
;
3801 struct dentry
*old_parent
= NULL
;
3804 * for regular files, if its inode is already on disk, we don't
3805 * have to worry about the parents at all. This is because
3806 * we can use the last_unlink_trans field to record renames
3807 * and other fun in this file.
3809 if (S_ISREG(inode
->i_mode
) &&
3810 BTRFS_I(inode
)->generation
<= last_committed
&&
3811 BTRFS_I(inode
)->last_unlink_trans
<= last_committed
)
3814 if (!S_ISDIR(inode
->i_mode
)) {
3815 if (!parent
|| !parent
->d_inode
|| sb
!= parent
->d_inode
->i_sb
)
3817 inode
= parent
->d_inode
;
3821 BTRFS_I(inode
)->logged_trans
= trans
->transid
;
3824 if (BTRFS_I(inode
)->last_unlink_trans
> last_committed
) {
3825 root
= BTRFS_I(inode
)->root
;
3828 * make sure any commits to the log are forced
3829 * to be full commits
3831 root
->fs_info
->last_trans_log_full_commit
=
3837 if (!parent
|| !parent
->d_inode
|| sb
!= parent
->d_inode
->i_sb
)
3840 if (IS_ROOT(parent
))
3843 parent
= dget_parent(parent
);
3845 old_parent
= parent
;
3846 inode
= parent
->d_inode
;
3855 * helper function around btrfs_log_inode to make sure newly created
3856 * parent directories also end up in the log. A minimal inode and backref
3857 * only logging is done of any parent directories that are older than
3858 * the last committed transaction
3860 static int btrfs_log_inode_parent(struct btrfs_trans_handle
*trans
,
3861 struct btrfs_root
*root
, struct inode
*inode
,
3862 struct dentry
*parent
, int exists_only
)
3864 int inode_only
= exists_only
? LOG_INODE_EXISTS
: LOG_INODE_ALL
;
3865 struct super_block
*sb
;
3866 struct dentry
*old_parent
= NULL
;
3868 u64 last_committed
= root
->fs_info
->last_trans_committed
;
3872 if (btrfs_test_opt(root
, NOTREELOG
)) {
3877 if (root
->fs_info
->last_trans_log_full_commit
>
3878 root
->fs_info
->last_trans_committed
) {
3883 if (root
!= BTRFS_I(inode
)->root
||
3884 btrfs_root_refs(&root
->root_item
) == 0) {
3889 ret
= check_parent_dirs_for_sync(trans
, inode
, parent
,
3890 sb
, last_committed
);
3894 if (btrfs_inode_in_log(inode
, trans
->transid
)) {
3895 ret
= BTRFS_NO_LOG_SYNC
;
3899 ret
= start_log_trans(trans
, root
);
3903 ret
= btrfs_log_inode(trans
, root
, inode
, inode_only
);
3908 * for regular files, if its inode is already on disk, we don't
3909 * have to worry about the parents at all. This is because
3910 * we can use the last_unlink_trans field to record renames
3911 * and other fun in this file.
3913 if (S_ISREG(inode
->i_mode
) &&
3914 BTRFS_I(inode
)->generation
<= last_committed
&&
3915 BTRFS_I(inode
)->last_unlink_trans
<= last_committed
) {
3920 inode_only
= LOG_INODE_EXISTS
;
3922 if (!parent
|| !parent
->d_inode
|| sb
!= parent
->d_inode
->i_sb
)
3925 inode
= parent
->d_inode
;
3926 if (root
!= BTRFS_I(inode
)->root
)
3929 if (BTRFS_I(inode
)->generation
>
3930 root
->fs_info
->last_trans_committed
) {
3931 ret
= btrfs_log_inode(trans
, root
, inode
, inode_only
);
3935 if (IS_ROOT(parent
))
3938 parent
= dget_parent(parent
);
3940 old_parent
= parent
;
3946 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
3949 btrfs_end_log_trans(root
);
3955 * it is not safe to log dentry if the chunk root has added new
3956 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
3957 * If this returns 1, you must commit the transaction to safely get your
3960 int btrfs_log_dentry_safe(struct btrfs_trans_handle
*trans
,
3961 struct btrfs_root
*root
, struct dentry
*dentry
)
3963 struct dentry
*parent
= dget_parent(dentry
);
3966 ret
= btrfs_log_inode_parent(trans
, root
, dentry
->d_inode
, parent
, 0);
3973 * should be called during mount to recover any replay any log trees
3976 int btrfs_recover_log_trees(struct btrfs_root
*log_root_tree
)
3979 struct btrfs_path
*path
;
3980 struct btrfs_trans_handle
*trans
;
3981 struct btrfs_key key
;
3982 struct btrfs_key found_key
;
3983 struct btrfs_key tmp_key
;
3984 struct btrfs_root
*log
;
3985 struct btrfs_fs_info
*fs_info
= log_root_tree
->fs_info
;
3986 struct walk_control wc
= {
3987 .process_func
= process_one_buffer
,
3991 path
= btrfs_alloc_path();
3995 fs_info
->log_root_recovering
= 1;
3997 trans
= btrfs_start_transaction(fs_info
->tree_root
, 0);
3998 if (IS_ERR(trans
)) {
3999 ret
= PTR_ERR(trans
);
4006 ret
= walk_log_tree(trans
, log_root_tree
, &wc
);
4008 btrfs_error(fs_info
, ret
, "Failed to pin buffers while "
4009 "recovering log root tree.");
4014 key
.objectid
= BTRFS_TREE_LOG_OBJECTID
;
4015 key
.offset
= (u64
)-1;
4016 btrfs_set_key_type(&key
, BTRFS_ROOT_ITEM_KEY
);
4019 ret
= btrfs_search_slot(NULL
, log_root_tree
, &key
, path
, 0, 0);
4022 btrfs_error(fs_info
, ret
,
4023 "Couldn't find tree log root.");
4027 if (path
->slots
[0] == 0)
4031 btrfs_item_key_to_cpu(path
->nodes
[0], &found_key
,
4033 btrfs_release_path(path
);
4034 if (found_key
.objectid
!= BTRFS_TREE_LOG_OBJECTID
)
4037 log
= btrfs_read_fs_root(log_root_tree
, &found_key
);
4040 btrfs_error(fs_info
, ret
,
4041 "Couldn't read tree log root.");
4045 tmp_key
.objectid
= found_key
.offset
;
4046 tmp_key
.type
= BTRFS_ROOT_ITEM_KEY
;
4047 tmp_key
.offset
= (u64
)-1;
4049 wc
.replay_dest
= btrfs_read_fs_root_no_name(fs_info
, &tmp_key
);
4050 if (IS_ERR(wc
.replay_dest
)) {
4051 ret
= PTR_ERR(wc
.replay_dest
);
4052 free_extent_buffer(log
->node
);
4053 free_extent_buffer(log
->commit_root
);
4055 btrfs_error(fs_info
, ret
, "Couldn't read target root "
4056 "for tree log recovery.");
4060 wc
.replay_dest
->log_root
= log
;
4061 btrfs_record_root_in_trans(trans
, wc
.replay_dest
);
4062 ret
= walk_log_tree(trans
, log
, &wc
);
4064 if (!ret
&& wc
.stage
== LOG_WALK_REPLAY_ALL
) {
4065 ret
= fixup_inode_link_counts(trans
, wc
.replay_dest
,
4069 key
.offset
= found_key
.offset
- 1;
4070 wc
.replay_dest
->log_root
= NULL
;
4071 free_extent_buffer(log
->node
);
4072 free_extent_buffer(log
->commit_root
);
4078 if (found_key
.offset
== 0)
4081 btrfs_release_path(path
);
4083 /* step one is to pin it all, step two is to replay just inodes */
4086 wc
.process_func
= replay_one_buffer
;
4087 wc
.stage
= LOG_WALK_REPLAY_INODES
;
4090 /* step three is to replay everything */
4091 if (wc
.stage
< LOG_WALK_REPLAY_ALL
) {
4096 btrfs_free_path(path
);
4098 /* step 4: commit the transaction, which also unpins the blocks */
4099 ret
= btrfs_commit_transaction(trans
, fs_info
->tree_root
);
4103 free_extent_buffer(log_root_tree
->node
);
4104 log_root_tree
->log_root
= NULL
;
4105 fs_info
->log_root_recovering
= 0;
4106 kfree(log_root_tree
);
4111 btrfs_end_transaction(wc
.trans
, fs_info
->tree_root
);
4112 btrfs_free_path(path
);
4117 * there are some corner cases where we want to force a full
4118 * commit instead of allowing a directory to be logged.
4120 * They revolve around files there were unlinked from the directory, and
4121 * this function updates the parent directory so that a full commit is
4122 * properly done if it is fsync'd later after the unlinks are done.
/*
 * btrfs_record_unlink_dir - stamp the current transaction id on the inodes
 * involved in an unlink so later logging can tell an unlink happened.
 *
 * @trans: transaction handle; only trans->transid is read here.
 * @dir:   directory the name is being removed from.
 * @inode: inode being unlinked.
 *
 * Visible effects:
 *  - for a regular file (S_ISREG), BTRFS_I(inode)->last_unlink_trans is set
 *    to trans->transid;
 *  - BTRFS_I(dir)->last_unlink_trans is set to trans->transid by the last
 *    statement.
 *
 * NOTE(review): this copy of the function is incomplete.  The parameter list
 * ends on a dangling comma and the embedded numbering jumps from 4125 to
 * 4129, so at least one trailing parameter and the opening of the body are
 * missing.  The two "logged_trans == transid" tests below have no visible
 * body -- presumably early exits that skip the final assignment; confirm
 * against a pristine copy of the file before relying on this description.
 */
4124 void btrfs_record_unlink_dir(struct btrfs_trans_handle
*trans
,
4125 struct inode
*dir
, struct inode
*inode
,
4129 * when we're logging a file, if it hasn't been renamed
4130 * or unlinked, and its inode is fully committed on disk,
4131 * we don't have to worry about walking up the directory chain
4132 * to log its parents.
4134 * So, we use the last_unlink_trans field to put this transid
4135 * into the file. When the file is logged we check it and
4136 * don't log the parents if the file is fully on disk.
/* only regular files get the per-inode unlink stamp */
4138 if (S_ISREG(inode
->i_mode
))
4139 BTRFS_I(inode
)->last_unlink_trans
= trans
->transid
;
4142 * if this directory was already logged any new
4143 * names for this file/dir will get recorded
/* directory logged in this same transaction (guarded statement not visible) */
4146 if (BTRFS_I(dir
)->logged_trans
== trans
->transid
)
4150 * if the inode we're about to unlink was logged,
4151 * the log will be properly updated for any new names
/* inode logged in this same transaction (guarded statement not visible) */
4153 if (BTRFS_I(inode
)->logged_trans
== trans
->transid
)
4157 * when renaming files across directories, if the directory
4158 * there we're unlinking from gets fsync'd later on, there's
4159 * no way to find the destination directory later and fsync it
4160 * properly. So, we have to be conservative and force commits
4161 * so the new name gets discovered.
4166 /* we can safely do the unlink without any special recording */
/* record on the directory that an unlink happened in this transaction */
4170 BTRFS_I(dir
)->last_unlink_trans
= trans
->transid
;
4174 * Call this after adding a new name for a file and it will properly
4175 * update the log to reflect the new name.
4177 * It will return zero if all goes well, and it will return 1 if a
4178 * full transaction commit is required.
/*
 * btrfs_log_new_name - log a newly added name for @inode.
 *
 * @trans:   transaction handle; trans->transid is read here and the handle
 *           is passed on to btrfs_log_inode_parent().
 * @inode:   inode that gained the new name; its root (BTRFS_I(inode)->root)
 *           is the root used for logging.
 * @old_dir: directory the inode was renamed from; may be NULL (the code
 *           tests !old_dir before dereferencing it).
 * @parent:  dentry handed unchanged to btrfs_log_inode_parent().
 *
 * On the visible path the return value is whatever
 * btrfs_log_inode_parent(trans, root, inode, parent, 1) returns.
 *
 * NOTE(review): the statement guarded by the "not logged since the last
 * committed transaction" test is missing from this copy (embedded numbering
 * jumps from 4200 to 4203) -- presumably an early return without logging;
 * confirm against a pristine copy of the file.
 */
4180 int btrfs_log_new_name(struct btrfs_trans_handle
*trans
,
4181 struct inode
*inode
, struct inode
*old_dir
,
4182 struct dentry
*parent
)
4184 struct btrfs_root
* root
= BTRFS_I(inode
)->root
;
4187 * this will force the logging code to walk the dentry chain
/* regular files only: stamp the inode with the current transaction id */
4190 if (S_ISREG(inode
->i_mode
))
4191 BTRFS_I(inode
)->last_unlink_trans
= trans
->transid
;
4194 * if this inode hasn't been logged and directory we're renaming it
4195 * from hasn't been logged, we don't need to log it
/*
 * true when neither the inode nor old_dir (if one was given) has a
 * logged_trans newer than fs_info->last_trans_committed
 */
4197 if (BTRFS_I(inode
)->logged_trans
<=
4198 root
->fs_info
->last_trans_committed
&&
4199 (!old_dir
|| BTRFS_I(old_dir
)->logged_trans
<=
4200 root
->fs_info
->last_trans_committed
))
/* otherwise hand off to the parent-logging helper and return its result */
4203 return btrfs_log_inode_parent(trans
, root
, inode
, parent
, 1);