/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - To enhance the performance, better read-ahead strategies for the
 *    extent-tree can be employed.
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 *  - make the prefetch cancellable
 */
struct scrub_bio;
struct scrub_page;
struct scrub_dev;

static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);
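/*
 * A scrub_dev owns a small, fixed pool of scrub_bios (SCRUB_BIOS_PER_DEV),
 * each backed by a bio with SCRUB_PAGES_PER_BIO preallocated pages.  Free
 * entries are chained through first_free/next_free, in_flight counts the
 * bios currently submitted and list_wait is used to wait for a free slot.
 */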
#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */

struct scrub_page {
	u64			flags;  /* extent flags */
	u64			generation;
	u64			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	int			count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	/* statistics */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};
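/* drop all checksums still queued on sdev->csum_list */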
static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;
	int j;
	struct page *last_page;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];
		struct bio *bio;

		if (!sbio)
			break;

		bio = sbio->bio;
		if (bio) {
			last_page = NULL;
			for (j = 0; j < bio->bi_vcnt; ++j) {
				if (bio->bi_io_vec[j].bv_page == last_page)
					continue;
				last_page = bio->bi_io_vec[j].bv_page;
				__free_page(last_page);
			}
			bio_put(bio);
		}
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}
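/*
 * allocate a scrub_dev for @dev and preallocate its bio pool; every bio is
 * filled with SCRUB_PAGES_PER_BIO pages and chained into the free list
 */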
static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int		i;
	int		j;
	int		ret;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct bio *bio;
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
		if (!bio)
			goto nomem;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->bio = bio;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;
		bio->bi_private = sdev->bios[i];
		bio->bi_end_io = scrub_bio_end_io;
		bio->bi_bdev = dev->bdev;

		for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
			struct page *page;

			page = alloc_page(GFP_NOFS);
			if (!page)
				goto nomem;

			ret = bio_add_page(bio, page, PAGE_SIZE, 0);
			if (!ret)
				goto nomem;
		}
		WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}
/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * recheck_error gets called for every page in the bio, even though only
 * one may be bad
 */
static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
			   (sbio->physical + ix * PAGE_SIZE) >> 9,
			   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
		if (scrub_fixup_check(sbio, ix) == 0)
			return;
	}

	scrub_fixup(sbio, ix);
}
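/*
 * re-verify a single page of an sbio in place; returns 0 if the page now
 * checks out, nonzero otherwise
 */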
static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}
static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}
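/*
 * try to repair a bad page: look up the other mirrors via btrfs_map_block,
 * read until one copy passes scrub_fixup_check, then (unless the scrub is
 * readonly) write the good copy back to the original location and account
 * the result as corrected or uncorrectable
 */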
static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_multi_bio *multi = NULL;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		/*
		 * nodatasum, don't try to fix anything
		 * FIXME: we can do better, open the inode and trigger a
		 * writeback
		 */
		goto uncorrectable;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &multi, 0);
	if (ret || !multi || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		return;
	}

	if (multi->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < multi->num_stripes; ++i) {
		if (i == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
				   multi->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == multi->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: fixed up at %llu\n",
		       (unsigned long long)logical);
	return;

uncorrectable:
	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
		       (unsigned long long)logical);
}
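/*
 * synchronous one-page I/O helper: read or write @page at @sector on @bdev
 * and wait for completion; returns nonzero if the bio is not uptodate
 */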
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	/* we are going to wait on this IO */
	rw |= REQ_SYNC;

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	submit_bio(rw, bio);

	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
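/* bio completion handler: defer checksumming to the scrub worker thread */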
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}
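/*
 * worker function, called for each completed scrub_bio: on a read error,
 * every page is rechecked individually and the bio is reset for reuse;
 * otherwise each page is verified against its data, tree block or super
 * block checksum and failures go through scrub_recheck_error().  Finally
 * the scrub_bio is put back on the free list and waiters are woken up.
 */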
static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	u64 flags;
	u64 logical;
	int i;
	int ret;

	if (sbio->err) {
		for (i = 0; i < sbio->count; ++i)
			scrub_recheck_error(sbio, i);

		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}

		spin_lock(&sdev->stat_lock);
		++sdev->stat.read_errors;
		spin_unlock(&sdev->stat_lock);
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			(void)scrub_checksum_super(sbio, buffer);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret)
			scrub_recheck_error(sbio, i);
	}

out:
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}
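/*
 * verify the checksum of a single data page against the csum carried in the
 * scrub_page; returns nonzero on mismatch
 */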
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}
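/*
 * verify a tree block: bytenr, generation, fsid and chunk tree uuid from the
 * header are checked in addition to the checksum over the block
 */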
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}
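/* submit the bio that is currently being assembled (sdev->curr), if any */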
static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];

	sbio->bio->bi_sector = sbio->physical >> 9;
	sbio->bio->bi_size = sbio->count * PAGE_SIZE;
	sbio->bio->bi_next = NULL;
	sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
	sbio->bio->bi_comp_cpu = -1;
	sbio->bio->bi_bdev = sdev->dev->bdev;
	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	submit_bio(0, sbio->bio);

	return 0;
}
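/*
 * queue one page for scrubbing: pages that are physically and logically
 * contiguous are collected in the current scrub_bio, which is submitted in
 * one go once it is full or @force is set
 */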
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		scrub_submit(sdev);
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;
	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force)
		scrub_submit(sdev);

	return 0;
}
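/*
 * look up the checksum for @logical in the presorted csum_list; entries that
 * lie entirely before the current position are dropped from the list
 */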
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, u64 mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}
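/*
 * scrub one stripe of a chunk: a first pass walks the extent tree merely to
 * get the relevant leaves into the page cache, then the data checksums for
 * the stripe are collected, and finally every extent intersecting the stripe
 * is handed to scrub_extent().  Cancel and pause requests are checked once
 * per stripe iteration.
 */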
static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	int start_stripe;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	u64 mirror_num;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 0;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else {
		increment = map->stripe_len;
		mirror_num = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * find all extents for each stripe and just read them to get
	 * them into the page cache
	 * FIXME: we can do better. build a more intelligent prefetching
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;
				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid >= logical + map->stripe_len)
				break;

			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
	}

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up to be about 1MB
	 */
	start_stripe = 0;
again:
	logical = base + offset + start_stripe * increment;
	for (i = start_stripe; i < nstripes; ++i) {
		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		logical += increment;
	}
	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset + start_stripe * increment;
	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
	for (i = start_stripe; i < nstripes; ++i) {
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
					   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
			scrub_free_csums(sdev);
			start_stripe = i;
			goto again;
		}

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, slot);
		if (key.objectid != logical) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;
				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}
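/*
 * walk all dev extents of the given device in the range [start, end) and
 * scrub the corresponding chunks one by one
 */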
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int	i;
	u64	bytenr;
	u64	gen;
	int	ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}
/*
 * get a reference count on fs_info->scrub_workers. start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0)
		btrfs_start_workers(&fs_info->scrub_workers, 1);
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}
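/*
 * main entry point for a scrub: checks the size assumptions, takes a
 * reference on the scrub workers, sets up a scrub_dev for the device and
 * then scrubs the super blocks followed by all chunks in [start, end)
 */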
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (root->fs_info->closing)
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}
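/*
 * pausing blocks until every running scrub has parked itself in the pause
 * loop of scrub_stripe(); continue simply drops the request and wakes them
 */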
int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);

	return 0;
}
int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}
int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}
*root
, u64 devid
,
1355 struct btrfs_scrub_progress
*progress
)
1357 struct btrfs_device
*dev
;
1358 struct scrub_dev
*sdev
= NULL
;
1360 mutex_lock(&root
->fs_info
->fs_devices
->device_list_mutex
);
1361 dev
= btrfs_find_device(root
, devid
, NULL
, NULL
);
1363 sdev
= dev
->scrub_device
;
1365 memcpy(progress
, &sdev
->stat
, sizeof(*progress
));
1366 mutex_unlock(&root
->fs_info
->fs_devices
->device_list_mutex
);
1368 return dev
? (sdev
? 0 : -ENOTCONN
) : -ENODEV
;