/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/blkdev.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);
#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
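/*
 * One scrub_dev is set up per device being scrubbed. It holds a fixed pool
 * of SCRUB_BIOS_PER_DEV scrub_bios, each of which covers up to
 * SCRUB_PAGES_PER_BIO pages plus the per-page metadata in scrub_page.
 */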
struct scrub_page {
	u64			flags;	/* extent flags */
	u64			generation;
	int			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	u64			count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	atomic_t		cancel_req;
	int			readonly;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};
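/*
 * scrub_free_csums drains the checksums that were looked up ahead of time
 * (by btrfs_lookup_csums_range in scrub_stripe) but never consumed.
 */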
static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}
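/*
 * scrub_free_bio releases the pages attached to a bio. The same page can be
 * referenced by consecutive bio_vecs, so the previously freed page is
 * remembered to avoid a double free.
 */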
static void scrub_free_bio(struct bio *bio)
{
	int i;
	struct page *last_page = NULL;

	if (!bio)
		return;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		if (bio->bi_io_vec[i].bv_page == last_page)
			continue;
		last_page = bio->bi_io_vec[i].bv_page;
		__free_page(last_page);
	}
	bio_put(bio);
}
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];

		if (!sbio)
			break;

		scrub_free_bio(sbio->bio);
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}
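/*
 * scrub_setup_dev allocates the per-device scrub state and its bio pool.
 * The bios are chained through next_free into a free list anchored at
 * first_free; scrub_page pops entries and scrub_checksum pushes them back.
 */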
static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}
/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * recheck_error gets called for every page in the bio, even though only
 * one may be bad.
 */
static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	if (sbio->err) {
		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
			if (scrub_fixup_check(sbio, ix) == 0)
				return;
		}
	}

	scrub_fixup(sbio, ix);
}
static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}
static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}
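/*
 * scrub_fixup repairs a bad page: it maps the logical address to all its
 * mirrors, reads the other copies into the same page until one passes
 * verification and, unless the scrub is readonly, writes the good copy
 * back over the bad one.
 */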
static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_multi_bio *multi = NULL;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		/*
		 * nodatasum, don't try to fix anything
		 * FIXME: we can do better, open the inode and trigger a
		 * writeback
		 */
		goto uncorrectable;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &multi, 0);
	if (ret || !multi || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		return;
	}

	if (multi->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < multi->num_stripes; ++i) {
		if (i == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
				   multi->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == multi->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: fixed up at %llu\n",
		       (unsigned long long)logical);
	return;

uncorrectable:
	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
		       (unsigned long long)logical);
}
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	submit_bio(rw, bio);

	/* this will also unplug the queue */
	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
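/*
 * scrub_bio_end_io runs in bio completion context; it only records the
 * result and defers all verification work to the scrub worker threads.
 */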
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}
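/*
 * scrub_checksum is the worker entry point: it verifies every page of a
 * completed bio, hands failures to scrub_recheck_error and finally
 * recycles the sbio into the free list.
 */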
static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	int i;
	u64 flags;
	u64 logical;
	int ret;

	if (sbio->err) {
		for (i = 0; i < sbio->count; ++i)
			scrub_recheck_error(sbio, i);

		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}

		spin_lock(&sdev->stat_lock);
		++sdev->stat.read_errors;
		spin_unlock(&sdev->stat_lock);
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			BUG_ON(i);
			(void)scrub_checksum_super(sbio, buffer);
		} else {
			WARN_ON(1);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret)
			scrub_recheck_error(sbio, i);
	}

out:
	scrub_free_bio(sbio->bio);
	sbio->bio = NULL;
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}
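/*
 * scrub_checksum_data recomputes the checksum of a data page and compares
 * it against the on-disk checksum that was attached to the scrub_page.
 */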
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}
static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;
	struct bio *bio;
	int i;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];

	bio = bio_alloc(GFP_NOFS, sbio->count);
	if (!bio)
		goto nomem;

	bio->bi_private = sbio;
	bio->bi_end_io = scrub_bio_end_io;
	bio->bi_bdev = sdev->dev->bdev;
	bio->bi_sector = sbio->physical >> 9;

	for (i = 0; i < sbio->count; ++i) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_NOFS);
		if (!page)
			goto nomem;

		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
		if (!ret) {
			__free_page(page);
			goto nomem;
		}
	}

	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	submit_bio(READ, bio);

	return 0;

nomem:
	scrub_free_bio(bio);
	return -ENOMEM;
}
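/*
 * scrub_page queues one page for scrubbing. Pages that are physically and
 * logically contiguous accumulate in the current sbio; a discontinuity, a
 * full bio or the force flag triggers scrub_submit.
 */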
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;
	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
	}

	return 0;
}
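/*
 * scrub_find_csum looks up the checksum for a data block in the prefetched
 * csum_list. Entries that end before the given logical address are dropped;
 * on a hit the checksum is copied to csum and 1 is returned.
 */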
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, u64 mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}
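/*
 * scrub_stripe walks one device stripe of a chunk: it derives the stripe
 * offset, increment and mirror number from the raid level, then for each
 * stripe iteration prefetches the csums and feeds every extent found in
 * the commit root to scrub_extent.
 */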
static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	struct blk_plug plug;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	u64 mirror_num;
	struct reada_control *reada1;
	struct reada_control *reada2;
	struct btrfs_key key_start;
	struct btrfs_key key_end;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 0;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else {
		increment = map->stripe_len;
		mirror_num = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * trigger the readahead for the extent tree and the csum tree and
	 * wait for completion. During readahead, the scrub is officially
	 * paused to not hold off transaction commits.
	 */
	logical = base + offset;

	wait_event(sdev->list_wait,
		   atomic_read(&sdev->in_flight) == 0);
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	/* FIXME it might be better to start readahead at commit root */
	key_start.objectid = logical;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = (u64)0;
	key_end.objectid = base + offset + nstripes * increment;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = (u64)0;
	reada1 = btrfs_reada_add(root, &key_start, &key_end);

	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_start.type = BTRFS_EXTENT_CSUM_KEY;
	key_start.offset = logical;
	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_end.type = BTRFS_EXTENT_CSUM_KEY;
	key_end.offset = base + offset + nstripes * increment;
	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);

	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);

	mutex_lock(&fs_info->scrub_lock);
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	wake_up(&fs_info->scrub_pause_wait);

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB
	 */
	blk_start_plug(&plug);

	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
		}

		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				/* there's no smaller item, so stick with the
				 * larger one */
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);
				if (ret < 0)
					goto out;
			}
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	blk_finish_plug(&plug);
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
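/*
 * scrub_chunk resolves the chunk mapping for a dev extent and scrubs every
 * stripe of the chunk that lives on the device being scrubbed.
 */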
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}
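/*
 * scrub_enumerate_chunks walks the dev extents of the scrubbed device in
 * the commit root and scrubs each referenced chunk, holding a block group
 * reference so the chunk cannot go away in the meantime.
 */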
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	/*
	 * ret can still be 1 from search_slot or next_leaf,
	 * that's not an error
	 */
	return ret < 0 ? ret : 0;
}
static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}
/*
 * get a reference count on fs_info->scrub_workers. start workers if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0) {
		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
				   fs_info->thread_pool_size,
				   &fs_info->generic_worker);
		fs_info->scrub_workers.idle_thresh = 4;
		btrfs_start_workers(&fs_info->scrub_workers, 1);
	}
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}
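/*
 * btrfs_scrub_dev is the main entry point: it checks the size assumptions,
 * takes a worker reference, registers the scrub on the device and then
 * scrubs the super blocks followed by all dev extents in the requested
 * range.
 */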
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(root->fs_info))
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}
int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
	return 0;
}
int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}
int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}