/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/blkdev.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - To enhance the performance, better read-ahead strategies for the
 *    extent-tree can be employed.
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 *  - make the prefetch cancellable
 */
struct scrub_bio;
struct scrub_page;
struct scrub_dev;
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);
#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
struct scrub_page {
	u64			flags;  /* extent flags */
	u64			generation;
	u64			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	u64			count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};
static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}
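
/*
 * free a bio including its pages; consecutive bio_vecs may reference the
 * same page, so free each distinct page only once
 */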
static void scrub_free_bio(struct bio *bio)
{
	int i;
	struct page *last_page = NULL;

	if (!bio)
		return;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		if (bio->bi_io_vec[i].bv_page == last_page)
			continue;
		last_page = bio->bi_io_vec[i].bv_page;
		__free_page(last_page);
	}
	bio_put(bio);
}
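
/* tear down a scrub_dev: free its bios, any queued csums and the struct itself */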
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];

		if (!sbio)
			break;

		scrub_free_bio(sbio->bio);
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}
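
/*
 * allocate a scrub_dev and its SCRUB_BIOS_PER_DEV scrub_bios, chained
 * into a free list via next_free
 */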
static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int		i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}
/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * recheck_error gets called for every page in the bio, even though only
 * one may be bad
 */
static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;

	if (sbio->err) {
		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
			if (scrub_fixup_check(sbio, ix) == 0)
				return 0;
		}
	}

	spin_lock(&sdev->stat_lock);
	++sdev->stat.read_errors;
	spin_unlock(&sdev->stat_lock);

	scrub_fixup(sbio, ix);
	return 1;
}
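
/* re-verify page ix after a re-read; 0 means it now checks out fine */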
static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	} else {
		WARN_ON(1);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}
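
/* completion callback for the synchronous fixup/re-read I/O */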
static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}
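
/*
 * try to find a good copy of the broken page on another mirror and, unless
 * the scrub is readonly, write it back over the bad copy
 */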
static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_multi_bio *multi = NULL;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		/*
		 * nodatasum, don't try to fix anything
		 * FIXME: we can do better, open the inode and trigger a
		 * writeback
		 */
		goto uncorrectable;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &multi, 0);
	if (ret || !multi || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		return;
	}

	if (multi->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < multi->num_stripes; ++i) {
		if (i == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
				   multi->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == multi->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: fixed up at %llu\n",
		       (unsigned long long)logical);
	return;

uncorrectable:
	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	if (printk_ratelimit())
		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
		       (unsigned long long)logical);
}
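
/* synchronously read or write a single page at the given sector */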
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	submit_bio(rw, bio);

	/* this will also unplug the queue */
	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
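
/* bio completion: stash the result and defer verification to a worker */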
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}
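
/* worker function: verify all pages of a completed bio, then recycle it */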
static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	int i;
	u64 flags;
	u64 logical;
	int ret;

	if (sbio->err) {
		ret = 0;
		for (i = 0; i < sbio->count; ++i)
			ret |= scrub_recheck_error(sbio, i);
		if (!ret) {
			spin_lock(&sdev->stat_lock);
			++sdev->stat.unverified_errors;
			spin_unlock(&sdev->stat_lock);
		}

		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			BUG_ON(i);
			(void)scrub_checksum_super(sbio, buffer);
		} else {
			WARN_ON(1);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret) {
			ret = scrub_recheck_error(sbio, i);
			if (!ret) {
				spin_lock(&sdev->stat_lock);
				++sdev->stat.unverified_errors;
				spin_unlock(&sdev->stat_lock);
			}
		}
	}

out:
	scrub_free_bio(sbio->bio);
	sbio->bio = NULL;
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}
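
/* verify a data page against the csum collected from the csum tree */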
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}
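
/* verify a tree block: bytenr, generation, fsid, chunk tree uuid and csum */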
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}
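
/* verify a super block copy; errors are only counted, never fixed up here */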
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}
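
/* allocate a bio plus pages for the current scrub_bio and submit the read */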
static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;
	struct bio *bio;
	int i;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];

	bio = bio_alloc(GFP_NOFS, sbio->count);
	if (!bio)
		goto nomem;

	bio->bi_private = sbio;
	bio->bi_end_io = scrub_bio_end_io;
	bio->bi_bdev = sdev->dev->bdev;
	bio->bi_sector = sbio->physical >> 9;

	for (i = 0; i < sbio->count; ++i) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_NOFS);
		if (!page)
			goto nomem;

		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
		if (!ret) {
			__free_page(page);
			goto nomem;
		}
	}

	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	submit_bio(READ, bio);

	return 0;

nomem:
	scrub_free_bio(bio);

	return -ENOMEM;
}
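
/*
 * queue one page for scrubbing; contiguous pages are collected into one
 * bio, which is submitted when full, non-contiguous or when force is set
 */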
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;
	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
	}

	return 0;
}
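
/*
 * find the csum for a data sector in the list collected beforehand;
 * csums entirely below logical are no longer needed and get discarded
 */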
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, u64 mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}
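
/*
 * scrub one stripe of the device: prefetch the extent tree, collect the
 * data csums and then scrub each extent that falls into the stripe
 */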
static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	struct blk_plug plug;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	int start_stripe;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	u64 mirror_num;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 0;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else {
		increment = map->stripe_len;
		mirror_num = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * find all extents for each stripe and just read them to get
	 * them into the page cache
	 * FIXME: we can do better. build a more intelligent prefetching
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out_noplug;

		/*
		 * we might miss half an extent here, but that doesn't matter,
		 * as it's only the prefetch
		 */
		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out_noplug;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid >= logical + map->stripe_len)
				break;

			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		cond_resched();
	}

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB
	 */
	start_stripe = 0;
	blk_start_plug(&plug);
again:
	logical = base + offset + start_stripe * increment;
	for (i = start_stripe; i < nstripes; ++i) {
		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		logical += increment;
	}
	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset + start_stripe * increment;
	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
	ret = 0;
	for (i = start_stripe; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
			scrub_free_csums(sdev);
			start_stripe = i;
			goto again;
		}

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				/* there's no smaller item, so stick with the
				 * larger one */
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);
				if (ret < 0)
					goto out;
			}
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	blk_finish_plug(&plug);
out_noplug:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
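
/* map a chunk and scrub each of its stripes that lives on sdev->dev */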
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}
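
/*
 * walk the device's dev extents between start and end and scrub the
 * corresponding chunks
 */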
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	/*
	 * ret can still be 1 from search_slot or next_leaf,
	 * that's not an error
	 */
	return ret < 0 ? ret : 0;
}
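
/* scrub all super block copies that fit on the device */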
static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int	i;
	u64	bytenr;
	u64	gen;
	int	ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}
/*
 * get a reference count on fs_info->scrub_workers. start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0) {
		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
			   fs_info->thread_pool_size, &fs_info->generic_worker);
		fs_info->scrub_workers.idle_thresh = 4;
		btrfs_start_workers(&fs_info->scrub_workers, 1);
	}
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}
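
/* main entry point: scrub [start, end] of the given device and return when done */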
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(root->fs_info))
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}
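
/* block until all running scrubs have paused */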
int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
	return 0;
}
int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}
int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
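
/* cancel the scrub on one device and wait for it to finish */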
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}