/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "backref.h"
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * possible.
 *
 * Future enhancements:
 *  - To enhance the performance, better read-ahead strategies for the
 *    extent-tree can be employed.
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them.
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy.
 *  - Track and record media errors, and throw out bad devices.
 *  - Add a mode to also read unallocated space.
 *  - Make the prefetch cancellable.
 */
struct scrub_bio;
struct scrub_page;
struct scrub_dev;

static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);
#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
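
/*
 * With 4k pages (the only configuration this scrub supports, see the
 * size checks in btrfs_scrub_dev()), the two constants above give
 * 16 * 4k = 64k of payload per bio and 16 * 64k = 1MB of outstanding
 * read I/O per device, which is where the figures in the comments
 * come from.
 */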
struct scrub_page {
	u64			flags;  /* extent flags */
	u64			generation;
	int			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	u64			count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	atomic_t		cancel_req;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	int			readonly;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};
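
/*
 * The sbios are kept on a simple free list threaded through next_free:
 * first_free indexes the first spare scrub_bio in bios[], each element's
 * next_free points to the following spare one, and -1 terminates the
 * chain. curr is the index of the bio currently being filled, or -1 if
 * none is.
 */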
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	char			*scratch_buf;
	char			*msg_buf;
	const char		*errstr;
	sector_t		sector;
	u64			logical;
	struct btrfs_device	*dev;
	int			msg_bufsize;
	int			scratch_bufsize;
};
static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}
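
/*
 * Free the pages attached to a bio. Consecutive bio_vecs may reference
 * the same page, so remember the last page freed and skip duplicates to
 * avoid a double free.
 */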
static void scrub_free_bio(struct bio *bio)
{
	int i;
	struct page *last_page = NULL;

	if (!bio)
		return;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		if (bio->bi_io_vec[i].bv_page == last_page)
			continue;
		last_page = bio->bi_io_vec[i].bv_page;
		__free_page(last_page);
	}
	bio_put(bio);
}
static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];

		if (!sbio)
			break;

		scrub_free_bio(sbio->bio);
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}
static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int		i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}
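
/*
 * Callback for iterate_extent_inodes() below: resolves one inode that
 * references a corrupted data extent to its path(s) and prints one
 * warning line per path found.
 */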
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	ret = inode_item_info(inum, 0, local_root, swarn->path);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
	ret = paths_from_inode(inum, ipath);
	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, swarn->dev->name,
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			ipath->fspath->str[i]);

	free_ipath(ipath);
	return 0;

err:
	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, swarn->dev->name,
		(unsigned long long)swarn->sector, root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}
static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
				int ix)
{
	struct btrfs_device *dev = sbio->sdev->dev;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u32 item_size;
	int ret;
	u64 extent_offset;
	u64 ref_root;
	u8 ref_level;
	unsigned long ptr = 0;
	const int bufsize = 4096;

	path = btrfs_alloc_path();

	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
	swarn.logical = sbio->logical + ix * PAGE_SIZE;
	swarn.errstr = errstr;
	swarn.dev = dev;
	swarn.msg_bufsize = bufsize;
	swarn.scratch_bufsize = bufsize;

	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
		goto out;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
	if (ret < 0)
		goto out;

	extent_offset = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
							&ref_root, &ref_level);
			printk(KERN_WARNING "%s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical, dev->name,
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
	} else {
		swarn.path = path;
		iterate_extent_inodes(fs_info, path, found_key.objectid,
					extent_offset,
					scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
	kfree(swarn.scratch_buf);
	kfree(swarn.msg_buf);
}
/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO. In the latter case,
 * recheck_error gets called for every page in the bio, even though only
 * one may be bad
 */
static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (sbio->err) {
		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
			if (scrub_fixup_check(sbio, ix) == 0)
				return 0;
		}
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sbio, ix);
	} else {
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sbio, ix);
	}

	spin_lock(&sdev->stat_lock);
	++sdev->stat.read_errors;
	spin_unlock(&sdev->stat_lock);

	scrub_fixup(sbio, ix);
	return 1;
}
static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	} else {
		WARN_ON(1);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}
static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}
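
/*
 * Try to repair a bad page in place: map the logical address to all of
 * its mirrors, read the other copies until one passes
 * scrub_fixup_check(), and (unless the scrub is readonly) write the good
 * copy back over the bad one.
 */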
static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_multi_bio *multi = NULL;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		/*
		 * nodatasum, don't try to fix anything
		 * FIXME: we can do better, open the inode and trigger a
		 * writeback
		 */
		goto uncorrectable;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &multi, 0);
	if (ret || !multi || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		return;
	}

	if (multi->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < multi->num_stripes; ++i) {
		if (i == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
				   multi->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == multi->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
			   (unsigned long long)logical);
	return;

uncorrectable:
	kfree(multi);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
			   "logical %llu\n", (unsigned long long)logical);
}
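
/*
 * Synchronous helper for a single-page read or write: issues the bio and
 * waits on an on-stack completion. Returns 0 on success, nonzero if the
 * bio did not end up uptodate.
 */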
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	submit_bio(rw, bio);

	/* this will also unplug the queue */
	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}
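
/*
 * Worker function, runs in process context via fs_info->scrub_workers.
 * The end_io handler above only queues the scrub_bio here, so all
 * checksumming and possible fixup happens outside interrupt context.
 */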
static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	int i;
	u64 flags;
	u64 logical;
	int ret;

	if (sbio->err) {
		ret = 0;
		for (i = 0; i < sbio->count; ++i)
			ret |= scrub_recheck_error(sbio, i);
		if (!ret) {
			spin_lock(&sdev->stat_lock);
			++sdev->stat.unverified_errors;
			spin_unlock(&sdev->stat_lock);
		}

		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			(void)scrub_checksum_super(sbio, buffer);
		} else {
			WARN_ON(1);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret) {
			ret = scrub_recheck_error(sbio, i);
			if (!ret) {
				spin_lock(&sdev->stat_lock);
				++sdev->stat.unverified_errors;
				spin_unlock(&sdev->stat_lock);
			}
		}
	}

out:
	scrub_free_bio(sbio->bio);
	sbio->bio = NULL;
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}
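
/*
 * Send the currently filled scrub_bio on its way. The pages are only
 * allocated here, just before submission, and bio_add_page() attaches
 * them to the bio; the data arrives later via scrub_bio_end_io().
 */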
static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;
	struct bio *bio;
	int i;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];

	bio = bio_alloc(GFP_NOFS, sbio->count);
	if (!bio)
		goto nomem;

	bio->bi_private = sbio;
	bio->bi_end_io = scrub_bio_end_io;
	bio->bi_bdev = sdev->dev->bdev;
	bio->bi_sector = sbio->physical >> 9;

	for (i = 0; i < sbio->count; ++i) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_NOFS);
		if (!page)
			goto nomem;

		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
		if (!ret) {
			__free_page(page);
			goto nomem;
		}
	}

	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	submit_bio(READ, bio);

	return 0;

nomem:
	scrub_free_bio(bio);

	return -ENOMEM;
}
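
/*
 * Queue one page for scrubbing. Pages that are physically and logically
 * contiguous accumulate in the current scrub_bio; a discontiguity, a
 * full bio (SCRUB_PAGES_PER_BIO pages) or the force flag triggers
 * submission.
 */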
static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		sbio->physical = physical;
		sbio->logical = logical;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;
	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
	}

	return 0;
}
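
/*
 * Look up the data checksum for a page at the given logical address.
 * csum_list was prefilled in logical order by btrfs_lookup_csums_range()
 * in scrub_stripe(), so sums that end before the current position can
 * be discarded as the scrub advances. Returns 1 and copies the csum out
 * if one was found.
 */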
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, u64 mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}
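
/*
 * Scrub one stripe of a chunk on this device. num selects which of the
 * chunk's stripes lives here; offset and increment translate that into
 * the device's share of the chunk's logical address space. For example,
 * with RAID0 over 4 devices and a 64k stripe_len, stripe num 1 starts at
 * base + 64k and advances in steps of 4 * 64k, visiting every fourth
 * stripe. For RAID10, groups of sub_stripes share the same logical range
 * and mirror_num picks the copy this device holds.
 */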
static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	struct blk_plug plug;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	int start_stripe;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	u64 mirror_num;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 0;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes;
	} else {
		increment = map->stripe_len;
		mirror_num = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * find all extents for each stripe and just read them to get
	 * them into the page cache
	 * FIXME: we can do better. build a more intelligent prefetching
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out_noplug;

		/*
		 * we might miss half an extent here, but that doesn't matter,
		 * as it's only the prefetch
		 */
		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out_noplug;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid >= logical + map->stripe_len)
				break;

			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
	}

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB
	 */
	start_stripe = 0;
	blk_start_plug(&plug);
again:
	logical = base + offset + start_stripe * increment;
	for (i = start_stripe; i < nstripes; ++i) {
		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		logical += increment;
	}
	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset + start_stripe * increment;
	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
	ret = 0;
	for (i = start_stripe; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
			scrub_free_csums(sdev);
			start_stripe = i;
			goto again;
		}

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				/* there's no smaller item, so stick with the
				 * larger one */
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);
				if (ret < 0)
					goto out;
			}
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	blk_finish_plug(&plug);
out_noplug:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	/*
	 * ret can still be 1 from search_slot or next_leaf,
	 * that's not an error
	 */
	return ret < 0 ? ret : 0;
}
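
/*
 * Scrub the superblock copies of this device. btrfs keeps up to
 * BTRFS_SUPER_MIRROR_MAX copies at fixed offsets returned by
 * btrfs_sb_offset(); copies beyond the end of the device are skipped.
 * The pages are pushed with force=1 so each one is submitted
 * immediately.
 */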
static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int	i;
	u64	bytenr;
	u64	gen;
	int	ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}
/*
 * get a reference count on fs_info->scrub_workers. start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0) {
		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
			   fs_info->thread_pool_size, &fs_info->generic_worker);
		fs_info->scrub_workers.idle_thresh = 4;
		btrfs_start_workers(&fs_info->scrub_workers, 1);
	}
	++fs_info->scrub_workers_refcnt;
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(root->fs_info))
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}
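
/*
 * The pause protocol: a pauser raises scrub_pause_req and waits until
 * the number of paused scrubs matches the number of running ones; each
 * scrub notices the request in scrub_stripe(), drains its in-flight
 * I/O, bumps scrubs_paused and blocks until the request is withdrawn
 * via btrfs_scrub_continue().
 */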
int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
	return 0;
}
int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}
int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}