fs/btrfs/scrub.c

   1 /*
   2  * Copyright (C) 2011 STRATO.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 02111-1307, USA.
  17  */
  18
  19 #include <linux/blkdev.h>
  20 #include <linux/ratelimit.h>
  21 #include "ctree.h"
  22 #include "volumes.h"
  23 #include "disk-io.h"
  24 #include "ordered-data.h"
  25 #include "transaction.h"
  26 #include "backref.h"
  27
  28 /*
  29  * This is only the first step towards a full-featured scrub. It reads all
  30  * extents and super blocks and verifies the checksums. In case a bad
  31  * checksum is found or the extent cannot be read, good data will be written
  32  * back if any can be found.
  33  *
  34  * Future enhancements:
  35  *  - To enhance the performance, better read-ahead strategies for the
  36  *    extent-tree can be employed.
  37  *  - In case an unrepairable extent is encountered, track which files are
  38  *    affected and report them
  39  *  - In case of a read error on files with nodatasum, map the file and read
  40  *    the extent to trigger a writeback of the good copy
  41  *  - track and record media errors, throw out bad devices
  42  *  - add a mode to also read unallocated space
  43  *  - make the prefetch cancellable
  44  */
  45
  46 struct scrub_bio;
  47 struct scrub_page;
  48 struct scrub_dev;
  49 static void scrub_bio_end_io(struct bio *bio, int err);
  50 static void scrub_checksum(struct btrfs_work *work);
  51 static int scrub_checksum_data(struct scrub_dev *sdev,
  52                                struct scrub_page *spag, void *buffer);
  53 static int scrub_checksum_tree_block(struct scrub_dev *sdev,
  54                                      struct scrub_page *spag, u64 logical,
  55                                      void *buffer);
  56 static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
  57 static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
  58 static void scrub_fixup_end_io(struct bio *bio, int err);
  59 static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
  60                           struct page *page);
  61 static void scrub_fixup(struct scrub_bio *sbio, int ix);
  62
  63 #define SCRUB_PAGES_PER_BIO     16      /* 64k per bio */
  64 #define SCRUB_BIOS_PER_DEV      16      /* 1 MB per device in flight */
  65
  66 struct scrub_page {
  67         u64                     flags;  /* extent flags */
  68         u64                     generation;
  69         int                     mirror_num;
  70         int                     have_csum;
  71         u8                      csum[BTRFS_CSUM_SIZE];
  72 };
  73
  74 struct scrub_bio {
  75         int                     index;
  76         struct scrub_dev        *sdev;
  77         struct bio              *bio;
  78         int                     err;
  79         u64                     logical;
  80         u64                     physical;
  81         struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
  82         u64                     count;
  83         int                     next_free;
  84         struct btrfs_work       work;
  85 };
  86
  87 struct scrub_dev {
  88         struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
  89         struct btrfs_device     *dev;
  90         int                     first_free;
  91         int                     curr;
  92         atomic_t                in_flight;
  93         atomic_t                fixup_cnt;
  94         spinlock_t              list_lock;
  95         wait_queue_head_t       list_wait;
  96         u16                     csum_size;
  97         struct list_head        csum_list;
  98         atomic_t                cancel_req;
  99         int                     readonly;
 100         /*
 101          * statistics
 102          */
 103         struct btrfs_scrub_progress stat;
 104         spinlock_t              stat_lock;
 105 };
 106
 107 struct scrub_fixup_nodatasum {
 108         struct scrub_dev        *sdev;
 109         u64                     logical;
 110         struct btrfs_root       *root;
 111         struct btrfs_work       work;
 112         int                     mirror_num;
 113 };
 114
 115 struct scrub_warning {
 116         struct btrfs_path       *path;
 117         u64                     extent_item_size;
 118         char                    *scratch_buf;
 119         char                    *msg_buf;
 120         const char              *errstr;
 121         sector_t                sector;
 122         u64                     logical;
 123         struct btrfs_device     *dev;
 124         int                     msg_bufsize;
 125         int                     scratch_bufsize;
 126 };
 127
 128 static void scrub_free_csums(struct scrub_dev *sdev)
 129 {
 130         while (!list_empty(&sdev->csum_list)) {
 131                 struct btrfs_ordered_sum *sum;
 132                 sum = list_first_entry(&sdev->csum_list,
 133                                        struct btrfs_ordered_sum, list);
 134                 list_del(&sum->list);
 135                 kfree(sum);
 136         }
 137 }
 138
 139 static void scrub_free_bio(struct bio *bio)
 140 {
 141         int i;
 142         struct page *last_page = NULL;
 143
 144         if (!bio)
 145                 return;
 146
 147         for (i = 0; i < bio->bi_vcnt; ++i) {
 148                 if (bio->bi_io_vec[i].bv_page == last_page)
 149                         continue;
 150                 last_page = bio->bi_io_vec[i].bv_page;
 151                 __free_page(last_page);
 152         }
 153         bio_put(bio);
 154 }
 155
 156 static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
 157 {
 158         int i;
 159
 160         if (!sdev)
 161                 return;
 162
 163         for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
 164                 struct scrub_bio *sbio = sdev->bios[i];
 165
 166                 if (!sbio)
 167                         break;
 168
 169                 scrub_free_bio(sbio->bio);
 170                 kfree(sbio);
 171         }
 172
 173         scrub_free_csums(sdev);
 174         kfree(sdev);
 175 }
 176
 177 static noinline_for_stack
 178 struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 179 {
 180         struct scrub_dev *sdev;
 181         int             i;
 182         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 183
 184         sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
 185         if (!sdev)
 186                 goto nomem;
 187         sdev->dev = dev;
 188         for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
 189                 struct scrub_bio *sbio;
 190
 191                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 192                 if (!sbio)
 193                         goto nomem;
 194                 sdev->bios[i] = sbio;
 195
 196                 sbio->index = i;
 197                 sbio->sdev = sdev;
 198                 sbio->count = 0;
 199                 sbio->work.func = scrub_checksum;
 200
 201                 if (i != SCRUB_BIOS_PER_DEV-1)
 202                         sdev->bios[i]->next_free = i + 1;
 203                 else
 204                         sdev->bios[i]->next_free = -1;
 205         }
 206         sdev->first_free = 0;
 207         sdev->curr = -1;
 208         atomic_set(&sdev->in_flight, 0);
 209         atomic_set(&sdev->fixup_cnt, 0);
 210         atomic_set(&sdev->cancel_req, 0);
 211         sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
 212         INIT_LIST_HEAD(&sdev->csum_list);
 213
 214         spin_lock_init(&sdev->list_lock);
 215         spin_lock_init(&sdev->stat_lock);
 216         init_waitqueue_head(&sdev->list_wait);
 217         return sdev;
 218
 219 nomem:
 220         scrub_free_dev(sdev);
 221         return ERR_PTR(-ENOMEM);
 222 }
 223
 224 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 225 {
 226         u64 isize;
 227         u32 nlink;
 228         int ret;
 229         int i;
 230         struct extent_buffer *eb;
 231         struct btrfs_inode_item *inode_item;
 232         struct scrub_warning *swarn = ctx;
 233         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 234         struct inode_fs_paths *ipath = NULL;
 235         struct btrfs_root *local_root;
 236         struct btrfs_key root_key;
 237
 238         root_key.objectid = root;
 239         root_key.type = BTRFS_ROOT_ITEM_KEY;
 240         root_key.offset = (u64)-1;
 241         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 242         if (IS_ERR(local_root)) {
 243                 ret = PTR_ERR(local_root);
 244                 goto err;
 245         }
 246
 247         ret = inode_item_info(inum, 0, local_root, swarn->path);
 248         if (ret) {
 249                 btrfs_release_path(swarn->path);
 250                 goto err;
 251         }
 252
 253         eb = swarn->path->nodes[0];
 254         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 255                                         struct btrfs_inode_item);
 256         isize = btrfs_inode_size(eb, inode_item);
 257         nlink = btrfs_inode_nlink(eb, inode_item);
 258         btrfs_release_path(swarn->path);
 259
 260         ipath = init_ipath(4096, local_root, swarn->path);
 261         ret = paths_from_inode(inum, ipath);
 262
 263         if (ret < 0)
 264                 goto err;
 265
 266         /*
 267          * we deliberately ignore the bit ipath might have been too small to
 268          * hold all of the paths here
 269          */
 270         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 271                 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
 272                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
 273                         "length %llu, links %u (path: %s)\n", swarn->errstr,
 274                         swarn->logical, swarn->dev->name,
 275                         (unsigned long long)swarn->sector, root, inum, offset,
 276                         min(isize - offset, (u64)PAGE_SIZE), nlink,
 277                         ipath->fspath->str[i]);
 278
 279         free_ipath(ipath);
 280         return 0;
 281
 282 err:
 283         printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
 284                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 285                 "resolving failed with ret=%d\n", swarn->errstr,
 286                 swarn->logical, swarn->dev->name,
 287                 (unsigned long long)swarn->sector, root, inum, offset, ret);
 288
 289         free_ipath(ipath);
 290         return 0;
 291 }
 292
 293 static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
 294                                 int ix)
 295 {
 296         struct btrfs_device *dev = sbio->sdev->dev;
 297         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 298         struct btrfs_path *path;
 299         struct btrfs_key found_key;
 300         struct extent_buffer *eb;
 301         struct btrfs_extent_item *ei;
 302         struct scrub_warning swarn;
 303         u32 item_size;
 304         int ret;
 305         u64 ref_root;
 306         u8 ref_level;
 307         unsigned long ptr = 0;
 308         const int bufsize = 4096;
 309         u64 extent_offset;
 310
 311         path = btrfs_alloc_path();
 312
 313         swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 314         swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
 315         swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
 316         swarn.logical = sbio->logical + ix * PAGE_SIZE;
 317         swarn.errstr = errstr;
 318         swarn.dev = dev;
 319         swarn.msg_bufsize = bufsize;
 320         swarn.scratch_bufsize = bufsize;
 321
 322         if (!path || !swarn.scratch_buf || !swarn.msg_buf)
 323                 goto out;
 324
 325         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
 326         if (ret < 0)
 327                 goto out;
 328
 329         extent_offset = swarn.logical - found_key.objectid;
 330         swarn.extent_item_size = found_key.offset;
 331
 332         eb = path->nodes[0];
 333         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 334         item_size = btrfs_item_size_nr(eb, path->slots[0]);
 335
 336         if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 337                 do {
 338                         ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
 339                                                         &ref_root, &ref_level);
 340                         printk(KERN_WARNING "%s at logical %llu on dev %s, "
 341                                 "sector %llu: metadata %s (level %d) in tree "
 342                                 "%llu\n", errstr, swarn.logical, dev->name,
 343                                 (unsigned long long)swarn.sector,
 344                                 ref_level ? "node" : "leaf",
 345                                 ret < 0 ? -1 : ref_level,
 346                                 ret < 0 ? -1 : ref_root);
 347                 } while (ret != 1);
 348         } else {
 349                 swarn.path = path;
 350                 iterate_extent_inodes(fs_info, path, found_key.objectid,
 351                                         extent_offset,
 352                                         scrub_print_warning_inode, &swarn);
 353         }
 354
 355 out:
 356         btrfs_free_path(path);
 357         kfree(swarn.scratch_buf);
 358         kfree(swarn.msg_buf);
 359 }
 360
 361 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 362 {
 363         struct page *page;
 364         unsigned long index;
 365         struct scrub_fixup_nodatasum *fixup = ctx;
 366         int ret;
 367         int corrected;
 368         struct btrfs_key key;
 369         struct inode *inode;
 370         u64 end = offset + PAGE_SIZE - 1;
 371         struct btrfs_root *local_root;
 372
 373         key.objectid = root;
 374         key.type = BTRFS_ROOT_ITEM_KEY;
 375         key.offset = (u64)-1;
 376         local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
 377         if (IS_ERR(local_root))
 378                 return PTR_ERR(local_root);
 379
 380         key.type = BTRFS_INODE_ITEM_KEY;
 381         key.objectid = inum;
 382         key.offset = 0;
 383         inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
 384         if (IS_ERR(inode))
 385                 return PTR_ERR(inode);
 386
 387         ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
 388                                 EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS);
 389
 390         /* set_extent_bit should either succeed or give proper error */
 391         WARN_ON(ret > 0);
 392         if (ret)
 393                 return ret < 0 ? ret : -EFAULT;
 394
 395         index = offset >> PAGE_CACHE_SHIFT;
 396
 397         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 398         if (!page)
 399                 return -ENOMEM;
 400
 401         ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 402                                         btrfs_get_extent, fixup->mirror_num);
 403         wait_on_page_locked(page);
 404         corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end,
 405                                         EXTENT_DAMAGED, 0, NULL);
 406
 407         if (corrected)
 408                 WARN_ON(!PageUptodate(page));
 409         else
 410                 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
 411                                         EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS);
 412
 413         put_page(page);
 414         iput(inode);
 415
 416         if (ret < 0)
 417                 return ret;
 418
 419         if (ret == 0 && corrected) {
 420                 /*
 421                  * we only need to call readpage for one of the inodes belonging
 422                  * to this extent. so make iterate_extent_inodes stop
 423                  */
 424                 return 1;
 425         }
 426
 427         return -EIO;
 428 }
 429
 430 static void scrub_fixup_nodatasum(struct btrfs_work *work)
 431 {
 432         int ret;
 433         struct scrub_fixup_nodatasum *fixup;
 434         struct scrub_dev *sdev;
 435         struct btrfs_trans_handle *trans = NULL;
 436         struct btrfs_fs_info *fs_info;
 437         struct btrfs_path *path;
 438         int uncorrectable = 0;
 439
 440         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 441         sdev = fixup->sdev;
 442         fs_info = fixup->root->fs_info;
 443
 444         path = btrfs_alloc_path();
 445         if (!path) {
 446                 spin_lock(&sdev->stat_lock);
 447                 ++sdev->stat.malloc_errors;
 448                 spin_unlock(&sdev->stat_lock);
 449                 uncorrectable = 1;
 450                 goto out;
 451         }
 452
 453         trans = btrfs_join_transaction(fixup->root);
 454         if (IS_ERR(trans)) {
 455                 uncorrectable = 1;
 456                 goto out;
 457         }
 458
 459         /*
 460          * the idea is to trigger a regular read through the standard path. we
 461          * read a page from the (failed) logical address by specifying the
 462          * corresponding copynum of the failed sector. thus, that readpage is
 463          * expected to fail.
 464          * that is the point where on-the-fly error correction will kick in
 465          * (once it's finished) and rewrite the failed sector if a good copy
 466          * can be found.
 467          */
 468         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 469                                                 path, scrub_fixup_readpage,
 470                                                 fixup);
 471         if (ret < 0) {
 472                 uncorrectable = 1;
 473                 goto out;
 474         }
 475         WARN_ON(ret != 1);
 476
 477         spin_lock(&sdev->stat_lock);
 478         ++sdev->stat.corrected_errors;
 479         spin_unlock(&sdev->stat_lock);
 480
 481 out:
 482         if (trans && !IS_ERR(trans))
 483                 btrfs_end_transaction(trans, fixup->root);
 484         if (uncorrectable) {
 485                 spin_lock(&sdev->stat_lock);
 486                 ++sdev->stat.uncorrectable_errors;
 487                 spin_unlock(&sdev->stat_lock);
 488                 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
 489                                         "(nodatasum) error at logical %llu\n",
 490                                         fixup->logical);
 491         }
 492
 493         btrfs_free_path(path);
 494         kfree(fixup);
 495
 496         /* see caller why we're pretending to be paused in the scrub counters */
 497         mutex_lock(&fs_info->scrub_lock);
 498         atomic_dec(&fs_info->scrubs_running);
 499         atomic_dec(&fs_info->scrubs_paused);
 500         mutex_unlock(&fs_info->scrub_lock);
 501         atomic_dec(&sdev->fixup_cnt);
 502         wake_up(&fs_info->scrub_pause_wait);
 503         wake_up(&sdev->list_wait);
 504 }
 505
 506 /*
 507  * scrub_recheck_error gets called when either verification of the page
 508  * failed or the bio failed to read, e.g. with EIO. In the latter case,
 509  * recheck_error gets called for every page in the bio, even though only
 510  * one may be bad
 511  */
 512 static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
 513 {
 514         struct scrub_dev *sdev = sbio->sdev;
 515         u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
 516         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 517                                         DEFAULT_RATELIMIT_BURST);
 518
 519         if (sbio->err) {
 520                 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
 521                                    sbio->bio->bi_io_vec[ix].bv_page) == 0) {
 522                         if (scrub_fixup_check(sbio, ix) == 0)
 523                                 return 0;
 524                 }
 525                 if (__ratelimit(&_rs))
 526                         scrub_print_warning("i/o error", sbio, ix);
 527         } else {
 528                 if (__ratelimit(&_rs))
 529                         scrub_print_warning("checksum error", sbio, ix);
 530         }
 531
 532         spin_lock(&sdev->stat_lock);
 533         ++sdev->stat.read_errors;
 534         spin_unlock(&sdev->stat_lock);
 535
 536         scrub_fixup(sbio, ix);
 537         return 1;
 538 }
 539
 540 static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
 541 {
 542         int ret = 1;
 543         struct page *page;
 544         void *buffer;
 545         u64 flags = sbio->spag[ix].flags;
 546
 547         page = sbio->bio->bi_io_vec[ix].bv_page;
 548         buffer = kmap_atomic(page, KM_USER0);
 549         if (flags & BTRFS_EXTENT_FLAG_DATA) {
 550                 ret = scrub_checksum_data(sbio->sdev,
 551                                           sbio->spag + ix, buffer);
 552         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 553                 ret = scrub_checksum_tree_block(sbio->sdev,
 554                                                 sbio->spag + ix,
 555                                                 sbio->logical + ix * PAGE_SIZE,
 556                                                 buffer);
 557         } else {
 558                 WARN_ON(1);
 559         }
 560         kunmap_atomic(buffer, KM_USER0);
 561
 562         return ret;
 563 }
 564
 565 static void scrub_fixup_end_io(struct bio *bio, int err)
 566 {
 567         complete((struct completion *)bio->bi_private);
 568 }
 569
 570 static void scrub_fixup(struct scrub_bio *sbio, int ix)
 571 {
 572         struct scrub_dev *sdev = sbio->sdev;
 573         struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 574         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 575         struct btrfs_bio *bbio = NULL;
 576         struct scrub_fixup_nodatasum *fixup;
 577         u64 logical = sbio->logical + ix * PAGE_SIZE;
 578         u64 length;
 579         int i;
 580         int ret;
 581         DECLARE_COMPLETION_ONSTACK(complete);
 582
 583         if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
 584             (sbio->spag[ix].have_csum == 0)) {
 585                 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
 586                 if (!fixup)
 587                         goto uncorrectable;
 588                 fixup->sdev = sdev;
 589                 fixup->logical = logical;
 590                 fixup->root = fs_info->extent_root;
 591                 fixup->mirror_num = sbio->spag[ix].mirror_num;
 592                 /*
 593                  * increment scrubs_running to prevent cancel requests from
 594                  * completing as long as a fixup worker is running. we must also
 595                  * increment scrubs_paused to prevent deadlocking on pause
 596                  * requests used for transactions commits (as the worker uses a
 597                  * transaction context). it is safe to regard the fixup worker
 598                  * as paused for all matters practical. effectively, we only
 599                  * avoid cancellation requests from completing.
 600                  */
 601                 mutex_lock(&fs_info->scrub_lock);
 602                 atomic_inc(&fs_info->scrubs_running);
 603                 atomic_inc(&fs_info->scrubs_paused);
 604                 mutex_unlock(&fs_info->scrub_lock);
 605                 atomic_inc(&sdev->fixup_cnt);
 606                 fixup->work.func = scrub_fixup_nodatasum;
 607                 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
 608                 return;
 609         }
 610
 611         length = PAGE_SIZE;
 612         ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
 613                               &bbio, 0);
 614         if (ret || !bbio || length < PAGE_SIZE) {
 615                 printk(KERN_ERR
 616                        "scrub_fixup: btrfs_map_block failed us for %llu\n",
 617                        (unsigned long long)logical);
 618                 WARN_ON(1);
 619                 return;
 620         }
 621
 622         if (bbio->num_stripes == 1)
 623                 /* there aren't any replicas */
 624                 goto uncorrectable;
 625
 626         /*
 627          * first find a good copy
 628          */
 629         for (i = 0; i < bbio->num_stripes; ++i) {
 630                 if (i + 1 == sbio->spag[ix].mirror_num)
 631                         continue;
 632
 633                 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
 634                                    bbio->stripes[i].physical >> 9,
 635                                    sbio->bio->bi_io_vec[ix].bv_page)) {
 636                         /* I/O-error, this is not a good copy */
 637                         continue;
 638                 }
 639
 640                 if (scrub_fixup_check(sbio, ix) == 0)
 641                         break;
 642         }
 643         if (i == bbio->num_stripes)
 644                 goto uncorrectable;
 645
 646         if (!sdev->readonly) {
 647                 /*
 648                  * bi_io_vec[ix].bv_page now contains good data, write it back
 649                  */
 650                 if (scrub_fixup_io(WRITE, sdev->dev->bdev,
 651                                    (sbio->physical + ix * PAGE_SIZE) >> 9,
 652                                    sbio->bio->bi_io_vec[ix].bv_page)) {
 653                         /* I/O-error, writeback failed, give up */
 654                         goto uncorrectable;
 655                 }
 656         }
 657
 658         kfree(bbio);
 659         spin_lock(&sdev->stat_lock);
 660         ++sdev->stat.corrected_errors;
 661         spin_unlock(&sdev->stat_lock);
 662
 663         printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
 664                                (unsigned long long)logical);
 665         return;
 666
 667 uncorrectable:
 668         kfree(bbio);
 669         spin_lock(&sdev->stat_lock);
 670         ++sdev->stat.uncorrectable_errors;
 671         spin_unlock(&sdev->stat_lock);
 672
 673         printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
 674                                 "logical %llu\n", (unsigned long long)logical);
 675 }
 676
 677 static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 678                          struct page *page)
 679 {
 680         struct bio *bio = NULL;
 681         int ret;
 682         DECLARE_COMPLETION_ONSTACK(complete);
 683
 684         bio = bio_alloc(GFP_NOFS, 1);
 685         bio->bi_bdev = bdev;
 686         bio->bi_sector = sector;
 687         bio_add_page(bio, page, PAGE_SIZE, 0);
 688         bio->bi_end_io = scrub_fixup_end_io;
 689         bio->bi_private = &complete;
 690         submit_bio(rw, bio);
 691
 692         /* this will also unplug the queue */
 693         wait_for_completion(&complete);
 694
 695         ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
 696         bio_put(bio);
 697         return ret;
 698 }
 699
 700 static void scrub_bio_end_io(struct bio *bio, int err)
 701 {
 702         struct scrub_bio *sbio = bio->bi_private;
 703         struct scrub_dev *sdev = sbio->sdev;
 704         struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 705
 706         sbio->err = err;
 707         sbio->bio = bio;
 708
 709         btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 710 }
 711
 712 static void scrub_checksum(struct btrfs_work *work)
 713 {
 714         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
 715         struct scrub_dev *sdev = sbio->sdev;
 716         struct page *page;
 717         void *buffer;
 718         int i;
 719         u64 flags;
 720         u64 logical;
 721         int ret;
 722
 723         if (sbio->err) {
 724                 ret = 0;
 725                 for (i = 0; i < sbio->count; ++i)
 726                         ret |= scrub_recheck_error(sbio, i);
 727                 if (!ret) {
 728                         spin_lock(&sdev->stat_lock);
 729                         ++sdev->stat.unverified_errors;
 730                         spin_unlock(&sdev->stat_lock);
 731                 }
 732
 733                 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 734                 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
 735                 sbio->bio->bi_phys_segments = 0;
 736                 sbio->bio->bi_idx = 0;
 737
 738                 for (i = 0; i < sbio->count; i++) {
 739                         struct bio_vec *bi;
 740                         bi = &sbio->bio->bi_io_vec[i];
 741                         bi->bv_offset = 0;
 742                         bi->bv_len = PAGE_SIZE;
 743                 }
 744                 goto out;
 745         }
 746         for (i = 0; i < sbio->count; ++i) {
 747                 page = sbio->bio->bi_io_vec[i].bv_page;
 748                 buffer = kmap_atomic(page, KM_USER0);
 749                 flags = sbio->spag[i].flags;
 750                 logical = sbio->logical + i * PAGE_SIZE;
 751                 ret = 0;
 752                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
 753                         ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
 754                 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 755                         ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
 756                                                         logical, buffer);
 757                 } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
 758                         BUG_ON(i);
 759                         (void)scrub_checksum_super(sbio, buffer);
 760                 } else {
 761                         WARN_ON(1);
 762                 }
 763                 kunmap_atomic(buffer, KM_USER0);
 764                 if (ret) {
 765                         ret = scrub_recheck_error(sbio, i);
 766                         if (!ret) {
 767                                 spin_lock(&sdev->stat_lock);
 768                                 ++sdev->stat.unverified_errors;
 769                                 spin_unlock(&sdev->stat_lock);
 770                         }
 771                 }
 772         }
 773
 774 out:
 775         scrub_free_bio(sbio->bio);
 776         sbio->bio = NULL;
 777         spin_lock(&sdev->list_lock);
 778         sbio->next_free = sdev->first_free;
 779         sdev->first_free = sbio->index;
 780         spin_unlock(&sdev->list_lock);
 781         atomic_dec(&sdev->in_flight);
 782         wake_up(&sdev->list_wait);
 783 }
 784
 785 static int scrub_checksum_data(struct scrub_dev *sdev,
 786                                struct scrub_page *spag, void *buffer)
 787 {
 788         u8 csum[BTRFS_CSUM_SIZE];
 789         u32 crc = ~(u32)0;
 790         int fail = 0;
 791         struct btrfs_root *root = sdev->dev->dev_root;
 792
 793         if (!spag->have_csum)
 794                 return 0;
 795
 796         crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
 797         btrfs_csum_final(crc, csum);
 798         if (memcmp(csum, spag->csum, sdev->csum_size))
 799                 fail = 1;
 800
 801         spin_lock(&sdev->stat_lock);
 802         ++sdev->stat.data_extents_scrubbed;
 803         sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
 804         if (fail)
 805                 ++sdev->stat.csum_errors;
 806         spin_unlock(&sdev->stat_lock);
 807
 808         return fail;
 809 }
 810
 811 static int scrub_checksum_tree_block(struct scrub_dev *sdev,
 812                                      struct scrub_page *spag, u64 logical,
 813                                      void *buffer)
 814 {
 815         struct btrfs_header *h;
 816         struct btrfs_root *root = sdev->dev->dev_root;
 817         struct btrfs_fs_info *fs_info = root->fs_info;
 818         u8 csum[BTRFS_CSUM_SIZE];
 819         u32 crc = ~(u32)0;
 820         int fail = 0;
 821         int crc_fail = 0;
 822
 823         /*
 824          * we don't use the getter functions here, as we
 825          * a) don't have an extent buffer and
 826          * b) the page is already kmapped
 827          */
 828         h = (struct btrfs_header *)buffer;
 829
 830         if (logical != le64_to_cpu(h->bytenr))
 831                 ++fail;
 832
 833         if (spag->generation != le64_to_cpu(h->generation))
 834                 ++fail;
 835
 836         if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
 837                 ++fail;
 838
 839         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 840                    BTRFS_UUID_SIZE))
 841                 ++fail;
 842
 843         crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
 844                               PAGE_SIZE - BTRFS_CSUM_SIZE);
 845         btrfs_csum_final(crc, csum);
 846         if (memcmp(csum, h->csum, sdev->csum_size))
 847                 ++crc_fail;
 848
 849         spin_lock(&sdev->stat_lock);
 850         ++sdev->stat.tree_extents_scrubbed;
 851         sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
 852         if (crc_fail)
 853                 ++sdev->stat.csum_errors;
 854         if (fail)
 855                 ++sdev->stat.verify_errors;
 856         spin_unlock(&sdev->stat_lock);
 857
 858         return fail || crc_fail;
 859 }
 860
 861 static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 862 {
 863         struct btrfs_super_block *s;
 864         u64 logical;
 865         struct scrub_dev *sdev = sbio->sdev;
 866         struct btrfs_root *root = sdev->dev->dev_root;
 867         struct btrfs_fs_info *fs_info = root->fs_info;
 868         u8 csum[BTRFS_CSUM_SIZE];
 869         u32 crc = ~(u32)0;
 870         int fail = 0;
 871
 872         s = (struct btrfs_super_block *)buffer;
 873         logical = sbio->logical;
 874
 875         if (logical != le64_to_cpu(s->bytenr))
 876                 ++fail;
 877
 878         if (sbio->spag[0].generation != le64_to_cpu(s->generation))
 879                 ++fail;
 880
 881         if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
 882                 ++fail;
 883
 884         crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
 885                               PAGE_SIZE - BTRFS_CSUM_SIZE);
 886         btrfs_csum_final(crc, csum);
 887         if (memcmp(csum, s->csum, sbio->sdev->csum_size))
 888                 ++fail;
 889
 890         if (fail) {
 891                 /*
 892                  * if we find an error in a super block, we just report it.
 893                  * They will get written with the next transaction commit
 894                  * anyway
 895                  */
 896                 spin_lock(&sdev->stat_lock);
 897                 ++sdev->stat.super_errors;
 898                 spin_unlock(&sdev->stat_lock);
 899         }
 900
 901         return fail;
 902 }
 903
 904 static int scrub_submit(struct scrub_dev *sdev)
 905 {
 906         struct scrub_bio *sbio;
 907         struct bio *bio;
 908         int i;
 909
 910         if (sdev->curr == -1)
 911                 return 0;
 912
 913         sbio = sdev->bios[sdev->curr];
 914
 915         bio = bio_alloc(GFP_NOFS, sbio->count);
 916         if (!bio)
 917                 goto nomem;
 918
 919         bio->bi_private = sbio;
 920         bio->bi_end_io = scrub_bio_end_io;
 921         bio->bi_bdev = sdev->dev->bdev;
 922         bio->bi_sector = sbio->physical >> 9;
 923
 924         for (i = 0; i < sbio->count; ++i) {
 925                 struct page *page;
 926                 int ret;
 927
 928                 page = alloc_page(GFP_NOFS);
 929                 if (!page)
 930                         goto nomem;
 931
 932                 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
 933                 if (!ret) {
 934                         __free_page(page);
 935                         goto nomem;
 936                 }
 937         }
 938
 939         sbio->err = 0;
 940         sdev->curr = -1;
 941         atomic_inc(&sdev->in_flight);
 942
 943         submit_bio(READ, bio);
 944
 945         return 0;
 946
 947 nomem:
 948         scrub_free_bio(bio);
 949
 950         return -ENOMEM;
 951 }
 952
 953 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
 954                       u64 physical, u64 flags, u64 gen, int mirror_num,
 955                       u8 *csum, int force)
 956 {
 957         struct scrub_bio *sbio;
 958
 959 again:
 960         /*
 961          * grab a fresh bio or wait for one to become available
 962          */
 963         while (sdev->curr == -1) {
 964                 spin_lock(&sdev->list_lock);
 965                 sdev->curr = sdev->first_free;
 966                 if (sdev->curr != -1) {
 967                         sdev->first_free = sdev->bios[sdev->curr]->next_free;
 968                         sdev->bios[sdev->curr]->next_free = -1;
 969                         sdev->bios[sdev->curr]->count = 0;
 970                         spin_unlock(&sdev->list_lock);
 971                 } else {
 972                         spin_unlock(&sdev->list_lock);
 973                         wait_event(sdev->list_wait, sdev->first_free != -1);
 974                 }
 975         }
 976         sbio = sdev->bios[sdev->curr];
 977         if (sbio->count == 0) {
 978                 sbio->physical = physical;
 979                 sbio->logical = logical;
 980         } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
 981                    sbio->logical + sbio->count * PAGE_SIZE != logical) {
 982                 int ret;
 983
 984                 ret = scrub_submit(sdev);
 985                 if (ret)
 986                         return ret;
 987                 goto again;
 988         }
 989         sbio->spag[sbio->count].flags = flags;
 990         sbio->spag[sbio->count].generation = gen;
 991         sbio->spag[sbio->count].have_csum = 0;
 992         sbio->spag[sbio->count].mirror_num = mirror_num;
 993         if (csum) {
 994                 sbio->spag[sbio->count].have_csum = 1;
 995                 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
 996         }
 997         ++sbio->count;
 998         if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
 999                 int ret;
1000
1001                 ret = scrub_submit(sdev);
1002                 if (ret)
1003                         return ret;
1004         }
1005
1006         return 0;
1007 }
1008
1009 static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1010                            u8 *csum)
1011 {
1012         struct btrfs_ordered_sum *sum = NULL;
1013         int ret = 0;
1014         unsigned long i;
1015         unsigned long num_sectors;
1016         u32 sectorsize = sdev->dev->dev_root->sectorsize;
1017
1018         while (!list_empty(&sdev->csum_list)) {
1019                 sum = list_first_entry(&sdev->csum_list,
1020                                        struct btrfs_ordered_sum, list);
1021                 if (sum->bytenr > logical)
1022                         return 0;
1023                 if (sum->bytenr + sum->len > logical)
1024                         break;
1025
1026                 ++sdev->stat.csum_discards;
1027                 list_del(&sum->list);
1028                 kfree(sum);
1029                 sum = NULL;
1030         }
1031         if (!sum)
1032                 return 0;
1033
1034         num_sectors = sum->len / sectorsize;
1035         for (i = 0; i < num_sectors; ++i) {
1036                 if (sum->sums[i].bytenr == logical) {
1037                         memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
1038                         ret = 1;
1039                         break;
1040                 }
1041         }
1042         if (ret && i == num_sectors - 1) {
1043                 list_del(&sum->list);
1044                 kfree(sum);
1045         }
1046         return ret;
1047 }
1048
1049 /* scrub extent tries to collect up to 64 kB for each bio */
1050 static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1051                         u64 physical, u64 flags, u64 gen, int mirror_num)
1052 {
1053         int ret;
1054         u8 csum[BTRFS_CSUM_SIZE];
1055
1056         while (len) {
1057                 u64 l = min_t(u64, len, PAGE_SIZE);
1058                 int have_csum = 0;
1059
1060                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1061                         /* push csums to sbio */
1062                         have_csum = scrub_find_csum(sdev, logical, l, csum);
1063                         if (have_csum == 0)
1064                                 ++sdev->stat.no_csum;
1065                 }
1066                 ret = scrub_page(sdev, logical, l, physical, flags, gen,
1067                                  mirror_num, have_csum ? csum : NULL, 0);
1068                 if (ret)
1069                         return ret;
1070                 len -= l;
1071                 logical += l;
1072                 physical += l;
1073         }
1074         return 0;
1075 }
1076
1077 static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1078         struct map_lookup *map, int num, u64 base, u64 length)
1079 {
1080         struct btrfs_path *path;
1081         struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1082         struct btrfs_root *root = fs_info->extent_root;
1083         struct btrfs_root *csum_root = fs_info->csum_root;
1084         struct btrfs_extent_item *extent;
1085         struct blk_plug plug;
1086         u64 flags;
1087         int ret;
1088         int slot;
1089         int i;
1090         u64 nstripes;
1091         int start_stripe;
1092         struct extent_buffer *l;
1093         struct btrfs_key key;
1094         u64 physical;
1095         u64 logical;
1096         u64 generation;
1097         int mirror_num;
1098
1099         u64 increment = map->stripe_len;
1100         u64 offset;
1101
1102         nstripes = length;
1103         offset = 0;
1104         do_div(nstripes, map->stripe_len);
1105         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1106                 offset = map->stripe_len * num;
1107                 increment = map->stripe_len * map->num_stripes;
1108                 mirror_num = 1;
1109         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1110                 int factor = map->num_stripes / map->sub_stripes;
1111                 offset = map->stripe_len * (num / map->sub_stripes);
1112                 increment = map->stripe_len * factor;
1113                 mirror_num = num % map->sub_stripes + 1;
1114         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
1115                 increment = map->stripe_len;
1116                 mirror_num = num % map->num_stripes + 1;
1117         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
1118                 increment = map->stripe_len;
1119                 mirror_num = num % map->num_stripes + 1;
1120         } else {
1121                 increment = map->stripe_len;
1122                 mirror_num = 1;
1123         }
1124
1125         path = btrfs_alloc_path();
1126         if (!path)
1127                 return -ENOMEM;
1128
1129         path->reada = 2;
1130         path->search_commit_root = 1;
1131         path->skip_locking = 1;
1132
1133         /*
1134          * find all extents for each stripe and just read them to get
1135          * them into the page cache
1136          * FIXME: we can do better. build a more intelligent prefetching
1137          */
1138         logical = base + offset;
1139         physical = map->stripes[num].physical;
1140         ret = 0;
1141         for (i = 0; i < nstripes; ++i) {
1142                 key.objectid = logical;
1143                 key.type = BTRFS_EXTENT_ITEM_KEY;
1144                 key.offset = (u64)0;
1145
1146                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1147                 if (ret < 0)
1148                         goto out_noplug;
1149
1150                 /*
1151                  * we might miss half an extent here, but that doesn't matter,
1152                  * as it's only the prefetch
1153                  */
1154                 while (1) {
1155                         l = path->nodes[0];
1156                         slot = path->slots[0];
1157                         if (slot >= btrfs_header_nritems(l)) {
1158                                 ret = btrfs_next_leaf(root, path);
1159                                 if (ret == 0)
1160                                         continue;
1161                                 if (ret < 0)
1162                                         goto out_noplug;
1163
1164                                 break;
1165                         }
1166                         btrfs_item_key_to_cpu(l, &key, slot);
1167
1168                         if (key.objectid >= logical + map->stripe_len)
1169                                 break;
1170
1171                         path->slots[0]++;
1172                 }
1173                 btrfs_release_path(path);
1174                 logical += increment;
1175                 physical += map->stripe_len;
1176                 cond_resched();
1177         }
1178
1179         /*
1180          * collect all data csums for the stripe to avoid seeking during
1181          * the scrub. This might currently (crc32) end up to be about 1MB
1182          */
1183         start_stripe = 0;
1184         blk_start_plug(&plug);
1185 again:
1186         logical = base + offset + start_stripe * increment;
1187         for (i = start_stripe; i < nstripes; ++i) {
1188                 ret = btrfs_lookup_csums_range(csum_root, logical,
1189                                                logical + map->stripe_len - 1,
1190                                                &sdev->csum_list, 1);
1191                 if (ret)
1192                         goto out;
1193
1194                 logical += increment;
1195                 cond_resched();
1196         }
1197         /*
1198          * now find all extents for each stripe and scrub them
1199          */
1200         logical = base + offset + start_stripe * increment;
1201         physical = map->stripes[num].physical + start_stripe * map->stripe_len;
1202         ret = 0;
1203         for (i = start_stripe; i < nstripes; ++i) {
1204                 /*
1205                  * canceled?
1206                  */
1207                 if (atomic_read(&fs_info->scrub_cancel_req) ||
1208                     atomic_read(&sdev->cancel_req)) {
1209                         ret = -ECANCELED;
1210                         goto out;
1211                 }
1212                 /*
1213                  * check to see if we have to pause
1214                  */
1215                 if (atomic_read(&fs_info->scrub_pause_req)) {
1216                         /* push queued extents */
1217                         scrub_submit(sdev);
1218                         wait_event(sdev->list_wait,
1219                                    atomic_read(&sdev->in_flight) == 0);
1220                         atomic_inc(&fs_info->scrubs_paused);
1221                         wake_up(&fs_info->scrub_pause_wait);
1222                         mutex_lock(&fs_info->scrub_lock);
1223                         while (atomic_read(&fs_info->scrub_pause_req)) {
1224                                 mutex_unlock(&fs_info->scrub_lock);
1225                                 wait_event(fs_info->scrub_pause_wait,
1226                                    atomic_read(&fs_info->scrub_pause_req) == 0);
1227                                 mutex_lock(&fs_info->scrub_lock);
1228                         }
1229                         atomic_dec(&fs_info->scrubs_paused);
1230                         mutex_unlock(&fs_info->scrub_lock);
1231                         wake_up(&fs_info->scrub_pause_wait);
1232                         scrub_free_csums(sdev);
1233                         start_stripe = i;
1234                         goto again;
1235                 }
1236
1237                 key.objectid = logical;
1238                 key.type = BTRFS_EXTENT_ITEM_KEY;
1239                 key.offset = (u64)0;
1240
1241                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1242                 if (ret < 0)
1243                         goto out;
1244                 if (ret > 0) {
1245                         ret = btrfs_previous_item(root, path, 0,
1246                                                   BTRFS_EXTENT_ITEM_KEY);
1247                         if (ret < 0)
1248                                 goto out;
1249                         if (ret > 0) {
1250                                 /* there's no smaller item, so stick with the
1251                                  * larger one */
1252                                 btrfs_release_path(path);
1253                                 ret = btrfs_search_slot(NULL, root, &key,
1254                                                         path, 0, 0);
1255                                 if (ret < 0)
1256                                         goto out;
1257                         }
1258                 }
1259
1260                 while (1) {
1261                         l = path->nodes[0];
1262                         slot = path->slots[0];
1263                         if (slot >= btrfs_header_nritems(l)) {
1264                                 ret = btrfs_next_leaf(root, path);
1265                                 if (ret == 0)
1266                                         continue;
1267                                 if (ret < 0)
1268                                         goto out;
1269
1270                                 break;
1271                         }
1272                         btrfs_item_key_to_cpu(l, &key, slot);
1273
1274                         if (key.objectid + key.offset <= logical)
1275                                 goto next;
1276
1277                         if (key.objectid >= logical + map->stripe_len)
1278                                 break;
1279
1280                         if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
1281                                 goto next;
1282
1283                         extent = btrfs_item_ptr(l, slot,
1284                                                 struct btrfs_extent_item);
1285                         flags = btrfs_extent_flags(l, extent);
1286                         generation = btrfs_extent_generation(l, extent);
1287
1288                         if (key.objectid < logical &&
1289                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
1290                                 printk(KERN_ERR
1291                                        "btrfs scrub: tree block %llu spanning "
1292                                        "stripes, ignored. logical=%llu\n",
1293                                        (unsigned long long)key.objectid,
1294                                        (unsigned long long)logical);
1295                                 goto next;
1296                         }
1297
1298                         /*
1299                          * trim extent to this stripe
1300                          */
1301                         if (key.objectid < logical) {
1302                                 key.offset -= logical - key.objectid;
1303                                 key.objectid = logical;
1304                         }
1305                         if (key.objectid + key.offset >
1306                             logical + map->stripe_len) {
1307                                 key.offset = logical + map->stripe_len -
1308                                              key.objectid;
1309                         }
1310
1311                         ret = scrub_extent(sdev, key.objectid, key.offset,
1312                                            key.objectid - logical + physical,
1313                                            flags, generation, mirror_num);
1314                         if (ret)
1315                                 goto out;
1316
1317 next:
1318                         path->slots[0]++;
1319                 }
1320                 btrfs_release_path(path);
1321                 logical += increment;
1322                 physical += map->stripe_len;
1323                 spin_lock(&sdev->stat_lock);
1324                 sdev->stat.last_physical = physical;
1325                 spin_unlock(&sdev->stat_lock);
1326         }
1327         /* push queued extents */
1328         scrub_submit(sdev);
1329
1330 out:
1331         blk_finish_plug(&plug);
1332 out_noplug:
1333         btrfs_free_path(path);
1334         return ret < 0 ? ret : 0;
1335 }
1336
1337 static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
1338         u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
1339 {
1340         struct btrfs_mapping_tree *map_tree =
1341                 &sdev->dev->dev_root->fs_info->mapping_tree;
1342         struct map_lookup *map;
1343         struct extent_map *em;
1344         int i;
1345         int ret = -EINVAL;
1346
1347         read_lock(&map_tree->map_tree.lock);
1348         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
1349         read_unlock(&map_tree->map_tree.lock);
1350
1351         if (!em)
1352                 return -EINVAL;
1353
1354         map = (struct map_lookup *)em->bdev;
1355         if (em->start != chunk_offset)
1356                 goto out;
1357
1358         if (em->len < length)
1359                 goto out;
1360
1361         for (i = 0; i < map->num_stripes; ++i) {
1362                 if (map->stripes[i].dev == sdev->dev) {
1363                         ret = scrub_stripe(sdev, map, i, chunk_offset, length);
1364                         if (ret)
1365                                 goto out;
1366                 }
1367         }
1368 out:
1369         free_extent_map(em);
1370
1371         return ret;
1372 }
1373
1374 static noinline_for_stack
1375 int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
1376 {
1377         struct btrfs_dev_extent *dev_extent = NULL;
1378         struct btrfs_path *path;
1379         struct btrfs_root *root = sdev->dev->dev_root;
1380         struct btrfs_fs_info *fs_info = root->fs_info;
1381         u64 length;
1382         u64 chunk_tree;
1383         u64 chunk_objectid;
1384         u64 chunk_offset;
1385         int ret;
1386         int slot;
1387         struct extent_buffer *l;
1388         struct btrfs_key key;
1389         struct btrfs_key found_key;
1390         struct btrfs_block_group_cache *cache;
1391
1392         path = btrfs_alloc_path();
1393         if (!path)
1394                 return -ENOMEM;
1395
1396         path->reada = 2;
1397         path->search_commit_root = 1;
1398         path->skip_locking = 1;
1399
1400         key.objectid = sdev->dev->devid;
1401         key.offset = 0ull;
1402         key.type = BTRFS_DEV_EXTENT_KEY;
1403
1404
1405         while (1) {
1406                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1407                 if (ret < 0)
1408                         break;
1409                 if (ret > 0) {
1410                         if (path->slots[0] >=
1411                             btrfs_header_nritems(path->nodes[0])) {
1412                                 ret = btrfs_next_leaf(root, path);
1413                                 if (ret)
1414                                         break;
1415                         }
1416                 }
1417
1418                 l = path->nodes[0];
1419                 slot = path->slots[0];
1420
1421                 btrfs_item_key_to_cpu(l, &found_key, slot);
1422
1423                 if (found_key.objectid != sdev->dev->devid)
1424                         break;
1425
1426                 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
1427                         break;
1428
1429                 if (found_key.offset >= end)
1430                         break;
1431
1432                 if (found_key.offset < key.offset)
1433                         break;
1434
1435                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1436                 length = btrfs_dev_extent_length(l, dev_extent);
1437
1438                 if (found_key.offset + length <= start) {
1439                         key.offset = found_key.offset + length;
1440                         btrfs_release_path(path);
1441                         continue;
1442                 }
1443
1444                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1445                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1446                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1447
1448                 /*
1449                  * get a reference on the corresponding block group to prevent
1450                  * the chunk from going away while we scrub it
1451                  */
1452                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
1453                 if (!cache) {
1454                         ret = -ENOENT;
1455                         break;
1456                 }
1457                 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
1458                                   chunk_offset, length);
1459                 btrfs_put_block_group(cache);
1460                 if (ret)
1461                         break;
1462
1463                 key.offset = found_key.offset + length;
1464                 btrfs_release_path(path);
1465         }
1466
1467         btrfs_free_path(path);
1468
1469         /*
1470          * ret can still be 1 from search_slot or next_leaf,
1471          * that's not an error
1472          */
1473         return ret < 0 ? ret : 0;
1474 }
1475
1476 static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1477 {
1478         int     i;
1479         u64     bytenr;
1480         u64     gen;
1481         int     ret;
1482         struct btrfs_device *device = sdev->dev;
1483         struct btrfs_root *root = device->dev_root;
1484
1485         gen = root->fs_info->last_trans_committed;
1486
1487         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1488                 bytenr = btrfs_sb_offset(i);
1489                 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1490                         break;
1491
1492                 ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
1493                                  BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
1494                 if (ret)
1495                         return ret;
1496         }
1497         wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1498
1499         return 0;
1500 }
1501
1502 /*
1503  * get a reference count on fs_info->scrub_workers. start worker if necessary
1504  */
1505 static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1506 {
1507         struct btrfs_fs_info *fs_info = root->fs_info;
1508
1509         mutex_lock(&fs_info->scrub_lock);
1510         if (fs_info->scrub_workers_refcnt == 0) {
1511                 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1512                            fs_info->thread_pool_size, &fs_info->generic_worker);
1513                 fs_info->scrub_workers.idle_thresh = 4;
1514                 btrfs_start_workers(&fs_info->scrub_workers, 1);
1515         }
1516         ++fs_info->scrub_workers_refcnt;
1517         mutex_unlock(&fs_info->scrub_lock);
1518
1519         return 0;
1520 }
1521
1522 static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
1523 {
1524         struct btrfs_fs_info *fs_info = root->fs_info;
1525
1526         mutex_lock(&fs_info->scrub_lock);
1527         if (--fs_info->scrub_workers_refcnt == 0)
1528                 btrfs_stop_workers(&fs_info->scrub_workers);
1529         WARN_ON(fs_info->scrub_workers_refcnt < 0);
1530         mutex_unlock(&fs_info->scrub_lock);
1531 }
1532
1533
1534 int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1535                     struct btrfs_scrub_progress *progress, int readonly)
1536 {
1537         struct scrub_dev *sdev;
1538         struct btrfs_fs_info *fs_info = root->fs_info;
1539         int ret;
1540         struct btrfs_device *dev;
1541
1542         if (btrfs_fs_closing(root->fs_info))
1543                 return -EINVAL;
1544
1545         /*
1546          * check some assumptions
1547          */
1548         if (root->sectorsize != PAGE_SIZE ||
1549             root->sectorsize != root->leafsize ||
1550             root->sectorsize != root->nodesize) {
1551                 printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
1552                 return -EINVAL;
1553         }
1554
1555         ret = scrub_workers_get(root);
1556         if (ret)
1557                 return ret;
1558
1559         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1560         dev = btrfs_find_device(root, devid, NULL, NULL);
1561         if (!dev || dev->missing) {
1562                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1563                 scrub_workers_put(root);
1564                 return -ENODEV;
1565         }
1566         mutex_lock(&fs_info->scrub_lock);
1567
1568         if (!dev->in_fs_metadata) {
1569                 mutex_unlock(&fs_info->scrub_lock);
1570                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1571                 scrub_workers_put(root);
1572                 return -ENODEV;
1573         }
1574
1575         if (dev->scrub_device) {
1576                 mutex_unlock(&fs_info->scrub_lock);
1577                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1578                 scrub_workers_put(root);
1579                 return -EINPROGRESS;
1580         }
1581         sdev = scrub_setup_dev(dev);
1582         if (IS_ERR(sdev)) {
1583                 mutex_unlock(&fs_info->scrub_lock);
1584                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1585                 scrub_workers_put(root);
1586                 return PTR_ERR(sdev);
1587         }
1588         sdev->readonly = readonly;
1589         dev->scrub_device = sdev;
1590
1591         atomic_inc(&fs_info->scrubs_running);
1592         mutex_unlock(&fs_info->scrub_lock);
1593         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1594
1595         down_read(&fs_info->scrub_super_lock);
1596         ret = scrub_supers(sdev);
1597         up_read(&fs_info->scrub_super_lock);
1598
1599         if (!ret)
1600                 ret = scrub_enumerate_chunks(sdev, start, end);
1601
1602         wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1603         atomic_dec(&fs_info->scrubs_running);
1604         wake_up(&fs_info->scrub_pause_wait);
1605
1606         wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1607
1608         if (progress)
1609                 memcpy(progress, &sdev->stat, sizeof(*progress));
1610
1611         mutex_lock(&fs_info->scrub_lock);
1612         dev->scrub_device = NULL;
1613         mutex_unlock(&fs_info->scrub_lock);
1614
1615         scrub_free_dev(sdev);
1616         scrub_workers_put(root);
1617
1618         return ret;
1619 }
1620
1621 int btrfs_scrub_pause(struct btrfs_root *root)
1622 {
1623         struct btrfs_fs_info *fs_info = root->fs_info;
1624
1625         mutex_lock(&fs_info->scrub_lock);
1626         atomic_inc(&fs_info->scrub_pause_req);
1627         while (atomic_read(&fs_info->scrubs_paused) !=
1628                atomic_read(&fs_info->scrubs_running)) {
1629                 mutex_unlock(&fs_info->scrub_lock);
1630                 wait_event(fs_info->scrub_pause_wait,
1631                            atomic_read(&fs_info->scrubs_paused) ==
1632                            atomic_read(&fs_info->scrubs_running));
1633                 mutex_lock(&fs_info->scrub_lock);
1634         }
1635         mutex_unlock(&fs_info->scrub_lock);
1636
1637         return 0;
1638 }
1639
1640 int btrfs_scrub_continue(struct btrfs_root *root)
1641 {
1642         struct btrfs_fs_info *fs_info = root->fs_info;
1643
1644         atomic_dec(&fs_info->scrub_pause_req);
1645         wake_up(&fs_info->scrub_pause_wait);
1646         return 0;
1647 }
1648
1649 int btrfs_scrub_pause_super(struct btrfs_root *root)
1650 {
1651         down_write(&root->fs_info->scrub_super_lock);
1652         return 0;
1653 }
1654
1655 int btrfs_scrub_continue_super(struct btrfs_root *root)
1656 {
1657         up_write(&root->fs_info->scrub_super_lock);
1658         return 0;
1659 }
1660
1661 int btrfs_scrub_cancel(struct btrfs_root *root)
1662 {
1663         struct btrfs_fs_info *fs_info = root->fs_info;
1664
1665         mutex_lock(&fs_info->scrub_lock);
1666         if (!atomic_read(&fs_info->scrubs_running)) {
1667                 mutex_unlock(&fs_info->scrub_lock);
1668                 return -ENOTCONN;
1669         }
1670
1671         atomic_inc(&fs_info->scrub_cancel_req);
1672         while (atomic_read(&fs_info->scrubs_running)) {
1673                 mutex_unlock(&fs_info->scrub_lock);
1674                 wait_event(fs_info->scrub_pause_wait,
1675                            atomic_read(&fs_info->scrubs_running) == 0);
1676                 mutex_lock(&fs_info->scrub_lock);
1677         }
1678         atomic_dec(&fs_info->scrub_cancel_req);
1679         mutex_unlock(&fs_info->scrub_lock);
1680
1681         return 0;
1682 }
1683
1684 int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
1685 {
1686         struct btrfs_fs_info *fs_info = root->fs_info;
1687         struct scrub_dev *sdev;
1688
1689         mutex_lock(&fs_info->scrub_lock);
1690         sdev = dev->scrub_device;
1691         if (!sdev) {
1692                 mutex_unlock(&fs_info->scrub_lock);
1693                 return -ENOTCONN;
1694         }
1695         atomic_inc(&sdev->cancel_req);
1696         while (dev->scrub_device) {
1697                 mutex_unlock(&fs_info->scrub_lock);
1698                 wait_event(fs_info->scrub_pause_wait,
1699                            dev->scrub_device == NULL);
1700                 mutex_lock(&fs_info->scrub_lock);
1701         }
1702         mutex_unlock(&fs_info->scrub_lock);
1703
1704         return 0;
1705 }
1706 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
1707 {
1708         struct btrfs_fs_info *fs_info = root->fs_info;
1709         struct btrfs_device *dev;
1710         int ret;
1711
1712         /*
1713          * we have to hold the device_list_mutex here so the device
1714          * does not go away in cancel_dev. FIXME: find a better solution
1715          */
1716         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1717         dev = btrfs_find_device(root, devid, NULL, NULL);
1718         if (!dev) {
1719                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1720                 return -ENODEV;
1721         }
1722         ret = btrfs_scrub_cancel_dev(root, dev);
1723         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1724
1725         return ret;
1726 }
1727
1728 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
1729                          struct btrfs_scrub_progress *progress)
1730 {
1731         struct btrfs_device *dev;
1732         struct scrub_dev *sdev = NULL;
1733
1734         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1735         dev = btrfs_find_device(root, devid, NULL, NULL);
1736         if (dev)
1737                 sdev = dev->scrub_device;
1738         if (sdev)
1739                 memcpy(progress, &sdev->stat, sizeof(*progress));
1740         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1741
1742         return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
1743 }