drivers/md/raid1.c

   1 /*
   2  * raid1.c : Multiple Devices driver for Linux
   3  *
   4  * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
   5  *
   6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   7  *
   8  * RAID-1 management functions.
   9  *
  10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
  11  *
   12  * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
  13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
  14  *
  15  * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
  16  * bitmapped intelligence in resync:
  17  *
  18  *      - bitmap marked during normal i/o
  19  *      - bitmap used to skip nondirty blocks during sync
  20  *
  21  * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
  22  * - persistent bitmap code
  23  *
  24  * This program is free software; you can redistribute it and/or modify
  25  * it under the terms of the GNU General Public License as published by
  26  * the Free Software Foundation; either version 2, or (at your option)
  27  * any later version.
  28  *
  29  * You should have received a copy of the GNU General Public License
  30  * (for example /usr/src/linux/COPYING); if not, write to the Free
  31  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  32  */
  33
  34 #include <linux/slab.h>
  35 #include <linux/delay.h>
  36 #include <linux/blkdev.h>
  37 #include <linux/module.h>
  38 #include <linux/seq_file.h>
  39 #include <linux/ratelimit.h>
  40 #include "md.h"
  41 #include "raid1.h"
  42 #include "bitmap.h"
  43
/*
 * Compile-time debug switch.  PRINTK() forwards to printk() only when
 * DEBUG is non-zero; the call is still compiled in either case because
 * the test is a plain C 'if' rather than a preprocessor conditional.
 */
#define DEBUG 0
#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)

/*
 * Number of guaranteed r1bios in case of extreme VM load:
 */
#define NR_RAID1_BIOS 256


/*
 * Forward declarations: both helpers are defined further down in this
 * file but are called earlier (call_bio_endio() and put_buf()).
 */
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
  55
  56 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
  57 {
  58         struct pool_info *pi = data;
  59         int size = offsetof(r1bio_t, bios[pi->raid_disks]);
  60
  61         /* allocate a r1bio with room for raid_disks entries in the bios array */
  62         return kzalloc(size, gfp_flags);
  63 }
  64
  65 static void r1bio_pool_free(void *r1_bio, void *data)
  66 {
  67         kfree(r1_bio);
  68 }
  69
/*
 * Sizing of one resync buffer:
 *   RESYNC_BLOCK_SIZE - bytes covered by one resync buffer (64 KiB)
 *   RESYNC_SECTORS    - the same amount in 512-byte sectors (>> 9)
 *   RESYNC_PAGES      - pages needed to hold it, rounded up
 *   RESYNC_WINDOW     - 2 MiB; not used in this part of the file
 *                       (NOTE(review): confirm its role at the users)
 */
#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)
  75
  76 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
  77 {
  78         struct pool_info *pi = data;
  79         struct page *page;
  80         r1bio_t *r1_bio;
  81         struct bio *bio;
  82         int i, j;
  83
  84         r1_bio = r1bio_pool_alloc(gfp_flags, pi);
  85         if (!r1_bio)
  86                 return NULL;
  87
  88         /*
  89          * Allocate bios : 1 for reading, n-1 for writing
  90          */
  91         for (j = pi->raid_disks ; j-- ; ) {
  92                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
  93                 if (!bio)
  94                         goto out_free_bio;
  95                 r1_bio->bios[j] = bio;
  96         }
  97         /*
  98          * Allocate RESYNC_PAGES data pages and attach them to
  99          * the first bio.
 100          * If this is a user-requested check/repair, allocate
 101          * RESYNC_PAGES for each bio.
 102          */
 103         if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
 104                 j = pi->raid_disks;
 105         else
 106                 j = 1;
 107         while(j--) {
 108                 bio = r1_bio->bios[j];
 109                 for (i = 0; i < RESYNC_PAGES; i++) {
 110                         page = alloc_page(gfp_flags);
 111                         if (unlikely(!page))
 112                                 goto out_free_pages;
 113
 114                         bio->bi_io_vec[i].bv_page = page;
 115                         bio->bi_vcnt = i+1;
 116                 }
 117         }
 118         /* If not user-requests, copy the page pointers to all bios */
 119         if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
 120                 for (i=0; i<RESYNC_PAGES ; i++)
 121                         for (j=1; j<pi->raid_disks; j++)
 122                                 r1_bio->bios[j]->bi_io_vec[i].bv_page =
 123                                         r1_bio->bios[0]->bi_io_vec[i].bv_page;
 124         }
 125
 126         r1_bio->master_bio = NULL;
 127
 128         return r1_bio;
 129
 130 out_free_pages:
 131         for (j=0 ; j < pi->raid_disks; j++)
 132                 for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
 133                         put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 134         j = -1;
 135 out_free_bio:
 136         while ( ++j < pi->raid_disks )
 137                 bio_put(r1_bio->bios[j]);
 138         r1bio_pool_free(r1_bio, data);
 139         return NULL;
 140 }
 141
 142 static void r1buf_pool_free(void *__r1_bio, void *data)
 143 {
 144         struct pool_info *pi = data;
 145         int i,j;
 146         r1bio_t *r1bio = __r1_bio;
 147
 148         for (i = 0; i < RESYNC_PAGES; i++)
 149                 for (j = pi->raid_disks; j-- ;) {
 150                         if (j == 0 ||
 151                             r1bio->bios[j]->bi_io_vec[i].bv_page !=
 152                             r1bio->bios[0]->bi_io_vec[i].bv_page)
 153                                 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 154                 }
 155         for (i=0 ; i < pi->raid_disks; i++)
 156                 bio_put(r1bio->bios[i]);
 157
 158         r1bio_pool_free(r1bio, data);
 159 }
 160
 161 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 162 {
 163         int i;
 164
 165         for (i = 0; i < conf->raid_disks; i++) {
 166                 struct bio **bio = r1_bio->bios + i;
 167                 if (!BIO_SPECIAL(*bio))
 168                         bio_put(*bio);
 169                 *bio = NULL;
 170         }
 171 }
 172
 173 static void free_r1bio(r1bio_t *r1_bio)
 174 {
 175         conf_t *conf = r1_bio->mddev->private;
 176
 177         put_all_bios(conf, r1_bio);
 178         mempool_free(r1_bio, conf->r1bio_pool);
 179 }
 180
 181 static void put_buf(r1bio_t *r1_bio)
 182 {
 183         conf_t *conf = r1_bio->mddev->private;
 184         int i;
 185
 186         for (i=0; i<conf->raid_disks; i++) {
 187                 struct bio *bio = r1_bio->bios[i];
 188                 if (bio->bi_end_io)
 189                         rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
 190         }
 191
 192         mempool_free(r1_bio, conf->r1buf_pool);
 193
 194         lower_barrier(conf);
 195 }
 196
 197 static void reschedule_retry(r1bio_t *r1_bio)
 198 {
 199         unsigned long flags;
 200         mddev_t *mddev = r1_bio->mddev;
 201         conf_t *conf = mddev->private;
 202
 203         spin_lock_irqsave(&conf->device_lock, flags);
 204         list_add(&r1_bio->retry_list, &conf->retry_list);
 205         conf->nr_queued ++;
 206         spin_unlock_irqrestore(&conf->device_lock, flags);
 207
 208         wake_up(&conf->wait_barrier);
 209         md_wakeup_thread(mddev->thread);
 210 }
 211
 212 /*
 213  * raid_end_bio_io() is called when we have finished servicing a mirrored
 214  * operation and are ready to return a success/failure code to the buffer
 215  * cache layer.
 216  */
 217 static void call_bio_endio(r1bio_t *r1_bio)
 218 {
 219         struct bio *bio = r1_bio->master_bio;
 220         int done;
 221         conf_t *conf = r1_bio->mddev->private;
 222
 223         if (bio->bi_phys_segments) {
 224                 unsigned long flags;
 225                 spin_lock_irqsave(&conf->device_lock, flags);
 226                 bio->bi_phys_segments--;
 227                 done = (bio->bi_phys_segments == 0);
 228                 spin_unlock_irqrestore(&conf->device_lock, flags);
 229         } else
 230                 done = 1;
 231
 232         if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
 233                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
 234         if (done) {
 235                 bio_endio(bio, 0);
 236                 /*
 237                  * Wake up any possible resync thread that waits for the device
 238                  * to go idle.
 239                  */
 240                 allow_barrier(conf);
 241         }
 242 }
 243
 244 static void raid_end_bio_io(r1bio_t *r1_bio)
 245 {
 246         struct bio *bio = r1_bio->master_bio;
 247
 248         /* if nobody has done the final endio yet, do it now */
 249         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
 250                 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
 251                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
 252                         (unsigned long long) bio->bi_sector,
 253                         (unsigned long long) bio->bi_sector +
 254                                 (bio->bi_size >> 9) - 1);
 255
 256                 call_bio_endio(r1_bio);
 257         }
 258         free_r1bio(r1_bio);
 259 }
 260
 261 /*
 262  * Update disk head position estimator based on IRQ completion info.
 263  */
 264 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
 265 {
 266         conf_t *conf = r1_bio->mddev->private;
 267
 268         conf->mirrors[disk].head_position =
 269                 r1_bio->sector + (r1_bio->sectors);
 270 }
 271
 272 static void raid1_end_read_request(struct bio *bio, int error)
 273 {
 274         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 275         r1bio_t *r1_bio = bio->bi_private;
 276         int mirror;
 277         conf_t *conf = r1_bio->mddev->private;
 278
 279         mirror = r1_bio->read_disk;
 280         /*
 281          * this branch is our 'one mirror IO has finished' event handler:
 282          */
 283         update_head_pos(mirror, r1_bio);
 284
 285         if (uptodate)
 286                 set_bit(R1BIO_Uptodate, &r1_bio->state);
 287         else {
 288                 /* If all other devices have failed, we want to return
 289                  * the error upwards rather than fail the last device.
 290                  * Here we redefine "uptodate" to mean "Don't want to retry"
 291                  */
 292                 unsigned long flags;
 293                 spin_lock_irqsave(&conf->device_lock, flags);
 294                 if (r1_bio->mddev->degraded == conf->raid_disks ||
 295                     (r1_bio->mddev->degraded == conf->raid_disks-1 &&
 296                      !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
 297                         uptodate = 1;
 298                 spin_unlock_irqrestore(&conf->device_lock, flags);
 299         }
 300
 301         if (uptodate)
 302                 raid_end_bio_io(r1_bio);
 303         else {
 304                 /*
 305                  * oops, read error:
 306                  */
 307                 char b[BDEVNAME_SIZE];
 308                 printk_ratelimited(
 309                         KERN_ERR "md/raid1:%s: %s: "
 310                         "rescheduling sector %llu\n",
 311                         mdname(conf->mddev),
 312                         bdevname(conf->mirrors[mirror].rdev->bdev,
 313                                  b),
 314                         (unsigned long long)r1_bio->sector);
 315                 set_bit(R1BIO_ReadError, &r1_bio->state);
 316                 reschedule_retry(r1_bio);
 317         }
 318
 319         rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 320 }
 321
 322 static void close_write(r1bio_t *r1_bio)
 323 {
 324         /* it really is the end of this request */
 325         if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 326                 /* free extra copy of the data pages */
 327                 int i = r1_bio->behind_page_count;
 328                 while (i--)
 329                         safe_put_page(r1_bio->behind_bvecs[i].bv_page);
 330                 kfree(r1_bio->behind_bvecs);
 331                 r1_bio->behind_bvecs = NULL;
 332         }
 333         /* clear the bitmap if all writes complete successfully */
 334         bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 335                         r1_bio->sectors,
 336                         !test_bit(R1BIO_Degraded, &r1_bio->state),
 337                         test_bit(R1BIO_BehindIO, &r1_bio->state));
 338         md_write_end(r1_bio->mddev);
 339 }
 340
 341 static void r1_bio_write_done(r1bio_t *r1_bio)
 342 {
 343         if (!atomic_dec_and_test(&r1_bio->remaining))
 344                 return;
 345
 346         if (test_bit(R1BIO_WriteError, &r1_bio->state))
 347                 reschedule_retry(r1_bio);
 348         else {
 349                 close_write(r1_bio);
 350                 if (test_bit(R1BIO_MadeGood, &r1_bio->state))
 351                         reschedule_retry(r1_bio);
 352                 else
 353                         raid_end_bio_io(r1_bio);
 354         }
 355 }
 356
 357 static void raid1_end_write_request(struct bio *bio, int error)
 358 {
 359         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 360         r1bio_t *r1_bio = bio->bi_private;
 361         int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
 362         conf_t *conf = r1_bio->mddev->private;
 363         struct bio *to_put = NULL;
 364
 365
 366         for (mirror = 0; mirror < conf->raid_disks; mirror++)
 367                 if (r1_bio->bios[mirror] == bio)
 368                         break;
 369
 370         /*
 371          * 'one mirror IO has finished' event handler:
 372          */
 373         if (!uptodate) {
 374                 set_bit(WriteErrorSeen,
 375                         &conf->mirrors[mirror].rdev->flags);
 376                 set_bit(R1BIO_WriteError, &r1_bio->state);
 377         } else {
 378                 /*
 379                  * Set R1BIO_Uptodate in our master bio, so that we
 380                  * will return a good error code for to the higher
 381                  * levels even if IO on some other mirrored buffer
 382                  * fails.
 383                  *
 384                  * The 'master' represents the composite IO operation
 385                  * to user-side. So if something waits for IO, then it
 386                  * will wait for the 'master' bio.
 387                  */
 388                 sector_t first_bad;
 389                 int bad_sectors;
 390
 391                 r1_bio->bios[mirror] = NULL;
 392                 to_put = bio;
 393                 set_bit(R1BIO_Uptodate, &r1_bio->state);
 394
 395                 /* Maybe we can clear some bad blocks. */
 396                 if (is_badblock(conf->mirrors[mirror].rdev,
 397                                 r1_bio->sector, r1_bio->sectors,
 398                                 &first_bad, &bad_sectors)) {
 399                         r1_bio->bios[mirror] = IO_MADE_GOOD;
 400                         set_bit(R1BIO_MadeGood, &r1_bio->state);
 401                 }
 402         }
 403
 404         update_head_pos(mirror, r1_bio);
 405
 406         if (behind) {
 407                 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
 408                         atomic_dec(&r1_bio->behind_remaining);
 409
 410                 /*
 411                  * In behind mode, we ACK the master bio once the I/O
 412                  * has safely reached all non-writemostly
 413                  * disks. Setting the Returned bit ensures that this
 414                  * gets done only once -- we don't ever want to return
 415                  * -EIO here, instead we'll wait
 416                  */
 417                 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
 418                     test_bit(R1BIO_Uptodate, &r1_bio->state)) {
 419                         /* Maybe we can return now */
 420                         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
 421                                 struct bio *mbio = r1_bio->master_bio;
 422                                 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
 423                                        (unsigned long long) mbio->bi_sector,
 424                                        (unsigned long long) mbio->bi_sector +
 425                                        (mbio->bi_size >> 9) - 1);
 426                                 call_bio_endio(r1_bio);
 427                         }
 428                 }
 429         }
 430         if (r1_bio->bios[mirror] == NULL)
 431                 rdev_dec_pending(conf->mirrors[mirror].rdev,
 432                                  conf->mddev);
 433
 434         /*
 435          * Let's see if all mirrored write operations have finished
 436          * already.
 437          */
 438         r1_bio_write_done(r1_bio);
 439
 440         if (to_put)
 441                 bio_put(to_put);
 442 }
 443
 444
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 *
 * Returns the index of the chosen mirror, or -1 if no usable device was
 * found.  *max_sectors is always written: when a mirror was chosen it is
 * the number of sectors (starting at r1_bio->sector) considered readable
 * from that mirror.
 */
static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
{
        const sector_t this_sector = r1_bio->sector;
        int sectors;
        int best_good_sectors;
        int start_disk;
        int best_disk;
        int i;
        sector_t best_dist;
        mdk_rdev_t *rdev;
        int choose_first;

        rcu_read_lock();
        /*
         * Check if we can balance. We can balance on the whole
         * device if no resync is going on, or below the resync window.
         * We take the first readable disk when above the resync window.
         */
 retry:
        /* (re)start the search with clean state */
        sectors = r1_bio->sectors;
        best_disk = -1;
        best_dist = MaxSector;
        best_good_sectors = 0;

        if (conf->mddev->recovery_cp < MaxSector &&
            (this_sector + sectors >= conf->next_resync)) {
                choose_first = 1;
                start_disk = 0;
        } else {
                choose_first = 0;
                start_disk = conf->last_used;
        }

        for (i = 0 ; i < conf->raid_disks ; i++) {
                sector_t dist;
                sector_t first_bad;
                int bad_sectors;

                /* visit every mirror once, starting at start_disk and wrapping */
                int disk = start_disk + i;
                if (disk >= conf->raid_disks)
                        disk -= conf->raid_disks;

                rdev = rcu_dereference(conf->mirrors[disk].rdev);
                if (r1_bio->bios[disk] == IO_BLOCKED
                    || rdev == NULL
                    || test_bit(Faulty, &rdev->flags))
                        continue;
                /* not In_sync and recovery_offset lies before the end of the request */
                if (!test_bit(In_sync, &rdev->flags) &&
                    rdev->recovery_offset < this_sector + sectors)
                        continue;
                if (test_bit(WriteMostly, &rdev->flags)) {
                        /* Don't balance among write-mostly, just
                         * use the first as a last resort */
                        if (best_disk < 0)
                                best_disk = disk;
                        continue;
                }
                /* This is a reasonable device to use.  It might
                 * even be best.
                 */
                if (is_badblock(rdev, this_sector, sectors,
                                &first_bad, &bad_sectors)) {
                        if (best_dist < MaxSector)
                                /* already have a better device */
                                continue;
                        if (first_bad <= this_sector) {
                                /* cannot read here. If this is the 'primary'
                                 * device, then we must not read beyond
                                 * bad_sectors from another device..
                                 */
                                bad_sectors -= (this_sector - first_bad);
                                if (choose_first && sectors > bad_sectors)
                                        sectors = bad_sectors;
                                if (best_good_sectors > sectors)
                                        best_good_sectors = sectors;

                        } else {
                                /* the request starts on good sectors: a partial read is possible */
                                sector_t good_sectors = first_bad - this_sector;
                                if (good_sectors > best_good_sectors) {
                                        best_good_sectors = good_sectors;
                                        best_disk = disk;
                                }
                                if (choose_first)
                                        break;
                        }
                        continue;
                } else
                        best_good_sectors = sectors;

                /* distance between the request start and this mirror's recorded head position */
                dist = abs(this_sector - conf->mirrors[disk].head_position);
                if (choose_first
                    /* Don't change to another disk for sequential reads */
                    || conf->next_seq_sect == this_sector
                    || dist == 0
                    /* If device is idle, use it */
                    || atomic_read(&rdev->nr_pending) == 0) {
                        best_disk = disk;
                        break;
                }
                if (dist < best_dist) {
                        best_dist = dist;
                        best_disk = disk;
                }
        }

        if (best_disk >= 0) {
                /* re-read the rdev pointer: if it is gone now, search again */
                rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
                if (!rdev)
                        goto retry;
                atomic_inc(&rdev->nr_pending);
                if (test_bit(Faulty, &rdev->flags)) {
                        /* cannot risk returning a device that failed
                         * before we inc'ed nr_pending
                         */
                        rdev_dec_pending(rdev, conf->mddev);
                        goto retry;
                }
                sectors = best_good_sectors;
                /* remember where a sequential follow-up read would start, and on which disk */
                conf->next_seq_sect = this_sector + sectors;
                conf->last_used = best_disk;
        }
        rcu_read_unlock();
        *max_sectors = sectors;

        return best_disk;
}
 585
 586 int md_raid1_congested(mddev_t *mddev, int bits)
 587 {
 588         conf_t *conf = mddev->private;
 589         int i, ret = 0;
 590
 591         rcu_read_lock();
 592         for (i = 0; i < mddev->raid_disks; i++) {
 593                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 594                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
 595                         struct request_queue *q = bdev_get_queue(rdev->bdev);
 596
 597                         BUG_ON(!q);
 598
 599                         /* Note the '|| 1' - when read_balance prefers
 600                          * non-congested targets, it can be removed
 601                          */
 602                         if ((bits & (1<<BDI_async_congested)) || 1)
 603                                 ret |= bdi_congested(&q->backing_dev_info, bits);
 604                         else
 605                                 ret &= bdi_congested(&q->backing_dev_info, bits);
 606                 }
 607         }
 608         rcu_read_unlock();
 609         return ret;
 610 }
 611 EXPORT_SYMBOL_GPL(md_raid1_congested);
 612
 613 static int raid1_congested(void *data, int bits)
 614 {
 615         mddev_t *mddev = data;
 616
 617         return mddev_congested(mddev, bits) ||
 618                 md_raid1_congested(mddev, bits);
 619 }
 620
 621 static void flush_pending_writes(conf_t *conf)
 622 {
 623         /* Any writes that have been queued but are awaiting
 624          * bitmap updates get flushed here.
 625          */
 626         spin_lock_irq(&conf->device_lock);
 627
 628         if (conf->pending_bio_list.head) {
 629                 struct bio *bio;
 630                 bio = bio_list_get(&conf->pending_bio_list);
 631                 spin_unlock_irq(&conf->device_lock);
 632                 /* flush any pending bitmap writes to
 633                  * disk before proceeding w/ I/O */
 634                 bitmap_unplug(conf->mddev->bitmap);
 635
 636                 while (bio) { /* submit pending writes */
 637                         struct bio *next = bio->bi_next;
 638                         bio->bi_next = NULL;
 639                         generic_make_request(bio);
 640                         bio = next;
 641                 }
 642         } else
 643                 spin_unlock_irq(&conf->device_lock);
 644 }
 645
  646 /* Barriers....
  647  * Sometimes we need to suspend IO while we do something else,
  648  * either some resync/recovery, or reconfigure the array.
  649  * To do this we raise a 'barrier'.
  650  * The 'barrier' is a counter that can be raised multiple times
  651  * to count how many activities are happening which preclude
  652  * normal IO.
  653  * We can only raise the barrier if there is no pending IO.
  654  * i.e. if nr_pending == 0.
  655  * We choose only to raise the barrier if no-one is waiting for the
  656  * barrier to go down.  This means that as soon as an IO request
  657  * is ready, no other operations which require a barrier will start
  658  * until the IO request has had a chance.
  659  *
  660  * So: regular IO calls 'wait_barrier'.  When that returns there
  661  *    is no background IO happening.  It must arrange to call
  662  *    allow_barrier when it has finished its IO.
  663  * background IO calls must call raise_barrier.  Once that returns
  664  *    there is no normal IO happening.  It must arrange to call
  665  *    lower_barrier when the particular background IO completes.
  666  */
 667 #define RESYNC_DEPTH 32
 668
 669 static void raise_barrier(conf_t *conf)
 670 {
 671         spin_lock_irq(&conf->resync_lock);
 672
 673         /* Wait until no block IO is waiting */
 674         wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
 675                             conf->resync_lock, );
 676
 677         /* block any new IO from starting */
 678         conf->barrier++;
 679
 680         /* Now wait for all pending IO to complete */
 681         wait_event_lock_irq(conf->wait_barrier,
 682                             !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
 683                             conf->resync_lock, );
 684
 685         spin_unlock_irq(&conf->resync_lock);
 686 }
 687
 688 static void lower_barrier(conf_t *conf)
 689 {
 690         unsigned long flags;
 691         BUG_ON(conf->barrier <= 0);
 692         spin_lock_irqsave(&conf->resync_lock, flags);
 693         conf->barrier--;
 694         spin_unlock_irqrestore(&conf->resync_lock, flags);
 695         wake_up(&conf->wait_barrier);
 696 }
 697
 698 static void wait_barrier(conf_t *conf)
 699 {
 700         spin_lock_irq(&conf->resync_lock);
 701         if (conf->barrier) {
 702                 conf->nr_waiting++;
 703                 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 704                                     conf->resync_lock,
 705                                     );
 706                 conf->nr_waiting--;
 707         }
 708         conf->nr_pending++;
 709         spin_unlock_irq(&conf->resync_lock);
 710 }
 711
 712 static void allow_barrier(conf_t *conf)
 713 {
 714         unsigned long flags;
 715         spin_lock_irqsave(&conf->resync_lock, flags);
 716         conf->nr_pending--;
 717         spin_unlock_irqrestore(&conf->resync_lock, flags);
 718         wake_up(&conf->wait_barrier);
 719 }
 720
 721 static void freeze_array(conf_t *conf)
 722 {
 723         /* stop syncio and normal IO and wait for everything to
 724          * go quite.
 725          * We increment barrier and nr_waiting, and then
 726          * wait until nr_pending match nr_queued+1
 727          * This is called in the context of one normal IO request
 728          * that has failed. Thus any sync request that might be pending
 729          * will be blocked by nr_pending, and we need to wait for
 730          * pending IO requests to complete or be queued for re-try.
 731          * Thus the number queued (nr_queued) plus this request (1)
 732          * must match the number of pending IOs (nr_pending) before
 733          * we continue.
 734          */
 735         spin_lock_irq(&conf->resync_lock);
 736         conf->barrier++;
 737         conf->nr_waiting++;
 738         wait_event_lock_irq(conf->wait_barrier,
 739                             conf->nr_pending == conf->nr_queued+1,
 740                             conf->resync_lock,
 741                             flush_pending_writes(conf));
 742         spin_unlock_irq(&conf->resync_lock);
 743 }
 744 static void unfreeze_array(conf_t *conf)
 745 {
 746         /* reverse the effect of the freeze */
 747         spin_lock_irq(&conf->resync_lock);
 748         conf->barrier--;
 749         conf->nr_waiting--;
 750         wake_up(&conf->wait_barrier);
 751         spin_unlock_irq(&conf->resync_lock);
 752 }
 753
 754
 755 /* duplicate the data pages for behind I/O
 756  */
 757 static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 758 {
 759         int i;
 760         struct bio_vec *bvec;
 761         struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
 762                                         GFP_NOIO);
 763         if (unlikely(!bvecs))
 764                 return;
 765
 766         bio_for_each_segment(bvec, bio, i) {
 767                 bvecs[i] = *bvec;
 768                 bvecs[i].bv_page = alloc_page(GFP_NOIO);
 769                 if (unlikely(!bvecs[i].bv_page))
 770                         goto do_sync_io;
 771                 memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
 772                        kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
 773                 kunmap(bvecs[i].bv_page);
 774                 kunmap(bvec->bv_page);
 775         }
 776         r1_bio->behind_bvecs = bvecs;
 777         r1_bio->behind_page_count = bio->bi_vcnt;
 778         set_bit(R1BIO_BehindIO, &r1_bio->state);
 779         return;
 780
 781 do_sync_io:
 782         for (i = 0; i < bio->bi_vcnt; i++)
 783                 if (bvecs[i].bv_page)
 784                         put_page(bvecs[i].bv_page);
 785         kfree(bvecs);
 786         PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 787 }
 788
/*
 * make_request - handle one bio submitted to the RAID1 array.
 * @mddev: the array the bio is addressed to
 * @bio:   the incoming (master) bio
 *
 * READ:  a single mirror is chosen with read_balance(), the bio is cloned,
 *        trimmed to what that mirror can serve and submitted.  If only part
 *        of the range could be covered, the partial r1_bio is handed to
 *        reschedule_retry() and a fresh r1_bio is set up for the remainder.
 *
 * WRITE: one clone per usable mirror is queued on conf->pending_bio_list.
 *        Known bad blocks on devices that have seen a write error limit
 *        how much can be written in one go, so the write may be split
 *        into several r1_bios as well.
 *
 * bio->bi_phys_segments is used as a counter of outstanding r1_bios when
 * the request is split (see the comment in the body).
 *
 * Every visible return path returns 0.
 */
static int make_request(mddev_t *mddev, struct bio * bio)
{
        conf_t *conf = mddev->private;
        mirror_info_t *mirror;
        r1bio_t *r1_bio;
        struct bio *read_bio;
        int i, disks;
        struct bitmap *bitmap;
        unsigned long flags;
        const int rw = bio_data_dir(bio);
        /* request flags of the master bio that are propagated to the clones */
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
        mdk_rdev_t *blocked_rdev;
        int plugged;
        int first_clone;
        int sectors_handled;
        int max_sectors;

        /*
         * Register the new request and wait if the reconstruction
         * thread has put up a bar for new requests.
         * Continue immediately if no resync is active currently.
         */

        md_write_start(mddev, bio); /* wait on superblock update early */

        /* a write that overlaps [suspend_lo, suspend_hi) must wait */
        if (bio_data_dir(bio) == WRITE &&
            bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
            bio->bi_sector < mddev->suspend_hi) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
                 * wait.
                 */
                DEFINE_WAIT(w);
                for (;;) {
                        flush_signals(current);
                        prepare_to_wait(&conf->wait_barrier,
                                        &w, TASK_INTERRUPTIBLE);
                        /* leave once the bio no longer overlaps the range */
                        if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
                            bio->bi_sector >= mddev->suspend_hi)
                                break;
                        schedule();
                }
                finish_wait(&conf->wait_barrier, &w);
        }

        wait_barrier(conf);

        bitmap = mddev->bitmap;

        /*
         * Allocate the r1_bio that tracks this request from the mempool.
         * The result is used without a NULL check.
         * NOTE(review): presumably safe because a GFP_NOIO mempool_alloc()
         * sleeps instead of failing -- confirm.
         */
        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

        r1_bio->master_bio = bio;
        r1_bio->sectors = bio->bi_size >> 9;
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
        r1_bio->sector = bio->bi_sector;

        /* We might need to issue multiple reads to different
         * devices if there are bad blocks around, so we keep
         * track of the number of reads in bio->bi_phys_segments.
         * If this is 0, there is only one r1_bio and no locking
         * will be needed when requests complete.  If it is
         * non-zero, then it is the number of not-completed requests.
         */
        bio->bi_phys_segments = 0;
        clear_bit(BIO_SEG_VALID, &bio->bi_flags);

        if (rw == READ) {
                /*
                 * read balancing logic:
                 */
                int rdisk;

read_again:
                /* pick a mirror; max_sectors is how much it can serve */
                rdisk = read_balance(conf, r1_bio, &max_sectors);

                if (rdisk < 0) {
                        /* couldn't find anywhere to read from */
                        raid_end_bio_io(r1_bio);
                        return 0;
                }
                mirror = conf->mirrors + rdisk;

                if (test_bit(WriteMostly, &mirror->rdev->flags) &&
                    bitmap) {
                        /* Reading from a write-mostly device must
                         * take care not to over-take any writes
                         * that are 'behind'
                         */
                        wait_event(bitmap->behind_wait,
                                   atomic_read(&bitmap->behind_writes) == 0);
                }
                r1_bio->read_disk = rdisk;

                /* clone the master bio and cut it down to this r1_bio's range */
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
                md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
                            max_sectors);

                r1_bio->bios[rdisk] = read_bio;

                /* redirect the clone to the chosen member device */
                read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
                read_bio->bi_bdev = mirror->rdev->bdev;
                read_bio->bi_end_io = raid1_end_read_request;
                read_bio->bi_rw = READ | do_sync;
                read_bio->bi_private = r1_bio;

                if (max_sectors < r1_bio->sectors) {
                        /* could not read all from this device, so we will
                         * need another r1_bio.
                         */

                        sectors_handled = (r1_bio->sector + max_sectors
                                           - bio->bi_sector);
                        r1_bio->sectors = max_sectors;
                        /* first split counts both halves, later ones add one */
                        spin_lock_irq(&conf->device_lock);
                        if (bio->bi_phys_segments == 0)
                                bio->bi_phys_segments = 2;
                        else
                                bio->bi_phys_segments++;
                        spin_unlock_irq(&conf->device_lock);
                        /* Cannot call generic_make_request directly
                         * as that will be queued in __make_request
                         * and subsequent mempool_alloc might block waiting
                         * for it.  So hand bio over to raid1d.
                         */
                        reschedule_retry(r1_bio);

                        /* set up a new r1_bio covering the unread remainder */
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

                        r1_bio->master_bio = bio;
                        r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio->mddev = mddev;
                        r1_bio->sector = bio->bi_sector + sectors_handled;
                        goto read_again;
                } else
                        generic_make_request(read_bio);
                return 0;
        }

        /*
         * WRITE:
         */
        /* first select target devices under rcu_lock and
         * inc refcount on their rdev.  Record them by setting
         * bios[x] to bio
         * If there are known/acknowledged bad blocks on any device on
         * which we have seen a write error, we want to avoid writing those
         * blocks.
         * This potentially requires several writes to write around
         * the bad blocks.  Each set of writes gets its own r1bio
         * with a set of bios attached.
         */
        plugged = mddev_check_plugged(mddev);

        disks = conf->raid_disks;
 retry_write:
        blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r1_bio->sectors;
        for (i = 0;  i < disks; i++) {
                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
                        /* pin the blocked device so we can wait on it below */
                        atomic_inc(&rdev->nr_pending);
                        blocked_rdev = rdev;
                        break;
                }
                r1_bio->bios[i] = NULL;
                if (!rdev || test_bit(Faulty, &rdev->flags)) {
                        set_bit(R1BIO_Degraded, &r1_bio->state);
                        continue;
                }

                atomic_inc(&rdev->nr_pending);
                if (test_bit(WriteErrorSeen, &rdev->flags)) {
                        sector_t first_bad;
                        int bad_sectors;
                        int is_bad;

                        is_bad = is_badblock(rdev, r1_bio->sector,
                                             max_sectors,
                                             &first_bad, &bad_sectors);
                        if (is_bad < 0) {
                                /* mustn't write here until the bad block is
                                 * acknowledged*/
                                set_bit(BlockedBadBlocks, &rdev->flags);
                                blocked_rdev = rdev;
                                break;
                        }
                        if (is_bad && first_bad <= r1_bio->sector) {
                                /* Cannot write here at all */
                                bad_sectors -= (r1_bio->sector - first_bad);
                                if (bad_sectors < max_sectors)
                                        /* mustn't write more than bad_sectors
                                         * to other devices yet
                                         */
                                        max_sectors = bad_sectors;
                                rdev_dec_pending(rdev, mddev);
                                /* We don't set R1BIO_Degraded as that
                                 * only applies if the disk is
                                 * missing, so it might be re-added,
                                 * and we want to know to recover this
                                 * chunk.
                                 * In this case the device is here,
                                 * and the fact that this chunk is not
                                 * in-sync is recorded in the bad
                                 * block log
                                 */
                                continue;
                        }
                        if (is_bad) {
                                /* bad range starts later: write only up to it */
                                int good_sectors = first_bad - r1_bio->sector;
                                if (good_sectors < max_sectors)
                                        max_sectors = good_sectors;
                        }
                }
                /* mark mirror i as a write target; cloned further down */
                r1_bio->bios[i] = bio;
        }
        rcu_read_unlock();

        if (unlikely(blocked_rdev)) {
                /* Wait for this device to become unblocked */
                int j;

                /* drop the references taken on mirrors 0..i-1 above */
                for (j = 0; j < i; j++)
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                r1_bio->state = 0;
                /* step out of the barrier while sleeping, re-enter afterwards.
                 * NOTE(review): the reference held on blocked_rdev is
                 * presumably dropped by md_wait_for_blocked_rdev() -- confirm.
                 */
                allow_barrier(conf);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
                wait_barrier(conf);
                goto retry_write;
        }

        if (max_sectors < r1_bio->sectors) {
                /* We are splitting this write into multiple parts, so
                 * we need to prepare for allocating another r1_bio.
                 */
                r1_bio->sectors = max_sectors;
                spin_lock_irq(&conf->device_lock);
                if (bio->bi_phys_segments == 0)
                        bio->bi_phys_segments = 2;
                else
                        bio->bi_phys_segments++;
                spin_unlock_irq(&conf->device_lock);
        }
        /* sectors of the master bio covered once this r1_bio is issued */
        sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;

        /* start at 1; every queued clone below adds one more.
         * NOTE(review): the initial count is presumably dropped by
         * r1_bio_write_done() after the loop -- confirm.
         */
        atomic_set(&r1_bio->remaining, 1);
        atomic_set(&r1_bio->behind_remaining, 0);

        first_clone = 1;
        for (i = 0; i < disks; i++) {
                struct bio *mbio;
                if (!r1_bio->bios[i])
                        continue;

                mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
                md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);

                if (first_clone) {
                        /* do behind I/O ?
                         * Not if there are too many, or cannot
                         * allocate memory, or a reader on WriteMostly
                         * is waiting for behind writes to flush */
                        if (bitmap &&
                            (atomic_read(&bitmap->behind_writes)
                             < mddev->bitmap_info.max_write_behind) &&
                            !waitqueue_active(&bitmap->behind_wait))
                                alloc_behind_pages(mbio, r1_bio);

                        /* done once per r1_bio, on the first clone only */
                        bitmap_startwrite(bitmap, r1_bio->sector,
                                          r1_bio->sectors,
                                          test_bit(R1BIO_BehindIO,
                                                   &r1_bio->state));
                        first_clone = 0;
                }
                if (r1_bio->behind_bvecs) {
                        struct bio_vec *bvec;
                        int j;

                        /* Yes, I really want the '__' version so that
                         * we clear any unused pointer in the io_vec, rather
                         * than leave them unchanged.  This is important
                         * because when we come to free the pages, we won't
                         * know the original bi_idx, so we just free
                         * them all
                         */
                        __bio_for_each_segment(bvec, mbio, j, 0)
                                bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
                        /* count the clones that go to write-mostly devices */
                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
                }

                /* replace the placeholder set during target selection */
                r1_bio->bios[i] = mbio;

                mbio->bi_sector = (r1_bio->sector +
                                   conf->mirrors[i].rdev->data_offset);
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
                mbio->bi_private = r1_bio;

                /* the clone is not submitted here, only queued */
                atomic_inc(&r1_bio->remaining);
                spin_lock_irqsave(&conf->device_lock, flags);
                bio_list_add(&conf->pending_bio_list, mbio);
                spin_unlock_irqrestore(&conf->device_lock, flags);
        }
        r1_bio_write_done(r1_bio);

        /* In case raid1d snuck in to freeze_array */
        wake_up(&conf->wait_barrier);

        if (sectors_handled < (bio->bi_size >> 9)) {
                /* We need another r1_bio.  It has already been counted
                 * in bio->bi_phys_segments
                 */
                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                r1_bio->master_bio = bio;
                r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
                r1_bio->sector = bio->bi_sector + sectors_handled;
                goto retry_write;
        }

        /* wake the md thread now for sync writes, when there is no bitmap,
         * or when mddev_check_plugged() returned 0.
         * NOTE(review): in the remaining case the wakeup presumably comes
         * from the unplug path -- confirm.
         */
        if (do_sync || !bitmap || !plugged)
                md_wakeup_thread(mddev->thread);

        return 0;
}
1126
1127 static void status(struct seq_file *seq, mddev_t *mddev)
1128 {
1129         conf_t *conf = mddev->private;
1130         int i;
1131
1132         seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1133                    conf->raid_disks - mddev->degraded);
1134         rcu_read_lock();
1135         for (i = 0; i < conf->raid_disks; i++) {
1136                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1137                 seq_printf(seq, "%s",
1138                            rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1139         }
1140         rcu_read_unlock();
1141         seq_printf(seq, "]");
1142 }
1143
1144
1145 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1146 {
1147         char b[BDEVNAME_SIZE];
1148         conf_t *conf = mddev->private;
1149
1150         /*
1151          * If it is not operational, then we have already marked it as dead
1152          * else if it is the last working disks, ignore the error, let the
1153          * next level up know.
1154          * else mark the drive as failed
1155          */
1156         if (test_bit(In_sync, &rdev->flags)
1157             && (conf->raid_disks - mddev->degraded) == 1) {
1158                 /*
1159                  * Don't fail the drive, act as though we were just a
1160                  * normal single drive.
1161                  * However don't try a recovery from this drive as
1162                  * it is very likely to fail.
1163                  */
1164                 conf->recovery_disabled = mddev->recovery_disabled;
1165                 return;
1166         }
1167         set_bit(Blocked, &rdev->flags);
1168         if (test_and_clear_bit(In_sync, &rdev->flags)) {
1169                 unsigned long flags;
1170                 spin_lock_irqsave(&conf->device_lock, flags);
1171                 mddev->degraded++;
1172                 set_bit(Faulty, &rdev->flags);
1173                 spin_unlock_irqrestore(&conf->device_lock, flags);
1174                 /*
1175                  * if recovery is running, make sure it aborts.
1176                  */
1177                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1178         } else
1179                 set_bit(Faulty, &rdev->flags);
1180         set_bit(MD_CHANGE_DEVS, &mddev->flags);
1181         printk(KERN_ALERT
1182                "md/raid1:%s: Disk failure on %s, disabling device.\n"
1183                "md/raid1:%s: Operation continuing on %d devices.\n",
1184                mdname(mddev), bdevname(rdev->bdev, b),
1185                mdname(mddev), conf->raid_disks - mddev->degraded);
1186 }
1187
1188 static void print_conf(conf_t *conf)
1189 {
1190         int i;
1191
1192         printk(KERN_DEBUG "RAID1 conf printout:\n");
1193         if (!conf) {
1194                 printk(KERN_DEBUG "(!conf)\n");
1195                 return;
1196         }
1197         printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1198                 conf->raid_disks);
1199
1200         rcu_read_lock();
1201         for (i = 0; i < conf->raid_disks; i++) {
1202                 char b[BDEVNAME_SIZE];
1203                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1204                 if (rdev)
1205                         printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1206                                i, !test_bit(In_sync, &rdev->flags),
1207                                !test_bit(Faulty, &rdev->flags),
1208                                bdevname(rdev->bdev,b));
1209         }
1210         rcu_read_unlock();
1211 }
1212
/*
 * close_sync - destroy conf->r1buf_pool.
 *
 * Passes through the barrier once (wait_barrier() immediately followed
 * by allow_barrier()), then destroys the mempool and clears the pointer.
 * NOTE(review): the wait/allow pair presumably ensures that any barrier
 * raised by resync has been lowered before the pool goes away -- confirm
 * against wait_barrier()/raise_barrier().
 */
static void close_sync(conf_t *conf)
{
        wait_barrier(conf);
        allow_barrier(conf);

        mempool_destroy(conf->r1buf_pool);
        /* leave no dangling pointer to the destroyed pool */
        conf->r1buf_pool = NULL;
}
1221
1222 static int raid1_spare_active(mddev_t *mddev)
1223 {
1224         int i;
1225         conf_t *conf = mddev->private;
1226         int count = 0;
1227         unsigned long flags;
1228
1229         /*
1230          * Find all failed disks within the RAID1 configuration
1231          * and mark them readable.
1232          * Called under mddev lock, so rcu protection not needed.
1233          */
1234         for (i = 0; i < conf->raid_disks; i++) {
1235                 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1236                 if (rdev
1237                     && !test_bit(Faulty, &rdev->flags)
1238                     && !test_and_set_bit(In_sync, &rdev->flags)) {
1239                         count++;
1240                         sysfs_notify_dirent_safe(rdev->sysfs_state);
1241                 }
1242         }
1243         spin_lock_irqsave(&conf->device_lock, flags);
1244         mddev->degraded -= count;
1245         spin_unlock_irqrestore(&conf->device_lock, flags);
1246
1247         print_conf(conf);
1248         return count;
1249 }
1250
1251
1252 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1253 {
1254         conf_t *conf = mddev->private;
1255         int err = -EEXIST;
1256         int mirror = 0;
1257         mirror_info_t *p;
1258         int first = 0;
1259         int last = mddev->raid_disks - 1;
1260
1261         if (mddev->recovery_disabled == conf->recovery_disabled)
1262                 return -EBUSY;
1263
1264         if (rdev->raid_disk >= 0)
1265                 first = last = rdev->raid_disk;
1266
1267         for (mirror = first; mirror <= last; mirror++)
1268                 if ( !(p=conf->mirrors+mirror)->rdev) {
1269
1270                         disk_stack_limits(mddev->gendisk, rdev->bdev,
1271                                           rdev->data_offset << 9);
1272                         /* as we don't honour merge_bvec_fn, we must
1273                          * never risk violating it, so limit
1274                          * ->max_segments to one lying with a single
1275                          * page, as a one page request is never in
1276                          * violation.
1277                          */
1278                         if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1279                                 blk_queue_max_segments(mddev->queue, 1);
1280                                 blk_queue_segment_boundary(mddev->queue,
1281                                                            PAGE_CACHE_SIZE - 1);
1282                         }
1283
1284                         p->head_position = 0;
1285                         rdev->raid_disk = mirror;
1286                         err = 0;
1287                         /* As all devices are equivalent, we don't need a full recovery
1288                          * if this was recently any drive of the array
1289                          */
1290                         if (rdev->saved_raid_disk < 0)
1291                                 conf->fullsync = 1;
1292                         rcu_assign_pointer(p->rdev, rdev);
1293                         break;
1294                 }
1295         md_integrity_add_rdev(rdev, mddev);
1296         print_conf(conf);
1297         return err;
1298 }
1299
1300 static int raid1_remove_disk(mddev_t *mddev, int number)
1301 {
1302         conf_t *conf = mddev->private;
1303         int err = 0;
1304         mdk_rdev_t *rdev;
1305         mirror_info_t *p = conf->mirrors+ number;
1306
1307         print_conf(conf);
1308         rdev = p->rdev;
1309         if (rdev) {
1310                 if (test_bit(In_sync, &rdev->flags) ||
1311                     atomic_read(&rdev->nr_pending)) {
1312                         err = -EBUSY;
1313                         goto abort;
1314                 }
1315                 /* Only remove non-faulty devices if recovery
1316                  * is not possible.
1317                  */
1318                 if (!test_bit(Faulty, &rdev->flags) &&
1319                     mddev->recovery_disabled != conf->recovery_disabled &&
1320                     mddev->degraded < conf->raid_disks) {
1321                         err = -EBUSY;
1322                         goto abort;
1323                 }
1324                 p->rdev = NULL;
1325                 synchronize_rcu();
1326                 if (atomic_read(&rdev->nr_pending)) {
1327                         /* lost the race, try later */
1328                         err = -EBUSY;
1329                         p->rdev = rdev;
1330                         goto abort;
1331                 }
1332                 err = md_integrity_register(mddev);
1333         }
1334 abort:
1335
1336         print_conf(conf);
1337         return err;
1338 }
1339
1340
1341 static void end_sync_read(struct bio *bio, int error)
1342 {
1343         r1bio_t *r1_bio = bio->bi_private;
1344         int i;
1345
1346         for (i=r1_bio->mddev->raid_disks; i--; )
1347                 if (r1_bio->bios[i] == bio)
1348                         break;
1349         BUG_ON(i < 0);
1350         update_head_pos(i, r1_bio);
1351         /*
1352          * we have read a block, now it needs to be re-written,
1353          * or re-read if the read failed.
1354          * We don't do much here, just schedule handling by raid1d
1355          */
1356         if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1357                 set_bit(R1BIO_Uptodate, &r1_bio->state);
1358
1359         if (atomic_dec_and_test(&r1_bio->remaining))
1360                 reschedule_retry(r1_bio);
1361 }
1362
1363 static void end_sync_write(struct bio *bio, int error)
1364 {
1365         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1366         r1bio_t *r1_bio = bio->bi_private;
1367         mddev_t *mddev = r1_bio->mddev;
1368         conf_t *conf = mddev->private;
1369         int i;
1370         int mirror=0;
1371         sector_t first_bad;
1372         int bad_sectors;
1373
1374         for (i = 0; i < conf->raid_disks; i++)
1375                 if (r1_bio->bios[i] == bio) {
1376                         mirror = i;
1377                         break;
1378                 }
1379         if (!uptodate) {
1380                 sector_t sync_blocks = 0;
1381                 sector_t s = r1_bio->sector;
1382                 long sectors_to_go = r1_bio->sectors;
1383                 /* make sure these bits doesn't get cleared. */
1384                 do {
1385                         bitmap_end_sync(mddev->bitmap, s,
1386                                         &sync_blocks, 1);
1387                         s += sync_blocks;
1388                         sectors_to_go -= sync_blocks;
1389                 } while (sectors_to_go > 0);
1390                 set_bit(WriteErrorSeen,
1391                         &conf->mirrors[mirror].rdev->flags);
1392                 set_bit(R1BIO_WriteError, &r1_bio->state);
1393         } else if (is_badblock(conf->mirrors[mirror].rdev,
1394                                r1_bio->sector,
1395                                r1_bio->sectors,
1396                                &first_bad, &bad_sectors) &&
1397                    !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
1398                                 r1_bio->sector,
1399                                 r1_bio->sectors,
1400                                 &first_bad, &bad_sectors)
1401                 )
1402                 set_bit(R1BIO_MadeGood, &r1_bio->state);
1403
1404         update_head_pos(mirror, r1_bio);
1405
1406         if (atomic_dec_and_test(&r1_bio->remaining)) {
1407                 int s = r1_bio->sectors;
1408                 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1409                     test_bit(R1BIO_WriteError, &r1_bio->state))
1410                         reschedule_retry(r1_bio);
1411                 else {
1412                         put_buf(r1_bio);
1413                         md_done_sync(mddev, s, uptodate);
1414                 }
1415         }
1416 }
1417
1418 static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1419                             int sectors, struct page *page, int rw)
1420 {
1421         if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1422                 /* success */
1423                 return 1;
1424         if (rw == WRITE)
1425                 set_bit(WriteErrorSeen, &rdev->flags);
1426         /* need to record an error - either for the block or the device */
1427         if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1428                 md_error(rdev->mddev, rdev);
1429         return 0;
1430 }
1431
1432 static int fix_sync_read_error(r1bio_t *r1_bio)
1433 {
1434         /* Try some synchronous reads of other devices to get
1435          * good data, much like with normal read errors.  Only
1436          * read into the pages we already have so we don't
1437          * need to re-issue the read request.
1438          * We don't need to freeze the array, because being in an
1439          * active sync request, there is no normal IO, and
1440          * no overlapping syncs.
1441          * We don't need to check is_badblock() again as we
1442          * made sure that anything with a bad block in range
1443          * will have bi_end_io clear.
1444          */
1445         mddev_t *mddev = r1_bio->mddev;
1446         conf_t *conf = mddev->private;
1447         struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1448         sector_t sect = r1_bio->sector;
1449         int sectors = r1_bio->sectors;
1450         int idx = 0;
1451
1452         while(sectors) {
1453                 int s = sectors;
1454                 int d = r1_bio->read_disk;
1455                 int success = 0;
1456                 mdk_rdev_t *rdev;
1457                 int start;
1458
1459                 if (s > (PAGE_SIZE>>9))
1460                         s = PAGE_SIZE >> 9;
1461                 do {
1462                         if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1463                                 /* No rcu protection needed here devices
1464                                  * can only be removed when no resync is
1465                                  * active, and resync is currently active
1466                                  */
1467                                 rdev = conf->mirrors[d].rdev;
1468                                 if (sync_page_io(rdev, sect, s<<9,
1469                                                  bio->bi_io_vec[idx].bv_page,
1470                                                  READ, false)) {
1471                                         success = 1;
1472                                         break;
1473                                 }
1474                         }
1475                         d++;
1476                         if (d == conf->raid_disks)
1477                                 d = 0;
1478                 } while (!success && d != r1_bio->read_disk);
1479
1480                 if (!success) {
1481                         char b[BDEVNAME_SIZE];
1482                         int abort = 0;
1483                         /* Cannot read from anywhere, this block is lost.
1484                          * Record a bad block on each device.  If that doesn't
1485                          * work just disable and interrupt the recovery.
1486                          * Don't fail devices as that won't really help.
1487                          */
1488                         printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1489                                " for block %llu\n",
1490                                mdname(mddev),
1491                                bdevname(bio->bi_bdev, b),
1492                                (unsigned long long)r1_bio->sector);
1493                         for (d = 0; d < conf->raid_disks; d++) {
1494                                 rdev = conf->mirrors[d].rdev;
1495                                 if (!rdev || test_bit(Faulty, &rdev->flags))
1496                                         continue;
1497                                 if (!rdev_set_badblocks(rdev, sect, s, 0))
1498                                         abort = 1;
1499                         }
1500                         if (abort) {
1501                                 mddev->recovery_disabled = 1;
1502                                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1503                                 md_done_sync(mddev, r1_bio->sectors, 0);
1504                                 put_buf(r1_bio);
1505                                 return 0;
1506                         }
1507                         /* Try next page */
1508                         sectors -= s;
1509                         sect += s;
1510                         idx++;
1511                         continue;
1512                 }
1513
1514                 start = d;
1515                 /* write it back and re-read */
1516                 while (d != r1_bio->read_disk) {
1517                         if (d == 0)
1518                                 d = conf->raid_disks;
1519                         d--;
1520                         if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1521                                 continue;
1522                         rdev = conf->mirrors[d].rdev;
1523                         if (r1_sync_page_io(rdev, sect, s,
1524                                             bio->bi_io_vec[idx].bv_page,
1525                                             WRITE) == 0) {
1526                                 r1_bio->bios[d]->bi_end_io = NULL;
1527                                 rdev_dec_pending(rdev, mddev);
1528                         }
1529                 }
1530                 d = start;
1531                 while (d != r1_bio->read_disk) {
1532                         if (d == 0)
1533                                 d = conf->raid_disks;
1534                         d--;
1535                         if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1536                                 continue;
1537                         rdev = conf->mirrors[d].rdev;
1538                         if (r1_sync_page_io(rdev, sect, s,
1539                                             bio->bi_io_vec[idx].bv_page,
1540                                             READ) != 0)
1541                                 atomic_add(s, &rdev->corrected_errors);
1542                 }
1543                 sectors -= s;
1544                 sect += s;
1545                 idx ++;
1546         }
1547         set_bit(R1BIO_Uptodate, &r1_bio->state);
1548         set_bit(BIO_UPTODATE, &bio->bi_flags);
1549         return 1;
1550 }
1551
1552 static int process_checks(r1bio_t *r1_bio)
1553 {
1554         /* We have read all readable devices.  If we haven't
1555          * got the block, then there is no hope left.
1556          * If we have, then we want to do a comparison
1557          * and skip the write if everything is the same.
1558          * If any blocks failed to read, then we need to
1559          * attempt an over-write
1560          */
1561         mddev_t *mddev = r1_bio->mddev;
1562         conf_t *conf = mddev->private;
1563         int primary;
1564         int i;
1565
1566         for (primary = 0; primary < conf->raid_disks; primary++)
1567                 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1568                     test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1569                         r1_bio->bios[primary]->bi_end_io = NULL;
1570                         rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1571                         break;
1572                 }
1573         r1_bio->read_disk = primary;
1574         for (i = 0; i < conf->raid_disks; i++) {
1575                 int j;
1576                 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1577                 struct bio *pbio = r1_bio->bios[primary];
1578                 struct bio *sbio = r1_bio->bios[i];
1579                 int size;
1580
1581                 if (r1_bio->bios[i]->bi_end_io != end_sync_read)
1582                         continue;
1583
1584                 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1585                         for (j = vcnt; j-- ; ) {
1586                                 struct page *p, *s;
1587                                 p = pbio->bi_io_vec[j].bv_page;
1588                                 s = sbio->bi_io_vec[j].bv_page;
1589                                 if (memcmp(page_address(p),
1590                                            page_address(s),
1591                                            PAGE_SIZE))
1592                                         break;
1593                         }
1594                 } else
1595                         j = 0;
1596                 if (j >= 0)
1597                         mddev->resync_mismatches += r1_bio->sectors;
1598                 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1599                               && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1600                         /* No need to write to this device. */
1601                         sbio->bi_end_io = NULL;
1602                         rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1603                         continue;
1604                 }
1605                 /* fixup the bio for reuse */
1606                 sbio->bi_vcnt = vcnt;
1607                 sbio->bi_size = r1_bio->sectors << 9;
1608                 sbio->bi_idx = 0;
1609                 sbio->bi_phys_segments = 0;
1610                 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1611                 sbio->bi_flags |= 1 << BIO_UPTODATE;
1612                 sbio->bi_next = NULL;
1613                 sbio->bi_sector = r1_bio->sector +
1614                         conf->mirrors[i].rdev->data_offset;
1615                 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1616                 size = sbio->bi_size;
1617                 for (j = 0; j < vcnt ; j++) {
1618                         struct bio_vec *bi;
1619                         bi = &sbio->bi_io_vec[j];
1620                         bi->bv_offset = 0;
1621                         if (size > PAGE_SIZE)
1622                                 bi->bv_len = PAGE_SIZE;
1623                         else
1624                                 bi->bv_len = size;
1625                         size -= PAGE_SIZE;
1626                         memcpy(page_address(bi->bv_page),
1627                                page_address(pbio->bi_io_vec[j].bv_page),
1628                                PAGE_SIZE);
1629                 }
1630         }
1631         return 0;
1632 }
1633
1634 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1635 {
1636         conf_t *conf = mddev->private;
1637         int i;
1638         int disks = conf->raid_disks;
1639         struct bio *bio, *wbio;
1640
1641         bio = r1_bio->bios[r1_bio->read_disk];
1642
1643         if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
1644                 /* ouch - failed to read all of that. */
1645                 if (!fix_sync_read_error(r1_bio))
1646                         return;
1647
1648         if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1649                 if (process_checks(r1_bio) < 0)
1650                         return;
1651         /*
1652          * schedule writes
1653          */
1654         atomic_set(&r1_bio->remaining, 1);
1655         for (i = 0; i < disks ; i++) {
1656                 wbio = r1_bio->bios[i];
1657                 if (wbio->bi_end_io == NULL ||
1658                     (wbio->bi_end_io == end_sync_read &&
1659                      (i == r1_bio->read_disk ||
1660                       !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1661                         continue;
1662
1663                 wbio->bi_rw = WRITE;
1664                 wbio->bi_end_io = end_sync_write;
1665                 atomic_inc(&r1_bio->remaining);
1666                 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1667
1668                 generic_make_request(wbio);
1669         }
1670
1671         if (atomic_dec_and_test(&r1_bio->remaining)) {
1672                 /* if we're here, all write(s) have completed, so clean up */
1673                 md_done_sync(mddev, r1_bio->sectors, 1);
1674                 put_buf(r1_bio);
1675         }
1676 }
1677
/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working mirrors.
 *      2.      Updates the raid superblock when problems are encountered.
 *      3.      Performs writes following reads for array synchronising.
 */
1685
1686 static void fix_read_error(conf_t *conf, int read_disk,
1687                            sector_t sect, int sectors)
1688 {
1689         mddev_t *mddev = conf->mddev;
1690         while(sectors) {
1691                 int s = sectors;
1692                 int d = read_disk;
1693                 int success = 0;
1694                 int start;
1695                 mdk_rdev_t *rdev;
1696
1697                 if (s > (PAGE_SIZE>>9))
1698                         s = PAGE_SIZE >> 9;
1699
1700                 do {
1701                         /* Note: no rcu protection needed here
1702                          * as this is synchronous in the raid1d thread
1703                          * which is the thread that might remove
1704                          * a device.  If raid1d ever becomes multi-threaded....
1705                          */
1706                         sector_t first_bad;
1707                         int bad_sectors;
1708
1709                         rdev = conf->mirrors[d].rdev;
1710                         if (rdev &&
1711                             test_bit(In_sync, &rdev->flags) &&
1712                             is_badblock(rdev, sect, s,
1713                                         &first_bad, &bad_sectors) == 0 &&
1714                             sync_page_io(rdev, sect, s<<9,
1715                                          conf->tmppage, READ, false))
1716                                 success = 1;
1717                         else {
1718                                 d++;
1719                                 if (d == conf->raid_disks)
1720                                         d = 0;
1721                         }
1722                 } while (!success && d != read_disk);
1723
1724                 if (!success) {
1725                         /* Cannot read from anywhere - mark it bad */
1726                         mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
1727                         if (!rdev_set_badblocks(rdev, sect, s, 0))
1728                                 md_error(mddev, rdev);
1729                         break;
1730                 }
1731                 /* write it back and re-read */
1732                 start = d;
1733                 while (d != read_disk) {
1734                         if (d==0)
1735                                 d = conf->raid_disks;
1736                         d--;
1737                         rdev = conf->mirrors[d].rdev;
1738                         if (rdev &&
1739                             test_bit(In_sync, &rdev->flags))
1740                                 r1_sync_page_io(rdev, sect, s,
1741                                                 conf->tmppage, WRITE);
1742                 }
1743                 d = start;
1744                 while (d != read_disk) {
1745                         char b[BDEVNAME_SIZE];
1746                         if (d==0)
1747                                 d = conf->raid_disks;
1748                         d--;
1749                         rdev = conf->mirrors[d].rdev;
1750                         if (rdev &&
1751                             test_bit(In_sync, &rdev->flags)) {
1752                                 if (r1_sync_page_io(rdev, sect, s,
1753                                                     conf->tmppage, READ)) {
1754                                         atomic_add(s, &rdev->corrected_errors);
1755                                         printk(KERN_INFO
1756                                                "md/raid1:%s: read error corrected "
1757                                                "(%d sectors at %llu on %s)\n",
1758                                                mdname(mddev), s,
1759                                                (unsigned long long)(sect +
1760                                                    rdev->data_offset),
1761                                                bdevname(rdev->bdev, b));
1762                                 }
1763                         }
1764                 }
1765                 sectors -= s;
1766                 sect += s;
1767         }
1768 }
1769
1770 static void bi_complete(struct bio *bio, int error)
1771 {
1772         complete((struct completion *)bio->bi_private);
1773 }
1774
1775 static int submit_bio_wait(int rw, struct bio *bio)
1776 {
1777         struct completion event;
1778         rw |= REQ_SYNC;
1779
1780         init_completion(&event);
1781         bio->bi_private = &event;
1782         bio->bi_end_io = bi_complete;
1783         submit_bio(rw, bio);
1784         wait_for_completion(&event);
1785
1786         return test_bit(BIO_UPTODATE, &bio->bi_flags);
1787 }
1788
1789 static int narrow_write_error(r1bio_t *r1_bio, int i)
1790 {
1791         mddev_t *mddev = r1_bio->mddev;
1792         conf_t *conf = mddev->private;
1793         mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1794         int vcnt, idx;
1795         struct bio_vec *vec;
1796
1797         /* bio has the data to be written to device 'i' where
1798          * we just recently had a write error.
1799          * We repeatedly clone the bio and trim down to one block,
1800          * then try the write.  Where the write fails we record
1801          * a bad block.
1802          * It is conceivable that the bio doesn't exactly align with
1803          * blocks.  We must handle this somehow.
1804          *
1805          * We currently own a reference on the rdev.
1806          */
1807
1808         int block_sectors;
1809         sector_t sector;
1810         int sectors;
1811         int sect_to_write = r1_bio->sectors;
1812         int ok = 1;
1813
1814         if (rdev->badblocks.shift < 0)
1815                 return 0;
1816
1817         block_sectors = 1 << rdev->badblocks.shift;
1818         sector = r1_bio->sector;
1819         sectors = ((sector + block_sectors)
1820                    & ~(sector_t)(block_sectors - 1))
1821                 - sector;
1822
1823         if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
1824                 vcnt = r1_bio->behind_page_count;
1825                 vec = r1_bio->behind_bvecs;
1826                 idx = 0;
1827                 while (vec[idx].bv_page == NULL)
1828                         idx++;
1829         } else {
1830                 vcnt = r1_bio->master_bio->bi_vcnt;
1831                 vec = r1_bio->master_bio->bi_io_vec;
1832                 idx = r1_bio->master_bio->bi_idx;
1833         }
1834         while (sect_to_write) {
1835                 struct bio *wbio;
1836                 if (sectors > sect_to_write)
1837                         sectors = sect_to_write;
1838                 /* Write at 'sector' for 'sectors'*/
1839
1840                 wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
1841                 memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
1842                 wbio->bi_sector = r1_bio->sector;
1843                 wbio->bi_rw = WRITE;
1844                 wbio->bi_vcnt = vcnt;
1845                 wbio->bi_size = r1_bio->sectors << 9;
1846                 wbio->bi_idx = idx;
1847
1848                 md_trim_bio(wbio, sector - r1_bio->sector, sectors);
1849                 wbio->bi_sector += rdev->data_offset;
1850                 wbio->bi_bdev = rdev->bdev;
1851                 if (submit_bio_wait(WRITE, wbio) == 0)
1852                         /* failure! */
1853                         ok = rdev_set_badblocks(rdev, sector,
1854                                                 sectors, 0)
1855                                 && ok;
1856
1857                 bio_put(wbio);
1858                 sect_to_write -= sectors;
1859                 sector += sectors;
1860                 sectors = block_sectors;
1861         }
1862         return ok;
1863 }
1864
1865 static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
1866 {
1867         int m;
1868         int s = r1_bio->sectors;
1869         for (m = 0; m < conf->raid_disks ; m++) {
1870                 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1871                 struct bio *bio = r1_bio->bios[m];
1872                 if (bio->bi_end_io == NULL)
1873                         continue;
1874                 if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1875                     test_bit(R1BIO_MadeGood, &r1_bio->state)) {
1876                         rdev_clear_badblocks(rdev, r1_bio->sector, s);
1877                 }
1878                 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1879                     test_bit(R1BIO_WriteError, &r1_bio->state)) {
1880                         if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
1881                                 md_error(conf->mddev, rdev);
1882                 }
1883         }
1884         put_buf(r1_bio);
1885         md_done_sync(conf->mddev, s, 1);
1886 }
1887
1888 static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
1889 {
1890         int m;
1891         for (m = 0; m < conf->raid_disks ; m++)
1892                 if (r1_bio->bios[m] == IO_MADE_GOOD) {
1893                         mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1894                         rdev_clear_badblocks(rdev,
1895                                              r1_bio->sector,
1896                                              r1_bio->sectors);
1897                         rdev_dec_pending(rdev, conf->mddev);
1898                 } else if (r1_bio->bios[m] != NULL) {
1899                         /* This drive got a write error.  We need to
1900                          * narrow down and record precise write
1901                          * errors.
1902                          */
1903                         if (!narrow_write_error(r1_bio, m)) {
1904                                 md_error(conf->mddev,
1905                                          conf->mirrors[m].rdev);
1906                                 /* an I/O failed, we can't clear the bitmap */
1907                                 set_bit(R1BIO_Degraded, &r1_bio->state);
1908                         }
1909                         rdev_dec_pending(conf->mirrors[m].rdev,
1910                                          conf->mddev);
1911                 }
1912         if (test_bit(R1BIO_WriteError, &r1_bio->state))
1913                 close_write(r1_bio);
1914         raid_end_bio_io(r1_bio);
1915 }
1916
1917 static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
1918 {
1919         int disk;
1920         int max_sectors;
1921         mddev_t *mddev = conf->mddev;
1922         struct bio *bio;
1923         char b[BDEVNAME_SIZE];
1924         mdk_rdev_t *rdev;
1925
1926         clear_bit(R1BIO_ReadError, &r1_bio->state);
1927         /* we got a read error. Maybe the drive is bad.  Maybe just
1928          * the block and we can fix it.
1929          * We freeze all other IO, and try reading the block from
1930          * other devices.  When we find one, we re-write
1931          * and check it that fixes the read error.
1932          * This is all done synchronously while the array is
1933          * frozen
1934          */
1935         if (mddev->ro == 0) {
1936                 freeze_array(conf);
1937                 fix_read_error(conf, r1_bio->read_disk,
1938                                r1_bio->sector, r1_bio->sectors);
1939                 unfreeze_array(conf);
1940         } else
1941                 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1942
1943         bio = r1_bio->bios[r1_bio->read_disk];
1944         bdevname(bio->bi_bdev, b);
1945 read_more:
1946         disk = read_balance(conf, r1_bio, &max_sectors);
1947         if (disk == -1) {
1948                 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1949                        " read error for block %llu\n",
1950                        mdname(mddev), b, (unsigned long long)r1_bio->sector);
1951                 raid_end_bio_io(r1_bio);
1952         } else {
1953                 const unsigned long do_sync
1954                         = r1_bio->master_bio->bi_rw & REQ_SYNC;
1955                 if (bio) {
1956                         r1_bio->bios[r1_bio->read_disk] =
1957                                 mddev->ro ? IO_BLOCKED : NULL;
1958                         bio_put(bio);
1959                 }
1960                 r1_bio->read_disk = disk;
1961                 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
1962                 md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
1963                 r1_bio->bios[r1_bio->read_disk] = bio;
1964                 rdev = conf->mirrors[disk].rdev;
1965                 printk_ratelimited(KERN_ERR
1966                                    "md/raid1:%s: redirecting sector %llu"
1967                                    " to other mirror: %s\n",
1968                                    mdname(mddev),
1969                                    (unsigned long long)r1_bio->sector,
1970                                    bdevname(rdev->bdev, b));
1971                 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1972                 bio->bi_bdev = rdev->bdev;
1973                 bio->bi_end_io = raid1_end_read_request;
1974                 bio->bi_rw = READ | do_sync;
1975                 bio->bi_private = r1_bio;
1976                 if (max_sectors < r1_bio->sectors) {
1977                         /* Drat - have to split this up more */
1978                         struct bio *mbio = r1_bio->master_bio;
1979                         int sectors_handled = (r1_bio->sector + max_sectors
1980                                                - mbio->bi_sector);
1981                         r1_bio->sectors = max_sectors;
1982                         spin_lock_irq(&conf->device_lock);
1983                         if (mbio->bi_phys_segments == 0)
1984                                 mbio->bi_phys_segments = 2;
1985                         else
1986                                 mbio->bi_phys_segments++;
1987                         spin_unlock_irq(&conf->device_lock);
1988                         generic_make_request(bio);
1989                         bio = NULL;
1990
1991                         r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1992
1993                         r1_bio->master_bio = mbio;
1994                         r1_bio->sectors = (mbio->bi_size >> 9)
1995                                           - sectors_handled;
1996                         r1_bio->state = 0;
1997                         set_bit(R1BIO_ReadError, &r1_bio->state);
1998                         r1_bio->mddev = mddev;
1999                         r1_bio->sector = mbio->bi_sector + sectors_handled;
2000
2001                         goto read_more;
2002                 } else
2003                         generic_make_request(bio);
2004         }
2005 }
2006
2007 static void raid1d(mddev_t *mddev)
2008 {
2009         r1bio_t *r1_bio;
2010         unsigned long flags;
2011         conf_t *conf = mddev->private;
2012         struct list_head *head = &conf->retry_list;
2013         struct blk_plug plug;
2014
2015         md_check_recovery(mddev);
2016
2017         blk_start_plug(&plug);
2018         for (;;) {
2019
2020                 if (atomic_read(&mddev->plug_cnt) == 0)
2021                         flush_pending_writes(conf);
2022
2023                 spin_lock_irqsave(&conf->device_lock, flags);
2024                 if (list_empty(head)) {
2025                         spin_unlock_irqrestore(&conf->device_lock, flags);
2026                         break;
2027                 }
2028                 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
2029                 list_del(head->prev);
2030                 conf->nr_queued--;
2031                 spin_unlock_irqrestore(&conf->device_lock, flags);
2032
2033                 mddev = r1_bio->mddev;
2034                 conf = mddev->private;
2035                 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
2036                         if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
2037                             test_bit(R1BIO_WriteError, &r1_bio->state))
2038                                 handle_sync_write_finished(conf, r1_bio);
2039                         else
2040                                 sync_request_write(mddev, r1_bio);
2041                 } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
2042                            test_bit(R1BIO_WriteError, &r1_bio->state))
2043                         handle_write_finished(conf, r1_bio);
2044                 else if (test_bit(R1BIO_ReadError, &r1_bio->state))
2045                         handle_read_error(conf, r1_bio);
2046                 else
2047                         /* just a partial read to be scheduled from separate
2048                          * context
2049                          */
2050                         generic_make_request(r1_bio->bios[r1_bio->read_disk]);
2051
2052                 cond_resched();
2053                 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2054                         md_check_recovery(mddev);
2055         }
2056         blk_finish_plug(&plug);
2057 }
2058
2059
2060 static int init_resync(conf_t *conf)
2061 {
2062         int buffs;
2063
2064         buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2065         BUG_ON(conf->r1buf_pool);
2066         conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
2067                                           conf->poolinfo);
2068         if (!conf->r1buf_pool)
2069                 return -ENOMEM;
2070         conf->next_resync = 0;
2071         return 0;
2072 }
2073
2074 /*
2075  * perform a "sync" on one "block"
2076  *
2077  * We need to make sure that no normal I/O request - particularly write
2078  * requests - conflict with active sync requests.
2079  *
2080  * This is achieved by tracking pending requests and a 'barrier' concept
2081  * that can be installed to exclude normal IO requests.
2082  */
2083
2084 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
2085 {
2086         conf_t *conf = mddev->private;
2087         r1bio_t *r1_bio;
2088         struct bio *bio;
2089         sector_t max_sector, nr_sectors;
2090         int disk = -1;
2091         int i;
2092         int wonly = -1;
2093         int write_targets = 0, read_targets = 0;
2094         sector_t sync_blocks;
2095         int still_degraded = 0;
2096         int good_sectors = RESYNC_SECTORS;
2097         int min_bad = 0; /* number of sectors that are bad in all devices */
2098
2099         if (!conf->r1buf_pool)
2100                 if (init_resync(conf))
2101                         return 0;
2102
2103         max_sector = mddev->dev_sectors;
2104         if (sector_nr >= max_sector) {
2105                 /* If we aborted, we need to abort the
2106                  * sync on the 'current' bitmap chunk (there will
2107                  * only be one in raid1 resync.
2108                  * We can find the current addess in mddev->curr_resync
2109                  */
2110                 if (mddev->curr_resync < max_sector) /* aborted */
2111                         bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2112                                                 &sync_blocks, 1);
2113                 else /* completed sync */
2114                         conf->fullsync = 0;
2115
2116                 bitmap_close_sync(mddev->bitmap);
2117                 close_sync(conf);
2118                 return 0;
2119         }
2120
2121         if (mddev->bitmap == NULL &&
2122             mddev->recovery_cp == MaxSector &&
2123             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2124             conf->fullsync == 0) {
2125                 *skipped = 1;
2126                 return max_sector - sector_nr;
2127         }
2128         /* before building a request, check if we can skip these blocks..
2129          * This call the bitmap_start_sync doesn't actually record anything
2130          */
2131         if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
2132             !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2133                 /* We can skip this block, and probably several more */
2134                 *skipped = 1;
2135                 return sync_blocks;
2136         }
2137         /*
2138          * If there is non-resync activity waiting for a turn,
2139          * and resync is going fast enough,
2140          * then let it though before starting on this new sync request.
2141          */
2142         if (!go_faster && conf->nr_waiting)
2143                 msleep_interruptible(1000);
2144
2145         bitmap_cond_end_sync(mddev->bitmap, sector_nr);
2146         r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
2147         raise_barrier(conf);
2148
2149         conf->next_resync = sector_nr;
2150
2151         rcu_read_lock();
2152         /*
2153          * If we get a correctably read error during resync or recovery,
2154          * we might want to read from a different device.  So we
2155          * flag all drives that could conceivably be read from for READ,
2156          * and any others (which will be non-In_sync devices) for WRITE.
2157          * If a read fails, we try reading from something else for which READ
2158          * is OK.
2159          */
2160
2161         r1_bio->mddev = mddev;
2162         r1_bio->sector = sector_nr;
2163         r1_bio->state = 0;
2164         set_bit(R1BIO_IsSync, &r1_bio->state);
2165
2166         for (i=0; i < conf->raid_disks; i++) {
2167                 mdk_rdev_t *rdev;
2168                 bio = r1_bio->bios[i];
2169
2170                 /* take from bio_init */
2171                 bio->bi_next = NULL;
2172                 bio->bi_flags &= ~(BIO_POOL_MASK-1);
2173                 bio->bi_flags |= 1 << BIO_UPTODATE;
2174                 bio->bi_comp_cpu = -1;
2175                 bio->bi_rw = READ;
2176                 bio->bi_vcnt = 0;
2177                 bio->bi_idx = 0;
2178                 bio->bi_phys_segments = 0;
2179                 bio->bi_size = 0;
2180                 bio->bi_end_io = NULL;
2181                 bio->bi_private = NULL;
2182
2183                 rdev = rcu_dereference(conf->mirrors[i].rdev);
2184                 if (rdev == NULL ||
2185                     test_bit(Faulty, &rdev->flags)) {
2186                         still_degraded = 1;
2187                 } else if (!test_bit(In_sync, &rdev->flags)) {
2188                         bio->bi_rw = WRITE;
2189                         bio->bi_end_io = end_sync_write;
2190                         write_targets ++;
2191                 } else {
2192                         /* may need to read from here */
2193                         sector_t first_bad = MaxSector;
2194                         int bad_sectors;
2195
2196                         if (is_badblock(rdev, sector_nr, good_sectors,
2197                                         &first_bad, &bad_sectors)) {
2198                                 if (first_bad > sector_nr)
2199                                         good_sectors = first_bad - sector_nr;
2200                                 else {
2201                                         bad_sectors -= (sector_nr - first_bad);
2202                                         if (min_bad == 0 ||
2203                                             min_bad > bad_sectors)
2204                                                 min_bad = bad_sectors;
2205                                 }
2206                         }
2207                         if (sector_nr < first_bad) {
2208                                 if (test_bit(WriteMostly, &rdev->flags)) {
2209                                         if (wonly < 0)
2210                                                 wonly = i;
2211                                 } else {
2212                                         if (disk < 0)
2213                                                 disk = i;
2214                                 }
2215                                 bio->bi_rw = READ;
2216                                 bio->bi_end_io = end_sync_read;
2217                                 read_targets++;
2218                         }
2219                 }
2220                 if (bio->bi_end_io) {
2221                         atomic_inc(&rdev->nr_pending);
2222                         bio->bi_sector = sector_nr + rdev->data_offset;
2223                         bio->bi_bdev = rdev->bdev;
2224                         bio->bi_private = r1_bio;
2225                 }
2226         }
2227         rcu_read_unlock();
2228         if (disk < 0)
2229                 disk = wonly;
2230         r1_bio->read_disk = disk;
2231
2232         if (read_targets == 0 && min_bad > 0) {
2233                 /* These sectors are bad on all InSync devices, so we
2234                  * need to mark them bad on all write targets
2235                  */
2236                 int ok = 1;
2237                 for (i = 0 ; i < conf->raid_disks ; i++)
2238                         if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2239                                 mdk_rdev_t *rdev =
2240                                         rcu_dereference(conf->mirrors[i].rdev);
2241                                 ok = rdev_set_badblocks(rdev, sector_nr,
2242                                                         min_bad, 0
2243                                         ) && ok;
2244                         }
2245                 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2246                 *skipped = 1;
2247                 put_buf(r1_bio);
2248
2249                 if (!ok) {
2250                         /* Cannot record the badblocks, so need to
2251                          * abort the resync.
2252                          * If there are multiple read targets, could just
2253                          * fail the really bad ones ???
2254                          */
2255                         conf->recovery_disabled = mddev->recovery_disabled;
2256                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2257                         return 0;
2258                 } else
2259                         return min_bad;
2260
2261         }
2262         if (min_bad > 0 && min_bad < good_sectors) {
2263                 /* only resync enough to reach the next bad->good
2264                  * transition */
2265                 good_sectors = min_bad;
2266         }
2267
2268         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
2269                 /* extra read targets are also write targets */
2270                 write_targets += read_targets-1;
2271
2272         if (write_targets == 0 || read_targets == 0) {
2273                 /* There is nowhere to write, so all non-sync
2274                  * drives must be failed - so we are finished
2275                  */
2276                 sector_t rv = max_sector - sector_nr;
2277                 *skipped = 1;
2278                 put_buf(r1_bio);
2279                 return rv;
2280         }
2281
2282         if (max_sector > mddev->resync_max)
2283                 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2284         if (max_sector > sector_nr + good_sectors)
2285                 max_sector = sector_nr + good_sectors;
2286         nr_sectors = 0;
2287         sync_blocks = 0;
2288         do {
2289                 struct page *page;
2290                 int len = PAGE_SIZE;
2291                 if (sector_nr + (len>>9) > max_sector)
2292                         len = (max_sector - sector_nr) << 9;
2293                 if (len == 0)
2294                         break;
2295                 if (sync_blocks == 0) {
2296                         if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2297                                                &sync_blocks, still_degraded) &&
2298                             !conf->fullsync &&
2299                             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2300                                 break;
2301                         BUG_ON(sync_blocks < (PAGE_SIZE>>9));
2302                         if ((len >> 9) > sync_blocks)
2303                                 len = sync_blocks<<9;
2304                 }
2305
2306                 for (i=0 ; i < conf->raid_disks; i++) {
2307                         bio = r1_bio->bios[i];
2308                         if (bio->bi_end_io) {
2309                                 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
2310                                 if (bio_add_page(bio, page, len, 0) == 0) {
2311                                         /* stop here */
2312                                         bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
2313                                         while (i > 0) {
2314                                                 i--;
2315                                                 bio = r1_bio->bios[i];
2316                                                 if (bio->bi_end_io==NULL)
2317                                                         continue;
2318                                                 /* remove last page from this bio */
2319                                                 bio->bi_vcnt--;
2320                                                 bio->bi_size -= len;
2321                                                 bio->bi_flags &= ~(1<< BIO_SEG_VALID);
2322                                         }
2323                                         goto bio_full;
2324                                 }
2325                         }
2326                 }
2327                 nr_sectors += len>>9;
2328                 sector_nr += len>>9;
2329                 sync_blocks -= (len>>9);
2330         } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
2331  bio_full:
2332         r1_bio->sectors = nr_sectors;
2333
2334         /* For a user-requested sync, we read all readable devices and do a
2335          * compare
2336          */
2337         if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2338                 atomic_set(&r1_bio->remaining, read_targets);
2339                 for (i=0; i<conf->raid_disks; i++) {
2340                         bio = r1_bio->bios[i];
2341                         if (bio->bi_end_io == end_sync_read) {
2342                                 md_sync_acct(bio->bi_bdev, nr_sectors);
2343                                 generic_make_request(bio);
2344                         }
2345                 }
2346         } else {
2347                 atomic_set(&r1_bio->remaining, 1);
2348                 bio = r1_bio->bios[r1_bio->read_disk];
2349                 md_sync_acct(bio->bi_bdev, nr_sectors);
2350                 generic_make_request(bio);
2351
2352         }
2353         return nr_sectors;
2354 }
2355
2356 static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2357 {
2358         if (sectors)
2359                 return sectors;
2360
2361         return mddev->dev_sectors;
2362 }
2363
2364 static conf_t *setup_conf(mddev_t *mddev)
2365 {
2366         conf_t *conf;
2367         int i;
2368         mirror_info_t *disk;
2369         mdk_rdev_t *rdev;
2370         int err = -ENOMEM;
2371
2372         conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2373         if (!conf)
2374                 goto abort;
2375
2376         conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2377                                  GFP_KERNEL);
2378         if (!conf->mirrors)
2379                 goto abort;
2380
2381         conf->tmppage = alloc_page(GFP_KERNEL);
2382         if (!conf->tmppage)
2383                 goto abort;
2384
2385         conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
2386         if (!conf->poolinfo)
2387                 goto abort;
2388         conf->poolinfo->raid_disks = mddev->raid_disks;
2389         conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2390                                           r1bio_pool_free,
2391                                           conf->poolinfo);
2392         if (!conf->r1bio_pool)
2393                 goto abort;
2394
2395         conf->poolinfo->mddev = mddev;
2396
2397         spin_lock_init(&conf->device_lock);
2398         list_for_each_entry(rdev, &mddev->disks, same_set) {
2399                 int disk_idx = rdev->raid_disk;
2400                 if (disk_idx >= mddev->raid_disks
2401                     || disk_idx < 0)
2402                         continue;
2403                 disk = conf->mirrors + disk_idx;
2404
2405                 disk->rdev = rdev;
2406
2407                 disk->head_position = 0;
2408         }
2409         conf->raid_disks = mddev->raid_disks;
2410         conf->mddev = mddev;
2411         INIT_LIST_HEAD(&conf->retry_list);
2412
2413         spin_lock_init(&conf->resync_lock);
2414         init_waitqueue_head(&conf->wait_barrier);
2415
2416         bio_list_init(&conf->pending_bio_list);
2417
2418         conf->last_used = -1;
2419         for (i = 0; i < conf->raid_disks; i++) {
2420
2421                 disk = conf->mirrors + i;
2422
2423                 if (!disk->rdev ||
2424                     !test_bit(In_sync, &disk->rdev->flags)) {
2425                         disk->head_position = 0;
2426                         if (disk->rdev)
2427                                 conf->fullsync = 1;
2428                 } else if (conf->last_used < 0)
2429                         /*
2430                          * The first working device is used as a
2431                          * starting point to read balancing.
2432                          */
2433                         conf->last_used = i;
2434         }
2435
2436         err = -EIO;
2437         if (conf->last_used < 0) {
2438                 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2439                        mdname(mddev));
2440                 goto abort;
2441         }
2442         err = -ENOMEM;
2443         conf->thread = md_register_thread(raid1d, mddev, NULL);
2444         if (!conf->thread) {
2445                 printk(KERN_ERR
2446                        "md/raid1:%s: couldn't allocate thread\n",
2447                        mdname(mddev));
2448                 goto abort;
2449         }
2450
2451         return conf;
2452
2453  abort:
2454         if (conf) {
2455                 if (conf->r1bio_pool)
2456                         mempool_destroy(conf->r1bio_pool);
2457                 kfree(conf->mirrors);
2458                 safe_put_page(conf->tmppage);
2459                 kfree(conf->poolinfo);
2460                 kfree(conf);
2461         }
2462         return ERR_PTR(err);
2463 }
2464
2465 static int run(mddev_t *mddev)
2466 {
2467         conf_t *conf;
2468         int i;
2469         mdk_rdev_t *rdev;
2470
2471         if (mddev->level != 1) {
2472                 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
2473                        mdname(mddev), mddev->level);
2474                 return -EIO;
2475         }
2476         if (mddev->reshape_position != MaxSector) {
2477                 printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
2478                        mdname(mddev));
2479                 return -EIO;
2480         }
2481         /*
2482          * copy the already verified devices into our private RAID1
2483          * bookkeeping area. [whatever we allocate in run(),
2484          * should be freed in stop()]
2485          */
2486         if (mddev->private == NULL)
2487                 conf = setup_conf(mddev);
2488         else
2489                 conf = mddev->private;
2490
2491         if (IS_ERR(conf))
2492                 return PTR_ERR(conf);
2493
2494         list_for_each_entry(rdev, &mddev->disks, same_set) {
2495                 if (!mddev->gendisk)
2496                         continue;
2497                 disk_stack_limits(mddev->gendisk, rdev->bdev,
2498                                   rdev->data_offset << 9);
2499                 /* as we don't honour merge_bvec_fn, we must never risk
2500                  * violating it, so limit ->max_segments to 1 lying within
2501                  * a single page, as a one page request is never in violation.
2502                  */
2503                 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2504                         blk_queue_max_segments(mddev->queue, 1);
2505                         blk_queue_segment_boundary(mddev->queue,
2506                                                    PAGE_CACHE_SIZE - 1);
2507                 }
2508         }
2509
2510         mddev->degraded = 0;
2511         for (i=0; i < conf->raid_disks; i++)
2512                 if (conf->mirrors[i].rdev == NULL ||
2513                     !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2514                     test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2515                         mddev->degraded++;
2516
2517         if (conf->raid_disks - mddev->degraded == 1)
2518                 mddev->recovery_cp = MaxSector;
2519
2520         if (mddev->recovery_cp != MaxSector)
2521                 printk(KERN_NOTICE "md/raid1:%s: not clean"
2522                        " -- starting background reconstruction\n",
2523                        mdname(mddev));
2524         printk(KERN_INFO
2525                 "md/raid1:%s: active with %d out of %d mirrors\n",
2526                 mdname(mddev), mddev->raid_disks - mddev->degraded,
2527                 mddev->raid_disks);
2528
2529         /*
2530          * Ok, everything is just fine now
2531          */
2532         mddev->thread = conf->thread;
2533         conf->thread = NULL;
2534         mddev->private = conf;
2535
2536         md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2537
2538         if (mddev->queue) {
2539                 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2540                 mddev->queue->backing_dev_info.congested_data = mddev;
2541         }
2542         return md_integrity_register(mddev);
2543 }
2544
2545 static int stop(mddev_t *mddev)
2546 {
2547         conf_t *conf = mddev->private;
2548         struct bitmap *bitmap = mddev->bitmap;
2549
2550         /* wait for behind writes to complete */
2551         if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2552                 printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
2553                        mdname(mddev));
2554                 /* need to kick something here to make sure I/O goes? */
2555                 wait_event(bitmap->behind_wait,
2556                            atomic_read(&bitmap->behind_writes) == 0);
2557         }
2558
2559         raise_barrier(conf);
2560         lower_barrier(conf);
2561
2562         md_unregister_thread(mddev->thread);
2563         mddev->thread = NULL;
2564         if (conf->r1bio_pool)
2565                 mempool_destroy(conf->r1bio_pool);
2566         kfree(conf->mirrors);
2567         kfree(conf->poolinfo);
2568         kfree(conf);
2569         mddev->private = NULL;
2570         return 0;
2571 }
2572
2573 static int raid1_resize(mddev_t *mddev, sector_t sectors)
2574 {
2575         /* no resync is happening, and there is enough space
2576          * on all devices, so we can resize.
2577          * We need to make sure resync covers any new space.
2578          * If the array is shrinking we should possibly wait until
2579          * any io in the removed space completes, but it hardly seems
2580          * worth it.
2581          */
2582         md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2583         if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2584                 return -EINVAL;
2585         set_capacity(mddev->gendisk, mddev->array_sectors);
2586         revalidate_disk(mddev->gendisk);
2587         if (sectors > mddev->dev_sectors &&
2588             mddev->recovery_cp > mddev->dev_sectors) {
2589                 mddev->recovery_cp = mddev->dev_sectors;
2590                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2591         }
2592         mddev->dev_sectors = sectors;
2593         mddev->resync_max_sectors = sectors;
2594         return 0;
2595 }
2596
2597 static int raid1_reshape(mddev_t *mddev)
2598 {
2599         /* We need to:
2600          * 1/ resize the r1bio_pool
2601          * 2/ resize conf->mirrors
2602          *
2603          * We allocate a new r1bio_pool if we can.
2604          * Then raise a device barrier and wait until all IO stops.
2605          * Then resize conf->mirrors and swap in the new r1bio pool.
2606          *
2607          * At the same time, we "pack" the devices so that all the missing
2608          * devices have the higher raid_disk numbers.
2609          */
2610         mempool_t *newpool, *oldpool;
2611         struct pool_info *newpoolinfo;
2612         mirror_info_t *newmirrors;
2613         conf_t *conf = mddev->private;
2614         int cnt, raid_disks;
2615         unsigned long flags;
2616         int d, d2, err;
2617
2618         /* Cannot change chunk_size, layout, or level */
2619         if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
2620             mddev->layout != mddev->new_layout ||
2621             mddev->level != mddev->new_level) {
2622                 mddev->new_chunk_sectors = mddev->chunk_sectors;
2623                 mddev->new_layout = mddev->layout;
2624                 mddev->new_level = mddev->level;
2625                 return -EINVAL;
2626         }
2627
2628         err = md_allow_write(mddev);
2629         if (err)
2630                 return err;
2631
2632         raid_disks = mddev->raid_disks + mddev->delta_disks;
2633
2634         if (raid_disks < conf->raid_disks) {
2635                 cnt=0;
2636                 for (d= 0; d < conf->raid_disks; d++)
2637                         if (conf->mirrors[d].rdev)
2638                                 cnt++;
2639                 if (cnt > raid_disks)
2640                         return -EBUSY;
2641         }
2642
2643         newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
2644         if (!newpoolinfo)
2645                 return -ENOMEM;
2646         newpoolinfo->mddev = mddev;
2647         newpoolinfo->raid_disks = raid_disks;
2648
2649         newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2650                                  r1bio_pool_free, newpoolinfo);
2651         if (!newpool) {
2652                 kfree(newpoolinfo);
2653                 return -ENOMEM;
2654         }
2655         newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
2656         if (!newmirrors) {
2657                 kfree(newpoolinfo);
2658                 mempool_destroy(newpool);
2659                 return -ENOMEM;
2660         }
2661
2662         raise_barrier(conf);
2663
2664         /* ok, everything is stopped */
2665         oldpool = conf->r1bio_pool;
2666         conf->r1bio_pool = newpool;
2667
2668         for (d = d2 = 0; d < conf->raid_disks; d++) {
2669                 mdk_rdev_t *rdev = conf->mirrors[d].rdev;
2670                 if (rdev && rdev->raid_disk != d2) {
2671                         sysfs_unlink_rdev(mddev, rdev);
2672                         rdev->raid_disk = d2;
2673                         sysfs_unlink_rdev(mddev, rdev);
2674                         if (sysfs_link_rdev(mddev, rdev))
2675                                 printk(KERN_WARNING
2676                                        "md/raid1:%s: cannot register rd%d\n",
2677                                        mdname(mddev), rdev->raid_disk);
2678                 }
2679                 if (rdev)
2680                         newmirrors[d2++].rdev = rdev;
2681         }
2682         kfree(conf->mirrors);
2683         conf->mirrors = newmirrors;
2684         kfree(conf->poolinfo);
2685         conf->poolinfo = newpoolinfo;
2686
2687         spin_lock_irqsave(&conf->device_lock, flags);
2688         mddev->degraded += (raid_disks - conf->raid_disks);
2689         spin_unlock_irqrestore(&conf->device_lock, flags);
2690         conf->raid_disks = mddev->raid_disks = raid_disks;
2691         mddev->delta_disks = 0;
2692
2693         conf->last_used = 0; /* just make sure it is in-range */
2694         lower_barrier(conf);
2695
2696         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2697         md_wakeup_thread(mddev->thread);
2698
2699         mempool_destroy(oldpool);
2700         return 0;
2701 }
2702
2703 static void raid1_quiesce(mddev_t *mddev, int state)
2704 {
2705         conf_t *conf = mddev->private;
2706
2707         switch(state) {
2708         case 2: /* wake for suspend */
2709                 wake_up(&conf->wait_barrier);
2710                 break;
2711         case 1:
2712                 raise_barrier(conf);
2713                 break;
2714         case 0:
2715                 lower_barrier(conf);
2716                 break;
2717         }
2718 }
2719
2720 static void *raid1_takeover(mddev_t *mddev)
2721 {
2722         /* raid1 can take over:
2723          *  raid5 with 2 devices, any layout or chunk size
2724          */
2725         if (mddev->level == 5 && mddev->raid_disks == 2) {
2726                 conf_t *conf;
2727                 mddev->new_level = 1;
2728                 mddev->new_layout = 0;
2729                 mddev->new_chunk_sectors = 0;
2730                 conf = setup_conf(mddev);
2731                 if (!IS_ERR(conf))
2732                         conf->barrier = 1;
2733                 return conf;
2734         }
2735         return ERR_PTR(-EINVAL);
2736 }
2737
2738 static struct mdk_personality raid1_personality =
2739 {
2740         .name           = "raid1",
2741         .level          = 1,
2742         .owner          = THIS_MODULE,
2743         .make_request   = make_request,
2744         .run            = run,
2745         .stop           = stop,
2746         .status         = status,
2747         .error_handler  = error,
2748         .hot_add_disk   = raid1_add_disk,
2749         .hot_remove_disk= raid1_remove_disk,
2750         .spare_active   = raid1_spare_active,
2751         .sync_request   = sync_request,
2752         .resize         = raid1_resize,
2753         .size           = raid1_size,
2754         .check_reshape  = raid1_reshape,
2755         .quiesce        = raid1_quiesce,
2756         .takeover       = raid1_takeover,
2757 };
2758
2759 static int __init raid_init(void)
2760 {
2761         return register_md_personality(&raid1_personality);
2762 }
2763
2764 static void raid_exit(void)
2765 {
2766         unregister_md_personality(&raid1_personality);
2767 }
2768
2769 module_init(raid_init);
2770 module_exit(raid_exit);
2771 MODULE_LICENSE("GPL");
2772 MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
2773 MODULE_ALIAS("md-personality-3"); /* RAID1 */
2774 MODULE_ALIAS("md-raid1");
2775 MODULE_ALIAS("md-level-1");