drivers/md/raid1.c

   1 /*
   2  * raid1.c : Multiple Devices driver for Linux
   3  *
   4  * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
   5  *
   6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   7  *
   8  * RAID-1 management functions.
   9  *
  10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
  11  *
  12  * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
  13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
  14  *
  15  * This program is free software; you can redistribute it and/or modify
  16  * it under the terms of the GNU General Public License as published by
  17  * the Free Software Foundation; either version 2, or (at your option)
  18  * any later version.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * (for example /usr/src/linux/COPYING); if not, write to the Free
  22  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25 #include <linux/module.h>
  26 #include <linux/malloc.h>
  27 #include <linux/raid/raid1.h>
  28 #include <asm/atomic.h>
  29
  30 #define MAJOR_NR MD_MAJOR
  31 #define MD_DRIVER
  32 #define MD_PERSONALITY
  33
  34 #define MAX_WORK_PER_DISK 128
  35
  36 /*
  37  * The following can be used to debug the driver
  38  */
  39 #define RAID1_DEBUG     0
  40
  41 #if RAID1_DEBUG
  42 #define PRINTK(x...)   printk(x)
  43 #define inline
  44 #define __inline__
  45 #else
  46 #define PRINTK(x...)  do { } while (0)
  47 #endif
  48
  49
  50 static mdk_personality_t raid1_personality;
  51 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
  52 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
  53
  54 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
  55 {
  56         /* return a linked list of "cnt" struct buffer_heads.
  57          * don't take any off the free list unless we know we can
  58          * get all we need, otherwise we could deadlock
  59          */
  60         struct buffer_head *bh=NULL;
  61
  62         while(cnt) {
  63                 struct buffer_head *t;
  64                 md_spin_lock_irq(&conf->device_lock);
  65                 if (conf->freebh_cnt >= cnt)
  66                         while (cnt) {
  67                                 t = conf->freebh;
  68                                 conf->freebh = t->b_next;
  69                                 t->b_next = bh;
  70                                 bh = t;
  71                                 t->b_state = 0;
  72                                 conf->freebh_cnt--;
  73                                 cnt--;
  74                         }
  75                 md_spin_unlock_irq(&conf->device_lock);
  76                 if (cnt == 0)
  77                         break;
  78                 t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_BUFFER);
  79                 if (t) {
  80                         memset(t, 0, sizeof(*t));
  81                         t->b_next = bh;
  82                         bh = t;
  83                         cnt--;
  84                 } else {
  85                         PRINTK("waiting for %d bh\n", cnt);
  86                         wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
  87                 }
  88         }
  89         return bh;
  90 }
  91
  92 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
  93 {
  94         unsigned long flags;
  95         spin_lock_irqsave(&conf->device_lock, flags);
  96         while (bh) {
  97                 struct buffer_head *t = bh;
  98                 bh=bh->b_next;
  99                 if (t->b_pprev == NULL)
 100                         kfree(t);
 101                 else {
 102                         t->b_next= conf->freebh;
 103                         conf->freebh = t;
 104                         conf->freebh_cnt++;
 105                 }
 106         }
 107         spin_unlock_irqrestore(&conf->device_lock, flags);
 108         wake_up(&conf->wait_buffer);
 109 }
 110
 111 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
 112 {
 113         /* allocate cnt buffer_heads, possibly less if kalloc fails */
 114         int i = 0;
 115
 116         while (i < cnt) {
 117                 struct buffer_head *bh;
 118                 bh = kmalloc(sizeof(*bh), GFP_KERNEL);
 119                 if (!bh) break;
 120                 memset(bh, 0, sizeof(*bh));
 121
 122                 md_spin_lock_irq(&conf->device_lock);
 123                 bh->b_pprev = &conf->freebh;
 124                 bh->b_next = conf->freebh;
 125                 conf->freebh = bh;
 126                 conf->freebh_cnt++;
 127                 md_spin_unlock_irq(&conf->device_lock);
 128
 129                 i++;
 130         }
 131         return i;
 132 }
 133
 134 static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
 135 {
 136         /* discard cnt buffer_heads, if we can find them */
 137         int i = 0;
 138
 139         md_spin_lock_irq(&conf->device_lock);
 140         while ((i < cnt) && conf->freebh) {
 141                 struct buffer_head *bh = conf->freebh;
 142                 conf->freebh = bh->b_next;
 143                 kfree(bh);
 144                 i++;
 145                 conf->freebh_cnt--;
 146         }
 147         md_spin_unlock_irq(&conf->device_lock);
 148         return i;
 149 }
 150
 151
 152 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
 153 {
 154         struct raid1_bh *r1_bh = NULL;
 155
 156         do {
 157                 md_spin_lock_irq(&conf->device_lock);
 158                 if (conf->freer1) {
 159                         r1_bh = conf->freer1;
 160                         conf->freer1 = r1_bh->next_r1;
 161                         r1_bh->next_r1 = NULL;
 162                         r1_bh->state = 0;
 163                         r1_bh->bh_req.b_state = 0;
 164                 }
 165                 md_spin_unlock_irq(&conf->device_lock);
 166                 if (r1_bh)
 167                         return r1_bh;
 168                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh),
 169                                         GFP_BUFFER);
 170                 if (r1_bh) {
 171                         memset(r1_bh, 0, sizeof(*r1_bh));
 172                         return r1_bh;
 173                 }
 174                 wait_event(conf->wait_buffer, conf->freer1);
 175         } while (1);
 176 }
 177
 178 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
 179 {
 180         struct buffer_head *bh = r1_bh->mirror_bh_list;
 181         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
 182
 183         r1_bh->mirror_bh_list = NULL;
 184
 185         if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
 186                 unsigned long flags;
 187                 spin_lock_irqsave(&conf->device_lock, flags);
 188                 r1_bh->next_r1 = conf->freer1;
 189                 conf->freer1 = r1_bh;
 190                 spin_unlock_irqrestore(&conf->device_lock, flags);
 191         } else {
 192                 kfree(r1_bh);
 193         }
 194         raid1_free_bh(conf, bh);
 195 }
 196
 197 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
 198 {
 199         int i = 0;
 200
 201         while (i < cnt) {
 202                 struct raid1_bh *r1_bh;
 203                 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
 204                 if (!r1_bh)
 205                         break;
 206                 memset(r1_bh, 0, sizeof(*r1_bh));
 207
 208                 md_spin_lock_irq(&conf->device_lock);
 209                 set_bit(R1BH_PreAlloc, &r1_bh->state);
 210                 r1_bh->next_r1 = conf->freer1;
 211                 conf->freer1 = r1_bh;
 212                 md_spin_unlock_irq(&conf->device_lock);
 213
 214                 i++;
 215         }
 216         return i;
 217 }
 218
 219 static void raid1_shrink_r1bh(raid1_conf_t *conf)
 220 {
 221         md_spin_lock_irq(&conf->device_lock);
 222         while (conf->freer1) {
 223                 struct raid1_bh *r1_bh = conf->freer1;
 224                 conf->freer1 = r1_bh->next_r1;
 225                 kfree(r1_bh);
 226         }
 227         md_spin_unlock_irq(&conf->device_lock);
 228 }
 229
 230
 231
 232 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
 233 {
 234         unsigned long flags;
 235         struct buffer_head *bh = r1_bh->mirror_bh_list;
 236         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
 237         r1_bh->mirror_bh_list = NULL;
 238
 239         spin_lock_irqsave(&conf->device_lock, flags);
 240         r1_bh->next_r1 = conf->freebuf;
 241         conf->freebuf = r1_bh;
 242         spin_unlock_irqrestore(&conf->device_lock, flags);
 243         raid1_free_bh(conf, bh);
 244 }
 245
 246 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
 247 {
 248         struct raid1_bh *r1_bh;
 249
 250         md_spin_lock_irq(&conf->device_lock);
 251         wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
 252         r1_bh = conf->freebuf;
 253         conf->freebuf = r1_bh->next_r1;
 254         r1_bh->next_r1= NULL;
 255         md_spin_unlock_irq(&conf->device_lock);
 256
 257         return r1_bh;
 258 }
 259
 260 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
 261 {
 262         int i = 0;
 263
 264         md_spin_lock_irq(&conf->device_lock);
 265         while (i < cnt) {
 266                 struct raid1_bh *r1_bh;
 267                 struct page *page;
 268
 269                 page = alloc_page(GFP_KERNEL);
 270                 if (!page)
 271                         break;
 272
 273                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
 274                 if (!r1_bh) {
 275                         __free_page(page);
 276                         break;
 277                 }
 278                 memset(r1_bh, 0, sizeof(*r1_bh));
 279                 r1_bh->bh_req.b_page = page;
 280                 r1_bh->bh_req.b_data = page_address(page);
 281                 r1_bh->next_r1 = conf->freebuf;
 282                 conf->freebuf = r1_bh;
 283                 i++;
 284         }
 285         md_spin_unlock_irq(&conf->device_lock);
 286         return i;
 287 }
 288
 289 static void raid1_shrink_buffers (raid1_conf_t *conf)
 290 {
 291         md_spin_lock_irq(&conf->device_lock);
 292         while (conf->freebuf) {
 293                 struct raid1_bh *r1_bh = conf->freebuf;
 294                 conf->freebuf = r1_bh->next_r1;
 295                 __free_page(r1_bh->bh_req.b_page);
 296                 kfree(r1_bh);
 297         }
 298         md_spin_unlock_irq(&conf->device_lock);
 299 }
 300
 301 static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size)
 302 {
 303         raid1_conf_t *conf = mddev_to_conf(mddev);
 304         int i, disks = MD_SB_DISKS;
 305
 306         /*
 307          * Later we do read balancing on the read side
 308          * now we use the first available disk.
 309          */
 310
 311         for (i = 0; i < disks; i++) {
 312                 if (conf->mirrors[i].operational) {
 313                         *rdev = conf->mirrors[i].dev;
 314                         return (0);
 315                 }
 316         }
 317
 318         printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
 319         return (-1);
 320 }
 321
 322 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
 323 {
 324         unsigned long flags;
 325         mddev_t *mddev = r1_bh->mddev;
 326         raid1_conf_t *conf = mddev_to_conf(mddev);
 327
 328         md_spin_lock_irqsave(&retry_list_lock, flags);
 329         if (raid1_retry_list == NULL)
 330                 raid1_retry_tail = &raid1_retry_list;
 331         *raid1_retry_tail = r1_bh;
 332         raid1_retry_tail = &r1_bh->next_r1;
 333         r1_bh->next_r1 = NULL;
 334         md_spin_unlock_irqrestore(&retry_list_lock, flags);
 335         md_wakeup_thread(conf->thread);
 336 }
 337
 338
 339 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
 340 {
 341         unsigned long flags;
 342         spin_lock_irqsave(&conf->segment_lock, flags);
 343         if (sector < conf->start_active)
 344                 conf->cnt_done--;
 345         else if (sector >= conf->start_future && conf->phase == phase)
 346                 conf->cnt_future--;
 347         else if (!--conf->cnt_pending)
 348                 wake_up(&conf->wait_ready);
 349
 350         spin_unlock_irqrestore(&conf->segment_lock, flags);
 351 }
 352
 353 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
 354 {
 355         unsigned long flags;
 356         spin_lock_irqsave(&conf->segment_lock, flags);
 357         if (sector >= conf->start_ready)
 358                 --conf->cnt_ready;
 359         else if (sector >= conf->start_active) {
 360                 if (!--conf->cnt_active) {
 361                         conf->start_active = conf->start_ready;
 362                         wake_up(&conf->wait_done);
 363                 }
 364         }
 365         spin_unlock_irqrestore(&conf->segment_lock, flags);
 366 }
 367
 368 /*
 369  * raid1_end_bh_io() is called when we have finished servicing a mirrored
 370  * operation and are ready to return a success/failure code to the buffer
 371  * cache layer.
 372  */
 373 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
 374 {
 375         struct buffer_head *bh = r1_bh->master_bh;
 376
 377         io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
 378                         test_bit(R1BH_SyncPhase, &r1_bh->state));
 379
 380         bh->b_end_io(bh, uptodate);
 381         raid1_free_r1bh(r1_bh);
 382 }
 383 void raid1_end_request (struct buffer_head *bh, int uptodate)
 384 {
 385         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
 386
 387         /*
 388          * this branch is our 'one mirror IO has finished' event handler:
 389          */
 390         if (!uptodate)
 391                 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
 392         else
 393                 /*
 394                  * Set R1BH_Uptodate in our master buffer_head, so that
 395                  * we will return a good error code for to the higher
 396                  * levels even if IO on some other mirrored buffer fails.
 397                  *
 398                  * The 'master' represents the complex operation to
 399                  * user-side. So if something waits for IO, then it will
 400                  * wait for the 'master' buffer_head.
 401                  */
 402                 set_bit (R1BH_Uptodate, &r1_bh->state);
 403
 404         /*
 405          * We split up the read and write side, imho they are
 406          * conceptually different.
 407          */
 408
 409         if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
 410                 /*
 411                  * we have only one buffer_head on the read side
 412                  */
 413
 414                 if (uptodate) {
 415                         raid1_end_bh_io(r1_bh, uptodate);
 416                         return;
 417                 }
 418                 /*
 419                  * oops, read error:
 420                  */
 421                 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
 422                          partition_name(bh->b_dev), bh->b_blocknr);
 423                 raid1_reschedule_retry(r1_bh);
 424                 return;
 425         }
 426
 427         /*
 428          * WRITE:
 429          *
 430          * Let's see if all mirrored write operations have finished
 431          * already.
 432          */
 433
 434         if (atomic_dec_and_test(&r1_bh->remaining))
 435                 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
 436 }
 437
 438 /*
 439  * This routine returns the disk from which the requested read should
 440  * be done. It bookkeeps the last read position for every disk
 441  * in array and when new read requests come, the disk which last
 442  * position is nearest to the request, is chosen.
 443  *
 444  * TODO: now if there are 2 mirrors in the same 2 devices, performance
 445  * degrades dramatically because position is mirror, not device based.
 446  * This should be changed to be device based. Also atomic sequential
 447  * reads should be somehow balanced.
 448  */
 449
 450 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
 451 {
 452         int new_disk = conf->last_used;
 453         const int sectors = bh->b_size >> 9;
 454         const unsigned long this_sector = bh->b_rsector;
 455         int disk = new_disk;
 456         unsigned long new_distance;
 457         unsigned long current_distance;
 458
 459         /*
 460          * Check if it is sane at all to balance
 461          */
 462
 463         if (conf->resync_mirrors)
 464                 goto rb_out;
 465
 466
 467         /* make sure that disk is operational */
 468         while( !conf->mirrors[new_disk].operational) {
 469                 if (new_disk <= 0) new_disk = conf->raid_disks;
 470                 new_disk--;
 471                 if (new_disk == disk) {
 472                         /*
 473                          * This means no working disk was found
 474                          * Nothing much to do, lets not change anything
 475                          * and hope for the best...
 476                          */
 477
 478                         new_disk = conf->last_used;
 479
 480                         goto rb_out;
 481                 }
 482         }
 483         disk = new_disk;
 484         /* now disk == new_disk == starting point for search */
 485
 486         /*
 487          * Don't touch anything for sequential reads.
 488          */
 489
 490         if (this_sector == conf->mirrors[new_disk].head_position)
 491                 goto rb_out;
 492
 493         /*
 494          * If reads have been done only on a single disk
 495          * for a time, lets give another disk a change.
 496          * This is for kicking those idling disks so that
 497          * they would find work near some hotspot.
 498          */
 499
 500         if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
 501                 conf->sect_count = 0;
 502
 503                 do {
 504                         if (new_disk<=0)
 505                                 new_disk = conf->raid_disks;
 506                         new_disk--;
 507                         if (new_disk == disk)
 508                                 break;
 509                 } while ((conf->mirrors[new_disk].write_only) ||
 510                          (!conf->mirrors[new_disk].operational));
 511
 512                 goto rb_out;
 513         }
 514
 515         current_distance = abs(this_sector -
 516                                 conf->mirrors[disk].head_position);
 517
 518         /* Find the disk which is closest */
 519
 520         do {
 521                 if (disk <= 0)
 522                         disk = conf->raid_disks;
 523                 disk--;
 524
 525                 if ((conf->mirrors[disk].write_only) ||
 526                                 (!conf->mirrors[disk].operational))
 527                         continue;
 528
 529                 new_distance = abs(this_sector -
 530                                         conf->mirrors[disk].head_position);
 531
 532                 if (new_distance < current_distance) {
 533                         conf->sect_count = 0;
 534                         current_distance = new_distance;
 535                         new_disk = disk;
 536                 }
 537         } while (disk != conf->last_used);
 538
 539 rb_out:
 540         conf->mirrors[new_disk].head_position = this_sector + sectors;
 541
 542         conf->last_used = new_disk;
 543         conf->sect_count += sectors;
 544
 545         return new_disk;
 546 }
 547
 548 static int raid1_make_request (mddev_t *mddev, int rw,
 549                                struct buffer_head * bh)
 550 {
 551         raid1_conf_t *conf = mddev_to_conf(mddev);
 552         struct buffer_head *bh_req, *bhl;
 553         struct raid1_bh * r1_bh;
 554         int disks = MD_SB_DISKS;
 555         int i, sum_bhs = 0, sectors;
 556         struct mirror_info *mirror;
 557
 558         if (!buffer_locked(bh))
 559                 BUG();
 560
 561 /*
 562  * make_request() can abort the operation when READA is being
 563  * used and no empty request is available.
 564  *
 565  * Currently, just replace the command with READ/WRITE.
 566  */
 567         if (rw == READA)
 568                 rw = READ;
 569
 570         r1_bh = raid1_alloc_r1bh (conf);
 571
 572         spin_lock_irq(&conf->segment_lock);
 573         wait_event_lock_irq(conf->wait_done,
 574                         bh->b_rsector < conf->start_active ||
 575                         bh->b_rsector >= conf->start_future,
 576                         conf->segment_lock);
 577         if (bh->b_rsector < conf->start_active)
 578                 conf->cnt_done++;
 579         else {
 580                 conf->cnt_future++;
 581                 if (conf->phase)
 582                         set_bit(R1BH_SyncPhase, &r1_bh->state);
 583         }
 584         spin_unlock_irq(&conf->segment_lock);
 585
 586         /*
 587          * i think the read and write branch should be separated completely,
 588          * since we want to do read balancing on the read side for example.
 589          * Alternative implementations? :) --mingo
 590          */
 591
 592         r1_bh->master_bh = bh;
 593         r1_bh->mddev = mddev;
 594         r1_bh->cmd = rw;
 595
 596         sectors = bh->b_size >> 9;
 597         if (rw == READ) {
 598                 /*
 599                  * read balancing logic:
 600                  */
 601                 mirror = conf->mirrors + raid1_read_balance(conf, bh);
 602
 603                 bh_req = &r1_bh->bh_req;
 604                 memcpy(bh_req, bh, sizeof(*bh));
 605                 bh_req->b_blocknr = bh->b_rsector / sectors;
 606                 bh_req->b_dev = mirror->dev;
 607                 bh_req->b_rdev = mirror->dev;
 608         /*      bh_req->b_rsector = bh->n_rsector; */
 609                 bh_req->b_end_io = raid1_end_request;
 610                 bh_req->b_private = r1_bh;
 611                 generic_make_request (rw, bh_req);
 612                 return 0;
 613         }
 614
 615         /*
 616          * WRITE:
 617          */
 618
 619         bhl = raid1_alloc_bh(conf, conf->raid_disks);
 620         for (i = 0; i < disks; i++) {
 621                 struct buffer_head *mbh;
 622                 if (!conf->mirrors[i].operational)
 623                         continue;
 624
 625         /*
 626          * We should use a private pool (size depending on NR_REQUEST),
 627          * to avoid writes filling up the memory with bhs
 628          *
 629          * Such pools are much faster than kmalloc anyways (so we waste
 630          * almost nothing by not using the master bh when writing and
 631          * win alot of cleanness) but for now we are cool enough. --mingo
 632          *
 633          * It's safe to sleep here, buffer heads cannot be used in a shared
 634          * manner in the write branch. Look how we lock the buffer at the
 635          * beginning of this function to grok the difference ;)
 636          */
 637                 mbh = bhl;
 638                 if (mbh == NULL) {
 639                         MD_BUG();
 640                         break;
 641                 }
 642                 bhl = mbh->b_next;
 643                 mbh->b_next = NULL;
 644                 mbh->b_this_page = (struct buffer_head *)1;
 645
 646         /*
 647          * prepare mirrored mbh (fields ordered for max mem throughput):
 648          */
 649                 mbh->b_blocknr    = bh->b_rsector / sectors;
 650                 mbh->b_dev        = conf->mirrors[i].dev;
 651                 mbh->b_rdev       = conf->mirrors[i].dev;
 652                 mbh->b_rsector    = bh->b_rsector;
 653                 mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
 654                                                 (1<<BH_Mapped) | (1<<BH_Lock);
 655
 656                 atomic_set(&mbh->b_count, 1);
 657                 mbh->b_size       = bh->b_size;
 658                 mbh->b_page       = bh->b_page;
 659                 mbh->b_data       = bh->b_data;
 660                 mbh->b_list       = BUF_LOCKED;
 661                 mbh->b_end_io     = raid1_end_request;
 662                 mbh->b_private    = r1_bh;
 663
 664                 mbh->b_next = r1_bh->mirror_bh_list;
 665                 r1_bh->mirror_bh_list = mbh;
 666                 sum_bhs++;
 667         }
 668         if (bhl) raid1_free_bh(conf,bhl);
 669         md_atomic_set(&r1_bh->remaining, sum_bhs);
 670
 671         /*
 672          * We have to be a bit careful about the semaphore above, thats
 673          * why we start the requests separately. Since kmalloc() could
 674          * fail, sleep and make_request() can sleep too, this is the
 675          * safer solution. Imagine, end_request decreasing the semaphore
 676          * before we could have set it up ... We could play tricks with
 677          * the semaphore (presetting it and correcting at the end if
 678          * sum_bhs is not 'n' but we have to do end_request by hand if
 679          * all requests finish until we had a chance to set up the
 680          * semaphore correctly ... lots of races).
 681          */
 682         bh = r1_bh->mirror_bh_list;
 683         while(bh) {
 684                 struct buffer_head *bh2 = bh;
 685                 bh = bh->b_next;
 686                 generic_make_request(rw, bh2);
 687         }
 688         return (0);
 689 }
 690
 691 static int raid1_status (char *page, mddev_t *mddev)
 692 {
 693         raid1_conf_t *conf = mddev_to_conf(mddev);
 694         int sz = 0, i;
 695
 696         sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
 697                                                  conf->working_disks);
 698         for (i = 0; i < conf->raid_disks; i++)
 699                 sz += sprintf (page+sz, "%s",
 700                         conf->mirrors[i].operational ? "U" : "_");
 701         sz += sprintf (page+sz, "]");
 702         return sz;
 703 }
 704
 705 #define LAST_DISK KERN_ALERT \
 706 "raid1: only one disk left and IO error.\n"
 707
 708 #define NO_SPARE_DISK KERN_ALERT \
 709 "raid1: no spare disk left, degrading mirror level by one.\n"
 710
 711 #define DISK_FAILED KERN_ALERT \
 712 "raid1: Disk failure on %s, disabling device. \n" \
 713 "       Operation continuing on %d devices\n"
 714
 715 #define START_SYNCING KERN_ALERT \
 716 "raid1: start syncing spare disk.\n"
 717
 718 #define ALREADY_SYNCING KERN_INFO \
 719 "raid1: syncing already in progress.\n"
 720
 721 static void mark_disk_bad (mddev_t *mddev, int failed)
 722 {
 723         raid1_conf_t *conf = mddev_to_conf(mddev);
 724         struct mirror_info *mirror = conf->mirrors+failed;
 725         mdp_super_t *sb = mddev->sb;
 726
 727         mirror->operational = 0;
 728         mark_disk_faulty(sb->disks+mirror->number);
 729         mark_disk_nonsync(sb->disks+mirror->number);
 730         mark_disk_inactive(sb->disks+mirror->number);
 731         sb->active_disks--;
 732         sb->working_disks--;
 733         sb->failed_disks++;
 734         mddev->sb_dirty = 1;
 735         md_wakeup_thread(conf->thread);
 736         conf->working_disks--;
 737         printk (DISK_FAILED, partition_name (mirror->dev),
 738                                  conf->working_disks);
 739 }
 740
 741 static int raid1_error (mddev_t *mddev, kdev_t dev)
 742 {
 743         raid1_conf_t *conf = mddev_to_conf(mddev);
 744         struct mirror_info * mirrors = conf->mirrors;
 745         int disks = MD_SB_DISKS;
 746         int i;
 747
 748         if (conf->working_disks == 1) {
 749                 /*
 750                  * Uh oh, we can do nothing if this is our last disk, but
 751                  * first check if this is a queued request for a device
 752                  * which has just failed.
 753                  */
 754                 for (i = 0; i < disks; i++) {
 755                         if (mirrors[i].dev==dev && !mirrors[i].operational)
 756                                 return 0;
 757                 }
 758                 printk (LAST_DISK);
 759         } else {
 760                 /*
 761                  * Mark disk as unusable
 762                  */
 763                 for (i = 0; i < disks; i++) {
 764                         if (mirrors[i].dev==dev && mirrors[i].operational) {
 765                                 mark_disk_bad(mddev, i);
 766                                 break;
 767                         }
 768                 }
 769         }
 770         return 0;
 771 }
 772
 773 #undef LAST_DISK
 774 #undef NO_SPARE_DISK
 775 #undef DISK_FAILED
 776 #undef START_SYNCING
 777
 778
 779 static void print_raid1_conf (raid1_conf_t *conf)
 780 {
 781         int i;
 782         struct mirror_info *tmp;
 783
 784         printk("RAID1 conf printout:\n");
 785         if (!conf) {
 786                 printk("(conf==NULL)\n");
 787                 return;
 788         }
 789         printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
 790                          conf->raid_disks, conf->nr_disks);
 791
 792         for (i = 0; i < MD_SB_DISKS; i++) {
 793                 tmp = conf->mirrors + i;
 794                 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
 795                         i, tmp->spare,tmp->operational,
 796                         tmp->number,tmp->raid_disk,tmp->used_slot,
 797                         partition_name(tmp->dev));
 798         }
 799 }
 800
 801 static void close_sync(raid1_conf_t *conf)
 802 {
 803         mddev_t *mddev = conf->mddev;
 804         /* If reconstruction was interrupted, we need to close the "active" and "pending"
 805          * holes.
 806          * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
 807          */
 808         /* this is really needed when recovery stops too... */
 809         spin_lock_irq(&conf->segment_lock);
 810         conf->start_active = conf->start_pending;
 811         conf->start_ready = conf->start_pending;
 812         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
 813         conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
 814         conf->start_future = mddev->sb->size+1;
 815         conf->cnt_pending = conf->cnt_future;
 816         conf->cnt_future = 0;
 817         conf->phase = conf->phase ^1;
 818         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
 819         conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
 820         conf->phase = 0;
 821         conf->cnt_future = conf->cnt_done;;
 822         conf->cnt_done = 0;
 823         spin_unlock_irq(&conf->segment_lock);
 824         wake_up(&conf->wait_done);
 825 }
 826
 827 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 828 {
 829         int err = 0;
 830         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
 831         raid1_conf_t *conf = mddev->private;
 832         struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
 833         mdp_super_t *sb = mddev->sb;
 834         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
 835
 836         print_raid1_conf(conf);
 837         md_spin_lock_irq(&conf->device_lock);
 838         /*
 839          * find the disk ...
 840          */
 841         switch (state) {
 842
 843         case DISKOP_SPARE_ACTIVE:
 844
 845                 /*
 846                  * Find the failed disk within the RAID1 configuration ...
 847                  * (this can only be in the first conf->working_disks part)
 848                  */
 849                 for (i = 0; i < conf->raid_disks; i++) {
 850                         tmp = conf->mirrors + i;
 851                         if ((!tmp->operational && !tmp->spare) ||
 852                                         !tmp->used_slot) {
 853                                 failed_disk = i;
 854                                 break;
 855                         }
 856                 }
 857                 /*
 858                  * When we activate a spare disk we _must_ have a disk in
 859                  * the lower (active) part of the array to replace.
 860                  */
 861                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
 862                         MD_BUG();
 863                         err = 1;
 864                         goto abort;
 865                 }
 866                 /* fall through */
 867
 868         case DISKOP_SPARE_WRITE:
 869         case DISKOP_SPARE_INACTIVE:
 870
 871                 /*
 872                  * Find the spare disk ... (can only be in the 'high'
 873                  * area of the array)
 874                  */
 875                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
 876                         tmp = conf->mirrors + i;
 877                         if (tmp->spare && tmp->number == (*d)->number) {
 878                                 spare_disk = i;
 879                                 break;
 880                         }
 881                 }
 882                 if (spare_disk == -1) {
 883                         MD_BUG();
 884                         err = 1;
 885                         goto abort;
 886                 }
 887                 break;
 888
 889         case DISKOP_HOT_REMOVE_DISK:
 890
 891                 for (i = 0; i < MD_SB_DISKS; i++) {
 892                         tmp = conf->mirrors + i;
 893                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
 894                                 if (tmp->operational) {
 895                                         err = -EBUSY;
 896                                         goto abort;
 897                                 }
 898                                 removed_disk = i;
 899                                 break;
 900                         }
 901                 }
 902                 if (removed_disk == -1) {
 903                         MD_BUG();
 904                         err = 1;
 905                         goto abort;
 906                 }
 907                 break;
 908
 909         case DISKOP_HOT_ADD_DISK:
 910
 911                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
 912                         tmp = conf->mirrors + i;
 913                         if (!tmp->used_slot) {
 914                                 added_disk = i;
 915                                 break;
 916                         }
 917                 }
 918                 if (added_disk == -1) {
 919                         MD_BUG();
 920                         err = 1;
 921                         goto abort;
 922                 }
 923                 break;
 924         }
 925
 926         switch (state) {
 927         /*
 928          * Switch the spare disk to write-only mode:
 929          */
 930         case DISKOP_SPARE_WRITE:
 931                 sdisk = conf->mirrors + spare_disk;
 932                 sdisk->operational = 1;
 933                 sdisk->write_only = 1;
 934                 break;
 935         /*
 936          * Deactivate a spare disk:
 937          */
 938         case DISKOP_SPARE_INACTIVE:
 939                 close_sync(conf);
 940                 sdisk = conf->mirrors + spare_disk;
 941                 sdisk->operational = 0;
 942                 sdisk->write_only = 0;
 943                 break;
 944         /*
 945          * Activate (mark read-write) the (now sync) spare disk,
 946          * which means we switch it's 'raid position' (->raid_disk)
 947          * with the failed disk. (only the first 'conf->nr_disks'
 948          * slots are used for 'real' disks and we must preserve this
 949          * property)
 950          */
 951         case DISKOP_SPARE_ACTIVE:
 952                 close_sync(conf);
 953                 sdisk = conf->mirrors + spare_disk;
 954                 fdisk = conf->mirrors + failed_disk;
 955
 956                 spare_desc = &sb->disks[sdisk->number];
 957                 failed_desc = &sb->disks[fdisk->number];
 958
 959                 if (spare_desc != *d) {
 960                         MD_BUG();
 961                         err = 1;
 962                         goto abort;
 963                 }
 964
 965                 if (spare_desc->raid_disk != sdisk->raid_disk) {
 966                         MD_BUG();
 967                         err = 1;
 968                         goto abort;
 969                 }
 970
 971                 if (sdisk->raid_disk != spare_disk) {
 972                         MD_BUG();
 973                         err = 1;
 974                         goto abort;
 975                 }
 976
 977                 if (failed_desc->raid_disk != fdisk->raid_disk) {
 978                         MD_BUG();
 979                         err = 1;
 980                         goto abort;
 981                 }
 982
 983                 if (fdisk->raid_disk != failed_disk) {
 984                         MD_BUG();
 985                         err = 1;
 986                         goto abort;
 987                 }
 988
 989                 /*
 990                  * do the switch finally
 991                  */
 992                 xchg_values(*spare_desc, *failed_desc);
 993                 xchg_values(*fdisk, *sdisk);
 994
 995                 /*
 996                  * (careful, 'failed' and 'spare' are switched from now on)
 997                  *
 998                  * we want to preserve linear numbering and we want to
 999                  * give the proper raid_disk number to the now activated
1000                  * disk. (this means we switch back these values)
1001                  */
1002
1003                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1004                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1005                 xchg_values(spare_desc->number, failed_desc->number);
1006                 xchg_values(sdisk->number, fdisk->number);
1007
1008                 *d = failed_desc;
1009
1010                 if (sdisk->dev == MKDEV(0,0))
1011                         sdisk->used_slot = 0;
1012                 /*
1013                  * this really activates the spare.
1014                  */
1015                 fdisk->spare = 0;
1016                 fdisk->write_only = 0;
1017
1018                 /*
1019                  * if we activate a spare, we definitely replace a
1020                  * non-operational disk slot in the 'low' area of
1021                  * the disk array.
1022                  */
1023
1024                 conf->working_disks++;
1025
1026                 break;
1027
1028         case DISKOP_HOT_REMOVE_DISK:
1029                 rdisk = conf->mirrors + removed_disk;
1030
1031                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1032                         MD_BUG();
1033                         err = 1;
1034                         goto abort;
1035                 }
1036                 rdisk->dev = MKDEV(0,0);
1037                 rdisk->used_slot = 0;
1038                 conf->nr_disks--;
1039                 break;
1040
1041         case DISKOP_HOT_ADD_DISK:
1042                 adisk = conf->mirrors + added_disk;
1043                 added_desc = *d;
1044
1045                 if (added_disk != added_desc->number) {
1046                         MD_BUG();
1047                         err = 1;
1048                         goto abort;
1049                 }
1050
1051                 adisk->number = added_desc->number;
1052                 adisk->raid_disk = added_desc->raid_disk;
1053                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1054
1055                 adisk->operational = 0;
1056                 adisk->write_only = 0;
1057                 adisk->spare = 1;
1058                 adisk->used_slot = 1;
1059                 adisk->head_position = 0;
1060                 conf->nr_disks++;
1061
1062                 break;
1063
1064         default:
1065                 MD_BUG();
1066                 err = 1;
1067                 goto abort;
1068         }
1069 abort:
1070         md_spin_unlock_irq(&conf->device_lock);
1071         if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1072                 /* should move to "END_REBUILD" when such exists */
1073                 raid1_shrink_buffers(conf);
1074
1075         print_raid1_conf(conf);
1076         return err;
1077 }
1078
1079
1080 #define IO_ERROR KERN_ALERT \
1081 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1082
1083 #define REDIRECT_SECTOR KERN_ERR \
1084 "raid1: %s: redirecting sector %lu to another mirror\n"
1085
1086 /*
1087  * This is a kernel thread which:
1088  *
1089  *      1.      Retries failed read operations on working mirrors.
1090  *      2.      Updates the raid superblock when problems encounter.
1091  *      3.      Performs writes following reads for array syncronising.
1092  */
1093 static void end_sync_write(struct buffer_head *bh, int uptodate);
1094 static void end_sync_read(struct buffer_head *bh, int uptodate);
1095
1096 static void raid1d (void *data)
1097 {
1098         struct raid1_bh *r1_bh;
1099         struct buffer_head *bh;
1100         unsigned long flags;
1101         mddev_t *mddev;
1102         kdev_t dev;
1103
1104
1105         for (;;) {
1106                 md_spin_lock_irqsave(&retry_list_lock, flags);
1107                 r1_bh = raid1_retry_list;
1108                 if (!r1_bh)
1109                         break;
1110                 raid1_retry_list = r1_bh->next_r1;
1111                 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1112
1113                 mddev = r1_bh->mddev;
1114                 if (mddev->sb_dirty) {
1115                         printk(KERN_INFO "dirty sb detected, updating.\n");
1116                         mddev->sb_dirty = 0;
1117                         md_update_sb(mddev);
1118                 }
1119                 bh = &r1_bh->bh_req;
1120                 switch(r1_bh->cmd) {
1121                 case SPECIAL:
1122                         /* have to allocate lots of bh structures and
1123                          * schedule writes
1124                          */
1125                         if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1126                                 int i, sum_bhs = 0;
1127                                 int disks = MD_SB_DISKS;
1128                                 struct buffer_head *bhl, *mbh;
1129                                 raid1_conf_t *conf;
1130                                 int sectors = bh->b_size >> 9;
1131
1132                                 conf = mddev_to_conf(mddev);
1133                                 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1134                                 for (i = 0; i < disks ; i++) {
1135                                         if (!conf->mirrors[i].operational)
1136                                                 continue;
1137                                         if (i==conf->last_used)
1138                                                 /* we read from here, no need to write */
1139                                                 continue;
1140                                         if (i < conf->raid_disks
1141                                             && !conf->resync_mirrors)
1142                                                 /* don't need to write this,
1143                                                  * we are just rebuilding */
1144                                                 continue;
1145                                         mbh = bhl;
1146                                         if (!mbh) {
1147                                                 MD_BUG();
1148                                                 break;
1149                                         }
1150                                         bhl = mbh->b_next;
1151                                         mbh->b_this_page = (struct buffer_head *)1;
1152
1153
1154                                 /*
1155                                  * prepare mirrored bh (fields ordered for max mem throughput):
1156                                  */
1157                                         mbh->b_blocknr    = bh->b_blocknr;
1158                                         mbh->b_dev        = conf->mirrors[i].dev;
1159                                         mbh->b_rdev       = conf->mirrors[i].dev;
1160                                         mbh->b_rsector    = bh->b_blocknr * sectors;
1161                                         mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1162                                                 (1<<BH_Mapped) | (1<<BH_Lock);
1163                                         atomic_set(&mbh->b_count, 1);
1164                                         mbh->b_size       = bh->b_size;
1165                                         mbh->b_page       = bh->b_page;
1166                                         mbh->b_data       = bh->b_data;
1167                                         mbh->b_list       = BUF_LOCKED;
1168                                         mbh->b_end_io     = end_sync_write;
1169                                         mbh->b_private    = r1_bh;
1170
1171                                         mbh->b_next = r1_bh->mirror_bh_list;
1172                                         r1_bh->mirror_bh_list = mbh;
1173
1174                                         sum_bhs++;
1175                                 }
1176                                 md_atomic_set(&r1_bh->remaining, sum_bhs);
1177                                 if (bhl) raid1_free_bh(conf, bhl);
1178                                 mbh = r1_bh->mirror_bh_list;
1179                                 while (mbh) {
1180                                         struct buffer_head *bh1 = mbh;
1181                                         mbh = mbh->b_next;
1182                                         generic_make_request(WRITE, bh1);
1183                                         md_sync_acct(bh1->b_dev, bh1->b_size/512);
1184                                 }
1185                         } else {
1186                                 dev = bh->b_dev;
1187                                 raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1188                                 if (bh->b_dev == dev) {
1189                                         printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1190                                         md_done_sync(mddev, bh->b_size>>10, 0);
1191                                 } else {
1192                                         printk (REDIRECT_SECTOR,
1193                                                 partition_name(bh->b_dev), bh->b_blocknr);
1194                                         bh->b_rdev = bh->b_dev;
1195                                         generic_make_request(READ, bh);
1196                                 }
1197                         }
1198
1199                         break;
1200                 case READ:
1201                 case READA:
1202                         dev = bh->b_dev;
1203
1204                         raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1205                         if (bh->b_dev == dev) {
1206                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1207                                 raid1_end_bh_io(r1_bh, 0);
1208                         } else {
1209                                 printk (REDIRECT_SECTOR,
1210                                         partition_name(bh->b_dev), bh->b_blocknr);
1211                                 bh->b_rdev = bh->b_dev;
1212                                 generic_make_request (r1_bh->cmd, bh);
1213                         }
1214                         break;
1215                 }
1216         }
1217         md_spin_unlock_irqrestore(&retry_list_lock, flags);
1218 }
1219 #undef IO_ERROR
1220 #undef REDIRECT_SECTOR
1221
1222 /*
1223  * Private kernel thread to reconstruct mirrors after an unclean
1224  * shutdown.
1225  */
1226 static void raid1syncd (void *data)
1227 {
1228         raid1_conf_t *conf = data;
1229         mddev_t *mddev = conf->mddev;
1230
1231         if (!conf->resync_mirrors)
1232                 return;
1233         if (conf->resync_mirrors == 2)
1234                 return;
1235         down(&mddev->recovery_sem);
1236         if (!md_do_sync(mddev, NULL)) {
1237                 /*
1238                  * Only if everything went Ok.
1239                  */
1240                 conf->resync_mirrors = 0;
1241         }
1242
1243         close_sync(conf);
1244
1245         up(&mddev->recovery_sem);
1246         raid1_shrink_buffers(conf);
1247 }
1248
1249 /*
1250  * perform a "sync" on one "block"
1251  *
1252  * We need to make sure that no normal I/O request - particularly write
1253  * requests - conflict with active sync requests.
1254  * This is achieved by conceptually dividing the device space into a
1255  * number of sections:
1256  *  DONE: 0 .. a-1     These blocks are in-sync
1257  *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1258  *                     no normal IO requests
1259  *  READY: b .. c-1    These blocks have no normal IO requests - sync
1260  *                     request may be happening
1261  *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1262  *                     ones will be added
1263  *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1264  *                     be happening, but not sync
1265  *
1266  * We keep a
1267  *   phase    which flips (0 or 1) each time d moves and
1268  * a count of:
1269  *   z =  active io requests in FUTURE since d moved - marked with
1270  *        current phase
1271  *   y =  active io requests in FUTURE before d moved, or PENDING -
1272  *        marked with previous phase
1273  *   x =  active sync requests in READY
1274  *   w =  active sync requests in ACTIVE
1275  *   v =  active io requests in DONE
1276  *
1277  * Normally, a=b=c=d=0 and z= active io requests
1278  *   or a=b=c=d=END and v= active io requests
1279  * Allowed changes to a,b,c,d:
1280  * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1281  * B:  y==0 -> c=d
1282  * C:   b=c, w+=x, x=0
1283  * D:  w==0 -> a=b
1284  * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1285  *
1286  * At start of sync we apply A.
1287  * When y reaches 0, we apply B then A then being sync requests
1288  * When sync point reaches c-1, we wait for y==0, and W==0, and
1289  * then apply apply B then A then D then C.
1290  * Finally, we apply E
1291  *
1292  * The sync request simply issues a "read" against a working drive
1293  * This is marked so that on completion the raid1d thread is woken to
1294  * issue suitable write requests
1295  */
1296
1297 static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr)
1298 {
1299         raid1_conf_t *conf = mddev_to_conf(mddev);
1300         struct mirror_info *mirror;
1301         struct raid1_bh *r1_bh;
1302         struct buffer_head *bh;
1303         int bsize;
1304         int disk;
1305
1306         spin_lock_irq(&conf->segment_lock);
1307         if (!block_nr) {
1308                 /* initialize ...*/
1309                 int buffs;
1310                 conf->start_active = 0;
1311                 conf->start_ready = 0;
1312                 conf->start_pending = 0;
1313                 conf->start_future = 0;
1314                 conf->phase = 0;
1315                 /* we want enough buffers to hold twice the window of 128*/
1316                 buffs = 128 *2 / (PAGE_SIZE>>9);
1317                 buffs = raid1_grow_buffers(conf, buffs);
1318                 if (buffs < 2)
1319                         goto nomem;
1320
1321                 conf->window = buffs*(PAGE_SIZE>>9)/2;
1322                 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1323                 conf->cnt_done = conf->cnt_pending = 0;
1324                 if (conf->cnt_ready || conf->cnt_active)
1325                         MD_BUG();
1326         }
1327         while ((block_nr<<1) >= conf->start_pending) {
1328                 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1329                         block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1330                         conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1331                 wait_event_lock_irq(conf->wait_done,
1332                                         !conf->cnt_active,
1333                                         conf->segment_lock);
1334                 wait_event_lock_irq(conf->wait_ready,
1335                                         !conf->cnt_pending,
1336                                         conf->segment_lock);
1337                 conf->start_active = conf->start_ready;
1338                 conf->start_ready = conf->start_pending;
1339                 conf->start_pending = conf->start_future;
1340                 conf->start_future = conf->start_future+conf->window;
1341                 // Note: falling off the end is not a problem
1342                 conf->phase = conf->phase ^1;
1343                 conf->cnt_active = conf->cnt_ready;
1344                 conf->cnt_ready = 0;
1345                 conf->cnt_pending = conf->cnt_future;
1346                 conf->cnt_future = 0;
1347                 wake_up(&conf->wait_done);
1348         }
1349         conf->cnt_ready++;
1350         spin_unlock_irq(&conf->segment_lock);
1351
1352
1353         /* If reconstructing, and >1 working disc,
1354          * could dedicate one to rebuild and others to
1355          * service read requests ..
1356          */
1357         disk = conf->last_used;
1358         /* make sure disk is operational */
1359         while (!conf->mirrors[disk].operational) {
1360                 if (disk <= 0) disk = conf->raid_disks;
1361                 disk--;
1362                 if (disk == conf->last_used)
1363                         break;
1364         }
1365         conf->last_used = disk;
1366
1367         mirror = conf->mirrors+conf->last_used;
1368
1369         r1_bh = raid1_alloc_buf (conf);
1370         r1_bh->master_bh = NULL;
1371         r1_bh->mddev = mddev;
1372         r1_bh->cmd = SPECIAL;
1373         bh = &r1_bh->bh_req;
1374
1375         bh->b_blocknr = block_nr;
1376         bsize = 1024;
1377         while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE
1378                         && (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) {
1379                 bh->b_blocknr >>= 1;
1380                 bsize <<= 1;
1381         }
1382         bh->b_size = bsize;
1383         bh->b_list = BUF_LOCKED;
1384         bh->b_dev = mirror->dev;
1385         bh->b_rdev = mirror->dev;
1386         bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1387         if (!bh->b_page)
1388                 BUG();
1389         if (!bh->b_data)
1390                 BUG();
1391         if (bh->b_data != page_address(bh->b_page))
1392                 BUG();
1393         bh->b_end_io = end_sync_read;
1394         bh->b_private = r1_bh;
1395         bh->b_rsector = block_nr<<1;
1396         init_waitqueue_head(&bh->b_wait);
1397
1398         generic_make_request(READ, bh);
1399         md_sync_acct(bh->b_dev, bh->b_size/512);
1400
1401         return (bsize >> 10);
1402
1403 nomem:
1404         raid1_shrink_buffers(conf);
1405         spin_unlock_irq(&conf->segment_lock);
1406         return -ENOMEM;
1407 }
1408
1409 static void end_sync_read(struct buffer_head *bh, int uptodate)
1410 {
1411         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1412
1413         /* we have read a block, now it needs to be re-written,
1414          * or re-read if the read failed.
1415          * We don't do much here, just schedule handling by raid1d
1416          */
1417         if (!uptodate)
1418                 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1419         else
1420                 set_bit(R1BH_Uptodate, &r1_bh->state);
1421         raid1_reschedule_retry(r1_bh);
1422 }
1423
1424 static void end_sync_write(struct buffer_head *bh, int uptodate)
1425 {
1426         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1427
1428         if (!uptodate)
1429                 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1430         if (atomic_dec_and_test(&r1_bh->remaining)) {
1431                 mddev_t *mddev = r1_bh->mddev;
1432                 unsigned long sect = bh->b_blocknr * (bh->b_size>>9);
1433                 int size = bh->b_size;
1434                 raid1_free_buf(r1_bh);
1435                 sync_request_done(sect, mddev_to_conf(mddev));
1436                 md_done_sync(mddev,size>>10, uptodate);
1437         }
1438 }
1439
1440 /*
1441  * This will catch the scenario in which one of the mirrors was
1442  * mounted as a normal device rather than as a part of a raid set.
1443  *
1444  * check_consistency is very personality-dependent, eg. RAID5 cannot
1445  * do this check, it uses another method.
1446  */
1447 static int __check_consistency (mddev_t *mddev, int row)
1448 {
1449         raid1_conf_t *conf = mddev_to_conf(mddev);
1450         int disks = MD_SB_DISKS;
1451         kdev_t dev;
1452         struct buffer_head *bh = NULL;
1453         int i, rc = 0;
1454         char *buffer = NULL;
1455
1456         for (i = 0; i < disks; i++) {
1457                 printk("(checking disk %d)\n",i);
1458                 if (!conf->mirrors[i].operational)
1459                         continue;
1460                 printk("(really checking disk %d)\n",i);
1461                 dev = conf->mirrors[i].dev;
1462                 set_blocksize(dev, 4096);
1463                 if ((bh = bread(dev, row / 4, 4096)) == NULL)
1464                         break;
1465                 if (!buffer) {
1466                         buffer = (char *) __get_free_page(GFP_KERNEL);
1467                         if (!buffer)
1468                                 break;
1469                         memcpy(buffer, bh->b_data, 4096);
1470                 } else if (memcmp(buffer, bh->b_data, 4096)) {
1471                         rc = 1;
1472                         break;
1473                 }
1474                 bforget(bh);
1475                 fsync_dev(dev);
1476                 invalidate_buffers(dev);
1477                 bh = NULL;
1478         }
1479         if (buffer)
1480                 free_page((unsigned long) buffer);
1481         if (bh) {
1482                 dev = bh->b_dev;
1483                 bforget(bh);
1484                 fsync_dev(dev);
1485                 invalidate_buffers(dev);
1486         }
1487         return rc;
1488 }
1489
1490 static int check_consistency (mddev_t *mddev)
1491 {
1492         if (__check_consistency(mddev, 0))
1493 /*
1494  * we do not do this currently, as it's perfectly possible to
1495  * have an inconsistent array when it's freshly created. Only
1496  * newly written data has to be consistent.
1497  */
1498                 return 0;
1499
1500         return 0;
1501 }
1502
1503 #define INVALID_LEVEL KERN_WARNING \
1504 "raid1: md%d: raid level not set to mirroring (%d)\n"
1505
1506 #define NO_SB KERN_ERR \
1507 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1508
1509 #define ERRORS KERN_ERR \
1510 "raid1: disabled mirror %s (errors detected)\n"
1511
1512 #define NOT_IN_SYNC KERN_ERR \
1513 "raid1: disabled mirror %s (not in sync)\n"
1514
1515 #define INCONSISTENT KERN_ERR \
1516 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1517
1518 #define ALREADY_RUNNING KERN_ERR \
1519 "raid1: disabled mirror %s (mirror %d already operational)\n"
1520
1521 #define OPERATIONAL KERN_INFO \
1522 "raid1: device %s operational as mirror %d\n"
1523
1524 #define MEM_ERROR KERN_ERR \
1525 "raid1: couldn't allocate memory for md%d\n"
1526
1527 #define SPARE KERN_INFO \
1528 "raid1: spare disk %s\n"
1529
1530 #define NONE_OPERATIONAL KERN_ERR \
1531 "raid1: no operational mirrors for md%d\n"
1532
1533 #define RUNNING_CKRAID KERN_ERR \
1534 "raid1: detected mirror differences -- running resync\n"
1535
1536 #define ARRAY_IS_ACTIVE KERN_INFO \
1537 "raid1: raid set md%d active with %d out of %d mirrors\n"
1538
1539 #define THREAD_ERROR KERN_ERR \
1540 "raid1: couldn't allocate thread for md%d\n"
1541
1542 #define START_RESYNC KERN_WARNING \
1543 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1544
1545 static int raid1_run (mddev_t *mddev)
1546 {
1547         raid1_conf_t *conf;
1548         int i, j, disk_idx;
1549         struct mirror_info *disk;
1550         mdp_super_t *sb = mddev->sb;
1551         mdp_disk_t *descriptor;
1552         mdk_rdev_t *rdev;
1553         struct md_list_head *tmp;
1554         int start_recovery = 0;
1555
1556         MOD_INC_USE_COUNT;
1557
1558         if (sb->level != 1) {
1559                 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1560                 goto out;
1561         }
1562         /*
1563          * copy the already verified devices into our private RAID1
1564          * bookkeeping area. [whatever we allocate in raid1_run(),
1565          * should be freed in raid1_stop()]
1566          */
1567
1568         conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1569         mddev->private = conf;
1570         if (!conf) {
1571                 printk(MEM_ERROR, mdidx(mddev));
1572                 goto out;
1573         }
1574         memset(conf, 0, sizeof(*conf));
1575
1576         ITERATE_RDEV(mddev,rdev,tmp) {
1577                 if (rdev->faulty) {
1578                         printk(ERRORS, partition_name(rdev->dev));
1579                 } else {
1580                         if (!rdev->sb) {
1581                                 MD_BUG();
1582                                 continue;
1583                         }
1584                 }
1585                 if (rdev->desc_nr == -1) {
1586                         MD_BUG();
1587                         continue;
1588                 }
1589                 descriptor = &sb->disks[rdev->desc_nr];
1590                 disk_idx = descriptor->raid_disk;
1591                 disk = conf->mirrors + disk_idx;
1592
1593                 if (disk_faulty(descriptor)) {
1594                         disk->number = descriptor->number;
1595                         disk->raid_disk = disk_idx;
1596                         disk->dev = rdev->dev;
1597                         disk->sect_limit = MAX_WORK_PER_DISK;
1598                         disk->operational = 0;
1599                         disk->write_only = 0;
1600                         disk->spare = 0;
1601                         disk->used_slot = 1;
1602                         disk->head_position = 0;
1603                         continue;
1604                 }
1605                 if (disk_active(descriptor)) {
1606                         if (!disk_sync(descriptor)) {
1607                                 printk(NOT_IN_SYNC,
1608                                         partition_name(rdev->dev));
1609                                 continue;
1610                         }
1611                         if ((descriptor->number > MD_SB_DISKS) ||
1612                                          (disk_idx > sb->raid_disks)) {
1613
1614                                 printk(INCONSISTENT,
1615                                         partition_name(rdev->dev));
1616                                 continue;
1617                         }
1618                         if (disk->operational) {
1619                                 printk(ALREADY_RUNNING,
1620                                         partition_name(rdev->dev),
1621                                         disk_idx);
1622                                 continue;
1623                         }
1624                         printk(OPERATIONAL, partition_name(rdev->dev),
1625                                         disk_idx);
1626                         disk->number = descriptor->number;
1627                         disk->raid_disk = disk_idx;
1628                         disk->dev = rdev->dev;
1629                         disk->sect_limit = MAX_WORK_PER_DISK;
1630                         disk->operational = 1;
1631                         disk->write_only = 0;
1632                         disk->spare = 0;
1633                         disk->used_slot = 1;
1634                         disk->head_position = 0;
1635                         conf->working_disks++;
1636                 } else {
1637                 /*
1638                  * Must be a spare disk ..
1639                  */
1640                         printk(SPARE, partition_name(rdev->dev));
1641                         disk->number = descriptor->number;
1642                         disk->raid_disk = disk_idx;
1643                         disk->dev = rdev->dev;
1644                         disk->sect_limit = MAX_WORK_PER_DISK;
1645                         disk->operational = 0;
1646                         disk->write_only = 0;
1647                         disk->spare = 1;
1648                         disk->used_slot = 1;
1649                         disk->head_position = 0;
1650                 }
1651         }
1652         conf->raid_disks = sb->raid_disks;
1653         conf->nr_disks = sb->nr_disks;
1654         conf->mddev = mddev;
1655         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1656
1657         conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1658         init_waitqueue_head(&conf->wait_buffer);
1659         init_waitqueue_head(&conf->wait_done);
1660         init_waitqueue_head(&conf->wait_ready);
1661
1662         if (!conf->working_disks) {
1663                 printk(NONE_OPERATIONAL, mdidx(mddev));
1664                 goto out_free_conf;
1665         }
1666
1667
1668         /* pre-allocate some buffer_head structures.
1669          * As a minimum, 1 r1bh and raid_disks buffer_heads
1670          * would probably get us by in tight memory situations,
1671          * but a few more is probably a good idea.
1672          * For now, try 16 r1bh and 16*raid_disks bufferheads
1673          * This will allow at least 16 concurrent reads or writes
1674          * even if kmalloc starts failing
1675          */
1676         if (raid1_grow_r1bh(conf, 16) < 16 ||
1677             raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
1678                 printk(MEM_ERROR, mdidx(mddev));
1679                 goto out_free_conf;
1680         }
1681
1682         for (i = 0; i < MD_SB_DISKS; i++) {
1683
1684                 descriptor = sb->disks+i;
1685                 disk_idx = descriptor->raid_disk;
1686                 disk = conf->mirrors + disk_idx;
1687
1688                 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1689                                 !disk->used_slot) {
1690
1691                         disk->number = descriptor->number;
1692                         disk->raid_disk = disk_idx;
1693                         disk->dev = MKDEV(0,0);
1694
1695                         disk->operational = 0;
1696                         disk->write_only = 0;
1697                         disk->spare = 0;
1698                         disk->used_slot = 1;
1699                         disk->head_position = 0;
1700                 }
1701         }
1702
1703         /*
1704          * find the first working one and use it as a starting point
1705          * to read balancing.
1706          */
1707         for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
1708                 /* nothing */;
1709         conf->last_used = j;
1710
1711
1712         if (conf->working_disks != sb->raid_disks) {
1713                 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1714                 start_recovery = 1;
1715         }
1716
1717         if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
1718                 /*
1719                  * we do sanity checks even if the device says
1720                  * it's clean ...
1721                  */
1722                 if (check_consistency(mddev)) {
1723                         printk(RUNNING_CKRAID);
1724                         sb->state &= ~(1 << MD_SB_CLEAN);
1725                 }
1726         }
1727
1728         {
1729                 const char * name = "raid1d";
1730
1731                 conf->thread = md_register_thread(raid1d, conf, name);
1732                 if (!conf->thread) {
1733                         printk(THREAD_ERROR, mdidx(mddev));
1734                         goto out_free_conf;
1735                 }
1736         }
1737
1738         if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1739                 const char * name = "raid1syncd";
1740
1741                 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1742                 if (!conf->resync_thread) {
1743                         printk(THREAD_ERROR, mdidx(mddev));
1744                         goto out_free_conf;
1745                 }
1746
1747                 printk(START_RESYNC, mdidx(mddev));
1748                 conf->resync_mirrors = 1;
1749                 md_wakeup_thread(conf->resync_thread);
1750         }
1751
1752         /*
1753          * Regenerate the "device is in sync with the raid set" bit for
1754          * each device.
1755          */
1756         for (i = 0; i < MD_SB_DISKS; i++) {
1757                 mark_disk_nonsync(sb->disks+i);
1758                 for (j = 0; j < sb->raid_disks; j++) {
1759                         if (!conf->mirrors[j].operational)
1760                                 continue;
1761                         if (sb->disks[i].number == conf->mirrors[j].number)
1762                                 mark_disk_sync(sb->disks+i);
1763                 }
1764         }
1765         sb->active_disks = conf->working_disks;
1766
1767         if (start_recovery)
1768                 md_recover_arrays();
1769
1770
1771         printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1772         /*
1773          * Ok, everything is just fine now
1774          */
1775         return 0;
1776
1777 out_free_conf:
1778         raid1_shrink_r1bh(conf);
1779         raid1_shrink_bh(conf, conf->freebh_cnt);
1780         raid1_shrink_buffers(conf);
1781         kfree(conf);
1782         mddev->private = NULL;
1783 out:
1784         MOD_DEC_USE_COUNT;
1785         return -EIO;
1786 }
1787
1788 #undef INVALID_LEVEL
1789 #undef NO_SB
1790 #undef ERRORS
1791 #undef NOT_IN_SYNC
1792 #undef INCONSISTENT
1793 #undef ALREADY_RUNNING
1794 #undef OPERATIONAL
1795 #undef SPARE
1796 #undef NONE_OPERATIONAL
1797 #undef RUNNING_CKRAID
1798 #undef ARRAY_IS_ACTIVE
1799
1800 static int raid1_stop_resync (mddev_t *mddev)
1801 {
1802         raid1_conf_t *conf = mddev_to_conf(mddev);
1803
1804         if (conf->resync_thread) {
1805                 if (conf->resync_mirrors) {
1806                         conf->resync_mirrors = 2;
1807                         md_interrupt_thread(conf->resync_thread);
1808
1809                         printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1810                         return 1;
1811                 }
1812                 return 0;
1813         }
1814         return 0;
1815 }
1816
1817 static int raid1_restart_resync (mddev_t *mddev)
1818 {
1819         raid1_conf_t *conf = mddev_to_conf(mddev);
1820
1821         if (conf->resync_mirrors) {
1822                 if (!conf->resync_thread) {
1823                         MD_BUG();
1824                         return 0;
1825                 }
1826                 conf->resync_mirrors = 1;
1827                 md_wakeup_thread(conf->resync_thread);
1828                 return 1;
1829         }
1830         return 0;
1831 }
1832
1833 static int raid1_stop (mddev_t *mddev)
1834 {
1835         raid1_conf_t *conf = mddev_to_conf(mddev);
1836
1837         md_unregister_thread(conf->thread);
1838         if (conf->resync_thread)
1839                 md_unregister_thread(conf->resync_thread);
1840         raid1_shrink_r1bh(conf);
1841         raid1_shrink_bh(conf, conf->freebh_cnt);
1842         raid1_shrink_buffers(conf);
1843         kfree(conf);
1844         mddev->private = NULL;
1845         MOD_DEC_USE_COUNT;
1846         return 0;
1847 }
1848
1849 static mdk_personality_t raid1_personality=
1850 {
1851         name:           "raid1",
1852         make_request:   raid1_make_request,
1853         run:            raid1_run,
1854         stop:           raid1_stop,
1855         status:         raid1_status,
1856         error_handler:  raid1_error,
1857         diskop:         raid1_diskop,
1858         stop_resync:    raid1_stop_resync,
1859         restart_resync: raid1_restart_resync,
1860         sync_request:   raid1_sync_request
1861 };
1862
1863 static int md__init raid1_init (void)
1864 {
1865         return register_md_personality (RAID1, &raid1_personality);
1866 }
1867
1868 static void raid1_exit (void)
1869 {
1870         unregister_md_personality (RAID1);
1871 }
1872
1873 module_init(raid1_init);
1874 module_exit(raid1_exit);
1875