drivers/block/raid1.c

   1 /*
   2  * raid1.c : Multiple Devices driver for Linux
   3  *
   4  * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
   5  *
   6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   7  *
   8  * RAID-1 management functions.
   9  *
  10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
  11  *
  12  * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
  13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
  14  *
  15  * This program is free software; you can redistribute it and/or modify
  16  * it under the terms of the GNU General Public License as published by
  17  * the Free Software Foundation; either version 2, or (at your option)
  18  * any later version.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * (for example /usr/src/linux/COPYING); if not, write to the Free
  22  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25 #include <linux/module.h>
  26 #include <linux/malloc.h>
  27 #include <linux/raid/raid1.h>
  28 #include <asm/atomic.h>
  29
  30 #define MAJOR_NR MD_MAJOR
  31 #define MD_DRIVER
  32 #define MD_PERSONALITY
  33
  34 #define MAX_WORK_PER_DISK 128
  35
  36 /*
  37  * The following can be used to debug the driver
  38  */
  39 #define RAID1_DEBUG     0
  40
  41 #if RAID1_DEBUG
  42 #define PRINTK(x...)   printk(x)
  43 #define inline
  44 #define __inline__
  45 #else
  46 #define PRINTK(x...)  do { } while (0)
  47 #endif
  48
  49
  50 static mdk_personality_t raid1_personality;
  51 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
  52 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
  53
  54 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
  55 {
  56         /* return a linked list of "cnt" struct buffer_heads.
  57          * don't take any off the free list unless we know we can
  58          * get all we need, otherwise we could deadlock
  59          */
  60         struct buffer_head *bh=NULL;
  61
  62         while(cnt) {
  63                 struct buffer_head *t;
  64                 md_spin_lock_irq(&conf->device_lock);
  65                 if (conf->freebh_cnt >= cnt)
  66                         while (cnt) {
  67                                 t = conf->freebh;
  68                                 conf->freebh = t->b_next;
  69                                 t->b_next = bh;
  70                                 bh = t;
  71                                 t->b_state = 0;
  72                                 conf->freebh_cnt--;
  73                                 cnt--;
  74                         }
  75                 md_spin_unlock_irq(&conf->device_lock);
  76                 if (cnt == 0)
  77                         break;
  78                 t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_KERNEL);
  79                 if (t) {
  80                         memset(t, 0, sizeof(*t));
  81                         t->b_next = bh;
  82                         bh = t;
  83                         cnt--;
  84                 } else {
  85                         PRINTK("waiting for %d bh\n", cnt);
  86                         wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
  87                 }
  88         }
  89         return bh;
  90 }
  91
  92 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
  93 {
  94         md_spin_lock_irq(&conf->device_lock);
  95         while (bh) {
  96                 struct buffer_head *t = bh;
  97                 bh=bh->b_next;
  98                 if (t->b_pprev == NULL)
  99                         kfree(t);
 100                 else {
 101                         t->b_next= conf->freebh;
 102                         conf->freebh = t;
 103                         conf->freebh_cnt++;
 104                 }
 105         }
 106         md_spin_unlock_irq(&conf->device_lock);
 107         wake_up(&conf->wait_buffer);
 108 }
 109
 110 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
 111 {
 112         /* allocate cnt buffer_heads, possibly less if kalloc fails */
 113         int i = 0;
 114
 115         while (i < cnt) {
 116                 struct buffer_head *bh;
 117                 bh = kmalloc(sizeof(*bh), GFP_KERNEL);
 118                 if (!bh) break;
 119                 memset(bh, 0, sizeof(*bh));
 120
 121                 md_spin_lock_irq(&conf->device_lock);
 122                 bh->b_pprev = &conf->freebh;
 123                 bh->b_next = conf->freebh;
 124                 conf->freebh = bh;
 125                 conf->freebh_cnt++;
 126                 md_spin_unlock_irq(&conf->device_lock);
 127
 128                 i++;
 129         }
 130         return i;
 131 }
 132
 133 static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
 134 {
 135         /* discard cnt buffer_heads, if we can find them */
 136         int i = 0;
 137
 138         md_spin_lock_irq(&conf->device_lock);
 139         while ((i < cnt) && conf->freebh) {
 140                 struct buffer_head *bh = conf->freebh;
 141                 conf->freebh = bh->b_next;
 142                 kfree(bh);
 143                 i++;
 144                 conf->freebh_cnt--;
 145         }
 146         md_spin_unlock_irq(&conf->device_lock);
 147         return i;
 148 }
 149
 150
 151 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
 152 {
 153         struct raid1_bh *r1_bh = NULL;
 154
 155         do {
 156                 md_spin_lock_irq(&conf->device_lock);
 157                 if (conf->freer1) {
 158                         r1_bh = conf->freer1;
 159                         conf->freer1 = r1_bh->next_r1;
 160                         r1_bh->next_r1 = NULL;
 161                         r1_bh->state = 0;
 162                         r1_bh->bh_req.b_state = 0;
 163                 }
 164                 md_spin_unlock_irq(&conf->device_lock);
 165                 if (r1_bh)
 166                         return r1_bh;
 167                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh),
 168                                         GFP_KERNEL);
 169                 if (r1_bh) {
 170                         memset(r1_bh, 0, sizeof(*r1_bh));
 171                         return r1_bh;
 172                 }
 173                 wait_event(conf->wait_buffer, conf->freer1);
 174         } while (1);
 175 }
 176
 177 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
 178 {
 179         struct buffer_head *bh = r1_bh->mirror_bh_list;
 180         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
 181
 182         r1_bh->mirror_bh_list = NULL;
 183
 184         if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
 185                 md_spin_lock_irq(&conf->device_lock);
 186                 r1_bh->next_r1 = conf->freer1;
 187                 conf->freer1 = r1_bh;
 188                 md_spin_unlock_irq(&conf->device_lock);
 189         } else {
 190                 kfree(r1_bh);
 191         }
 192         raid1_free_bh(conf, bh);
 193 }
 194
 195 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
 196 {
 197         int i = 0;
 198
 199         while (i < cnt) {
 200                 struct raid1_bh *r1_bh;
 201                 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
 202                 if (!r1_bh)
 203                         break;
 204                 memset(r1_bh, 0, sizeof(*r1_bh));
 205
 206                 md_spin_lock_irq(&conf->device_lock);
 207                 set_bit(R1BH_PreAlloc, &r1_bh->state);
 208                 r1_bh->next_r1 = conf->freer1;
 209                 conf->freer1 = r1_bh;
 210                 md_spin_unlock_irq(&conf->device_lock);
 211
 212                 i++;
 213         }
 214         return i;
 215 }
 216
 217 static void raid1_shrink_r1bh(raid1_conf_t *conf)
 218 {
 219         md_spin_lock_irq(&conf->device_lock);
 220         while (conf->freer1) {
 221                 struct raid1_bh *r1_bh = conf->freer1;
 222                 conf->freer1 = r1_bh->next_r1;
 223                 kfree(r1_bh);
 224         }
 225         md_spin_unlock_irq(&conf->device_lock);
 226 }
 227
 228
 229
 230 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
 231 {
 232         struct buffer_head *bh = r1_bh->mirror_bh_list;
 233         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
 234         r1_bh->mirror_bh_list = NULL;
 235
 236         md_spin_lock_irq(&conf->device_lock);
 237         r1_bh->next_r1 = conf->freebuf;
 238         conf->freebuf = r1_bh;
 239         md_spin_unlock_irq(&conf->device_lock);
 240         raid1_free_bh(conf, bh);
 241 }
 242
 243 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
 244 {
 245         struct raid1_bh *r1_bh;
 246
 247         md_spin_lock_irq(&conf->device_lock);
 248         wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
 249         r1_bh = conf->freebuf;
 250         conf->freebuf = r1_bh->next_r1;
 251         r1_bh->next_r1= NULL;
 252         md_spin_unlock_irq(&conf->device_lock);
 253
 254         return r1_bh;
 255 }
 256
 257 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
 258 {
 259         int i = 0;
 260
 261         md_spin_lock_irq(&conf->device_lock);
 262         while (i < cnt) {
 263                 struct raid1_bh *r1_bh;
 264                 struct page *page;
 265
 266                 page = alloc_page(GFP_KERNEL);
 267                 if (!page)
 268                         break;
 269
 270                 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
 271                 if (!r1_bh) {
 272                         __free_page(page);
 273                         break;
 274                 }
 275                 memset(r1_bh, 0, sizeof(*r1_bh));
 276                 r1_bh->bh_req.b_page = page;
 277                 r1_bh->bh_req.b_data = (char *) page_address(page);
 278                 r1_bh->next_r1 = conf->freebuf;
 279                 conf->freebuf = r1_bh;
 280                 i++;
 281         }
 282         md_spin_unlock_irq(&conf->device_lock);
 283         return i;
 284 }
 285
 286 static void raid1_shrink_buffers (raid1_conf_t *conf)
 287 {
 288         md_spin_lock_irq(&conf->device_lock);
 289         while (conf->freebuf) {
 290                 struct raid1_bh *r1_bh = conf->freebuf;
 291                 conf->freebuf = r1_bh->next_r1;
 292                 __free_page(r1_bh->bh_req.b_page);
 293                 kfree(r1_bh);
 294         }
 295         md_spin_unlock_irq(&conf->device_lock);
 296 }
 297
 298 static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size)
 299 {
 300         raid1_conf_t *conf = mddev_to_conf(mddev);
 301         int i, disks = MD_SB_DISKS;
 302
 303         /*
 304          * Later we do read balancing on the read side
 305          * now we use the first available disk.
 306          */
 307
 308         for (i = 0; i < disks; i++) {
 309                 if (conf->mirrors[i].operational) {
 310                         *rdev = conf->mirrors[i].dev;
 311                         return (0);
 312                 }
 313         }
 314
 315         printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
 316         return (-1);
 317 }
 318
 319 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
 320 {
 321         unsigned long flags;
 322         mddev_t *mddev = r1_bh->mddev;
 323         raid1_conf_t *conf = mddev_to_conf(mddev);
 324
 325         md_spin_lock_irqsave(&retry_list_lock, flags);
 326         if (raid1_retry_list == NULL)
 327                 raid1_retry_tail = &raid1_retry_list;
 328         *raid1_retry_tail = r1_bh;
 329         raid1_retry_tail = &r1_bh->next_r1;
 330         r1_bh->next_r1 = NULL;
 331         md_spin_unlock_irqrestore(&retry_list_lock, flags);
 332         md_wakeup_thread(conf->thread);
 333 }
 334
 335
 336 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
 337 {
 338         unsigned long flags;
 339         spin_lock_irqsave(&conf->segment_lock, flags);
 340         if (sector < conf->start_active)
 341                 conf->cnt_done--;
 342         else if (sector >= conf->start_future && conf->phase == phase)
 343                 conf->cnt_future--;
 344         else if (!--conf->cnt_pending)
 345                 wake_up(&conf->wait_ready);
 346
 347         spin_unlock_irqrestore(&conf->segment_lock, flags);
 348 }
 349
 350 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
 351 {
 352         unsigned long flags;
 353         spin_lock_irqsave(&conf->segment_lock, flags);
 354         if (sector >= conf->start_ready)
 355                 --conf->cnt_ready;
 356         else if (sector >= conf->start_active) {
 357                 if (!--conf->cnt_active) {
 358                         conf->start_active = conf->start_ready;
 359                         wake_up(&conf->wait_done);
 360                 }
 361         }
 362         spin_unlock_irqrestore(&conf->segment_lock, flags);
 363 }
 364
 365 /*
 366  * raid1_end_bh_io() is called when we have finished servicing a mirrored
 367  * operation and are ready to return a success/failure code to the buffer
 368  * cache layer.
 369  */
 370 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
 371 {
 372         struct buffer_head *bh = r1_bh->master_bh;
 373
 374         io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
 375                         test_bit(R1BH_SyncPhase, &r1_bh->state));
 376
 377         bh->b_end_io(bh, uptodate);
 378         raid1_free_r1bh(r1_bh);
 379 }
 380 void raid1_end_request (struct buffer_head *bh, int uptodate)
 381 {
 382         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
 383
 384         /*
 385          * this branch is our 'one mirror IO has finished' event handler:
 386          */
 387         if (!uptodate)
 388                 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
 389         else
 390                 /*
 391                  * Set R1BH_Uptodate in our master buffer_head, so that
 392                  * we will return a good error code for to the higher
 393                  * levels even if IO on some other mirrored buffer fails.
 394                  *
 395                  * The 'master' represents the complex operation to
 396                  * user-side. So if something waits for IO, then it will
 397                  * wait for the 'master' buffer_head.
 398                  */
 399                 set_bit (R1BH_Uptodate, &r1_bh->state);
 400
 401         /*
 402          * We split up the read and write side, imho they are
 403          * conceptually different.
 404          */
 405
 406         if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
 407                 /*
 408                  * we have only one buffer_head on the read side
 409                  */
 410
 411                 if (uptodate) {
 412                         raid1_end_bh_io(r1_bh, uptodate);
 413                         return;
 414                 }
 415                 /*
 416                  * oops, read error:
 417                  */
 418                 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
 419                          partition_name(bh->b_dev), bh->b_blocknr);
 420                 raid1_reschedule_retry(r1_bh);
 421                 return;
 422         }
 423
 424         /*
 425          * WRITE:
 426          *
 427          * Let's see if all mirrored write operations have finished
 428          * already.
 429          */
 430
 431         if (atomic_dec_and_test(&r1_bh->remaining))
 432                 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
 433 }
 434
 435 /*
 436  * This routine returns the disk from which the requested read should
 437  * be done. It bookkeeps the last read position for every disk
 438  * in array and when new read requests come, the disk which last
 439  * position is nearest to the request, is chosen.
 440  *
 441  * TODO: now if there are 2 mirrors in the same 2 devices, performance
 442  * degrades dramatically because position is mirror, not device based.
 443  * This should be changed to be device based. Also atomic sequential
 444  * reads should be somehow balanced.
 445  */
 446
 447 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
 448 {
 449         int new_disk = conf->last_used;
 450         const int sectors = bh->b_size >> 9;
 451         const long this_sector = bh->b_blocknr * sectors;
 452         int disk = new_disk;
 453         unsigned long new_distance;
 454         unsigned long current_distance;
 455
 456         /*
 457          * Check if it is sane at all to balance
 458          */
 459
 460         if (conf->resync_mirrors)
 461                 goto rb_out;
 462
 463         if (conf->working_disks < 2) {
 464                 int i = 0;
 465
 466                 while( !conf->mirrors[new_disk].operational &&
 467                                 (i < MD_SB_DISKS) ) {
 468                         new_disk = conf->mirrors[new_disk].next;
 469                         i++;
 470                 }
 471
 472                 if (i >= MD_SB_DISKS) {
 473                         /*
 474                          * This means no working disk was found
 475                          * Nothing much to do, lets not change anything
 476                          * and hope for the best...
 477                          */
 478
 479                         new_disk = conf->last_used;
 480                 }
 481
 482                 goto rb_out;
 483         }
 484
 485         /*
 486          * Don't touch anything for sequential reads.
 487          */
 488
 489         if (this_sector == conf->mirrors[new_disk].head_position)
 490                 goto rb_out;
 491
 492         /*
 493          * If reads have been done only on a single disk
 494          * for a time, lets give another disk a change.
 495          * This is for kicking those idling disks so that
 496          * they would find work near some hotspot.
 497          */
 498
 499         if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
 500                 conf->sect_count = 0;
 501
 502                 while( new_disk != conf->mirrors[new_disk].next ) {
 503                         if ((conf->mirrors[new_disk].write_only) ||
 504                                 (!conf->mirrors[new_disk].operational) )
 505                                 continue;
 506
 507                         new_disk = conf->mirrors[new_disk].next;
 508                         break;
 509                 }
 510
 511                 goto rb_out;
 512         }
 513
 514         current_distance = abs(this_sector -
 515                                 conf->mirrors[disk].head_position);
 516
 517         /* Find the disk which is closest */
 518
 519         while( conf->mirrors[disk].next != conf->last_used ) {
 520                 disk = conf->mirrors[disk].next;
 521
 522                 if ((conf->mirrors[disk].write_only) ||
 523                                 (!conf->mirrors[disk].operational))
 524                         continue;
 525
 526                 new_distance = abs(this_sector -
 527                                         conf->mirrors[disk].head_position);
 528
 529                 if (new_distance < current_distance) {
 530                         conf->sect_count = 0;
 531                         current_distance = new_distance;
 532                         new_disk = disk;
 533                 }
 534         }
 535
 536 rb_out:
 537         conf->mirrors[new_disk].head_position = this_sector + sectors;
 538
 539         conf->last_used = new_disk;
 540         conf->sect_count += sectors;
 541
 542         return new_disk;
 543 }
 544
 545 static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw,
 546                                                  struct buffer_head * bh)
 547 {
 548         raid1_conf_t *conf = mddev_to_conf(mddev);
 549         struct buffer_head *bh_req, *bhl;
 550         struct raid1_bh * r1_bh;
 551         int disks = MD_SB_DISKS;
 552         int i, sum_bhs = 0, sectors;
 553         struct mirror_info *mirror;
 554         DECLARE_WAITQUEUE(wait, current);
 555
 556         if (!buffer_locked(bh))
 557                 BUG();
 558
 559 /*
 560  * make_request() can abort the operation when READA is being
 561  * used and no empty request is available.
 562  *
 563  * Currently, just replace the command with READ/WRITE.
 564  */
 565         if (rw == READA)
 566                 rw = READ;
 567
 568         if (rw == WRITE) {
 569                 rw = WRITERAW;
 570                 /*
 571                  * we first clean the bh, then we start the IO, then
 572                  * when the IO has finished, we end_io the bh and
 573                  * mark it uptodate. This way we do not miss the
 574                  * case when the bh got dirty again during the IO.
 575                  *
 576                  * We do an important optimization here - if the
 577                  * buffer was not dirty and we are during resync or
 578                  * reconstruction, then we can skip writing it back
 579                  * to the master disk! (we still have to write it
 580                  * back to the other disks, because we are not sync
 581                  * yet.)
 582                  */
 583                 if (atomic_set_buffer_clean(bh))
 584                         __mark_buffer_clean(bh);
 585                 else {
 586                         bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
 587                         return 0;
 588                 }
 589         }
 590         r1_bh = raid1_alloc_r1bh (conf);
 591
 592         spin_lock_irq(&conf->segment_lock);
 593         wait_event_lock_irq(conf->wait_done,
 594                         bh->b_rsector < conf->start_active ||
 595                         bh->b_rsector >= conf->start_future,
 596                         conf->segment_lock);
 597         if (bh->b_rsector < conf->start_active)
 598                 conf->cnt_done++;
 599         else {
 600                 conf->cnt_future++;
 601                 if (conf->phase)
 602                         set_bit(R1BH_SyncPhase, &r1_bh->state);
 603         }
 604         spin_unlock_irq(&conf->segment_lock);
 605
 606         /*
 607          * i think the read and write branch should be separated completely,
 608          * since we want to do read balancing on the read side for example.
 609          * Alternative implementations? :) --mingo
 610          */
 611
 612         r1_bh->master_bh = bh;
 613         r1_bh->mddev = mddev;
 614         r1_bh->cmd = rw;
 615
 616         sectors = bh->b_size >> 9;
 617         if (rw == READ) {
 618                 /*
 619                  * read balancing logic:
 620                  */
 621                 mirror = conf->mirrors + raid1_read_balance(conf, bh);
 622
 623                 bh_req = &r1_bh->bh_req;
 624                 memcpy(bh_req, bh, sizeof(*bh));
 625                 bh_req->b_blocknr = bh->b_rsector * sectors;
 626                 bh_req->b_dev = mirror->dev;
 627                 bh_req->b_rdev = mirror->dev;
 628         /*      bh_req->b_rsector = bh->n_rsector; */
 629                 bh_req->b_end_io = raid1_end_request;
 630                 bh_req->b_dev_id = r1_bh;
 631                 q = blk_get_queue(bh_req->b_rdev);
 632                 generic_make_request (q, rw, bh_req);
 633                 return 0;
 634         }
 635
 636         /*
 637          * WRITE:
 638          */
 639
 640         bhl = raid1_alloc_bh(conf, conf->raid_disks);
 641         for (i = 0; i < disks; i++) {
 642                 struct buffer_head *mbh;
 643                 if (!conf->mirrors[i].operational)
 644                         continue;
 645
 646         /*
 647          * We should use a private pool (size depending on NR_REQUEST),
 648          * to avoid writes filling up the memory with bhs
 649          *
 650          * Such pools are much faster than kmalloc anyways (so we waste
 651          * almost nothing by not using the master bh when writing and
 652          * win alot of cleanness) but for now we are cool enough. --mingo
 653          *
 654          * It's safe to sleep here, buffer heads cannot be used in a shared
 655          * manner in the write branch. Look how we lock the buffer at the
 656          * beginning of this function to grok the difference ;)
 657          */
 658                 mbh = bhl;
 659                 if (mbh == NULL) {
 660                         MD_BUG();
 661                         break;
 662                 }
 663                 bhl = mbh->b_next;
 664                 mbh->b_next = NULL;
 665                 mbh->b_this_page = (struct buffer_head *)1;
 666
 667         /*
 668          * prepare mirrored mbh (fields ordered for max mem throughput):
 669          */
 670                 mbh->b_blocknr    = bh->b_rsector * sectors;
 671                 mbh->b_dev        = conf->mirrors[i].dev;
 672                 mbh->b_rdev       = conf->mirrors[i].dev;
 673                 mbh->b_rsector    = bh->b_rsector;
 674                 mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
 675                                                 (1<<BH_Mapped) | (1<<BH_Lock);
 676
 677                 atomic_set(&mbh->b_count, 1);
 678                 mbh->b_size       = bh->b_size;
 679                 mbh->b_page       = bh->b_page;
 680                 mbh->b_data       = bh->b_data;
 681                 mbh->b_list       = BUF_LOCKED;
 682                 mbh->b_end_io     = raid1_end_request;
 683                 mbh->b_dev_id     = r1_bh;
 684
 685                 mbh->b_next = r1_bh->mirror_bh_list;
 686                 r1_bh->mirror_bh_list = mbh;
 687                 sum_bhs++;
 688         }
 689         if (bhl) raid1_free_bh(conf,bhl);
 690         md_atomic_set(&r1_bh->remaining, sum_bhs);
 691
 692         /*
 693          * We have to be a bit careful about the semaphore above, thats
 694          * why we start the requests separately. Since kmalloc() could
 695          * fail, sleep and make_request() can sleep too, this is the
 696          * safer solution. Imagine, end_request decreasing the semaphore
 697          * before we could have set it up ... We could play tricks with
 698          * the semaphore (presetting it and correcting at the end if
 699          * sum_bhs is not 'n' but we have to do end_request by hand if
 700          * all requests finish until we had a chance to set up the
 701          * semaphore correctly ... lots of races).
 702          */
 703         bh = r1_bh->mirror_bh_list;
 704         while(bh) {
 705                 struct buffer_head *bh2 = bh;
 706                 bh = bh->b_next;
 707                 q = blk_get_queue(bh2->b_rdev);
 708                 generic_make_request(q, rw, bh2);
 709         }
 710         return (0);
 711 }
 712
 713 static int raid1_status (char *page, mddev_t *mddev)
 714 {
 715         raid1_conf_t *conf = mddev_to_conf(mddev);
 716         int sz = 0, i;
 717
 718         sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
 719                                                  conf->working_disks);
 720         for (i = 0; i < conf->raid_disks; i++)
 721                 sz += sprintf (page+sz, "%s",
 722                         conf->mirrors[i].operational ? "U" : "_");
 723         sz += sprintf (page+sz, "]");
 724         return sz;
 725 }
 726
 727 static void unlink_disk (raid1_conf_t *conf, int target)
 728 {
 729         int disks = MD_SB_DISKS;
 730         int i;
 731
 732         for (i = 0; i < disks; i++)
 733                 if (conf->mirrors[i].next == target)
 734                         conf->mirrors[i].next = conf->mirrors[target].next;
 735 }
 736
 737 #define LAST_DISK KERN_ALERT \
 738 "raid1: only one disk left and IO error.\n"
 739
 740 #define NO_SPARE_DISK KERN_ALERT \
 741 "raid1: no spare disk left, degrading mirror level by one.\n"
 742
 743 #define DISK_FAILED KERN_ALERT \
 744 "raid1: Disk failure on %s, disabling device. \n" \
 745 "       Operation continuing on %d devices\n"
 746
 747 #define START_SYNCING KERN_ALERT \
 748 "raid1: start syncing spare disk.\n"
 749
 750 #define ALREADY_SYNCING KERN_INFO \
 751 "raid1: syncing already in progress.\n"
 752
 753 static void mark_disk_bad (mddev_t *mddev, int failed)
 754 {
 755         raid1_conf_t *conf = mddev_to_conf(mddev);
 756         struct mirror_info *mirror = conf->mirrors+failed;
 757         mdp_super_t *sb = mddev->sb;
 758
 759         mirror->operational = 0;
 760         unlink_disk(conf, failed);
 761         mark_disk_faulty(sb->disks+mirror->number);
 762         mark_disk_nonsync(sb->disks+mirror->number);
 763         mark_disk_inactive(sb->disks+mirror->number);
 764         sb->active_disks--;
 765         sb->working_disks--;
 766         sb->failed_disks++;
 767         mddev->sb_dirty = 1;
 768         md_wakeup_thread(conf->thread);
 769         conf->working_disks--;
 770         printk (DISK_FAILED, partition_name (mirror->dev),
 771                                  conf->working_disks);
 772 }
 773
 774 static int raid1_error (mddev_t *mddev, kdev_t dev)
 775 {
 776         raid1_conf_t *conf = mddev_to_conf(mddev);
 777         struct mirror_info * mirrors = conf->mirrors;
 778         int disks = MD_SB_DISKS;
 779         int i;
 780
 781         if (conf->working_disks == 1) {
 782                 /*
 783                  * Uh oh, we can do nothing if this is our last disk, but
 784                  * first check if this is a queued request for a device
 785                  * which has just failed.
 786                  */
 787                 for (i = 0; i < disks; i++) {
 788                         if (mirrors[i].dev==dev && !mirrors[i].operational)
 789                                 return 0;
 790                 }
 791                 printk (LAST_DISK);
 792         } else {
 793                 /*
 794                  * Mark disk as unusable
 795                  */
 796                 for (i = 0; i < disks; i++) {
 797                         if (mirrors[i].dev==dev && mirrors[i].operational) {
 798                                 mark_disk_bad(mddev, i);
 799                                 break;
 800                         }
 801                 }
 802         }
 803         return 0;
 804 }
 805
 806 #undef LAST_DISK
 807 #undef NO_SPARE_DISK
 808 #undef DISK_FAILED
 809 #undef START_SYNCING
 810
 811 /*
 812  * Insert the spare disk into the drive-ring
 813  */
 814 static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
 815 {
 816         int j, next;
 817         int disks = MD_SB_DISKS;
 818         struct mirror_info *p = conf->mirrors;
 819
 820         for (j = 0; j < disks; j++, p++)
 821                 if (p->operational && !p->write_only) {
 822                         next = p->next;
 823                         p->next = mirror->raid_disk;
 824                         mirror->next = next;
 825                         return;
 826                 }
 827
 828         printk("raid1: bug: no read-operational devices\n");
 829 }
 830
 831 static void print_raid1_conf (raid1_conf_t *conf)
 832 {
 833         int i;
 834         struct mirror_info *tmp;
 835
 836         printk("RAID1 conf printout:\n");
 837         if (!conf) {
 838                 printk("(conf==NULL)\n");
 839                 return;
 840         }
 841         printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
 842                          conf->raid_disks, conf->nr_disks);
 843
 844         for (i = 0; i < MD_SB_DISKS; i++) {
 845                 tmp = conf->mirrors + i;
 846                 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
 847                         i, tmp->spare,tmp->operational,
 848                         tmp->number,tmp->raid_disk,tmp->used_slot,
 849                         partition_name(tmp->dev));
 850         }
 851 }
 852
 853 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
 854 {
 855         int err = 0;
 856         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
 857         raid1_conf_t *conf = mddev->private;
 858         struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
 859         mdp_super_t *sb = mddev->sb;
 860         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
 861
 862         print_raid1_conf(conf);
 863         md_spin_lock_irq(&conf->device_lock);
 864         /*
 865          * find the disk ...
 866          */
 867         switch (state) {
 868
 869         case DISKOP_SPARE_ACTIVE:
 870
 871                 /*
 872                  * Find the failed disk within the RAID1 configuration ...
 873                  * (this can only be in the first conf->working_disks part)
 874                  */
 875                 for (i = 0; i < conf->raid_disks; i++) {
 876                         tmp = conf->mirrors + i;
 877                         if ((!tmp->operational && !tmp->spare) ||
 878                                         !tmp->used_slot) {
 879                                 failed_disk = i;
 880                                 break;
 881                         }
 882                 }
 883                 /*
 884                  * When we activate a spare disk we _must_ have a disk in
 885                  * the lower (active) part of the array to replace.
 886                  */
 887                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
 888                         MD_BUG();
 889                         err = 1;
 890                         goto abort;
 891                 }
 892                 /* fall through */
 893
 894         case DISKOP_SPARE_WRITE:
 895         case DISKOP_SPARE_INACTIVE:
 896
 897                 /*
 898                  * Find the spare disk ... (can only be in the 'high'
 899                  * area of the array)
 900                  */
 901                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
 902                         tmp = conf->mirrors + i;
 903                         if (tmp->spare && tmp->number == (*d)->number) {
 904                                 spare_disk = i;
 905                                 break;
 906                         }
 907                 }
 908                 if (spare_disk == -1) {
 909                         MD_BUG();
 910                         err = 1;
 911                         goto abort;
 912                 }
 913                 break;
 914
 915         case DISKOP_HOT_REMOVE_DISK:
 916
 917                 for (i = 0; i < MD_SB_DISKS; i++) {
 918                         tmp = conf->mirrors + i;
 919                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
 920                                 if (tmp->operational) {
 921                                         err = -EBUSY;
 922                                         goto abort;
 923                                 }
 924                                 removed_disk = i;
 925                                 break;
 926                         }
 927                 }
 928                 if (removed_disk == -1) {
 929                         MD_BUG();
 930                         err = 1;
 931                         goto abort;
 932                 }
 933                 break;
 934
 935         case DISKOP_HOT_ADD_DISK:
 936
 937                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
 938                         tmp = conf->mirrors + i;
 939                         if (!tmp->used_slot) {
 940                                 added_disk = i;
 941                                 break;
 942                         }
 943                 }
 944                 if (added_disk == -1) {
 945                         MD_BUG();
 946                         err = 1;
 947                         goto abort;
 948                 }
 949                 break;
 950         }
 951
 952         switch (state) {
 953         /*
 954          * Switch the spare disk to write-only mode:
 955          */
 956         case DISKOP_SPARE_WRITE:
 957                 sdisk = conf->mirrors + spare_disk;
 958                 sdisk->operational = 1;
 959                 sdisk->write_only = 1;
 960                 break;
 961         /*
 962          * Deactivate a spare disk:
 963          */
 964         case DISKOP_SPARE_INACTIVE:
 965                 sdisk = conf->mirrors + spare_disk;
 966                 sdisk->operational = 0;
 967                 sdisk->write_only = 0;
 968                 break;
 969         /*
 970          * Activate (mark read-write) the (now sync) spare disk,
 971          * which means we switch it's 'raid position' (->raid_disk)
 972          * with the failed disk. (only the first 'conf->nr_disks'
 973          * slots are used for 'real' disks and we must preserve this
 974          * property)
 975          */
 976         case DISKOP_SPARE_ACTIVE:
 977
 978                 sdisk = conf->mirrors + spare_disk;
 979                 fdisk = conf->mirrors + failed_disk;
 980
 981                 spare_desc = &sb->disks[sdisk->number];
 982                 failed_desc = &sb->disks[fdisk->number];
 983
 984                 if (spare_desc != *d) {
 985                         MD_BUG();
 986                         err = 1;
 987                         goto abort;
 988                 }
 989
 990                 if (spare_desc->raid_disk != sdisk->raid_disk) {
 991                         MD_BUG();
 992                         err = 1;
 993                         goto abort;
 994                 }
 995
 996                 if (sdisk->raid_disk != spare_disk) {
 997                         MD_BUG();
 998                         err = 1;
 999                         goto abort;
1000                 }
1001
1002                 if (failed_desc->raid_disk != fdisk->raid_disk) {
1003                         MD_BUG();
1004                         err = 1;
1005                         goto abort;
1006                 }
1007
1008                 if (fdisk->raid_disk != failed_disk) {
1009                         MD_BUG();
1010                         err = 1;
1011                         goto abort;
1012                 }
1013
1014                 /*
1015                  * do the switch finally
1016                  */
1017                 xchg_values(*spare_desc, *failed_desc);
1018                 xchg_values(*fdisk, *sdisk);
1019
1020                 /*
1021                  * (careful, 'failed' and 'spare' are switched from now on)
1022                  *
1023                  * we want to preserve linear numbering and we want to
1024                  * give the proper raid_disk number to the now activated
1025                  * disk. (this means we switch back these values)
1026                  */
1027
1028                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1029                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1030                 xchg_values(spare_desc->number, failed_desc->number);
1031                 xchg_values(sdisk->number, fdisk->number);
1032
1033                 *d = failed_desc;
1034
1035                 if (sdisk->dev == MKDEV(0,0))
1036                         sdisk->used_slot = 0;
1037                 /*
1038                  * this really activates the spare.
1039                  */
1040                 fdisk->spare = 0;
1041                 fdisk->write_only = 0;
1042                 link_disk(conf, fdisk);
1043
1044                 /*
1045                  * if we activate a spare, we definitely replace a
1046                  * non-operational disk slot in the 'low' area of
1047                  * the disk array.
1048                  */
1049
1050                 conf->working_disks++;
1051
1052                 break;
1053
1054         case DISKOP_HOT_REMOVE_DISK:
1055                 rdisk = conf->mirrors + removed_disk;
1056
1057                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1058                         MD_BUG();
1059                         err = 1;
1060                         goto abort;
1061                 }
1062                 rdisk->dev = MKDEV(0,0);
1063                 rdisk->used_slot = 0;
1064                 conf->nr_disks--;
1065                 break;
1066
1067         case DISKOP_HOT_ADD_DISK:
1068                 adisk = conf->mirrors + added_disk;
1069                 added_desc = *d;
1070
1071                 if (added_disk != added_desc->number) {
1072                         MD_BUG();
1073                         err = 1;
1074                         goto abort;
1075                 }
1076
1077                 adisk->number = added_desc->number;
1078                 adisk->raid_disk = added_desc->raid_disk;
1079                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1080
1081                 adisk->operational = 0;
1082                 adisk->write_only = 0;
1083                 adisk->spare = 1;
1084                 adisk->used_slot = 1;
1085                 adisk->head_position = 0;
1086                 conf->nr_disks++;
1087
1088                 break;
1089
1090         default:
1091                 MD_BUG();
1092                 err = 1;
1093                 goto abort;
1094         }
1095 abort:
1096         md_spin_unlock_irq(&conf->device_lock);
1097         if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1098                 /* should move to "END_REBUILD" when such exists */
1099                 raid1_shrink_buffers(conf);
1100
1101         print_raid1_conf(conf);
1102         return err;
1103 }
1104
1105
1106 #define IO_ERROR KERN_ALERT \
1107 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1108
1109 #define REDIRECT_SECTOR KERN_ERR \
1110 "raid1: %s: redirecting sector %lu to another mirror\n"
1111
1112 /*
1113  * This is a kernel thread which:
1114  *
1115  *      1.      Retries failed read operations on working mirrors.
1116  *      2.      Updates the raid superblock when problems encounter.
1117  *      3.      Performs writes following reads for array syncronising.
1118  */
1119 static void end_sync_write(struct buffer_head *bh, int uptodate);
1120 static void end_sync_read(struct buffer_head *bh, int uptodate);
1121
1122 static void raid1d (void *data)
1123 {
1124         struct raid1_bh *r1_bh;
1125         struct buffer_head *bh;
1126         unsigned long flags;
1127         request_queue_t *q;
1128         mddev_t *mddev;
1129         kdev_t dev;
1130
1131
1132         for (;;) {
1133                 md_spin_lock_irqsave(&retry_list_lock, flags);
1134                 r1_bh = raid1_retry_list;
1135                 if (!r1_bh)
1136                         break;
1137                 raid1_retry_list = r1_bh->next_r1;
1138                 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1139
1140                 mddev = r1_bh->mddev;
1141                 if (mddev->sb_dirty) {
1142                         printk(KERN_INFO "dirty sb detected, updating.\n");
1143                         mddev->sb_dirty = 0;
1144                         md_update_sb(mddev);
1145                 }
1146                 bh = &r1_bh->bh_req;
1147                 switch(r1_bh->cmd) {
1148                 case SPECIAL:
1149                         /* have to allocate lots of bh structures and
1150                          * schedule writes
1151                          */
1152                         if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1153                                 int i, sum_bhs = 0;
1154                                 int disks = MD_SB_DISKS;
1155                                 struct buffer_head *bhl, *mbh;
1156                                 raid1_conf_t *conf;
1157                                 int sectors = bh->b_size >> 9;
1158
1159                                 conf = mddev_to_conf(mddev);
1160                                 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1161                                 for (i = 0; i < disks ; i++) {
1162                                         if (!conf->mirrors[i].operational)
1163                                                 continue;
1164                                         if (i==conf->last_used)
1165                                                 /* we read from here, no need to write */
1166                                                 continue;
1167                                         if (i < conf->raid_disks
1168                                             && !conf->resync_mirrors)
1169                                                 /* don't need to write this,
1170                                                  * we are just rebuilding */
1171                                                 continue;
1172                                         mbh = bhl;
1173                                         if (!mbh) {
1174                                                 MD_BUG();
1175                                                 break;
1176                                         }
1177                                         bhl = mbh->b_next;
1178                                         mbh->b_this_page = (struct buffer_head *)1;
1179
1180
1181                                 /*
1182                                  * prepare mirrored bh (fields ordered for max mem throughput):
1183                                  */
1184                                         mbh->b_blocknr    = bh->b_blocknr;
1185                                         mbh->b_dev        = conf->mirrors[i].dev;
1186                                         mbh->b_rdev       = conf->mirrors[i].dev;
1187                                         mbh->b_rsector    = bh->b_blocknr * sectors;
1188                                         mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1189                                                 (1<<BH_Mapped) | (1<<BH_Lock);
1190                                         atomic_set(&mbh->b_count, 1);
1191                                         mbh->b_size       = bh->b_size;
1192                                         mbh->b_page       = bh->b_page;
1193                                         mbh->b_data       = bh->b_data;
1194                                         mbh->b_list       = BUF_LOCKED;
1195                                         mbh->b_end_io     = end_sync_write;
1196                                         mbh->b_dev_id     = r1_bh;
1197
1198                                         mbh->b_next = r1_bh->mirror_bh_list;
1199                                         r1_bh->mirror_bh_list = mbh;
1200
1201                                         sum_bhs++;
1202                                 }
1203                                 md_atomic_set(&r1_bh->remaining, sum_bhs);
1204                                 if (bhl) raid1_free_bh(conf, bhl);
1205                                 mbh = r1_bh->mirror_bh_list;
1206                                 while (mbh) {
1207                                         struct buffer_head *bh1 = mbh;
1208                                         mbh = mbh->b_next;
1209                                         q = blk_get_queue(bh1->b_rdev);
1210                                         generic_make_request(q, WRITE, bh1);
1211                                         drive_stat_acct(bh1->b_rdev, WRITE, -bh1->b_size/512, 0);
1212                                 }
1213                         } else {
1214                                 dev = bh->b_dev;
1215                                 raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1216                                 if (bh->b_dev == dev) {
1217                                         printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1218                                         md_done_sync(mddev, bh->b_size>>10, 0);
1219                                 } else {
1220                                         printk (REDIRECT_SECTOR,
1221                                                 partition_name(bh->b_dev), bh->b_blocknr);
1222                                         bh->b_rdev = bh->b_dev;
1223                                         q = blk_get_queue(bh->b_rdev);
1224                                         generic_make_request (q, READ, bh);
1225                                 }
1226                         }
1227
1228                         break;
1229                 case READ:
1230                 case READA:
1231                         dev = bh->b_dev;
1232
1233                         raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1234                         if (bh->b_dev == dev) {
1235                                 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1236                                 raid1_end_bh_io(r1_bh, 0);
1237                         } else {
1238                                 printk (REDIRECT_SECTOR,
1239                                         partition_name(bh->b_dev), bh->b_blocknr);
1240                                 bh->b_rdev = bh->b_dev;
1241                                 q = blk_get_queue(bh->b_rdev);
1242                                 generic_make_request (q, r1_bh->cmd, bh);
1243                         }
1244                         break;
1245                 }
1246         }
1247         md_spin_unlock_irqrestore(&retry_list_lock, flags);
1248 }
1249 #undef IO_ERROR
1250 #undef REDIRECT_SECTOR
1251
1252 /*
1253  * Private kernel thread to reconstruct mirrors after an unclean
1254  * shutdown.
1255  */
1256 static void raid1syncd (void *data)
1257 {
1258         raid1_conf_t *conf = data;
1259         mddev_t *mddev = conf->mddev;
1260
1261         if (!conf->resync_mirrors)
1262                 return;
1263         if (conf->resync_mirrors == 2)
1264                 return;
1265         down(&mddev->recovery_sem);
1266         if (!md_do_sync(mddev, NULL)) {
1267                 /*
1268                  * Only if everything went Ok.
1269                  */
1270                 conf->resync_mirrors = 0;
1271         }
1272
1273         /* If reconstruction was interrupted, we need to close the "active" and "pending"
1274          * holes.
1275          * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
1276          */
1277         /* this is really needed when recovery stops too... */
1278         spin_lock_irq(&conf->segment_lock);
1279         conf->start_active = conf->start_pending;
1280         conf->start_ready = conf->start_pending;
1281         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
1282         conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
1283         conf->start_future = mddev->sb->size+1;
1284         conf->cnt_pending = conf->cnt_future;
1285         conf->cnt_future = 0;
1286         conf->phase = conf->phase ^1;
1287         wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
1288         conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
1289         conf->phase = 0;
1290         conf->cnt_future = conf->cnt_done;;
1291         conf->cnt_done = 0;
1292         spin_unlock_irq(&conf->segment_lock);
1293         wake_up(&conf->wait_done);
1294
1295         up(&mddev->recovery_sem);
1296         raid1_shrink_buffers(conf);
1297 }
1298
1299 /*
1300  * perform a "sync" on one "block"
1301  *
1302  * We need to make sure that no normal I/O request - particularly write
1303  * requests - conflict with active sync requests.
1304  * This is achieved by conceptually dividing the device space into a
1305  * number of sections:
1306  *  DONE: 0 .. a-1     These blocks are in-sync
1307  *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1308  *                     no normal IO requests
1309  *  READY: b .. c-1    These blocks have no normal IO requests - sync
1310  *                     request may be happening
1311  *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1312  *                     ones will be added
1313  *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1314  *                     be happening, but not sync
1315  *
1316  * We keep a
1317  *   phase    which flips (0 or 1) each time d moves and
1318  * a count of:
1319  *   z =  active io requests in FUTURE since d moved - marked with
1320  *        current phase
1321  *   y =  active io requests in FUTURE before d moved, or PENDING -
1322  *        marked with previous phase
1323  *   x =  active sync requests in READY
1324  *   w =  active sync requests in ACTIVE
1325  *   v =  active io requests in DONE
1326  *
1327  * Normally, a=b=c=d=0 and z= active io requests
1328  *   or a=b=c=d=END and v= active io requests
1329  * Allowed changes to a,b,c,d:
1330  * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1331  * B:  y==0 -> c=d
1332  * C:   b=c, w+=x, x=0
1333  * D:  w==0 -> a=b
1334  * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1335  *
1336  * At start of sync we apply A.
1337  * When y reaches 0, we apply B then A then being sync requests
1338  * When sync point reaches c-1, we wait for y==0, and W==0, and
1339  * then apply apply B then A then D then C.
1340  * Finally, we apply E
1341  *
1342  * The sync request simply issues a "read" against a working drive
1343  * This is marked so that on completion the raid1d thread is woken to
1344  * issue suitable write requests
1345  */
1346
1347 static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr)
1348 {
1349         raid1_conf_t *conf = mddev_to_conf(mddev);
1350         struct mirror_info *mirror;
1351         request_queue_t *q;
1352         struct raid1_bh *r1_bh;
1353         struct buffer_head *bh;
1354         int bsize;
1355
1356         spin_lock_irq(&conf->segment_lock);
1357         if (!block_nr) {
1358                 /* initialize ...*/
1359                 int buffs;
1360                 conf->start_active = 0;
1361                 conf->start_ready = 0;
1362                 conf->start_pending = 0;
1363                 conf->start_future = 0;
1364                 conf->phase = 0;
1365                 /* we want enough buffers to hold twice the window of 128*/
1366                 buffs = 128 *2 / (PAGE_SIZE>>9);
1367                 buffs = raid1_grow_buffers(conf, buffs);
1368                 if (buffs < 2)
1369                         goto nomem;
1370
1371                 conf->window = buffs*(PAGE_SIZE>>9)/2;
1372                 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1373                 conf->cnt_done = conf->cnt_pending = 0;
1374                 if (conf->cnt_ready || conf->cnt_active)
1375                         MD_BUG();
1376         }
1377         while ((block_nr<<1) >= conf->start_pending) {
1378                 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1379                         block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1380                         conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1381                 wait_event_lock_irq(conf->wait_done,
1382                                         !conf->cnt_active,
1383                                         conf->segment_lock);
1384                 wait_event_lock_irq(conf->wait_ready,
1385                                         !conf->cnt_pending,
1386                                         conf->segment_lock);
1387                 conf->start_active = conf->start_ready;
1388                 conf->start_ready = conf->start_pending;
1389                 conf->start_pending = conf->start_future;
1390                 conf->start_future = conf->start_future+conf->window;
1391                 // Note: falling off the end is not a problem
1392                 conf->phase = conf->phase ^1;
1393                 conf->cnt_active = conf->cnt_ready;
1394                 conf->cnt_ready = 0;
1395                 conf->cnt_pending = conf->cnt_future;
1396                 conf->cnt_future = 0;
1397                 wake_up(&conf->wait_done);
1398         }
1399         conf->cnt_ready++;
1400         spin_unlock_irq(&conf->segment_lock);
1401
1402
1403         /* If reconstructing, and >1 working disc,
1404          * could dedicate one to rebuild and others to
1405          * service read requests ..
1406          */
1407         mirror = conf->mirrors+conf->last_used;
1408
1409         r1_bh = raid1_alloc_buf (conf);
1410         r1_bh->master_bh = NULL;
1411         r1_bh->mddev = mddev;
1412         r1_bh->cmd = SPECIAL;
1413         bh = &r1_bh->bh_req;
1414
1415         bh->b_blocknr = block_nr;
1416         bsize = 1024;
1417         while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE
1418                         && (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) {
1419                 bh->b_blocknr >>= 1;
1420                 bsize <<= 1;
1421         }
1422         bh->b_size = bsize;
1423         bh->b_list = BUF_LOCKED;
1424         bh->b_dev = mirror->dev;
1425         bh->b_rdev = mirror->dev;
1426         bh->b_state = (1<<BH_Req) | (1<<BH_Mapped);
1427         if (!bh->b_page)
1428                 BUG();
1429         if (!bh->b_data)
1430                 BUG();
1431         if (bh->b_data != (char *) page_address(bh->b_page))
1432                 BUG();
1433         bh->b_end_io = end_sync_read;
1434         bh->b_dev_id = (void *) r1_bh;
1435         bh->b_rsector = block_nr<<1;
1436         init_waitqueue_head(&bh->b_wait);
1437
1438         q = blk_get_queue(bh->b_rdev);
1439         generic_make_request(q, READ, bh);
1440         drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0);
1441
1442         return (bsize >> 10);
1443
1444 nomem:
1445         raid1_shrink_buffers(conf);
1446         spin_unlock_irq(&conf->segment_lock);
1447         return -ENOMEM;
1448 }
1449
1450 static void end_sync_read(struct buffer_head *bh, int uptodate)
1451 {
1452         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
1453
1454         /* we have read a block, now it needs to be re-written,
1455          * or re-read if the read failed.
1456          * We don't do much here, just schedule handling by raid1d
1457          */
1458         if (!uptodate)
1459                 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1460         else
1461                 set_bit(R1BH_Uptodate, &r1_bh->state);
1462         raid1_reschedule_retry(r1_bh);
1463 }
1464
1465 static void end_sync_write(struct buffer_head *bh, int uptodate)
1466 {
1467         struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
1468
1469         if (!uptodate)
1470                 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1471         if (atomic_dec_and_test(&r1_bh->remaining)) {
1472                 mddev_t *mddev = r1_bh->mddev;
1473                 unsigned long sect = bh->b_blocknr * (bh->b_size>>9);
1474                 int size = bh->b_size;
1475                 raid1_free_buf(r1_bh);
1476                 sync_request_done(sect, mddev_to_conf(mddev));
1477                 md_done_sync(mddev,size>>10, uptodate);
1478         }
1479 }
1480
1481 /*
1482  * This will catch the scenario in which one of the mirrors was
1483  * mounted as a normal device rather than as a part of a raid set.
1484  *
1485  * check_consistency is very personality-dependent, eg. RAID5 cannot
1486  * do this check, it uses another method.
1487  */
1488 static int __check_consistency (mddev_t *mddev, int row)
1489 {
1490         raid1_conf_t *conf = mddev_to_conf(mddev);
1491         int disks = MD_SB_DISKS;
1492         kdev_t dev;
1493         struct buffer_head *bh = NULL;
1494         int i, rc = 0;
1495         char *buffer = NULL;
1496
1497         for (i = 0; i < disks; i++) {
1498                 printk("(checking disk %d)\n",i);
1499                 if (!conf->mirrors[i].operational)
1500                         continue;
1501                 printk("(really checking disk %d)\n",i);
1502                 dev = conf->mirrors[i].dev;
1503                 set_blocksize(dev, 4096);
1504                 if ((bh = bread(dev, row / 4, 4096)) == NULL)
1505                         break;
1506                 if (!buffer) {
1507                         buffer = (char *) __get_free_page(GFP_KERNEL);
1508                         if (!buffer)
1509                                 break;
1510                         memcpy(buffer, bh->b_data, 4096);
1511                 } else if (memcmp(buffer, bh->b_data, 4096)) {
1512                         rc = 1;
1513                         break;
1514                 }
1515                 bforget(bh);
1516                 fsync_dev(dev);
1517                 invalidate_buffers(dev);
1518                 bh = NULL;
1519         }
1520         if (buffer)
1521                 free_page((unsigned long) buffer);
1522         if (bh) {
1523                 dev = bh->b_dev;
1524                 bforget(bh);
1525                 fsync_dev(dev);
1526                 invalidate_buffers(dev);
1527         }
1528         return rc;
1529 }
1530
1531 static int check_consistency (mddev_t *mddev)
1532 {
1533         if (__check_consistency(mddev, 0))
1534 /*
1535  * we do not do this currently, as it's perfectly possible to
1536  * have an inconsistent array when it's freshly created. Only
1537  * newly written data has to be consistent.
1538  */
1539                 return 0;
1540
1541         return 0;
1542 }
1543
1544 #define INVALID_LEVEL KERN_WARNING \
1545 "raid1: md%d: raid level not set to mirroring (%d)\n"
1546
1547 #define NO_SB KERN_ERR \
1548 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1549
1550 #define ERRORS KERN_ERR \
1551 "raid1: disabled mirror %s (errors detected)\n"
1552
1553 #define NOT_IN_SYNC KERN_ERR \
1554 "raid1: disabled mirror %s (not in sync)\n"
1555
1556 #define INCONSISTENT KERN_ERR \
1557 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1558
1559 #define ALREADY_RUNNING KERN_ERR \
1560 "raid1: disabled mirror %s (mirror %d already operational)\n"
1561
1562 #define OPERATIONAL KERN_INFO \
1563 "raid1: device %s operational as mirror %d\n"
1564
1565 #define MEM_ERROR KERN_ERR \
1566 "raid1: couldn't allocate memory for md%d\n"
1567
1568 #define SPARE KERN_INFO \
1569 "raid1: spare disk %s\n"
1570
1571 #define NONE_OPERATIONAL KERN_ERR \
1572 "raid1: no operational mirrors for md%d\n"
1573
1574 #define RUNNING_CKRAID KERN_ERR \
1575 "raid1: detected mirror differences -- running resync\n"
1576
1577 #define ARRAY_IS_ACTIVE KERN_INFO \
1578 "raid1: raid set md%d active with %d out of %d mirrors\n"
1579
1580 #define THREAD_ERROR KERN_ERR \
1581 "raid1: couldn't allocate thread for md%d\n"
1582
1583 #define START_RESYNC KERN_WARNING \
1584 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1585
1586 static int raid1_run (mddev_t *mddev)
1587 {
1588         raid1_conf_t *conf;
1589         int i, j, disk_idx;
1590         struct mirror_info *disk;
1591         mdp_super_t *sb = mddev->sb;
1592         mdp_disk_t *descriptor;
1593         mdk_rdev_t *rdev;
1594         struct md_list_head *tmp;
1595         int start_recovery = 0;
1596
1597         MOD_INC_USE_COUNT;
1598
1599         if (sb->level != 1) {
1600                 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1601                 goto out;
1602         }
1603         /*
1604          * copy the already verified devices into our private RAID1
1605          * bookkeeping area. [whatever we allocate in raid1_run(),
1606          * should be freed in raid1_stop()]
1607          */
1608
1609         conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1610         mddev->private = conf;
1611         if (!conf) {
1612                 printk(MEM_ERROR, mdidx(mddev));
1613                 goto out;
1614         }
1615         memset(conf, 0, sizeof(*conf));
1616
1617         ITERATE_RDEV(mddev,rdev,tmp) {
1618                 if (rdev->faulty) {
1619                         printk(ERRORS, partition_name(rdev->dev));
1620                 } else {
1621                         if (!rdev->sb) {
1622                                 MD_BUG();
1623                                 continue;
1624                         }
1625                 }
1626                 if (rdev->desc_nr == -1) {
1627                         MD_BUG();
1628                         continue;
1629                 }
1630                 descriptor = &sb->disks[rdev->desc_nr];
1631                 disk_idx = descriptor->raid_disk;
1632                 disk = conf->mirrors + disk_idx;
1633
1634                 if (disk_faulty(descriptor)) {
1635                         disk->number = descriptor->number;
1636                         disk->raid_disk = disk_idx;
1637                         disk->dev = rdev->dev;
1638                         disk->sect_limit = MAX_WORK_PER_DISK;
1639                         disk->operational = 0;
1640                         disk->write_only = 0;
1641                         disk->spare = 0;
1642                         disk->used_slot = 1;
1643                         disk->head_position = 0;
1644                         continue;
1645                 }
1646                 if (disk_active(descriptor)) {
1647                         if (!disk_sync(descriptor)) {
1648                                 printk(NOT_IN_SYNC,
1649                                         partition_name(rdev->dev));
1650                                 continue;
1651                         }
1652                         if ((descriptor->number > MD_SB_DISKS) ||
1653                                          (disk_idx > sb->raid_disks)) {
1654
1655                                 printk(INCONSISTENT,
1656                                         partition_name(rdev->dev));
1657                                 continue;
1658                         }
1659                         if (disk->operational) {
1660                                 printk(ALREADY_RUNNING,
1661                                         partition_name(rdev->dev),
1662                                         disk_idx);
1663                                 continue;
1664                         }
1665                         printk(OPERATIONAL, partition_name(rdev->dev),
1666                                         disk_idx);
1667                         disk->number = descriptor->number;
1668                         disk->raid_disk = disk_idx;
1669                         disk->dev = rdev->dev;
1670                         disk->sect_limit = MAX_WORK_PER_DISK;
1671                         disk->operational = 1;
1672                         disk->write_only = 0;
1673                         disk->spare = 0;
1674                         disk->used_slot = 1;
1675                         disk->head_position = 0;
1676                         conf->working_disks++;
1677                 } else {
1678                 /*
1679                  * Must be a spare disk ..
1680                  */
1681                         printk(SPARE, partition_name(rdev->dev));
1682                         disk->number = descriptor->number;
1683                         disk->raid_disk = disk_idx;
1684                         disk->dev = rdev->dev;
1685                         disk->sect_limit = MAX_WORK_PER_DISK;
1686                         disk->operational = 0;
1687                         disk->write_only = 0;
1688                         disk->spare = 1;
1689                         disk->used_slot = 1;
1690                         disk->head_position = 0;
1691                 }
1692         }
1693         conf->raid_disks = sb->raid_disks;
1694         conf->nr_disks = sb->nr_disks;
1695         conf->mddev = mddev;
1696         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1697
1698         conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1699         init_waitqueue_head(&conf->wait_buffer);
1700         init_waitqueue_head(&conf->wait_done);
1701         init_waitqueue_head(&conf->wait_ready);
1702
1703         if (!conf->working_disks) {
1704                 printk(NONE_OPERATIONAL, mdidx(mddev));
1705                 goto out_free_conf;
1706         }
1707
1708
1709         /* pre-allocate some buffer_head structures.
1710          * As a minimum, 1 r1bh and raid_disks buffer_heads
1711          * would probably get us by in tight memory situations,
1712          * but a few more is probably a good idea.
1713          * For now, try 16 r1bh and 16*raid_disks bufferheads
1714          * This will allow at least 16 concurrent reads or writes
1715          * even if kmalloc starts failing
1716          */
1717         if (raid1_grow_r1bh(conf, 16) < 16 ||
1718             raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
1719                 printk(MEM_ERROR, mdidx(mddev));
1720                 goto out_free_conf;
1721         }
1722
1723         for (i = 0; i < MD_SB_DISKS; i++) {
1724
1725                 descriptor = sb->disks+i;
1726                 disk_idx = descriptor->raid_disk;
1727                 disk = conf->mirrors + disk_idx;
1728
1729                 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1730                                 !disk->used_slot) {
1731
1732                         disk->number = descriptor->number;
1733                         disk->raid_disk = disk_idx;
1734                         disk->dev = MKDEV(0,0);
1735
1736                         disk->operational = 0;
1737                         disk->write_only = 0;
1738                         disk->spare = 0;
1739                         disk->used_slot = 1;
1740                         disk->head_position = 0;
1741                 }
1742         }
1743
1744         /*
1745          * find the first working one and use it as a starting point
1746          * to read balancing.
1747          */
1748         for (j = 0; !conf->mirrors[j].operational; j++)
1749                 /* nothing */;
1750         conf->last_used = j;
1751
1752         /*
1753          * initialize the 'working disks' list.
1754          */
1755         for (i = conf->raid_disks - 1; i >= 0; i--) {
1756                 if (conf->mirrors[i].operational) {
1757                         conf->mirrors[i].next = j;
1758                         j = i;
1759                 }
1760         }
1761
1762         if (conf->working_disks != sb->raid_disks) {
1763                 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1764                 start_recovery = 1;
1765         }
1766
1767         if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
1768                 /*
1769                  * we do sanity checks even if the device says
1770                  * it's clean ...
1771                  */
1772                 if (check_consistency(mddev)) {
1773                         printk(RUNNING_CKRAID);
1774                         sb->state &= ~(1 << MD_SB_CLEAN);
1775                 }
1776         }
1777
1778         {
1779                 const char * name = "raid1d";
1780
1781                 conf->thread = md_register_thread(raid1d, conf, name);
1782                 if (!conf->thread) {
1783                         printk(THREAD_ERROR, mdidx(mddev));
1784                         goto out_free_conf;
1785                 }
1786         }
1787
1788         if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1789                 const char * name = "raid1syncd";
1790
1791                 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1792                 if (!conf->resync_thread) {
1793                         printk(THREAD_ERROR, mdidx(mddev));
1794                         goto out_free_conf;
1795                 }
1796
1797                 printk(START_RESYNC, mdidx(mddev));
1798                 conf->resync_mirrors = 1;
1799                 md_wakeup_thread(conf->resync_thread);
1800         }
1801
1802         /*
1803          * Regenerate the "device is in sync with the raid set" bit for
1804          * each device.
1805          */
1806         for (i = 0; i < MD_SB_DISKS; i++) {
1807                 mark_disk_nonsync(sb->disks+i);
1808                 for (j = 0; j < sb->raid_disks; j++) {
1809                         if (!conf->mirrors[j].operational)
1810                                 continue;
1811                         if (sb->disks[i].number == conf->mirrors[j].number)
1812                                 mark_disk_sync(sb->disks+i);
1813                 }
1814         }
1815         sb->active_disks = conf->working_disks;
1816
1817         if (start_recovery)
1818                 md_recover_arrays();
1819
1820
1821         printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1822         /*
1823          * Ok, everything is just fine now
1824          */
1825         return 0;
1826
1827 out_free_conf:
1828         raid1_shrink_r1bh(conf);
1829         raid1_shrink_bh(conf, conf->freebh_cnt);
1830         raid1_shrink_buffers(conf);
1831         kfree(conf);
1832         mddev->private = NULL;
1833 out:
1834         MOD_DEC_USE_COUNT;
1835         return -EIO;
1836 }
1837
1838 #undef INVALID_LEVEL
1839 #undef NO_SB
1840 #undef ERRORS
1841 #undef NOT_IN_SYNC
1842 #undef INCONSISTENT
1843 #undef ALREADY_RUNNING
1844 #undef OPERATIONAL
1845 #undef SPARE
1846 #undef NONE_OPERATIONAL
1847 #undef RUNNING_CKRAID
1848 #undef ARRAY_IS_ACTIVE
1849
1850 static int raid1_stop_resync (mddev_t *mddev)
1851 {
1852         raid1_conf_t *conf = mddev_to_conf(mddev);
1853
1854         if (conf->resync_thread) {
1855                 if (conf->resync_mirrors) {
1856                         conf->resync_mirrors = 2;
1857                         md_interrupt_thread(conf->resync_thread);
1858
1859                         printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1860                         return 1;
1861                 }
1862                 return 0;
1863         }
1864         return 0;
1865 }
1866
1867 static int raid1_restart_resync (mddev_t *mddev)
1868 {
1869         raid1_conf_t *conf = mddev_to_conf(mddev);
1870
1871         if (conf->resync_mirrors) {
1872                 if (!conf->resync_thread) {
1873                         MD_BUG();
1874                         return 0;
1875                 }
1876                 conf->resync_mirrors = 1;
1877                 md_wakeup_thread(conf->resync_thread);
1878                 return 1;
1879         }
1880         return 0;
1881 }
1882
1883 static int raid1_stop (mddev_t *mddev)
1884 {
1885         raid1_conf_t *conf = mddev_to_conf(mddev);
1886
1887         md_unregister_thread(conf->thread);
1888         if (conf->resync_thread)
1889                 md_unregister_thread(conf->resync_thread);
1890         raid1_shrink_r1bh(conf);
1891         raid1_shrink_bh(conf, conf->freebh_cnt);
1892         raid1_shrink_buffers(conf);
1893         kfree(conf);
1894         mddev->private = NULL;
1895         MOD_DEC_USE_COUNT;
1896         return 0;
1897 }
1898
1899 static mdk_personality_t raid1_personality=
1900 {
1901         name:           "raid1",
1902         make_request:   raid1_make_request,
1903         run:            raid1_run,
1904         stop:           raid1_stop,
1905         status:         raid1_status,
1906         error_handler:  raid1_error,
1907         diskop:         raid1_diskop,
1908         stop_resync:    raid1_stop_resync,
1909         restart_resync: raid1_restart_resync,
1910         sync_request:   raid1_sync_request
1911 };
1912
1913 int raid1_init (void)
1914 {
1915         return register_md_personality (RAID1, &raid1_personality);
1916 }
1917
1918 #ifdef MODULE
1919 int init_module (void)
1920 {
1921         return raid1_init();
1922 }
1923
1924 void cleanup_module (void)
1925 {
1926         unregister_md_personality (RAID1);
1927 }
1928 #endif