Import 2.4.0-test5pre3
[davej-history.git] / drivers / block / raid1.c
1 /*
2 * raid1.c : Multiple Devices driver for Linux
4 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8 * RAID-1 management functions.
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
18 * any later version.
20 * You should have received a copy of the GNU General Public License
21 * (for example /usr/src/linux/COPYING); if not, write to the Free
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #include <linux/module.h>
26 #include <linux/malloc.h>
27 #include <linux/raid/raid1.h>
28 #include <asm/atomic.h>
30 #define MAJOR_NR MD_MAJOR
31 #define MD_DRIVER
32 #define MD_PERSONALITY
34 #define MAX_WORK_PER_DISK 128
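/*
 * MAX_WORK_PER_DISK becomes each mirror's ->sect_limit in raid1_run():
 * raid1_read_balance() keeps feeding sequential reads to the same mirror
 * until roughly this many sectors have been issued, after which it looks
 * for another operational mirror to spread the load.
 */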
37 * The following can be used to debug the driver
39 #define RAID1_DEBUG 0
41 #if RAID1_DEBUG
42 #define PRINTK(x...) printk(x)
43 #define inline
44 #define __inline__
45 #else
46 #define PRINTK(x...) do { } while (0)
47 #endif
50 static mdk_personality_t raid1_personality;
51 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
52 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
54 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
56 /* return a linked list of "cnt" struct buffer_heads.
57 * don't take any off the free list unless we know we can
58 * get all we need, otherwise we could deadlock
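 * For example, with cnt == 4 and only three buffer heads on the free
 * list, nothing is taken from the list; we kmalloc() what we can and
 * only dip into the free list once it can satisfy the whole remaining
 * request.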
60 struct buffer_head *bh=NULL;
62 while(cnt) {
63 struct buffer_head *t;
64 md_spin_lock_irq(&conf->device_lock);
65 if (conf->freebh_cnt >= cnt)
66 while (cnt) {
67 t = conf->freebh;
68 conf->freebh = t->b_next;
69 t->b_next = bh;
70 bh = t;
71 t->b_state = 0;
72 conf->freebh_cnt--;
73 cnt--;
75 md_spin_unlock_irq(&conf->device_lock);
76 if (cnt == 0)
77 break;
78 t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_BUFFER);
79 if (t) {
80 memset(t, 0, sizeof(*t));
81 t->b_next = bh;
82 bh = t;
83 cnt--;
84 } else {
85 PRINTK("waiting for %d bh\n", cnt);
86 wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
89 return bh;
92 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
94 md_spin_lock_irq(&conf->device_lock);
95 while (bh) {
96 struct buffer_head *t = bh;
97 bh=bh->b_next;
98 if (t->b_pprev == NULL)
99 kfree(t);
100 else {
101 t->b_next= conf->freebh;
102 conf->freebh = t;
103 conf->freebh_cnt++;
106 md_spin_unlock_irq(&conf->device_lock);
107 wake_up(&conf->wait_buffer);
110 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
112 /* allocate cnt buffer_heads, possibly less if kmalloc fails */
113 int i = 0;
115 while (i < cnt) {
116 struct buffer_head *bh;
117 bh = kmalloc(sizeof(*bh), GFP_KERNEL);
118 if (!bh) break;
119 memset(bh, 0, sizeof(*bh));
121 md_spin_lock_irq(&conf->device_lock);
122 bh->b_pprev = &conf->freebh;
123 bh->b_next = conf->freebh;
124 conf->freebh = bh;
125 conf->freebh_cnt++;
126 md_spin_unlock_irq(&conf->device_lock);
128 i++;
130 return i;
133 static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
135 /* discard cnt buffer_heads, if we can find them */
136 int i = 0;
138 md_spin_lock_irq(&conf->device_lock);
139 while ((i < cnt) && conf->freebh) {
140 struct buffer_head *bh = conf->freebh;
141 conf->freebh = bh->b_next;
142 kfree(bh);
143 i++;
144 conf->freebh_cnt--;
146 md_spin_unlock_irq(&conf->device_lock);
147 return i;
151 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
153 struct raid1_bh *r1_bh = NULL;
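/*
 * Allocation strategy: first try the pre-allocated free list, then a
 * GFP_BUFFER kmalloc(), and as a last resort sleep on wait_buffer until
 * a pre-allocated r1_bh is returned to the free list.
 */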
155 do {
156 md_spin_lock_irq(&conf->device_lock);
157 if (conf->freer1) {
158 r1_bh = conf->freer1;
159 conf->freer1 = r1_bh->next_r1;
160 r1_bh->next_r1 = NULL;
161 r1_bh->state = 0;
162 r1_bh->bh_req.b_state = 0;
164 md_spin_unlock_irq(&conf->device_lock);
165 if (r1_bh)
166 return r1_bh;
167 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh),
168 GFP_BUFFER);
169 if (r1_bh) {
170 memset(r1_bh, 0, sizeof(*r1_bh));
171 return r1_bh;
173 wait_event(conf->wait_buffer, conf->freer1);
174 } while (1);
177 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
179 struct buffer_head *bh = r1_bh->mirror_bh_list;
180 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
182 r1_bh->mirror_bh_list = NULL;
184 if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
185 md_spin_lock_irq(&conf->device_lock);
186 r1_bh->next_r1 = conf->freer1;
187 conf->freer1 = r1_bh;
188 md_spin_unlock_irq(&conf->device_lock);
189 } else {
190 kfree(r1_bh);
192 raid1_free_bh(conf, bh);
195 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
197 int i = 0;
199 while (i < cnt) {
200 struct raid1_bh *r1_bh;
201 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
202 if (!r1_bh)
203 break;
204 memset(r1_bh, 0, sizeof(*r1_bh));
206 md_spin_lock_irq(&conf->device_lock);
207 set_bit(R1BH_PreAlloc, &r1_bh->state);
208 r1_bh->next_r1 = conf->freer1;
209 conf->freer1 = r1_bh;
210 md_spin_unlock_irq(&conf->device_lock);
212 i++;
214 return i;
217 static void raid1_shrink_r1bh(raid1_conf_t *conf)
219 md_spin_lock_irq(&conf->device_lock);
220 while (conf->freer1) {
221 struct raid1_bh *r1_bh = conf->freer1;
222 conf->freer1 = r1_bh->next_r1;
223 kfree(r1_bh);
225 md_spin_unlock_irq(&conf->device_lock);
230 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
232 struct buffer_head *bh = r1_bh->mirror_bh_list;
233 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
234 r1_bh->mirror_bh_list = NULL;
236 md_spin_lock_irq(&conf->device_lock);
237 r1_bh->next_r1 = conf->freebuf;
238 conf->freebuf = r1_bh;
239 md_spin_unlock_irq(&conf->device_lock);
240 raid1_free_bh(conf, bh);
243 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
245 struct raid1_bh *r1_bh;
247 md_spin_lock_irq(&conf->device_lock);
248 wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
249 r1_bh = conf->freebuf;
250 conf->freebuf = r1_bh->next_r1;
251 r1_bh->next_r1= NULL;
252 md_spin_unlock_irq(&conf->device_lock);
254 return r1_bh;
257 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
259 int i = 0;
261 md_spin_lock_irq(&conf->device_lock);
262 while (i < cnt) {
263 struct raid1_bh *r1_bh;
264 struct page *page;
266 page = alloc_page(GFP_KERNEL);
267 if (!page)
268 break;
270 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
271 if (!r1_bh) {
272 __free_page(page);
273 break;
275 memset(r1_bh, 0, sizeof(*r1_bh));
276 r1_bh->bh_req.b_page = page;
277 r1_bh->bh_req.b_data = (char *) page_address(page);
278 r1_bh->next_r1 = conf->freebuf;
279 conf->freebuf = r1_bh;
280 i++;
282 md_spin_unlock_irq(&conf->device_lock);
283 return i;
286 static void raid1_shrink_buffers (raid1_conf_t *conf)
288 md_spin_lock_irq(&conf->device_lock);
289 while (conf->freebuf) {
290 struct raid1_bh *r1_bh = conf->freebuf;
291 conf->freebuf = r1_bh->next_r1;
292 __free_page(r1_bh->bh_req.b_page);
293 kfree(r1_bh);
295 md_spin_unlock_irq(&conf->device_lock);
298 static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size)
300 raid1_conf_t *conf = mddev_to_conf(mddev);
301 int i, disks = MD_SB_DISKS;
304 * Later we do read balancing on the read side;
305 * for now we use the first available disk.
308 for (i = 0; i < disks; i++) {
309 if (conf->mirrors[i].operational) {
310 *rdev = conf->mirrors[i].dev;
311 return (0);
315 printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
316 return (-1);
319 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
321 unsigned long flags;
322 mddev_t *mddev = r1_bh->mddev;
323 raid1_conf_t *conf = mddev_to_conf(mddev);
325 md_spin_lock_irqsave(&retry_list_lock, flags);
326 if (raid1_retry_list == NULL)
327 raid1_retry_tail = &raid1_retry_list;
328 *raid1_retry_tail = r1_bh;
329 raid1_retry_tail = &r1_bh->next_r1;
330 r1_bh->next_r1 = NULL;
331 md_spin_unlock_irqrestore(&retry_list_lock, flags);
332 md_wakeup_thread(conf->thread);
336 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
338 unsigned long flags;
339 spin_lock_irqsave(&conf->segment_lock, flags);
340 if (sector < conf->start_active)
341 conf->cnt_done--;
342 else if (sector >= conf->start_future && conf->phase == phase)
343 conf->cnt_future--;
344 else if (!--conf->cnt_pending)
345 wake_up(&conf->wait_ready);
347 spin_unlock_irqrestore(&conf->segment_lock, flags);
350 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
352 unsigned long flags;
353 spin_lock_irqsave(&conf->segment_lock, flags);
354 if (sector >= conf->start_ready)
355 --conf->cnt_ready;
356 else if (sector >= conf->start_active) {
357 if (!--conf->cnt_active) {
358 conf->start_active = conf->start_ready;
359 wake_up(&conf->wait_done);
362 spin_unlock_irqrestore(&conf->segment_lock, flags);
366 * raid1_end_bh_io() is called when we have finished servicing a mirrored
367 * operation and are ready to return a success/failure code to the buffer
368 * cache layer.
370 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
372 struct buffer_head *bh = r1_bh->master_bh;
374 io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
375 test_bit(R1BH_SyncPhase, &r1_bh->state));
377 bh->b_end_io(bh, uptodate);
378 raid1_free_r1bh(r1_bh);
380 void raid1_end_request (struct buffer_head *bh, int uptodate)
382 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
385 * this branch is our 'one mirror IO has finished' event handler:
387 if (!uptodate)
388 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
389 else
391 * Set R1BH_Uptodate in our master buffer_head, so that
392 * we will return a good error code to the higher
393 * levels even if IO on some other mirrored buffer fails.
395 * The 'master' represents the complex operation to
396 * user-side. So if something waits for IO, then it will
397 * wait for the 'master' buffer_head.
399 set_bit (R1BH_Uptodate, &r1_bh->state);
402 * We split up the read and write side, imho they are
403 * conceptually different.
406 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
408 * we have only one buffer_head on the read side
411 if (uptodate) {
412 raid1_end_bh_io(r1_bh, uptodate);
413 return;
416 * oops, read error:
418 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
419 partition_name(bh->b_dev), bh->b_blocknr);
420 raid1_reschedule_retry(r1_bh);
421 return;
425 * WRITE:
427 * Let's see if all mirrored write operations have finished
428 * already.
431 if (atomic_dec_and_test(&r1_bh->remaining))
432 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
436 * This routine returns the disk from which the requested read should
437 * be done. It keeps track of the last read position for every disk
438 * in the array, and when a new read request comes in, the disk whose
439 * last position is nearest to the request is chosen.
441 * TODO: now if there are 2 mirrors in the same 2 devices, performance
442 * degrades dramatically because position is mirror, not device based.
443 * This should be changed to be device based. Also atomic sequential
444 * reads should be somehow balanced.
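 *
 * Example (illustrative numbers, assuming two working mirrors and no
 * resync in progress): if mirror 0 last left its head at sector 1000 and
 * mirror 1 at sector 5000, a read at sector 4900 goes to mirror 1
 * (distance 100 vs. 3900); the chosen mirror's head_position is then
 * advanced to 4900 plus the request size in sectors.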
447 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
449 int new_disk = conf->last_used;
450 const int sectors = bh->b_size >> 9;
451 const long this_sector = bh->b_blocknr * sectors;
452 int disk = new_disk;
453 unsigned long new_distance;
454 unsigned long current_distance;
457 * Check if it is sane at all to balance
460 if (conf->resync_mirrors)
461 goto rb_out;
463 if (conf->working_disks < 2) {
464 int i = 0;
466 while( !conf->mirrors[new_disk].operational &&
467 (i < MD_SB_DISKS) ) {
468 new_disk = conf->mirrors[new_disk].next;
469 i++;
472 if (i >= MD_SB_DISKS) {
474 * This means no working disk was found.
475 * Nothing much to do, let's not change anything
476 * and hope for the best...
479 new_disk = conf->last_used;
482 goto rb_out;
486 * Don't touch anything for sequential reads.
489 if (this_sector == conf->mirrors[new_disk].head_position)
490 goto rb_out;
493 * If reads have been done only on a single disk
494 * for a time, let's give another disk a chance.
495 * This is for kicking those idling disks so that
496 * they would find work near some hotspot.
499 if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
500 conf->sect_count = 0;
502 while( new_disk != conf->mirrors[new_disk].next ) {
503 if ((conf->mirrors[new_disk].write_only) ||
504 (!conf->mirrors[new_disk].operational) )
505 continue;
507 new_disk = conf->mirrors[new_disk].next;
508 break;
511 goto rb_out;
514 current_distance = abs(this_sector -
515 conf->mirrors[disk].head_position);
517 /* Find the disk which is closest */
519 while( conf->mirrors[disk].next != conf->last_used ) {
520 disk = conf->mirrors[disk].next;
522 if ((conf->mirrors[disk].write_only) ||
523 (!conf->mirrors[disk].operational))
524 continue;
526 new_distance = abs(this_sector -
527 conf->mirrors[disk].head_position);
529 if (new_distance < current_distance) {
530 conf->sect_count = 0;
531 current_distance = new_distance;
532 new_disk = disk;
536 rb_out:
537 conf->mirrors[new_disk].head_position = this_sector + sectors;
539 conf->last_used = new_disk;
540 conf->sect_count += sectors;
542 return new_disk;
545 static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw,
546 struct buffer_head * bh)
548 raid1_conf_t *conf = mddev_to_conf(mddev);
549 struct buffer_head *bh_req, *bhl;
550 struct raid1_bh * r1_bh;
551 int disks = MD_SB_DISKS;
552 int i, sum_bhs = 0, sectors;
553 struct mirror_info *mirror;
555 if (!buffer_locked(bh))
556 BUG();
559 * make_request() can abort the operation when READA is being
560 * used and no empty request is available.
562 * Currently, just replace the command with READ/WRITE.
564 if (rw == READA)
565 rw = READ;
567 if (rw == WRITE) {
568 rw = WRITERAW;
570 * we first clean the bh, then we start the IO, then
571 * when the IO has finished, we end_io the bh and
572 * mark it uptodate. This way we do not miss the
573 * case when the bh got dirty again during the IO.
575 * We do an important optimization here - if the
576 * buffer was not dirty and we are in the middle of resync or
577 * reconstruction, then we can skip writing it back
578 * to the master disk! (we still have to write it
579 * back to the other disks, because we are not sync
580 * yet.)
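 *
 * Concretely, the code below marks a dirty buffer clean before the
 * mirror writes are issued; if the buffer turns out to be clean already,
 * it is completed straight away via b_end_io() and no mirror write is
 * issued at all.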
582 if (atomic_set_buffer_clean(bh))
583 __mark_buffer_clean(bh);
584 else {
585 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
586 return 0;
589 r1_bh = raid1_alloc_r1bh (conf);
591 spin_lock_irq(&conf->segment_lock);
592 wait_event_lock_irq(conf->wait_done,
593 bh->b_rsector < conf->start_active ||
594 bh->b_rsector >= conf->start_future,
595 conf->segment_lock);
596 if (bh->b_rsector < conf->start_active)
597 conf->cnt_done++;
598 else {
599 conf->cnt_future++;
600 if (conf->phase)
601 set_bit(R1BH_SyncPhase, &r1_bh->state);
603 spin_unlock_irq(&conf->segment_lock);
606 * i think the read and write branch should be separated completely,
607 * since we want to do read balancing on the read side for example.
608 * Alternative implementations? :) --mingo
611 r1_bh->master_bh = bh;
612 r1_bh->mddev = mddev;
613 r1_bh->cmd = rw;
615 sectors = bh->b_size >> 9;
616 if (rw == READ) {
618 * read balancing logic:
620 mirror = conf->mirrors + raid1_read_balance(conf, bh);
622 bh_req = &r1_bh->bh_req;
623 memcpy(bh_req, bh, sizeof(*bh));
624 bh_req->b_blocknr = bh->b_rsector * sectors;
625 bh_req->b_dev = mirror->dev;
626 bh_req->b_rdev = mirror->dev;
627 /* bh_req->b_rsector = bh->n_rsector; */
628 bh_req->b_end_io = raid1_end_request;
629 bh_req->b_dev_id = r1_bh;
630 q = blk_get_queue(bh_req->b_rdev);
631 generic_make_request (q, rw, bh_req);
632 return 0;
636 * WRITE:
639 bhl = raid1_alloc_bh(conf, conf->raid_disks);
640 for (i = 0; i < disks; i++) {
641 struct buffer_head *mbh;
642 if (!conf->mirrors[i].operational)
643 continue;
646 * We should use a private pool (size depending on NR_REQUEST),
647 * to avoid writes filling up the memory with bhs
649 * Such pools are much faster than kmalloc anyway (so we waste
650 * almost nothing by not using the master bh when writing and
651 * win a lot of cleanliness) but for now we are cool enough. --mingo
653 * It's safe to sleep here, buffer heads cannot be used in a shared
654 * manner in the write branch. Look how we lock the buffer at the
655 * beginning of this function to grok the difference ;)
657 mbh = bhl;
658 if (mbh == NULL) {
659 MD_BUG();
660 break;
662 bhl = mbh->b_next;
663 mbh->b_next = NULL;
664 mbh->b_this_page = (struct buffer_head *)1;
667 * prepare mirrored mbh (fields ordered for max mem throughput):
669 mbh->b_blocknr = bh->b_rsector * sectors;
670 mbh->b_dev = conf->mirrors[i].dev;
671 mbh->b_rdev = conf->mirrors[i].dev;
672 mbh->b_rsector = bh->b_rsector;
673 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
674 (1<<BH_Mapped) | (1<<BH_Lock);
676 atomic_set(&mbh->b_count, 1);
677 mbh->b_size = bh->b_size;
678 mbh->b_page = bh->b_page;
679 mbh->b_data = bh->b_data;
680 mbh->b_list = BUF_LOCKED;
681 mbh->b_end_io = raid1_end_request;
682 mbh->b_dev_id = r1_bh;
684 mbh->b_next = r1_bh->mirror_bh_list;
685 r1_bh->mirror_bh_list = mbh;
686 sum_bhs++;
688 if (bhl) raid1_free_bh(conf,bhl);
689 md_atomic_set(&r1_bh->remaining, sum_bhs);
692 * We have to be a bit careful about the semaphore above, that's
693 * why we start the requests separately. Since kmalloc() could
694 * fail and sleep, and make_request() can sleep too, this is the
695 * safer solution. Imagine end_request decreasing the semaphore
696 * before we could have set it up ... We could play tricks with
697 * the semaphore (presetting it and correcting at the end if
698 * sum_bhs is not 'n', but we have to do end_request by hand if
699 * all requests finish before we have had a chance to set up the
700 * semaphore correctly ... lots of races).
702 bh = r1_bh->mirror_bh_list;
703 while(bh) {
704 struct buffer_head *bh2 = bh;
705 bh = bh->b_next;
706 q = blk_get_queue(bh2->b_rdev);
707 generic_make_request(q, rw, bh2);
709 return (0);
712 static int raid1_status (char *page, mddev_t *mddev)
714 raid1_conf_t *conf = mddev_to_conf(mddev);
715 int sz = 0, i;
717 sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
718 conf->working_disks);
719 for (i = 0; i < conf->raid_disks; i++)
720 sz += sprintf (page+sz, "%s",
721 conf->mirrors[i].operational ? "U" : "_");
722 sz += sprintf (page+sz, "]");
723 return sz;
726 static void unlink_disk (raid1_conf_t *conf, int target)
728 int disks = MD_SB_DISKS;
729 int i;
731 for (i = 0; i < disks; i++)
732 if (conf->mirrors[i].next == target)
733 conf->mirrors[i].next = conf->mirrors[target].next;
736 #define LAST_DISK KERN_ALERT \
737 "raid1: only one disk left and IO error.\n"
739 #define NO_SPARE_DISK KERN_ALERT \
740 "raid1: no spare disk left, degrading mirror level by one.\n"
742 #define DISK_FAILED KERN_ALERT \
743 "raid1: Disk failure on %s, disabling device. \n" \
744 " Operation continuing on %d devices\n"
746 #define START_SYNCING KERN_ALERT \
747 "raid1: start syncing spare disk.\n"
749 #define ALREADY_SYNCING KERN_INFO \
750 "raid1: syncing already in progress.\n"
752 static void mark_disk_bad (mddev_t *mddev, int failed)
754 raid1_conf_t *conf = mddev_to_conf(mddev);
755 struct mirror_info *mirror = conf->mirrors+failed;
756 mdp_super_t *sb = mddev->sb;
758 mirror->operational = 0;
759 unlink_disk(conf, failed);
760 mark_disk_faulty(sb->disks+mirror->number);
761 mark_disk_nonsync(sb->disks+mirror->number);
762 mark_disk_inactive(sb->disks+mirror->number);
763 sb->active_disks--;
764 sb->working_disks--;
765 sb->failed_disks++;
766 mddev->sb_dirty = 1;
767 md_wakeup_thread(conf->thread);
768 conf->working_disks--;
769 printk (DISK_FAILED, partition_name (mirror->dev),
770 conf->working_disks);
773 static int raid1_error (mddev_t *mddev, kdev_t dev)
775 raid1_conf_t *conf = mddev_to_conf(mddev);
776 struct mirror_info * mirrors = conf->mirrors;
777 int disks = MD_SB_DISKS;
778 int i;
780 if (conf->working_disks == 1) {
782 * Uh oh, we can do nothing if this is our last disk, but
783 * first check if this is a queued request for a device
784 * which has just failed.
786 for (i = 0; i < disks; i++) {
787 if (mirrors[i].dev==dev && !mirrors[i].operational)
788 return 0;
790 printk (LAST_DISK);
791 } else {
793 * Mark disk as unusable
795 for (i = 0; i < disks; i++) {
796 if (mirrors[i].dev==dev && mirrors[i].operational) {
797 mark_disk_bad(mddev, i);
798 break;
802 return 0;
805 #undef LAST_DISK
806 #undef NO_SPARE_DISK
807 #undef DISK_FAILED
808 #undef START_SYNCING
811 * Insert the spare disk into the drive-ring
813 static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
815 int j, next;
816 int disks = MD_SB_DISKS;
817 struct mirror_info *p = conf->mirrors;
819 for (j = 0; j < disks; j++, p++)
820 if (p->operational && !p->write_only) {
821 next = p->next;
822 p->next = mirror->raid_disk;
823 mirror->next = next;
824 return;
827 printk("raid1: bug: no read-operational devices\n");
830 static void print_raid1_conf (raid1_conf_t *conf)
832 int i;
833 struct mirror_info *tmp;
835 printk("RAID1 conf printout:\n");
836 if (!conf) {
837 printk("(conf==NULL)\n");
838 return;
840 printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
841 conf->raid_disks, conf->nr_disks);
843 for (i = 0; i < MD_SB_DISKS; i++) {
844 tmp = conf->mirrors + i;
845 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
846 i, tmp->spare,tmp->operational,
847 tmp->number,tmp->raid_disk,tmp->used_slot,
848 partition_name(tmp->dev));
852 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
854 int err = 0;
855 int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
856 raid1_conf_t *conf = mddev->private;
857 struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
858 mdp_super_t *sb = mddev->sb;
859 mdp_disk_t *failed_desc, *spare_desc, *added_desc;
861 print_raid1_conf(conf);
862 md_spin_lock_irq(&conf->device_lock);
864 * find the disk ...
866 switch (state) {
868 case DISKOP_SPARE_ACTIVE:
871 * Find the failed disk within the RAID1 configuration ...
872 * (this can only be in the first conf->working_disks part)
874 for (i = 0; i < conf->raid_disks; i++) {
875 tmp = conf->mirrors + i;
876 if ((!tmp->operational && !tmp->spare) ||
877 !tmp->used_slot) {
878 failed_disk = i;
879 break;
883 * When we activate a spare disk we _must_ have a disk in
884 * the lower (active) part of the array to replace.
886 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
887 MD_BUG();
888 err = 1;
889 goto abort;
891 /* fall through */
893 case DISKOP_SPARE_WRITE:
894 case DISKOP_SPARE_INACTIVE:
897 * Find the spare disk ... (can only be in the 'high'
898 * area of the array)
900 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
901 tmp = conf->mirrors + i;
902 if (tmp->spare && tmp->number == (*d)->number) {
903 spare_disk = i;
904 break;
907 if (spare_disk == -1) {
908 MD_BUG();
909 err = 1;
910 goto abort;
912 break;
914 case DISKOP_HOT_REMOVE_DISK:
916 for (i = 0; i < MD_SB_DISKS; i++) {
917 tmp = conf->mirrors + i;
918 if (tmp->used_slot && (tmp->number == (*d)->number)) {
919 if (tmp->operational) {
920 err = -EBUSY;
921 goto abort;
923 removed_disk = i;
924 break;
927 if (removed_disk == -1) {
928 MD_BUG();
929 err = 1;
930 goto abort;
932 break;
934 case DISKOP_HOT_ADD_DISK:
936 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
937 tmp = conf->mirrors + i;
938 if (!tmp->used_slot) {
939 added_disk = i;
940 break;
943 if (added_disk == -1) {
944 MD_BUG();
945 err = 1;
946 goto abort;
948 break;
951 switch (state) {
953 * Switch the spare disk to write-only mode:
955 case DISKOP_SPARE_WRITE:
956 sdisk = conf->mirrors + spare_disk;
957 sdisk->operational = 1;
958 sdisk->write_only = 1;
959 break;
961 * Deactivate a spare disk:
963 case DISKOP_SPARE_INACTIVE:
964 sdisk = conf->mirrors + spare_disk;
965 sdisk->operational = 0;
966 sdisk->write_only = 0;
967 break;
969 * Activate (mark read-write) the (now sync) spare disk,
970 * which means we switch its 'raid position' (->raid_disk)
971 * with the failed disk. (only the first 'conf->nr_disks'
972 * slots are used for 'real' disks and we must preserve this
973 * property)
975 case DISKOP_SPARE_ACTIVE:
977 sdisk = conf->mirrors + spare_disk;
978 fdisk = conf->mirrors + failed_disk;
980 spare_desc = &sb->disks[sdisk->number];
981 failed_desc = &sb->disks[fdisk->number];
983 if (spare_desc != *d) {
984 MD_BUG();
985 err = 1;
986 goto abort;
989 if (spare_desc->raid_disk != sdisk->raid_disk) {
990 MD_BUG();
991 err = 1;
992 goto abort;
995 if (sdisk->raid_disk != spare_disk) {
996 MD_BUG();
997 err = 1;
998 goto abort;
1001 if (failed_desc->raid_disk != fdisk->raid_disk) {
1002 MD_BUG();
1003 err = 1;
1004 goto abort;
1007 if (fdisk->raid_disk != failed_disk) {
1008 MD_BUG();
1009 err = 1;
1010 goto abort;
1014 * do the switch finally
1016 xchg_values(*spare_desc, *failed_desc);
1017 xchg_values(*fdisk, *sdisk);
1020 * (careful, 'failed' and 'spare' are switched from now on)
1022 * we want to preserve linear numbering and we want to
1023 * give the proper raid_disk number to the now activated
1024 * disk. (this means we switch back these values)
1027 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1028 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1029 xchg_values(spare_desc->number, failed_desc->number);
1030 xchg_values(sdisk->number, fdisk->number);
1032 *d = failed_desc;
1034 if (sdisk->dev == MKDEV(0,0))
1035 sdisk->used_slot = 0;
1037 * this really activates the spare.
1039 fdisk->spare = 0;
1040 fdisk->write_only = 0;
1041 link_disk(conf, fdisk);
1044 * if we activate a spare, we definitely replace a
1045 * non-operational disk slot in the 'low' area of
1046 * the disk array.
1049 conf->working_disks++;
1051 break;
1053 case DISKOP_HOT_REMOVE_DISK:
1054 rdisk = conf->mirrors + removed_disk;
1056 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1057 MD_BUG();
1058 err = 1;
1059 goto abort;
1061 rdisk->dev = MKDEV(0,0);
1062 rdisk->used_slot = 0;
1063 conf->nr_disks--;
1064 break;
1066 case DISKOP_HOT_ADD_DISK:
1067 adisk = conf->mirrors + added_disk;
1068 added_desc = *d;
1070 if (added_disk != added_desc->number) {
1071 MD_BUG();
1072 err = 1;
1073 goto abort;
1076 adisk->number = added_desc->number;
1077 adisk->raid_disk = added_desc->raid_disk;
1078 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1080 adisk->operational = 0;
1081 adisk->write_only = 0;
1082 adisk->spare = 1;
1083 adisk->used_slot = 1;
1084 adisk->head_position = 0;
1085 conf->nr_disks++;
1087 break;
1089 default:
1090 MD_BUG();
1091 err = 1;
1092 goto abort;
1094 abort:
1095 md_spin_unlock_irq(&conf->device_lock);
1096 if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1097 /* should move to "END_REBUILD" when such exists */
1098 raid1_shrink_buffers(conf);
1100 print_raid1_conf(conf);
1101 return err;
1105 #define IO_ERROR KERN_ALERT \
1106 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1108 #define REDIRECT_SECTOR KERN_ERR \
1109 "raid1: %s: redirecting sector %lu to another mirror\n"
1112 * This is a kernel thread which:
1114 * 1. Retries failed read operations on working mirrors.
1115 * 2. Updates the raid superblock when problems are encountered.
1116 * 3. Performs writes following reads for array synchronising.
1118 static void end_sync_write(struct buffer_head *bh, int uptodate);
1119 static void end_sync_read(struct buffer_head *bh, int uptodate);
1121 static void raid1d (void *data)
1123 struct raid1_bh *r1_bh;
1124 struct buffer_head *bh;
1125 unsigned long flags;
1126 request_queue_t *q;
1127 mddev_t *mddev;
1128 kdev_t dev;
1131 for (;;) {
1132 md_spin_lock_irqsave(&retry_list_lock, flags);
1133 r1_bh = raid1_retry_list;
1134 if (!r1_bh)
1135 break;
1136 raid1_retry_list = r1_bh->next_r1;
1137 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1139 mddev = r1_bh->mddev;
1140 if (mddev->sb_dirty) {
1141 printk(KERN_INFO "dirty sb detected, updating.\n");
1142 mddev->sb_dirty = 0;
1143 md_update_sb(mddev);
1145 bh = &r1_bh->bh_req;
1146 switch(r1_bh->cmd) {
1147 case SPECIAL:
1148 /* have to allocate lots of bh structures and
1149 * schedule writes
1151 if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1152 int i, sum_bhs = 0;
1153 int disks = MD_SB_DISKS;
1154 struct buffer_head *bhl, *mbh;
1155 raid1_conf_t *conf;
1156 int sectors = bh->b_size >> 9;
1158 conf = mddev_to_conf(mddev);
1159 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1160 for (i = 0; i < disks ; i++) {
1161 if (!conf->mirrors[i].operational)
1162 continue;
1163 if (i==conf->last_used)
1164 /* we read from here, no need to write */
1165 continue;
1166 if (i < conf->raid_disks
1167 && !conf->resync_mirrors)
1168 /* don't need to write this,
1169 * we are just rebuilding */
1170 continue;
1171 mbh = bhl;
1172 if (!mbh) {
1173 MD_BUG();
1174 break;
1176 bhl = mbh->b_next;
1177 mbh->b_this_page = (struct buffer_head *)1;
1181 * prepare mirrored bh (fields ordered for max mem throughput):
1183 mbh->b_blocknr = bh->b_blocknr;
1184 mbh->b_dev = conf->mirrors[i].dev;
1185 mbh->b_rdev = conf->mirrors[i].dev;
1186 mbh->b_rsector = bh->b_blocknr * sectors;
1187 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
1188 (1<<BH_Mapped) | (1<<BH_Lock);
1189 atomic_set(&mbh->b_count, 1);
1190 mbh->b_size = bh->b_size;
1191 mbh->b_page = bh->b_page;
1192 mbh->b_data = bh->b_data;
1193 mbh->b_list = BUF_LOCKED;
1194 mbh->b_end_io = end_sync_write;
1195 mbh->b_dev_id = r1_bh;
1197 mbh->b_next = r1_bh->mirror_bh_list;
1198 r1_bh->mirror_bh_list = mbh;
1200 sum_bhs++;
1202 md_atomic_set(&r1_bh->remaining, sum_bhs);
1203 if (bhl) raid1_free_bh(conf, bhl);
1204 mbh = r1_bh->mirror_bh_list;
1205 while (mbh) {
1206 struct buffer_head *bh1 = mbh;
1207 mbh = mbh->b_next;
1208 q = blk_get_queue(bh1->b_rdev);
1209 generic_make_request(q, WRITE, bh1);
1210 md_sync_acct(bh1->b_rdev, bh1->b_size/512);
1212 } else {
1213 dev = bh->b_dev;
1214 raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1215 if (bh->b_dev == dev) {
1216 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1217 md_done_sync(mddev, bh->b_size>>10, 0);
1218 } else {
1219 printk (REDIRECT_SECTOR,
1220 partition_name(bh->b_dev), bh->b_blocknr);
1221 bh->b_rdev = bh->b_dev;
1222 q = blk_get_queue(bh->b_rdev);
1223 generic_make_request (q, READ, bh);
1227 break;
1228 case READ:
1229 case READA:
1230 dev = bh->b_dev;
1232 raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1233 if (bh->b_dev == dev) {
1234 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1235 raid1_end_bh_io(r1_bh, 0);
1236 } else {
1237 printk (REDIRECT_SECTOR,
1238 partition_name(bh->b_dev), bh->b_blocknr);
1239 bh->b_rdev = bh->b_dev;
1240 q = blk_get_queue(bh->b_rdev);
1241 generic_make_request (q, r1_bh->cmd, bh);
1243 break;
1246 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1248 #undef IO_ERROR
1249 #undef REDIRECT_SECTOR
1252 * Private kernel thread to reconstruct mirrors after an unclean
1253 * shutdown.
1255 static void raid1syncd (void *data)
1257 raid1_conf_t *conf = data;
1258 mddev_t *mddev = conf->mddev;
1260 if (!conf->resync_mirrors)
1261 return;
1262 if (conf->resync_mirrors == 2)
1263 return;
1264 down(&mddev->recovery_sem);
1265 if (!md_do_sync(mddev, NULL)) {
1267 * Only if everything went Ok.
1269 conf->resync_mirrors = 0;
1272 /* If reconstruction was interrupted, we need to close the "active" and "pending"
1273 * holes.
1274 * We know that there are no active rebuild requests, so cnt_active == cnt_ready == 0.
1276 /* this is really needed when recovery stops too... */
1277 spin_lock_irq(&conf->segment_lock);
1278 conf->start_active = conf->start_pending;
1279 conf->start_ready = conf->start_pending;
1280 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
1281 conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
1282 conf->start_future = mddev->sb->size+1;
1283 conf->cnt_pending = conf->cnt_future;
1284 conf->cnt_future = 0;
1285 conf->phase = conf->phase ^1;
1286 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
1287 conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
1288 conf->phase = 0;
1289 conf->cnt_future = conf->cnt_done;
1290 conf->cnt_done = 0;
1291 spin_unlock_irq(&conf->segment_lock);
1292 wake_up(&conf->wait_done);
1294 up(&mddev->recovery_sem);
1295 raid1_shrink_buffers(conf);
1299 * perform a "sync" on one "block"
1301 * We need to make sure that no normal I/O request - particularly write
1302 * requests - conflict with active sync requests.
1303 * This is achieved by conceptually dividing the device space into a
1304 * number of sections:
1305 *  DONE:     0 .. a-1   These blocks are in-sync
1306 *  ACTIVE:   a .. b-1   These blocks may have active sync requests, but
1307 *                       no normal IO requests
1308 *  READY:    b .. c-1   These blocks have no normal IO requests - sync
1309 *                       requests may be happening
1310 *  PENDING:  c .. d-1   These blocks may have IO requests, but no new
1311 *                       ones will be added
1312 *  FUTURE:   d .. end   These blocks are not to be considered yet. IO may
1313 *                       be happening, but not sync
1315 * We keep a
1316 * phase which flips (0 or 1) each time d moves and
1317 * a count of:
1318 * z = active io requests in FUTURE since d moved - marked with
1319 * current phase
1320 * y = active io requests in FUTURE before d moved, or PENDING -
1321 * marked with previous phase
1322 * x = active sync requests in READY
1323 * w = active sync requests in ACTIVE
1324 * v = active io requests in DONE
1326 * Normally, a=b=c=d=0 and z= active io requests
1327 * or a=b=c=d=END and v= active io requests
1328 * Allowed changes to a,b,c,d:
1329 * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
1330 * B: y==0 -> c=d
1331 * C: b=c, w+=x, x=0
1332 * D: w==0 -> a=b
1333 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1335 * At start of sync we apply A.
1336 * When y reaches 0, we apply B then A, then begin sync requests.
1337 * When the sync point reaches c-1, we wait for y==0 and w==0, and
1338 * then apply B then A then D then C.
1339 * Finally, we apply E
1341 * The sync request simply issues a "read" against a working drive
1342 * This is marked so that on completion the raid1d thread is woken to
1343 * issue suitable write requests
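 *
 * Example (illustrative numbers): with a window of 128 sectors, the first
 * call to raid1_sync_request() slides the boundaries until
 * start_pending == 128 and start_future == 256. A normal write to
 * sector 300 then falls into FUTURE (it only bumps cnt_future, tagged
 * with the current phase), while a write to sector 50 sleeps on
 * wait_done until the window has moved past it.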
1346 static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr)
1348 raid1_conf_t *conf = mddev_to_conf(mddev);
1349 struct mirror_info *mirror;
1350 request_queue_t *q;
1351 struct raid1_bh *r1_bh;
1352 struct buffer_head *bh;
1353 int bsize;
1355 spin_lock_irq(&conf->segment_lock);
1356 if (!block_nr) {
1357 /* initialize ...*/
1358 int buffs;
1359 conf->start_active = 0;
1360 conf->start_ready = 0;
1361 conf->start_pending = 0;
1362 conf->start_future = 0;
1363 conf->phase = 0;
1364 /* we want enough buffers to hold twice the window of 128 */
1365 buffs = 128 *2 / (PAGE_SIZE>>9);
1366 buffs = raid1_grow_buffers(conf, buffs);
1367 if (buffs < 2)
1368 goto nomem;
1370 conf->window = buffs*(PAGE_SIZE>>9)/2;
1371 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1372 conf->cnt_done = conf->cnt_pending = 0;
1373 if (conf->cnt_ready || conf->cnt_active)
1374 MD_BUG();
1376 while ((block_nr<<1) >= conf->start_pending) {
1377 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1378 block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1379 conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1380 wait_event_lock_irq(conf->wait_done,
1381 !conf->cnt_active,
1382 conf->segment_lock);
1383 wait_event_lock_irq(conf->wait_ready,
1384 !conf->cnt_pending,
1385 conf->segment_lock);
1386 conf->start_active = conf->start_ready;
1387 conf->start_ready = conf->start_pending;
1388 conf->start_pending = conf->start_future;
1389 conf->start_future = conf->start_future+conf->window;
1390 // Note: falling off the end is not a problem
1391 conf->phase = conf->phase ^1;
1392 conf->cnt_active = conf->cnt_ready;
1393 conf->cnt_ready = 0;
1394 conf->cnt_pending = conf->cnt_future;
1395 conf->cnt_future = 0;
1396 wake_up(&conf->wait_done);
1398 conf->cnt_ready++;
1399 spin_unlock_irq(&conf->segment_lock);
1402 /* If reconstructing, and >1 working disc,
1403 * could dedicate one to rebuild and others to
1404 * service read requests ..
1406 mirror = conf->mirrors+conf->last_used;
1408 r1_bh = raid1_alloc_buf (conf);
1409 r1_bh->master_bh = NULL;
1410 r1_bh->mddev = mddev;
1411 r1_bh->cmd = SPECIAL;
1412 bh = &r1_bh->bh_req;
1414 bh->b_blocknr = block_nr;
1415 bsize = 1024;
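/*
 * Coalesce: while the block number stays even, double the block size
 * (halving b_blocknr each time) up to PAGE_SIZE, as long as the larger
 * block still fits inside the device; one sync read then covers up to a
 * whole page.
 */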
1416 while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE
1417 && (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) {
1418 bh->b_blocknr >>= 1;
1419 bsize <<= 1;
1421 bh->b_size = bsize;
1422 bh->b_list = BUF_LOCKED;
1423 bh->b_dev = mirror->dev;
1424 bh->b_rdev = mirror->dev;
1425 bh->b_state = (1<<BH_Req) | (1<<BH_Mapped);
1426 if (!bh->b_page)
1427 BUG();
1428 if (!bh->b_data)
1429 BUG();
1430 if (bh->b_data != (char *) page_address(bh->b_page))
1431 BUG();
1432 bh->b_end_io = end_sync_read;
1433 bh->b_dev_id = (void *) r1_bh;
1434 bh->b_rsector = block_nr<<1;
1435 init_waitqueue_head(&bh->b_wait);
1437 q = blk_get_queue(bh->b_rdev);
1438 generic_make_request(q, READ, bh);
1439 md_sync_acct(bh->b_rdev, bh->b_size/512);
1441 return (bsize >> 10);
1443 nomem:
1444 raid1_shrink_buffers(conf);
1445 spin_unlock_irq(&conf->segment_lock);
1446 return -ENOMEM;
1449 static void end_sync_read(struct buffer_head *bh, int uptodate)
1451 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
1453 /* we have read a block, now it needs to be re-written,
1454 * or re-read if the read failed.
1455 * We don't do much here, just schedule handling by raid1d
1457 if (!uptodate)
1458 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1459 else
1460 set_bit(R1BH_Uptodate, &r1_bh->state);
1461 raid1_reschedule_retry(r1_bh);
1464 static void end_sync_write(struct buffer_head *bh, int uptodate)
1466 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);
1468 if (!uptodate)
1469 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1470 if (atomic_dec_and_test(&r1_bh->remaining)) {
1471 mddev_t *mddev = r1_bh->mddev;
1472 unsigned long sect = bh->b_blocknr * (bh->b_size>>9);
1473 int size = bh->b_size;
1474 raid1_free_buf(r1_bh);
1475 sync_request_done(sect, mddev_to_conf(mddev));
1476 md_done_sync(mddev,size>>10, uptodate);
1481 * This will catch the scenario in which one of the mirrors was
1482 * mounted as a normal device rather than as a part of a raid set.
1484 * check_consistency is very personality-dependent, eg. RAID5 cannot
1485 * do this check, it uses another method.
1487 static int __check_consistency (mddev_t *mddev, int row)
1489 raid1_conf_t *conf = mddev_to_conf(mddev);
1490 int disks = MD_SB_DISKS;
1491 kdev_t dev;
1492 struct buffer_head *bh = NULL;
1493 int i, rc = 0;
1494 char *buffer = NULL;
1496 for (i = 0; i < disks; i++) {
1497 printk("(checking disk %d)\n",i);
1498 if (!conf->mirrors[i].operational)
1499 continue;
1500 printk("(really checking disk %d)\n",i);
1501 dev = conf->mirrors[i].dev;
1502 set_blocksize(dev, 4096);
1503 if ((bh = bread(dev, row / 4, 4096)) == NULL)
1504 break;
1505 if (!buffer) {
1506 buffer = (char *) __get_free_page(GFP_KERNEL);
1507 if (!buffer)
1508 break;
1509 memcpy(buffer, bh->b_data, 4096);
1510 } else if (memcmp(buffer, bh->b_data, 4096)) {
1511 rc = 1;
1512 break;
1514 bforget(bh);
1515 fsync_dev(dev);
1516 invalidate_buffers(dev);
1517 bh = NULL;
1519 if (buffer)
1520 free_page((unsigned long) buffer);
1521 if (bh) {
1522 dev = bh->b_dev;
1523 bforget(bh);
1524 fsync_dev(dev);
1525 invalidate_buffers(dev);
1527 return rc;
1530 static int check_consistency (mddev_t *mddev)
1532 if (__check_consistency(mddev, 0))
1534 * we do not do this currently, as it's perfectly possible to
1535 * have an inconsistent array when it's freshly created. Only
1536 * newly written data has to be consistent.
1538 return 0;
1540 return 0;
1543 #define INVALID_LEVEL KERN_WARNING \
1544 "raid1: md%d: raid level not set to mirroring (%d)\n"
1546 #define NO_SB KERN_ERR \
1547 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1549 #define ERRORS KERN_ERR \
1550 "raid1: disabled mirror %s (errors detected)\n"
1552 #define NOT_IN_SYNC KERN_ERR \
1553 "raid1: disabled mirror %s (not in sync)\n"
1555 #define INCONSISTENT KERN_ERR \
1556 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1558 #define ALREADY_RUNNING KERN_ERR \
1559 "raid1: disabled mirror %s (mirror %d already operational)\n"
1561 #define OPERATIONAL KERN_INFO \
1562 "raid1: device %s operational as mirror %d\n"
1564 #define MEM_ERROR KERN_ERR \
1565 "raid1: couldn't allocate memory for md%d\n"
1567 #define SPARE KERN_INFO \
1568 "raid1: spare disk %s\n"
1570 #define NONE_OPERATIONAL KERN_ERR \
1571 "raid1: no operational mirrors for md%d\n"
1573 #define RUNNING_CKRAID KERN_ERR \
1574 "raid1: detected mirror differences -- running resync\n"
1576 #define ARRAY_IS_ACTIVE KERN_INFO \
1577 "raid1: raid set md%d active with %d out of %d mirrors\n"
1579 #define THREAD_ERROR KERN_ERR \
1580 "raid1: couldn't allocate thread for md%d\n"
1582 #define START_RESYNC KERN_WARNING \
1583 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1585 static int raid1_run (mddev_t *mddev)
1587 raid1_conf_t *conf;
1588 int i, j, disk_idx;
1589 struct mirror_info *disk;
1590 mdp_super_t *sb = mddev->sb;
1591 mdp_disk_t *descriptor;
1592 mdk_rdev_t *rdev;
1593 struct md_list_head *tmp;
1594 int start_recovery = 0;
1596 MOD_INC_USE_COUNT;
1598 if (sb->level != 1) {
1599 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1600 goto out;
1603 * copy the already verified devices into our private RAID1
1604 * bookkeeping area. [whatever we allocate in raid1_run(),
1605 * should be freed in raid1_stop()]
1608 conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1609 mddev->private = conf;
1610 if (!conf) {
1611 printk(MEM_ERROR, mdidx(mddev));
1612 goto out;
1614 memset(conf, 0, sizeof(*conf));
1616 ITERATE_RDEV(mddev,rdev,tmp) {
1617 if (rdev->faulty) {
1618 printk(ERRORS, partition_name(rdev->dev));
1619 } else {
1620 if (!rdev->sb) {
1621 MD_BUG();
1622 continue;
1625 if (rdev->desc_nr == -1) {
1626 MD_BUG();
1627 continue;
1629 descriptor = &sb->disks[rdev->desc_nr];
1630 disk_idx = descriptor->raid_disk;
1631 disk = conf->mirrors + disk_idx;
1633 if (disk_faulty(descriptor)) {
1634 disk->number = descriptor->number;
1635 disk->raid_disk = disk_idx;
1636 disk->dev = rdev->dev;
1637 disk->sect_limit = MAX_WORK_PER_DISK;
1638 disk->operational = 0;
1639 disk->write_only = 0;
1640 disk->spare = 0;
1641 disk->used_slot = 1;
1642 disk->head_position = 0;
1643 continue;
1645 if (disk_active(descriptor)) {
1646 if (!disk_sync(descriptor)) {
1647 printk(NOT_IN_SYNC,
1648 partition_name(rdev->dev));
1649 continue;
1651 if ((descriptor->number > MD_SB_DISKS) ||
1652 (disk_idx > sb->raid_disks)) {
1654 printk(INCONSISTENT,
1655 partition_name(rdev->dev));
1656 continue;
1658 if (disk->operational) {
1659 printk(ALREADY_RUNNING,
1660 partition_name(rdev->dev),
1661 disk_idx);
1662 continue;
1664 printk(OPERATIONAL, partition_name(rdev->dev),
1665 disk_idx);
1666 disk->number = descriptor->number;
1667 disk->raid_disk = disk_idx;
1668 disk->dev = rdev->dev;
1669 disk->sect_limit = MAX_WORK_PER_DISK;
1670 disk->operational = 1;
1671 disk->write_only = 0;
1672 disk->spare = 0;
1673 disk->used_slot = 1;
1674 disk->head_position = 0;
1675 conf->working_disks++;
1676 } else {
1678 * Must be a spare disk ..
1680 printk(SPARE, partition_name(rdev->dev));
1681 disk->number = descriptor->number;
1682 disk->raid_disk = disk_idx;
1683 disk->dev = rdev->dev;
1684 disk->sect_limit = MAX_WORK_PER_DISK;
1685 disk->operational = 0;
1686 disk->write_only = 0;
1687 disk->spare = 1;
1688 disk->used_slot = 1;
1689 disk->head_position = 0;
1692 conf->raid_disks = sb->raid_disks;
1693 conf->nr_disks = sb->nr_disks;
1694 conf->mddev = mddev;
1695 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1697 conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1698 init_waitqueue_head(&conf->wait_buffer);
1699 init_waitqueue_head(&conf->wait_done);
1700 init_waitqueue_head(&conf->wait_ready);
1702 if (!conf->working_disks) {
1703 printk(NONE_OPERATIONAL, mdidx(mddev));
1704 goto out_free_conf;
1708 /* pre-allocate some buffer_head structures.
1709 * As a minimum, 1 r1bh and raid_disks buffer_heads
1710 * would probably get us by in tight memory situations,
1711 * but a few more is probably a good idea.
1712 * For now, try 16 r1bh and 16*raid_disks bufferheads.
1713 * This will allow at least 16 concurrent reads or writes
1714 * even if kmalloc starts failing
1716 if (raid1_grow_r1bh(conf, 16) < 16 ||
1717 raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
1718 printk(MEM_ERROR, mdidx(mddev));
1719 goto out_free_conf;
1722 for (i = 0; i < MD_SB_DISKS; i++) {
1724 descriptor = sb->disks+i;
1725 disk_idx = descriptor->raid_disk;
1726 disk = conf->mirrors + disk_idx;
1728 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1729 !disk->used_slot) {
1731 disk->number = descriptor->number;
1732 disk->raid_disk = disk_idx;
1733 disk->dev = MKDEV(0,0);
1735 disk->operational = 0;
1736 disk->write_only = 0;
1737 disk->spare = 0;
1738 disk->used_slot = 1;
1739 disk->head_position = 0;
1744 * find the first working one and use it as a starting point
1745 * to read balancing.
1747 for (j = 0; !conf->mirrors[j].operational; j++)
1748 /* nothing */;
1749 conf->last_used = j;
1752 * initialize the 'working disks' list.
1754 for (i = conf->raid_disks - 1; i >= 0; i--) {
1755 if (conf->mirrors[i].operational) {
1756 conf->mirrors[i].next = j;
1757 j = i;
1761 if (conf->working_disks != sb->raid_disks) {
1762 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1763 start_recovery = 1;
1766 if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
1768 * we do sanity checks even if the device says
1769 * it's clean ...
1771 if (check_consistency(mddev)) {
1772 printk(RUNNING_CKRAID);
1773 sb->state &= ~(1 << MD_SB_CLEAN);
1778 const char * name = "raid1d";
1780 conf->thread = md_register_thread(raid1d, conf, name);
1781 if (!conf->thread) {
1782 printk(THREAD_ERROR, mdidx(mddev));
1783 goto out_free_conf;
1787 if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1788 const char * name = "raid1syncd";
1790 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1791 if (!conf->resync_thread) {
1792 printk(THREAD_ERROR, mdidx(mddev));
1793 goto out_free_conf;
1796 printk(START_RESYNC, mdidx(mddev));
1797 conf->resync_mirrors = 1;
1798 md_wakeup_thread(conf->resync_thread);
1802 * Regenerate the "device is in sync with the raid set" bit for
1803 * each device.
1805 for (i = 0; i < MD_SB_DISKS; i++) {
1806 mark_disk_nonsync(sb->disks+i);
1807 for (j = 0; j < sb->raid_disks; j++) {
1808 if (!conf->mirrors[j].operational)
1809 continue;
1810 if (sb->disks[i].number == conf->mirrors[j].number)
1811 mark_disk_sync(sb->disks+i);
1814 sb->active_disks = conf->working_disks;
1816 if (start_recovery)
1817 md_recover_arrays();
1820 printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1822 * Ok, everything is just fine now
1824 return 0;
1826 out_free_conf:
1827 raid1_shrink_r1bh(conf);
1828 raid1_shrink_bh(conf, conf->freebh_cnt);
1829 raid1_shrink_buffers(conf);
1830 kfree(conf);
1831 mddev->private = NULL;
1832 out:
1833 MOD_DEC_USE_COUNT;
1834 return -EIO;
1837 #undef INVALID_LEVEL
1838 #undef NO_SB
1839 #undef ERRORS
1840 #undef NOT_IN_SYNC
1841 #undef INCONSISTENT
1842 #undef ALREADY_RUNNING
1843 #undef OPERATIONAL
1844 #undef SPARE
1845 #undef NONE_OPERATIONAL
1846 #undef RUNNING_CKRAID
1847 #undef ARRAY_IS_ACTIVE
1849 static int raid1_stop_resync (mddev_t *mddev)
1851 raid1_conf_t *conf = mddev_to_conf(mddev);
1853 if (conf->resync_thread) {
1854 if (conf->resync_mirrors) {
1855 conf->resync_mirrors = 2;
1856 md_interrupt_thread(conf->resync_thread);
1858 printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1859 return 1;
1861 return 0;
1863 return 0;
1866 static int raid1_restart_resync (mddev_t *mddev)
1868 raid1_conf_t *conf = mddev_to_conf(mddev);
1870 if (conf->resync_mirrors) {
1871 if (!conf->resync_thread) {
1872 MD_BUG();
1873 return 0;
1875 conf->resync_mirrors = 1;
1876 md_wakeup_thread(conf->resync_thread);
1877 return 1;
1879 return 0;
1882 static int raid1_stop (mddev_t *mddev)
1884 raid1_conf_t *conf = mddev_to_conf(mddev);
1886 md_unregister_thread(conf->thread);
1887 if (conf->resync_thread)
1888 md_unregister_thread(conf->resync_thread);
1889 raid1_shrink_r1bh(conf);
1890 raid1_shrink_bh(conf, conf->freebh_cnt);
1891 raid1_shrink_buffers(conf);
1892 kfree(conf);
1893 mddev->private = NULL;
1894 MOD_DEC_USE_COUNT;
1895 return 0;
1898 static mdk_personality_t raid1_personality=
1900 name: "raid1",
1901 make_request: raid1_make_request,
1902 run: raid1_run,
1903 stop: raid1_stop,
1904 status: raid1_status,
1905 error_handler: raid1_error,
1906 diskop: raid1_diskop,
1907 stop_resync: raid1_stop_resync,
1908 restart_resync: raid1_restart_resync,
1909 sync_request: raid1_sync_request
1912 int raid1_init (void)
1914 return register_md_personality (RAID1, &raid1_personality);
1917 #ifdef MODULE
1918 int init_module (void)
1920 return raid1_init();
1923 void cleanup_module (void)
1925 unregister_md_personality (RAID1);
1927 #endif