1 /*
2 * raid1.c : Multiple Devices driver for Linux
4 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8 * RAID-1 management functions.
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
18 * any later version.
20 * You should have received a copy of the GNU General Public License
21 * (for example /usr/src/linux/COPYING); if not, write to the Free
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #include <linux/module.h>
26 #include <linux/malloc.h>
27 #include <linux/raid/raid1.h>
28 #include <asm/atomic.h>
30 #define MAJOR_NR MD_MAJOR
31 #define MD_DRIVER
32 #define MD_PERSONALITY
34 #define MAX_WORK_PER_DISK 128
37 * The following can be used to debug the driver
39 #define RAID1_DEBUG 0
41 #if RAID1_DEBUG
42 #define PRINTK(x...) printk(x)
43 #define inline
44 #define __inline__
45 #else
46 #define PRINTK(x...) do { } while (0)
47 #endif
50 static mdk_personality_t raid1_personality;
51 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
52 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
54 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
56 /* return a linked list of "cnt" struct buffer_heads.
57 * don't take any off the free list unless we know we can
58 * get all we need, otherwise we could deadlock
60 struct buffer_head *bh=NULL;
62 while(cnt) {
63 struct buffer_head *t;
64 md_spin_lock_irq(&conf->device_lock);
65 if (conf->freebh_cnt >= cnt)
66 while (cnt) {
67 t = conf->freebh;
68 conf->freebh = t->b_next;
69 t->b_next = bh;
70 bh = t;
71 t->b_state = 0;
72 conf->freebh_cnt--;
73 cnt--;
75 md_spin_unlock_irq(&conf->device_lock);
76 if (cnt == 0)
77 break;
78 t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_BUFFER);
79 if (t) {
80 memset(t, 0, sizeof(*t));
81 t->b_next = bh;
82 bh = t;
83 cnt--;
84 } else {
85 PRINTK("waiting for %d bh\n", cnt);
86 wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
89 return bh;
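/*
 * Free a chain of buffer_heads: kmalloc'd ones (b_pprev == NULL) are
 * kfree'd, pool-owned ones go back on conf->freebh, and anyone waiting
 * in raid1_alloc_bh() is woken up.
 */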
92 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
94 unsigned long flags;
95 spin_lock_irqsave(&conf->device_lock, flags);
96 while (bh) {
97 struct buffer_head *t = bh;
98 bh=bh->b_next;
99 if (t->b_pprev == NULL)
100 kfree(t);
101 else {
102 t->b_next= conf->freebh;
103 conf->freebh = t;
104 conf->freebh_cnt++;
107 spin_unlock_irqrestore(&conf->device_lock, flags);
108 wake_up(&conf->wait_buffer);
111 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
113 /* allocate cnt buffer_heads, possibly less if kmalloc fails */
114 int i = 0;
116 while (i < cnt) {
117 struct buffer_head *bh;
118 bh = kmalloc(sizeof(*bh), GFP_KERNEL);
119 if (!bh) break;
120 memset(bh, 0, sizeof(*bh));
122 md_spin_lock_irq(&conf->device_lock);
123 bh->b_pprev = &conf->freebh;
124 bh->b_next = conf->freebh;
125 conf->freebh = bh;
126 conf->freebh_cnt++;
127 md_spin_unlock_irq(&conf->device_lock);
129 i++;
131 return i;
134 static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
136 /* discard cnt buffer_heads, if we can find them */
137 int i = 0;
139 md_spin_lock_irq(&conf->device_lock);
140 while ((i < cnt) && conf->freebh) {
141 struct buffer_head *bh = conf->freebh;
142 conf->freebh = bh->b_next;
143 kfree(bh);
144 i++;
145 conf->freebh_cnt--;
147 md_spin_unlock_irq(&conf->device_lock);
148 return i;
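/*
 * Get a raid1_bh, preferably from the pre-allocated free list;
 * otherwise kmalloc() one, and as a last resort sleep until a
 * pre-allocated one is freed.
 */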
152 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
154 struct raid1_bh *r1_bh = NULL;
156 do {
157 md_spin_lock_irq(&conf->device_lock);
158 if (conf->freer1) {
159 r1_bh = conf->freer1;
160 conf->freer1 = r1_bh->next_r1;
161 r1_bh->next_r1 = NULL;
162 r1_bh->state = 0;
163 r1_bh->bh_req.b_state = 0;
165 md_spin_unlock_irq(&conf->device_lock);
166 if (r1_bh)
167 return r1_bh;
168 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh),
169 GFP_BUFFER);
170 if (r1_bh) {
171 memset(r1_bh, 0, sizeof(*r1_bh));
172 return r1_bh;
174 wait_event(conf->wait_buffer, conf->freer1);
175 } while (1);
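/*
 * Release a raid1_bh: pre-allocated ones go back on conf->freer1,
 * kmalloc'd ones are kfree'd, and the attached chain of mirror
 * buffer_heads is returned to the bh pool.
 */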
178 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
180 struct buffer_head *bh = r1_bh->mirror_bh_list;
181 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
183 r1_bh->mirror_bh_list = NULL;
185 if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
186 unsigned long flags;
187 spin_lock_irqsave(&conf->device_lock, flags);
188 r1_bh->next_r1 = conf->freer1;
189 conf->freer1 = r1_bh;
190 spin_unlock_irqrestore(&conf->device_lock, flags);
191 } else {
192 kfree(r1_bh);
194 raid1_free_bh(conf, bh);
197 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
199 int i = 0;
201 while (i < cnt) {
202 struct raid1_bh *r1_bh;
203 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
204 if (!r1_bh)
205 break;
206 memset(r1_bh, 0, sizeof(*r1_bh));
208 md_spin_lock_irq(&conf->device_lock);
209 set_bit(R1BH_PreAlloc, &r1_bh->state);
210 r1_bh->next_r1 = conf->freer1;
211 conf->freer1 = r1_bh;
212 md_spin_unlock_irq(&conf->device_lock);
214 i++;
216 return i;
219 static void raid1_shrink_r1bh(raid1_conf_t *conf)
221 md_spin_lock_irq(&conf->device_lock);
222 while (conf->freer1) {
223 struct raid1_bh *r1_bh = conf->freer1;
224 conf->freer1 = r1_bh->next_r1;
225 kfree(r1_bh);
227 md_spin_unlock_irq(&conf->device_lock);
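/*
 * Return a sync buffer (a raid1_bh with an attached page) to the
 * conf->freebuf list and free its chain of mirror buffer_heads.
 */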
232 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
234 unsigned long flags;
235 struct buffer_head *bh = r1_bh->mirror_bh_list;
236 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
237 r1_bh->mirror_bh_list = NULL;
239 spin_lock_irqsave(&conf->device_lock, flags);
240 r1_bh->next_r1 = conf->freebuf;
241 conf->freebuf = r1_bh;
242 spin_unlock_irqrestore(&conf->device_lock, flags);
243 raid1_free_bh(conf, bh);
246 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
248 struct raid1_bh *r1_bh;
250 md_spin_lock_irq(&conf->device_lock);
251 wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
252 r1_bh = conf->freebuf;
253 conf->freebuf = r1_bh->next_r1;
254 r1_bh->next_r1= NULL;
255 md_spin_unlock_irq(&conf->device_lock);
257 return r1_bh;
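/*
 * Pre-allocate up to cnt sync buffers, each a raid1_bh with its own
 * page, for use by raid1_sync_request(); returns how many were
 * actually allocated.
 */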
260 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
262 int i = 0;
264 md_spin_lock_irq(&conf->device_lock);
265 while (i < cnt) {
266 struct raid1_bh *r1_bh;
267 struct page *page;
269 page = alloc_page(GFP_KERNEL);
270 if (!page)
271 break;
273 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
274 if (!r1_bh) {
275 __free_page(page);
276 break;
278 memset(r1_bh, 0, sizeof(*r1_bh));
279 r1_bh->bh_req.b_page = page;
280 r1_bh->bh_req.b_data = page_address(page);
281 r1_bh->next_r1 = conf->freebuf;
282 conf->freebuf = r1_bh;
283 i++;
285 md_spin_unlock_irq(&conf->device_lock);
286 return i;
289 static void raid1_shrink_buffers (raid1_conf_t *conf)
291 md_spin_lock_irq(&conf->device_lock);
292 while (conf->freebuf) {
293 struct raid1_bh *r1_bh = conf->freebuf;
294 conf->freebuf = r1_bh->next_r1;
295 __free_page(r1_bh->bh_req.b_page);
296 kfree(r1_bh);
298 md_spin_unlock_irq(&conf->device_lock);
301 static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size)
303 raid1_conf_t *conf = mddev_to_conf(mddev);
304 int i, disks = MD_SB_DISKS;
307 * Later we do read balancing on the read side
308 * now we use the first available disk.
311 for (i = 0; i < disks; i++) {
312 if (conf->mirrors[i].operational) {
313 *rdev = conf->mirrors[i].dev;
314 return (0);
318 printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
319 return (-1);
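/*
 * Put a request on the global retry list and wake the raid1d thread,
 * which will either redirect the read to another mirror or, for
 * resync requests, schedule the mirrored writes.
 */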
322 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
324 unsigned long flags;
325 mddev_t *mddev = r1_bh->mddev;
326 raid1_conf_t *conf = mddev_to_conf(mddev);
328 md_spin_lock_irqsave(&retry_list_lock, flags);
329 if (raid1_retry_list == NULL)
330 raid1_retry_tail = &raid1_retry_list;
331 *raid1_retry_tail = r1_bh;
332 raid1_retry_tail = &r1_bh->next_r1;
333 r1_bh->next_r1 = NULL;
334 md_spin_unlock_irqrestore(&retry_list_lock, flags);
335 md_wakeup_thread(conf->thread);
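/*
 * A normal I/O request has completed: drop the counter for the resync
 * segment it was issued in (see the section description above
 * raid1_sync_request) and wake up anyone waiting for the PENDING
 * segment to drain.
 */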
339 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
341 unsigned long flags;
342 spin_lock_irqsave(&conf->segment_lock, flags);
343 if (sector < conf->start_active)
344 conf->cnt_done--;
345 else if (sector >= conf->start_future && conf->phase == phase)
346 conf->cnt_future--;
347 else if (!--conf->cnt_pending)
348 wake_up(&conf->wait_ready);
350 spin_unlock_irqrestore(&conf->segment_lock, flags);
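/*
 * A resync request has completed: decrement cnt_ready or cnt_active
 * depending on which segment the sector fell in, and advance
 * start_active once the ACTIVE segment has fully drained.
 */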
353 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
355 unsigned long flags;
356 spin_lock_irqsave(&conf->segment_lock, flags);
357 if (sector >= conf->start_ready)
358 --conf->cnt_ready;
359 else if (sector >= conf->start_active) {
360 if (!--conf->cnt_active) {
361 conf->start_active = conf->start_ready;
362 wake_up(&conf->wait_done);
365 spin_unlock_irqrestore(&conf->segment_lock, flags);
369 * raid1_end_bh_io() is called when we have finished servicing a mirrored
370 * operation and are ready to return a success/failure code to the buffer
371 * cache layer.
373 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
375 struct buffer_head *bh = r1_bh->master_bh;
377 io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
378 test_bit(R1BH_SyncPhase, &r1_bh->state));
380 bh->b_end_io(bh, uptodate);
381 raid1_free_r1bh(r1_bh);
383 void raid1_end_request (struct buffer_head *bh, int uptodate)
385 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
388 * this branch is our 'one mirror IO has finished' event handler:
390 if (!uptodate)
391 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
392 else
394 * Set R1BH_Uptodate in our master buffer_head, so that
395 * we will return a good error code to the higher
396 * levels even if IO on some other mirrored buffer fails.
398 * The 'master' represents the complex operation to
399 * user-side. So if something waits for IO, then it will
400 * wait for the 'master' buffer_head.
402 set_bit (R1BH_Uptodate, &r1_bh->state);
405 * We split up the read and write side, imho they are
406 * conceptually different.
409 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
411 * we have only one buffer_head on the read side
414 if (uptodate) {
415 raid1_end_bh_io(r1_bh, uptodate);
416 return;
419 * oops, read error:
421 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
422 partition_name(bh->b_dev), bh->b_blocknr);
423 raid1_reschedule_retry(r1_bh);
424 return;
428 * WRITE:
430 * Let's see if all mirrored write operations have finished
431 * already.
434 if (atomic_dec_and_test(&r1_bh->remaining))
435 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
439 * This routine returns the disk from which the requested read should
440 * be done. It keeps track of the last read position for every disk
441 * in the array, and when a new read request comes in, the disk whose
442 * last position is nearest to the request is chosen.
444 * TODO: now if there are 2 mirrors in the same 2 devices, performance
445 * degrades dramatically because position is mirror, not device based.
446 * This should be changed to be device based. Also atomic sequential
447 * reads should be somehow balanced.
450 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
452 int new_disk = conf->last_used;
453 const int sectors = bh->b_size >> 9;
454 const unsigned long this_sector = bh->b_rsector;
455 int disk = new_disk;
456 unsigned long new_distance;
457 unsigned long current_distance;
460 * Check if it is sane at all to balance
463 if (conf->resync_mirrors)
464 goto rb_out;
467 /* make sure that disk is operational */
468 while( !conf->mirrors[new_disk].operational) {
469 if (new_disk <= 0) new_disk = conf->raid_disks;
470 new_disk--;
471 if (new_disk == disk) {
473 * This means no working disk was found
474 * Nothing much to do, lets not change anything
475 * and hope for the best...
478 new_disk = conf->last_used;
480 goto rb_out;
483 disk = new_disk;
484 /* now disk == new_disk == starting point for search */
487 * Don't touch anything for sequential reads.
490 if (this_sector == conf->mirrors[new_disk].head_position)
491 goto rb_out;
494 * If reads have been done only on a single disk
495 * for a while, let's give another disk a chance.
496 * This kicks the idling disks so that they
497 * find work near some hotspot.
500 if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
501 conf->sect_count = 0;
503 do {
504 if (new_disk<=0)
505 new_disk = conf->raid_disks;
506 new_disk--;
507 if (new_disk == disk)
508 break;
509 } while ((conf->mirrors[new_disk].write_only) ||
510 (!conf->mirrors[new_disk].operational));
512 goto rb_out;
515 current_distance = abs(this_sector -
516 conf->mirrors[disk].head_position);
518 /* Find the disk which is closest */
520 do {
521 if (disk <= 0)
522 disk = conf->raid_disks;
523 disk--;
525 if ((conf->mirrors[disk].write_only) ||
526 (!conf->mirrors[disk].operational))
527 continue;
529 new_distance = abs(this_sector -
530 conf->mirrors[disk].head_position);
532 if (new_distance < current_distance) {
533 conf->sect_count = 0;
534 current_distance = new_distance;
535 new_disk = disk;
537 } while (disk != conf->last_used);
539 rb_out:
540 conf->mirrors[new_disk].head_position = this_sector + sectors;
542 conf->last_used = new_disk;
543 conf->sect_count += sectors;
545 return new_disk;
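/*
 * Main request entry point. Reads are sent to a single mirror chosen
 * by raid1_read_balance(); writes are cloned into one buffer_head per
 * operational mirror and submitted to all of them, with completion
 * tracked through r1_bh->remaining.
 */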
548 static int raid1_make_request (mddev_t *mddev, int rw,
549 struct buffer_head * bh)
551 raid1_conf_t *conf = mddev_to_conf(mddev);
552 struct buffer_head *bh_req, *bhl;
553 struct raid1_bh * r1_bh;
554 int disks = MD_SB_DISKS;
555 int i, sum_bhs = 0, sectors;
556 struct mirror_info *mirror;
558 if (!buffer_locked(bh))
559 BUG();
562 * make_request() can abort the operation when READA is being
563 * used and no empty request is available.
565 * Currently, just replace the command with READ/WRITE.
567 if (rw == READA)
568 rw = READ;
570 r1_bh = raid1_alloc_r1bh (conf);
572 spin_lock_irq(&conf->segment_lock);
573 wait_event_lock_irq(conf->wait_done,
574 bh->b_rsector < conf->start_active ||
575 bh->b_rsector >= conf->start_future,
576 conf->segment_lock);
577 if (bh->b_rsector < conf->start_active)
578 conf->cnt_done++;
579 else {
580 conf->cnt_future++;
581 if (conf->phase)
582 set_bit(R1BH_SyncPhase, &r1_bh->state);
584 spin_unlock_irq(&conf->segment_lock);
587 * I think the read and write branch should be separated completely,
588 * since we want to do read balancing on the read side for example.
589 * Alternative implementations? :) --mingo
592 r1_bh->master_bh = bh;
593 r1_bh->mddev = mddev;
594 r1_bh->cmd = rw;
596 sectors = bh->b_size >> 9;
597 if (rw == READ) {
599 * read balancing logic:
601 mirror = conf->mirrors + raid1_read_balance(conf, bh);
603 bh_req = &r1_bh->bh_req;
604 memcpy(bh_req, bh, sizeof(*bh));
605 bh_req->b_blocknr = bh->b_rsector / sectors;
606 bh_req->b_dev = mirror->dev;
607 bh_req->b_rdev = mirror->dev;
608 /* bh_req->b_rsector = bh->n_rsector; */
609 bh_req->b_end_io = raid1_end_request;
610 bh_req->b_private = r1_bh;
611 generic_make_request (rw, bh_req);
612 return 0;
616 * WRITE:
619 bhl = raid1_alloc_bh(conf, conf->raid_disks);
620 for (i = 0; i < disks; i++) {
621 struct buffer_head *mbh;
622 if (!conf->mirrors[i].operational)
623 continue;
626 * We should use a private pool (size depending on NR_REQUEST),
627 * to avoid writes filling up the memory with bhs
629 * Such pools are much faster than kmalloc anyway (so we waste
630 * almost nothing by not using the master bh when writing and
631 * win a lot of cleanness) but for now we are cool enough. --mingo
633 * It's safe to sleep here, buffer heads cannot be used in a shared
634 * manner in the write branch. Look how we lock the buffer at the
635 * beginning of this function to grok the difference ;)
637 mbh = bhl;
638 if (mbh == NULL) {
639 MD_BUG();
640 break;
642 bhl = mbh->b_next;
643 mbh->b_next = NULL;
644 mbh->b_this_page = (struct buffer_head *)1;
647 * prepare mirrored mbh (fields ordered for max mem throughput):
649 mbh->b_blocknr = bh->b_rsector / sectors;
650 mbh->b_dev = conf->mirrors[i].dev;
651 mbh->b_rdev = conf->mirrors[i].dev;
652 mbh->b_rsector = bh->b_rsector;
653 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
654 (1<<BH_Mapped) | (1<<BH_Lock);
656 atomic_set(&mbh->b_count, 1);
657 mbh->b_size = bh->b_size;
658 mbh->b_page = bh->b_page;
659 mbh->b_data = bh->b_data;
660 mbh->b_list = BUF_LOCKED;
661 mbh->b_end_io = raid1_end_request;
662 mbh->b_private = r1_bh;
664 mbh->b_next = r1_bh->mirror_bh_list;
665 r1_bh->mirror_bh_list = mbh;
666 sum_bhs++;
668 if (bhl) raid1_free_bh(conf,bhl);
669 md_atomic_set(&r1_bh->remaining, sum_bhs);
672 * We have to be a bit careful about the 'remaining' count above, that's
673 * why we start the requests separately. Since kmalloc() could
674 * fail and make_request() can sleep too, this is the
675 * safer solution. Imagine end_request decreasing the count
676 * before we could have set it up ... We could play tricks with
677 * the count (presetting it and correcting at the end if
678 * sum_bhs is not 'n', but then we would have to do end_request by hand
679 * if all requests finished before we had a chance to set up the
680 * count correctly ... lots of races).
682 bh = r1_bh->mirror_bh_list;
683 while(bh) {
684 struct buffer_head *bh2 = bh;
685 bh = bh->b_next;
686 generic_make_request(rw, bh2);
688 return (0);
691 static int raid1_status (char *page, mddev_t *mddev)
693 raid1_conf_t *conf = mddev_to_conf(mddev);
694 int sz = 0, i;
696 sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
697 conf->working_disks);
698 for (i = 0; i < conf->raid_disks; i++)
699 sz += sprintf (page+sz, "%s",
700 conf->mirrors[i].operational ? "U" : "_");
701 sz += sprintf (page+sz, "]");
702 return sz;
705 #define LAST_DISK KERN_ALERT \
706 "raid1: only one disk left and IO error.\n"
708 #define NO_SPARE_DISK KERN_ALERT \
709 "raid1: no spare disk left, degrading mirror level by one.\n"
711 #define DISK_FAILED KERN_ALERT \
712 "raid1: Disk failure on %s, disabling device. \n" \
713 " Operation continuing on %d devices\n"
715 #define START_SYNCING KERN_ALERT \
716 "raid1: start syncing spare disk.\n"
718 #define ALREADY_SYNCING KERN_INFO \
719 "raid1: syncing already in progress.\n"
721 static void mark_disk_bad (mddev_t *mddev, int failed)
723 raid1_conf_t *conf = mddev_to_conf(mddev);
724 struct mirror_info *mirror = conf->mirrors+failed;
725 mdp_super_t *sb = mddev->sb;
727 mirror->operational = 0;
728 mark_disk_faulty(sb->disks+mirror->number);
729 mark_disk_nonsync(sb->disks+mirror->number);
730 mark_disk_inactive(sb->disks+mirror->number);
731 sb->active_disks--;
732 sb->working_disks--;
733 sb->failed_disks++;
734 mddev->sb_dirty = 1;
735 md_wakeup_thread(conf->thread);
736 conf->working_disks--;
737 printk (DISK_FAILED, partition_name (mirror->dev),
738 conf->working_disks);
741 static int raid1_error (mddev_t *mddev, kdev_t dev)
743 raid1_conf_t *conf = mddev_to_conf(mddev);
744 struct mirror_info * mirrors = conf->mirrors;
745 int disks = MD_SB_DISKS;
746 int i;
748 if (conf->working_disks == 1) {
750 * Uh oh, we can do nothing if this is our last disk, but
751 * first check if this is a queued request for a device
752 * which has just failed.
754 for (i = 0; i < disks; i++) {
755 if (mirrors[i].dev==dev && !mirrors[i].operational)
756 return 0;
758 printk (LAST_DISK);
759 } else {
761 * Mark disk as unusable
763 for (i = 0; i < disks; i++) {
764 if (mirrors[i].dev==dev && mirrors[i].operational) {
765 mark_disk_bad(mddev, i);
766 break;
770 return 0;
773 #undef LAST_DISK
774 #undef NO_SPARE_DISK
775 #undef DISK_FAILED
776 #undef START_SYNCING
779 static void print_raid1_conf (raid1_conf_t *conf)
781 int i;
782 struct mirror_info *tmp;
784 printk("RAID1 conf printout:\n");
785 if (!conf) {
786 printk("(conf==NULL)\n");
787 return;
789 printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
790 conf->raid_disks, conf->nr_disks);
792 for (i = 0; i < MD_SB_DISKS; i++) {
793 tmp = conf->mirrors + i;
794 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
795 i, tmp->spare,tmp->operational,
796 tmp->number,tmp->raid_disk,tmp->used_slot,
797 partition_name(tmp->dev));
801 static void close_sync(raid1_conf_t *conf)
803 mddev_t *mddev = conf->mddev;
804 /* If reconstruction was interrupted, we need to close the "active" and "pending"
805 * holes.
806 * We know that there are no active rebuild requests, so cnt_active == cnt_ready == 0.
808 /* this is really needed when recovery stops too... */
809 spin_lock_irq(&conf->segment_lock);
810 conf->start_active = conf->start_pending;
811 conf->start_ready = conf->start_pending;
812 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
813 conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
814 conf->start_future = mddev->sb->size+1;
815 conf->cnt_pending = conf->cnt_future;
816 conf->cnt_future = 0;
817 conf->phase = conf->phase ^1;
818 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
819 conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
820 conf->phase = 0;
821 conf->cnt_future = conf->cnt_done;
822 conf->cnt_done = 0;
823 spin_unlock_irq(&conf->segment_lock);
824 wake_up(&conf->wait_done);
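/*
 * Perform a disk state transition requested by the md core: activate
 * or deactivate a spare (DISKOP_SPARE_*), or hot-add/hot-remove a
 * disk. The interesting case is DISKOP_SPARE_ACTIVE, where the spare
 * swaps descriptors and raid positions with the failed disk it
 * replaces.
 */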
827 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
829 int err = 0;
830 int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
831 raid1_conf_t *conf = mddev->private;
832 struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
833 mdp_super_t *sb = mddev->sb;
834 mdp_disk_t *failed_desc, *spare_desc, *added_desc;
836 print_raid1_conf(conf);
837 md_spin_lock_irq(&conf->device_lock);
839 * find the disk ...
841 switch (state) {
843 case DISKOP_SPARE_ACTIVE:
846 * Find the failed disk within the RAID1 configuration ...
847 * (this can only be in the first conf->working_disks part)
849 for (i = 0; i < conf->raid_disks; i++) {
850 tmp = conf->mirrors + i;
851 if ((!tmp->operational && !tmp->spare) ||
852 !tmp->used_slot) {
853 failed_disk = i;
854 break;
858 * When we activate a spare disk we _must_ have a disk in
859 * the lower (active) part of the array to replace.
861 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
862 MD_BUG();
863 err = 1;
864 goto abort;
866 /* fall through */
868 case DISKOP_SPARE_WRITE:
869 case DISKOP_SPARE_INACTIVE:
872 * Find the spare disk ... (can only be in the 'high'
873 * area of the array)
875 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
876 tmp = conf->mirrors + i;
877 if (tmp->spare && tmp->number == (*d)->number) {
878 spare_disk = i;
879 break;
882 if (spare_disk == -1) {
883 MD_BUG();
884 err = 1;
885 goto abort;
887 break;
889 case DISKOP_HOT_REMOVE_DISK:
891 for (i = 0; i < MD_SB_DISKS; i++) {
892 tmp = conf->mirrors + i;
893 if (tmp->used_slot && (tmp->number == (*d)->number)) {
894 if (tmp->operational) {
895 err = -EBUSY;
896 goto abort;
898 removed_disk = i;
899 break;
902 if (removed_disk == -1) {
903 MD_BUG();
904 err = 1;
905 goto abort;
907 break;
909 case DISKOP_HOT_ADD_DISK:
911 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
912 tmp = conf->mirrors + i;
913 if (!tmp->used_slot) {
914 added_disk = i;
915 break;
918 if (added_disk == -1) {
919 MD_BUG();
920 err = 1;
921 goto abort;
923 break;
926 switch (state) {
928 * Switch the spare disk to write-only mode:
930 case DISKOP_SPARE_WRITE:
931 sdisk = conf->mirrors + spare_disk;
932 sdisk->operational = 1;
933 sdisk->write_only = 1;
934 break;
936 * Deactivate a spare disk:
938 case DISKOP_SPARE_INACTIVE:
939 close_sync(conf);
940 sdisk = conf->mirrors + spare_disk;
941 sdisk->operational = 0;
942 sdisk->write_only = 0;
943 break;
945 * Activate (mark read-write) the (now sync) spare disk,
946 * which means we switch its 'raid position' (->raid_disk)
947 * with the failed disk. (only the first 'conf->nr_disks'
948 * slots are used for 'real' disks and we must preserve this
949 * property)
951 case DISKOP_SPARE_ACTIVE:
952 close_sync(conf);
953 sdisk = conf->mirrors + spare_disk;
954 fdisk = conf->mirrors + failed_disk;
956 spare_desc = &sb->disks[sdisk->number];
957 failed_desc = &sb->disks[fdisk->number];
959 if (spare_desc != *d) {
960 MD_BUG();
961 err = 1;
962 goto abort;
965 if (spare_desc->raid_disk != sdisk->raid_disk) {
966 MD_BUG();
967 err = 1;
968 goto abort;
971 if (sdisk->raid_disk != spare_disk) {
972 MD_BUG();
973 err = 1;
974 goto abort;
977 if (failed_desc->raid_disk != fdisk->raid_disk) {
978 MD_BUG();
979 err = 1;
980 goto abort;
983 if (fdisk->raid_disk != failed_disk) {
984 MD_BUG();
985 err = 1;
986 goto abort;
990 * do the switch finally
992 xchg_values(*spare_desc, *failed_desc);
993 xchg_values(*fdisk, *sdisk);
996 * (careful, 'failed' and 'spare' are switched from now on)
998 * we want to preserve linear numbering and we want to
999 * give the proper raid_disk number to the now activated
1000 * disk. (this means we switch back these values)
1003 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1004 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1005 xchg_values(spare_desc->number, failed_desc->number);
1006 xchg_values(sdisk->number, fdisk->number);
1008 *d = failed_desc;
1010 if (sdisk->dev == MKDEV(0,0))
1011 sdisk->used_slot = 0;
1013 * this really activates the spare.
1015 fdisk->spare = 0;
1016 fdisk->write_only = 0;
1019 * if we activate a spare, we definitely replace a
1020 * non-operational disk slot in the 'low' area of
1021 * the disk array.
1024 conf->working_disks++;
1026 break;
1028 case DISKOP_HOT_REMOVE_DISK:
1029 rdisk = conf->mirrors + removed_disk;
1031 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1032 MD_BUG();
1033 err = 1;
1034 goto abort;
1036 rdisk->dev = MKDEV(0,0);
1037 rdisk->used_slot = 0;
1038 conf->nr_disks--;
1039 break;
1041 case DISKOP_HOT_ADD_DISK:
1042 adisk = conf->mirrors + added_disk;
1043 added_desc = *d;
1045 if (added_disk != added_desc->number) {
1046 MD_BUG();
1047 err = 1;
1048 goto abort;
1051 adisk->number = added_desc->number;
1052 adisk->raid_disk = added_desc->raid_disk;
1053 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1055 adisk->operational = 0;
1056 adisk->write_only = 0;
1057 adisk->spare = 1;
1058 adisk->used_slot = 1;
1059 adisk->head_position = 0;
1060 conf->nr_disks++;
1062 break;
1064 default:
1065 MD_BUG();
1066 err = 1;
1067 goto abort;
1069 abort:
1070 md_spin_unlock_irq(&conf->device_lock);
1071 if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1072 /* should move to "END_REBUILD" when such exists */
1073 raid1_shrink_buffers(conf);
1075 print_raid1_conf(conf);
1076 return err;
1080 #define IO_ERROR KERN_ALERT \
1081 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1083 #define REDIRECT_SECTOR KERN_ERR \
1084 "raid1: %s: redirecting sector %lu to another mirror\n"
1087 * This is a kernel thread which:
1089 * 1. Retries failed read operations on working mirrors.
1090 * 2. Updates the raid superblock when problems are encountered.
1091 * 3. Performs writes following reads for array synchronising.
1093 static void end_sync_write(struct buffer_head *bh, int uptodate);
1094 static void end_sync_read(struct buffer_head *bh, int uptodate);
1096 static void raid1d (void *data)
1098 struct raid1_bh *r1_bh;
1099 struct buffer_head *bh;
1100 unsigned long flags;
1101 mddev_t *mddev;
1102 kdev_t dev;
1105 for (;;) {
1106 md_spin_lock_irqsave(&retry_list_lock, flags);
1107 r1_bh = raid1_retry_list;
1108 if (!r1_bh)
1109 break;
1110 raid1_retry_list = r1_bh->next_r1;
1111 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1113 mddev = r1_bh->mddev;
1114 if (mddev->sb_dirty) {
1115 printk(KERN_INFO "dirty sb detected, updating.\n");
1116 mddev->sb_dirty = 0;
1117 md_update_sb(mddev);
1119 bh = &r1_bh->bh_req;
1120 switch(r1_bh->cmd) {
1121 case SPECIAL:
1122 /* have to allocate lots of bh structures and
1123 * schedule writes
1125 if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1126 int i, sum_bhs = 0;
1127 int disks = MD_SB_DISKS;
1128 struct buffer_head *bhl, *mbh;
1129 raid1_conf_t *conf;
1130 int sectors = bh->b_size >> 9;
1132 conf = mddev_to_conf(mddev);
1133 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1134 for (i = 0; i < disks ; i++) {
1135 if (!conf->mirrors[i].operational)
1136 continue;
1137 if (i==conf->last_used)
1138 /* we read from here, no need to write */
1139 continue;
1140 if (i < conf->raid_disks
1141 && !conf->resync_mirrors)
1142 /* don't need to write this,
1143 * we are just rebuilding */
1144 continue;
1145 mbh = bhl;
1146 if (!mbh) {
1147 MD_BUG();
1148 break;
1150 bhl = mbh->b_next;
1151 mbh->b_this_page = (struct buffer_head *)1;
1155 * prepare mirrored bh (fields ordered for max mem throughput):
1157 mbh->b_blocknr = bh->b_blocknr;
1158 mbh->b_dev = conf->mirrors[i].dev;
1159 mbh->b_rdev = conf->mirrors[i].dev;
1160 mbh->b_rsector = bh->b_blocknr * sectors;
1161 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
1162 (1<<BH_Mapped) | (1<<BH_Lock);
1163 atomic_set(&mbh->b_count, 1);
1164 mbh->b_size = bh->b_size;
1165 mbh->b_page = bh->b_page;
1166 mbh->b_data = bh->b_data;
1167 mbh->b_list = BUF_LOCKED;
1168 mbh->b_end_io = end_sync_write;
1169 mbh->b_private = r1_bh;
1171 mbh->b_next = r1_bh->mirror_bh_list;
1172 r1_bh->mirror_bh_list = mbh;
1174 sum_bhs++;
1176 md_atomic_set(&r1_bh->remaining, sum_bhs);
1177 if (bhl) raid1_free_bh(conf, bhl);
1178 mbh = r1_bh->mirror_bh_list;
1179 while (mbh) {
1180 struct buffer_head *bh1 = mbh;
1181 mbh = mbh->b_next;
1182 generic_make_request(WRITE, bh1);
1183 md_sync_acct(bh1->b_dev, bh1->b_size/512);
1185 } else {
1186 dev = bh->b_dev;
1187 raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1188 if (bh->b_dev == dev) {
1189 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1190 md_done_sync(mddev, bh->b_size>>10, 0);
1191 } else {
1192 printk (REDIRECT_SECTOR,
1193 partition_name(bh->b_dev), bh->b_blocknr);
1194 bh->b_rdev = bh->b_dev;
1195 generic_make_request(READ, bh);
1199 break;
1200 case READ:
1201 case READA:
1202 dev = bh->b_dev;
1204 raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
1205 if (bh->b_dev == dev) {
1206 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1207 raid1_end_bh_io(r1_bh, 0);
1208 } else {
1209 printk (REDIRECT_SECTOR,
1210 partition_name(bh->b_dev), bh->b_blocknr);
1211 bh->b_rdev = bh->b_dev;
1212 generic_make_request (r1_bh->cmd, bh);
1214 break;
1217 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1219 #undef IO_ERROR
1220 #undef REDIRECT_SECTOR
1223 * Private kernel thread to reconstruct mirrors after an unclean
1224 * shutdown.
1226 static void raid1syncd (void *data)
1228 raid1_conf_t *conf = data;
1229 mddev_t *mddev = conf->mddev;
1231 if (!conf->resync_mirrors)
1232 return;
1233 if (conf->resync_mirrors == 2)
1234 return;
1235 down(&mddev->recovery_sem);
1236 if (!md_do_sync(mddev, NULL)) {
1238 * Only if everything went Ok.
1240 conf->resync_mirrors = 0;
1243 close_sync(conf);
1245 up(&mddev->recovery_sem);
1246 raid1_shrink_buffers(conf);
1250 * perform a "sync" on one "block"
1252 * We need to make sure that no normal I/O request - particularly write
1253 * requests - conflict with active sync requests.
1254 * This is achieved by conceptually dividing the device space into a
1255 * number of sections:
1256 * DONE: 0 .. a-1 These blocks are in-sync
1257 * ACTIVE: a.. b-1 These blocks may have active sync requests, but
1258 * no normal IO requests
1259 * READY: b .. c-1 These blocks have no normal IO requests - sync
1260 * request may be happening
1261 * PENDING: c .. d-1 These blocks may have IO requests, but no new
1262 * ones will be added
1263 * FUTURE: d .. end These blocks are not to be considered yet. IO may
1264 * be happening, but not sync
1266 * We keep a
1267 * phase which flips (0 or 1) each time d moves and
1268 * a count of:
1269 * z = active io requests in FUTURE since d moved - marked with
1270 * current phase
1271 * y = active io requests in FUTURE before d moved, or PENDING -
1272 * marked with previous phase
1273 * x = active sync requests in READY
1274 * w = active sync requests in ACTIVE
1275 * v = active io requests in DONE
1277 * Normally, a=b=c=d=0 and z= active io requests
1278 * or a=b=c=d=END and v= active io requests
1279 * Allowed changes to a,b,c,d:
1280 * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
1281 * B: y==0 -> c=d
1282 * C: b=c, w+=x, x=0
1283 * D: w==0 -> a=b
1284 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1286 * At start of sync we apply A.
1287 * When y reaches 0, we apply B then A, then begin sync requests.
1288 * When the sync point reaches c-1, we wait for y==0 and w==0, and
1289 * then apply B then A then D then C.
1290 * Finally, we apply E
1292 * The sync request simply issues a "read" against a working drive
1293 * This is marked so that on completion the raid1d thread is woken to
1294 * issue suitable write requests
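/*
 * Concretely: at the start of a resync with window W the pointers are
 * advanced until the first block falls below start_pending, giving
 * READY = [0,W) and PENDING = [W,2W). Sync reads are only issued
 * inside READY, while raid1_make_request() lets normal I/O through
 * only below start_active or at/after start_future, so normal and
 * sync requests never touch the same blocks at the same time.
 */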
1297 static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr)
1299 raid1_conf_t *conf = mddev_to_conf(mddev);
1300 struct mirror_info *mirror;
1301 struct raid1_bh *r1_bh;
1302 struct buffer_head *bh;
1303 int bsize;
1304 int disk;
1306 spin_lock_irq(&conf->segment_lock);
1307 if (!block_nr) {
1308 /* initialize ...*/
1309 int buffs;
1310 conf->start_active = 0;
1311 conf->start_ready = 0;
1312 conf->start_pending = 0;
1313 conf->start_future = 0;
1314 conf->phase = 0;
1315 /* we want enough buffers to hold twice the window of 128 */
1316 buffs = 128 *2 / (PAGE_SIZE>>9);
1317 buffs = raid1_grow_buffers(conf, buffs);
1318 if (buffs < 2)
1319 goto nomem;
1321 conf->window = buffs*(PAGE_SIZE>>9)/2;
1322 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1323 conf->cnt_done = conf->cnt_pending = 0;
1324 if (conf->cnt_ready || conf->cnt_active)
1325 MD_BUG();
1327 while ((block_nr<<1) >= conf->start_pending) {
1328 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1329 block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1330 conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1331 wait_event_lock_irq(conf->wait_done,
1332 !conf->cnt_active,
1333 conf->segment_lock);
1334 wait_event_lock_irq(conf->wait_ready,
1335 !conf->cnt_pending,
1336 conf->segment_lock);
1337 conf->start_active = conf->start_ready;
1338 conf->start_ready = conf->start_pending;
1339 conf->start_pending = conf->start_future;
1340 conf->start_future = conf->start_future+conf->window;
1341 // Note: falling off the end is not a problem
1342 conf->phase = conf->phase ^1;
1343 conf->cnt_active = conf->cnt_ready;
1344 conf->cnt_ready = 0;
1345 conf->cnt_pending = conf->cnt_future;
1346 conf->cnt_future = 0;
1347 wake_up(&conf->wait_done);
1349 conf->cnt_ready++;
1350 spin_unlock_irq(&conf->segment_lock);
1353 /* If reconstructing, and >1 working disk, we
1354 * could dedicate one to rebuild and the others to
1355 * service read requests ..
1357 disk = conf->last_used;
1358 /* make sure disk is operational */
1359 while (!conf->mirrors[disk].operational) {
1360 if (disk <= 0) disk = conf->raid_disks;
1361 disk--;
1362 if (disk == conf->last_used)
1363 break;
1365 conf->last_used = disk;
1367 mirror = conf->mirrors+conf->last_used;
1369 r1_bh = raid1_alloc_buf (conf);
1370 r1_bh->master_bh = NULL;
1371 r1_bh->mddev = mddev;
1372 r1_bh->cmd = SPECIAL;
1373 bh = &r1_bh->bh_req;
1375 bh->b_blocknr = block_nr;
1376 bsize = 1024;
1377 while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE
1378 && (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) {
1379 bh->b_blocknr >>= 1;
1380 bsize <<= 1;
1382 bh->b_size = bsize;
1383 bh->b_list = BUF_LOCKED;
1384 bh->b_dev = mirror->dev;
1385 bh->b_rdev = mirror->dev;
1386 bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1387 if (!bh->b_page)
1388 BUG();
1389 if (!bh->b_data)
1390 BUG();
1391 if (bh->b_data != page_address(bh->b_page))
1392 BUG();
1393 bh->b_end_io = end_sync_read;
1394 bh->b_private = r1_bh;
1395 bh->b_rsector = block_nr<<1;
1396 init_waitqueue_head(&bh->b_wait);
1398 generic_make_request(READ, bh);
1399 md_sync_acct(bh->b_dev, bh->b_size/512);
1401 return (bsize >> 10);
1403 nomem:
1404 raid1_shrink_buffers(conf);
1405 spin_unlock_irq(&conf->segment_lock);
1406 return -ENOMEM;
1409 static void end_sync_read(struct buffer_head *bh, int uptodate)
1411 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1413 /* we have read a block, now it needs to be re-written,
1414 * or re-read if the read failed.
1415 * We don't do much here, just schedule handling by raid1d
1417 if (!uptodate)
1418 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1419 else
1420 set_bit(R1BH_Uptodate, &r1_bh->state);
1421 raid1_reschedule_retry(r1_bh);
1424 static void end_sync_write(struct buffer_head *bh, int uptodate)
1426 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1428 if (!uptodate)
1429 md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
1430 if (atomic_dec_and_test(&r1_bh->remaining)) {
1431 mddev_t *mddev = r1_bh->mddev;
1432 unsigned long sect = bh->b_blocknr * (bh->b_size>>9);
1433 int size = bh->b_size;
1434 raid1_free_buf(r1_bh);
1435 sync_request_done(sect, mddev_to_conf(mddev));
1436 md_done_sync(mddev,size>>10, uptodate);
1441 * This will catch the scenario in which one of the mirrors was
1442 * mounted as a normal device rather than as a part of a raid set.
1444 * check_consistency is very personality-dependent, eg. RAID5 cannot
1445 * do this check, it uses another method.
1447 static int __check_consistency (mddev_t *mddev, int row)
1449 raid1_conf_t *conf = mddev_to_conf(mddev);
1450 int disks = MD_SB_DISKS;
1451 kdev_t dev;
1452 struct buffer_head *bh = NULL;
1453 int i, rc = 0;
1454 char *buffer = NULL;
1456 for (i = 0; i < disks; i++) {
1457 printk("(checking disk %d)\n",i);
1458 if (!conf->mirrors[i].operational)
1459 continue;
1460 printk("(really checking disk %d)\n",i);
1461 dev = conf->mirrors[i].dev;
1462 set_blocksize(dev, 4096);
1463 if ((bh = bread(dev, row / 4, 4096)) == NULL)
1464 break;
1465 if (!buffer) {
1466 buffer = (char *) __get_free_page(GFP_KERNEL);
1467 if (!buffer)
1468 break;
1469 memcpy(buffer, bh->b_data, 4096);
1470 } else if (memcmp(buffer, bh->b_data, 4096)) {
1471 rc = 1;
1472 break;
1474 bforget(bh);
1475 fsync_dev(dev);
1476 invalidate_buffers(dev);
1477 bh = NULL;
1479 if (buffer)
1480 free_page((unsigned long) buffer);
1481 if (bh) {
1482 dev = bh->b_dev;
1483 bforget(bh);
1484 fsync_dev(dev);
1485 invalidate_buffers(dev);
1487 return rc;
1490 static int check_consistency (mddev_t *mddev)
1492 if (__check_consistency(mddev, 0))
1494 * we do not do this currently, as it's perfectly possible to
1495 * have an inconsistent array when it's freshly created. Only
1496 * newly written data has to be consistent.
1498 return 0;
1500 return 0;
1503 #define INVALID_LEVEL KERN_WARNING \
1504 "raid1: md%d: raid level not set to mirroring (%d)\n"
1506 #define NO_SB KERN_ERR \
1507 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1509 #define ERRORS KERN_ERR \
1510 "raid1: disabled mirror %s (errors detected)\n"
1512 #define NOT_IN_SYNC KERN_ERR \
1513 "raid1: disabled mirror %s (not in sync)\n"
1515 #define INCONSISTENT KERN_ERR \
1516 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1518 #define ALREADY_RUNNING KERN_ERR \
1519 "raid1: disabled mirror %s (mirror %d already operational)\n"
1521 #define OPERATIONAL KERN_INFO \
1522 "raid1: device %s operational as mirror %d\n"
1524 #define MEM_ERROR KERN_ERR \
1525 "raid1: couldn't allocate memory for md%d\n"
1527 #define SPARE KERN_INFO \
1528 "raid1: spare disk %s\n"
1530 #define NONE_OPERATIONAL KERN_ERR \
1531 "raid1: no operational mirrors for md%d\n"
1533 #define RUNNING_CKRAID KERN_ERR \
1534 "raid1: detected mirror differences -- running resync\n"
1536 #define ARRAY_IS_ACTIVE KERN_INFO \
1537 "raid1: raid set md%d active with %d out of %d mirrors\n"
1539 #define THREAD_ERROR KERN_ERR \
1540 "raid1: couldn't allocate thread for md%d\n"
1542 #define START_RESYNC KERN_WARNING \
1543 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1545 static int raid1_run (mddev_t *mddev)
1547 raid1_conf_t *conf;
1548 int i, j, disk_idx;
1549 struct mirror_info *disk;
1550 mdp_super_t *sb = mddev->sb;
1551 mdp_disk_t *descriptor;
1552 mdk_rdev_t *rdev;
1553 struct md_list_head *tmp;
1554 int start_recovery = 0;
1556 MOD_INC_USE_COUNT;
1558 if (sb->level != 1) {
1559 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1560 goto out;
1563 * copy the already verified devices into our private RAID1
1564 * bookkeeping area. [whatever we allocate in raid1_run(),
1565 * should be freed in raid1_stop()]
1568 conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1569 mddev->private = conf;
1570 if (!conf) {
1571 printk(MEM_ERROR, mdidx(mddev));
1572 goto out;
1574 memset(conf, 0, sizeof(*conf));
1576 ITERATE_RDEV(mddev,rdev,tmp) {
1577 if (rdev->faulty) {
1578 printk(ERRORS, partition_name(rdev->dev));
1579 } else {
1580 if (!rdev->sb) {
1581 MD_BUG();
1582 continue;
1585 if (rdev->desc_nr == -1) {
1586 MD_BUG();
1587 continue;
1589 descriptor = &sb->disks[rdev->desc_nr];
1590 disk_idx = descriptor->raid_disk;
1591 disk = conf->mirrors + disk_idx;
1593 if (disk_faulty(descriptor)) {
1594 disk->number = descriptor->number;
1595 disk->raid_disk = disk_idx;
1596 disk->dev = rdev->dev;
1597 disk->sect_limit = MAX_WORK_PER_DISK;
1598 disk->operational = 0;
1599 disk->write_only = 0;
1600 disk->spare = 0;
1601 disk->used_slot = 1;
1602 disk->head_position = 0;
1603 continue;
1605 if (disk_active(descriptor)) {
1606 if (!disk_sync(descriptor)) {
1607 printk(NOT_IN_SYNC,
1608 partition_name(rdev->dev));
1609 continue;
1611 if ((descriptor->number > MD_SB_DISKS) ||
1612 (disk_idx > sb->raid_disks)) {
1614 printk(INCONSISTENT,
1615 partition_name(rdev->dev));
1616 continue;
1618 if (disk->operational) {
1619 printk(ALREADY_RUNNING,
1620 partition_name(rdev->dev),
1621 disk_idx);
1622 continue;
1624 printk(OPERATIONAL, partition_name(rdev->dev),
1625 disk_idx);
1626 disk->number = descriptor->number;
1627 disk->raid_disk = disk_idx;
1628 disk->dev = rdev->dev;
1629 disk->sect_limit = MAX_WORK_PER_DISK;
1630 disk->operational = 1;
1631 disk->write_only = 0;
1632 disk->spare = 0;
1633 disk->used_slot = 1;
1634 disk->head_position = 0;
1635 conf->working_disks++;
1636 } else {
1638 * Must be a spare disk ..
1640 printk(SPARE, partition_name(rdev->dev));
1641 disk->number = descriptor->number;
1642 disk->raid_disk = disk_idx;
1643 disk->dev = rdev->dev;
1644 disk->sect_limit = MAX_WORK_PER_DISK;
1645 disk->operational = 0;
1646 disk->write_only = 0;
1647 disk->spare = 1;
1648 disk->used_slot = 1;
1649 disk->head_position = 0;
1652 conf->raid_disks = sb->raid_disks;
1653 conf->nr_disks = sb->nr_disks;
1654 conf->mddev = mddev;
1655 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1657 conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1658 init_waitqueue_head(&conf->wait_buffer);
1659 init_waitqueue_head(&conf->wait_done);
1660 init_waitqueue_head(&conf->wait_ready);
1662 if (!conf->working_disks) {
1663 printk(NONE_OPERATIONAL, mdidx(mddev));
1664 goto out_free_conf;
1668 /* pre-allocate some buffer_head structures.
1669 * As a minimum, 1 r1bh and raid_disks buffer_heads
1670 * would probably get us by in tight memory situations,
1671 * but a few more is probably a good idea.
1672 * For now, try 16 r1bh and 16*raid_disks bufferheads
1673 * This will allow at least 16 concurrent reads or writes
1674 * even if kmalloc starts failing
1676 if (raid1_grow_r1bh(conf, 16) < 16 ||
1677 raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
1678 printk(MEM_ERROR, mdidx(mddev));
1679 goto out_free_conf;
1682 for (i = 0; i < MD_SB_DISKS; i++) {
1684 descriptor = sb->disks+i;
1685 disk_idx = descriptor->raid_disk;
1686 disk = conf->mirrors + disk_idx;
1688 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1689 !disk->used_slot) {
1691 disk->number = descriptor->number;
1692 disk->raid_disk = disk_idx;
1693 disk->dev = MKDEV(0,0);
1695 disk->operational = 0;
1696 disk->write_only = 0;
1697 disk->spare = 0;
1698 disk->used_slot = 1;
1699 disk->head_position = 0;
1704 * find the first working one and use it as a starting point
1705 * to read balancing.
1707 for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1708 /* nothing */;
1709 conf->last_used = j;
1712 if (conf->working_disks != sb->raid_disks) {
1713 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1714 start_recovery = 1;
1717 if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
1719 * we do sanity checks even if the device says
1720 * it's clean ...
1722 if (check_consistency(mddev)) {
1723 printk(RUNNING_CKRAID);
1724 sb->state &= ~(1 << MD_SB_CLEAN);
1729 const char * name = "raid1d";
1731 conf->thread = md_register_thread(raid1d, conf, name);
1732 if (!conf->thread) {
1733 printk(THREAD_ERROR, mdidx(mddev));
1734 goto out_free_conf;
1738 if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1739 const char * name = "raid1syncd";
1741 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1742 if (!conf->resync_thread) {
1743 printk(THREAD_ERROR, mdidx(mddev));
1744 goto out_free_conf;
1747 printk(START_RESYNC, mdidx(mddev));
1748 conf->resync_mirrors = 1;
1749 md_wakeup_thread(conf->resync_thread);
1753 * Regenerate the "device is in sync with the raid set" bit for
1754 * each device.
1756 for (i = 0; i < MD_SB_DISKS; i++) {
1757 mark_disk_nonsync(sb->disks+i);
1758 for (j = 0; j < sb->raid_disks; j++) {
1759 if (!conf->mirrors[j].operational)
1760 continue;
1761 if (sb->disks[i].number == conf->mirrors[j].number)
1762 mark_disk_sync(sb->disks+i);
1765 sb->active_disks = conf->working_disks;
1767 if (start_recovery)
1768 md_recover_arrays();
1771 printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1773 * Ok, everything is just fine now
1775 return 0;
1777 out_free_conf:
1778 raid1_shrink_r1bh(conf);
1779 raid1_shrink_bh(conf, conf->freebh_cnt);
1780 raid1_shrink_buffers(conf);
1781 kfree(conf);
1782 mddev->private = NULL;
1783 out:
1784 MOD_DEC_USE_COUNT;
1785 return -EIO;
1788 #undef INVALID_LEVEL
1789 #undef NO_SB
1790 #undef ERRORS
1791 #undef NOT_IN_SYNC
1792 #undef INCONSISTENT
1793 #undef ALREADY_RUNNING
1794 #undef OPERATIONAL
1795 #undef SPARE
1796 #undef NONE_OPERATIONAL
1797 #undef RUNNING_CKRAID
1798 #undef ARRAY_IS_ACTIVE
1800 static int raid1_stop_resync (mddev_t *mddev)
1802 raid1_conf_t *conf = mddev_to_conf(mddev);
1804 if (conf->resync_thread) {
1805 if (conf->resync_mirrors) {
1806 conf->resync_mirrors = 2;
1807 md_interrupt_thread(conf->resync_thread);
1809 printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1810 return 1;
1812 return 0;
1814 return 0;
1817 static int raid1_restart_resync (mddev_t *mddev)
1819 raid1_conf_t *conf = mddev_to_conf(mddev);
1821 if (conf->resync_mirrors) {
1822 if (!conf->resync_thread) {
1823 MD_BUG();
1824 return 0;
1826 conf->resync_mirrors = 1;
1827 md_wakeup_thread(conf->resync_thread);
1828 return 1;
1830 return 0;
1833 static int raid1_stop (mddev_t *mddev)
1835 raid1_conf_t *conf = mddev_to_conf(mddev);
1837 md_unregister_thread(conf->thread);
1838 if (conf->resync_thread)
1839 md_unregister_thread(conf->resync_thread);
1840 raid1_shrink_r1bh(conf);
1841 raid1_shrink_bh(conf, conf->freebh_cnt);
1842 raid1_shrink_buffers(conf);
1843 kfree(conf);
1844 mddev->private = NULL;
1845 MOD_DEC_USE_COUNT;
1846 return 0;
1849 static mdk_personality_t raid1_personality=
1851 name: "raid1",
1852 make_request: raid1_make_request,
1853 run: raid1_run,
1854 stop: raid1_stop,
1855 status: raid1_status,
1856 error_handler: raid1_error,
1857 diskop: raid1_diskop,
1858 stop_resync: raid1_stop_resync,
1859 restart_resync: raid1_restart_resync,
1860 sync_request: raid1_sync_request
1863 static int md__init raid1_init (void)
1865 return register_md_personality (RAID1, &raid1_personality);
1868 static void raid1_exit (void)
1870 unregister_md_personality (RAID1);
1873 module_init(raid1_init);
1874 module_exit(raid1_exit);