/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/module.h>
#include <linux/malloc.h>
#include <linux/raid/raid1.h>
#include <asm/atomic.h>

#define MAJOR_NR MD_MAJOR
#define MD_PERSONALITY

#define MAX_WORK_PER_DISK 128

/*
 * The following can be used to debug the driver
 */
#define RAID1_DEBUG	0

#if RAID1_DEBUG
#define PRINTK(x...)	printk(x)
#else
#define PRINTK(x...)	do { } while (0)
#endif
static mdk_personality_t raid1_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
{
	/* return a linked list of "cnt" struct buffer_heads.
	 * don't take any off the free list unless we know we can
	 * get all we need, otherwise we could deadlock
	 */
	struct buffer_head *bh = NULL;

	while (cnt) {
		struct buffer_head *t;
		md_spin_lock_irq(&conf->device_lock);
		if (conf->freebh_cnt >= cnt)
			while (cnt) {
				t = conf->freebh;
				conf->freebh = t->b_next;
				t->b_next = bh;
				bh = t;
				t->b_state = 0;
				conf->freebh_cnt--;
				cnt--;
			}
		md_spin_unlock_irq(&conf->device_lock);
		if (cnt == 0)
			break;
		t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_KERNEL);
		if (t) {
			memset(t, 0, sizeof(*t));
			t->b_next = bh;
			bh = t;
			cnt--;
		} else {
			PRINTK("waiting for %d bh\n", cnt);
			wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
		}
	}
	return bh;
}
static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
{
	md_spin_lock_irq(&conf->device_lock);
	while (bh) {
		struct buffer_head *t = bh;
		bh = bh->b_next;
		if (t->b_pprev == NULL)
			kfree(t);
		else {
			t->b_next = conf->freebh;
			conf->freebh = t;
			conf->freebh_cnt++;
		}
	}
	md_spin_unlock_irq(&conf->device_lock);
	wake_up(&conf->wait_buffer);
}
static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
{
	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
	int i = 0;

	while (i < cnt) {
		struct buffer_head *bh;
		bh = kmalloc(sizeof(*bh), GFP_KERNEL);
		if (!bh)
			break;
		memset(bh, 0, sizeof(*bh));

		md_spin_lock_irq(&conf->device_lock);
		bh->b_pprev = &conf->freebh;
		bh->b_next = conf->freebh;
		conf->freebh = bh;
		conf->freebh_cnt++;
		md_spin_unlock_irq(&conf->device_lock);

		i++;
	}
	return i;
}
static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
{
	/* discard cnt buffer_heads, if we can find them */
	int i = 0;

	md_spin_lock_irq(&conf->device_lock);
	while ((i < cnt) && conf->freebh) {
		struct buffer_head *bh = conf->freebh;
		conf->freebh = bh->b_next;
		kfree(bh);
		i++;
		conf->freebh_cnt--;
	}
	md_spin_unlock_irq(&conf->device_lock);
	return i;
}
static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
{
	struct raid1_bh *r1_bh = NULL;

	do {
		md_spin_lock_irq(&conf->device_lock);
		if (conf->freer1) {
			r1_bh = conf->freer1;
			conf->freer1 = r1_bh->next_r1;
			r1_bh->next_r1 = NULL;
			r1_bh->state = 0;
			r1_bh->bh_req.b_state = 0;
		}
		md_spin_unlock_irq(&conf->device_lock);
		if (r1_bh)
			return r1_bh;
		r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh),
					GFP_KERNEL);
		if (r1_bh) {
			memset(r1_bh, 0, sizeof(*r1_bh));
			return r1_bh;
		}
		wait_event(conf->wait_buffer, conf->freer1);
	} while (1);
}
static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
{
	struct buffer_head *bh = r1_bh->mirror_bh_list;
	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);

	r1_bh->mirror_bh_list = NULL;

	if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
		md_spin_lock_irq(&conf->device_lock);
		r1_bh->next_r1 = conf->freer1;
		conf->freer1 = r1_bh;
		md_spin_unlock_irq(&conf->device_lock);
	} else {
		kfree(r1_bh);
	}
	raid1_free_bh(conf, bh);
}
static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
{
	int i = 0;

	while (i < cnt) {
		struct raid1_bh *r1_bh;
		r1_bh = (struct raid1_bh *)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
		if (!r1_bh)
			break;
		memset(r1_bh, 0, sizeof(*r1_bh));

		md_spin_lock_irq(&conf->device_lock);
		set_bit(R1BH_PreAlloc, &r1_bh->state);
		r1_bh->next_r1 = conf->freer1;
		conf->freer1 = r1_bh;
		md_spin_unlock_irq(&conf->device_lock);

		i++;
	}
	return i;
}
static void raid1_shrink_r1bh(raid1_conf_t *conf)
{
	md_spin_lock_irq(&conf->device_lock);
	while (conf->freer1) {
		struct raid1_bh *r1_bh = conf->freer1;
		conf->freer1 = r1_bh->next_r1;
		kfree(r1_bh);
	}
	md_spin_unlock_irq(&conf->device_lock);
}
static inline void raid1_free_buf(struct raid1_bh *r1_bh)
{
	struct buffer_head *bh = r1_bh->mirror_bh_list;
	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
	r1_bh->mirror_bh_list = NULL;

	md_spin_lock_irq(&conf->device_lock);
	r1_bh->next_r1 = conf->freebuf;
	conf->freebuf = r1_bh;
	md_spin_unlock_irq(&conf->device_lock);
	raid1_free_bh(conf, bh);
}
static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
{
	struct raid1_bh *r1_bh;

	md_spin_lock_irq(&conf->device_lock);
	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
	r1_bh = conf->freebuf;
	conf->freebuf = r1_bh->next_r1;
	r1_bh->next_r1 = NULL;
	md_spin_unlock_irq(&conf->device_lock);

	return r1_bh;
}
static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
{
	int i = 0;

	md_spin_lock_irq(&conf->device_lock);
	while (i < cnt) {
		struct raid1_bh *r1_bh;
		struct page *page;

		page = alloc_page(GFP_KERNEL);
		if (!page)
			break;

		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
		if (!r1_bh) {
			__free_page(page);
			break;
		}
		memset(r1_bh, 0, sizeof(*r1_bh));
		r1_bh->bh_req.b_page = page;
		r1_bh->bh_req.b_data = (char *) page_address(page);
		r1_bh->next_r1 = conf->freebuf;
		conf->freebuf = r1_bh;
		i++;
	}
	md_spin_unlock_irq(&conf->device_lock);
	return i;
}
static void raid1_shrink_buffers (raid1_conf_t *conf)
{
	md_spin_lock_irq(&conf->device_lock);
	while (conf->freebuf) {
		struct raid1_bh *r1_bh = conf->freebuf;
		conf->freebuf = r1_bh->next_r1;
		__free_page(r1_bh->bh_req.b_page);
		kfree(r1_bh);
	}
	md_spin_unlock_irq(&conf->device_lock);
}
static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	int i, disks = MD_SB_DISKS;

	/*
	 * Later we do read balancing on the read side;
	 * now we use the first available disk.
	 */

	for (i = 0; i < disks; i++) {
		if (conf->mirrors[i].operational) {
			*rdev = conf->mirrors[i].dev;
			return 0;
		}
	}

	printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
	return -1;
}
static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
{
	unsigned long flags;
	mddev_t *mddev = r1_bh->mddev;
	raid1_conf_t *conf = mddev_to_conf(mddev);

	md_spin_lock_irqsave(&retry_list_lock, flags);
	if (raid1_retry_list == NULL)
		raid1_retry_tail = &raid1_retry_list;
	*raid1_retry_tail = r1_bh;
	raid1_retry_tail = &r1_bh->next_r1;
	r1_bh->next_r1 = NULL;
	md_spin_unlock_irqrestore(&retry_list_lock, flags);
	md_wakeup_thread(conf->thread);
}
static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->segment_lock, flags);
	if (sector < conf->start_active)
		conf->cnt_done--;
	else if (sector >= conf->start_future && conf->phase == phase)
		conf->cnt_future--;
	else if (!--conf->cnt_pending)
		wake_up(&conf->wait_ready);

	spin_unlock_irqrestore(&conf->segment_lock, flags);
}
static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->segment_lock, flags);
	if (sector >= conf->start_ready)
		--conf->cnt_ready;
	else if (sector >= conf->start_active) {
		if (!--conf->cnt_active) {
			conf->start_active = conf->start_ready;
			wake_up(&conf->wait_done);
		}
	}
	spin_unlock_irqrestore(&conf->segment_lock, flags);
}
/*
 * raid1_end_bh_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
{
	struct buffer_head *bh = r1_bh->master_bh;

	io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
			test_bit(R1BH_SyncPhase, &r1_bh->state));

	bh->b_end_io(bh, uptodate);
	raid1_free_r1bh(r1_bh);
}
void raid1_end_request (struct buffer_head *bh, int uptodate)
{
	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
	else
		/*
		 * Set R1BH_Uptodate in our master buffer_head, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the complex operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' buffer_head.
		 */
		set_bit (R1BH_Uptodate, &r1_bh->state);

	/*
	 * We split up the read and write side, imho they are
	 * conceptually different.
	 */

	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
		/*
		 * we have only one buffer_head on the read side
		 */
		if (uptodate) {
			raid1_end_bh_io(r1_bh, uptodate);
			return;
		}
		/*
		 * oops, read error:
		 */
		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
			partition_name(bh->b_dev), bh->b_blocknr);
		raid1_reschedule_retry(r1_bh);
		return;
	}

	/*
	 * WRITE:
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */

	if (atomic_dec_and_test(&r1_bh->remaining))
		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
}
/*
 * This routine returns the disk from which the requested read should
 * be done. It keeps track of the last read position for every disk
 * in the array, and when a new read request comes in, the disk whose
 * last position is nearest to the request is chosen.
 *
 * TODO: if there are 2 mirrors on the same 2 devices, performance
 * degrades dramatically because the position is per mirror, not per
 * device. This should be changed to be device based. Also atomic
 * sequential reads should be somehow balanced.
 */
static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
{
	int new_disk = conf->last_used;
	const int sectors = bh->b_size >> 9;
	const long this_sector = bh->b_blocknr * sectors;
	int disk = conf->last_used;
	unsigned long new_distance;
	unsigned long current_distance;

	/*
	 * Check if it is sane at all to balance
	 */

	if (conf->resync_mirrors)
		goto rb_out;

	if (conf->working_disks < 2) {
		int i = 0;

		while( !conf->mirrors[new_disk].operational &&
				(i < MD_SB_DISKS) ) {
			new_disk = conf->mirrors[new_disk].next;
			i++;
		}

		if (i >= MD_SB_DISKS) {
			/*
			 * This means no working disk was found.
			 * Nothing much to do, lets not change anything
			 * and hope for the best...
			 */
			new_disk = conf->last_used;
		}
		goto rb_out;
	}

	/*
	 * Don't touch anything for sequential reads.
	 */

	if (this_sector == conf->mirrors[new_disk].head_position)
		goto rb_out;

	/*
	 * If reads have been done only on a single disk
	 * for a while, lets give another disk a chance.
	 * This is for kicking those idling disks so that
	 * they would find work near some hotspot.
	 */

	if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
		conf->sect_count = 0;

		while( new_disk != conf->mirrors[new_disk].next ) {
			if ((conf->mirrors[new_disk].write_only) ||
				(!conf->mirrors[new_disk].operational) )
				break;

			new_disk = conf->mirrors[new_disk].next;
			break;
		}
		goto rb_out;
	}

	current_distance = abs(this_sector -
				conf->mirrors[disk].head_position);

	/* Find the disk which is closest */

	while( conf->mirrors[disk].next != conf->last_used ) {
		disk = conf->mirrors[disk].next;

		if ((conf->mirrors[disk].write_only) ||
			(!conf->mirrors[disk].operational))
			continue;

		new_distance = abs(this_sector -
					conf->mirrors[disk].head_position);

		if (new_distance < current_distance) {
			conf->sect_count = 0;
			current_distance = new_distance;
			new_disk = disk;
		}
	}

rb_out:
	conf->mirrors[new_disk].head_position = this_sector + sectors;

	conf->last_used = new_disk;
	conf->sect_count += sectors;

	return new_disk;
}
static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw,
			       struct buffer_head * bh)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	struct buffer_head *bh_req, *bhl;
	struct raid1_bh * r1_bh;
	int disks = MD_SB_DISKS;
	int i, sum_bhs = 0, sectors;
	struct mirror_info *mirror;
	DECLARE_WAITQUEUE(wait, current);

	if (!buffer_locked(bh))
		BUG();

/*
 * make_request() can abort the operation when READA is being
 * used and no empty request is available.
 *
 * Currently, just replace the command with READ/WRITE.
 */
	if (rw == READA)
		rw = READ;

	if (rw == WRITE) {
		/*
		 * we first clean the bh, then we start the IO, then
		 * when the IO has finished, we end_io the bh and
		 * mark it uptodate. This way we do not miss the
		 * case when the bh got dirty again during the IO.
		 *
		 * We do an important optimization here - if the
		 * buffer was not dirty and we are during resync or
		 * reconstruction, then we can skip writing it back
		 * to the master disk! (we still have to write it
		 * back to the other disks, because we are not sync
		 * yet.)
		 */
		if (atomic_set_buffer_clean(bh))
			__mark_buffer_clean(bh);
		else {
			bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
			return 0;
		}
	}
	r1_bh = raid1_alloc_r1bh (conf);

	spin_lock_irq(&conf->segment_lock);
	wait_event_lock_irq(conf->wait_done,
			bh->b_rsector < conf->start_active ||
			bh->b_rsector >= conf->start_future,
			conf->segment_lock);
	if (bh->b_rsector < conf->start_active)
		conf->cnt_done++;
	else {
		conf->cnt_future++;
		if (conf->phase)
			set_bit(R1BH_SyncPhase, &r1_bh->state);
	}
	spin_unlock_irq(&conf->segment_lock);

	/*
	 * i think the read and write branch should be separated completely,
	 * since we want to do read balancing on the read side for example.
	 * Alternative implementations? :) --mingo
	 */

	r1_bh->master_bh = bh;
	r1_bh->mddev = mddev;
	r1_bh->cmd = rw;

	sectors = bh->b_size >> 9;
	if (rw == READ) {
		/*
		 * read balancing logic:
		 */
		mirror = conf->mirrors + raid1_read_balance(conf, bh);

		bh_req = &r1_bh->bh_req;
		memcpy(bh_req, bh, sizeof(*bh));
		bh_req->b_blocknr = bh->b_rsector * sectors;
		bh_req->b_dev = mirror->dev;
		bh_req->b_rdev = mirror->dev;
	/*	bh_req->b_rsector = bh->n_rsector; */
		bh_req->b_end_io = raid1_end_request;
		bh_req->b_dev_id = r1_bh;
		q = blk_get_queue(bh_req->b_rdev);
		generic_make_request (q, rw, bh_req);
		return 0;
	}

	/*
	 * WRITE:
	 */

	bhl = raid1_alloc_bh(conf, conf->raid_disks);
	for (i = 0; i < disks; i++) {
		struct buffer_head *mbh;
		if (!conf->mirrors[i].operational)
			continue;

	/*
	 * We should use a private pool (size depending on NR_REQUEST),
	 * to avoid writes filling up the memory with bhs
	 *
	 * Such pools are much faster than kmalloc anyways (so we waste
	 * almost nothing by not using the master bh when writing and
	 * win a lot of cleanness) but for now we are cool enough. --mingo
	 *
	 * It's safe to sleep here, buffer heads cannot be used in a shared
	 * manner in the write branch. Look how we lock the buffer at the
	 * beginning of this function to grok the difference ;)
	 */
		mbh = bhl;
		if (mbh == NULL) {
			MD_BUG();
			break;
		}
		bhl = mbh->b_next;
		mbh->b_this_page = (struct buffer_head *)1;

	/*
	 * prepare mirrored mbh (fields ordered for max mem throughput):
	 */
		mbh->b_blocknr    = bh->b_rsector * sectors;
		mbh->b_dev        = conf->mirrors[i].dev;
		mbh->b_rdev       = conf->mirrors[i].dev;
		mbh->b_rsector    = bh->b_rsector;
		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
					(1<<BH_Mapped) | (1<<BH_Lock);

		atomic_set(&mbh->b_count, 1);
		mbh->b_size       = bh->b_size;
		mbh->b_page       = bh->b_page;
		mbh->b_data       = bh->b_data;
		mbh->b_list       = BUF_LOCKED;
		mbh->b_end_io     = raid1_end_request;
		mbh->b_dev_id     = r1_bh;

		mbh->b_next = r1_bh->mirror_bh_list;
		r1_bh->mirror_bh_list = mbh;
		sum_bhs++;
	}
	if (bhl) raid1_free_bh(conf,bhl);
	md_atomic_set(&r1_bh->remaining, sum_bhs);

	/*
	 * We have to be a bit careful about the semaphore above, that's
	 * why we start the requests separately. Since kmalloc() could
	 * fail, sleep and make_request() can sleep too, this is the
	 * safer solution. Imagine, end_request decreasing the semaphore
	 * before we could have set it up ... We could play tricks with
	 * the semaphore (presetting it and correcting at the end if
	 * sum_bhs is not 'n'), but we have to do end_request by hand if
	 * all requests finish before we had a chance to set up the
	 * semaphore correctly ... lots of races.
	 */
	bh = r1_bh->mirror_bh_list;
	while (bh) {
		struct buffer_head *bh2 = bh;
		bh = bh->b_next;
		q = blk_get_queue(bh2->b_rdev);
		generic_make_request(q, rw, bh2);
	}
	return 0;
}
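/*
 * Illustrative sketch (not part of the driver): why r1_bh->remaining is
 * published with md_atomic_set() before any mirror request is submitted
 * above. If the counter were incremented per submission, a request that
 * completed quickly could see the counter reach zero while its siblings
 * were still unsubmitted, and the raid1_bh would be freed too early.
 * The names below are hypothetical.
 */
#if 0
	/* WRONG: count as we go; an early completion can free everything */
	for (i = 0; i < n; i++) {
		atomic_inc(&pending);
		submit(req[i]);	/* may complete and hit zero right here */
	}

	/* RIGHT: publish the full count first, then submit */
	atomic_set(&pending, n);
	for (i = 0; i < n; i++)
		submit(req[i]);
#endif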
static int raid1_status (char *page, mddev_t *mddev)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	int sz = 0, i;

	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
						conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		sz += sprintf (page+sz, "%s",
			conf->mirrors[i].operational ? "U" : "_");
	sz += sprintf (page+sz, "]");
	return sz;
}
static void unlink_disk (raid1_conf_t *conf, int target)
{
	int disks = MD_SB_DISKS;
	int i;

	for (i = 0; i < disks; i++)
		if (conf->mirrors[i].next == target)
			conf->mirrors[i].next = conf->mirrors[target].next;
}
#define LAST_DISK KERN_ALERT \
"raid1: only one disk left and IO error.\n"

#define NO_SPARE_DISK KERN_ALERT \
"raid1: no spare disk left, degrading mirror level by one.\n"

#define DISK_FAILED KERN_ALERT \
"raid1: Disk failure on %s, disabling device.\n" \
"	Operation continuing on %d devices\n"

#define START_SYNCING KERN_ALERT \
"raid1: start syncing spare disk.\n"

#define ALREADY_SYNCING KERN_INFO \
"raid1: syncing already in progress.\n"
static void mark_disk_bad (mddev_t *mddev, int failed)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	struct mirror_info *mirror = conf->mirrors+failed;
	mdp_super_t *sb = mddev->sb;

	mirror->operational = 0;
	unlink_disk(conf, failed);
	mark_disk_faulty(sb->disks+mirror->number);
	mark_disk_nonsync(sb->disks+mirror->number);
	mark_disk_inactive(sb->disks+mirror->number);
	sb->active_disks--;
	sb->working_disks--;
	sb->failed_disks++;
	mddev->sb_dirty = 1;
	md_wakeup_thread(conf->thread);
	conf->working_disks--;
	printk (DISK_FAILED, partition_name (mirror->dev),
				conf->working_disks);
}
*mddev
, kdev_t dev
)
776 raid1_conf_t
*conf
= mddev_to_conf(mddev
);
777 struct mirror_info
* mirrors
= conf
->mirrors
;
778 int disks
= MD_SB_DISKS
;
781 if (conf
->working_disks
== 1) {
783 * Uh oh, we can do nothing if this is our last disk, but
784 * first check if this is a queued request for a device
785 * which has just failed.
787 for (i
= 0; i
< disks
; i
++) {
788 if (mirrors
[i
].dev
==dev
&& !mirrors
[i
].operational
)
794 * Mark disk as unusable
796 for (i
= 0; i
< disks
; i
++) {
797 if (mirrors
[i
].dev
==dev
&& mirrors
[i
].operational
) {
798 mark_disk_bad(mddev
, i
);
/*
 * Insert the spare disk into the drive-ring
 */
static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
{
	int j;
	int disks = MD_SB_DISKS;
	struct mirror_info *p = conf->mirrors;

	for (j = 0; j < disks; j++, p++)
		if (p->operational && !p->write_only) {
			mirror->next = p->next;
			p->next = mirror->raid_disk;
			return;
		}

	printk("raid1: bug: no read-operational devices\n");
}
static void print_raid1_conf (raid1_conf_t *conf)
{
	int i;
	struct mirror_info *tmp;

	printk("RAID1 conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
			conf->raid_disks, conf->nr_disks);

	for (i = 0; i < MD_SB_DISKS; i++) {
		tmp = conf->mirrors + i;
		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
			i, tmp->spare, tmp->operational,
			tmp->number, tmp->raid_disk, tmp->used_slot,
			partition_name(tmp->dev));
	}
}
static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
	int err = 0;
	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
	raid1_conf_t *conf = mddev->private;
	struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *failed_desc, *spare_desc, *added_desc;

	print_raid1_conf(conf);
	md_spin_lock_irq(&conf->device_lock);
	/*
	 * find the disk ...
	 */
	switch (state) {

	case DISKOP_SPARE_ACTIVE:

		/*
		 * Find the failed disk within the RAID1 configuration ...
		 * (this can only be in the first conf->working_disks part)
		 */
		for (i = 0; i < conf->raid_disks; i++) {
			tmp = conf->mirrors + i;
			if ((!tmp->operational && !tmp->spare) ||
					!tmp->used_slot) {
				failed_disk = i;
				break;
			}
		}
		/*
		 * When we activate a spare disk we _must_ have a disk in
		 * the lower (active) part of the array to replace.
		 */
		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		/* fall through */

	case DISKOP_SPARE_WRITE:
	case DISKOP_SPARE_INACTIVE:

		/*
		 * Find the spare disk ... (can only be in the 'high'
		 * area of the array)
		 */
		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->mirrors + i;
			if (tmp->spare && tmp->number == (*d)->number) {
				spare_disk = i;
				break;
			}
		}
		if (spare_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_REMOVE_DISK:

		for (i = 0; i < MD_SB_DISKS; i++) {
			tmp = conf->mirrors + i;
			if (tmp->used_slot && (tmp->number == (*d)->number)) {
				if (tmp->operational) {
					err = -EBUSY;
					goto abort;
				}
				removed_disk = i;
				break;
			}
		}
		if (removed_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_ADD_DISK:

		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->mirrors + i;
			if (!tmp->used_slot) {
				added_disk = i;
				break;
			}
		}
		if (added_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;
	}

	switch (state) {
	/*
	 * Switch the spare disk to write-only mode:
	 */
	case DISKOP_SPARE_WRITE:
		sdisk = conf->mirrors + spare_disk;
		sdisk->operational = 1;
		sdisk->write_only = 1;
		break;
	/*
	 * Deactivate a spare disk:
	 */
	case DISKOP_SPARE_INACTIVE:
		sdisk = conf->mirrors + spare_disk;
		sdisk->operational = 0;
		sdisk->write_only = 0;
		break;
	/*
	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
	 * with the failed disk. (only the first 'conf->nr_disks'
	 * slots are used for 'real' disks and we must preserve this
	 * property)
	 */
	case DISKOP_SPARE_ACTIVE:

		sdisk = conf->mirrors + spare_disk;
		fdisk = conf->mirrors + failed_disk;

		spare_desc = &sb->disks[sdisk->number];
		failed_desc = &sb->disks[fdisk->number];

		if (spare_desc != *d) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (spare_desc->raid_disk != sdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (sdisk->raid_disk != spare_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (failed_desc->raid_disk != fdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (fdisk->raid_disk != failed_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		/*
		 * do the switch finally
		 */
		xchg_values(*spare_desc, *failed_desc);
		xchg_values(*fdisk, *sdisk);

		/*
		 * (careful, 'failed' and 'spare' are switched from now on)
		 *
		 * we want to preserve linear numbering and we want to
		 * give the proper raid_disk number to the now activated
		 * disk. (this means we switch back these values)
		 */

		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
		xchg_values(spare_desc->number, failed_desc->number);
		xchg_values(sdisk->number, fdisk->number);

		*d = failed_desc;

		if (sdisk->dev == MKDEV(0,0))
			sdisk->used_slot = 0;
		/*
		 * this really activates the spare.
		 */
		fdisk->spare = 0;
		fdisk->write_only = 0;
		link_disk(conf, fdisk);

		/*
		 * if we activate a spare, we definitely replace a
		 * non-operational disk slot in the 'low' area of
		 * the disk array.
		 */

		conf->working_disks++;

		break;

	case DISKOP_HOT_REMOVE_DISK:
		rdisk = conf->mirrors + removed_disk;

		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		rdisk->dev = MKDEV(0,0);
		rdisk->used_slot = 0;
		conf->nr_disks--;
		break;

	case DISKOP_HOT_ADD_DISK:
		adisk = conf->mirrors + added_disk;
		added_desc = *d;

		if (added_disk != added_desc->number) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		adisk->number = added_desc->number;
		adisk->raid_disk = added_desc->raid_disk;
		adisk->dev = MKDEV(added_desc->major,added_desc->minor);

		adisk->operational = 0;
		adisk->write_only = 0;
		adisk->spare = 1;
		adisk->used_slot = 1;
		adisk->head_position = 0;
		conf->nr_disks++;

		break;

	default:
		MD_BUG();
		err = 1;
		goto abort;
	}
abort:
	md_spin_unlock_irq(&conf->device_lock);
	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
		/* should move to "END_REBUILD" when such exists */
		raid1_shrink_buffers(conf);

	print_raid1_conf(conf);
	return err;
}
#define IO_ERROR KERN_ALERT \
"raid1: %s: unrecoverable I/O read error for block %lu\n"

#define REDIRECT_SECTOR KERN_ERR \
"raid1: %s: redirecting sector %lu to another mirror\n"
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
static void end_sync_write(struct buffer_head *bh, int uptodate);
static void end_sync_read(struct buffer_head *bh, int uptodate);
static void raid1d (void *data)
{
	struct raid1_bh *r1_bh;
	struct buffer_head *bh;
	unsigned long flags;
	request_queue_t *q;
	mddev_t *mddev;
	kdev_t dev;

	for (;;) {
		md_spin_lock_irqsave(&retry_list_lock, flags);
		r1_bh = raid1_retry_list;
		if (!r1_bh)
			break;
		raid1_retry_list = r1_bh->next_r1;
		md_spin_unlock_irqrestore(&retry_list_lock, flags);

		mddev = r1_bh->mddev;
		if (mddev->sb_dirty) {
			printk(KERN_INFO "dirty sb detected, updating.\n");
			mddev->sb_dirty = 0;
			md_update_sb(mddev);
		}
		bh = &r1_bh->bh_req;
		switch(r1_bh->cmd) {
		case SPECIAL:
			/* have to allocate lots of bh structures and
			 * schedule writes
			 */
			if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
				int i, sum_bhs = 0;
				int disks = MD_SB_DISKS;
				struct buffer_head *bhl, *mbh;
				raid1_conf_t *conf;
				int sectors = bh->b_size >> 9;

				conf = mddev_to_conf(mddev);
				bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
				for (i = 0; i < disks ; i++) {
					if (!conf->mirrors[i].operational)
						continue;
					if (i == conf->last_used)
						/* we read from here, no need to write */
						continue;
					if (i < conf->raid_disks
					    && !conf->resync_mirrors)
						/* don't need to write this,
						 * we are just rebuilding */
						continue;
					mbh = bhl;
					if (!mbh) {
						MD_BUG();
						break;
					}
					bhl = mbh->b_next;
					mbh->b_this_page = (struct buffer_head *)1;

				/*
				 * prepare mirrored bh (fields ordered for max mem throughput):
				 */
					mbh->b_blocknr    = bh->b_blocknr;
					mbh->b_dev        = conf->mirrors[i].dev;
					mbh->b_rdev       = conf->mirrors[i].dev;
					mbh->b_rsector    = bh->b_blocknr * sectors;
					mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
						(1<<BH_Mapped) | (1<<BH_Lock);
					atomic_set(&mbh->b_count, 1);
					mbh->b_size       = bh->b_size;
					mbh->b_page       = bh->b_page;
					mbh->b_data       = bh->b_data;
					mbh->b_list       = BUF_LOCKED;
					mbh->b_end_io     = end_sync_write;
					mbh->b_dev_id     = r1_bh;

					mbh->b_next = r1_bh->mirror_bh_list;
					r1_bh->mirror_bh_list = mbh;
					sum_bhs++;
				}
				md_atomic_set(&r1_bh->remaining, sum_bhs);
				if (bhl) raid1_free_bh(conf, bhl);
				mbh = r1_bh->mirror_bh_list;
				while (mbh) {
					struct buffer_head *bh1 = mbh;
					mbh = mbh->b_next;
					q = blk_get_queue(bh1->b_rdev);
					generic_make_request(q, WRITE, bh1);
					drive_stat_acct(bh1->b_rdev, WRITE, -bh1->b_size/512, 0);
				}
			} else {
				dev = bh->b_dev;
				raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
				if (bh->b_dev == dev) {
					printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
					md_done_sync(mddev, bh->b_size>>10, 0);
				} else {
					printk (REDIRECT_SECTOR,
						partition_name(bh->b_dev), bh->b_blocknr);
					bh->b_rdev = bh->b_dev;
					q = blk_get_queue(bh->b_rdev);
					generic_make_request (q, READ, bh);
				}
			}
			break;
		case READ:
		case READA:
			dev = bh->b_dev;
			raid1_map (mddev, &bh->b_dev, bh->b_size >> 9);
			if (bh->b_dev == dev) {
				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
				raid1_end_bh_io(r1_bh, 0);
			} else {
				printk (REDIRECT_SECTOR,
					partition_name(bh->b_dev), bh->b_blocknr);
				bh->b_rdev = bh->b_dev;
				q = blk_get_queue(bh->b_rdev);
				generic_make_request (q, r1_bh->cmd, bh);
			}
			break;
		}
	}
	md_spin_unlock_irqrestore(&retry_list_lock, flags);
}
#undef IO_ERROR
#undef REDIRECT_SECTOR
/*
 * Private kernel thread to reconstruct mirrors after an unclean
 * shutdown.
 */
static void raid1syncd (void *data)
{
	raid1_conf_t *conf = data;
	mddev_t *mddev = conf->mddev;

	if (!conf->resync_mirrors)
		return;
	if (conf->resync_mirrors == 2)
		return;
	down(&mddev->recovery_sem);
	if (!md_do_sync(mddev, NULL)) {
		/*
		 * Only if everything went Ok.
		 */
		conf->resync_mirrors = 0;
	}

	/* If reconstruction was interrupted, we need to close the "active"
	 * and "pending" holes.
	 * We know that there are no active rebuild requests,
	 * so cnt_active == cnt_ready == 0.
	 */
	/* this is really needed when recovery stops too... */
	spin_lock_irq(&conf->segment_lock);
	conf->start_active = conf->start_pending;
	conf->start_ready = conf->start_pending;
	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future;
	conf->start_future = mddev->sb->size+1;
	conf->cnt_pending = conf->cnt_future;
	conf->cnt_future = 0;
	conf->phase = conf->phase ^1;
	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
	conf->phase = 0;
	conf->cnt_future = conf->cnt_done;
	conf->cnt_done = 0;
	spin_unlock_irq(&conf->segment_lock);
	wake_up(&conf->wait_done);

	up(&mddev->recovery_sem);
	raid1_shrink_buffers(conf);
}
/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 * This is achieved by conceptually dividing the device space into a
 * number of sections:
 *  DONE: 0 .. a-1      These blocks are in-sync
 *  ACTIVE: a .. b-1    These blocks may have active sync requests, but
 *                      no normal IO requests
 *  READY: b .. c-1     These blocks have no normal IO requests - sync
 *                      request may be happening
 *  PENDING: c .. d-1   These blocks may have IO requests, but no new
 *                      ones will be added
 *  FUTURE: d .. end    These blocks are not to be considered yet. IO may
 *                      be happening, but not sync
 *
 * We keep a
 *  phase    which flips (0 or 1) each time d moves and
 * a count of:
 *  z = active io requests in FUTURE since d moved - marked with
 *      current phase
 *  y = active io requests in FUTURE before d moved, or PENDING -
 *      marked with previous phase
 *  x = active sync requests in READY
 *  w = active sync requests in ACTIVE
 *  v = active io requests in DONE
 *
 * Normally, a=b=c=d=0 and z= active io requests
 *   or a=b=c=d=END and v= active io requests
 * Allowed changes to a,b,c,d:
 * A:  c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
 * B:  y==0 -> c=d
 * C:  b=c, x+=y, y=0
 * D:  w==0 -> a=b
 * E:  a==b==c==d==end -> a=b=c=d=0, z=v, v=0
 *
 * At start of sync we apply A.
 * When y reaches 0, we apply B then A then begin sync requests.
 * When the sync point reaches c-1, we wait for y==0 and w==0, and
 * then apply B then A then D then C.
 * Finally, we apply E.
 *
 * The sync request simply issues a "read" against a working drive.
 * This is marked so that on completion the raid1d thread is woken to
 * issue suitable write requests
 */
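/*
 * Illustrative sketch (not part of the driver): how a sector maps onto
 * the five sections described above, given the four ascending
 * boundaries a <= b <= c <= d. The enum and function names are
 * hypothetical, used only to make the section layout concrete.
 */
#if 0
enum seg { SEG_DONE, SEG_ACTIVE, SEG_READY, SEG_PENDING, SEG_FUTURE };

static enum seg classify_sector(unsigned long sector,
				unsigned long a, unsigned long b,
				unsigned long c, unsigned long d)
{
	if (sector < a)
		return SEG_DONE;	/* 0 .. a-1: already in-sync */
	if (sector < b)
		return SEG_ACTIVE;	/* a .. b-1: sync may be active */
	if (sector < c)
		return SEG_READY;	/* b .. c-1: no normal IO here */
	if (sector < d)
		return SEG_PENDING;	/* c .. d-1: old IO draining */
	return SEG_FUTURE;		/* d .. end: normal IO allowed */
}
#endif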
static int raid1_sync_request (mddev_t *mddev, unsigned long block_nr)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	struct mirror_info *mirror;
	request_queue_t *q;
	struct raid1_bh *r1_bh;
	struct buffer_head *bh;
	int bsize;

	spin_lock_irq(&conf->segment_lock);
	if (!block_nr) {
		/* initialize ... */
		int buffs;

		conf->start_active = 0;
		conf->start_ready = 0;
		conf->start_pending = 0;
		conf->start_future = 0;
		conf->phase = 0;
		/* we want enough buffers to hold twice the window of 128 */
		buffs = 128 * 2 / (PAGE_SIZE>>9);
		buffs = raid1_grow_buffers(conf, buffs);
		if (buffs < 2)
			goto nomem;

		conf->window = buffs*(PAGE_SIZE>>9)/2;
		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
		conf->cnt_done = conf->cnt_pending = 0;
		if (conf->cnt_ready || conf->cnt_active)
			MD_BUG();
	}
	while ((block_nr<<1) >= conf->start_pending) {
		PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
			block_nr<<1, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
		wait_event_lock_irq(conf->wait_done,
					!conf->cnt_active,
					conf->segment_lock);
		wait_event_lock_irq(conf->wait_ready,
					!conf->cnt_pending,
					conf->segment_lock);
		conf->start_active = conf->start_ready;
		conf->start_ready = conf->start_pending;
		conf->start_pending = conf->start_future;
		conf->start_future = conf->start_future+conf->window;
		// Note: falling off the end is not a problem
		conf->phase = conf->phase ^1;
		conf->cnt_active = conf->cnt_ready;
		conf->cnt_ready = 0;
		conf->cnt_pending = conf->cnt_future;
		conf->cnt_future = 0;
		wake_up(&conf->wait_done);
	}
	conf->cnt_ready++;
	spin_unlock_irq(&conf->segment_lock);

	/* If reconstructing, and >1 working disc,
	 * could dedicate one to rebuild and others to
	 * service read requests ..
	 */
	mirror = conf->mirrors+conf->last_used;

	r1_bh = raid1_alloc_buf (conf);
	r1_bh->master_bh = NULL;
	r1_bh->mddev = mddev;
	r1_bh->cmd = SPECIAL;
	bh = &r1_bh->bh_req;

	bh->b_blocknr = block_nr;
	bsize = 1024;
	while (!(bh->b_blocknr & 1) && bsize < PAGE_SIZE
			&& (bh->b_blocknr+2)*(bsize>>10) < mddev->sb->size) {
		bh->b_blocknr >>= 1;
		bsize <<= 1;
	}
	bh->b_size = bsize;
	bh->b_list = BUF_LOCKED;
	bh->b_dev = mirror->dev;
	bh->b_rdev = mirror->dev;
	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped);
	if (bh->b_data != (char *) page_address(bh->b_page))
		BUG();
	bh->b_end_io = end_sync_read;
	bh->b_dev_id = (void *) r1_bh;
	bh->b_rsector = block_nr<<1;
	init_waitqueue_head(&bh->b_wait);

	q = blk_get_queue(bh->b_rdev);
	generic_make_request(q, READ, bh);
	drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0);

	return (bsize >> 10);

nomem:
	raid1_shrink_buffers(conf);
	spin_unlock_irq(&conf->segment_lock);
	return -ENOMEM;
}
static void end_sync_read(struct buffer_head *bh, int uptodate)
{
	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);

	/* we have read a block, now it needs to be re-written,
	 * or re-read if the read failed.
	 * We don't do much here, just schedule handling by raid1d
	 */
	if (!uptodate)
		md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
	else
		set_bit(R1BH_Uptodate, &r1_bh->state);
	raid1_reschedule_retry(r1_bh);
}
static void end_sync_write(struct buffer_head *bh, int uptodate)
{
	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id);

	if (!uptodate)
		md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev);
	if (atomic_dec_and_test(&r1_bh->remaining)) {
		mddev_t *mddev = r1_bh->mddev;
		unsigned long sect = bh->b_blocknr * (bh->b_size>>9);
		int size = bh->b_size;
		raid1_free_buf(r1_bh);
		sync_request_done(sect, mddev_to_conf(mddev));
		md_done_sync(mddev, size>>10, uptodate);
	}
}
/*
 * This will catch the scenario in which one of the mirrors was
 * mounted as a normal device rather than as a part of a raid set.
 *
 * check_consistency is very personality-dependent, eg. RAID5 cannot
 * do this check, it uses another method.
 */
static int __check_consistency (mddev_t *mddev, int row)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);
	int disks = MD_SB_DISKS;
	kdev_t dev;
	struct buffer_head *bh = NULL;
	int i, rc = 0;
	char *buffer = NULL;

	for (i = 0; i < disks; i++) {
		printk("(checking disk %d)\n",i);
		if (!conf->mirrors[i].operational)
			continue;
		printk("(really checking disk %d)\n",i);
		dev = conf->mirrors[i].dev;
		set_blocksize(dev, 4096);
		if ((bh = bread(dev, row / 4, 4096)) == NULL)
			break;
		if (!buffer) {
			buffer = (char *) __get_free_page(GFP_KERNEL);
			if (!buffer)
				break;
			memcpy(buffer, bh->b_data, 4096);
		} else if (memcmp(buffer, bh->b_data, 4096)) {
			rc = 1;
			break;
		}
		bforget(bh);
		fsync_dev(dev);
		invalidate_buffers(dev);
		bh = NULL;
	}
	if (buffer)
		free_page((unsigned long) buffer);
	if (bh) {
		dev = bh->b_dev;
		bforget(bh);
		fsync_dev(dev);
		invalidate_buffers(dev);
	}
	return rc;
}

static int check_consistency (mddev_t *mddev)
{
	if (__check_consistency(mddev, 0))
/*
 * we do not do this currently, as it's perfectly possible to
 * have an inconsistent array when it's freshly created. Only
 * newly written data has to be consistent.
 */
		return 0;

	return 0;
}
#define INVALID_LEVEL KERN_WARNING \
"raid1: md%d: raid level not set to mirroring (%d)\n"

#define NO_SB KERN_ERR \
"raid1: disabled mirror %s (couldn't access raid superblock)\n"

#define ERRORS KERN_ERR \
"raid1: disabled mirror %s (errors detected)\n"

#define NOT_IN_SYNC KERN_ERR \
"raid1: disabled mirror %s (not in sync)\n"

#define INCONSISTENT KERN_ERR \
"raid1: disabled mirror %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING KERN_ERR \
"raid1: disabled mirror %s (mirror %d already operational)\n"

#define OPERATIONAL KERN_INFO \
"raid1: device %s operational as mirror %d\n"

#define MEM_ERROR KERN_ERR \
"raid1: couldn't allocate memory for md%d\n"

#define SPARE KERN_INFO \
"raid1: spare disk %s\n"

#define NONE_OPERATIONAL KERN_ERR \
"raid1: no operational mirrors for md%d\n"

#define RUNNING_CKRAID KERN_ERR \
"raid1: detected mirror differences -- running resync\n"

#define ARRAY_IS_ACTIVE KERN_INFO \
"raid1: raid set md%d active with %d out of %d mirrors\n"

#define THREAD_ERROR KERN_ERR \
"raid1: couldn't allocate thread for md%d\n"

#define START_RESYNC KERN_WARNING \
"raid1: raid set md%d not clean; reconstructing mirrors\n"
static int raid1_run (mddev_t *mddev)
{
	raid1_conf_t *conf;
	int i, j, disk_idx;
	struct mirror_info *disk;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *descriptor;
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;
	int start_recovery = 0;

	MOD_INC_USE_COUNT;

	if (sb->level != 1) {
		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
		goto out;
	}
	/*
	 * copy the already verified devices into our private RAID1
	 * bookkeeping area. [whatever we allocate in raid1_run(),
	 * should be freed in raid1_stop()]
	 */

	conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
	mddev->private = conf;
	if (!conf) {
		printk(MEM_ERROR, mdidx(mddev));
		goto out;
	}
	memset(conf, 0, sizeof(*conf));

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			printk(ERRORS, partition_name(rdev->dev));
		} else {
			if (!rdev->sb) {
				MD_BUG();
				continue;
			}
		}
		if (rdev->desc_nr == -1) {
			MD_BUG();
			continue;
		}
		descriptor = &sb->disks[rdev->desc_nr];
		disk_idx = descriptor->raid_disk;
		disk = conf->mirrors + disk_idx;

		if (disk_faulty(descriptor)) {
			disk->number = descriptor->number;
			disk->raid_disk = disk_idx;
			disk->dev = rdev->dev;
			disk->sect_limit = MAX_WORK_PER_DISK;
			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
			disk->head_position = 0;
			continue;
		}
		if (disk_active(descriptor)) {
			if (!disk_sync(descriptor)) {
				printk(NOT_IN_SYNC,
					partition_name(rdev->dev));
				continue;
			}
			if ((descriptor->number > MD_SB_DISKS) ||
					(disk_idx > sb->raid_disks)) {

				printk(INCONSISTENT,
					partition_name(rdev->dev));
				continue;
			}
			if (disk->operational) {
				printk(ALREADY_RUNNING,
					partition_name(rdev->dev),
					disk_idx);
				continue;
			}
			printk(OPERATIONAL, partition_name(rdev->dev),
					disk_idx);
			disk->number = descriptor->number;
			disk->raid_disk = disk_idx;
			disk->dev = rdev->dev;
			disk->sect_limit = MAX_WORK_PER_DISK;
			disk->operational = 1;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
			disk->head_position = 0;
			conf->working_disks++;
		} else {
		/*
		 * Must be a spare disk ..
		 */
			printk(SPARE, partition_name(rdev->dev));
			disk->number = descriptor->number;
			disk->raid_disk = disk_idx;
			disk->dev = rdev->dev;
			disk->sect_limit = MAX_WORK_PER_DISK;
			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 1;
			disk->used_slot = 1;
			disk->head_position = 0;
		}
	}
	conf->raid_disks = sb->raid_disks;
	conf->nr_disks = sb->nr_disks;
	conf->mddev = mddev;
	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;

	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
	init_waitqueue_head(&conf->wait_buffer);
	init_waitqueue_head(&conf->wait_done);
	init_waitqueue_head(&conf->wait_ready);

	if (!conf->working_disks) {
		printk(NONE_OPERATIONAL, mdidx(mddev));
		goto out_free_conf;
	}

	/* pre-allocate some buffer_head structures.
	 * As a minimum, 1 r1bh and raid_disks buffer_heads
	 * would probably get us by in tight memory situations,
	 * but a few more is probably a good idea.
	 * For now, try 16 r1bh and 16*raid_disks buffer_heads.
	 * This will allow at least 16 concurrent reads or writes
	 * even if kmalloc starts failing
	 */
	if (raid1_grow_r1bh(conf, 16) < 16 ||
	    raid1_grow_bh(conf, 16*conf->raid_disks) < 16*conf->raid_disks) {
		printk(MEM_ERROR, mdidx(mddev));
		goto out_free_conf;
	}

	for (i = 0; i < MD_SB_DISKS; i++) {

		descriptor = sb->disks+i;
		disk_idx = descriptor->raid_disk;
		disk = conf->mirrors + disk_idx;

		if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
				!disk->used_slot) {

			disk->number = descriptor->number;
			disk->raid_disk = disk_idx;
			disk->dev = MKDEV(0,0);

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
			disk->head_position = 0;
		}
	}

	/*
	 * find the first working one and use it as a starting point
	 * to read balancing.
	 */
	for (j = 0; !conf->mirrors[j].operational; j++)
		/* nothing */;
	conf->last_used = j;

	/*
	 * initialize the 'working disks' list.
	 */
	for (i = conf->raid_disks - 1; i >= 0; i--) {
		if (conf->mirrors[i].operational) {
			conf->mirrors[i].next = j;
			j = i;
		}
	}

	if (conf->working_disks != sb->raid_disks) {
		printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
		start_recovery = 1;
	}

	if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
		/*
		 * we do sanity checks even if the device says
		 * it's clean ...
		 */
		if (check_consistency(mddev)) {
			printk(RUNNING_CKRAID);
			sb->state &= ~(1 << MD_SB_CLEAN);
		}
	}

	{
		const char * name = "raid1d";

		conf->thread = md_register_thread(raid1d, conf, name);
		if (!conf->thread) {
			printk(THREAD_ERROR, mdidx(mddev));
			goto out_free_conf;
		}
	}

	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
		const char * name = "raid1syncd";

		conf->resync_thread = md_register_thread(raid1syncd, conf, name);
		if (!conf->resync_thread) {
			printk(THREAD_ERROR, mdidx(mddev));
			goto out_free_conf;
		}

		printk(START_RESYNC, mdidx(mddev));
		conf->resync_mirrors = 1;
		md_wakeup_thread(conf->resync_thread);
	}

	/*
	 * Regenerate the "device is in sync with the raid set" bit for
	 * each device.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		mark_disk_nonsync(sb->disks+i);
		for (j = 0; j < sb->raid_disks; j++) {
			if (!conf->mirrors[j].operational)
				continue;
			if (sb->disks[i].number == conf->mirrors[j].number)
				mark_disk_sync(sb->disks+i);
		}
	}
	sb->active_disks = conf->working_disks;

	if (start_recovery)
		md_recover_arrays();

	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
	/*
	 * Ok, everything is just fine now
	 */
	return 0;

out_free_conf:
	raid1_shrink_r1bh(conf);
	raid1_shrink_bh(conf, conf->freebh_cnt);
	raid1_shrink_buffers(conf);
	kfree(conf);
	mddev->private = NULL;
out:
	MOD_DEC_USE_COUNT;
	return -EIO;
}
#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef SPARE
#undef NONE_OPERATIONAL
#undef RUNNING_CKRAID
#undef ARRAY_IS_ACTIVE
static int raid1_stop_resync (mddev_t *mddev)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);

	if (conf->resync_thread) {
		if (conf->resync_mirrors) {
			conf->resync_mirrors = 2;
			md_interrupt_thread(conf->resync_thread);

			printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
			return 1;
		}
		return 0;
	}
	return 0;
}
static int raid1_restart_resync (mddev_t *mddev)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);

	if (conf->resync_mirrors) {
		if (!conf->resync_thread) {
			MD_BUG();
			return 0;
		}
		conf->resync_mirrors = 1;
		md_wakeup_thread(conf->resync_thread);
		return 1;
	}
	return 0;
}
static int raid1_stop (mddev_t *mddev)
{
	raid1_conf_t *conf = mddev_to_conf(mddev);

	md_unregister_thread(conf->thread);
	if (conf->resync_thread)
		md_unregister_thread(conf->resync_thread);
	raid1_shrink_r1bh(conf);
	raid1_shrink_bh(conf, conf->freebh_cnt);
	raid1_shrink_buffers(conf);
	kfree(conf);
	mddev->private = NULL;
	MOD_DEC_USE_COUNT;
	return 0;
}
static mdk_personality_t raid1_personality =
{
	name:		"raid1",
	make_request:	raid1_make_request,
	run:		raid1_run,
	stop:		raid1_stop,
	status:		raid1_status,
	error_handler:	raid1_error,
	diskop:		raid1_diskop,
	stop_resync:	raid1_stop_resync,
	restart_resync:	raid1_restart_resync,
	sync_request:	raid1_sync_request
};

int raid1_init (void)
{
	return register_md_personality (RAID1, &raid1_personality);
}
#ifdef MODULE
int init_module (void)
{
	return raid1_init();
}

void cleanup_module (void)
{
	unregister_md_personality (RAID1);
}
#endif