/*****************************************************************************
 * raid5.c : Multiple Devices driver for Linux
 *           Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-5 management functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/module.h>
#include <linux/locks.h>
#include <linux/malloc.h>
#include <linux/raid5.h>
#include <asm/bitops.h>
#include <asm/atomic.h>
static struct md_personality raid5_personality;
#define NR_STRIPES		128
#define HASH_PAGES		1
#define HASH_PAGES_ORDER	0
#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK		(NR_HASH - 1)
#define stripe_hash(raid_conf, sect, size)	((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
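
/*
 * Hash indexing sketch (illustrative numbers, not from the original
 * source): a stripe is keyed by its starting sector divided by the
 * stripe size in sectors.  With 4096-byte stripes (8 sectors) and 1024
 * hash buckets, sector 12288 hashes to (12288 / 8) & 1023 == 512;
 * stripes whose keys collide are chained through hash_next/hash_pprev.
 */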
/*
 * The following can be used to debug the driver
 */
#define RAID5_DEBUG	0

#if RAID5_DEBUG
#define PRINTK(x)	do { printk x; } while (0)
#else
#define PRINTK(x)	do { ; } while (0)
#endif
static inline int stripe_locked(struct stripe_head *sh)
{
	return test_bit(STRIPE_LOCKED, &sh->state);
}

static inline int stripe_error(struct stripe_head *sh)
{
	return test_bit(STRIPE_ERROR, &sh->state);
}
/*
 * Stripes are locked whenever new buffers can't be added to them.
 */
static inline void lock_stripe(struct stripe_head *sh)
{
	struct raid5_data *raid_conf = sh->raid_conf;

	if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
		PRINTK(("locking stripe %lu\n", sh->sector));
		raid_conf->nr_locked_stripes++;
	}
}
static inline void unlock_stripe(struct stripe_head *sh)
{
	struct raid5_data *raid_conf = sh->raid_conf;

	if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) {
		PRINTK(("unlocking stripe %lu\n", sh->sector));
		raid_conf->nr_locked_stripes--;
	}
}
static inline void finish_stripe(struct stripe_head *sh)
{
	struct raid5_data *raid_conf = sh->raid_conf;

	sh->cmd = STRIPE_NONE;
	sh->phase = PHASE_COMPLETE;
	raid_conf->nr_pending_stripes--;
	raid_conf->nr_cached_stripes++;
	wake_up(&raid_conf->wait_for_stripe);
}
void __wait_on_stripe(struct stripe_head *sh)
{
	DECLARE_WAITQUEUE(wait, current);

	PRINTK(("wait_on_stripe %lu\n", sh->sector));
	add_wait_queue(&sh->wait, &wait);
repeat:
	current->state = TASK_UNINTERRUPTIBLE;
	if (stripe_locked(sh)) {
		schedule();
		goto repeat;
	}
	PRINTK(("wait_on_stripe %lu done\n", sh->sector));
	remove_wait_queue(&sh->wait, &wait);
	current->state = TASK_RUNNING;
}
static inline void wait_on_stripe(struct stripe_head *sh)
{
	if (stripe_locked(sh))
		__wait_on_stripe(sh);
}
static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
{
	PRINTK(("remove_hash(), stripe %lu\n", sh->sector));

	if (sh->hash_pprev) {
		if (sh->hash_next)
			sh->hash_next->hash_pprev = sh->hash_pprev;
		*sh->hash_pprev = sh->hash_next;
		sh->hash_pprev = NULL;
		raid_conf->nr_hashed_stripes--;
	}
}
static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh)
{
	struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size);

	PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
		sh->sector, raid_conf->nr_hashed_stripes));

	if ((sh->hash_next = *shp) != NULL)
		(*shp)->hash_pprev = &sh->hash_next;
	*shp = sh;
	sh->hash_pprev = shp;
	raid_conf->nr_hashed_stripes++;
}
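
/*
 * Note on the hash chains above: hash_pprev points back at whatever
 * pointer currently references this stripe (the table slot or the
 * previous element's hash_next), so remove_hash() can unlink in O(1)
 * without walking the bucket, and a NULL hash_pprev marks a stripe
 * that is not hashed at all.
 */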
static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
{
	struct buffer_head *bh;
	unsigned long flags;

	save_flags(flags);
	cli();
	if ((bh = sh->buffer_pool) == NULL) {
		restore_flags(flags);
		return NULL;
	}
	sh->buffer_pool = bh->b_next;
	bh->b_size = b_size;
	restore_flags(flags);
	return bh;
}
static struct buffer_head *get_free_bh(struct stripe_head *sh)
{
	struct buffer_head *bh;
	unsigned long flags;

	save_flags(flags);
	cli();
	if ((bh = sh->bh_pool) == NULL) {
		restore_flags(flags);
		return NULL;
	}
	sh->bh_pool = bh->b_next;
	restore_flags(flags);
	return bh;
}
static void put_free_buffer(struct stripe_head *sh, struct buffer_head *bh)
{
	unsigned long flags;

	save_flags(flags);
	cli();
	bh->b_next = sh->buffer_pool;
	sh->buffer_pool = bh;
	restore_flags(flags);
}
static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
{
	unsigned long flags;

	save_flags(flags);
	cli();
	bh->b_next = sh->bh_pool;
	sh->bh_pool = bh;
	restore_flags(flags);
}
static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf)
{
	struct stripe_head *sh;
	unsigned long flags;

	save_flags(flags);
	cli();
	if ((sh = raid_conf->free_sh_list) == NULL) {
		restore_flags(flags);
		return NULL;
	}
	raid_conf->free_sh_list = sh->free_next;
	raid_conf->nr_free_sh--;
	if (!raid_conf->nr_free_sh && raid_conf->free_sh_list)
		printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n");
	restore_flags(flags);
	if (sh->hash_pprev || sh->nr_pending || sh->count)
		printk("get_free_stripe(): bug\n");
	return sh;
}
static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh)
{
	unsigned long flags;

	save_flags(flags);
	cli();
	sh->free_next = raid_conf->free_sh_list;
	raid_conf->free_sh_list = sh;
	raid_conf->nr_free_sh++;
	restore_flags(flags);
}
static void shrink_buffers(struct stripe_head *sh, int num)
{
	struct buffer_head *bh;

	while (num--) {
		if ((bh = get_free_buffer(sh, -1)) == NULL)
			return;
		free_page((unsigned long) bh->b_data);
		kfree(bh);
	}
}
static void shrink_bh(struct stripe_head *sh, int num)
{
	struct buffer_head *bh;

	while (num--) {
		if ((bh = get_free_bh(sh)) == NULL)
			return;
		kfree(bh);
	}
}
static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
{
	struct buffer_head *bh;
	int i;

	for (i = 0; i < num; i++) {
		if ((bh = kmalloc(sizeof(struct buffer_head), priority)) == NULL)
			return 1;
		memset(bh, 0, sizeof (struct buffer_head));
		bh->b_data = (char *) __get_free_page(priority);
		if (!bh->b_data) {
			kfree(bh);
			return 1;
		}
		put_free_buffer(sh, bh);
	}
	return 0;
}
static int grow_bh(struct stripe_head *sh, int num, int priority)
{
	struct buffer_head *bh;
	int i;

	for (i = 0; i < num; i++) {
		if ((bh = kmalloc(sizeof(struct buffer_head), priority)) == NULL)
			return 1;
		memset(bh, 0, sizeof (struct buffer_head));
		put_free_bh(sh, bh);
	}
	return 0;
}
static void raid5_kfree_buffer(struct stripe_head *sh, struct buffer_head *bh)
{
	unsigned long flags;

	save_flags(flags);
	cli();
	put_free_buffer(sh, bh);
	restore_flags(flags);
}
static void raid5_kfree_bh(struct stripe_head *sh, struct buffer_head *bh)
{
	unsigned long flags;

	save_flags(flags);
	cli();
	put_free_bh(sh, bh);
	restore_flags(flags);
}
static void raid5_kfree_old_bh(struct stripe_head *sh, int i)
{
	if (!sh->bh_old[i]) {
		printk("raid5_kfree_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i);
		return;
	}
	raid5_kfree_buffer(sh, sh->bh_old[i]);
	sh->bh_old[i] = NULL;
}
static void raid5_update_old_bh(struct stripe_head *sh, int i)
{
	PRINTK(("stripe %lu, idx %d, updating cache copy\n", sh->sector, i));
	if (!sh->bh_copy[i]) {
		printk("raid5_update_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i);
		return;
	}
	if (sh->bh_old[i])
		raid5_kfree_old_bh(sh, i);
	sh->bh_old[i] = sh->bh_copy[i];
	sh->bh_copy[i] = NULL;
}
static void kfree_stripe(struct stripe_head *sh)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	int disks = raid_conf->raid_disks, j;

	PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector));
	if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) {
		printk("raid5: kfree_stripe(), sector %lu, phase %d, locked %d, count %d\n",
			sh->sector, sh->phase, stripe_locked(sh), sh->count);
		return;
	}
	for (j = 0; j < disks; j++) {
		if (sh->bh_old[j])
			raid5_kfree_old_bh(sh, j);
		if (sh->bh_new[j] || sh->bh_copy[j])
			printk("raid5: bug: sector %lu, new %p, copy %p\n",
				sh->sector, sh->bh_new[j], sh->bh_copy[j]);
	}
	remove_hash(raid_conf, sh);
	put_free_stripe(raid_conf, sh);
}
static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr)
{
	struct stripe_head *sh;
	int i, count = 0;

	PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n",
		nr, raid_conf->nr_hashed_stripes, raid_conf->clock));
	for (i = 0; i < NR_HASH; i++) {
		sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK];
		for (; sh; sh = sh->hash_next) {
			if (sh->phase != PHASE_COMPLETE)
				continue;
			if (stripe_locked(sh))
				continue;
			kfree_stripe(sh);
			if (++count == nr) {
				PRINTK(("shrink completed, nr_hashed_stripes %d\n",
					raid_conf->nr_hashed_stripes));
				raid_conf->clock = (i + raid_conf->clock) & HASH_MASK;
				return count;
			}
		}
	}
	PRINTK(("shrink completed, nr_hashed_stripes %d\n",
		raid_conf->nr_hashed_stripes));
	return count;
}
static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
{
	struct stripe_head *sh;

	if (raid_conf->buffer_size != size) {
		PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size));
		shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
		raid_conf->buffer_size = size;
	}

	PRINTK(("find_stripe, sector %lu\n", sector));
	for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next)
		if (sh->sector == sector && sh->raid_conf == raid_conf) {
			if (sh->size == size) {
				PRINTK(("found stripe %lu\n", sector));
				return sh;
			}
			PRINTK(("switching size for %lu, %d --> %d\n", sector, sh->size, size));
			kfree_stripe(sh);
			break;
		}
	PRINTK(("stripe %lu not in cache\n", sector));
	return NULL;
}
static int grow_stripes(struct raid5_data *raid_conf, int num, int priority)
{
	struct stripe_head *sh;

	while (num--) {
		if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL)
			return 1;
		memset(sh, 0, sizeof(*sh));
		if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) {
			shrink_buffers(sh, 2 * raid_conf->raid_disks);
			kfree(sh);
			return 1;
		}
		if (grow_bh(sh, raid_conf->raid_disks, priority)) {
			shrink_buffers(sh, 2 * raid_conf->raid_disks);
			shrink_bh(sh, raid_conf->raid_disks);
			kfree(sh);
			return 1;
		}
		put_free_stripe(raid_conf, sh);
		raid_conf->nr_stripes++;
	}
	return 0;
}
static void shrink_stripes(struct raid5_data *raid_conf, int num)
{
	struct stripe_head *sh;

	while (num--) {
		sh = get_free_stripe(raid_conf);
		if (!sh)
			break;
		shrink_buffers(sh, raid_conf->raid_disks * 2);
		shrink_bh(sh, raid_conf->raid_disks);
		kfree(sh);
		raid_conf->nr_stripes--;
	}
}
static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
{
	struct stripe_head *sh = NULL, *tmp;
	struct buffer_head *buffer_pool, *bh_pool;

	PRINTK(("kmalloc_stripe called\n"));

	while ((sh = get_free_stripe(raid_conf)) == NULL) {
		shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8);
		if ((sh = get_free_stripe(raid_conf)) != NULL)
			break;
		if (!raid_conf->nr_pending_stripes)
			printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n");
		md_wakeup_thread(raid_conf->thread);
		PRINTK(("waiting for some stripes to complete\n"));
		sleep_on(&raid_conf->wait_for_stripe);
	}

	/*
	 * The above might have slept, so perhaps another process
	 * already created the stripe for us..
	 */
	if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) {
		put_free_stripe(raid_conf, sh);
		return tmp;
	}

	if (sh) {
		buffer_pool = sh->buffer_pool;
		bh_pool = sh->bh_pool;
		memset(sh, 0, sizeof(*sh));
		sh->buffer_pool = buffer_pool;
		sh->bh_pool = bh_pool;
		sh->phase = PHASE_COMPLETE;
		sh->cmd = STRIPE_NONE;
		sh->raid_conf = raid_conf;
		sh->sector = sector;
		sh->size = size;
		raid_conf->nr_cached_stripes++;
		insert_hash(raid_conf, sh);
	} else printk("raid5: bug: kmalloc_stripe() == NULL\n");
	return sh;
}
static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size)
{
	struct stripe_head *sh;

	PRINTK(("get_stripe, sector %lu\n", sector));
	sh = find_stripe(raid_conf, sector, size);
	if (sh)
		return sh;
	sh = kmalloc_stripe(raid_conf, sector, size);
	return sh;
}
static struct buffer_head *raid5_kmalloc_buffer(struct stripe_head *sh, int b_size)
{
	struct buffer_head *bh;

	if ((bh = get_free_buffer(sh, b_size)) == NULL)
		printk("raid5: bug: raid5_kmalloc_buffer() == NULL\n");
	return bh;
}
static struct buffer_head *raid5_kmalloc_bh(struct stripe_head *sh)
{
	struct buffer_head *bh;

	if ((bh = get_free_bh(sh)) == NULL)
		printk("raid5: bug: raid5_kmalloc_bh() == NULL\n");
	return bh;
}
static inline void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate)
{
	struct buffer_head *bh = sh->bh_new[i];

	sh->bh_new[i] = NULL;
	raid5_kfree_bh(sh, sh->bh_req[i]);
	sh->bh_req[i] = NULL;
	bh->b_end_io(bh, uptodate);
	if (!uptodate)
		printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
			"block %lu\n", kdevname(bh->b_dev), bh->b_blocknr);
}
static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
{
	if (uptodate)
		set_bit(BH_Uptodate, &bh->b_state);
	else
		clear_bit(BH_Uptodate, &bh->b_state);
}
static void raid5_end_request (struct buffer_head * bh, int uptodate)
{
	struct stripe_head *sh = bh->b_dev_id;
	struct raid5_data *raid_conf = sh->raid_conf;
	int disks = raid_conf->raid_disks, i;
	unsigned long flags;

	PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending));
	save_flags(flags);
	cli();
	raid5_mark_buffer_uptodate(bh, uptodate);
	--sh->nr_pending;
	if (!sh->nr_pending) {
		md_wakeup_thread(raid_conf->thread);
		atomic_inc(&raid_conf->nr_handle);
	}
	if (!uptodate)
		md_error(bh->b_dev, bh->b_rdev);
	if (raid_conf->failed_disks) {
		for (i = 0; i < disks; i++) {
			if (raid_conf->disks[i].operational)
				continue;
			if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
				continue;
			if (bh->b_rdev != raid_conf->disks[i].dev)
				continue;
			set_bit(STRIPE_ERROR, &sh->state);
		}
	}
	restore_flags(flags);
}
static int raid5_map (struct md_dev *mddev, kdev_t *rdev,
		      unsigned long *rsector, unsigned long size)
{
	/* No complex mapping used: the core of the work is done in the
	 * request routine.
	 */
	return 0;
}
static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	struct md_dev *mddev = raid_conf->mddev;
	int minor = (int) (mddev - md_dev);
	char *b_data;
	kdev_t dev = MKDEV(MD_MAJOR, minor);
	int block = sh->sector / (sh->size >> 9);

	b_data = ((volatile struct buffer_head *) bh)->b_data;
	memset (bh, 0, sizeof (struct buffer_head));
	init_buffer(bh, raid5_end_request, sh);
	bh->b_dev = dev;
	bh->b_blocknr = block;
	((volatile struct buffer_head *) bh)->b_data = b_data;

	bh->b_rdev = raid_conf->disks[i].dev;
	bh->b_rsector = sh->sector;

	bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
	bh->b_size = sh->size;
	bh->b_list = BUF_LOCKED;
}
static int raid5_error (struct md_dev *mddev, kdev_t dev)
{
	struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
	md_superblock_t *sb = mddev->sb;
	struct disk_info *disk;
	int i;

	PRINTK(("raid5_error called\n"));
	raid_conf->resync_parity = 0;
	for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
		if (disk->dev == dev && disk->operational) {
			disk->operational = 0;
			sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE);
			sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE);
			sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE);
			raid_conf->working_disks--;
			raid_conf->failed_disks++;
			md_wakeup_thread(raid_conf->thread);
			printk (KERN_ALERT
				"RAID5: Disk failure on %s, disabling device."
				"Operation continuing on %d devices\n",
				kdevname (dev), raid_conf->working_disks);
		}
	return 0;
}
/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
static inline unsigned long
raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks,
			unsigned int * dd_idx, unsigned int * pd_idx,
			struct raid5_data *raid_conf)
{
	unsigned long stripe;
	int chunk_number, chunk_offset;
	unsigned long new_sector;
	int sectors_per_chunk = raid_conf->chunk_size >> 9;

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_number = r_sector / sectors_per_chunk;
	chunk_offset = r_sector % sectors_per_chunk;

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number / data_disks;

	/*
	 * Compute the data disk and parity disk indexes inside the stripe
	 */
	*dd_idx = chunk_number % data_disks;

	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	if (raid_conf->level == 4)
		*pd_idx = data_disks;
	else switch (raid_conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			*pd_idx = stripe % raid_disks;
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			*pd_idx = stripe % raid_disks;
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		default:
			printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
	}

	/*
	 * Finally, compute the new sector number
	 */
	new_sector = stripe * sectors_per_chunk + chunk_offset;

#if 0
	if ( *dd_idx > data_disks || *pd_idx > data_disks ||
	     chunk_offset + bh->b_size / 512 > sectors_per_chunk)
		printk ("raid5: bug: dd_idx == %d, pd_idx == %d, chunk_offset == %d\n",
			*dd_idx, *pd_idx, chunk_offset);
#endif
	return new_sector;
}
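
/*
 * Worked example (illustrative figures, not from the original source):
 * with raid_disks == 5, data_disks == 4, a 64k chunk (sectors_per_chunk
 * == 128) and ALGORITHM_LEFT_SYMMETRIC, r_sector == 1000 gives
 * chunk_number == 7, chunk_offset == 104 and stripe == 1, so
 * *pd_idx == 4 - (1 % 5) == 3, *dd_idx == (3 + 1 + 3) % 5 == 2 and
 * new_sector == 1 * 128 + 104 == 232 on that member disk.
 */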
static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1;
	unsigned long new_sector = sh->sector, check;
	int sectors_per_chunk = raid_conf->chunk_size >> 9;
	unsigned long stripe = new_sector / sectors_per_chunk;
	int chunk_offset = new_sector % sectors_per_chunk;
	int chunk_number, dummy1, dummy2, dd_idx = i;
	unsigned long r_sector, blocknr;

	switch (raid_conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
		case ALGORITHM_RIGHT_ASYMMETRIC:
			if (i > sh->pd_idx)
				i--;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
		case ALGORITHM_RIGHT_SYMMETRIC:
			if (i < sh->pd_idx)
				i += raid_disks;
			i -= (sh->pd_idx + 1);
			break;
		default:
			printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm);
	}

	chunk_number = stripe * data_disks + i;
	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
	blocknr = r_sector / (sh->size >> 9);

	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf);
	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
		printk("compute_blocknr: map not correct\n");
		return 0;
	}
	return blocknr;
}
#ifdef HAVE_ARCH_XORBLOCK
static void xor_block(struct buffer_head *dest, struct buffer_head *source)
{
	__xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size);
}
#else
static void xor_block(struct buffer_head *dest, struct buffer_head *source)
{
	long lines = dest->b_size / (sizeof (long)) / 8, i;
	long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data;

	for (i = lines; i > 0; i--) {
		*(destp + 0) ^= *(sourcep + 0);
		*(destp + 1) ^= *(sourcep + 1);
		*(destp + 2) ^= *(sourcep + 2);
		*(destp + 3) ^= *(sourcep + 3);
		*(destp + 4) ^= *(sourcep + 4);
		*(destp + 5) ^= *(sourcep + 5);
		*(destp + 6) ^= *(sourcep + 6);
		*(destp + 7) ^= *(sourcep + 7);
		destp += 8;
		sourcep += 8;
	}
}
#endif
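
/*
 * The portable xor_block() above processes eight longwords per loop
 * iteration; the manual unrolling trims loop overhead on the hot
 * parity path.  Architectures defining HAVE_ARCH_XORBLOCK provide an
 * optimized __xor_block() and take the first variant instead.
 */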
static void compute_block(struct stripe_head *sh, int dd_idx)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	int i, disks = raid_conf->raid_disks;

	PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx));

	if (sh->bh_old[dd_idx] == NULL)
		sh->bh_old[dd_idx] = raid5_kmalloc_buffer(sh, sh->size);
	raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);

	memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
	for (i = 0; i < disks; i++) {
		if (i == dd_idx)
			continue;
		if (sh->bh_old[i])
			xor_block(sh->bh_old[dd_idx], sh->bh_old[i]);
		else
			printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
	}
	raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
}
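
/*
 * compute_block() rebuilds one missing block from the rest of the
 * stripe: because parity is the XOR of all data blocks, XOR-ing every
 * other cached block (data and parity alike) into a zeroed buffer
 * yields the absent one.  All other bh_old[] entries must be valid for
 * this to work.
 */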
static void compute_parity(struct stripe_head *sh, int method)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks;

	PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method));
	for (i = 0; i < disks; i++) {
		if (i == pd_idx || !sh->bh_new[i])
			continue;
		if (!sh->bh_copy[i])
			sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size);
		raid5_build_block(sh, sh->bh_copy[i], i);
		mark_buffer_clean(sh->bh_new[i]);
		memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
	}
	if (sh->bh_copy[pd_idx] == NULL)
		sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size);
	raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);

	if (method == RECONSTRUCT_WRITE) {
		memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
		for (i = 0; i < disks; i++) {
			if (i == pd_idx)
				continue;
			if (sh->bh_copy[i]) {
				xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
				continue;
			}
			if (sh->bh_old[i])
				xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
		}
	} else if (method == READ_MODIFY_WRITE) {
		memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
		for (i = 0; i < disks; i++) {
			if (i == pd_idx)
				continue;
			if (sh->bh_new[i] && sh->bh_old[i]) {
				xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]);
				xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]);
			}
		}
	}
	raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
}
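
/*
 * Two ways of refreshing parity for a write:
 *  - RECONSTRUCT_WRITE starts from zero and XORs in every data block
 *    of the stripe (new copies where they exist, cached old blocks
 *    otherwise);
 *  - READ_MODIFY_WRITE starts from the old parity and, for each block
 *    being rewritten, XORs in both its old and new contents, which
 *    cancels the stale data out of the parity.
 * handle_stripe() picks whichever method needs fewer preparatory reads.
 */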
static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	struct buffer_head *bh_req;

	if (sh->bh_new[dd_idx]) {
		printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
		printk("forcing oops.\n");
		*(int *) 0 = 0;
	}

	set_bit(BH_Lock, &bh->b_state);

	bh_req = raid5_kmalloc_bh(sh);
	raid5_build_block(sh, bh_req, dd_idx);
	bh_req->b_data = bh->b_data;

	if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
		sh->phase = PHASE_BEGIN;
		sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
		raid_conf->nr_pending_stripes++;
		atomic_inc(&raid_conf->nr_handle);
	}
	sh->bh_new[dd_idx] = bh;
	sh->bh_req[dd_idx] = bh_req;
	sh->cmd_new[dd_idx] = rw;
}
static void complete_stripe(struct stripe_head *sh)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	int disks = raid_conf->raid_disks;
	int i, new = 0;

	PRINTK(("complete_stripe %lu\n", sh->sector));
	for (i = 0; i < disks; i++) {
		if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx)
			raid5_update_old_bh(sh, i);
		if (!sh->bh_new[i])
			continue;
		if (sh->cmd == STRIPE_WRITE) {
			if (memcmp(sh->bh_new[i]->b_data, sh->bh_copy[i]->b_data, sh->size)) {
				printk("copy differs, %s, sector %lu ",
					test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? "dirty" : "clean",
					sh->sector);
			} else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state))
				printk("sector %lu dirty\n", sh->sector);
		}
		if (sh->cmd == STRIPE_WRITE)
			raid5_update_old_bh(sh, i);
		raid5_end_buffer_io(sh, i, 1);
	}
	if (new && sh->cmd == STRIPE_WRITE)
		printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new);
	if (!new)
		finish_stripe(sh);
	else {
		PRINTK(("stripe %lu, new == %d\n", sh->sector, new));
		sh->phase = PHASE_BEGIN;
	}
}
/*
 * handle_stripe() is our main logic routine. Note that:
 *
 * 1. lock_stripe() should be used whenever we can't accept additional
 *    buffers, either during short sleeping in handle_stripe() or
 *    during io operations.
 *
 * 2. We should be careful to set sh->nr_pending whenever we sleep,
 *    to prevent re-entry of handle_stripe() for the same sh.
 *
 * 3. raid_conf->failed_disks and disk->operational can be changed
 *    from an interrupt. This complicates things a bit, but it allows
 *    us to stop issuing requests for a failed drive as soon as possible.
 */
static void handle_stripe(struct stripe_head *sh)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	struct md_dev *mddev = raid_conf->mddev;
	int minor = (int) (mddev - md_dev);
	struct buffer_head *bh;
	int disks = raid_conf->raid_disks;
	int i, nr = 0, nr_read = 0, nr_write = 0;
	int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0;
	int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
	int reading = 0, nr_writing = 0;
	int method1 = INT_MAX, method2 = INT_MAX;
	int block;
	unsigned long flags;
	int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks;

	PRINTK(("handle_stripe(), stripe %lu\n", sh->sector));
	if (sh->nr_pending) {
		printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector);
		return;
	}
	if (sh->phase == PHASE_COMPLETE) {
		printk("handle_stripe(), stripe %lu, already complete\n", sh->sector);
		return;
	}

	atomic_dec(&raid_conf->nr_handle);

	if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
		printk("raid5: restarting stripe %lu\n", sh->sector);
		sh->phase = PHASE_BEGIN;
	}

	if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) ||
	    (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ)) {
		complete_stripe(sh);
		if (sh->phase == PHASE_COMPLETE)
			return;
	}

	save_flags(flags);
	cli();
	for (i = 0; i < disks; i++) {
		operational[i] = raid_conf->disks[i].operational;
		if (i == sh->pd_idx && raid_conf->resync_parity)
			operational[i] = 0;
	}
	failed_disks = raid_conf->failed_disks;
	restore_flags(flags);

	if (failed_disks > 1) {
		for (i = 0; i < disks; i++) {
			if (sh->bh_new[i])
				raid5_end_buffer_io(sh, i, 0);
		}
		finish_stripe(sh);
		return;
	}

	for (i = 0; i < disks; i++) {
		if (sh->bh_old[i])
			nr_cache++;
		if (i == sh->pd_idx) {
			if (sh->bh_old[i])
				parity = 1;
			else if (!operational[i])
				parity_failed = 1;
			continue;
		}
		if (!sh->bh_new[i]) {
			if (sh->bh_old[i])
				nr_cache_other++;
			else if (!operational[i])
				nr_failed_other++;
			continue;
		}
		if (sh->cmd_new[i] == READ)
			nr_read++;
		if (sh->cmd_new[i] == WRITE)
			nr_write++;
		if (sh->bh_old[i])
			nr_cache_overwrite++;
		else if (!operational[i])
			nr_failed_overwrite++;
	}

	if (nr_write && nr_read)
		printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd);

	if (nr_write) {
		/*
		 * Attempt to add entries :-)
		 */
		if (nr_write != disks - 1) {
			for (i = 0; i < disks; i++) {
				if (i == sh->pd_idx)
					continue;
				if (sh->bh_new[i])
					continue;
				block = (int) compute_blocknr(sh, i);
				bh = find_buffer(MKDEV(MD_MAJOR, minor), block, sh->size);
				if (bh && bh->b_count == 0 && buffer_dirty(bh) && !buffer_locked(bh)) {
					PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block));
					add_stripe_bh(sh, bh, i, WRITE);
					nr_write++;
					if (sh->bh_old[i]) {
						nr_cache_overwrite++;
						nr_cache_other--;
					} else if (!operational[i]) {
						nr_failed_overwrite++;
						nr_failed_other--;
					}
				}
			}
		}

		PRINTK(("handle_stripe() -- begin writing, stripe %lu\n", sh->sector));
		/*
		 * Writing, need to update parity buffer.
		 *
		 * Compute the number of I/O requests in the "reconstruct
		 * write" and "read modify write" methods.
		 */
		if (!nr_failed_other)
			method1 = (disks - 1) - (nr_write + nr_cache_other);
		if (!nr_failed_overwrite && !parity_failed)
			method2 = nr_write - nr_cache_overwrite + (1 - parity);

		if (method1 == INT_MAX && method2 == INT_MAX)
			printk("raid5: bug: method1 == method2 == INT_MAX\n");
		PRINTK(("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2));
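
		/*
		 * Illustrative accounting (example figures, not from the
		 * original source): on a 5-disk set, rewriting a single
		 * block with nothing cached gives method1 == 4 - (1 + 0)
		 * == 3 reads for reconstruct-write versus method2 ==
		 * 1 - 0 + (1 - 0) == 2 reads for read-modify-write, so
		 * the latter wins; a full-stripe write gives method1 == 0
		 * and parity can be computed without reading anything back.
		 */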
		if (!method1 || !method2) {
			sh->phase = PHASE_WRITE;
			compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);

			for (i = 0; i < disks; i++) {
				if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
					continue;
				if (i == sh->pd_idx || sh->bh_new[i])
					nr_writing++;
			}

			sh->nr_pending = nr_writing;
			PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending));

			for (i = 0; i < disks; i++) {
				if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity)
					continue;
				bh = sh->bh_copy[i];
				if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
					printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]);
				if (i == sh->pd_idx && !bh)
					printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i);
				if (!bh)
					continue;
				bh->b_state |= (1<<BH_Dirty);
				PRINTK(("making request for buffer %d\n", i));
				clear_bit(BH_Lock, &bh->b_state);
				if (!operational[i] && !raid_conf->resync_parity) {
					bh->b_rdev = raid_conf->spare->dev;
					make_request(MAJOR(raid_conf->spare->dev), WRITE, bh);
				} else
					make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh);
			}
			return;
		}

		if (method1 < method2) {
			sh->write_method = RECONSTRUCT_WRITE;
			for (i = 0; i < disks; i++) {
				if (i == sh->pd_idx)
					continue;
				if (sh->bh_new[i] || sh->bh_old[i])
					continue;
				sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
				raid5_build_block(sh, sh->bh_old[i], i);
				reading++;
			}
		} else {
			sh->write_method = READ_MODIFY_WRITE;
			for (i = 0; i < disks; i++) {
				if (sh->bh_old[i])
					continue;
				if (!sh->bh_new[i] && i != sh->pd_idx)
					continue;
				sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
				raid5_build_block(sh, sh->bh_old[i], i);
				reading++;
			}
		}

		sh->phase = PHASE_READ_OLD;
		sh->nr_pending = reading;
		PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending));
		for (i = 0; i < disks; i++) {
			if (!sh->bh_old[i])
				continue;
			if (buffer_uptodate(sh->bh_old[i]))
				continue;
			clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
			make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
		}
		return;
	}

	method1 = nr_read - nr_cache_overwrite;

	PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1));
	if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
		PRINTK(("read %lu completed from cache\n", sh->sector));
		for (i = 0; i < disks; i++) {
			if (!sh->bh_new[i])
				continue;
			if (!sh->bh_old[i])
				compute_block(sh, i);
			memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
		}
		complete_stripe(sh);
		return;
	}
	if (nr_failed_overwrite) {
		sh->phase = PHASE_READ_OLD;
		sh->nr_pending = (disks - 1) - nr_cache;
		PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending));
		for (i = 0; i < disks; i++) {
			if (sh->bh_old[i])
				continue;
			if (!operational[i])
				continue;
			sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size);
			raid5_build_block(sh, sh->bh_old[i], i);
			clear_bit(BH_Lock, &sh->bh_old[i]->b_state);
			make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]);
		}
		return;
	}
	sh->phase = PHASE_READ;
	sh->nr_pending = nr_read - nr_cache_overwrite;
	PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending));
	for (i = 0; i < disks; i++) {
		if (!sh->bh_new[i])
			continue;
		if (sh->bh_old[i]) {
			memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
			continue;
		}
		make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]);
	}
}
static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh)
{
	struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
	const unsigned int raid_disks = raid_conf->raid_disks;
	const unsigned int data_disks = raid_disks - 1;
	unsigned int dd_idx, pd_idx;
	unsigned long new_sector;
	struct stripe_head *sh;

	if (rw == READA) rw = READ;
	if (rw == WRITEA) rw = WRITE;

	new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
						&dd_idx, &pd_idx, raid_conf);

	PRINTK(("raid5_make_request, sector %lu\n", new_sector));
repeat:
	sh = get_stripe(raid_conf, new_sector, bh->b_size);
	if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
		PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd));
		lock_stripe(sh);
		if (!sh->nr_pending)
			md_wakeup_thread(raid_conf->thread);
		wait_on_stripe(sh);
		goto repeat;
	}
	sh->pd_idx = pd_idx;
	if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN)
		PRINTK(("stripe %lu catching the bus!\n", sh->sector));
	if (sh->bh_new[dd_idx]) {
		printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector);
		printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]);
		lock_stripe(sh);
		md_wakeup_thread(raid_conf->thread);
		wait_on_stripe(sh);
		goto repeat;
	}
	add_stripe_bh(sh, bh, dd_idx, rw);

	md_wakeup_thread(raid_conf->thread);
	return 0;
}
static void unplug_devices(struct stripe_head *sh)
{
	struct raid5_data *raid_conf = sh->raid_conf;
	int i;

	for (i = 0; i < raid_conf->raid_disks; i++)
		unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev));
}
/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */
static void raid5d (void *data)
{
	struct stripe_head *sh;
	struct raid5_data *raid_conf = data;
	struct md_dev *mddev = raid_conf->mddev;
	int i, handled = 0, unplug = 0;
	unsigned long flags;

	PRINTK(("+++ raid5d active\n"));

	if (mddev->sb_dirty) {
		mddev->sb_dirty = 0;
		md_update_sb((int) (mddev - md_dev));
	}

	for (i = 0; i < NR_HASH; i++) {
		sh = raid_conf->stripe_hashtbl[i];
		for (; sh; sh = sh->hash_next) {
			if (sh->raid_conf != raid_conf)
				continue;
			if (sh->phase == PHASE_COMPLETE)
				continue;
			if (sh->nr_pending)
				continue;
			if (sh->sector == raid_conf->next_sector) {
				raid_conf->sector_count += (sh->size >> 9);
				if (raid_conf->sector_count >= 128)
					unplug = 1;
			} else
				unplug = 1;
			if (unplug) {
				PRINTK(("unplugging devices, sector == %lu, count == %d\n",
					sh->sector, raid_conf->sector_count));
				unplug_devices(sh);
				unplug = 0;
				raid_conf->sector_count = 0;
			}
			raid_conf->next_sector = sh->sector + (sh->size >> 9);
			handled++;
			handle_stripe(sh);
		}
	}

	PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle)));
	save_flags(flags);
	cli();
	if (!atomic_read(&raid_conf->nr_handle))
		clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags);
	restore_flags(flags);

	PRINTK(("--- raid5d inactive\n"));
}
#if SUPPORT_RECONSTRUCTION
/*
 * Private kernel thread for parity reconstruction after an unclean
 * shutdown. Reconstruction on spare drives in case of a failed drive
 * is done by the generic mdsyncd.
 */
static void raid5syncd (void *data)
{
	struct raid5_data *raid_conf = data;
	struct md_dev *mddev = raid_conf->mddev;

	if (!raid_conf->resync_parity)
		return;

	raid_conf->resync_parity = 0;
}
#endif /* SUPPORT_RECONSTRUCTION */
static int __check_consistency (struct md_dev *mddev, int row)
{
	struct raid5_data *raid_conf = mddev->private;
	kdev_t dev;
	struct buffer_head *bh[MD_SB_DISKS], tmp;
	int i, rc = 0, nr = 0;

	if (raid_conf->working_disks != raid_conf->raid_disks)
		return 0;
	tmp.b_size = 4096;
	if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL)
		return 0;
	memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
	for (i = 0; i < raid_conf->raid_disks; i++) {
		dev = raid_conf->disks[i].dev;
		set_blocksize(dev, 4096);
		if ((bh[i] = bread(dev, row / 4, 4096)) == NULL)
			break;
		nr++;
	}
	if (nr == raid_conf->raid_disks) {
		for (i = 1; i < nr; i++)
			xor_block(&tmp, bh[i]);
		if (memcmp(tmp.b_data, bh[0]->b_data, 4096))
			rc = 1;
	}
	for (i = 0; i < raid_conf->raid_disks; i++) {
		dev = raid_conf->disks[i].dev;
		if (bh[i])
			brelse(bh[i]);
		invalidate_buffers(dev);
	}
	free_page((unsigned long) tmp.b_data);
	return rc;
}
static int check_consistency (struct md_dev *mddev)
{
	int size = mddev->sb->size;
	int row;

	for (row = 0; row < size; row += size / 8)
		if (__check_consistency(mddev, row))
			return 1;
	return 0;
}
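
/*
 * check_consistency() spot-checks the array rather than scanning all of
 * it: it samples eight rows spread evenly across the device (row
 * advances by size / 8) and reports an inconsistency as soon as any
 * sampled row fails the XOR test above.
 */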
static int raid5_run (int minor, struct md_dev *mddev)
{
	struct raid5_data *raid_conf;
	int i, j, raid_disk, memory;
	md_superblock_t *sb = mddev->sb;
	md_descriptor_t *descriptor;
	struct real_dev *realdev;

	if (sb->level != 5 && sb->level != 4) {
		printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level);
		return -EIO;
	}

	mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL);
	if ((raid_conf = mddev->private) == NULL)
		goto abort;
	memset (raid_conf, 0, sizeof (*raid_conf));
	raid_conf->mddev = mddev;

	if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
		goto abort;
	memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);

	init_waitqueue_head(&raid_conf->wait_for_stripe);
	PRINTK(("raid5_run(%d) called.\n", minor));

	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = &mddev->devices[i];
		if (!realdev->sb) {
			printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev));
			continue;
		}

		/*
		 * This is important -- we are using the descriptor on
		 * the disk only to get a pointer to the descriptor on
		 * the main superblock, which might be more recent.
		 */
		descriptor = &sb->disks[realdev->sb->descriptor.number];
		if (descriptor->state & (1 << MD_FAULTY_DEVICE)) {
			printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev));
			continue;
		}
		if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) {
			if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) {
				printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev));
				continue;
			}
			raid_disk = descriptor->raid_disk;
			if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) {
				printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev));
				continue;
			}
			if (raid_conf->disks[raid_disk].operational) {
				printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk);
				continue;
			}
			printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk);

			raid_conf->disks[raid_disk].number = descriptor->number;
			raid_conf->disks[raid_disk].raid_disk = raid_disk;
			raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;
			raid_conf->disks[raid_disk].operational = 1;

			raid_conf->working_disks++;
		} else {
			/*
			 * Must be a spare disk ..
			 */
			printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev));
			raid_disk = descriptor->raid_disk;
			raid_conf->disks[raid_disk].number = descriptor->number;
			raid_conf->disks[raid_disk].raid_disk = raid_disk;
			raid_conf->disks[raid_disk].dev = mddev->devices[i].dev;

			raid_conf->disks[raid_disk].operational = 0;
			raid_conf->disks[raid_disk].write_only = 0;
			raid_conf->disks[raid_disk].spare = 1;
		}
	}
	raid_conf->raid_disks = sb->raid_disks;
	raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks;
	raid_conf->mddev = mddev;
	raid_conf->chunk_size = sb->chunk_size;
	raid_conf->level = sb->level;
	raid_conf->algorithm = sb->parity_algorithm;
	raid_conf->max_nr_stripes = NR_STRIPES;

	if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) {
		printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}
	if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) {
		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}
	if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
		printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}
	if (raid_conf->failed_disks > 1) {
		printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks);
		goto abort;
	}

	if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) {
		printk(KERN_ERR "raid5: detected raid-5 xor inconsistency -- run ckraid\n");
		sb->state |= 1 << MD_SB_ERRORS;
		goto abort;
	}

	if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) {
		printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}

#if SUPPORT_RECONSTRUCTION
	if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) {
		printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}
#endif /* SUPPORT_RECONSTRUCTION */

	memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) +
		 raid_conf->raid_disks * (sizeof(struct buffer_head) +
		 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
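	/*
	 * Rough sizing example (illustrative, not from the original
	 * source): with NR_STRIPES == 128, five member disks and 4 kB
	 * pages, each stripe owns 2 * 5 data pages plus its stripe_head
	 * and buffer_head structures, so the figure printed below works
	 * out to roughly 128 * 10 * 4 kB == 5120 kB of page memory plus
	 * a small amount of per-structure overhead.
	 */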
	if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) {
		printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
		shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
		goto abort;
	}
	printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor)));

	/*
	 * Regenerate the "device is in sync with the raid set" bit for
	 * each device.
	 */
	for (i = 0; i < sb->nr_disks; i++) {
		sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE);
		for (j = 0; j < sb->raid_disks; j++) {
			if (!raid_conf->disks[j].operational)
				continue;
			if (sb->disks[i].number == raid_conf->disks[j].number)
				sb->disks[i].state |= 1 << MD_SYNC_DEVICE;
		}
	}
	sb->active_disks = raid_conf->working_disks;

	if (sb->active_disks == sb->raid_disks)
		printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);
	else
		printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm);

	if ((sb->state & (1 << MD_SB_CLEAN)) == 0) {
		printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor)));
		raid_conf->resync_parity = 1;
#if SUPPORT_RECONSTRUCTION
		md_wakeup_thread(raid_conf->resync_thread);
#endif /* SUPPORT_RECONSTRUCTION */
	}

	/* Ok, everything is just fine now */
	return 0;

abort:
	if (raid_conf) {
		if (raid_conf->stripe_hashtbl)
			free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
		kfree(raid_conf);
	}
	mddev->private = NULL;
	printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor)));
	return -EIO;
}
static int raid5_stop (int minor, struct md_dev *mddev)
{
	struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;

	shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes);
	shrink_stripes(raid_conf, raid_conf->max_nr_stripes);
	md_unregister_thread(raid_conf->thread);
#if SUPPORT_RECONSTRUCTION
	md_unregister_thread(raid_conf->resync_thread);
#endif /* SUPPORT_RECONSTRUCTION */
	free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER);
	kfree(raid_conf);
	mddev->private = NULL;
	return 0;
}
static int raid5_status (char *page, int minor, struct md_dev *mddev)
{
	struct raid5_data *raid_conf = (struct raid5_data *) mddev->private;
	md_superblock_t *sb = mddev->sb;
	int sz = 0, i;

	sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm);
	sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks);
	for (i = 0; i < raid_conf->raid_disks; i++)
		sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_");
	sz += sprintf (page+sz, "]");
	return sz;
}
static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state)
{
	int i = 0, failed_disk = -1;
	struct raid5_data *raid_conf = mddev->private;
	struct disk_info *disk = raid_conf->disks;
	unsigned long flags;
	md_superblock_t *sb = mddev->sb;
	md_descriptor_t *descriptor;

	for (i = 0; i < MD_SB_DISKS; i++, disk++) {
		if (disk->spare && disk->number == spare->number)
			break;
	}
	if (i == MD_SB_DISKS)
		return 1;

	for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++)
		if (!disk->operational)
			failed_disk = i;
	if (failed_disk == -1)
		return 1;

	save_flags(flags);
	cli();
	switch (state) {
	case SPARE_WRITE:
		disk->operational = 1;
		disk->write_only = 1;
		raid_conf->spare = disk;
		break;
	case SPARE_INACTIVE:
		disk->operational = 0;
		disk->write_only = 0;
		raid_conf->spare = NULL;
		break;
	case SPARE_ACTIVE:
		disk->write_only = 0;

		descriptor = &sb->disks[raid_conf->disks[failed_disk].number];
		i = spare->raid_disk;
		disk->raid_disk = spare->raid_disk = descriptor->raid_disk;
		if (disk->raid_disk != failed_disk)
			printk("raid5: disk->raid_disk != failed_disk");
		descriptor->raid_disk = i;

		raid_conf->spare = NULL;
		raid_conf->working_disks++;
		raid_conf->failed_disks--;
		raid_conf->disks[failed_disk] = *disk;
		break;
	default:
		printk("raid5_mark_spare: bug: state == %d\n", state);
		restore_flags(flags);
		return 1;
	}
	restore_flags(flags);
	return 0;
}
static struct md_personality raid5_personality =
{
	"raid5",
	raid5_map,
	raid5_make_request,
	raid5_end_request,
	raid5_run,
	raid5_stop,
	raid5_status,
	NULL,			/* no ioctls */
	0,
	raid5_error,
	/* raid5_hot_add_disk, */ NULL,
	/* raid1_hot_remove_drive */ NULL,
	raid5_mark_spare
};
int raid5_init (void)
{
	return register_md_personality (RAID5, &raid5_personality);
}

#ifdef MODULE
int init_module (void)
{
	return raid5_init();
}

void cleanup_module (void)
{
	unregister_md_personality (RAID5);
}
#endif