/*
 * raid5.c : Multiple Devices driver for Linux
 *	     Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	     Copyright (C) 1999, 2000 Ingo Molnar
 *
 * RAID-5 management functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/locks.h>
#include <linux/malloc.h>
#include <linux/raid/raid5.h>
#include <asm/bitops.h>
#include <asm/atomic.h>

static mdk_personality_t raid5_personality;
/*
 * Stripe cache
 */
#define NR_STRIPES		128
#define HASH_PAGES		1
#define HASH_PAGES_ORDER	0
#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK		(NR_HASH - 1)
#define stripe_hash(conf, sect, size)	((conf)->stripe_hashtbl[((sect) / ((size) >> 9)) & HASH_MASK])
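
/*
 * Note on stripe_hash() (added for illustration): the table is keyed by
 * the stripe's first block number, i.e. sector / (size>>9).  NR_HASH is a
 * power of two, so "& HASH_MASK" reduces that block number to a slot.
 * Example, assuming 4 KB pages and 4-byte pointers (NR_HASH == 1024,
 * HASH_MASK == 1023) with 4 KB stripe units (size>>9 == 8): sector 16384
 * hashes to (16384 / 8) & 1023 == 2048 & 1023 == slot 0.
 */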
/*
 * The following can be used to debug the driver
 */
#define RAID5_DEBUG	0
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && CONFIG_SMP
# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
# define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG()
#else
# define CHECK_DEVLOCK()
# define CHECK_SHLOCK(unused)
#endif

#if RAID5_DEBUG
#define PRINTK(x...) printk(x)
#else
#define PRINTK(x...) do { } while (0)
#endif
static void print_raid5_conf (raid5_conf_t *conf);

static inline int stripe_locked(struct stripe_head *sh)
{
	return test_bit(STRIPE_LOCKED, &sh->state);
}

static void __unlock_stripe(struct stripe_head *sh)
{
	if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state))
		BUG();
	PRINTK("unlocking stripe %lu\n", sh->sector);
	wake_up(&sh->wait);
}
static void finish_unlock_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	sh->cmd = STRIPE_NONE;
	sh->phase = PHASE_COMPLETE;
	atomic_dec(&conf->nr_pending_stripes);
	atomic_inc(&conf->nr_cached_stripes);
	__unlock_stripe(sh);
	atomic_dec(&sh->count);
	wake_up(&conf->wait_for_stripe);
}
static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
	PRINTK("remove_hash(), stripe %lu\n", sh->sector);

	CHECK_DEVLOCK();
	CHECK_SHLOCK(sh);
	if (sh->hash_pprev) {
		if (sh->hash_next)
			sh->hash_next->hash_pprev = sh->hash_pprev;
		*sh->hash_pprev = sh->hash_next;
		sh->hash_pprev = NULL;
		atomic_dec(&conf->nr_hashed_stripes);
	}
}

static void lock_get_bh (struct buffer_head *bh)
{
	while (md_test_and_set_bit(BH_Lock, &bh->b_state))
		__wait_on_buffer(bh);
	atomic_inc(&bh->b_count);
}
static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
	struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);

	PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
			sh->sector, atomic_read(&conf->nr_hashed_stripes));

	CHECK_DEVLOCK();
	CHECK_SHLOCK(sh);
	if ((sh->hash_next = *shp) != NULL)
		(*shp)->hash_pprev = &sh->hash_next;
	*shp = sh;
	sh->hash_pprev = shp;
	atomic_inc(&conf->nr_hashed_stripes);
}
static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
{
	struct buffer_head *bh;
	unsigned long flags;

	CHECK_SHLOCK(sh);
	md_spin_lock_irqsave(&sh->stripe_lock, flags);
	bh = sh->buffer_pool;
	if (!bh)
		goto out_unlock;
	sh->buffer_pool = bh->b_next;
	bh->b_size = b_size;
	if (atomic_read(&bh->b_count))
		BUG();
out_unlock:
	md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
	return bh;
}

static struct buffer_head *get_free_bh(struct stripe_head *sh)
{
	struct buffer_head *bh;
	unsigned long flags;

	CHECK_SHLOCK(sh);
	md_spin_lock_irqsave(&sh->stripe_lock, flags);
	bh = sh->bh_pool;
	if (!bh)
		goto out_unlock;
	sh->bh_pool = bh->b_next;
	if (atomic_read(&bh->b_count))
		BUG();
out_unlock:
	md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
	return bh;
}

static void put_free_buffer(struct stripe_head *sh, struct buffer_head *bh)
{
	unsigned long flags;

	CHECK_SHLOCK(sh);
	if (atomic_read(&bh->b_count))
		BUG();
	md_spin_lock_irqsave(&sh->stripe_lock, flags);
	bh->b_next = sh->buffer_pool;
	sh->buffer_pool = bh;
	md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
}

static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
{
	unsigned long flags;

	CHECK_SHLOCK(sh);
	if (atomic_read(&bh->b_count))
		BUG();
	md_spin_lock_irqsave(&sh->stripe_lock, flags);
	bh->b_next = sh->bh_pool;
	sh->bh_pool = bh;
	md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
}
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh;

	md_spin_lock_irq(&conf->device_lock);
	sh = conf->free_sh_list;
	if (!sh)
		goto out;
	conf->free_sh_list = sh->free_next;
	atomic_dec(&conf->nr_free_sh);
	if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list)
		BUG();
	if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) ||
				atomic_read(&sh->count))
		BUG();
out:
	md_spin_unlock_irq(&conf->device_lock);
	return sh;
}

static void __put_free_stripe (raid5_conf_t *conf, struct stripe_head *sh)
{
	CHECK_DEVLOCK();
	if (atomic_read(&sh->count) != 0)
		BUG();
	clear_bit(STRIPE_LOCKED, &sh->state);
	sh->free_next = conf->free_sh_list;
	conf->free_sh_list = sh;
	atomic_inc(&conf->nr_free_sh);
}
static void shrink_buffers(struct stripe_head *sh, int num)
{
	struct buffer_head *bh;

	while (num--) {
		bh = get_free_buffer(sh, -1);
		if (!bh)
			return;
		free_page((unsigned long) bh->b_data);
		kfree(bh);
	}
}

static void shrink_bh(struct stripe_head *sh, int num)
{
	struct buffer_head *bh;

	while (num--) {
		bh = get_free_bh(sh);
		if (!bh)
			return;
		kfree(bh);
	}
}

static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority)
{
	struct buffer_head *bh;
	struct page *page;

	while (num--) {
		bh = kmalloc(sizeof(struct buffer_head), priority);
		if (!bh)
			return 1;
		memset(bh, 0, sizeof (struct buffer_head));
		init_waitqueue_head(&bh->b_wait);
		page = alloc_page(priority);
		bh->b_data = (char *) page_address(page);
		if (!bh->b_data) {
			kfree(bh);
			return 1;
		}
		bh->b_size = b_size;
		atomic_set(&bh->b_count, 0);
		set_bh_page(bh, page, 0);
		put_free_buffer(sh, bh);
	}
	return 0;
}

static int grow_bh(struct stripe_head *sh, int num, int priority)
{
	struct buffer_head *bh;

	while (num--) {
		bh = kmalloc(sizeof(struct buffer_head), priority);
		if (!bh)
			return 1;
		memset(bh, 0, sizeof (struct buffer_head));
		init_waitqueue_head(&bh->b_wait);
		put_free_bh(sh, bh);
	}
	return 0;
}

static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh)
{
	put_free_buffer(sh, bh);
}
static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh)
{
	put_free_bh(sh, bh);
}

static void raid5_free_old_bh(struct stripe_head *sh, int i)
{
	CHECK_SHLOCK(sh);
	if (!sh->bh_old[i])
		BUG();
	raid5_free_buffer(sh, sh->bh_old[i]);
	sh->bh_old[i] = NULL;
}

static void raid5_update_old_bh(struct stripe_head *sh, int i)
{
	CHECK_SHLOCK(sh);
	PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i);
	if (!sh->bh_copy[i])
		BUG();
	if (sh->bh_old[i])
		raid5_free_old_bh(sh, i);
	sh->bh_old[i] = sh->bh_copy[i];
	sh->bh_copy[i] = NULL;
}
static void free_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks, j;

	CHECK_DEVLOCK();
	CHECK_SHLOCK(sh);
	if (atomic_read(&sh->count) != 0)
		BUG();
	PRINTK("free_stripe called, stripe %lu\n", sh->sector);
	if (sh->phase != PHASE_COMPLETE || atomic_read(&sh->count)) {
		PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count));
		return;
	}
	for (j = 0; j < disks; j++) {
		if (sh->bh_old[j])
			raid5_free_old_bh(sh, j);
		if (sh->bh_new[j] || sh->bh_copy[j])
			BUG();
	}
	remove_hash(conf, sh);
	__put_free_stripe(conf, sh);
}
static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
{
	struct stripe_head *sh;
	int i, count = 0;

	PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock);
	md_spin_lock_irq(&conf->device_lock);
	for (i = 0; i < NR_HASH; i++) {
		sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
		for (; sh; sh = sh->hash_next) {
			if (sh->phase != PHASE_COMPLETE)
				continue;
			if (atomic_read(&sh->count))
				continue;
			/*
			 * Try to lock this stripe:
			 */
			if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
				continue;
			free_stripe(sh);
			if (++count == nr) {
				conf->clock = (i + conf->clock) & HASH_MASK;
				goto out;
			}
		}
	}
out:
	md_spin_unlock_irq(&conf->device_lock);
	PRINTK("shrink completed, nr_hashed_stripes %d, nr_pending_stripes %d\n",
			atomic_read(&conf->nr_hashed_stripes),
			atomic_read(&conf->nr_pending_stripes));
	return count;
}
void __wait_lock_stripe(struct stripe_head *sh)
{
	MD_DECLARE_WAITQUEUE(wait, current);

	PRINTK("wait_lock_stripe %lu\n", sh->sector);
	if (!atomic_read(&sh->count))
		BUG();
	add_wait_queue(&sh->wait, &wait);
repeat:
	set_current_state(TASK_UNINTERRUPTIBLE);
	if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
		schedule();
		goto repeat;
	}
	PRINTK("wait_lock_stripe %lu done\n", sh->sector);
	remove_wait_queue(&sh->wait, &wait);
	current->state = TASK_RUNNING;
}

static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
{
	struct stripe_head *sh;

	PRINTK("__find_stripe, sector %lu\n", sector);
	for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) {
		if (sh->sector == sector && sh->raid_conf == conf) {
			if (sh->size != size)
				BUG();
			return sh;
		}
	}
	PRINTK("__stripe %lu not in cache\n", sector);
	return NULL;
}
static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
{
	struct stripe_head *sh;
	struct buffer_head *buffer_pool, *bh_pool;
	MD_DECLARE_WAITQUEUE(wait, current);

	PRINTK("alloc_stripe called\n");

	while ((sh = get_free_stripe(conf)) == NULL) {
		int cnt;
		add_wait_queue(&conf->wait_for_stripe, &wait);
		set_current_state(TASK_UNINTERRUPTIBLE);
		cnt = shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
		sh = get_free_stripe(conf);
		if (!sh && cnt < (conf->max_nr_stripes/8)) {
			md_wakeup_thread(conf->thread);
			PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8);
			schedule();
		}
		remove_wait_queue(&conf->wait_for_stripe, &wait);
		current->state = TASK_RUNNING;
		if (sh)
			break;
	}

	buffer_pool = sh->buffer_pool;
	bh_pool = sh->bh_pool;
	memset(sh, 0, sizeof(*sh));
	sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
	md_init_waitqueue_head(&sh->wait);
	sh->buffer_pool = buffer_pool;
	sh->bh_pool = bh_pool;
	sh->phase = PHASE_COMPLETE;
	sh->cmd = STRIPE_NONE;
	sh->raid_conf = conf;
	sh->sector = sector;
	sh->size = size;
	atomic_inc(&conf->nr_cached_stripes);

	return sh;
}
static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size)
{
	struct stripe_head *sh, *new = NULL;

	PRINTK("get_stripe, sector %lu\n", sector);

	/*
	 * Do this in set_blocksize()!
	 */
	if (conf->buffer_size != size) {
		PRINTK("switching size, %d --> %d\n", conf->buffer_size, size);
		shrink_stripe_cache(conf, conf->max_nr_stripes);
		conf->buffer_size = size;
	}

repeat:
	md_spin_lock_irq(&conf->device_lock);
	sh = __find_stripe(conf, sector, size);
	if (!sh) {
		if (!new) {
			md_spin_unlock_irq(&conf->device_lock);
			new = alloc_stripe(conf, sector, size);
			goto repeat;
		}
		sh = new;
		new = NULL;
		if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
			BUG();
		insert_hash(conf, sh);
		atomic_inc(&sh->count);
		md_spin_unlock_irq(&conf->device_lock);
	} else {
		atomic_inc(&sh->count);
		if (new) {
			if (md_test_and_set_bit(STRIPE_LOCKED, &new->state))
				BUG();
			__put_free_stripe(conf, new);
		}
		md_spin_unlock_irq(&conf->device_lock);
		PRINTK("get_stripe, waiting, sector %lu\n", sector);
		if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
			__wait_lock_stripe(sh);
	}
	return sh;
}
static int grow_stripes(raid5_conf_t *conf, int num, int priority)
{
	struct stripe_head *sh;

	while (num--) {
		sh = kmalloc(sizeof(struct stripe_head), priority);
		if (!sh)
			return 1;
		memset(sh, 0, sizeof(*sh));
		sh->raid_conf = conf;
		sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
		md_init_waitqueue_head(&sh->wait);

		if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
			BUG();
		if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
			shrink_buffers(sh, 2 * conf->raid_disks);
			kfree(sh);
			return 1;
		}
		if (grow_bh(sh, conf->raid_disks, priority)) {
			shrink_buffers(sh, 2 * conf->raid_disks);
			shrink_bh(sh, conf->raid_disks);
			kfree(sh);
			return 1;
		}
		md_spin_lock_irq(&conf->device_lock);
		__put_free_stripe(conf, sh);
		atomic_inc(&conf->nr_stripes);
		md_spin_unlock_irq(&conf->device_lock);
	}
	return 0;
}

static void shrink_stripes(raid5_conf_t *conf, int num)
{
	struct stripe_head *sh;

	while (num--) {
		sh = get_free_stripe(conf);
		if (!sh)
			break;
		if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
			BUG();
		shrink_buffers(sh, conf->raid_disks * 2);
		shrink_bh(sh, conf->raid_disks);
		kfree(sh);
		atomic_dec(&conf->nr_stripes);
	}
}
static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size)
{
	struct buffer_head *bh;

	bh = get_free_buffer(sh, b_size);
	if (!bh)
		BUG();
	return bh;
}

static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh)
{
	struct buffer_head *bh;

	bh = get_free_bh(sh);
	if (!bh)
		BUG();
	return bh;
}

static void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate)
{
	struct buffer_head *bh = sh->bh_new[i];

	PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_blocknr, uptodate);
	sh->bh_new[i] = NULL;
	raid5_free_bh(sh, sh->bh_req[i]);
	sh->bh_req[i] = NULL;
	PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io);
	bh->b_end_io(bh, uptodate);
	if (!uptodate)
		printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
			"block %lu\n",
			partition_name(mddev_to_kdev(sh->raid_conf->mddev)),
			bh->b_blocknr);
}
static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
{
	if (uptodate)
		set_bit(BH_Uptodate, &bh->b_state);
	else
		clear_bit(BH_Uptodate, &bh->b_state);
}

static void raid5_end_request (struct buffer_head * bh, int uptodate)
{
	struct stripe_head *sh = bh->b_dev_id;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks, i;
	unsigned long flags;

	PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3));
	md_spin_lock_irqsave(&sh->stripe_lock, flags);
	raid5_mark_buffer_uptodate(bh, uptodate);
	if (!uptodate)
		md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
	if (conf->failed_disks) {
		for (i = 0; i < disks; i++) {
			if (conf->disks[i].operational)
				continue;
			if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
				continue;
			if (bh->b_dev != conf->disks[i].dev)
				continue;
			set_bit(STRIPE_ERROR, &sh->state);
		}
	}
	md_spin_unlock_irqrestore(&sh->stripe_lock, flags);

	if (atomic_dec_and_test(&sh->nr_pending)) {
		atomic_inc(&conf->nr_handle);
		md_wakeup_thread(conf->thread);
	}
}
static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
{
	raid5_conf_t *conf = sh->raid_conf;
	char *b_data;
	struct page *b_page;
	int block = sh->sector / (sh->size >> 9);

	b_data = bh->b_data;
	b_page = bh->b_page;
	memset (bh, 0, sizeof (struct buffer_head));
	init_waitqueue_head(&bh->b_wait);
	init_buffer(bh, raid5_end_request, sh);
	bh->b_dev = conf->disks[i].dev;
	bh->b_blocknr = block;

	bh->b_data = b_data;
	bh->b_page = b_page;

	bh->b_rdev = conf->disks[i].dev;
	bh->b_rsector = sh->sector;

	bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
	bh->b_size = sh->size;
	bh->b_list = BUF_LOCKED;
}
static int raid5_error (mddev_t *mddev, kdev_t dev)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	mdp_super_t *sb = mddev->sb;
	struct disk_info *disk;
	int i;

	PRINTK("raid5_error called\n");
	conf->resync_parity = 0;
	for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
		if (disk->dev == dev && disk->operational) {
			disk->operational = 0;
			mark_disk_faulty(sb->disks+disk->number);
			mark_disk_nonsync(sb->disks+disk->number);
			mark_disk_inactive(sb->disks+disk->number);
			sb->active_disks--;
			sb->working_disks--;
			sb->failed_disks++;
			mddev->sb_dirty = 1;
			conf->working_disks--;
			conf->failed_disks++;
			md_wakeup_thread(conf->thread);
			printk (KERN_ALERT
				"raid5: Disk failure on %s, disabling device."
				" Operation continuing on %d devices\n",
				partition_name (dev), conf->working_disks);
			return 0;
		}
	}
	/*
	 * handle errors in spares (during reconstruction)
	 */
	if (conf->spare) {
		disk = conf->spare;
		if (disk->dev == dev) {
			printk (KERN_ALERT
				"raid5: Disk failure on spare %s\n",
				partition_name (dev));
			if (!conf->spare->operational) {
				MD_BUG();
				return -EIO;
			}
			disk->operational = 0;
			disk->write_only = 0;
			conf->spare = NULL;
			mark_disk_faulty(sb->disks+disk->number);
			mark_disk_nonsync(sb->disks+disk->number);
			mark_disk_inactive(sb->disks+disk->number);
			sb->spare_disks--;
			sb->working_disks--;
			sb->failed_disks++;
			mddev->sb_dirty = 1;
			md_wakeup_thread(conf->thread);
			return 0;
		}
	}
	MD_BUG();
	return -EIO;
}
/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
static unsigned long raid5_compute_sector(int r_sector, unsigned int raid_disks,
			unsigned int data_disks, unsigned int * dd_idx,
			unsigned int * pd_idx, raid5_conf_t *conf)
{
	unsigned long stripe;
	int chunk_number, chunk_offset;
	unsigned long new_sector;
	int sectors_per_chunk = conf->chunk_size >> 9;

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_number = r_sector / sectors_per_chunk;
	chunk_offset = r_sector % sectors_per_chunk;

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number / data_disks;

	/*
	 * Compute the data disk and parity disk indexes inside the stripe
	 */
	*dd_idx = chunk_number % data_disks;

	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	if (conf->level == 4)
		*pd_idx = data_disks;
	else switch (conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			*pd_idx = stripe % raid_disks;
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			*pd_idx = stripe % raid_disks;
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		default:
			printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
	}

	/*
	 * Finally, compute the new sector number
	 */
	new_sector = stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}
static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
	unsigned long new_sector = sh->sector, check;
	int sectors_per_chunk = conf->chunk_size >> 9;
	unsigned long stripe = new_sector / sectors_per_chunk;
	int chunk_offset = new_sector % sectors_per_chunk;
	int chunk_number, dummy1, dummy2, dd_idx = i;
	unsigned long r_sector, blocknr;

	switch (conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
		case ALGORITHM_RIGHT_ASYMMETRIC:
			if (i > sh->pd_idx)
				i--;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
		case ALGORITHM_RIGHT_SYMMETRIC:
			if (i < sh->pd_idx)
				i += raid_disks;
			i -= (sh->pd_idx + 1);
			break;
		default:
			printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
	}

	chunk_number = stripe * data_disks + i;
	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
	blocknr = r_sector / (sh->size >> 9);

	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
		printk("compute_blocknr: map not correct\n");
		return 0;
	}
	return blocknr;
}
static void compute_block(struct stripe_head *sh, int dd_idx)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, count, disks = conf->raid_disks;
	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];

	PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);

	if (sh->bh_old[dd_idx] == NULL)
		sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size);
	raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);

	memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
	bh_ptr[0] = sh->bh_old[dd_idx];
	count = 1;
	for (i = 0; i < disks; i++) {
		if (i == dd_idx)
			continue;
		if (sh->bh_old[i])
			bh_ptr[count++] = sh->bh_old[i];
		else
			printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
		if (count == MAX_XOR_BLOCKS) {
			xor_block(count, &bh_ptr[0]);
			count = 1;
		}
	}
	if (count != 1)
		xor_block(count, &bh_ptr[0]);
	raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
}
static void compute_parity(struct stripe_head *sh, int method)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];

	PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
	for (i = 0; i < disks; i++) {
		if (i == pd_idx || !sh->bh_new[i])
			continue;
		if (!sh->bh_copy[i])
			sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size);
		raid5_build_block(sh, sh->bh_copy[i], i);
		if (atomic_set_buffer_clean(sh->bh_new[i]))
			atomic_set_buffer_dirty(sh->bh_copy[i]);
		memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
	}
	if (sh->bh_copy[pd_idx] == NULL) {
		sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size);
		atomic_set_buffer_dirty(sh->bh_copy[pd_idx]);
	}
	raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);

	if (method == RECONSTRUCT_WRITE) {
		memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
		bh_ptr[0] = sh->bh_copy[pd_idx];
		count = 1;
		for (i = 0; i < disks; i++) {
			if (i == sh->pd_idx)
				continue;
			if (sh->bh_new[i]) {
				bh_ptr[count++] = sh->bh_copy[i];
			} else if (sh->bh_old[i]) {
				bh_ptr[count++] = sh->bh_old[i];
			}
			if (count == MAX_XOR_BLOCKS) {
				xor_block(count, &bh_ptr[0]);
				count = 1;
			}
		}
		if (count != 1)
			xor_block(count, &bh_ptr[0]);
	} else if (method == READ_MODIFY_WRITE) {
		memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
		bh_ptr[0] = sh->bh_copy[pd_idx];
		count = 1;
		for (i = 0; i < disks; i++) {
			if (i == sh->pd_idx)
				continue;
			if (sh->bh_new[i] && sh->bh_old[i]) {
				bh_ptr[count++] = sh->bh_copy[i];
				bh_ptr[count++] = sh->bh_old[i];
			}
			if (count >= (MAX_XOR_BLOCKS - 1)) {
				xor_block(count, &bh_ptr[0]);
				count = 1;
			}
		}
		if (count != 1)
			xor_block(count, &bh_ptr[0]);
	}
	raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
}
static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
{
	raid5_conf_t *conf = sh->raid_conf;
	struct buffer_head *bh_req;

	PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
	CHECK_SHLOCK(sh);
	if (sh->bh_new[dd_idx])
		BUG();

	bh_req = raid5_alloc_bh(sh);
	raid5_build_block(sh, bh_req, dd_idx);
	bh_req->b_data = bh->b_data;
	bh_req->b_page = bh->b_page;

	md_spin_lock_irq(&conf->device_lock);
	if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
		PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write");
		sh->phase = PHASE_BEGIN;
		sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
		atomic_inc(&conf->nr_pending_stripes);
		atomic_inc(&conf->nr_handle);
		PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle));
	}
	sh->bh_new[dd_idx] = bh;
	sh->bh_req[dd_idx] = bh_req;
	sh->cmd_new[dd_idx] = rw;
	sh->new[dd_idx] = 1;
	md_spin_unlock_irq(&conf->device_lock);

	PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
}
static void complete_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks;
	int i, new = 0;

	PRINTK("complete_stripe %lu\n", sh->sector);
	for (i = 0; i < disks; i++) {
		if (sh->cmd == STRIPE_SYNC && sh->bh_copy[i])
			raid5_update_old_bh(sh, i);
		if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx)
			raid5_update_old_bh(sh, i);
		if (sh->bh_new[i]) {
			PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]);
			if (!sh->new[i]) {
#if 0
				if (sh->cmd == STRIPE_WRITE) {
					if (memcmp(sh->bh_new[i]->b_data, sh->bh_copy[i]->b_data, sh->size)) {
						printk("copy differs, %s, sector %lu ",
							test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? "dirty" : "clean",
							sh->sector);
					} else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state))
						printk("sector %lu dirty\n", sh->sector);
				}
#endif
				if (sh->cmd == STRIPE_WRITE)
					raid5_update_old_bh(sh, i);
				raid5_end_buffer_io(sh, i, 1);
				continue;
			} else
				new++;
		}
		if (new && sh->cmd == STRIPE_WRITE)
			printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new);
	}
	if (sh->cmd == STRIPE_SYNC)
		md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
	if (!new)
		finish_unlock_stripe(sh);
	else {
		PRINTK("stripe %lu, new == %d\n", sh->sector, new);
		sh->phase = PHASE_BEGIN;
	}
}
static int is_stripe_allclean(struct stripe_head *sh, int disks)
{
	int i;

	for (i = 0; i < disks; i++) {
		if (sh->bh_new[i]) {
			if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state))
				return 0;
			continue;
		}
		if (sh->bh_old[i])
			if (test_bit(BH_Dirty, &sh->bh_old[i]->b_state))
				return 0;
	}
	return 1;
}
static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf,
	struct stripe_head *sh, int nr_write, int * operational, int disks,
	int parity, int parity_failed, int nr_cache, int nr_cache_other,
	int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
{
	int i, allclean = 0;
	unsigned int block;
	request_queue_t *q;
	struct buffer_head *bh;
	int method1 = INT_MAX, method2 = INT_MAX;

	/*
	 * Attempt to add entries :-)
	 */
	if (nr_write != disks - 1) {
		for (i = 0; i < disks; i++) {
			if (i == sh->pd_idx)
				continue;
			if (sh->bh_new[i])
				continue;
			block = (int) compute_blocknr(sh, i);
			bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size);
			if (!bh)
				continue;
			if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) {
				PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block);
				add_stripe_bh(sh, bh, i, WRITE);
				nr_write++;
				if (sh->bh_old[i]) {
					nr_cache_overwrite++;
					nr_cache_other--;
				} else
					if (!operational[i]) {
						nr_failed_overwrite++;
						nr_failed_other--;
					}
			}
			atomic_dec(&bh->b_count);
		}
	}
	PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector);
	/*
	 * Writing, need to update parity buffer.
	 *
	 * Compute the number of I/O requests in the "reconstruct
	 * write" and "read modify write" methods.
	 */
	if (!nr_failed_other)
		method1 = (disks - 1) - (nr_write + nr_cache_other);
	if (!nr_failed_overwrite && !parity_failed)
		method2 = nr_write - nr_cache_overwrite + (1 - parity);

	if (method1 == INT_MAX && method2 == INT_MAX)
		BUG();
	PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2);

	if (!method1 || !method2) {
		allclean = is_stripe_allclean(sh, disks);
		sh->phase = PHASE_WRITE;
		compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);

		for (i = 0; i < disks; i++) {
			if (!operational[i] && !conf->spare && !conf->resync_parity)
				continue;
			bh = sh->bh_copy[i];
			if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
				printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]);
			if (i == sh->pd_idx && !bh)
				printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i);
			if (bh) {
				PRINTK("making request for buffer %d\n", i);
				lock_get_bh(bh);
				if (!operational[i] && !conf->resync_parity) {
					PRINTK("writing spare %d\n", i);
					atomic_inc(&sh->nr_pending);
					bh->b_dev = bh->b_rdev = conf->spare->dev;
					q = blk_get_queue(bh->b_rdev);
					generic_make_request(q, WRITERAW, bh);
				} else {
					if (!allclean || (i==sh->pd_idx)) {
						PRINTK("writing dirty %d\n", i);
						atomic_inc(&sh->nr_pending);
						bh->b_dev = bh->b_rdev = conf->disks[i].dev;
						q = blk_get_queue(bh->b_rdev);
						generic_make_request(q, WRITERAW, bh);
					} else {
						PRINTK("not writing clean %d\n", i);
						raid5_end_request(bh, 1);
					}
				}
				atomic_dec(&bh->b_count);
			}
		}
		PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
		return;
	}

	if (method1 < method2) {
		sh->write_method = RECONSTRUCT_WRITE;
		for (i = 0; i < disks; i++) {
			if (i == sh->pd_idx)
				continue;
			if (sh->bh_new[i] || sh->bh_old[i])
				continue;
			sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
			raid5_build_block(sh, sh->bh_old[i], i);
		}
	} else {
		sh->write_method = READ_MODIFY_WRITE;
		for (i = 0; i < disks; i++) {
			if (sh->bh_old[i])
				continue;
			if (!sh->bh_new[i] && i != sh->pd_idx)
				continue;
			sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
			raid5_build_block(sh, sh->bh_old[i], i);
		}
	}
	sh->phase = PHASE_READ_OLD;
	for (i = 0; i < disks; i++) {
		if (!sh->bh_old[i])
			continue;
		if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state))
			continue;
		lock_get_bh(sh->bh_old[i]);
		atomic_inc(&sh->nr_pending);
		sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev;
		q = blk_get_queue(sh->bh_old[i]->b_rdev);
		generic_make_request(q, READ, sh->bh_old[i]);
		atomic_dec(&sh->bh_old[i]->b_count);
	}
	PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
}
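
/*
 * Example of the method1/method2 cost comparison above (added for
 * illustration, numbers are hypothetical): on a 5-disk array writing two
 * blocks with nothing cached, reconstruct-write needs method1 == 4 - 2
 * == 2 pre-reads while read-modify-write needs method2 == 2 + 1 == 3,
 * so RCW wins; a single-block write (method1 == 3, method2 == 2) flips
 * the comparison the other way.
 */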
static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf,
	struct stripe_head *sh, int nr_read, int * operational, int disks,
	int parity, int parity_failed, int nr_cache, int nr_cache_other,
	int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
{
	int i, j;
	request_queue_t *q;
	int method1 = INT_MAX;

	method1 = nr_read - nr_cache_overwrite;

	PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1);

	if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
		PRINTK("read %lu completed from cache\n", sh->sector);
		for (i = 0; i < disks; i++) {
			if (!sh->bh_new[i])
				continue;
			if (!sh->bh_old[i])
				compute_block(sh, i);
			memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
		}
		complete_stripe(sh);
		return;
	}
	if (nr_failed_overwrite) {
		sh->phase = PHASE_READ_OLD;
		for (i = 0; i < disks; i++) {
			if (sh->bh_old[i])
				continue;
			if (!operational[i])
				continue;
			sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
			raid5_build_block(sh, sh->bh_old[i], i);
			lock_get_bh(sh->bh_old[i]);
			atomic_inc(&sh->nr_pending);
			sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev;
			q = blk_get_queue(sh->bh_old[i]->b_rdev);
			generic_make_request(q, READ, sh->bh_old[i]);
			atomic_dec(&sh->bh_old[i]->b_count);
		}
		PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
		return;
	}
	sh->phase = PHASE_READ;
	for (i = 0; i < disks; i++) {
		if (!sh->bh_new[i])
			continue;
		if (sh->bh_old[i]) {
			memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
			continue;
		}
		if (sh->bh_req[i] == NULL || test_bit(BH_Lock, &sh->bh_req[i]->b_state)) {
			printk("req %d is NULL! or locked \n", i);
			for (j=0; j<disks; j++) {
				printk("%d: new=%p old=%p req=%p new=%d cmd=%d\n",
					j, sh->bh_new[j], sh->bh_old[j], sh->bh_req[j],
					sh->new[j], sh->cmd_new[j]);
			}
		}
		lock_get_bh(sh->bh_req[i]);
		atomic_inc(&sh->nr_pending);
		sh->bh_req[i]->b_dev = sh->bh_req[i]->b_rdev = conf->disks[i].dev;
		q = blk_get_queue(sh->bh_req[i]->b_rdev);
		generic_make_request(q, READ, sh->bh_req[i]);
		atomic_dec(&sh->bh_req[i]->b_count);
	}
	PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending));
}
static void handle_stripe_sync (mddev_t *mddev , raid5_conf_t *conf,
	struct stripe_head *sh, int * operational, int disks,
	int parity, int parity_failed, int nr_cache, int nr_cache_other,
	int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
{
	request_queue_t *q;
	struct buffer_head *bh;
	int i, pd_idx;

	/* firstly, we want to have data from all non-failed drives
	 * in bh_old
	 */
	PRINTK("handle_stripe_sync: sec=%lu disks=%d nr_cache=%d\n", sh->sector, disks, nr_cache);
	if ((nr_cache < disks-1) || ((nr_cache == disks-1) && !(parity_failed+nr_failed_other+nr_failed_overwrite))
		) {
		sh->phase = PHASE_READ_OLD;
		for (i = 0; i < disks; i++) {
			if (sh->bh_old[i])
				continue;
			if (!conf->disks[i].operational)
				continue;

			bh = raid5_alloc_buffer(sh, sh->size);
			sh->bh_old[i] = bh;
			raid5_build_block(sh, bh, i);
			lock_get_bh(bh);
			atomic_inc(&sh->nr_pending);
			bh->b_dev = bh->b_rdev = conf->disks[i].dev;
			q = blk_get_queue(bh->b_rdev);
			generic_make_request(q, READ, bh);
			drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0);
			atomic_dec(&sh->bh_old[i]->b_count);
		}
		PRINTK("handle_stripe_sync() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
		return;
	}
	/* now, if there is a failed drive, rebuild and write to spare */
	if (nr_cache == disks-1) {
		sh->phase = PHASE_WRITE;
		/* we can generate the missing block, which will be on the failed drive */
		for (i=0; i<disks; i++) {
			if (operational[i])
				continue;
			compute_block(sh, i);
			if (conf->spare) {
				bh = sh->bh_copy[i];
				if (bh) {
					memcpy(bh->b_data, sh->bh_old[i]->b_data, sh->size);
					set_bit(BH_Uptodate, &bh->b_state);
				} else {
					bh = sh->bh_old[i];
					sh->bh_old[i] = NULL;
					sh->bh_copy[i] = bh;
				}
				atomic_inc(&sh->nr_pending);
				lock_get_bh(bh);
				bh->b_dev = bh->b_rdev = conf->spare->dev;
				q = blk_get_queue(bh->b_rdev);
				generic_make_request(q, WRITERAW, bh);
				drive_stat_acct(bh->b_rdev, WRITE, -bh->b_size/512, 0);
				atomic_dec(&bh->b_count);
				PRINTK("handle_stripe_sync() %lu, phase WRITE, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
			}
			break;
		}
		return;
	}

	/* nr_cache == disks:
	 * check parity and compute/write if needed
	 */
	compute_parity(sh, RECONSTRUCT_WRITE);
	pd_idx = sh->pd_idx;
	if (!memcmp(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size)) {
		/* the parity is correct - Yay! */
		complete_stripe(sh);
	} else {
		sh->phase = PHASE_WRITE;
		bh = sh->bh_copy[pd_idx];
		atomic_set_buffer_dirty(bh);
		lock_get_bh(bh);
		atomic_inc(&sh->nr_pending);
		bh->b_dev = bh->b_rdev = conf->disks[pd_idx].dev;
		q = blk_get_queue(bh->b_rdev);
		generic_make_request(q, WRITERAW, bh);
		drive_stat_acct(bh->b_rdev, WRITE, -bh->b_size/512, 0);
		atomic_dec(&bh->b_count);
		PRINTK("handle_stripe_sync() %lu phase WRITE, pending %d buffers\n",
			sh->sector, md_atomic_read(&sh->nr_pending));
	}
}
/*
 * handle_stripe() is our main logic routine. Note that:
 *
 * 1. lock_stripe() should be used whenever we can't accept additional
 *    buffers, either during short sleeping in handle_stripe() or
 *    during io operations.
 *
 * 2. We should be careful to set sh->nr_pending whenever we sleep,
 *    to prevent re-entry of handle_stripe() for the same sh.
 *
 * 3. conf->failed_disks and disk->operational can be changed
 *    from an interrupt. This complicates things a bit, but it allows
 *    us to stop issuing requests for a failed drive as soon as possible.
 */
static void handle_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	mddev_t *mddev = conf->mddev;
	int disks = conf->raid_disks;
	int i, nr_read = 0, nr_write = 0, parity = 0;
	int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0;
	int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
	int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;

	PRINTK("handle_stripe(), stripe %lu\n", sh->sector);
	if (!stripe_locked(sh))
		BUG();
	if (md_atomic_read(&sh->nr_pending))
		BUG();
	if (sh->phase == PHASE_COMPLETE)
		BUG();

	atomic_dec(&conf->nr_handle);

	if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
		printk("raid5: restarting stripe %lu\n", sh->sector);
		sh->phase = PHASE_BEGIN;
	}

	if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) ||
	    (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) ||
	    (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE)
	) {
		/*
		 * the stripe is complete, all the buffers are either
		 * uptodate or we finished the writes...
		 */
		complete_stripe(sh);
		if (sh->phase == PHASE_COMPLETE)
			return;
	}

	md_spin_lock_irq(&conf->device_lock);
	for (i = 0; i < disks; i++) {
		operational[i] = conf->disks[i].operational;
		if (i == sh->pd_idx && conf->resync_parity)
			operational[i] = 0;
	}
	failed_disks = conf->failed_disks;
	md_spin_unlock_irq(&conf->device_lock);

	/*
	 * Make this one more graceful?
	 */
	if (failed_disks > 1) {
		for (i = 0; i < disks; i++) {
			if (sh->bh_new[i]) {
				raid5_end_buffer_io(sh, i, 0);
				continue;
			}
		}
		if (sh->cmd == STRIPE_SYNC)
			md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
		finish_unlock_stripe(sh);
		return;
	}

	PRINTK("=== stripe index START ===\n");
	for (i = 0; i < disks; i++) {
		PRINTK("disk %d, ", i);
		if (sh->bh_old[i]) {
			nr_cache++;
			PRINTK(" (old cached, %d)", nr_cache);
		}
		if (i == sh->pd_idx) {
			PRINTK(" PARITY.");
			if (sh->bh_old[i]) {
				PRINTK(" CACHED.");
				parity = 1;
			} else {
				PRINTK(" UNCACHED.");
				if (!operational[i]) {
					PRINTK(" FAILED.");
					parity_failed = 1;
				}
			}
			PRINTK("\n");
			continue;
		}
		if (!sh->bh_new[i]) {
			PRINTK(" (no new data block) ");
			if (sh->bh_old[i]) {
				PRINTK(" (but old block cached) ");
				nr_cache_other++;
			} else {
				if (!operational[i]) {
					PRINTK(" (because failed disk) ");
					nr_failed_other++;
				} else
					PRINTK(" (no old block either) ");
			}
			PRINTK("\n");
			continue;
		}
		sh->new[i] = 0;
		if (sh->cmd_new[i] == READ) {
			nr_read++;
			PRINTK(" (new READ %d)", nr_read);
		}
		if (sh->cmd_new[i] == WRITE) {
			nr_write++;
			PRINTK(" (new WRITE %d)", nr_write);
		}
		if (sh->bh_old[i]) {
			nr_cache_overwrite++;
			PRINTK(" (overwriting old %d)", nr_cache_overwrite);
		} else {
			if (!operational[i]) {
				nr_failed_overwrite++;
				PRINTK(" (overwriting failed %d)", nr_failed_overwrite);
			}
		}
		PRINTK("\n");
	}
	PRINTK("=== stripe index END ===\n");

	if (nr_write && nr_read)
		BUG();

	if (nr_write)
		handle_stripe_write(
			mddev, conf, sh, nr_write, operational, disks,
			parity, parity_failed, nr_cache, nr_cache_other,
			nr_failed_other, nr_cache_overwrite,
			nr_failed_overwrite
		);
	else if (nr_read)
		handle_stripe_read(
			mddev, conf, sh, nr_read, operational, disks,
			parity, parity_failed, nr_cache, nr_cache_other,
			nr_failed_other, nr_cache_overwrite,
			nr_failed_overwrite
		);
	else if (sh->cmd == STRIPE_SYNC)
		handle_stripe_sync(
			mddev, conf, sh, operational, disks,
			parity, parity_failed, nr_cache, nr_cache_other,
			nr_failed_other, nr_cache_overwrite, nr_failed_overwrite
		);
}
static int raid5_make_request (request_queue_t *q, mddev_t *mddev, int rw, struct buffer_head * bh)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	const unsigned int raid_disks = conf->raid_disks;
	const unsigned int data_disks = raid_disks - 1;
	unsigned int dd_idx, pd_idx;
	unsigned long new_sector;

	struct stripe_head *sh;

	if (rw == READA)
		rw = READ;

	new_sector = raid5_compute_sector(bh->b_rsector,
			raid_disks, data_disks, &dd_idx, &pd_idx, conf);

	PRINTK("raid5_make_request, sector %lu\n", new_sector);
	sh = get_lock_stripe(conf, new_sector, bh->b_size);

	if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
		PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd);
		if (!md_atomic_read(&sh->nr_pending))
			handle_stripe(sh);
	}

	sh->pd_idx = pd_idx;
	if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN)
		PRINTK("stripe %lu catching the bus!\n", sh->sector);
	if (sh->bh_new[dd_idx])
		BUG();
	add_stripe_bh(sh, bh, dd_idx, rw);

	md_wakeup_thread(conf->thread);
	return 0;
}
/*
 * Determine correct block size for this device.
 */
unsigned int device_bsize (kdev_t dev)
{
	unsigned int i, correct_size;

	correct_size = BLOCK_SIZE;
	if (blksize_size[MAJOR(dev)]) {
		i = blksize_size[MAJOR(dev)][MINOR(dev)];
		if (i)
			correct_size = i;
	}

	return correct_size;
}
static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	struct stripe_head *sh;
	int sectors_per_chunk = conf->chunk_size >> 9;
	unsigned long stripe = (block_nr<<2)/sectors_per_chunk;
	int chunk_offset = (block_nr<<2) % sectors_per_chunk;
	int dd_idx, pd_idx;
	unsigned long first_sector;
	int raid_disks = conf->raid_disks;
	int data_disks = raid_disks-1;
	int redone = 0;
	int bufsize;

	if (!conf->buffer_size)
		conf->buffer_size = /* device_bsize(mddev_to_kdev(mddev))*/ PAGE_SIZE;
	bufsize = conf->buffer_size;
	/* Hmm... race on buffer_size ?? */
	redone = block_nr % (bufsize>>10);
	block_nr -= redone;
	sh = get_lock_stripe(conf, block_nr<<1, bufsize);
	first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
	sh->pd_idx = pd_idx;
	sh->cmd = STRIPE_SYNC;
	sh->phase = PHASE_BEGIN;
	sh->sync_redone = redone;
	atomic_inc(&conf->nr_pending_stripes);
	atomic_inc(&conf->nr_handle);
	md_wakeup_thread(conf->thread);
	return (bufsize>>10)-redone;
}
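
/*
 * Note on units (added for illustration): block_nr here counts 1k
 * blocks, so block_nr<<1 is the 512-byte sector handed to
 * get_lock_stripe().  The stripe is marked STRIPE_SYNC and handed to
 * raid5d; the return value is how many 1k blocks this call covered,
 * minus the "redone" part already handled within the current buffer.
 */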
/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */
static void raid5d (void *data)
{
	struct stripe_head *sh;
	raid5_conf_t *conf = data;
	mddev_t *mddev = conf->mddev;
	int i, handled = 0;

	PRINTK("+++ raid5d active\n");

	md_spin_lock_irq(&conf->device_lock);
	clear_bit(THREAD_WAKEUP, &conf->thread->flags);
repeat_pass:
	if (mddev->sb_dirty) {
		md_spin_unlock_irq(&conf->device_lock);
		mddev->sb_dirty = 0;
		md_update_sb(mddev);
		md_spin_lock_irq(&conf->device_lock);
	}
	for (i = 0; i < NR_HASH; i++) {
repeat:
		sh = conf->stripe_hashtbl[i];
		for (; sh; sh = sh->hash_next) {
			if (sh->raid_conf != conf)
				continue;
			if (sh->phase == PHASE_COMPLETE)
				continue;
			if (md_atomic_read(&sh->nr_pending))
				continue;
			md_spin_unlock_irq(&conf->device_lock);
			if (!atomic_read(&sh->count))
				BUG();

			handled++;
			handle_stripe(sh);
			md_spin_lock_irq(&conf->device_lock);
			goto repeat;
		}
	}
	PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle));
	if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) &&
			md_atomic_read(&conf->nr_handle))
		goto repeat_pass;
	md_spin_unlock_irq(&conf->device_lock);

	PRINTK("--- raid5d inactive\n");
}
/*
 * Private kernel thread for parity reconstruction after an unclean
 * shutdown. Reconstruction on spare drives in case of a failed drive
 * is done by the generic mdsyncd.
 */
static void raid5syncd (void *data)
{
	raid5_conf_t *conf = data;
	mddev_t *mddev = conf->mddev;

	if (!conf->resync_parity)
		return;
	if (conf->resync_parity == 2)
		return;
	down(&mddev->recovery_sem);
	if (md_do_sync(mddev,NULL)) {
		up(&mddev->recovery_sem);
		printk("raid5: resync aborted!\n");
		return;
	}
	conf->resync_parity = 0;
	up(&mddev->recovery_sem);
	printk("raid5: resync finished.\n");
}
static int __check_consistency (mddev_t *mddev, int row)
{
	raid5_conf_t *conf = mddev->private;
	kdev_t dev;
	struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL;
	int i, ret = 0, nr = 0, count;
	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];

	if (conf->working_disks != conf->raid_disks)
		goto out;
	tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
	if (!tmp)
		goto out;
	tmp->b_size = 4096;
	tmp->b_page = alloc_page(GFP_KERNEL);
	tmp->b_data = (char *)page_address(tmp->b_page);
	if (!tmp->b_data)
		goto out;
	md_clear_page((unsigned long)tmp->b_data);
	memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
	for (i = 0; i < conf->raid_disks; i++) {
		dev = conf->disks[i].dev;
		set_blocksize(dev, 4096);
		bh[i] = bread(dev, row / 4, 4096);
		if (!bh[i])
			break;
		nr++;
	}
	if (nr == conf->raid_disks) {
		bh_ptr[0] = tmp;
		count = 1;
		for (i = 1; i < nr; i++) {
			bh_ptr[count++] = bh[i];
			if (count == MAX_XOR_BLOCKS) {
				xor_block(count, &bh_ptr[0]);
				count = 1;
			}
		}
		if (count != 1)
			xor_block(count, &bh_ptr[0]);

		if (memcmp(tmp->b_data, bh[0]->b_data, 4096))
			ret = 1;
	}
	for (i = 0; i < conf->raid_disks; i++) {
		dev = conf->disks[i].dev;
		if (bh[i]) {
			brelse(bh[i]);
			bh[i] = NULL;
		}
		invalidate_buffers(dev);
	}
	free_page((unsigned long) tmp->b_data);
out:
	if (tmp)
		kfree(tmp);
	return ret;
}

static int check_consistency (mddev_t *mddev)
{
	if (__check_consistency(mddev, 0))
/*
 * We are not checking this currently, as it's legitimate to have
 * an inconsistent array, at creation time.
 */
		return 0;

	return 0;
}
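
/*
 * Note (added for illustration): __check_consistency() reads one 4k
 * block from every member disk, xors disks 1..n-1 into a zeroed scratch
 * page and compares the result with disk 0.  A consistent stripe
 * satisfies D0 ^ D1 ^ ... ^ P == 0, i.e. D0 == D1 ^ ... ^ P, so any
 * mismatch indicates stale parity.
 */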
static int raid5_run (mddev_t *mddev)
{
	raid5_conf_t *conf;
	int i, j, raid_disk, memory;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *desc;
	mdk_rdev_t *rdev;
	struct disk_info *disk;
	struct md_list_head *tmp;
	int start_recovery = 0;

	MOD_INC_USE_COUNT;

	if (sb->level != 5 && sb->level != 4) {
		printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
		MOD_DEC_USE_COUNT;
		return -EIO;
	}

	mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
	if ((conf = mddev->private) == NULL)
		goto abort;
	memset (conf, 0, sizeof (*conf));
	conf->mddev = mddev;

	if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
		goto abort;
	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);

	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
	md_init_waitqueue_head(&conf->wait_for_stripe);
	PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));

	ITERATE_RDEV(mddev,rdev,tmp) {
		/*
		 * This is important -- we are using the descriptor on
		 * the disk only to get a pointer to the descriptor on
		 * the main superblock, which might be more recent.
		 */
		desc = sb->disks + rdev->desc_nr;
		raid_disk = desc->raid_disk;
		disk = conf->disks + raid_disk;

		if (disk_faulty(desc)) {
			printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
			if (!rdev->faulty) {
				MD_BUG();
				goto abort;
			}
			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = rdev->dev;

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
			continue;
		}
		if (disk_active(desc)) {
			if (!disk_sync(desc)) {
				printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
				MD_BUG();
				goto abort;
			}
			if (raid_disk > sb->raid_disks) {
				printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
				continue;
			}
			if (disk->operational) {
				printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
				continue;
			}
			printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);

			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = rdev->dev;
			disk->operational = 1;
			disk->used_slot = 1;

			conf->working_disks++;
		} else {
			/*
			 * Must be a spare disk ..
			 */
			printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = rdev->dev;

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 1;
			disk->used_slot = 1;
		}
	}

	for (i = 0; i < MD_SB_DISKS; i++) {
		desc = sb->disks + i;
		raid_disk = desc->raid_disk;
		disk = conf->disks + raid_disk;

		if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
			!conf->disks[raid_disk].used_slot) {

			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = MKDEV(0,0);

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
		}
	}

	conf->raid_disks = sb->raid_disks;
	/*
	 * 0 for a fully functional array, 1 for a degraded array.
	 */
	conf->failed_disks = conf->raid_disks - conf->working_disks;
	conf->mddev = mddev;
	conf->chunk_size = sb->chunk_size;
	conf->level = sb->level;
	conf->algorithm = sb->layout;
	conf->max_nr_stripes = NR_STRIPES;

	for (i = 0; i < conf->raid_disks; i++) {
		if (!conf->disks[i].used_slot) {
			MD_BUG();
			goto abort;
		}
	}

	if (!conf->chunk_size || conf->chunk_size % 4) {
		printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
		goto abort;
	}
	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
		printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
		goto abort;
	}
	if (conf->failed_disks > 1) {
		printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
		goto abort;
	}

	if (conf->working_disks != sb->raid_disks) {
		printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
		start_recovery = 1;
	}

	if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
			check_consistency(mddev)) {
		printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
		sb->state &= ~(1 << MD_SB_CLEAN);
	}

	{
		const char * name = "raid5d";

		conf->thread = md_register_thread(raid5d, conf, name);
		if (!conf->thread) {
			printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
			goto abort;
		}
	}

	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
		 conf->raid_disks * (sizeof(struct buffer_head) +
		 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
	if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
		printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
		shrink_stripes(conf, conf->max_nr_stripes);
		goto abort;
	} else
		printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));

	/*
	 * Regenerate the "device is in sync with the raid set" bit for
	 * each device.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		mark_disk_nonsync(sb->disks + i);
		for (j = 0; j < sb->raid_disks; j++) {
			if (!conf->disks[j].operational)
				continue;
			if (sb->disks[i].number == conf->disks[j].number)
				mark_disk_sync(sb->disks + i);
		}
	}
	sb->active_disks = conf->working_disks;

	if (sb->active_disks == sb->raid_disks)
		printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
	else
		printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);

	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
		const char * name = "raid5syncd";

		conf->resync_thread = md_register_thread(raid5syncd, conf,name);
		if (!conf->resync_thread) {
			printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
			goto abort;
		}

		printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
		conf->resync_parity = 1;
		md_wakeup_thread(conf->resync_thread);
	}

	print_raid5_conf(conf);
	if (start_recovery)
		md_recover_arrays();
	print_raid5_conf(conf);

	/* Ok, everything is just fine now */
	return 0;
abort:
	if (conf) {
		print_raid5_conf(conf);
		if (conf->stripe_hashtbl)
			free_pages((unsigned long) conf->stripe_hashtbl,
							HASH_PAGES_ORDER);
		kfree(conf);
	}
	mddev->private = NULL;
	printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
	MOD_DEC_USE_COUNT;
	return -EIO;
}
static int raid5_stop_resync (mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
	mdk_thread_t *thread = conf->resync_thread;

	if (thread) {
		if (conf->resync_parity) {
			conf->resync_parity = 2;
			md_interrupt_thread(thread);
			printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
			return 1;
		}
	}
	return 0;
}

static int raid5_restart_resync (mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);

	if (conf->resync_parity) {
		if (!conf->resync_thread) {
			MD_BUG();
			return 0;
		}
		printk("raid5: waking up raid5resync.\n");
		conf->resync_parity = 1;
		md_wakeup_thread(conf->resync_thread);
		return 1;
	} else
		printk("raid5: no restart-resync needed.\n");
	return 0;
}

static int raid5_stop (mddev_t *mddev)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;

	shrink_stripe_cache(conf, conf->max_nr_stripes);
	shrink_stripes(conf, conf->max_nr_stripes);
	md_unregister_thread(conf->thread);
	if (conf->resync_thread)
		md_unregister_thread(conf->resync_thread);
	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
	kfree(conf);
	mddev->private = NULL;
	MOD_DEC_USE_COUNT;
	return 0;
}
#if RAID5_DEBUG
static void print_sh (struct stripe_head *sh)
{
	int i;

	printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd);
	printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count));
	printk("sh %lu, ", sh->sector);
	for (i = 0; i < MD_SB_DISKS; i++) {
		if (sh->bh_old[i])
			printk("(old%d: %p) ", i, sh->bh_old[i]);
		if (sh->bh_new[i])
			printk("(new%d: %p) ", i, sh->bh_new[i]);
		if (sh->bh_copy[i])
			printk("(copy%d: %p) ", i, sh->bh_copy[i]);
		if (sh->bh_req[i])
			printk("(req%d: %p) ", i, sh->bh_req[i]);
	}
	printk("\n");
	for (i = 0; i < MD_SB_DISKS; i++)
		printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]);
	printk("\n");
}

static void printall (raid5_conf_t *conf)
{
	struct stripe_head *sh;
	int i;

	md_spin_lock_irq(&conf->device_lock);
	for (i = 0; i < NR_HASH; i++) {
		sh = conf->stripe_hashtbl[i];
		for (; sh; sh = sh->hash_next) {
			if (sh->raid_conf != conf)
				continue;
			print_sh(sh);
		}
	}
	md_spin_unlock_irq(&conf->device_lock);

	PRINTK("--- raid5d inactive\n");
}
#endif
static int raid5_status (char *page, mddev_t *mddev)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	mdp_super_t *sb = mddev->sb;
	int sz = 0, i;

	sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
	sz += sprintf (page+sz, "]");
#if RAID5_DEBUG
#define D(x) \
	sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
	D(nr_hashed_stripes);
	D(nr_locked_stripes);
	D(nr_pending_stripes);
	D(nr_cached_stripes);
	D(nr_free_sh);
	printall(conf);
#endif
	return sz;
}
static void print_raid5_conf (raid5_conf_t *conf)
{
	int i;
	struct disk_info *tmp;

	printk("RAID5 conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
		 conf->working_disks, conf->failed_disks);

	for (i = 0; i < MD_SB_DISKS; i++) {
		tmp = conf->disks + i;
		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
			i, tmp->spare,tmp->operational,
			tmp->number,tmp->raid_disk,tmp->used_slot,
			partition_name(tmp->dev));
	}
}
static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
	int err = 0;
	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
	raid5_conf_t *conf = mddev->private;
	struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *failed_desc, *spare_desc, *added_desc;

	print_raid5_conf(conf);
	md_spin_lock_irq(&conf->device_lock);
	/*
	 * find the disk ...
	 */
	switch (state) {

	case DISKOP_SPARE_ACTIVE:

		/*
		 * Find the failed disk within the RAID5 configuration ...
		 * (this can only be in the first conf->raid_disks part)
		 */
		for (i = 0; i < conf->raid_disks; i++) {
			tmp = conf->disks + i;
			if ((!tmp->operational && !tmp->spare) ||
					!tmp->used_slot) {
				failed_disk = i;
				break;
			}
		}
		/*
		 * When we activate a spare disk we _must_ have a disk in
		 * the lower (active) part of the array to replace.
		 */
		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		/* fall through */

	case DISKOP_SPARE_WRITE:
	case DISKOP_SPARE_INACTIVE:

		/*
		 * Find the spare disk ... (can only be in the 'high'
		 * area of the array)
		 */
		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (tmp->spare && tmp->number == (*d)->number) {
				spare_disk = i;
				break;
			}
		}
		if (spare_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_REMOVE_DISK:

		for (i = 0; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (tmp->used_slot && (tmp->number == (*d)->number)) {
				if (tmp->operational) {
					err = -EBUSY;
					goto abort;
				}
				removed_disk = i;
				break;
			}
		}
		if (removed_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_ADD_DISK:

		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (!tmp->used_slot) {
				added_disk = i;
				break;
			}
		}
		if (added_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;
	}

	switch (state) {
	/*
	 * Switch the spare disk to write-only mode:
	 */
	case DISKOP_SPARE_WRITE:
		if (conf->spare) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		sdisk = conf->disks + spare_disk;
		sdisk->operational = 1;
		sdisk->write_only = 1;
		conf->spare = sdisk;
		break;
	/*
	 * Deactivate a spare disk:
	 */
	case DISKOP_SPARE_INACTIVE:
		sdisk = conf->disks + spare_disk;
		sdisk->operational = 0;
		sdisk->write_only = 0;
		/*
		 * Was the spare being resynced?
		 */
		if (conf->spare == sdisk)
			conf->spare = NULL;
		break;
	/*
	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
	 * with the failed disk. (only the first 'conf->raid_disks'
	 * slots are used for 'real' disks and we must preserve this
	 * property)
	 */
	case DISKOP_SPARE_ACTIVE:
		if (!conf->spare) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		sdisk = conf->disks + spare_disk;
		fdisk = conf->disks + failed_disk;

		spare_desc = &sb->disks[sdisk->number];
		failed_desc = &sb->disks[fdisk->number];

		if (spare_desc != *d) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		if (spare_desc->raid_disk != sdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		if (sdisk->raid_disk != spare_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		if (failed_desc->raid_disk != fdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		if (fdisk->raid_disk != failed_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		/*
		 * do the switch finally
		 */
		xchg_values(*spare_desc, *failed_desc);
		xchg_values(*fdisk, *sdisk);

		/*
		 * (careful, 'failed' and 'spare' are switched from now on)
		 *
		 * we want to preserve linear numbering and we want to
		 * give the proper raid_disk number to the now activated
		 * disk. (this means we switch back these values)
		 */

		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
		xchg_values(spare_desc->number, failed_desc->number);
		xchg_values(sdisk->number, fdisk->number);

		*d = failed_desc;

		if (sdisk->dev == MKDEV(0,0))
			sdisk->used_slot = 0;

		/*
		 * this really activates the spare.
		 */
		fdisk->spare = 0;
		fdisk->write_only = 0;

		/*
		 * if we activate a spare, we definitely replace a
		 * non-operational disk slot in the 'low' area of
		 * the disk array.
		 */
		conf->failed_disks--;
		conf->working_disks++;
		conf->spare = NULL;

		break;

	case DISKOP_HOT_REMOVE_DISK:
		rdisk = conf->disks + removed_disk;

		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		rdisk->dev = MKDEV(0,0);
		rdisk->used_slot = 0;

		break;

	case DISKOP_HOT_ADD_DISK:
		adisk = conf->disks + added_disk;
		added_desc = *d;

		if (added_disk != added_desc->number) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		adisk->number = added_desc->number;
		adisk->raid_disk = added_desc->raid_disk;
		adisk->dev = MKDEV(added_desc->major,added_desc->minor);

		adisk->operational = 0;
		adisk->write_only = 0;
		adisk->spare = 1;
		adisk->used_slot = 1;

		break;

	default:
		MD_BUG();
		err = 1;
		goto abort;
	}
abort:
	md_spin_unlock_irq(&conf->device_lock);
	print_raid5_conf(conf);
	return err;
}
static mdk_personality_t raid5_personality=
{
	name:		"raid5",
	make_request:	raid5_make_request,
	run:		raid5_run,
	stop:		raid5_stop,
	status:		raid5_status,
	error_handler:	raid5_error,
	diskop:		raid5_diskop,
	stop_resync:	raid5_stop_resync,
	restart_resync:	raid5_restart_resync,
	sync_request:	raid5_sync_request
};

int raid5_init (void)
{
	int err;

	err = register_md_personality (RAID5, &raid5_personality);
	if (err)
		return err;
	return 0;
}

#ifdef MODULE
int init_module (void)
{
	return raid5_init();
}

void cleanup_module (void)
{
	unregister_md_personality (RAID5);
}
#endif