drivers/block/raid5.c (Linux 2.4.0-test2pre7)
1 /*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
6 * RAID-5 management functions.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
11 * any later version.
13 * You should have received a copy of the GNU General Public License
14 * (for example /usr/src/linux/COPYING); if not, write to the Free
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 #include <linux/config.h>
20 #include <linux/module.h>
21 #include <linux/locks.h>
22 #include <linux/malloc.h>
23 #include <linux/raid/raid5.h>
24 #include <asm/bitops.h>
25 #include <asm/atomic.h>
27 static mdk_personality_t raid5_personality;
30 * Stripe cache
33 #define NR_STRIPES 128
34 #define HASH_PAGES 1
35 #define HASH_PAGES_ORDER 0
36 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
37 #define HASH_MASK (NR_HASH - 1)
38 #define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK])
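/*
 * Illustration (added, not in the original source): the hash key is the
 * stripe's starting sector divided by the buffer size in sectors, masked
 * to the table size.  Assuming 4 KB pages, 4-byte pointers and 4 KB
 * buffers (size = 4096, so size >> 9 == 8), NR_HASH is 4096 / 4 == 1024
 * buckets and a stripe starting at sector 80 hashes to bucket
 * (80 / 8) & 1023 == 10.
 */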
41 * The following can be used to debug the driver
43 #define RAID5_DEBUG 0
44 #define RAID5_PARANOIA 1
45 #if RAID5_PARANOIA && CONFIG_SMP
46 # define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
47 # define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG()
48 #else
49 # define CHECK_DEVLOCK()
50 # define CHECK_SHLOCK(unused)
51 #endif
53 #if RAID5_DEBUG
54 #define PRINTK(x...) printk(x)
55 #define inline
56 #define __inline__
57 #else
58 #define PRINTK(x...) do { } while (0)
59 #endif
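/*
 * Note (added for clarity): with RAID5_DEBUG enabled above, 'inline' and
 * '__inline__' are defined away, so the small helpers below are emitted as
 * ordinary functions and stay visible in stack traces while debugging.
 */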
61 static void print_raid5_conf (raid5_conf_t *conf);
63 static inline int stripe_locked(struct stripe_head *sh)
65 return test_bit(STRIPE_LOCKED, &sh->state);
68 static void __unlock_stripe(struct stripe_head *sh)
70 if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state))
71 BUG();
72 PRINTK("unlocking stripe %lu\n", sh->sector);
73 wake_up(&sh->wait);
76 static void finish_unlock_stripe(struct stripe_head *sh)
78 raid5_conf_t *conf = sh->raid_conf;
79 sh->cmd = STRIPE_NONE;
80 sh->phase = PHASE_COMPLETE;
81 atomic_dec(&conf->nr_pending_stripes);
82 atomic_inc(&conf->nr_cached_stripes);
83 __unlock_stripe(sh);
84 atomic_dec(&sh->count);
85 wake_up(&conf->wait_for_stripe);
88 static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh)
90 PRINTK("remove_hash(), stripe %lu\n", sh->sector);
92 CHECK_DEVLOCK();
93 CHECK_SHLOCK(sh);
94 if (sh->hash_pprev) {
95 if (sh->hash_next)
96 sh->hash_next->hash_pprev = sh->hash_pprev;
97 *sh->hash_pprev = sh->hash_next;
98 sh->hash_pprev = NULL;
99 atomic_dec(&conf->nr_hashed_stripes);
103 static void lock_get_bh (struct buffer_head *bh)
105 while (md_test_and_set_bit(BH_Lock, &bh->b_state))
106 __wait_on_buffer(bh);
107 atomic_inc(&bh->b_count);
110 static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
112 struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size);
114 PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n",
115 sh->sector, atomic_read(&conf->nr_hashed_stripes));
117 CHECK_DEVLOCK();
118 CHECK_SHLOCK(sh);
119 if ((sh->hash_next = *shp) != NULL)
120 (*shp)->hash_pprev = &sh->hash_next;
121 *shp = sh;
122 sh->hash_pprev = shp;
123 atomic_inc(&conf->nr_hashed_stripes);
126 static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size)
128 struct buffer_head *bh;
129 unsigned long flags;
131 CHECK_SHLOCK(sh);
132 md_spin_lock_irqsave(&sh->stripe_lock, flags);
133 bh = sh->buffer_pool;
134 if (!bh)
135 goto out_unlock;
136 sh->buffer_pool = bh->b_next;
137 bh->b_size = b_size;
138 if (atomic_read(&bh->b_count))
139 BUG();
140 out_unlock:
141 md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
143 return bh;
146 static struct buffer_head *get_free_bh(struct stripe_head *sh)
148 struct buffer_head *bh;
149 unsigned long flags;
151 CHECK_SHLOCK(sh);
152 md_spin_lock_irqsave(&sh->stripe_lock, flags);
153 bh = sh->bh_pool;
154 if (!bh)
155 goto out_unlock;
156 sh->bh_pool = bh->b_next;
157 if (atomic_read(&bh->b_count))
158 BUG();
159 out_unlock:
160 md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
162 return bh;
165 static void put_free_buffer(struct stripe_head *sh, struct buffer_head *bh)
167 unsigned long flags;
169 if (atomic_read(&bh->b_count))
170 BUG();
171 CHECK_SHLOCK(sh);
172 md_spin_lock_irqsave(&sh->stripe_lock, flags);
173 bh->b_next = sh->buffer_pool;
174 sh->buffer_pool = bh;
175 md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
178 static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh)
180 unsigned long flags;
182 if (atomic_read(&bh->b_count))
183 BUG();
184 CHECK_SHLOCK(sh);
185 md_spin_lock_irqsave(&sh->stripe_lock, flags);
186 bh->b_next = sh->bh_pool;
187 sh->bh_pool = bh;
188 md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
191 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
193 struct stripe_head *sh;
195 md_spin_lock_irq(&conf->device_lock);
196 sh = conf->free_sh_list;
197 if (!sh)
198 goto out;
199 conf->free_sh_list = sh->free_next;
200 atomic_dec(&conf->nr_free_sh);
201 if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list)
202 BUG();
203 if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) ||
204 atomic_read(&sh->count))
205 BUG();
206 out:
207 md_spin_unlock_irq(&conf->device_lock);
208 return sh;
211 static void __put_free_stripe (raid5_conf_t *conf, struct stripe_head *sh)
213 if (atomic_read(&sh->count) != 0)
214 BUG();
215 CHECK_DEVLOCK();
216 CHECK_SHLOCK(sh);
217 clear_bit(STRIPE_LOCKED, &sh->state);
218 sh->free_next = conf->free_sh_list;
219 conf->free_sh_list = sh;
220 atomic_inc(&conf->nr_free_sh);
223 static void shrink_buffers(struct stripe_head *sh, int num)
225 struct buffer_head *bh;
227 while (num--) {
228 bh = get_free_buffer(sh, -1);
229 if (!bh)
230 return;
231 free_page((unsigned long) bh->b_data);
232 kfree(bh);
236 static void shrink_bh(struct stripe_head *sh, int num)
238 struct buffer_head *bh;
240 while (num--) {
241 bh = get_free_bh(sh);
242 if (!bh)
243 return;
244 kfree(bh);
248 static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority)
250 struct buffer_head *bh;
252 while (num--) {
253 struct page *page;
254 bh = kmalloc(sizeof(struct buffer_head), priority);
255 if (!bh)
256 return 1;
257 memset(bh, 0, sizeof (struct buffer_head));
258 init_waitqueue_head(&bh->b_wait);
259 page = alloc_page(priority);
260 bh->b_data = (char *) page_address(page);
261 if (!bh->b_data) {
262 kfree(bh);
263 return 1;
265 bh->b_size = b_size;
266 atomic_set(&bh->b_count, 0);
267 set_bh_page(bh, page, 0);
268 put_free_buffer(sh, bh);
270 return 0;
273 static int grow_bh(struct stripe_head *sh, int num, int priority)
275 struct buffer_head *bh;
277 while (num--) {
278 bh = kmalloc(sizeof(struct buffer_head), priority);
279 if (!bh)
280 return 1;
281 memset(bh, 0, sizeof (struct buffer_head));
282 init_waitqueue_head(&bh->b_wait);
283 put_free_bh(sh, bh);
285 return 0;
288 static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh)
290 put_free_buffer(sh, bh);
293 static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh)
295 put_free_bh(sh, bh);
298 static void raid5_free_old_bh(struct stripe_head *sh, int i)
300 CHECK_SHLOCK(sh);
301 if (!sh->bh_old[i])
302 BUG();
303 raid5_free_buffer(sh, sh->bh_old[i]);
304 sh->bh_old[i] = NULL;
307 static void raid5_update_old_bh(struct stripe_head *sh, int i)
309 CHECK_SHLOCK(sh);
310 PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i);
311 if (!sh->bh_copy[i])
312 BUG();
313 if (sh->bh_old[i])
314 raid5_free_old_bh(sh, i);
315 sh->bh_old[i] = sh->bh_copy[i];
316 sh->bh_copy[i] = NULL;
319 static void free_stripe(struct stripe_head *sh)
321 raid5_conf_t *conf = sh->raid_conf;
322 int disks = conf->raid_disks, j;
324 if (atomic_read(&sh->count) != 0)
325 BUG();
326 CHECK_DEVLOCK();
327 CHECK_SHLOCK(sh);
328 PRINTK("free_stripe called, stripe %lu\n", sh->sector);
329 if (sh->phase != PHASE_COMPLETE || atomic_read(&sh->count)) {
330 PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count));
331 return;
333 for (j = 0; j < disks; j++) {
334 if (sh->bh_old[j])
335 raid5_free_old_bh(sh, j);
336 if (sh->bh_new[j] || sh->bh_copy[j])
337 BUG();
339 remove_hash(conf, sh);
340 __put_free_stripe(conf, sh);
343 static int shrink_stripe_cache(raid5_conf_t *conf, int nr)
345 struct stripe_head *sh;
346 int i, count = 0;
348 PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock);
349 md_spin_lock_irq(&conf->device_lock);
350 for (i = 0; i < NR_HASH; i++) {
351 sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK];
352 for (; sh; sh = sh->hash_next) {
353 if (sh->phase != PHASE_COMPLETE)
354 continue;
355 if (atomic_read(&sh->count))
356 continue;
358 * Try to lock this stripe:
360 if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
361 continue;
362 free_stripe(sh);
363 if (++count == nr) {
364 conf->clock = (i + conf->clock) & HASH_MASK;
365 goto out;
369 out:
370 md_spin_unlock_irq(&conf->device_lock);
371 PRINTK("shrink completed, nr_hashed_stripes %d, nr_pending_strips %d\n",
372 atomic_read(&conf->nr_hashed_stripes),
373 atomic_read(&conf->nr_pending_stripes));
374 return count;
377 void __wait_lock_stripe(struct stripe_head *sh)
379 MD_DECLARE_WAITQUEUE(wait, current);
381 PRINTK("wait_lock_stripe %lu\n", sh->sector);
382 if (!atomic_read(&sh->count))
383 BUG();
384 add_wait_queue(&sh->wait, &wait);
385 repeat:
386 set_current_state(TASK_UNINTERRUPTIBLE);
387 if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) {
388 schedule();
389 goto repeat;
391 PRINTK("wait_lock_stripe %lu done\n", sh->sector);
392 remove_wait_queue(&sh->wait, &wait);
393 current->state = TASK_RUNNING;
396 static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size)
398 struct stripe_head *sh;
400 PRINTK("__find_stripe, sector %lu\n", sector);
401 for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) {
402 if (sh->sector == sector && sh->raid_conf == conf) {
403 if (sh->size != size)
404 BUG();
405 return sh;
408 PRINTK("__stripe %lu not in cache\n", sector);
409 return NULL;
412 static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size)
414 struct stripe_head *sh;
415 struct buffer_head *buffer_pool, *bh_pool;
416 MD_DECLARE_WAITQUEUE(wait, current);
418 PRINTK("alloc_stripe called\n");
421 while ((sh = get_free_stripe(conf)) == NULL) {
422 int cnt;
423 add_wait_queue(&conf->wait_for_stripe, &wait);
424 set_current_state(TASK_UNINTERRUPTIBLE);
425 cnt = shrink_stripe_cache(conf, conf->max_nr_stripes / 8);
426 sh = get_free_stripe(conf);
427 if (!sh && cnt < (conf->max_nr_stripes/8)) {
428 md_wakeup_thread(conf->thread);
429 PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8);
430 schedule();
432 remove_wait_queue(&conf->wait_for_stripe, &wait);
433 current->state = TASK_RUNNING;
434 if (sh)
435 break;
438 buffer_pool = sh->buffer_pool;
439 bh_pool = sh->bh_pool;
440 memset(sh, 0, sizeof(*sh));
441 sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
442 md_init_waitqueue_head(&sh->wait);
443 sh->buffer_pool = buffer_pool;
444 sh->bh_pool = bh_pool;
445 sh->phase = PHASE_COMPLETE;
446 sh->cmd = STRIPE_NONE;
447 sh->raid_conf = conf;
448 sh->sector = sector;
449 sh->size = size;
450 atomic_inc(&conf->nr_cached_stripes);
452 return sh;
455 static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size)
457 struct stripe_head *sh, *new = NULL;
459 PRINTK("get_stripe, sector %lu\n", sector);
462 * Do this in set_blocksize()!
464 if (conf->buffer_size != size) {
465 PRINTK("switching size, %d --> %d\n", conf->buffer_size, size);
466 shrink_stripe_cache(conf, conf->max_nr_stripes);
467 conf->buffer_size = size;
470 repeat:
471 md_spin_lock_irq(&conf->device_lock);
472 sh = __find_stripe(conf, sector, size);
473 if (!sh) {
474 if (!new) {
475 md_spin_unlock_irq(&conf->device_lock);
476 new = alloc_stripe(conf, sector, size);
477 goto repeat;
479 sh = new;
480 new = NULL;
481 if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
482 BUG();
483 insert_hash(conf, sh);
484 atomic_inc(&sh->count);
485 md_spin_unlock_irq(&conf->device_lock);
486 } else {
487 atomic_inc(&sh->count);
488 if (new) {
489 if (md_test_and_set_bit(STRIPE_LOCKED, &new->state))
490 BUG();
491 __put_free_stripe(conf, new);
493 md_spin_unlock_irq(&conf->device_lock);
494 PRINTK("get_stripe, waiting, sector %lu\n", sector);
495 if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
496 __wait_lock_stripe(sh);
498 return sh;
501 static int grow_stripes(raid5_conf_t *conf, int num, int priority)
503 struct stripe_head *sh;
505 while (num--) {
506 sh = kmalloc(sizeof(struct stripe_head), priority);
507 if (!sh)
508 return 1;
509 memset(sh, 0, sizeof(*sh));
510 sh->raid_conf = conf;
511 sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED;
512 md_init_waitqueue_head(&sh->wait);
514 if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
515 BUG();
516 if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) {
517 shrink_buffers(sh, 2 * conf->raid_disks);
518 kfree(sh);
519 return 1;
521 if (grow_bh(sh, conf->raid_disks, priority)) {
522 shrink_buffers(sh, 2 * conf->raid_disks);
523 shrink_bh(sh, conf->raid_disks);
524 kfree(sh);
525 return 1;
527 md_spin_lock_irq(&conf->device_lock);
528 __put_free_stripe(conf, sh);
529 atomic_inc(&conf->nr_stripes);
530 md_spin_unlock_irq(&conf->device_lock);
532 return 0;
535 static void shrink_stripes(raid5_conf_t *conf, int num)
537 struct stripe_head *sh;
539 while (num--) {
540 sh = get_free_stripe(conf);
541 if (!sh)
542 break;
543 if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state))
544 BUG();
545 shrink_buffers(sh, conf->raid_disks * 2);
546 shrink_bh(sh, conf->raid_disks);
547 kfree(sh);
548 atomic_dec(&conf->nr_stripes);
553 static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size)
555 struct buffer_head *bh;
557 bh = get_free_buffer(sh, b_size);
558 if (!bh)
559 BUG();
560 return bh;
563 static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh)
565 struct buffer_head *bh;
567 bh = get_free_bh(sh);
568 if (!bh)
569 BUG();
570 return bh;
573 static void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate)
575 struct buffer_head *bh = sh->bh_new[i];
577 PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_blocknr, uptodate);
578 sh->bh_new[i] = NULL;
579 raid5_free_bh(sh, sh->bh_req[i]);
580 sh->bh_req[i] = NULL;
581 PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io);
582 bh->b_end_io(bh, uptodate);
583 if (!uptodate)
584 printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for "
585 "block %lu\n",
586 partition_name(mddev_to_kdev(sh->raid_conf->mddev)),
587 bh->b_blocknr);
590 static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate)
592 if (uptodate)
593 set_bit(BH_Uptodate, &bh->b_state);
594 else
595 clear_bit(BH_Uptodate, &bh->b_state);
598 static void raid5_end_request (struct buffer_head * bh, int uptodate)
600 struct stripe_head *sh = bh->b_dev_id;
601 raid5_conf_t *conf = sh->raid_conf;
602 int disks = conf->raid_disks, i;
603 unsigned long flags;
605 PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3));
606 md_spin_lock_irqsave(&sh->stripe_lock, flags);
607 raid5_mark_buffer_uptodate(bh, uptodate);
608 if (!uptodate)
609 md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
610 if (conf->failed_disks) {
611 for (i = 0; i < disks; i++) {
612 if (conf->disks[i].operational)
613 continue;
614 if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i])
615 continue;
616 if (bh->b_dev != conf->disks[i].dev)
617 continue;
618 set_bit(STRIPE_ERROR, &sh->state);
621 md_spin_unlock_irqrestore(&sh->stripe_lock, flags);
623 if (atomic_dec_and_test(&sh->nr_pending)) {
624 atomic_inc(&conf->nr_handle);
625 md_wakeup_thread(conf->thread);
629 static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i)
631 raid5_conf_t *conf = sh->raid_conf;
632 char *b_data;
633 struct page *b_page;
634 int block = sh->sector / (sh->size >> 9);
636 b_data = bh->b_data;
637 b_page = bh->b_page;
638 memset (bh, 0, sizeof (struct buffer_head));
639 init_waitqueue_head(&bh->b_wait);
640 init_buffer(bh, raid5_end_request, sh);
641 bh->b_dev = conf->disks[i].dev;
642 bh->b_blocknr = block;
644 bh->b_data = b_data;
645 bh->b_page = b_page;
647 bh->b_rdev = conf->disks[i].dev;
648 bh->b_rsector = sh->sector;
650 bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
651 bh->b_size = sh->size;
652 bh->b_list = BUF_LOCKED;
655 static int raid5_error (mddev_t *mddev, kdev_t dev)
657 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
658 mdp_super_t *sb = mddev->sb;
659 struct disk_info *disk;
660 int i;
662 PRINTK("raid5_error called\n");
663 conf->resync_parity = 0;
664 for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
665 if (disk->dev == dev && disk->operational) {
666 disk->operational = 0;
667 mark_disk_faulty(sb->disks+disk->number);
668 mark_disk_nonsync(sb->disks+disk->number);
669 mark_disk_inactive(sb->disks+disk->number);
670 sb->active_disks--;
671 sb->working_disks--;
672 sb->failed_disks++;
673 mddev->sb_dirty = 1;
674 conf->working_disks--;
675 conf->failed_disks++;
676 md_wakeup_thread(conf->thread);
677 printk (KERN_ALERT
678 "raid5: Disk failure on %s, disabling device."
679 " Operation continuing on %d devices\n",
680 partition_name (dev), conf->working_disks);
681 return 0;
685 * handle errors in spares (during reconstruction)
687 if (conf->spare) {
688 disk = conf->spare;
689 if (disk->dev == dev) {
690 printk (KERN_ALERT
691 "raid5: Disk failure on spare %s\n",
692 partition_name (dev));
693 if (!conf->spare->operational) {
694 MD_BUG();
695 return -EIO;
697 disk->operational = 0;
698 disk->write_only = 0;
699 conf->spare = NULL;
700 mark_disk_faulty(sb->disks+disk->number);
701 mark_disk_nonsync(sb->disks+disk->number);
702 mark_disk_inactive(sb->disks+disk->number);
703 sb->spare_disks--;
704 sb->working_disks--;
705 sb->failed_disks++;
707 return 0;
710 MD_BUG();
711 return -EIO;
715 * Input: a 'big' sector number,
716 * Output: index of the data and parity disk, and the sector # in them.
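 *
 * Worked example (illustrative numbers, not from the original source):
 * with raid_disks = 4, data_disks = 3 and a 64 KB chunk
 * (sectors_per_chunk = 128), r_sector = 1000 gives chunk_number = 7,
 * chunk_offset = 104, stripe = 2 and dd_idx = 7 % 3 = 1.  Under
 * ALGORITHM_LEFT_ASYMMETRIC, pd_idx = 3 - (2 % 4) = 1 and, since
 * dd_idx >= pd_idx, the data index shifts to 2.  The sector used on the
 * chosen disks is new_sector = 2 * 128 + 104 = 360.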
718 static unsigned long raid5_compute_sector(int r_sector, unsigned int raid_disks,
719 unsigned int data_disks, unsigned int * dd_idx,
720 unsigned int * pd_idx, raid5_conf_t *conf)
722 unsigned int stripe;
723 int chunk_number, chunk_offset;
724 unsigned long new_sector;
725 int sectors_per_chunk = conf->chunk_size >> 9;
727 /* First compute the information on this sector */
730 * Compute the chunk number and the sector offset inside the chunk
732 chunk_number = r_sector / sectors_per_chunk;
733 chunk_offset = r_sector % sectors_per_chunk;
736 * Compute the stripe number
738 stripe = chunk_number / data_disks;
741 * Compute the data disk and parity disk indexes inside the stripe
743 *dd_idx = chunk_number % data_disks;
746 * Select the parity disk based on the user selected algorithm.
748 if (conf->level == 4)
749 *pd_idx = data_disks;
750 else switch (conf->algorithm) {
751 case ALGORITHM_LEFT_ASYMMETRIC:
752 *pd_idx = data_disks - stripe % raid_disks;
753 if (*dd_idx >= *pd_idx)
754 (*dd_idx)++;
755 break;
756 case ALGORITHM_RIGHT_ASYMMETRIC:
757 *pd_idx = stripe % raid_disks;
758 if (*dd_idx >= *pd_idx)
759 (*dd_idx)++;
760 break;
761 case ALGORITHM_LEFT_SYMMETRIC:
762 *pd_idx = data_disks - stripe % raid_disks;
763 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
764 break;
765 case ALGORITHM_RIGHT_SYMMETRIC:
766 *pd_idx = stripe % raid_disks;
767 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
768 break;
769 default:
770 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
774 * Finally, compute the new sector number
776 new_sector = stripe * sectors_per_chunk + chunk_offset;
777 return new_sector;
780 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
782 raid5_conf_t *conf = sh->raid_conf;
783 int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
784 unsigned long new_sector = sh->sector, check;
785 int sectors_per_chunk = conf->chunk_size >> 9;
786 unsigned long stripe = new_sector / sectors_per_chunk;
787 int chunk_offset = new_sector % sectors_per_chunk;
788 int chunk_number, dummy1, dummy2, dd_idx = i;
789 unsigned long r_sector, blocknr;
791 switch (conf->algorithm) {
792 case ALGORITHM_LEFT_ASYMMETRIC:
793 case ALGORITHM_RIGHT_ASYMMETRIC:
794 if (i > sh->pd_idx)
795 i--;
796 break;
797 case ALGORITHM_LEFT_SYMMETRIC:
798 case ALGORITHM_RIGHT_SYMMETRIC:
799 if (i < sh->pd_idx)
800 i += raid_disks;
801 i -= (sh->pd_idx + 1);
802 break;
803 default:
804 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
807 chunk_number = stripe * data_disks + i;
808 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
809 blocknr = r_sector / (sh->size >> 9);
811 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
812 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
813 printk("compute_blocknr: map not correct\n");
814 return 0;
816 return blocknr;
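/*
 * Illustrative reverse mapping (numbers not from the original source):
 * for the stripe of the example above (sector 360, pd_idx 1,
 * left-asymmetric layout, 4 disks, 64 KB chunks), disk index i = 2 maps
 * back as i > pd_idx so i becomes 1, stripe = 360 / 128 = 2,
 * chunk_offset = 104, chunk_number = 2 * 3 + 1 = 7,
 * r_sector = 7 * 128 + 104 = 1000 and, with 4 KB buffers,
 * blocknr = 1000 / 8 = 125.
 */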
819 static void compute_block(struct stripe_head *sh, int dd_idx)
821 raid5_conf_t *conf = sh->raid_conf;
822 int i, count, disks = conf->raid_disks;
823 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
825 PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
827 if (sh->bh_old[dd_idx] == NULL)
828 sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size);
829 raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx);
831 memset(sh->bh_old[dd_idx]->b_data, 0, sh->size);
832 bh_ptr[0] = sh->bh_old[dd_idx];
833 count = 1;
834 for (i = 0; i < disks; i++) {
835 if (i == dd_idx)
836 continue;
837 if (sh->bh_old[i]) {
838 bh_ptr[count++] = sh->bh_old[i];
839 } else {
840 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
842 if (count == MAX_XOR_BLOCKS) {
843 xor_block(count, &bh_ptr[0]);
844 count = 1;
847 if (count != 1)
848 xor_block(count, &bh_ptr[0]);
849 raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1);
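/*
 * Note (added for clarity): RECONSTRUCT_WRITE below rebuilds parity from
 * scratch by xoring the new copies with every other block of the stripe,
 * while READ_MODIFY_WRITE starts from the old parity and xors in each
 * old/new pair, cancelling the old data out of the parity.
 */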
852 static void compute_parity(struct stripe_head *sh, int method)
854 raid5_conf_t *conf = sh->raid_conf;
855 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
856 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
858 PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
859 for (i = 0; i < disks; i++) {
860 if (i == pd_idx || !sh->bh_new[i])
861 continue;
862 if (!sh->bh_copy[i])
863 sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size);
864 raid5_build_block(sh, sh->bh_copy[i], i);
865 if (atomic_set_buffer_clean(sh->bh_new[i]))
866 atomic_set_buffer_dirty(sh->bh_copy[i]);
867 memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size);
869 if (sh->bh_copy[pd_idx] == NULL) {
870 sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size);
871 atomic_set_buffer_dirty(sh->bh_copy[pd_idx]);
873 raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx);
875 if (method == RECONSTRUCT_WRITE) {
876 memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size);
877 bh_ptr[0] = sh->bh_copy[pd_idx];
878 count = 1;
879 for (i = 0; i < disks; i++) {
880 if (i == sh->pd_idx)
881 continue;
882 if (sh->bh_new[i]) {
883 bh_ptr[count++] = sh->bh_copy[i];
884 } else if (sh->bh_old[i]) {
885 bh_ptr[count++] = sh->bh_old[i];
887 if (count == MAX_XOR_BLOCKS) {
888 xor_block(count, &bh_ptr[0]);
889 count = 1;
892 if (count != 1) {
893 xor_block(count, &bh_ptr[0]);
895 } else if (method == READ_MODIFY_WRITE) {
896 memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size);
897 bh_ptr[0] = sh->bh_copy[pd_idx];
898 count = 1;
899 for (i = 0; i < disks; i++) {
900 if (i == sh->pd_idx)
901 continue;
902 if (sh->bh_new[i] && sh->bh_old[i]) {
903 bh_ptr[count++] = sh->bh_copy[i];
904 bh_ptr[count++] = sh->bh_old[i];
906 if (count >= (MAX_XOR_BLOCKS - 1)) {
907 xor_block(count, &bh_ptr[0]);
908 count = 1;
911 if (count != 1) {
912 xor_block(count, &bh_ptr[0]);
915 raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1);
918 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
920 raid5_conf_t *conf = sh->raid_conf;
921 struct buffer_head *bh_req;
923 PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
924 CHECK_SHLOCK(sh);
925 if (sh->bh_new[dd_idx])
926 BUG();
928 bh_req = raid5_alloc_bh(sh);
929 raid5_build_block(sh, bh_req, dd_idx);
930 bh_req->b_data = bh->b_data;
931 bh_req->b_page = bh->b_page;
933 md_spin_lock_irq(&conf->device_lock);
934 if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) {
935 PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write");
936 sh->phase = PHASE_BEGIN;
937 sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE;
938 atomic_inc(&conf->nr_pending_stripes);
939 atomic_inc(&conf->nr_handle);
940 PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle));
942 sh->bh_new[dd_idx] = bh;
943 sh->bh_req[dd_idx] = bh_req;
944 sh->cmd_new[dd_idx] = rw;
945 sh->new[dd_idx] = 1;
946 md_spin_unlock_irq(&conf->device_lock);
948 PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
951 static void complete_stripe(struct stripe_head *sh)
953 raid5_conf_t *conf = sh->raid_conf;
954 int disks = conf->raid_disks;
955 int i, new = 0;
957 PRINTK("complete_stripe %lu\n", sh->sector);
958 for (i = 0; i < disks; i++) {
959 if (sh->cmd == STRIPE_SYNC && sh->bh_copy[i])
960 raid5_update_old_bh(sh, i);
961 if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx)
962 raid5_update_old_bh(sh, i);
963 if (sh->bh_new[i]) {
964 PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]);
965 if (!sh->new[i]) {
966 #if 0
967 if (sh->cmd == STRIPE_WRITE) {
968 if (memcmp(sh->bh_new[i]->b_data, sh->bh_copy[i]->b_data, sh->size)) {
969 printk("copy differs, %s, sector %lu ",
970 test_bit(BH_Dirty, &sh->bh_new[i]->b_state) ? "dirty" : "clean",
971 sh->sector);
972 } else if (test_bit(BH_Dirty, &sh->bh_new[i]->b_state))
973 printk("sector %lu dirty\n", sh->sector);
975 #endif
976 if (sh->cmd == STRIPE_WRITE)
977 raid5_update_old_bh(sh, i);
978 raid5_end_buffer_io(sh, i, 1);
979 continue;
980 } else
981 new++;
983 if (new && sh->cmd == STRIPE_WRITE)
984 printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new);
986 if (sh->cmd == STRIPE_SYNC)
987 md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
988 if (!new)
989 finish_unlock_stripe(sh);
990 else {
991 PRINTK("stripe %lu, new == %d\n", sh->sector, new);
992 sh->phase = PHASE_BEGIN;
997 static int is_stripe_allclean(struct stripe_head *sh, int disks)
999 int i;
1001 return 0;
1002 for (i = 0; i < disks; i++) {
1003 if (sh->bh_new[i])
1004 if (test_bit(BH_Dirty, &sh->bh_new[i]))
1005 return 0;
1006 if (sh->bh_old[i])
1007 if (test_bit(BH_Dirty, &sh->bh_old[i]))
1008 return 0;
1010 return 1;
1013 static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf,
1014 struct stripe_head *sh, int nr_write, int * operational, int disks,
1015 int parity, int parity_failed, int nr_cache, int nr_cache_other,
1016 int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
1018 int i, allclean;
1019 request_queue_t *q;
1020 unsigned int block;
1021 struct buffer_head *bh;
1022 int method1 = INT_MAX, method2 = INT_MAX;
1025 * Attempt to add entries :-)
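 *
 * (Explanatory note, not in the original: the loop below probes the buffer
 * cache with get_hash_table() for other dirty blocks that belong to this
 * same stripe; any it can lock are added as extra writes, pushing a partial
 * write closer to a full-stripe write before the parity method is chosen.)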
1027 if (nr_write != disks - 1) {
1028 for (i = 0; i < disks; i++) {
1029 if (i == sh->pd_idx)
1030 continue;
1031 if (sh->bh_new[i])
1032 continue;
1033 block = (int) compute_blocknr(sh, i);
1034 bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size);
1035 if (!bh)
1036 continue;
1037 if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) {
1038 PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block);
1039 add_stripe_bh(sh, bh, i, WRITE);
1040 sh->new[i] = 0;
1041 nr_write++;
1042 if (sh->bh_old[i]) {
1043 nr_cache_overwrite++;
1044 nr_cache_other--;
1045 } else
1046 if (!operational[i]) {
1047 nr_failed_overwrite++;
1048 nr_failed_other--;
1051 atomic_dec(&bh->b_count);
1054 PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector);
1056 * Writing, need to update parity buffer.
1058 * Compute the number of I/O requests in the "reconstruct
1059 * write" and "read modify write" methods.
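 *
 * Illustrative count (numbers not from the original source): on a 5-disk
 * set with one new block to write and nothing cached or failed,
 * reconstruct-write needs method1 = (5 - 1) - (1 + 0) = 3 reads while
 * read-modify-write needs method2 = 1 - 0 + (1 - 0) = 2 reads, so the
 * cheaper read-modify-write path is taken.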
1061 if (!nr_failed_other)
1062 method1 = (disks - 1) - (nr_write + nr_cache_other);
1063 if (!nr_failed_overwrite && !parity_failed)
1064 method2 = nr_write - nr_cache_overwrite + (1 - parity);
1066 if (method1 == INT_MAX && method2 == INT_MAX)
1067 BUG();
1068 PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2);
1070 if (!method1 || !method2) {
1071 allclean = is_stripe_allclean(sh, disks);
1072 sh->phase = PHASE_WRITE;
1073 compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1075 for (i = 0; i < disks; i++) {
1076 if (!operational[i] && !conf->spare && !conf->resync_parity)
1077 continue;
1078 bh = sh->bh_copy[i];
1079 if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL)))
1080 printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]);
1081 if (i == sh->pd_idx && !bh)
1082 printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i);
1083 if (bh) {
1084 PRINTK("making request for buffer %d\n", i);
1085 lock_get_bh(bh);
1086 if (!operational[i] && !conf->resync_parity) {
1087 PRINTK("writing spare %d\n", i);
1088 atomic_inc(&sh->nr_pending);
1089 bh->b_dev = bh->b_rdev = conf->spare->dev;
1090 q = blk_get_queue(bh->b_rdev);
1091 generic_make_request(q, WRITERAW, bh);
1092 } else {
1093 #if 0
1094 atomic_inc(&sh->nr_pending);
1095 bh->b_dev = bh->b_rdev = conf->disks[i].dev;
1096 q = blk_get_queue(bh->b_rdev);
1097 generic_make_request(q, WRITERAW, bh);
1098 #else
1099 if (!allclean || (i==sh->pd_idx)) {
1100 PRINTK("writing dirty %d\n", i);
1101 atomic_inc(&sh->nr_pending);
1102 bh->b_dev = bh->b_rdev = conf->disks[i].dev;
1103 q = blk_get_queue(bh->b_rdev);
1104 generic_make_request(q, WRITERAW, bh);
1105 } else {
1106 PRINTK("not writing clean %d\n", i);
1107 raid5_end_request(bh, 1);
1108 sh->new[i] = 0;
1110 #endif
1112 atomic_dec(&bh->b_count);
1115 PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
1116 return;
1119 if (method1 < method2) {
1120 sh->write_method = RECONSTRUCT_WRITE;
1121 for (i = 0; i < disks; i++) {
1122 if (i == sh->pd_idx)
1123 continue;
1124 if (sh->bh_new[i] || sh->bh_old[i])
1125 continue;
1126 sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
1127 raid5_build_block(sh, sh->bh_old[i], i);
1129 } else {
1130 sh->write_method = READ_MODIFY_WRITE;
1131 for (i = 0; i < disks; i++) {
1132 if (sh->bh_old[i])
1133 continue;
1134 if (!sh->bh_new[i] && i != sh->pd_idx)
1135 continue;
1136 sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
1137 raid5_build_block(sh, sh->bh_old[i], i);
1140 sh->phase = PHASE_READ_OLD;
1141 for (i = 0; i < disks; i++) {
1142 if (!sh->bh_old[i])
1143 continue;
1144 if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state))
1145 continue;
1146 lock_get_bh(sh->bh_old[i]);
1147 atomic_inc(&sh->nr_pending);
1148 sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev;
1149 q = blk_get_queue(sh->bh_old[i]->b_rdev);
1150 generic_make_request(q, READ, sh->bh_old[i]);
1151 atomic_dec(&sh->bh_old[i]->b_count);
1153 PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
1157 * Reading
1159 static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf,
1160 struct stripe_head *sh, int nr_read, int * operational, int disks,
1161 int parity, int parity_failed, int nr_cache, int nr_cache_other,
1162 int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
1164 int i;
1165 request_queue_t *q;
1166 int method1 = INT_MAX;
1168 method1 = nr_read - nr_cache_overwrite;
1170 PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1);
1172 if (!method1 || (method1 == 1 && nr_cache == disks - 1)) {
1173 PRINTK("read %lu completed from cache\n", sh->sector);
1174 for (i = 0; i < disks; i++) {
1175 if (!sh->bh_new[i])
1176 continue;
1177 if (!sh->bh_old[i])
1178 compute_block(sh, i);
1179 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
1181 complete_stripe(sh);
1182 return;
1184 if (nr_failed_overwrite) {
1185 sh->phase = PHASE_READ_OLD;
1186 for (i = 0; i < disks; i++) {
1187 if (sh->bh_old[i])
1188 continue;
1189 if (!operational[i])
1190 continue;
1191 sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size);
1192 raid5_build_block(sh, sh->bh_old[i], i);
1193 lock_get_bh(sh->bh_old[i]);
1194 atomic_inc(&sh->nr_pending);
1195 sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev;
1196 q = blk_get_queue(sh->bh_old[i]->b_rdev);
1197 generic_make_request(q, READ, sh->bh_old[i]);
1198 atomic_dec(&sh->bh_old[i]->b_count);
1200 PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
1201 return;
1203 sh->phase = PHASE_READ;
1204 for (i = 0; i < disks; i++) {
1205 if (!sh->bh_new[i])
1206 continue;
1207 if (sh->bh_old[i]) {
1208 memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size);
1209 continue;
1211 #if RAID5_PARANOIA
1212 if (sh->bh_req[i] == NULL || test_bit(BH_Lock, &sh->bh_req[i]->b_state)) {
1213 int j;
1214 printk("req %d is NULL! or locked \n", i);
1215 for (j=0; j<disks; j++) {
1216 printk("%d: new=%p old=%p req=%p new=%d cmd=%d\n",
1217 j, sh->bh_new[j], sh->bh_old[j], sh->bh_req[j],
1218 sh->new[j], sh->cmd_new[j]);
1222 #endif
1223 lock_get_bh(sh->bh_req[i]);
1224 atomic_inc(&sh->nr_pending);
1225 sh->bh_req[i]->b_dev = sh->bh_req[i]->b_rdev = conf->disks[i].dev;
1226 q = blk_get_queue(sh->bh_req[i]->b_rdev);
1227 generic_make_request(q, READ, sh->bh_req[i]);
1228 atomic_dec(&sh->bh_req[i]->b_count);
1230 PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending));
1234 * Syncing
1236 static void handle_stripe_sync (mddev_t *mddev , raid5_conf_t *conf,
1237 struct stripe_head *sh, int * operational, int disks,
1238 int parity, int parity_failed, int nr_cache, int nr_cache_other,
1239 int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite)
1241 request_queue_t *q;
1242 struct buffer_head *bh;
1243 int i, pd_idx;
1245 /* firstly, we want to have data from all non-failed drives
1246 * in bh_old
1248 PRINTK("handle_stripe_sync: sec=%lu disks=%d nr_cache=%d\n", sh->sector, disks, nr_cache);
1249 if ((nr_cache < disks-1) || ((nr_cache == disks-1) && !(parity_failed+nr_failed_other+nr_failed_overwrite))
1251 sh->phase = PHASE_READ_OLD;
1252 for (i = 0; i < disks; i++) {
1253 if (sh->bh_old[i])
1254 continue;
1255 if (!conf->disks[i].operational)
1256 continue;
1258 bh = raid5_alloc_buffer(sh, sh->size);
1259 sh->bh_old[i] = bh;
1260 raid5_build_block(sh, bh, i);
1261 lock_get_bh(bh);
1262 atomic_inc(&sh->nr_pending);
1263 bh->b_dev = bh->b_rdev = conf->disks[i].dev;
1264 q = blk_get_queue(bh->b_rdev);
1265 generic_make_request(q, READ, bh);
1266 drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0);
1267 atomic_dec(&sh->bh_old[i]->b_count);
1269 PRINTK("handle_stripe_sync() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
1271 return;
1273 /* now, if there is a failed drive, rebuild and write to spare */
1274 if (nr_cache == disks-1) {
1275 sh->phase = PHASE_WRITE;
1276 /* we can generate the missing block, which will be on the failed drive */
1277 for (i=0; i<disks; i++) {
1278 if (operational[i])
1279 continue;
1280 compute_block(sh, i);
1281 if (conf->spare) {
1282 bh = sh->bh_copy[i];
1283 if (bh) {
1284 memcpy(bh->b_data, sh->bh_old[i]->b_data, sh->size);
1285 set_bit(BH_Uptodate, &bh->b_state);
1286 } else {
1287 bh = sh->bh_old[i];
1288 sh->bh_old[i] = NULL;
1289 sh->bh_copy[i] = bh;
1291 atomic_inc(&sh->nr_pending);
1292 lock_get_bh(bh);
1293 bh->b_dev = bh->b_rdev = conf->spare->dev;
1294 q = blk_get_queue(bh->b_rdev);
1295 generic_make_request(q, WRITERAW, bh);
1296 drive_stat_acct(bh->b_rdev, WRITE, -bh->b_size/512, 0);
1297 atomic_dec(&bh->b_count);
1298 PRINTK("handle_stripe_sync() %lu, phase WRITE, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending));
1300 break;
1302 return;
1305 /* nr_cache == disks:
1306 * check parity and compute/write if needed
1309 compute_parity(sh, RECONSTRUCT_WRITE);
1310 pd_idx = sh->pd_idx;
1311 if (!memcmp(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size)) {
1312 /* the parity is correct - Yay! */
1313 complete_stripe(sh);
1314 } else {
1315 sh->phase = PHASE_WRITE;
1316 bh = sh->bh_copy[pd_idx];
1317 atomic_set_buffer_dirty(bh);
1318 lock_get_bh(bh);
1319 atomic_inc(&sh->nr_pending);
1320 bh->b_dev = bh->b_rdev = conf->disks[pd_idx].dev;
1321 q = blk_get_queue(bh->b_rdev);
1322 generic_make_request(q, WRITERAW, bh);
1323 drive_stat_acct(bh->b_rdev, WRITE, -bh->b_size/512, 0);
1324 atomic_dec(&bh->b_count);
1325 PRINTK("handle_stripe_sync() %lu phase WRITE, pending %d buffers\n",
1326 sh->sector, md_atomic_read(&sh->nr_pending));
1331 * handle_stripe() is our main logic routine. Note that:
1333  * 1. lock_stripe() should be used whenever we can't accept additional
1334 * buffers, either during short sleeping in handle_stripe() or
1335 * during io operations.
1337 * 2. We should be careful to set sh->nr_pending whenever we sleep,
1338 * to prevent re-entry of handle_stripe() for the same sh.
1340 * 3. conf->failed_disks and disk->operational can be changed
1341 * from an interrupt. This complicates things a bit, but it allows
1342 * us to stop issuing requests for a failed drive as soon as possible.
1344 static void handle_stripe(struct stripe_head *sh)
1346 raid5_conf_t *conf = sh->raid_conf;
1347 mddev_t *mddev = conf->mddev;
1348 int disks = conf->raid_disks;
1349 int i, nr_read = 0, nr_write = 0, parity = 0;
1350 int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0;
1351 int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0;
1352 int operational[MD_SB_DISKS], failed_disks = conf->failed_disks;
1354 PRINTK("handle_stripe(), stripe %lu\n", sh->sector);
1355 if (!stripe_locked(sh))
1356 BUG();
1357 if (md_atomic_read(&sh->nr_pending))
1358 BUG();
1359 if (sh->phase == PHASE_COMPLETE)
1360 BUG();
1362 atomic_dec(&conf->nr_handle);
1364 if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) {
1365 printk("raid5: restarting stripe %lu\n", sh->sector);
1366 sh->phase = PHASE_BEGIN;
1369 if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) ||
1370 (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) ||
1371 (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE)
1374 * Completed
1376 complete_stripe(sh);
1377 if (sh->phase == PHASE_COMPLETE)
1378 return;
1381 md_spin_lock_irq(&conf->device_lock);
1382 for (i = 0; i < disks; i++) {
1383 operational[i] = conf->disks[i].operational;
1384 if (i == sh->pd_idx && conf->resync_parity)
1385 operational[i] = 0;
1387 failed_disks = conf->failed_disks;
1388 md_spin_unlock_irq(&conf->device_lock);
1391 * Make this one more graceful?
1393 if (failed_disks > 1) {
1394 for (i = 0; i < disks; i++) {
1395 if (sh->bh_new[i]) {
1396 raid5_end_buffer_io(sh, i, 0);
1397 continue;
1400 if (sh->cmd == STRIPE_SYNC)
1401 md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
1402 finish_unlock_stripe(sh);
1403 return;
1406 PRINTK("=== stripe index START ===\n");
1407 for (i = 0; i < disks; i++) {
1408 PRINTK("disk %d, ", i);
1409 if (sh->bh_old[i]) {
1410 nr_cache++;
1411 PRINTK(" (old cached, %d)", nr_cache);
1413 if (i == sh->pd_idx) {
1414 PRINTK(" PARITY.");
1415 if (sh->bh_old[i]) {
1416 PRINTK(" CACHED.");
1417 parity = 1;
1418 } else {
1419 PRINTK(" UNCACHED.");
1420 if (!operational[i]) {
1421 PRINTK(" FAILED.");
1422 parity_failed = 1;
1425 PRINTK("\n");
1426 continue;
1428 if (!sh->bh_new[i]) {
1429 PRINTK(" (no new data block) ");
1430 if (sh->bh_old[i]) {
1431 PRINTK(" (but old block cached) ");
1432 nr_cache_other++;
1433 } else {
1434 if (!operational[i]) {
1435 PRINTK(" (because failed disk) ");
1436 nr_failed_other++;
1437 } else
1438 PRINTK(" (no old block either) ");
1440 PRINTK("\n");
1441 continue;
1443 sh->new[i] = 0;
1444 if (sh->cmd_new[i] == READ) {
1445 nr_read++;
1446 PRINTK(" (new READ %d)", nr_read);
1448 if (sh->cmd_new[i] == WRITE) {
1449 nr_write++;
1450 PRINTK(" (new WRITE %d)", nr_write);
1452 if (sh->bh_old[i]) {
1453 nr_cache_overwrite++;
1454 PRINTK(" (overwriting old %d)", nr_cache_overwrite);
1455 } else {
1456 if (!operational[i]) {
1457 nr_failed_overwrite++;
1458 PRINTK(" (overwriting failed %d)", nr_failed_overwrite);
1461 PRINTK("\n");
1463 PRINTK("=== stripe index END ===\n");
1465 if (nr_write && nr_read)
1466 BUG();
1468 if (nr_write)
1469 handle_stripe_write(
1470 mddev, conf, sh, nr_write, operational, disks,
1471 parity, parity_failed, nr_cache, nr_cache_other,
1472 nr_failed_other, nr_cache_overwrite,
1473 nr_failed_overwrite
1475 else if (nr_read)
1476 handle_stripe_read(
1477 mddev, conf, sh, nr_read, operational, disks,
1478 parity, parity_failed, nr_cache, nr_cache_other,
1479 nr_failed_other, nr_cache_overwrite,
1480 nr_failed_overwrite
1482 else if (sh->cmd == STRIPE_SYNC)
1483 handle_stripe_sync(
1484 mddev, conf, sh, operational, disks,
1485 parity, parity_failed, nr_cache, nr_cache_other,
1486 nr_failed_other, nr_cache_overwrite, nr_failed_overwrite
1491 static int raid5_make_request (request_queue_t *q, mddev_t *mddev, int rw, struct buffer_head * bh)
1493 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1494 const unsigned int raid_disks = conf->raid_disks;
1495 const unsigned int data_disks = raid_disks - 1;
1496 unsigned int dd_idx, pd_idx;
1497 unsigned long new_sector;
1499 struct stripe_head *sh;
1501 if (rw == READA)
1502 rw = READ;
1504 new_sector = raid5_compute_sector(bh->b_rsector,
1505 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1507 PRINTK("raid5_make_request, sector %lu\n", new_sector);
1508 sh = get_lock_stripe(conf, new_sector, bh->b_size);
1509 #if 0
1510 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) {
1511 PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd);
1512 lock_stripe(sh);
1513 if (!md_atomic_read(&sh->nr_pending))
1514 handle_stripe(sh);
1515 goto repeat;
1517 #endif
1518 sh->pd_idx = pd_idx;
1519 if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN)
1520 PRINTK("stripe %lu catching the bus!\n", sh->sector);
1521 if (sh->bh_new[dd_idx])
1522 BUG();
1523 add_stripe_bh(sh, bh, dd_idx, rw);
1525 md_wakeup_thread(conf->thread);
1526 return 0;
1530 * Determine correct block size for this device.
1532 unsigned int device_bsize (kdev_t dev)
1534 unsigned int i, correct_size;
1536 correct_size = BLOCK_SIZE;
1537 if (blksize_size[MAJOR(dev)]) {
1538 i = blksize_size[MAJOR(dev)][MINOR(dev)];
1539 if (i)
1540 correct_size = i;
1543 return correct_size;
1546 static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr)
1548 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1549 struct stripe_head *sh;
1550 int sectors_per_chunk = conf->chunk_size >> 9;
1551 unsigned long stripe = (block_nr<<2)/sectors_per_chunk;
1552 int chunk_offset = (block_nr<<2) % sectors_per_chunk;
1553 int dd_idx, pd_idx;
1554 unsigned long first_sector;
1555 int raid_disks = conf->raid_disks;
1556 int data_disks = raid_disks-1;
1557 int redone = 0;
1558 int bufsize;
1560 if (!conf->buffer_size)
1561 conf->buffer_size = /* device_bsize(mddev_to_kdev(mddev))*/ PAGE_SIZE;
1562 bufsize = conf->buffer_size;
1563 /* Hmm... race on buffer_size ?? */
1564 redone = block_nr% (bufsize>>10);
1565 block_nr -= redone;
1566 sh = get_lock_stripe(conf, block_nr<<1, bufsize);
1567 first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1568 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1569 sh->pd_idx = pd_idx;
1570 sh->cmd = STRIPE_SYNC;
1571 sh->phase = PHASE_BEGIN;
1572 sh->sync_redone = redone;
1573 atomic_inc(&conf->nr_pending_stripes);
1574 atomic_inc(&conf->nr_handle);
1575 md_wakeup_thread(conf->thread);
1576 return (bufsize>>10)-redone;
1580 * This is our raid5 kernel thread.
1582 * We scan the hash table for stripes which can be handled now.
1583 * During the scan, completed stripes are saved for us by the interrupt
1584 * handler, so that they will not have to wait for our next wakeup.
1586 static void raid5d (void *data)
1588 struct stripe_head *sh;
1589 raid5_conf_t *conf = data;
1590 mddev_t *mddev = conf->mddev;
1591 int i, handled;
1593 PRINTK("+++ raid5d active\n");
1595 handled = 0;
1596 md_spin_lock_irq(&conf->device_lock);
1597 clear_bit(THREAD_WAKEUP, &conf->thread->flags);
1598 repeat_pass:
1599 if (mddev->sb_dirty) {
1600 md_spin_unlock_irq(&conf->device_lock);
1601 mddev->sb_dirty = 0;
1602 md_update_sb(mddev);
1603 md_spin_lock_irq(&conf->device_lock);
1605 for (i = 0; i < NR_HASH; i++) {
1606 repeat:
1607 sh = conf->stripe_hashtbl[i];
1608 for (; sh; sh = sh->hash_next) {
1609 if (sh->raid_conf != conf)
1610 continue;
1611 if (sh->phase == PHASE_COMPLETE)
1612 continue;
1613 if (md_atomic_read(&sh->nr_pending))
1614 continue;
1615 md_spin_unlock_irq(&conf->device_lock);
1616 if (!atomic_read(&sh->count))
1617 BUG();
1619 handled++;
1620 handle_stripe(sh);
1621 md_spin_lock_irq(&conf->device_lock);
1622 goto repeat;
1625 if (conf) {
1626 PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle));
1627 if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) &&
1628 md_atomic_read(&conf->nr_handle))
1629 goto repeat_pass;
1631 md_spin_unlock_irq(&conf->device_lock);
1633 PRINTK("--- raid5d inactive\n");
1637 * Private kernel thread for parity reconstruction after an unclean
1638 * shutdown. Reconstruction on spare drives in case of a failed drive
1639 * is done by the generic mdsyncd.
1641 static void raid5syncd (void *data)
1643 raid5_conf_t *conf = data;
1644 mddev_t *mddev = conf->mddev;
1646 if (!conf->resync_parity)
1647 return;
1648 if (conf->resync_parity == 2)
1649 return;
1650 down(&mddev->recovery_sem);
1651 if (md_do_sync(mddev,NULL)) {
1652 up(&mddev->recovery_sem);
1653 printk("raid5: resync aborted!\n");
1654 return;
1656 conf->resync_parity = 0;
1657 up(&mddev->recovery_sem);
1658 printk("raid5: resync finished.\n");
1661 static int __check_consistency (mddev_t *mddev, int row)
1663 raid5_conf_t *conf = mddev->private;
1664 kdev_t dev;
1665 struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL;
1666 int i, ret = 0, nr = 0, count;
1667 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
1669 if (conf->working_disks != conf->raid_disks)
1670 goto out;
1671 tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
1672 tmp->b_size = 4096;
1673 tmp->b_page = alloc_page(GFP_KERNEL);
1674 tmp->b_data = (char *)page_address(tmp->b_page);
1675 if (!tmp->b_data)
1676 goto out;
1677 md_clear_page((unsigned long)tmp->b_data);
1678 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
1679 for (i = 0; i < conf->raid_disks; i++) {
1680 dev = conf->disks[i].dev;
1681 set_blocksize(dev, 4096);
1682 bh[i] = bread(dev, row / 4, 4096);
1683 if (!bh[i])
1684 break;
1685 nr++;
1687 if (nr == conf->raid_disks) {
1688 bh_ptr[0] = tmp;
1689 count = 1;
1690 for (i = 1; i < nr; i++) {
1691 bh_ptr[count++] = bh[i];
1692 if (count == MAX_XOR_BLOCKS) {
1693 xor_block(count, &bh_ptr[0]);
1694 count = 1;
1697 if (count != 1) {
1698 xor_block(count, &bh_ptr[0]);
1700 if (memcmp(tmp->b_data, bh[0]->b_data, 4096))
1701 ret = 1;
1703 for (i = 0; i < conf->raid_disks; i++) {
1704 dev = conf->disks[i].dev;
1705 if (bh[i]) {
1706 bforget(bh[i]);
1707 bh[i] = NULL;
1709 fsync_dev(dev);
1710 invalidate_buffers(dev);
1712 free_page((unsigned long) tmp->b_data);
1713 out:
1714 if (tmp)
1715 kfree(tmp);
1716 return ret;
1719 static int check_consistency (mddev_t *mddev)
1721 if (__check_consistency(mddev, 0))
1723 * We are not checking this currently, as it's legitimate to have
1724  * an inconsistent array at creation time.
1726 return 0;
1728 return 0;
1731 static int raid5_run (mddev_t *mddev)
1733 raid5_conf_t *conf;
1734 int i, j, raid_disk, memory;
1735 mdp_super_t *sb = mddev->sb;
1736 mdp_disk_t *desc;
1737 mdk_rdev_t *rdev;
1738 struct disk_info *disk;
1739 struct md_list_head *tmp;
1740 int start_recovery = 0;
1742 MOD_INC_USE_COUNT;
1744 if (sb->level != 5 && sb->level != 4) {
1745 printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
1746 MOD_DEC_USE_COUNT;
1747 return -EIO;
1750 mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1751 if ((conf = mddev->private) == NULL)
1752 goto abort;
1753 memset (conf, 0, sizeof (*conf));
1754 conf->mddev = mddev;
1756 if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1757 goto abort;
1758 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1760 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1761 md_init_waitqueue_head(&conf->wait_for_stripe);
1762 PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
1764 ITERATE_RDEV(mddev,rdev,tmp) {
1766 * This is important -- we are using the descriptor on
1767 * the disk only to get a pointer to the descriptor on
1768 * the main superblock, which might be more recent.
1770 desc = sb->disks + rdev->desc_nr;
1771 raid_disk = desc->raid_disk;
1772 disk = conf->disks + raid_disk;
1774 if (disk_faulty(desc)) {
1775 printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1776 if (!rdev->faulty) {
1777 MD_BUG();
1778 goto abort;
1780 disk->number = desc->number;
1781 disk->raid_disk = raid_disk;
1782 disk->dev = rdev->dev;
1784 disk->operational = 0;
1785 disk->write_only = 0;
1786 disk->spare = 0;
1787 disk->used_slot = 1;
1788 continue;
1790 if (disk_active(desc)) {
1791 if (!disk_sync(desc)) {
1792 printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
1793 MD_BUG();
1794 goto abort;
1796 if (raid_disk > sb->raid_disks) {
1797 printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1798 continue;
1800 if (disk->operational) {
1801 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1802 continue;
1804 printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1806 disk->number = desc->number;
1807 disk->raid_disk = raid_disk;
1808 disk->dev = rdev->dev;
1809 disk->operational = 1;
1810 disk->used_slot = 1;
1812 conf->working_disks++;
1813 } else {
1815 * Must be a spare disk ..
1817 printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1818 disk->number = desc->number;
1819 disk->raid_disk = raid_disk;
1820 disk->dev = rdev->dev;
1822 disk->operational = 0;
1823 disk->write_only = 0;
1824 disk->spare = 1;
1825 disk->used_slot = 1;
1829 for (i = 0; i < MD_SB_DISKS; i++) {
1830 desc = sb->disks + i;
1831 raid_disk = desc->raid_disk;
1832 disk = conf->disks + raid_disk;
1834 if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1835 !conf->disks[raid_disk].used_slot) {
1837 disk->number = desc->number;
1838 disk->raid_disk = raid_disk;
1839 disk->dev = MKDEV(0,0);
1841 disk->operational = 0;
1842 disk->write_only = 0;
1843 disk->spare = 0;
1844 disk->used_slot = 1;
1848 conf->raid_disks = sb->raid_disks;
1850 * 0 for a fully functional array, 1 for a degraded array.
1852 conf->failed_disks = conf->raid_disks - conf->working_disks;
1853 conf->mddev = mddev;
1854 conf->chunk_size = sb->chunk_size;
1855 conf->level = sb->level;
1856 conf->algorithm = sb->layout;
1857 conf->max_nr_stripes = NR_STRIPES;
1859 #if 0
1860 for (i = 0; i < conf->raid_disks; i++) {
1861 if (!conf->disks[i].used_slot) {
1862 MD_BUG();
1863 goto abort;
1866 #endif
1867 if (!conf->chunk_size || conf->chunk_size % 4) {
1868 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1869 goto abort;
1871 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1872 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1873 goto abort;
1875 if (conf->failed_disks > 1) {
1876 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1877 goto abort;
1880 if (conf->working_disks != sb->raid_disks) {
1881 printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1882 start_recovery = 1;
1885 if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
1886 check_consistency(mddev)) {
1887 printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
1888 sb->state &= ~(1 << MD_SB_CLEAN);
1892 const char * name = "raid5d";
1894 conf->thread = md_register_thread(raid5d, conf, name);
1895 if (!conf->thread) {
1896 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1897 goto abort;
1901 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1902 conf->raid_disks * (sizeof(struct buffer_head) +
1903 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
1904 if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1905 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1906 shrink_stripes(conf, conf->max_nr_stripes);
1907 goto abort;
1908 } else
1909 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1912 * Regenerate the "device is in sync with the raid set" bit for
1913 * each device.
1915 for (i = 0; i < MD_SB_DISKS ; i++) {
1916 mark_disk_nonsync(sb->disks + i);
1917 for (j = 0; j < sb->raid_disks; j++) {
1918 if (!conf->disks[j].operational)
1919 continue;
1920 if (sb->disks[i].number == conf->disks[j].number)
1921 mark_disk_sync(sb->disks + i);
1924 sb->active_disks = conf->working_disks;
1926 if (sb->active_disks == sb->raid_disks)
1927 printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1928 else
1929 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1931 if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1932 const char * name = "raid5syncd";
1934 conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1935 if (!conf->resync_thread) {
1936 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1937 goto abort;
1940 printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1941 conf->resync_parity = 1;
1942 md_wakeup_thread(conf->resync_thread);
1945 print_raid5_conf(conf);
1946 if (start_recovery)
1947 md_recover_arrays();
1948 print_raid5_conf(conf);
1950 /* Ok, everything is just fine now */
1951 return (0);
1952 abort:
1953 if (conf) {
1954 print_raid5_conf(conf);
1955 if (conf->stripe_hashtbl)
1956 free_pages((unsigned long) conf->stripe_hashtbl,
1957 HASH_PAGES_ORDER);
1958 kfree(conf);
1960 mddev->private = NULL;
1961 printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1962 MOD_DEC_USE_COUNT;
1963 return -EIO;
1966 static int raid5_stop_resync (mddev_t *mddev)
1968 raid5_conf_t *conf = mddev_to_conf(mddev);
1969 mdk_thread_t *thread = conf->resync_thread;
1971 if (thread) {
1972 if (conf->resync_parity) {
1973 conf->resync_parity = 2;
1974 md_interrupt_thread(thread);
1975 printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
1976 return 1;
1978 return 0;
1980 return 0;
1983 static int raid5_restart_resync (mddev_t *mddev)
1985 raid5_conf_t *conf = mddev_to_conf(mddev);
1987 if (conf->resync_parity) {
1988 if (!conf->resync_thread) {
1989 MD_BUG();
1990 return 0;
1992 printk("raid5: waking up raid5resync.\n");
1993 conf->resync_parity = 1;
1994 md_wakeup_thread(conf->resync_thread);
1995 return 1;
1996 } else
1997 printk("raid5: no restart-resync needed.\n");
1998 return 0;
static int raid5_stop (mddev_t *mddev)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;

	shrink_stripe_cache(conf, conf->max_nr_stripes);
	shrink_stripes(conf, conf->max_nr_stripes);
	md_unregister_thread(conf->thread);
	if (conf->resync_thread)
		md_unregister_thread(conf->resync_thread);
	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
	kfree(conf);
	mddev->private = NULL;
	MOD_DEC_USE_COUNT;
	return 0;
}

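/*
 * Debugging helpers, only compiled in with RAID5_DEBUG: dump the state
 * of a single stripe_head and of every stripe currently in the hash
 * table.
 */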
#if RAID5_DEBUG
static void print_sh (struct stripe_head *sh)
{
	int i;

	printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd);
	printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count));
	printk("sh %lu, ", sh->sector);
	for (i = 0; i < MD_SB_DISKS; i++) {
		if (sh->bh_old[i])
			printk("(old%d: %p) ", i, sh->bh_old[i]);
		if (sh->bh_new[i])
			printk("(new%d: %p) ", i, sh->bh_new[i]);
		if (sh->bh_copy[i])
			printk("(copy%d: %p) ", i, sh->bh_copy[i]);
		if (sh->bh_req[i])
			printk("(req%d: %p) ", i, sh->bh_req[i]);
	}
	printk("\n");
	for (i = 0; i < MD_SB_DISKS; i++)
		printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]);
	printk("\n");
}

static void printall (raid5_conf_t *conf)
{
	struct stripe_head *sh;
	int i;

	md_spin_lock_irq(&conf->device_lock);
	for (i = 0; i < NR_HASH; i++) {
		sh = conf->stripe_hashtbl[i];
		for (; sh; sh = sh->hash_next) {
			if (sh->raid_conf != conf)
				continue;
			print_sh(sh);
		}
	}
	md_spin_unlock_irq(&conf->device_lock);

	PRINTK("--- raid5d inactive\n");
}
#endif

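/*
 * Per-array status line (as shown in /proc/mdstat): raid level, chunk
 * size, layout algorithm and a [total/working] [UU_U] style device map.
 * With RAID5_DEBUG the stripe cache counters are appended as well.
 */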
static int raid5_status (char *page, mddev_t *mddev)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	mdp_super_t *sb = mddev->sb;
	int sz = 0, i;

	sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
	sz += sprintf (page+sz, "]");
#if RAID5_DEBUG
#define D(x) \
	sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
	D(nr_handle);
	D(nr_stripes);
	D(nr_hashed_stripes);
	D(nr_locked_stripes);
	D(nr_pending_stripes);
	D(nr_cached_stripes);
	D(nr_free_sh);
	printall(conf);
#endif
	return sz;
}

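/*
 * Dump the raid5 configuration: per-array disk counts followed by the
 * spare/operational/number/raid_disk/used_slot state of every slot.
 */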
static void print_raid5_conf (raid5_conf_t *conf)
{
	int i;
	struct disk_info *tmp;

	printk("RAID5 conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
		 conf->working_disks, conf->failed_disks);

	for (i = 0; i < MD_SB_DISKS; i++) {
		tmp = conf->disks + i;
		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
			i, tmp->spare, tmp->operational,
			tmp->number, tmp->raid_disk, tmp->used_slot,
			partition_name(tmp->dev));
	}
}

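/*
 * Hot-management entry point.  The operation is done in two passes
 * under conf->device_lock: the first switch() only locates the slot(s)
 * involved (failed disk, spare, slot to remove or free slot to add),
 * the second switch() performs the actual state change.
 */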
static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
	int err = 0;
	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
	raid5_conf_t *conf = mddev->private;
	struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *failed_desc, *spare_desc, *added_desc;

	print_raid5_conf(conf);
	md_spin_lock_irq(&conf->device_lock);
	/*
	 * find the disk ...
	 */
	switch (state) {

	case DISKOP_SPARE_ACTIVE:

		/*
		 * Find the failed disk within the RAID5 configuration ...
		 * (this can only be in the first conf->raid_disks part)
		 */
		for (i = 0; i < conf->raid_disks; i++) {
			tmp = conf->disks + i;
			if ((!tmp->operational && !tmp->spare) ||
					!tmp->used_slot) {
				failed_disk = i;
				break;
			}
		}
		/*
		 * When we activate a spare disk we _must_ have a disk in
		 * the lower (active) part of the array to replace.
		 */
		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		/* fall through */

	case DISKOP_SPARE_WRITE:
	case DISKOP_SPARE_INACTIVE:

		/*
		 * Find the spare disk ... (can only be in the 'high'
		 * area of the array)
		 */
		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (tmp->spare && tmp->number == (*d)->number) {
				spare_disk = i;
				break;
			}
		}
		if (spare_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_REMOVE_DISK:

		for (i = 0; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (tmp->used_slot && (tmp->number == (*d)->number)) {
				if (tmp->operational) {
					err = -EBUSY;
					goto abort;
				}
				removed_disk = i;
				break;
			}
		}
		if (removed_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_ADD_DISK:

		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (!tmp->used_slot) {
				added_disk = i;
				break;
			}
		}
		if (added_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;
	}

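	/*
	 * Now perform the requested state change on the slot(s) found
	 * above; the superblock descriptor in *d is updated as needed.
	 */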
	switch (state) {
	/*
	 * Switch the spare disk to write-only mode:
	 */
	case DISKOP_SPARE_WRITE:
		if (conf->spare) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		sdisk = conf->disks + spare_disk;
		sdisk->operational = 1;
		sdisk->write_only = 1;
		conf->spare = sdisk;
		break;
	/*
	 * Deactivate a spare disk:
	 */
	case DISKOP_SPARE_INACTIVE:
		sdisk = conf->disks + spare_disk;
		sdisk->operational = 0;
		sdisk->write_only = 0;
		/*
		 * Was the spare being resynced?
		 */
		if (conf->spare == sdisk)
			conf->spare = NULL;
		break;
	/*
	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
	 * with the failed disk. (only the first 'conf->raid_disks'
	 * slots are used for 'real' disks and we must preserve this
	 * property)
	 */
	case DISKOP_SPARE_ACTIVE:
		if (!conf->spare) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		sdisk = conf->disks + spare_disk;
		fdisk = conf->disks + failed_disk;

		spare_desc = &sb->disks[sdisk->number];
		failed_desc = &sb->disks[fdisk->number];

		if (spare_desc != *d) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (spare_desc->raid_disk != sdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (sdisk->raid_disk != spare_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (failed_desc->raid_disk != fdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (fdisk->raid_disk != failed_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		/*
		 * do the switch finally
		 */
		xchg_values(*spare_desc, *failed_desc);
		xchg_values(*fdisk, *sdisk);

		/*
		 * (careful, 'failed' and 'spare' are switched from now on)
		 *
		 * we want to preserve linear numbering and we want to
		 * give the proper raid_disk number to the now activated
		 * disk. (this means we switch back these values)
		 */
		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
		xchg_values(spare_desc->number, failed_desc->number);
		xchg_values(sdisk->number, fdisk->number);

		*d = failed_desc;

		if (sdisk->dev == MKDEV(0,0))
			sdisk->used_slot = 0;

		/*
		 * this really activates the spare.
		 */
		fdisk->spare = 0;
		fdisk->write_only = 0;

		/*
		 * if we activate a spare, we definitely replace a
		 * non-operational disk slot in the 'low' area of
		 * the disk array.
		 */
		conf->failed_disks--;
		conf->working_disks++;
		conf->spare = NULL;

		break;

	case DISKOP_HOT_REMOVE_DISK:
		rdisk = conf->disks + removed_disk;

		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		rdisk->dev = MKDEV(0,0);
		rdisk->used_slot = 0;

		break;

	case DISKOP_HOT_ADD_DISK:
		adisk = conf->disks + added_disk;
		added_desc = *d;

		if (added_disk != added_desc->number) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		adisk->number = added_desc->number;
		adisk->raid_disk = added_desc->raid_disk;
		adisk->dev = MKDEV(added_desc->major, added_desc->minor);

		adisk->operational = 0;
		adisk->write_only = 0;
		adisk->spare = 1;
		adisk->used_slot = 1;

		break;

	default:
		MD_BUG();
		err = 1;
		goto abort;
	}
abort:
	md_spin_unlock_irq(&conf->device_lock);
	print_raid5_conf(conf);
	return err;
}

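/*
 * Personality descriptor registered with the md core; these hooks are
 * how md dispatches I/O, status reporting, error handling and disk
 * operations to the raid5 code above.
 */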
static mdk_personality_t raid5_personality=
{
	name:		"raid5",
	make_request:	raid5_make_request,
	run:		raid5_run,
	stop:		raid5_stop,
	status:		raid5_status,
	error_handler:	raid5_error,
	diskop:		raid5_diskop,
	stop_resync:	raid5_stop_resync,
	restart_resync:	raid5_restart_resync,
	sync_request:	raid5_sync_request
};

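/*
 * Register the raid5 personality with the md core.  (When built into
 * the kernel this is presumably invoked from the md setup code; as a
 * module the init_module()/cleanup_module() wrappers below are used.)
 */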
int raid5_init (void)
{
	int err;

	err = register_md_personality (RAID5, &raid5_personality);
	if (err)
		return err;
	return 0;
}

#ifdef MODULE
int init_module (void)
{
	return raid5_init();
}

void cleanup_module (void)
{
	unregister_md_personality (RAID5);
}
#endif