2 * raid6main.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
7 * RAID-6 management functions. This code is derived from raid5.c.
8 * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
10 * Thanks to Penguin Computing for making the RAID-6 development possible
11 * by donating a test server!
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
18 * You should have received a copy of the GNU General Public License
19 * (for example /usr/src/linux/COPYING); if not, write to the Free
20 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include <linux/config.h>
25 #include <linux/module.h>
26 #include <linux/slab.h>
27 #include <linux/highmem.h>
28 #include <linux/bitops.h>
29 #include <asm/atomic.h>
32 #include <linux/raid/bitmap.h>
38 #define NR_STRIPES 256
39 #define STRIPE_SIZE PAGE_SIZE
40 #define STRIPE_SHIFT (PAGE_SHIFT - 9)
41 #define STRIPE_SECTORS (STRIPE_SIZE>>9)
42 #define IO_THRESHOLD 1
43 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
44 #define HASH_MASK (NR_HASH - 1)
46 #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
48 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
49 * order without overlap. There may be several bio's per stripe+device, and
50 * a bio could span several devices.
51 * When walking this list for a particular stripe+device, we must never proceed
52 * beyond a bio that extends past this device, as the next bio might no longer be valid.
54 * This macro is used to determine the 'next' bio in the list, given the sector
55 * of the current stripe+device
57 #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
59 * The following can be used to debug the driver
61 #define RAID6_DEBUG 0 /* Extremely verbose printk */
62 #define RAID6_PARANOIA 1 /* Check spinlocks */
63 #define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */
64 #if RAID6_PARANOIA && defined(CONFIG_SMP)
65 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
67 # define CHECK_DEVLOCK()
70 #define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
78 #if !RAID6_USE_EMPTY_ZERO_PAGE
79 /* In .bss so it's zeroed */
80 const char raid6_empty_zero_page
[PAGE_SIZE
] __attribute__((aligned(256)));
/*
 * Return the index of the disk that follows @disk in an array of
 * @raid_disks members, wrapping from the last disk back to disk 0.
 *
 * The increment matters: callers derive the Q index from the P index
 * and walk the whole stripe with "i = raid6_next_disk(i, disks)" until
 * they are back at the starting disk, so the value must always advance.
 */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}
89 static void print_raid6_conf (raid6_conf_t
*conf
);
91 static void __release_stripe(raid6_conf_t
*conf
, struct stripe_head
*sh
)
93 if (atomic_dec_and_test(&sh
->count
)) {
94 if (!list_empty(&sh
->lru
))
96 if (atomic_read(&conf
->active_stripes
)==0)
98 if (test_bit(STRIPE_HANDLE
, &sh
->state
)) {
99 if (test_bit(STRIPE_DELAYED
, &sh
->state
))
100 list_add_tail(&sh
->lru
, &conf
->delayed_list
);
101 else if (test_bit(STRIPE_BIT_DELAY
, &sh
->state
) &&
102 conf
->seq_write
== sh
->bm_seq
)
103 list_add_tail(&sh
->lru
, &conf
->bitmap_list
);
105 clear_bit(STRIPE_BIT_DELAY
, &sh
->state
);
106 list_add_tail(&sh
->lru
, &conf
->handle_list
);
108 md_wakeup_thread(conf
->mddev
->thread
);
110 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
)) {
111 atomic_dec(&conf
->preread_active_stripes
);
112 if (atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
)
113 md_wakeup_thread(conf
->mddev
->thread
);
115 list_add_tail(&sh
->lru
, &conf
->inactive_list
);
116 atomic_dec(&conf
->active_stripes
);
117 if (!conf
->inactive_blocked
||
118 atomic_read(&conf
->active_stripes
) < (conf
->max_nr_stripes
*3/4))
119 wake_up(&conf
->wait_for_stripe
);
123 static void release_stripe(struct stripe_head
*sh
)
125 raid6_conf_t
*conf
= sh
->raid_conf
;
128 spin_lock_irqsave(&conf
->device_lock
, flags
);
129 __release_stripe(conf
, sh
);
130 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
133 static inline void remove_hash(struct stripe_head
*sh
)
135 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh
->sector
);
137 hlist_del_init(&sh
->hash
);
140 static inline void insert_hash(raid6_conf_t
*conf
, struct stripe_head
*sh
)
142 struct hlist_head
*hp
= stripe_hash(conf
, sh
->sector
);
144 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh
->sector
);
147 hlist_add_head(&sh
->hash
, hp
);
151 /* find an idle stripe, make sure it is unhashed, and return it. */
152 static struct stripe_head
*get_free_stripe(raid6_conf_t
*conf
)
154 struct stripe_head
*sh
= NULL
;
155 struct list_head
*first
;
158 if (list_empty(&conf
->inactive_list
))
160 first
= conf
->inactive_list
.next
;
161 sh
= list_entry(first
, struct stripe_head
, lru
);
162 list_del_init(first
);
164 atomic_inc(&conf
->active_stripes
);
169 static void shrink_buffers(struct stripe_head
*sh
, int num
)
174 for (i
=0; i
<num
; i
++) {
178 sh
->dev
[i
].page
= NULL
;
183 static int grow_buffers(struct stripe_head
*sh
, int num
)
187 for (i
=0; i
<num
; i
++) {
190 if (!(page
= alloc_page(GFP_KERNEL
))) {
193 sh
->dev
[i
].page
= page
;
198 static void raid6_build_block (struct stripe_head
*sh
, int i
);
200 static void init_stripe(struct stripe_head
*sh
, sector_t sector
, int pd_idx
)
202 raid6_conf_t
*conf
= sh
->raid_conf
;
203 int disks
= conf
->raid_disks
, i
;
205 if (atomic_read(&sh
->count
) != 0)
207 if (test_bit(STRIPE_HANDLE
, &sh
->state
))
211 PRINTK("init_stripe called, stripe %llu\n",
212 (unsigned long long)sh
->sector
);
220 for (i
=disks
; i
--; ) {
221 struct r5dev
*dev
= &sh
->dev
[i
];
223 if (dev
->toread
|| dev
->towrite
|| dev
->written
||
224 test_bit(R5_LOCKED
, &dev
->flags
)) {
225 PRINTK("sector=%llx i=%d %p %p %p %d\n",
226 (unsigned long long)sh
->sector
, i
, dev
->toread
,
227 dev
->towrite
, dev
->written
,
228 test_bit(R5_LOCKED
, &dev
->flags
));
232 raid6_build_block(sh
, i
);
234 insert_hash(conf
, sh
);
237 static struct stripe_head
*__find_stripe(raid6_conf_t
*conf
, sector_t sector
)
239 struct stripe_head
*sh
;
240 struct hlist_node
*hn
;
243 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector
);
244 hlist_for_each_entry (sh
, hn
, stripe_hash(conf
, sector
), hash
)
245 if (sh
->sector
== sector
)
247 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector
);
251 static void unplug_slaves(mddev_t
*mddev
);
253 static struct stripe_head
*get_active_stripe(raid6_conf_t
*conf
, sector_t sector
,
254 int pd_idx
, int noblock
)
256 struct stripe_head
*sh
;
258 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector
);
260 spin_lock_irq(&conf
->device_lock
);
263 wait_event_lock_irq(conf
->wait_for_stripe
,
265 conf
->device_lock
, /* nothing */);
266 sh
= __find_stripe(conf
, sector
);
268 if (!conf
->inactive_blocked
)
269 sh
= get_free_stripe(conf
);
270 if (noblock
&& sh
== NULL
)
273 conf
->inactive_blocked
= 1;
274 wait_event_lock_irq(conf
->wait_for_stripe
,
275 !list_empty(&conf
->inactive_list
) &&
276 (atomic_read(&conf
->active_stripes
)
277 < (conf
->max_nr_stripes
*3/4)
278 || !conf
->inactive_blocked
),
280 unplug_slaves(conf
->mddev
);
282 conf
->inactive_blocked
= 0;
284 init_stripe(sh
, sector
, pd_idx
);
286 if (atomic_read(&sh
->count
)) {
287 if (!list_empty(&sh
->lru
))
290 if (!test_bit(STRIPE_HANDLE
, &sh
->state
))
291 atomic_inc(&conf
->active_stripes
);
292 if (list_empty(&sh
->lru
))
294 list_del_init(&sh
->lru
);
297 } while (sh
== NULL
);
300 atomic_inc(&sh
->count
);
302 spin_unlock_irq(&conf
->device_lock
);
306 static int grow_one_stripe(raid6_conf_t
*conf
)
308 struct stripe_head
*sh
;
309 sh
= kmem_cache_alloc(conf
->slab_cache
, GFP_KERNEL
);
312 memset(sh
, 0, sizeof(*sh
) + (conf
->raid_disks
-1)*sizeof(struct r5dev
));
313 sh
->raid_conf
= conf
;
314 spin_lock_init(&sh
->lock
);
316 if (grow_buffers(sh
, conf
->raid_disks
)) {
317 shrink_buffers(sh
, conf
->raid_disks
);
318 kmem_cache_free(conf
->slab_cache
, sh
);
321 /* we just created an active stripe so... */
322 atomic_set(&sh
->count
, 1);
323 atomic_inc(&conf
->active_stripes
);
324 INIT_LIST_HEAD(&sh
->lru
);
329 static int grow_stripes(raid6_conf_t
*conf
, int num
)
332 int devs
= conf
->raid_disks
;
334 sprintf(conf
->cache_name
[0], "raid6/%s", mdname(conf
->mddev
));
336 sc
= kmem_cache_create(conf
->cache_name
[0],
337 sizeof(struct stripe_head
)+(devs
-1)*sizeof(struct r5dev
),
341 conf
->slab_cache
= sc
;
343 if (!grow_one_stripe(conf
))
348 static int drop_one_stripe(raid6_conf_t
*conf
)
350 struct stripe_head
*sh
;
351 spin_lock_irq(&conf
->device_lock
);
352 sh
= get_free_stripe(conf
);
353 spin_unlock_irq(&conf
->device_lock
);
356 if (atomic_read(&sh
->count
))
358 shrink_buffers(sh
, conf
->raid_disks
);
359 kmem_cache_free(conf
->slab_cache
, sh
);
360 atomic_dec(&conf
->active_stripes
);
364 static void shrink_stripes(raid6_conf_t
*conf
)
366 while (drop_one_stripe(conf
))
369 if (conf
->slab_cache
)
370 kmem_cache_destroy(conf
->slab_cache
);
371 conf
->slab_cache
= NULL
;
374 static int raid6_end_read_request(struct bio
* bi
, unsigned int bytes_done
,
377 struct stripe_head
*sh
= bi
->bi_private
;
378 raid6_conf_t
*conf
= sh
->raid_conf
;
379 int disks
= conf
->raid_disks
, i
;
380 int uptodate
= test_bit(BIO_UPTODATE
, &bi
->bi_flags
);
385 for (i
=0 ; i
<disks
; i
++)
386 if (bi
== &sh
->dev
[i
].req
)
389 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
390 (unsigned long long)sh
->sector
, i
, atomic_read(&sh
->count
),
401 spin_lock_irqsave(&conf
->device_lock
, flags
);
402 /* we can return a buffer if we bypassed the cache or
403 * if the top buffer is not in highmem. If there are
404 * multiple buffers, leave the extra work to
407 buffer
= sh
->bh_read
[i
];
409 (!PageHighMem(buffer
->b_page
)
410 || buffer
->b_page
== bh
->b_page
)
412 sh
->bh_read
[i
] = buffer
->b_reqnext
;
413 buffer
->b_reqnext
= NULL
;
416 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
417 if (sh
->bh_page
[i
]==bh
->b_page
)
418 set_buffer_uptodate(bh
);
420 if (buffer
->b_page
!= bh
->b_page
)
421 memcpy(buffer
->b_data
, bh
->b_data
, bh
->b_size
);
422 buffer
->b_end_io(buffer
, 1);
425 set_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
);
427 if (test_bit(R5_ReadError
, &sh
->dev
[i
].flags
)) {
428 printk(KERN_INFO
"raid6: read error corrected!!\n");
429 clear_bit(R5_ReadError
, &sh
->dev
[i
].flags
);
430 clear_bit(R5_ReWrite
, &sh
->dev
[i
].flags
);
432 if (atomic_read(&conf
->disks
[i
].rdev
->read_errors
))
433 atomic_set(&conf
->disks
[i
].rdev
->read_errors
, 0);
436 clear_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
);
437 atomic_inc(&conf
->disks
[i
].rdev
->read_errors
);
438 if (conf
->mddev
->degraded
)
439 printk(KERN_WARNING
"raid6: read error not correctable.\n");
440 else if (test_bit(R5_ReWrite
, &sh
->dev
[i
].flags
))
442 printk(KERN_WARNING
"raid6: read error NOT corrected!!\n");
443 else if (atomic_read(&conf
->disks
[i
].rdev
->read_errors
)
444 > conf
->max_nr_stripes
)
446 "raid6: Too many read errors, failing device.\n");
450 set_bit(R5_ReadError
, &sh
->dev
[i
].flags
);
452 clear_bit(R5_ReadError
, &sh
->dev
[i
].flags
);
453 clear_bit(R5_ReWrite
, &sh
->dev
[i
].flags
);
454 md_error(conf
->mddev
, conf
->disks
[i
].rdev
);
457 rdev_dec_pending(conf
->disks
[i
].rdev
, conf
->mddev
);
459 /* must restore b_page before unlocking buffer... */
460 if (sh
->bh_page
[i
] != bh
->b_page
) {
461 bh
->b_page
= sh
->bh_page
[i
];
462 bh
->b_data
= page_address(bh
->b_page
);
463 clear_buffer_uptodate(bh
);
466 clear_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
467 set_bit(STRIPE_HANDLE
, &sh
->state
);
472 static int raid6_end_write_request (struct bio
*bi
, unsigned int bytes_done
,
475 struct stripe_head
*sh
= bi
->bi_private
;
476 raid6_conf_t
*conf
= sh
->raid_conf
;
477 int disks
= conf
->raid_disks
, i
;
479 int uptodate
= test_bit(BIO_UPTODATE
, &bi
->bi_flags
);
484 for (i
=0 ; i
<disks
; i
++)
485 if (bi
== &sh
->dev
[i
].req
)
488 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
489 (unsigned long long)sh
->sector
, i
, atomic_read(&sh
->count
),
496 spin_lock_irqsave(&conf
->device_lock
, flags
);
498 md_error(conf
->mddev
, conf
->disks
[i
].rdev
);
500 rdev_dec_pending(conf
->disks
[i
].rdev
, conf
->mddev
);
502 clear_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
503 set_bit(STRIPE_HANDLE
, &sh
->state
);
504 __release_stripe(conf
, sh
);
505 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
510 static sector_t
compute_blocknr(struct stripe_head
*sh
, int i
);
512 static void raid6_build_block (struct stripe_head
*sh
, int i
)
514 struct r5dev
*dev
= &sh
->dev
[i
];
515 int pd_idx
= sh
->pd_idx
;
516 int qd_idx
= raid6_next_disk(pd_idx
, sh
->raid_conf
->raid_disks
);
519 dev
->req
.bi_io_vec
= &dev
->vec
;
521 dev
->req
.bi_max_vecs
++;
522 dev
->vec
.bv_page
= dev
->page
;
523 dev
->vec
.bv_len
= STRIPE_SIZE
;
524 dev
->vec
.bv_offset
= 0;
526 dev
->req
.bi_sector
= sh
->sector
;
527 dev
->req
.bi_private
= sh
;
530 if (i
!= pd_idx
&& i
!= qd_idx
)
531 dev
->sector
= compute_blocknr(sh
, i
);
534 static void error(mddev_t
*mddev
, mdk_rdev_t
*rdev
)
536 char b
[BDEVNAME_SIZE
];
537 raid6_conf_t
*conf
= (raid6_conf_t
*) mddev
->private;
538 PRINTK("raid6: error called\n");
540 if (!test_bit(Faulty
, &rdev
->flags
)) {
542 if (test_bit(In_sync
, &rdev
->flags
)) {
543 conf
->working_disks
--;
545 conf
->failed_disks
++;
546 clear_bit(In_sync
, &rdev
->flags
);
548 * if recovery was running, make sure it aborts.
550 set_bit(MD_RECOVERY_ERR
, &mddev
->recovery
);
552 set_bit(Faulty
, &rdev
->flags
);
554 "raid6: Disk failure on %s, disabling device."
555 " Operation continuing on %d devices\n",
556 bdevname(rdev
->bdev
,b
), conf
->working_disks
);
561 * Input: a 'big' sector number,
562 * Output: index of the data and parity disk, and the sector # in them.
564 static sector_t
raid6_compute_sector(sector_t r_sector
, unsigned int raid_disks
,
565 unsigned int data_disks
, unsigned int * dd_idx
,
566 unsigned int * pd_idx
, raid6_conf_t
*conf
)
569 unsigned long chunk_number
;
570 unsigned int chunk_offset
;
572 int sectors_per_chunk
= conf
->chunk_size
>> 9;
574 /* First compute the information on this sector */
577 * Compute the chunk number and the sector offset inside the chunk
579 chunk_offset
= sector_div(r_sector
, sectors_per_chunk
);
580 chunk_number
= r_sector
;
581 if ( r_sector
!= chunk_number
) {
582 printk(KERN_CRIT
"raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
583 (unsigned long long)r_sector
, (unsigned long)chunk_number
);
588 * Compute the stripe number
590 stripe
= chunk_number
/ data_disks
;
593 * Compute the data disk and parity disk indexes inside the stripe
595 *dd_idx
= chunk_number
% data_disks
;
598 * Select the parity disk based on the user selected algorithm.
602 switch (conf
->algorithm
) {
603 case ALGORITHM_LEFT_ASYMMETRIC
:
604 *pd_idx
= raid_disks
- 1 - (stripe
% raid_disks
);
605 if (*pd_idx
== raid_disks
-1)
606 (*dd_idx
)++; /* Q D D D P */
607 else if (*dd_idx
>= *pd_idx
)
608 (*dd_idx
) += 2; /* D D P Q D */
610 case ALGORITHM_RIGHT_ASYMMETRIC
:
611 *pd_idx
= stripe
% raid_disks
;
612 if (*pd_idx
== raid_disks
-1)
613 (*dd_idx
)++; /* Q D D D P */
614 else if (*dd_idx
>= *pd_idx
)
615 (*dd_idx
) += 2; /* D D P Q D */
617 case ALGORITHM_LEFT_SYMMETRIC
:
618 *pd_idx
= raid_disks
- 1 - (stripe
% raid_disks
);
619 *dd_idx
= (*pd_idx
+ 2 + *dd_idx
) % raid_disks
;
621 case ALGORITHM_RIGHT_SYMMETRIC
:
622 *pd_idx
= stripe
% raid_disks
;
623 *dd_idx
= (*pd_idx
+ 2 + *dd_idx
) % raid_disks
;
626 printk (KERN_CRIT
"raid6: unsupported algorithm %d\n",
630 PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
631 chunk_number
, *pd_idx
, *dd_idx
);
634 * Finally, compute the new sector number
636 new_sector
= (sector_t
) stripe
* sectors_per_chunk
+ chunk_offset
;
641 static sector_t
compute_blocknr(struct stripe_head
*sh
, int i
)
643 raid6_conf_t
*conf
= sh
->raid_conf
;
644 int raid_disks
= conf
->raid_disks
, data_disks
= raid_disks
- 2;
645 sector_t new_sector
= sh
->sector
, check
;
646 int sectors_per_chunk
= conf
->chunk_size
>> 9;
649 int chunk_number
, dummy1
, dummy2
, dd_idx
= i
;
653 chunk_offset
= sector_div(new_sector
, sectors_per_chunk
);
655 if ( new_sector
!= stripe
) {
656 printk(KERN_CRIT
"raid6: ERROR: new_sector = %llu, stripe = %lu\n",
657 (unsigned long long)new_sector
, (unsigned long)stripe
);
661 switch (conf
->algorithm
) {
662 case ALGORITHM_LEFT_ASYMMETRIC
:
663 case ALGORITHM_RIGHT_ASYMMETRIC
:
664 if (sh
->pd_idx
== raid_disks
-1)
666 else if (i
> sh
->pd_idx
)
667 i
-= 2; /* D D P Q D */
669 case ALGORITHM_LEFT_SYMMETRIC
:
670 case ALGORITHM_RIGHT_SYMMETRIC
:
671 if (sh
->pd_idx
== raid_disks
-1)
677 i
-= (sh
->pd_idx
+ 2);
681 printk (KERN_CRIT
"raid6: unsupported algorithm %d\n",
685 PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh
->pd_idx
, i0
, i
);
687 chunk_number
= stripe
* data_disks
+ i
;
688 r_sector
= (sector_t
)chunk_number
* sectors_per_chunk
+ chunk_offset
;
690 check
= raid6_compute_sector (r_sector
, raid_disks
, data_disks
, &dummy1
, &dummy2
, conf
);
691 if (check
!= sh
->sector
|| dummy1
!= dd_idx
|| dummy2
!= sh
->pd_idx
) {
692 printk(KERN_CRIT
"raid6: compute_blocknr: map not correct\n");
701 * Copy data between a page in the stripe cache, and one or more bion
702 * The page could align with the middle of the bio, or there could be
703 * several bion, each with several bio_vecs, which cover part of the page
704 * Multiple bion are linked together on bi_next. There may be extras
705 * at the end of this list. We ignore them.
707 static void copy_data(int frombio
, struct bio
*bio
,
711 char *pa
= page_address(page
);
716 if (bio
->bi_sector
>= sector
)
717 page_offset
= (signed)(bio
->bi_sector
- sector
) * 512;
719 page_offset
= (signed)(sector
- bio
->bi_sector
) * -512;
720 bio_for_each_segment(bvl
, bio
, i
) {
721 int len
= bio_iovec_idx(bio
,i
)->bv_len
;
725 if (page_offset
< 0) {
726 b_offset
= -page_offset
;
727 page_offset
+= b_offset
;
731 if (len
> 0 && page_offset
+ len
> STRIPE_SIZE
)
732 clen
= STRIPE_SIZE
- page_offset
;
736 char *ba
= __bio_kmap_atomic(bio
, i
, KM_USER0
);
738 memcpy(pa
+page_offset
, ba
+b_offset
, clen
);
740 memcpy(ba
+b_offset
, pa
+page_offset
, clen
);
741 __bio_kunmap_atomic(ba
, KM_USER0
);
743 if (clen
< len
) /* hit end of page */
749 #define check_xor() do { \
750 if (count == MAX_XOR_BLOCKS) { \
751 xor_block(count, STRIPE_SIZE, ptr); \
756 /* Compute P and Q syndromes */
757 static void compute_parity(struct stripe_head
*sh
, int method
)
759 raid6_conf_t
*conf
= sh
->raid_conf
;
760 int i
, pd_idx
= sh
->pd_idx
, qd_idx
, d0_idx
, disks
= conf
->raid_disks
, count
;
762 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
765 qd_idx
= raid6_next_disk(pd_idx
, disks
);
766 d0_idx
= raid6_next_disk(qd_idx
, disks
);
768 PRINTK("compute_parity, stripe %llu, method %d\n",
769 (unsigned long long)sh
->sector
, method
);
772 case READ_MODIFY_WRITE
:
773 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
774 case RECONSTRUCT_WRITE
:
775 for (i
= disks
; i
-- ;)
776 if ( i
!= pd_idx
&& i
!= qd_idx
&& sh
->dev
[i
].towrite
) {
777 chosen
= sh
->dev
[i
].towrite
;
778 sh
->dev
[i
].towrite
= NULL
;
780 if (test_and_clear_bit(R5_Overlap
, &sh
->dev
[i
].flags
))
781 wake_up(&conf
->wait_for_overlap
);
783 if (sh
->dev
[i
].written
) BUG();
784 sh
->dev
[i
].written
= chosen
;
788 BUG(); /* Not implemented yet */
791 for (i
= disks
; i
--;)
792 if (sh
->dev
[i
].written
) {
793 sector_t sector
= sh
->dev
[i
].sector
;
794 struct bio
*wbi
= sh
->dev
[i
].written
;
795 while (wbi
&& wbi
->bi_sector
< sector
+ STRIPE_SECTORS
) {
796 copy_data(1, wbi
, sh
->dev
[i
].page
, sector
);
797 wbi
= r5_next_bio(wbi
, sector
);
800 set_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
801 set_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
);
805 // case RECONSTRUCT_WRITE:
806 // case CHECK_PARITY:
807 // case UPDATE_PARITY:
808 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
809 /* FIX: Is this ordering of drives even remotely optimal? */
813 ptrs
[count
++] = page_address(sh
->dev
[i
].page
);
814 if (count
<= disks
-2 && !test_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
))
815 printk("block %d/%d not uptodate on parity calc\n", i
,count
);
816 i
= raid6_next_disk(i
, disks
);
817 } while ( i
!= d0_idx
);
821 raid6_call
.gen_syndrome(disks
, STRIPE_SIZE
, ptrs
);
824 case RECONSTRUCT_WRITE
:
825 set_bit(R5_UPTODATE
, &sh
->dev
[pd_idx
].flags
);
826 set_bit(R5_UPTODATE
, &sh
->dev
[qd_idx
].flags
);
827 set_bit(R5_LOCKED
, &sh
->dev
[pd_idx
].flags
);
828 set_bit(R5_LOCKED
, &sh
->dev
[qd_idx
].flags
);
831 set_bit(R5_UPTODATE
, &sh
->dev
[pd_idx
].flags
);
832 set_bit(R5_UPTODATE
, &sh
->dev
[qd_idx
].flags
);
837 /* Compute one missing block */
838 static void compute_block_1(struct stripe_head
*sh
, int dd_idx
, int nozero
)
840 raid6_conf_t
*conf
= sh
->raid_conf
;
841 int i
, count
, disks
= conf
->raid_disks
;
842 void *ptr
[MAX_XOR_BLOCKS
], *p
;
843 int pd_idx
= sh
->pd_idx
;
844 int qd_idx
= raid6_next_disk(pd_idx
, disks
);
846 PRINTK("compute_block_1, stripe %llu, idx %d\n",
847 (unsigned long long)sh
->sector
, dd_idx
);
849 if ( dd_idx
== qd_idx
) {
850 /* We're actually computing the Q drive */
851 compute_parity(sh
, UPDATE_PARITY
);
853 ptr
[0] = page_address(sh
->dev
[dd_idx
].page
);
854 if (!nozero
) memset(ptr
[0], 0, STRIPE_SIZE
);
856 for (i
= disks
; i
--; ) {
857 if (i
== dd_idx
|| i
== qd_idx
)
859 p
= page_address(sh
->dev
[i
].page
);
860 if (test_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
))
863 printk("compute_block() %d, stripe %llu, %d"
864 " not present\n", dd_idx
,
865 (unsigned long long)sh
->sector
, i
);
870 xor_block(count
, STRIPE_SIZE
, ptr
);
871 if (!nozero
) set_bit(R5_UPTODATE
, &sh
->dev
[dd_idx
].flags
);
872 else clear_bit(R5_UPTODATE
, &sh
->dev
[dd_idx
].flags
);
876 /* Compute two missing blocks */
877 static void compute_block_2(struct stripe_head
*sh
, int dd_idx1
, int dd_idx2
)
879 raid6_conf_t
*conf
= sh
->raid_conf
;
880 int i
, count
, disks
= conf
->raid_disks
;
881 int pd_idx
= sh
->pd_idx
;
882 int qd_idx
= raid6_next_disk(pd_idx
, disks
);
883 int d0_idx
= raid6_next_disk(qd_idx
, disks
);
886 /* faila and failb are disk numbers relative to d0_idx */
887 /* pd_idx become disks-2 and qd_idx become disks-1 */
888 faila
= (dd_idx1
< d0_idx
) ? dd_idx1
+(disks
-d0_idx
) : dd_idx1
-d0_idx
;
889 failb
= (dd_idx2
< d0_idx
) ? dd_idx2
+(disks
-d0_idx
) : dd_idx2
-d0_idx
;
891 BUG_ON(faila
== failb
);
892 if ( failb
< faila
) { int tmp
= faila
; faila
= failb
; failb
= tmp
; }
894 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
895 (unsigned long long)sh
->sector
, dd_idx1
, dd_idx2
, faila
, failb
);
897 if ( failb
== disks
-1 ) {
898 /* Q disk is one of the missing disks */
899 if ( faila
== disks
-2 ) {
900 /* Missing P+Q, just recompute */
901 compute_parity(sh
, UPDATE_PARITY
);
904 /* We're missing D+Q; recompute D from P */
905 compute_block_1(sh
, (dd_idx1
== qd_idx
) ? dd_idx2
: dd_idx1
, 0);
906 compute_parity(sh
, UPDATE_PARITY
); /* Is this necessary? */
911 /* We're missing D+P or D+D; build pointer table */
913 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
919 ptrs
[count
++] = page_address(sh
->dev
[i
].page
);
920 i
= raid6_next_disk(i
, disks
);
921 if (i
!= dd_idx1
&& i
!= dd_idx2
&&
922 !test_bit(R5_UPTODATE
, &sh
->dev
[i
].flags
))
923 printk("compute_2 with missing block %d/%d\n", count
, i
);
924 } while ( i
!= d0_idx
);
926 if ( failb
== disks
-2 ) {
927 /* We're missing D+P. */
928 raid6_datap_recov(disks
, STRIPE_SIZE
, faila
, ptrs
);
930 /* We're missing D+D. */
931 raid6_2data_recov(disks
, STRIPE_SIZE
, faila
, failb
, ptrs
);
934 /* Both the above update both missing blocks */
935 set_bit(R5_UPTODATE
, &sh
->dev
[dd_idx1
].flags
);
936 set_bit(R5_UPTODATE
, &sh
->dev
[dd_idx2
].flags
);
942 * Each stripe/dev can have one or more bion attached.
943 * toread/towrite point to the first in a chain.
944 * The bi_next chain must be in order.
946 static int add_stripe_bio(struct stripe_head
*sh
, struct bio
*bi
, int dd_idx
, int forwrite
)
949 raid6_conf_t
*conf
= sh
->raid_conf
;
952 PRINTK("adding bh b#%llu to stripe s#%llu\n",
953 (unsigned long long)bi
->bi_sector
,
954 (unsigned long long)sh
->sector
);
957 spin_lock(&sh
->lock
);
958 spin_lock_irq(&conf
->device_lock
);
960 bip
= &sh
->dev
[dd_idx
].towrite
;
961 if (*bip
== NULL
&& sh
->dev
[dd_idx
].written
== NULL
)
964 bip
= &sh
->dev
[dd_idx
].toread
;
965 while (*bip
&& (*bip
)->bi_sector
< bi
->bi_sector
) {
966 if ((*bip
)->bi_sector
+ ((*bip
)->bi_size
>> 9) > bi
->bi_sector
)
968 bip
= &(*bip
)->bi_next
;
970 if (*bip
&& (*bip
)->bi_sector
< bi
->bi_sector
+ ((bi
->bi_size
)>>9))
973 if (*bip
&& bi
->bi_next
&& (*bip
) != bi
->bi_next
)
978 bi
->bi_phys_segments
++;
979 spin_unlock_irq(&conf
->device_lock
);
980 spin_unlock(&sh
->lock
);
982 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
983 (unsigned long long)bi
->bi_sector
,
984 (unsigned long long)sh
->sector
, dd_idx
);
986 if (conf
->mddev
->bitmap
&& firstwrite
) {
987 sh
->bm_seq
= conf
->seq_write
;
988 bitmap_startwrite(conf
->mddev
->bitmap
, sh
->sector
,
990 set_bit(STRIPE_BIT_DELAY
, &sh
->state
);
994 /* check if page is covered */
995 sector_t sector
= sh
->dev
[dd_idx
].sector
;
996 for (bi
=sh
->dev
[dd_idx
].towrite
;
997 sector
< sh
->dev
[dd_idx
].sector
+ STRIPE_SECTORS
&&
998 bi
&& bi
->bi_sector
<= sector
;
999 bi
= r5_next_bio(bi
, sh
->dev
[dd_idx
].sector
)) {
1000 if (bi
->bi_sector
+ (bi
->bi_size
>>9) >= sector
)
1001 sector
= bi
->bi_sector
+ (bi
->bi_size
>>9);
1003 if (sector
>= sh
->dev
[dd_idx
].sector
+ STRIPE_SECTORS
)
1004 set_bit(R5_OVERWRITE
, &sh
->dev
[dd_idx
].flags
);
1009 set_bit(R5_Overlap
, &sh
->dev
[dd_idx
].flags
);
1010 spin_unlock_irq(&conf
->device_lock
);
1011 spin_unlock(&sh
->lock
);
1016 static int page_is_zero(struct page
*p
)
1018 char *a
= page_address(p
);
1019 return ((*(u32
*)a
) == 0 &&
1020 memcmp(a
, a
+4, STRIPE_SIZE
-4)==0);
1023 * handle_stripe - do things to a stripe.
1025 * We lock the stripe and then examine the state of various bits
1026 * to see what needs to be done.
1028 * return some read request which now have data
1029 * return some write requests which are safely on disc
1030 * schedule a read on some buffers
1031 * schedule a write of some buffers
1032 * return confirmation of parity correctness
1034 * Parity calculations are done inside the stripe lock
1035 * buffers are taken off read_list or write_list, and bh_cache buffers
1036 * get BH_Lock set before the stripe lock is released.
1040 static void handle_stripe(struct stripe_head
*sh
, struct page
*tmp_page
)
1042 raid6_conf_t
*conf
= sh
->raid_conf
;
1043 int disks
= conf
->raid_disks
;
1044 struct bio
*return_bi
= NULL
;
1048 int locked
=0, uptodate
=0, to_read
=0, to_write
=0, failed
=0, written
=0;
1049 int non_overwrite
= 0;
1050 int failed_num
[2] = {0, 0};
1051 struct r5dev
*dev
, *pdev
, *qdev
;
1052 int pd_idx
= sh
->pd_idx
;
1053 int qd_idx
= raid6_next_disk(pd_idx
, disks
);
1054 int p_failed
, q_failed
;
1056 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1057 (unsigned long long)sh
->sector
, sh
->state
, atomic_read(&sh
->count
),
1060 spin_lock(&sh
->lock
);
1061 clear_bit(STRIPE_HANDLE
, &sh
->state
);
1062 clear_bit(STRIPE_DELAYED
, &sh
->state
);
1064 syncing
= test_bit(STRIPE_SYNCING
, &sh
->state
);
1065 /* Now to look around and see what can be done */
1068 for (i
=disks
; i
--; ) {
1071 clear_bit(R5_Insync
, &dev
->flags
);
1073 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1074 i
, dev
->flags
, dev
->toread
, dev
->towrite
, dev
->written
);
1075 /* maybe we can reply to a read */
1076 if (test_bit(R5_UPTODATE
, &dev
->flags
) && dev
->toread
) {
1077 struct bio
*rbi
, *rbi2
;
1078 PRINTK("Return read for disc %d\n", i
);
1079 spin_lock_irq(&conf
->device_lock
);
1082 if (test_and_clear_bit(R5_Overlap
, &dev
->flags
))
1083 wake_up(&conf
->wait_for_overlap
);
1084 spin_unlock_irq(&conf
->device_lock
);
1085 while (rbi
&& rbi
->bi_sector
< dev
->sector
+ STRIPE_SECTORS
) {
1086 copy_data(0, rbi
, dev
->page
, dev
->sector
);
1087 rbi2
= r5_next_bio(rbi
, dev
->sector
);
1088 spin_lock_irq(&conf
->device_lock
);
1089 if (--rbi
->bi_phys_segments
== 0) {
1090 rbi
->bi_next
= return_bi
;
1093 spin_unlock_irq(&conf
->device_lock
);
1098 /* now count some things */
1099 if (test_bit(R5_LOCKED
, &dev
->flags
)) locked
++;
1100 if (test_bit(R5_UPTODATE
, &dev
->flags
)) uptodate
++;
1103 if (dev
->toread
) to_read
++;
1106 if (!test_bit(R5_OVERWRITE
, &dev
->flags
))
1109 if (dev
->written
) written
++;
1110 rdev
= rcu_dereference(conf
->disks
[i
].rdev
);
1111 if (!rdev
|| !test_bit(In_sync
, &rdev
->flags
)) {
1112 /* The ReadError flag will just be confusing now */
1113 clear_bit(R5_ReadError
, &dev
->flags
);
1114 clear_bit(R5_ReWrite
, &dev
->flags
);
1116 if (!rdev
|| !test_bit(In_sync
, &rdev
->flags
)
1117 || test_bit(R5_ReadError
, &dev
->flags
)) {
1119 failed_num
[failed
] = i
;
1122 set_bit(R5_Insync
, &dev
->flags
);
1125 PRINTK("locked=%d uptodate=%d to_read=%d"
1126 " to_write=%d failed=%d failed_num=%d,%d\n",
1127 locked
, uptodate
, to_read
, to_write
, failed
,
1128 failed_num
[0], failed_num
[1]);
1129 /* check if the array has lost >2 devices and, if so, some requests might
1132 if (failed
> 2 && to_read
+to_write
+written
) {
1133 for (i
=disks
; i
--; ) {
1136 if (test_bit(R5_ReadError
, &sh
->dev
[i
].flags
)) {
1139 rdev
= rcu_dereference(conf
->disks
[i
].rdev
);
1140 if (rdev
&& test_bit(In_sync
, &rdev
->flags
))
1141 /* multiple read failures in one stripe */
1142 md_error(conf
->mddev
, rdev
);
1146 spin_lock_irq(&conf
->device_lock
);
1147 /* fail all writes first */
1148 bi
= sh
->dev
[i
].towrite
;
1149 sh
->dev
[i
].towrite
= NULL
;
1150 if (bi
) { to_write
--; bitmap_end
= 1; }
1152 if (test_and_clear_bit(R5_Overlap
, &sh
->dev
[i
].flags
))
1153 wake_up(&conf
->wait_for_overlap
);
1155 while (bi
&& bi
->bi_sector
< sh
->dev
[i
].sector
+ STRIPE_SECTORS
){
1156 struct bio
*nextbi
= r5_next_bio(bi
, sh
->dev
[i
].sector
);
1157 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
1158 if (--bi
->bi_phys_segments
== 0) {
1159 md_write_end(conf
->mddev
);
1160 bi
->bi_next
= return_bi
;
1165 /* and fail all 'written' */
1166 bi
= sh
->dev
[i
].written
;
1167 sh
->dev
[i
].written
= NULL
;
1168 if (bi
) bitmap_end
= 1;
1169 while (bi
&& bi
->bi_sector
< sh
->dev
[i
].sector
+ STRIPE_SECTORS
) {
1170 struct bio
*bi2
= r5_next_bio(bi
, sh
->dev
[i
].sector
);
1171 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
1172 if (--bi
->bi_phys_segments
== 0) {
1173 md_write_end(conf
->mddev
);
1174 bi
->bi_next
= return_bi
;
1180 /* fail any reads if this device is non-operational */
1181 if (!test_bit(R5_Insync
, &sh
->dev
[i
].flags
) ||
1182 test_bit(R5_ReadError
, &sh
->dev
[i
].flags
)) {
1183 bi
= sh
->dev
[i
].toread
;
1184 sh
->dev
[i
].toread
= NULL
;
1185 if (test_and_clear_bit(R5_Overlap
, &sh
->dev
[i
].flags
))
1186 wake_up(&conf
->wait_for_overlap
);
1188 while (bi
&& bi
->bi_sector
< sh
->dev
[i
].sector
+ STRIPE_SECTORS
){
1189 struct bio
*nextbi
= r5_next_bio(bi
, sh
->dev
[i
].sector
);
1190 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
1191 if (--bi
->bi_phys_segments
== 0) {
1192 bi
->bi_next
= return_bi
;
1198 spin_unlock_irq(&conf
->device_lock
);
1200 bitmap_endwrite(conf
->mddev
->bitmap
, sh
->sector
,
1201 STRIPE_SECTORS
, 0, 0);
1204 if (failed
> 2 && syncing
) {
1205 md_done_sync(conf
->mddev
, STRIPE_SECTORS
,0);
1206 clear_bit(STRIPE_SYNCING
, &sh
->state
);
1211 * might be able to return some write requests if the parity blocks
1212 * are safe, or on a failed drive
1214 pdev
= &sh
->dev
[pd_idx
];
1215 p_failed
= (failed
>= 1 && failed_num
[0] == pd_idx
)
1216 || (failed
>= 2 && failed_num
[1] == pd_idx
);
1217 qdev
= &sh
->dev
[qd_idx
];
1218 q_failed
= (failed
>= 1 && failed_num
[0] == qd_idx
)
1219 || (failed
>= 2 && failed_num
[1] == qd_idx
);
1222 ( p_failed
|| ((test_bit(R5_Insync
, &pdev
->flags
)
1223 && !test_bit(R5_LOCKED
, &pdev
->flags
)
1224 && test_bit(R5_UPTODATE
, &pdev
->flags
))) ) &&
1225 ( q_failed
|| ((test_bit(R5_Insync
, &qdev
->flags
)
1226 && !test_bit(R5_LOCKED
, &qdev
->flags
)
1227 && test_bit(R5_UPTODATE
, &qdev
->flags
))) ) ) {
1228 /* any written block on an uptodate or failed drive can be
1229 * returned. Note that if we 'wrote' to a failed drive,
1230 * it will be UPTODATE, but never LOCKED, so we don't need
1231 * to test 'failed' directly.
1233 for (i
=disks
; i
--; )
1234 if (sh
->dev
[i
].written
) {
1236 if (!test_bit(R5_LOCKED
, &dev
->flags
) &&
1237 test_bit(R5_UPTODATE
, &dev
->flags
) ) {
1238 /* We can return any write requests */
1240 struct bio
*wbi
, *wbi2
;
1241 PRINTK("Return write for stripe %llu disc %d\n",
1242 (unsigned long long)sh
->sector
, i
);
1243 spin_lock_irq(&conf
->device_lock
);
1245 dev
->written
= NULL
;
1246 while (wbi
&& wbi
->bi_sector
< dev
->sector
+ STRIPE_SECTORS
) {
1247 wbi2
= r5_next_bio(wbi
, dev
->sector
);
1248 if (--wbi
->bi_phys_segments
== 0) {
1249 md_write_end(conf
->mddev
);
1250 wbi
->bi_next
= return_bi
;
1255 if (dev
->towrite
== NULL
)
1257 spin_unlock_irq(&conf
->device_lock
);
1259 bitmap_endwrite(conf
->mddev
->bitmap
, sh
->sector
,
1261 !test_bit(STRIPE_DEGRADED
, &sh
->state
), 0);
1266 /* Now we might consider reading some blocks, either to check/generate
1267 * parity, or to satisfy requests
1268 * or to load a block that is being partially written.
1270 if (to_read
|| non_overwrite
|| (to_write
&& failed
) || (syncing
&& (uptodate
< disks
))) {
1271 for (i
=disks
; i
--;) {
1273 if (!test_bit(R5_LOCKED
, &dev
->flags
) && !test_bit(R5_UPTODATE
, &dev
->flags
) &&
1275 (dev
->towrite
&& !test_bit(R5_OVERWRITE
, &dev
->flags
)) ||
1277 (failed
>= 1 && (sh
->dev
[failed_num
[0]].toread
|| to_write
)) ||
1278 (failed
>= 2 && (sh
->dev
[failed_num
[1]].toread
|| to_write
))
1281 /* we would like to get this block, possibly
1282 * by computing it, but we might not be able to
1284 if (uptodate
== disks
-1) {
1285 PRINTK("Computing stripe %llu block %d\n",
1286 (unsigned long long)sh
->sector
, i
);
1287 compute_block_1(sh
, i
, 0);
1289 } else if ( uptodate
== disks
-2 && failed
>= 2 ) {
1290 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
1292 for (other
=disks
; other
--;) {
1295 if ( !test_bit(R5_UPTODATE
, &sh
->dev
[other
].flags
) )
1299 PRINTK("Computing stripe %llu blocks %d,%d\n",
1300 (unsigned long long)sh
->sector
, i
, other
);
1301 compute_block_2(sh
, i
, other
);
1303 } else if (test_bit(R5_Insync
, &dev
->flags
)) {
1304 set_bit(R5_LOCKED
, &dev
->flags
);
1305 set_bit(R5_Wantread
, &dev
->flags
);
1307 /* if I am just reading this block and we don't have
1308 a failed drive, or any pending writes then sidestep the cache */
1309 if (sh
->bh_read
[i
] && !sh
->bh_read
[i
]->b_reqnext
&&
1310 ! syncing
&& !failed
&& !to_write
) {
1311 sh
->bh_cache
[i
]->b_page
= sh
->bh_read
[i
]->b_page
;
1312 sh
->bh_cache
[i
]->b_data
= sh
->bh_read
[i
]->b_data
;
1316 PRINTK("Reading block %d (sync=%d)\n",
1321 set_bit(STRIPE_HANDLE
, &sh
->state
);
1324 /* now to consider writing and what else, if anything should be read */
1326 int rcw
=0, must_compute
=0;
1327 for (i
=disks
; i
--;) {
1329 /* Would I have to read this buffer for reconstruct_write */
1330 if (!test_bit(R5_OVERWRITE
, &dev
->flags
)
1331 && i
!= pd_idx
&& i
!= qd_idx
1332 && (!test_bit(R5_LOCKED
, &dev
->flags
)
1334 || sh
->bh_page
[i
] != bh
->b_page
1337 !test_bit(R5_UPTODATE
, &dev
->flags
)) {
1338 if (test_bit(R5_Insync
, &dev
->flags
)) rcw
++;
1340 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i
, dev
->flags
);
1345 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
1346 (unsigned long long)sh
->sector
, rcw
, must_compute
);
1347 set_bit(STRIPE_HANDLE
, &sh
->state
);
1350 /* want reconstruct write, but need to get some data */
1351 for (i
=disks
; i
--;) {
1353 if (!test_bit(R5_OVERWRITE
, &dev
->flags
)
1354 && !(failed
== 0 && (i
== pd_idx
|| i
== qd_idx
))
1355 && !test_bit(R5_LOCKED
, &dev
->flags
) && !test_bit(R5_UPTODATE
, &dev
->flags
) &&
1356 test_bit(R5_Insync
, &dev
->flags
)) {
1357 if (test_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
))
1359 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
1360 (unsigned long long)sh
->sector
, i
);
1361 set_bit(R5_LOCKED
, &dev
->flags
);
1362 set_bit(R5_Wantread
, &dev
->flags
);
1365 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
1366 (unsigned long long)sh
->sector
, i
);
1367 set_bit(STRIPE_DELAYED
, &sh
->state
);
1368 set_bit(STRIPE_HANDLE
, &sh
->state
);
1372 /* now if nothing is locked, and if we have enough data, we can start a write request */
1373 if (locked
== 0 && rcw
== 0 &&
1374 !test_bit(STRIPE_BIT_DELAY
, &sh
->state
)) {
1375 if ( must_compute
> 0 ) {
1376 /* We have failed blocks and need to compute them */
1379 case 1: compute_block_1(sh
, failed_num
[0], 0); break;
1380 case 2: compute_block_2(sh
, failed_num
[0], failed_num
[1]); break;
1381 default: BUG(); /* This request should have been failed? */
1385 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh
->sector
);
1386 compute_parity(sh
, RECONSTRUCT_WRITE
);
1387 /* now every locked buffer is ready to be written */
1389 if (test_bit(R5_LOCKED
, &sh
->dev
[i
].flags
)) {
1390 PRINTK("Writing stripe %llu block %d\n",
1391 (unsigned long long)sh
->sector
, i
);
1393 set_bit(R5_Wantwrite
, &sh
->dev
[i
].flags
);
1395 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1396 set_bit(STRIPE_INSYNC
, &sh
->state
);
1398 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
)) {
1399 atomic_dec(&conf
->preread_active_stripes
);
1400 if (atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
)
1401 md_wakeup_thread(conf
->mddev
->thread
);
1406 /* maybe we need to check and possibly fix the parity for this stripe
1407 * Any reads will already have been scheduled, so we just see if enough data
1410 if (syncing
&& locked
== 0 && !test_bit(STRIPE_INSYNC
, &sh
->state
)) {
1411 int update_p
= 0, update_q
= 0;
1414 set_bit(STRIPE_HANDLE
, &sh
->state
);
1417 BUG_ON(uptodate
< disks
);
1418 /* Want to check and possibly repair P and Q.
1419 * However there could be one 'failed' device, in which
1420 * case we can only check one of them, possibly using the
1421 * other to generate missing data
1424 /* If !tmp_page, we cannot do the calculations,
1425 * but as we have set STRIPE_HANDLE, we will soon be called
1426 * by stripe_handle with a tmp_page - just wait until then.
1429 if (failed
== q_failed
) {
1430 /* The only possible failed device holds 'Q', so it makes
1431 * sense to check P (If anything else were failed, we would
1432 * have used P to recreate it).
1434 compute_block_1(sh
, pd_idx
, 1);
1435 if (!page_is_zero(sh
->dev
[pd_idx
].page
)) {
1436 compute_block_1(sh
,pd_idx
,0);
1440 if (!q_failed
&& failed
< 2) {
1441 /* q is not failed, and we didn't use it to generate
1442 * anything, so it makes sense to check it
1444 memcpy(page_address(tmp_page
),
1445 page_address(sh
->dev
[qd_idx
].page
),
1447 compute_parity(sh
, UPDATE_PARITY
);
1448 if (memcmp(page_address(tmp_page
),
1449 page_address(sh
->dev
[qd_idx
].page
),
1451 clear_bit(STRIPE_INSYNC
, &sh
->state
);
1455 if (update_p
|| update_q
) {
1456 conf
->mddev
->resync_mismatches
+= STRIPE_SECTORS
;
1457 if (test_bit(MD_RECOVERY_CHECK
, &conf
->mddev
->recovery
))
1458 /* don't try to repair!! */
1459 update_p
= update_q
= 0;
1462 /* now write out any block on a failed drive,
1463 * or P or Q if they need it
1467 dev
= &sh
->dev
[failed_num
[1]];
1469 set_bit(R5_LOCKED
, &dev
->flags
);
1470 set_bit(R5_Wantwrite
, &dev
->flags
);
1473 dev
= &sh
->dev
[failed_num
[0]];
1475 set_bit(R5_LOCKED
, &dev
->flags
);
1476 set_bit(R5_Wantwrite
, &dev
->flags
);
1480 dev
= &sh
->dev
[pd_idx
];
1482 set_bit(R5_LOCKED
, &dev
->flags
);
1483 set_bit(R5_Wantwrite
, &dev
->flags
);
1486 dev
= &sh
->dev
[qd_idx
];
1488 set_bit(R5_LOCKED
, &dev
->flags
);
1489 set_bit(R5_Wantwrite
, &dev
->flags
);
1491 clear_bit(STRIPE_DEGRADED
, &sh
->state
);
1493 set_bit(STRIPE_INSYNC
, &sh
->state
);
1497 if (syncing
&& locked
== 0 && test_bit(STRIPE_INSYNC
, &sh
->state
)) {
1498 md_done_sync(conf
->mddev
, STRIPE_SECTORS
,1);
1499 clear_bit(STRIPE_SYNCING
, &sh
->state
);
1502 /* If the failed drives are just a ReadError, then we might need
1503 * to progress the repair/check process
1505 if (failed
<= 2 && ! conf
->mddev
->ro
)
1506 for (i
=0; i
<failed
;i
++) {
1507 dev
= &sh
->dev
[failed_num
[i
]];
1508 if (test_bit(R5_ReadError
, &dev
->flags
)
1509 && !test_bit(R5_LOCKED
, &dev
->flags
)
1510 && test_bit(R5_UPTODATE
, &dev
->flags
)
1512 if (!test_bit(R5_ReWrite
, &dev
->flags
)) {
1513 set_bit(R5_Wantwrite
, &dev
->flags
);
1514 set_bit(R5_ReWrite
, &dev
->flags
);
1515 set_bit(R5_LOCKED
, &dev
->flags
);
1517 /* let's read it back */
1518 set_bit(R5_Wantread
, &dev
->flags
);
1519 set_bit(R5_LOCKED
, &dev
->flags
);
1523 spin_unlock(&sh
->lock
);
1525 while ((bi
=return_bi
)) {
1526 int bytes
= bi
->bi_size
;
1528 return_bi
= bi
->bi_next
;
1531 bi
->bi_end_io(bi
, bytes
, 0);
1533 for (i
=disks
; i
-- ;) {
1537 if (test_and_clear_bit(R5_Wantwrite
, &sh
->dev
[i
].flags
))
1539 else if (test_and_clear_bit(R5_Wantread
, &sh
->dev
[i
].flags
))
1544 bi
= &sh
->dev
[i
].req
;
1548 bi
->bi_end_io
= raid6_end_write_request
;
1550 bi
->bi_end_io
= raid6_end_read_request
;
1553 rdev
= rcu_dereference(conf
->disks
[i
].rdev
);
1554 if (rdev
&& test_bit(Faulty
, &rdev
->flags
))
1557 atomic_inc(&rdev
->nr_pending
);
1562 md_sync_acct(rdev
->bdev
, STRIPE_SECTORS
);
1564 bi
->bi_bdev
= rdev
->bdev
;
1565 PRINTK("for %llu schedule op %ld on disc %d\n",
1566 (unsigned long long)sh
->sector
, bi
->bi_rw
, i
);
1567 atomic_inc(&sh
->count
);
1568 bi
->bi_sector
= sh
->sector
+ rdev
->data_offset
;
1569 bi
->bi_flags
= 1 << BIO_UPTODATE
;
1571 bi
->bi_max_vecs
= 1;
1573 bi
->bi_io_vec
= &sh
->dev
[i
].vec
;
1574 bi
->bi_io_vec
[0].bv_len
= STRIPE_SIZE
;
1575 bi
->bi_io_vec
[0].bv_offset
= 0;
1576 bi
->bi_size
= STRIPE_SIZE
;
1579 test_bit(R5_ReWrite
, &sh
->dev
[i
].flags
))
1580 atomic_add(STRIPE_SECTORS
, &rdev
->corrected_errors
);
1581 generic_make_request(bi
);
1584 set_bit(STRIPE_DEGRADED
, &sh
->state
);
1585 PRINTK("skip op %ld on disc %d for sector %llu\n",
1586 bi
->bi_rw
, i
, (unsigned long long)sh
->sector
);
1587 clear_bit(R5_LOCKED
, &sh
->dev
[i
].flags
);
1588 set_bit(STRIPE_HANDLE
, &sh
->state
);
1593 static void raid6_activate_delayed(raid6_conf_t
*conf
)
1595 if (atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
) {
1596 while (!list_empty(&conf
->delayed_list
)) {
1597 struct list_head
*l
= conf
->delayed_list
.next
;
1598 struct stripe_head
*sh
;
1599 sh
= list_entry(l
, struct stripe_head
, lru
);
1601 clear_bit(STRIPE_DELAYED
, &sh
->state
);
1602 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
))
1603 atomic_inc(&conf
->preread_active_stripes
);
1604 list_add_tail(&sh
->lru
, &conf
->handle_list
);
1609 static void activate_bit_delay(raid6_conf_t
*conf
)
1611 /* device_lock is held */
1612 struct list_head head
;
1613 list_add(&head
, &conf
->bitmap_list
);
1614 list_del_init(&conf
->bitmap_list
);
1615 while (!list_empty(&head
)) {
1616 struct stripe_head
*sh
= list_entry(head
.next
, struct stripe_head
, lru
);
1617 list_del_init(&sh
->lru
);
1618 atomic_inc(&sh
->count
);
1619 __release_stripe(conf
, sh
);
1623 static void unplug_slaves(mddev_t
*mddev
)
1625 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1629 for (i
=0; i
<mddev
->raid_disks
; i
++) {
1630 mdk_rdev_t
*rdev
= rcu_dereference(conf
->disks
[i
].rdev
);
1631 if (rdev
&& !test_bit(Faulty
, &rdev
->flags
) && atomic_read(&rdev
->nr_pending
)) {
1632 request_queue_t
*r_queue
= bdev_get_queue(rdev
->bdev
);
1634 atomic_inc(&rdev
->nr_pending
);
1637 if (r_queue
->unplug_fn
)
1638 r_queue
->unplug_fn(r_queue
);
1640 rdev_dec_pending(rdev
, mddev
);
1647 static void raid6_unplug_device(request_queue_t
*q
)
1649 mddev_t
*mddev
= q
->queuedata
;
1650 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1651 unsigned long flags
;
1653 spin_lock_irqsave(&conf
->device_lock
, flags
);
1655 if (blk_remove_plug(q
)) {
1657 raid6_activate_delayed(conf
);
1659 md_wakeup_thread(mddev
->thread
);
1661 spin_unlock_irqrestore(&conf
->device_lock
, flags
);
1663 unplug_slaves(mddev
);
1666 static int raid6_issue_flush(request_queue_t
*q
, struct gendisk
*disk
,
1667 sector_t
*error_sector
)
1669 mddev_t
*mddev
= q
->queuedata
;
1670 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1674 for (i
=0; i
<mddev
->raid_disks
&& ret
== 0; i
++) {
1675 mdk_rdev_t
*rdev
= rcu_dereference(conf
->disks
[i
].rdev
);
1676 if (rdev
&& !test_bit(Faulty
, &rdev
->flags
)) {
1677 struct block_device
*bdev
= rdev
->bdev
;
1678 request_queue_t
*r_queue
= bdev_get_queue(bdev
);
1680 if (!r_queue
->issue_flush_fn
)
1683 atomic_inc(&rdev
->nr_pending
);
1685 ret
= r_queue
->issue_flush_fn(r_queue
, bdev
->bd_disk
,
1687 rdev_dec_pending(rdev
, mddev
);
1696 static inline void raid6_plug_device(raid6_conf_t
*conf
)
1698 spin_lock_irq(&conf
->device_lock
);
1699 blk_plug_device(conf
->mddev
->queue
);
1700 spin_unlock_irq(&conf
->device_lock
);
1703 static int make_request (request_queue_t
*q
, struct bio
* bi
)
1705 mddev_t
*mddev
= q
->queuedata
;
1706 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1707 const unsigned int raid_disks
= conf
->raid_disks
;
1708 const unsigned int data_disks
= raid_disks
- 2;
1709 unsigned int dd_idx
, pd_idx
;
1710 sector_t new_sector
;
1711 sector_t logical_sector
, last_sector
;
1712 struct stripe_head
*sh
;
1713 const int rw
= bio_data_dir(bi
);
1715 if (unlikely(bio_barrier(bi
))) {
1716 bio_endio(bi
, bi
->bi_size
, -EOPNOTSUPP
);
1720 md_write_start(mddev
, bi
);
1722 disk_stat_inc(mddev
->gendisk
, ios
[rw
]);
1723 disk_stat_add(mddev
->gendisk
, sectors
[rw
], bio_sectors(bi
));
1725 logical_sector
= bi
->bi_sector
& ~((sector_t
)STRIPE_SECTORS
-1);
1726 last_sector
= bi
->bi_sector
+ (bi
->bi_size
>>9);
1729 bi
->bi_phys_segments
= 1; /* over-loaded to count active stripes */
1731 for (;logical_sector
< last_sector
; logical_sector
+= STRIPE_SECTORS
) {
1734 new_sector
= raid6_compute_sector(logical_sector
,
1735 raid_disks
, data_disks
, &dd_idx
, &pd_idx
, conf
);
1737 PRINTK("raid6: make_request, sector %llu logical %llu\n",
1738 (unsigned long long)new_sector
,
1739 (unsigned long long)logical_sector
);
1742 prepare_to_wait(&conf
->wait_for_overlap
, &w
, TASK_UNINTERRUPTIBLE
);
1743 sh
= get_active_stripe(conf
, new_sector
, pd_idx
, (bi
->bi_rw
&RWA_MASK
));
1745 if (!add_stripe_bio(sh
, bi
, dd_idx
, (bi
->bi_rw
&RW_MASK
))) {
1746 /* Add failed due to overlap. Flush everything
1749 raid6_unplug_device(mddev
->queue
);
1754 finish_wait(&conf
->wait_for_overlap
, &w
);
1755 raid6_plug_device(conf
);
1756 handle_stripe(sh
, NULL
);
1759 /* cannot get stripe for read-ahead, just give-up */
1760 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
1761 finish_wait(&conf
->wait_for_overlap
, &w
);
1766 spin_lock_irq(&conf
->device_lock
);
1767 if (--bi
->bi_phys_segments
== 0) {
1768 int bytes
= bi
->bi_size
;
1771 md_write_end(mddev
);
1773 bi
->bi_end_io(bi
, bytes
, 0);
1775 spin_unlock_irq(&conf
->device_lock
);
1779 /* FIXME go_faster isn't used */
1780 static sector_t
sync_request(mddev_t
*mddev
, sector_t sector_nr
, int *skipped
, int go_faster
)
1782 raid6_conf_t
*conf
= (raid6_conf_t
*) mddev
->private;
1783 struct stripe_head
*sh
;
1784 int sectors_per_chunk
= conf
->chunk_size
>> 9;
1786 unsigned long stripe
;
1789 sector_t first_sector
;
1790 int raid_disks
= conf
->raid_disks
;
1791 int data_disks
= raid_disks
- 2;
1792 sector_t max_sector
= mddev
->size
<< 1;
1794 int still_degraded
= 0;
1797 if (sector_nr
>= max_sector
) {
1798 /* just being told to finish up .. nothing much to do */
1799 unplug_slaves(mddev
);
1801 if (mddev
->curr_resync
< max_sector
) /* aborted */
1802 bitmap_end_sync(mddev
->bitmap
, mddev
->curr_resync
,
1804 else /* completed sync */
1806 bitmap_close_sync(mddev
->bitmap
);
1810 /* if there are 2 or more failed drives and we are trying
1811 * to resync, then assert that we are finished, because there is
1812 * nothing we can do.
1814 if (mddev
->degraded
>= 2 && test_bit(MD_RECOVERY_SYNC
, &mddev
->recovery
)) {
1815 sector_t rv
= (mddev
->size
<< 1) - sector_nr
;
1819 if (!bitmap_start_sync(mddev
->bitmap
, sector_nr
, &sync_blocks
, 1) &&
1820 !test_bit(MD_RECOVERY_REQUESTED
, &mddev
->recovery
) &&
1821 !conf
->fullsync
&& sync_blocks
>= STRIPE_SECTORS
) {
1822 /* we can skip this block, and probably more */
1823 sync_blocks
/= STRIPE_SECTORS
;
1825 return sync_blocks
* STRIPE_SECTORS
; /* keep things rounded to whole stripes */
1829 chunk_offset
= sector_div(x
, sectors_per_chunk
);
1831 BUG_ON(x
!= stripe
);
1833 first_sector
= raid6_compute_sector((sector_t
)stripe
*data_disks
*sectors_per_chunk
1834 + chunk_offset
, raid_disks
, data_disks
, &dd_idx
, &pd_idx
, conf
);
1835 sh
= get_active_stripe(conf
, sector_nr
, pd_idx
, 1);
1837 sh
= get_active_stripe(conf
, sector_nr
, pd_idx
, 0);
1838 /* make sure we don't swamp the stripe cache if someone else
1839 * is trying to get access
1841 schedule_timeout_uninterruptible(1);
1843 /* Need to check if array will still be degraded after recovery/resync
1844 * We don't need to check the 'failed' flag as when that gets set,
1847 for (i
=0; i
<mddev
->raid_disks
; i
++)
1848 if (conf
->disks
[i
].rdev
== NULL
)
1851 bitmap_start_sync(mddev
->bitmap
, sector_nr
, &sync_blocks
, still_degraded
);
1853 spin_lock(&sh
->lock
);
1854 set_bit(STRIPE_SYNCING
, &sh
->state
);
1855 clear_bit(STRIPE_INSYNC
, &sh
->state
);
1856 spin_unlock(&sh
->lock
);
1858 handle_stripe(sh
, NULL
);
1861 return STRIPE_SECTORS
;
1865 * This is our raid6 kernel thread.
1867 * We scan the hash table for stripes which can be handled now.
1868 * During the scan, completed stripes are saved for us by the interrupt
1869 * handler, so that they will not have to wait for our next wakeup.
1871 static void raid6d (mddev_t
*mddev
)
1873 struct stripe_head
*sh
;
1874 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1877 PRINTK("+++ raid6d active\n");
1879 md_check_recovery(mddev
);
1882 spin_lock_irq(&conf
->device_lock
);
1884 struct list_head
*first
;
1886 if (conf
->seq_flush
- conf
->seq_write
> 0) {
1887 int seq
= conf
->seq_flush
;
1888 spin_unlock_irq(&conf
->device_lock
);
1889 bitmap_unplug(mddev
->bitmap
);
1890 spin_lock_irq(&conf
->device_lock
);
1891 conf
->seq_write
= seq
;
1892 activate_bit_delay(conf
);
1895 if (list_empty(&conf
->handle_list
) &&
1896 atomic_read(&conf
->preread_active_stripes
) < IO_THRESHOLD
&&
1897 !blk_queue_plugged(mddev
->queue
) &&
1898 !list_empty(&conf
->delayed_list
))
1899 raid6_activate_delayed(conf
);
1901 if (list_empty(&conf
->handle_list
))
1904 first
= conf
->handle_list
.next
;
1905 sh
= list_entry(first
, struct stripe_head
, lru
);
1907 list_del_init(first
);
1908 atomic_inc(&sh
->count
);
1909 if (atomic_read(&sh
->count
)!= 1)
1911 spin_unlock_irq(&conf
->device_lock
);
1914 handle_stripe(sh
, conf
->spare_page
);
1917 spin_lock_irq(&conf
->device_lock
);
1919 PRINTK("%d stripes handled\n", handled
);
1921 spin_unlock_irq(&conf
->device_lock
);
1923 unplug_slaves(mddev
);
1925 PRINTK("--- raid6d inactive\n");
1929 raid6_show_stripe_cache_size(mddev_t
*mddev
, char *page
)
1931 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1933 return sprintf(page
, "%d\n", conf
->max_nr_stripes
);
1939 raid6_store_stripe_cache_size(mddev_t
*mddev
, const char *page
, size_t len
)
1941 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1944 if (len
>= PAGE_SIZE
)
1949 new = simple_strtoul(page
, &end
, 10);
1950 if (!*page
|| (*end
&& *end
!= '\n') )
1952 if (new <= 16 || new > 32768)
1954 while (new < conf
->max_nr_stripes
) {
1955 if (drop_one_stripe(conf
))
1956 conf
->max_nr_stripes
--;
1960 while (new > conf
->max_nr_stripes
) {
1961 if (grow_one_stripe(conf
))
1962 conf
->max_nr_stripes
++;
1968 static struct md_sysfs_entry
1969 raid6_stripecache_size
= __ATTR(stripe_cache_size
, S_IRUGO
| S_IWUSR
,
1970 raid6_show_stripe_cache_size
,
1971 raid6_store_stripe_cache_size
);
1974 stripe_cache_active_show(mddev_t
*mddev
, char *page
)
1976 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
1978 return sprintf(page
, "%d\n", atomic_read(&conf
->active_stripes
));
1983 static struct md_sysfs_entry
1984 raid6_stripecache_active
= __ATTR_RO(stripe_cache_active
);
1986 static struct attribute
*raid6_attrs
[] = {
1987 &raid6_stripecache_size
.attr
,
1988 &raid6_stripecache_active
.attr
,
1991 static struct attribute_group raid6_attrs_group
= {
1993 .attrs
= raid6_attrs
,
1996 static int run(mddev_t
*mddev
)
1999 int raid_disk
, memory
;
2001 struct disk_info
*disk
;
2002 struct list_head
*tmp
;
2004 if (mddev
->level
!= 6) {
2005 PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev
), mddev
->level
);
2009 mddev
->private = kzalloc(sizeof (raid6_conf_t
), GFP_KERNEL
);
2010 if ((conf
= mddev
->private) == NULL
)
2012 conf
->disks
= kzalloc(mddev
->raid_disks
* sizeof(struct disk_info
),
2017 conf
->mddev
= mddev
;
2019 if ((conf
->stripe_hashtbl
= kzalloc(PAGE_SIZE
, GFP_KERNEL
)) == NULL
)
2022 conf
->spare_page
= alloc_page(GFP_KERNEL
);
2023 if (!conf
->spare_page
)
2026 spin_lock_init(&conf
->device_lock
);
2027 init_waitqueue_head(&conf
->wait_for_stripe
);
2028 init_waitqueue_head(&conf
->wait_for_overlap
);
2029 INIT_LIST_HEAD(&conf
->handle_list
);
2030 INIT_LIST_HEAD(&conf
->delayed_list
);
2031 INIT_LIST_HEAD(&conf
->bitmap_list
);
2032 INIT_LIST_HEAD(&conf
->inactive_list
);
2033 atomic_set(&conf
->active_stripes
, 0);
2034 atomic_set(&conf
->preread_active_stripes
, 0);
2036 PRINTK("raid6: run(%s) called.\n", mdname(mddev
));
2038 ITERATE_RDEV(mddev
,rdev
,tmp
) {
2039 raid_disk
= rdev
->raid_disk
;
2040 if (raid_disk
>= mddev
->raid_disks
2043 disk
= conf
->disks
+ raid_disk
;
2047 if (test_bit(In_sync
, &rdev
->flags
)) {
2048 char b
[BDEVNAME_SIZE
];
2049 printk(KERN_INFO
"raid6: device %s operational as raid"
2050 " disk %d\n", bdevname(rdev
->bdev
,b
),
2052 conf
->working_disks
++;
2056 conf
->raid_disks
= mddev
->raid_disks
;
2059 * 0 for a fully functional array, 1 or 2 for a degraded array.
2061 mddev
->degraded
= conf
->failed_disks
= conf
->raid_disks
- conf
->working_disks
;
2062 conf
->mddev
= mddev
;
2063 conf
->chunk_size
= mddev
->chunk_size
;
2064 conf
->level
= mddev
->level
;
2065 conf
->algorithm
= mddev
->layout
;
2066 conf
->max_nr_stripes
= NR_STRIPES
;
2068 /* device size must be a multiple of chunk size */
2069 mddev
->size
&= ~(mddev
->chunk_size
/1024 -1);
2070 mddev
->resync_max_sectors
= mddev
->size
<< 1;
2072 if (conf
->raid_disks
< 4) {
2073 printk(KERN_ERR
"raid6: not enough configured devices for %s (%d, minimum 4)\n",
2074 mdname(mddev
), conf
->raid_disks
);
2077 if (!conf
->chunk_size
|| conf
->chunk_size
% 4) {
2078 printk(KERN_ERR
"raid6: invalid chunk size %d for %s\n",
2079 conf
->chunk_size
, mdname(mddev
));
2082 if (conf
->algorithm
> ALGORITHM_RIGHT_SYMMETRIC
) {
2084 "raid6: unsupported parity algorithm %d for %s\n",
2085 conf
->algorithm
, mdname(mddev
));
2088 if (mddev
->degraded
> 2) {
2089 printk(KERN_ERR
"raid6: not enough operational devices for %s"
2090 " (%d/%d failed)\n",
2091 mdname(mddev
), conf
->failed_disks
, conf
->raid_disks
);
2095 if (mddev
->degraded
> 0 &&
2096 mddev
->recovery_cp
!= MaxSector
) {
2097 if (mddev
->ok_start_degraded
)
2098 printk(KERN_WARNING
"raid6: starting dirty degraded array:%s"
2099 "- data corruption possible.\n",
2102 printk(KERN_ERR
"raid6: cannot start dirty degraded array"
2103 " for %s\n", mdname(mddev
));
2109 mddev
->thread
= md_register_thread(raid6d
, mddev
, "%s_raid6");
2110 if (!mddev
->thread
) {
2112 "raid6: couldn't allocate thread for %s\n",
2118 memory
= conf
->max_nr_stripes
* (sizeof(struct stripe_head
) +
2119 conf
->raid_disks
* ((sizeof(struct bio
) + PAGE_SIZE
))) / 1024;
2120 if (grow_stripes(conf
, conf
->max_nr_stripes
)) {
2122 "raid6: couldn't allocate %dkB for buffers\n", memory
);
2123 shrink_stripes(conf
);
2124 md_unregister_thread(mddev
->thread
);
2127 printk(KERN_INFO
"raid6: allocated %dkB for %s\n",
2128 memory
, mdname(mddev
));
2130 if (mddev
->degraded
== 0)
2131 printk(KERN_INFO
"raid6: raid level %d set %s active with %d out of %d"
2132 " devices, algorithm %d\n", conf
->level
, mdname(mddev
),
2133 mddev
->raid_disks
-mddev
->degraded
, mddev
->raid_disks
,
2136 printk(KERN_ALERT
"raid6: raid level %d set %s active with %d"
2137 " out of %d devices, algorithm %d\n", conf
->level
,
2138 mdname(mddev
), mddev
->raid_disks
- mddev
->degraded
,
2139 mddev
->raid_disks
, conf
->algorithm
);
2141 print_raid6_conf(conf
);
2143 /* read-ahead size must cover two whole stripes, which is
2144 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
2147 int stripe
= (mddev
->raid_disks
-2) * mddev
->chunk_size
2149 if (mddev
->queue
->backing_dev_info
.ra_pages
< 2 * stripe
)
2150 mddev
->queue
->backing_dev_info
.ra_pages
= 2 * stripe
;
2153 /* Ok, everything is just fine now */
2154 sysfs_create_group(&mddev
->kobj
, &raid6_attrs_group
);
2156 mddev
->array_size
= mddev
->size
* (mddev
->raid_disks
- 2);
2158 mddev
->queue
->unplug_fn
= raid6_unplug_device
;
2159 mddev
->queue
->issue_flush_fn
= raid6_issue_flush
;
2163 print_raid6_conf(conf
);
2164 safe_put_page(conf
->spare_page
);
2165 kfree(conf
->stripe_hashtbl
);
2169 mddev
->private = NULL
;
2170 printk(KERN_ALERT
"raid6: failed to run raid set %s\n", mdname(mddev
));
2176 static int stop (mddev_t
*mddev
)
2178 raid6_conf_t
*conf
= (raid6_conf_t
*) mddev
->private;
2180 md_unregister_thread(mddev
->thread
);
2181 mddev
->thread
= NULL
;
2182 shrink_stripes(conf
);
2183 kfree(conf
->stripe_hashtbl
);
2184 blk_sync_queue(mddev
->queue
); /* the unplug fn references 'conf'*/
2185 sysfs_remove_group(&mddev
->kobj
, &raid6_attrs_group
);
2187 mddev
->private = NULL
;
2192 static void print_sh (struct seq_file
*seq
, struct stripe_head
*sh
)
2196 seq_printf(seq
, "sh %llu, pd_idx %d, state %ld.\n",
2197 (unsigned long long)sh
->sector
, sh
->pd_idx
, sh
->state
);
2198 seq_printf(seq
, "sh %llu, count %d.\n",
2199 (unsigned long long)sh
->sector
, atomic_read(&sh
->count
));
2200 seq_printf(seq
, "sh %llu, ", (unsigned long long)sh
->sector
);
2201 for (i
= 0; i
< sh
->raid_conf
->raid_disks
; i
++) {
2202 seq_printf(seq
, "(cache%d: %p %ld) ",
2203 i
, sh
->dev
[i
].page
, sh
->dev
[i
].flags
);
2205 seq_printf(seq
, "\n");
2208 static void printall (struct seq_file
*seq
, raid6_conf_t
*conf
)
2210 struct stripe_head
*sh
;
2211 struct hlist_node
*hn
;
2214 spin_lock_irq(&conf
->device_lock
);
2215 for (i
= 0; i
< NR_HASH
; i
++) {
2216 sh
= conf
->stripe_hashtbl
[i
];
2217 hlist_for_each_entry(sh
, hn
, &conf
->stripe_hashtbl
[i
], hash
) {
2218 if (sh
->raid_conf
!= conf
)
2223 spin_unlock_irq(&conf
->device_lock
);
2227 static void status (struct seq_file
*seq
, mddev_t
*mddev
)
2229 raid6_conf_t
*conf
= (raid6_conf_t
*) mddev
->private;
2232 seq_printf (seq
, " level %d, %dk chunk, algorithm %d", mddev
->level
, mddev
->chunk_size
>> 10, mddev
->layout
);
2233 seq_printf (seq
, " [%d/%d] [", conf
->raid_disks
, conf
->working_disks
);
2234 for (i
= 0; i
< conf
->raid_disks
; i
++)
2235 seq_printf (seq
, "%s",
2236 conf
->disks
[i
].rdev
&&
2237 test_bit(In_sync
, &conf
->disks
[i
].rdev
->flags
) ? "U" : "_");
2238 seq_printf (seq
, "]");
2240 seq_printf (seq
, "\n");
2241 printall(seq
, conf
);
2245 static void print_raid6_conf (raid6_conf_t
*conf
)
2248 struct disk_info
*tmp
;
2250 printk("RAID6 conf printout:\n");
2252 printk("(conf==NULL)\n");
2255 printk(" --- rd:%d wd:%d fd:%d\n", conf
->raid_disks
,
2256 conf
->working_disks
, conf
->failed_disks
);
2258 for (i
= 0; i
< conf
->raid_disks
; i
++) {
2259 char b
[BDEVNAME_SIZE
];
2260 tmp
= conf
->disks
+ i
;
2262 printk(" disk %d, o:%d, dev:%s\n",
2263 i
, !test_bit(Faulty
, &tmp
->rdev
->flags
),
2264 bdevname(tmp
->rdev
->bdev
,b
));
2268 static int raid6_spare_active(mddev_t
*mddev
)
2271 raid6_conf_t
*conf
= mddev
->private;
2272 struct disk_info
*tmp
;
2274 for (i
= 0; i
< conf
->raid_disks
; i
++) {
2275 tmp
= conf
->disks
+ i
;
2277 && !test_bit(Faulty
, &tmp
->rdev
->flags
)
2278 && !test_bit(In_sync
, &tmp
->rdev
->flags
)) {
2280 conf
->failed_disks
--;
2281 conf
->working_disks
++;
2282 set_bit(In_sync
, &tmp
->rdev
->flags
);
2285 print_raid6_conf(conf
);
2289 static int raid6_remove_disk(mddev_t
*mddev
, int number
)
2291 raid6_conf_t
*conf
= mddev
->private;
2294 struct disk_info
*p
= conf
->disks
+ number
;
2296 print_raid6_conf(conf
);
2299 if (test_bit(In_sync
, &rdev
->flags
) ||
2300 atomic_read(&rdev
->nr_pending
)) {
2306 if (atomic_read(&rdev
->nr_pending
)) {
2307 /* lost the race, try later */
2315 print_raid6_conf(conf
);
2319 static int raid6_add_disk(mddev_t
*mddev
, mdk_rdev_t
*rdev
)
2321 raid6_conf_t
*conf
= mddev
->private;
2324 struct disk_info
*p
;
2326 if (mddev
->degraded
> 2)
2327 /* no point adding a device */
2330 * find the disk ... but prefer rdev->saved_raid_disk
2333 if (rdev
->saved_raid_disk
>= 0 &&
2334 conf
->disks
[rdev
->saved_raid_disk
].rdev
== NULL
)
2335 disk
= rdev
->saved_raid_disk
;
2338 for ( ; disk
< mddev
->raid_disks
; disk
++)
2339 if ((p
=conf
->disks
+ disk
)->rdev
== NULL
) {
2340 clear_bit(In_sync
, &rdev
->flags
);
2341 rdev
->raid_disk
= disk
;
2343 if (rdev
->saved_raid_disk
!= disk
)
2345 rcu_assign_pointer(p
->rdev
, rdev
);
2348 print_raid6_conf(conf
);
2352 static int raid6_resize(mddev_t
*mddev
, sector_t sectors
)
2354 /* no resync is happening, and there is enough space
2355 * on all devices, so we can resize.
2356 * We need to make sure resync covers any new space.
2357 * If the array is shrinking we should possibly wait until
2358 * any io in the removed space completes, but it hardly seems
2361 sectors
&= ~((sector_t
)mddev
->chunk_size
/512 - 1);
2362 mddev
->array_size
= (sectors
* (mddev
->raid_disks
-2))>>1;
2363 set_capacity(mddev
->gendisk
, mddev
->array_size
<< 1);
2365 if (sectors
/2 > mddev
->size
&& mddev
->recovery_cp
== MaxSector
) {
2366 mddev
->recovery_cp
= mddev
->size
<< 1;
2367 set_bit(MD_RECOVERY_NEEDED
, &mddev
->recovery
);
2369 mddev
->size
= sectors
/2;
2370 mddev
->resync_max_sectors
= sectors
;
2374 static void raid6_quiesce(mddev_t
*mddev
, int state
)
2376 raid6_conf_t
*conf
= mddev_to_conf(mddev
);
2379 case 1: /* stop all writes */
2380 spin_lock_irq(&conf
->device_lock
);
2382 wait_event_lock_irq(conf
->wait_for_stripe
,
2383 atomic_read(&conf
->active_stripes
) == 0,
2384 conf
->device_lock
, /* nothing */);
2385 spin_unlock_irq(&conf
->device_lock
);
2388 case 0: /* re-enable writes */
2389 spin_lock_irq(&conf
->device_lock
);
2391 wake_up(&conf
->wait_for_stripe
);
2392 spin_unlock_irq(&conf
->device_lock
);
2397 static struct mdk_personality raid6_personality
=
2401 .owner
= THIS_MODULE
,
2402 .make_request
= make_request
,
2406 .error_handler
= error
,
2407 .hot_add_disk
= raid6_add_disk
,
2408 .hot_remove_disk
= raid6_remove_disk
,
2409 .spare_active
= raid6_spare_active
,
2410 .sync_request
= sync_request
,
2411 .resize
= raid6_resize
,
2412 .quiesce
= raid6_quiesce
,
2415 static int __init
raid6_init(void)
2419 e
= raid6_select_algo();
2423 return register_md_personality(&raid6_personality
);
2426 static void raid6_exit (void)
2428 unregister_md_personality(&raid6_personality
);
2431 module_init(raid6_init
);
2432 module_exit(raid6_exit
);
2433 MODULE_LICENSE("GPL");
2434 MODULE_ALIAS("md-personality-8"); /* RAID6 */
2435 MODULE_ALIAS("md-raid6");
2436 MODULE_ALIAS("md-level-6");