/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 *    miss any bits.
 */
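/*
 * Illustrative walk-through (not part of the original comment): suppose
 * seq_flush is 5 and seq_write is 4.  A write arriving now is tagged with
 * sh->bm_seq = 6 (seq_flush + 1).  The stripe is held back while
 * sh->bm_seq - seq_write > 0, i.e. until the batch-6 bitmap update has been
 * written out and seq_write has advanced to 6 or beyond.
 */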
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <trace/events/block.h>

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static struct workqueue_struct *raid5_wq;
#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)
#define MAX_STRIPE_BATCH	8
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	local_irq_disable();
	spin_lock(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
		spin_unlock(conf->hash_locks + i - 1);
	local_irq_enable();
}
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio_sectors(bio);
	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}
/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}
/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always start from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}

static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}
/* When walking through the disks in a raid5, starting at raid6_d0,
 * We need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This help does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
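/*
 * Illustrative mapping (not from the original source): for a 6-device
 * RAID-6 stripe with syndrome_disks == 4, the four data devices map to
 * slots 0..3 in raid6_d0() order, the P device to slot 4 (syndrome_disks)
 * and the Q device to slot 5 (syndrome_disks + 1), matching the layout
 * described in the comment above.
 */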
static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_iter.bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);
	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			   sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				list_add_tail(&sh->lru, &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state))
			list_add_tail(&sh->lru, temp_inactive_list);
	}
}
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}
/*
 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
 *
 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 * given time. Adding stripes only takes device lock, while deleting stripes
 * only takes hash lock.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, get_active_stripe() might
		 * remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}
/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	while (head) {
		int hash;

		sh = llist_entry(head, struct stripe_head, release_list);
		head = llist_next(head);
		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry the bit is set here, because if the bit is set
		 * again, the count is always > 1. This is true for
		 * STRIPE_ON_UNPLUG_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}
static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	if (unlikely(!conf->mddev->thread) ||
		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	local_irq_save(flags);
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock(&conf->device_lock);
		release_inactive_stripe_list(conf, &list, hash);
	}
	local_irq_restore(flags);
}
static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}
/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(conf->inactive_list + hash))
		goto out;
	first = (conf->inactive_list + hash)->next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
	BUG_ON(hash != sh->hash_lock_index);
	if (list_empty(conf->inactive_list + hash))
		atomic_inc(&conf->empty_inactive_list_nr);
out:
	return sh;
}
static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}
static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh);
static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i, seq;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

retry:
	seq = read_seqcount_begin(&conf->gen_lock);
	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	if (read_seqcount_retry(&conf->gen_lock, seq))
		goto retry;
	insert_hash(conf, sh);
	sh->cpu = smp_processor_id();
}
static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
static int calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}
static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}
static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;
	int hash = stripe_hash_locks_hash(sector);

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(conf->hash_locks + hash);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    *(conf->hash_locks + hash));
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf, hash);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(
					conf->wait_for_stripe,
					!list_empty(conf->inactive_list + hash) &&
					(atomic_read(&conf->active_stripes)
					 < (conf->max_nr_stripes * 3 / 4)
					 || !conf->inactive_blocked),
					*(conf->hash_locks + hash));
				conf->inactive_blocked = 0;
			} else {
				init_stripe(sh, sector, previous);
				atomic_inc(&sh->count);
			}
		} else if (!atomic_inc_not_zero(&sh->count)) {
			spin_lock(&conf->device_lock);
			if (!atomic_read(&sh->count)) {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				BUG_ON(list_empty(&sh->lru) &&
				       !test_bit(STRIPE_EXPANDING, &sh->state));
				list_del_init(&sh->lru);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
				}
			}
			atomic_inc(&sh->count);
			spin_unlock(&conf->device_lock);
		}
	} while (sh == NULL);

	spin_unlock_irq(conf->hash_locks + hash);
	return sh;
}
/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}
static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			rw |= REQ_SYNC;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't to check rrdev.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			bi->bi_rw = rw;
			bi->bi_end_io = (rw & WRITE)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
				bi->bi_rw |= REQ_NOMERGE;

			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_iter.bi_size = STRIPE_SIZE;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (rw & REQ_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
						      bi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			rbi->bi_bdev = rrdev->bdev;
			rbi->bi_rw = rw;
			BUG_ON(!(rw & WRITE));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->data_offset);
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_iter.bi_size = STRIPE_SIZE;
			/*
			 * If this is discard request, set bi_vcnt 0. We don't
			 * want to confuse SCSI because SCSI will replace payload
			 */
			if (rw & REQ_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
						      rbi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec bvl;
	struct bvec_iter iter;
	struct page *bio_page;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_iter.bi_sector >= sector)
		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, iter) {
		int len = bvl.bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl.bv_offset;
			bio_page = bvl.bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset +=  len;
	}

	return tx;
}
static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_active_stripes(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_iter.bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
					dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}
static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}
/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome. The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1]].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}
*
1164 ops_run_compute6_1(struct stripe_head
*sh
, struct raid5_percpu
*percpu
)
1166 int disks
= sh
->disks
;
1167 struct page
**blocks
= percpu
->scribble
;
1169 int qd_idx
= sh
->qd_idx
;
1170 struct dma_async_tx_descriptor
*tx
;
1171 struct async_submit_ctl submit
;
1177 if (sh
->ops
.target
< 0)
1178 target
= sh
->ops
.target2
;
1179 else if (sh
->ops
.target2
< 0)
1180 target
= sh
->ops
.target
;
1182 /* we should only have one valid target */
1185 pr_debug("%s: stripe %llu block: %d\n",
1186 __func__
, (unsigned long long)sh
->sector
, target
);
1188 tgt
= &sh
->dev
[target
];
1189 BUG_ON(!test_bit(R5_Wantcompute
, &tgt
->flags
));
1192 atomic_inc(&sh
->count
);
1194 if (target
== qd_idx
) {
1195 count
= set_syndrome_sources(blocks
, sh
);
1196 blocks
[count
] = NULL
; /* regenerating p is not necessary */
1197 BUG_ON(blocks
[count
+1] != dest
); /* q should already be set */
1198 init_async_submit(&submit
, ASYNC_TX_FENCE
, NULL
,
1199 ops_complete_compute
, sh
,
1200 to_addr_conv(sh
, percpu
));
1201 tx
= async_gen_syndrome(blocks
, 0, count
+2, STRIPE_SIZE
, &submit
);
1203 /* Compute any data- or p-drive using XOR */
1205 for (i
= disks
; i
-- ; ) {
1206 if (i
== target
|| i
== qd_idx
)
1208 blocks
[count
++] = sh
->dev
[i
].page
;
1211 init_async_submit(&submit
, ASYNC_TX_FENCE
|ASYNC_TX_XOR_ZERO_DST
,
1212 NULL
, ops_complete_compute
, sh
,
1213 to_addr_conv(sh
, percpu
));
1214 tx
= async_xor(dest
, blocks
, 0, count
, STRIPE_SIZE
, &submit
);
static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}
static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}
static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);

			while (wbi && wbi->bi_iter.bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_rw & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_rw & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (wbi->bi_rw & REQ_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else
					tx = async_copy_data(1, wbi, dev->page,
						dev->sector, tx);
				wbi = r5_next_bio(wbi, dev->sector);
			}
		}
	}

	return tx;
}
static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard)
				set_bit(R5_UPTODATE, &dev->flags);
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (dev->written)
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	flags = ASYNC_TX_ACK |
		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

	atomic_inc(&sh->count);

	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
			  to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
}
static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks = percpu->scribble;
	int count, i;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

	count = set_syndrome_sources(blocks, sh);

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
			  sh, to_addr_conv(sh, percpu));
	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
}
static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = percpu->scribble;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = 0;
	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
			continue;
		xor_srcs[count++] = sh->dev[i].page;
	}

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu));
	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, &submit);

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
	tx = async_trigger_callback(&submit);
}
static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
	struct page **srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		(unsigned long long)sh->sector, checkp);

	count = set_syndrome_sources(srcs, sh);
	if (!checkp)
		srcs[count] = NULL;

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu));
	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	int overlap_clear = 0, i, disks = sh->disks;
	struct dma_async_tx_descriptor *tx = NULL;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;
	struct raid5_percpu *percpu;
	unsigned long cpu;

	cpu = get_cpu();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
		ops_run_biofill(sh);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
		if (level < 6)
			tx = ops_run_compute5(sh, percpu);
		else {
			if (sh->ops.target2 < 0 || sh->ops.target < 0)
				tx = ops_run_compute6_1(sh, percpu);
			else
				tx = ops_run_compute6_2(sh, percpu);
		}
		/* terminate the chain if reconstruct is not set to be run */
		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
			async_tx_ack(tx);
	}

	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
		tx = ops_run_prexor(sh, percpu, tx);

	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
		if (level < 6)
			ops_run_reconstruct5(sh, percpu, tx);
		else
			ops_run_reconstruct6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
		if (sh->check_state == check_state_run)
			ops_run_check_p(sh, percpu);
		else if (sh->check_state == check_state_run_q)
			ops_run_check_pq(sh, percpu, 0);
		else if (sh->check_state == check_state_run_pq)
			ops_run_check_pq(sh, percpu, 1);
		else
			BUG();
	}

	if (overlap_clear)
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_Overlap, &dev->flags))
				wake_up(&sh->raid_conf->wait_for_overlap);
		}
	put_cpu();
}
static int grow_one_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh;
	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;

	sh->raid_conf = conf;

	spin_lock_init(&sh->stripe_lock);

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	sh->hash_lock_index = hash;
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}
static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);
	int hash;

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
	while (num--) {
		if (!grow_one_stripe(conf, hash))
			return 1;
		conf->max_nr_stripes++;
		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
	}
	return 0;
}
/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}
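/*
 * Example (illustrative, not from the original source): for num == 4 on a
 * 64-bit build with an 8-byte addr_conv_t, scribble_len(4) is
 * 6 * sizeof(struct page *) + 6 * sizeof(addr_conv_t) = 48 + 48 = 96 bytes:
 * six page pointers (4 devices plus the P and Q destinations) followed by
 * the matching address-conversion slots returned by to_addr_conv().
 */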
static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
	 *    we simple return a failre status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying the shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	unsigned long cpu;
	int err;
	struct kmem_cache *sc;
	int i;
	int hash, cnt;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		nsh->raid_conf = conf;
		spin_lock_init(&nsh->stripe_lock);

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	hash = 0;
	cnt = 0;
	list_for_each_entry(nsh, &newstripes, lru) {
		lock_device_hash_lock(conf, hash);
		wait_event_cmd(conf->wait_for_stripe,
				    !list_empty(conf->inactive_list + hash),
				    unlock_device_hash_lock(conf, hash),
				    lock_device_hash_lock(conf, hash));
		osh = get_free_stripe(conf, hash);
		unlock_device_hash_lock(conf, hash);
		atomic_set(&nsh->count, 1);
		for(i=0; i<conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for( ; i<newsize; i++)
			nsh->dev[i].page = NULL;
		nsh->hash_lock_index = hash;
		kmem_cache_free(conf->slab_cache, osh);
		cnt++;
		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
			hash++;
			cnt = 0;
		}
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region
	 */
	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (ndisks) {
		for (i=0; i<conf->raid_disks; i++)
			ndisks[i] = conf->disks[i];
		kfree(conf->disks);
		conf->disks = ndisks;
	} else
		err = -ENOMEM;

	get_online_cpus();
	conf->scribble_len = scribble_len(newsize);
	for_each_present_cpu(cpu) {
		struct raid5_percpu *percpu;
		void *scribble;

		percpu = per_cpu_ptr(conf->percpu, cpu);
		scribble = kmalloc(conf->scribble_len, GFP_NOIO);

		if (scribble) {
			kfree(percpu->scribble);
			percpu->scribble = scribble;
		} else {
			err = -ENOMEM;
			break;
		}
	}
	put_online_cpus();

	/* Step 4, return new stripes to service */
	while(!list_empty(&newstripes)) {
		nsh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&nsh->lru);

		for (i=conf->raid_disks; i < newsize; i++)
			if (nsh->dev[i].page == NULL) {
				struct page *p = alloc_page(GFP_NOIO);
				nsh->dev[i].page = p;
				if (!p)
					err = -ENOMEM;
			}
		release_stripe(nsh);
	}
	/* critical section pass, GFP_NOIO no longer needed */

	conf->slab_cache = sc;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return err;
}
static int drop_one_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh;

	spin_lock_irq(conf->hash_locks + hash);
	sh = get_free_stripe(conf, hash);
	spin_unlock_irq(conf->hash_locks + hash);
	if (!sh)
		return 0;
	BUG_ON(atomic_read(&sh->count));
	shrink_buffers(sh);
	kmem_cache_free(conf->slab_cache, sh);
	atomic_dec(&conf->active_stripes);
	return 1;
}

static void shrink_stripes(struct r5conf *conf)
{
	int hash;
	for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
		while (drop_one_stripe(conf, hash))
			;

	if (conf->slab_cache)
		kmem_cache_destroy(conf->slab_cache);
	conf->slab_cache = NULL;
}
static void raid5_end_read_request(struct bio * bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	char b[BDEVNAME_SIZE];
	struct md_rdev *rdev = NULL;
	sector_t s;

	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}
	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/* If replacement finished while this request was outstanding,
		 * 'replacement' might be NULL already.
		 * In that case it moved down to 'rdev'.
		 * rdev is not removed until all requests are finished.
		 */
		rdev = conf->disks[i].replacement;
	if (!rdev)
		rdev = conf->disks[i].rdev;

	if (use_new_offset(conf, sh))
		s = sh->sector + rdev->new_data_offset;
	else
		s = sh->sector + rdev->data_offset;
	if (uptodate) {
		set_bit(R5_UPTODATE, &sh->dev[i].flags);
		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/* Note that this cannot happen on a
			 * replacement device.  We just fail those on
			 * any error
			 */
			printk_ratelimited(
				KERN_INFO
				"md/raid:%s: read error corrected"
				" (%lu sectors at %llu on %s)\n",
				mdname(conf->mddev), STRIPE_SECTORS,
				(unsigned long long)s,
				bdevname(rdev->bdev, b));
			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);

		if (atomic_read(&rdev->read_errors))
			atomic_set(&rdev->read_errors, 0);
	} else {
		const char *bdn = bdevname(rdev->bdev, b);
		int retry = 0;
		int set_bad = 0;

		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
		atomic_inc(&rdev->read_errors);
		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error on replacement device "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		else if (conf->mddev->degraded >= conf->max_degraded) {
			set_bad = 1;
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error not correctable "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
			set_bad = 1;
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error NOT corrected!! "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		} else if (atomic_read(&rdev->read_errors)
			 > conf->max_nr_stripes)
			printk(KERN_WARNING
			       "md/raid:%s: Too many read errors, failing device %s.\n",
			       mdname(conf->mddev), bdn);
		else
			retry = 1;
		if (set_bad && test_bit(In_sync, &rdev->flags)
		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
			retry = 1;
		if (retry)
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
				set_bit(R5_ReadError, &sh->dev[i].flags);
				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
			} else
				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
		else {
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
			if (!(set_bad
			      && test_bit(In_sync, &rdev->flags)
			      && rdev_set_badblocks(
				      rdev, sh->sector, STRIPE_SECTORS, 0)))
				md_error(conf->mddev, rdev);
		}
	}
	rdev_dec_pending(rdev, conf->mddev);
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
static void raid5_end_write_request(struct bio *bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks, i;
	struct md_rdev *uninitialized_var(rdev);
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	sector_t first_bad;
	int bad_sectors;
	int replacement = 0;

	for (i = 0 ; i < disks; i++) {
		if (bi == &sh->dev[i].req) {
			rdev = conf->disks[i].rdev;
			break;
		}
		if (bi == &sh->dev[i].rreq) {
			rdev = conf->disks[i].replacement;
			if (rdev)
				replacement = 1;
			else
				/* rdev was removed and 'replacement'
				 * replaced it.  rdev is not removed
				 * until all requests are finished.
				 */
				rdev = conf->disks[i].rdev;
			break;
		}
	}
	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}

	if (replacement) {
		if (!uptodate)
			md_error(conf->mddev, rdev);
		else if (is_badblock(rdev, sh->sector,
				     STRIPE_SECTORS,
				     &first_bad, &bad_sectors))
			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
	} else {
		if (!uptodate) {
			set_bit(STRIPE_DEGRADED, &sh->state);
			set_bit(WriteErrorSeen, &rdev->flags);
			set_bit(R5_WriteError, &sh->dev[i].flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
		} else if (is_badblock(rdev, sh->sector,
				       STRIPE_SECTORS,
				       &first_bad, &bad_sectors)) {
			set_bit(R5_MadeGood, &sh->dev[i].flags);
			if (test_bit(R5_ReadError, &sh->dev[i].flags))
				/* That was a successful write so make
				 * sure it looks like we already did
				 * a re-write.
				 */
				set_bit(R5_ReWrite, &sh->dev[i].flags);
		}
	}
	rdev_dec_pending(rdev, conf->mddev);

	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
		clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);

static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
	struct r5dev *dev = &sh->dev[i];

	bio_init(&dev->req);
	dev->req.bi_io_vec = &dev->vec;
	dev->req.bi_vcnt++;
	dev->req.bi_max_vecs++;
	dev->req.bi_private = sh;
	dev->vec.bv_page = dev->page;

	bio_init(&dev->rreq);
	dev->rreq.bi_io_vec = &dev->rvec;
	dev->rreq.bi_vcnt++;
	dev->rreq.bi_max_vecs++;
	dev->rreq.bi_private = sh;
	dev->rvec.bv_page = dev->page;

	dev->flags = 0;
	dev->sector = compute_blocknr(sh, i, previous);
}
->sector
= compute_blocknr(sh
, i
, previous
);
static void error(struct mddev *mddev, struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	struct r5conf *conf = mddev->private;
	unsigned long flags;
	pr_debug("raid456: error called\n");

	spin_lock_irqsave(&conf->device_lock, flags);
	clear_bit(In_sync, &rdev->flags);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);

	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	printk(KERN_ALERT
	       "md/raid:%s: Disk failure on %s, disabling device.\n"
	       "md/raid:%s: Operation continuing on %d devices.\n",
	       mdname(mddev),
	       bdevname(rdev->bdev, b),
	       mdname(mddev),
	       conf->raid_disks - mddev->degraded);
}
/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
				     int previous, int *dd_idx,
				     struct stripe_head *sh)
{
	sector_t stripe, stripe2;
	sector_t chunk_number;
	unsigned int chunk_offset;
	int pd_idx, qd_idx;
	int ddf_layout = 0;
	sector_t new_sector;
	int algorithm = previous ? conf->prev_algo
				 : conf->algorithm;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int raid_disks = previous ? conf->previous_raid_disks
				  : conf->raid_disks;
	int data_disks = raid_disks - conf->max_degraded;

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_offset = sector_div(r_sector, sectors_per_chunk);
	chunk_number = r_sector;

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number;
	*dd_idx = sector_div(stripe, data_disks);
	stripe2 = stripe;
	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	pd_idx = qd_idx = -1;
	switch(conf->level) {
	case 4:
		pd_idx = data_disks;
		break;
	case 5:
		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			(*dd_idx)++;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			break;
		default:
			BUG();
		}
		break;
	case 6:
		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			break;

		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			qd_idx = 1;
			(*dd_idx) += 2;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			qd_idx = data_disks + 1;
			break;

		case ALGORITHM_ROTATING_ZERO_RESTART:
			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
			 * of blocks used for computing Q is different.
			 */
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_RESTART:
			/* Same as left_asymmetric, but the first stripe is
			 * D D D P Q rather than
			 * Q D D D P
			 */
			stripe2 += 1;
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			ddf_layout = 1;
			break;

		case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q on last device */
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_ASYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_LEFT_SYMMETRIC_6:
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_SYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_PARITY_0_6:
			pd_idx = 0;
			(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		default:
			BUG();
		}
		break;
	}

	if (sh) {
		sh->pd_idx = pd_idx;
		sh->qd_idx = qd_idx;
		sh->ddf_layout = ddf_layout;
	}
	/*
	 * Finally, compute the new sector number
	 */
	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int raid_disks = sh->disks;
	int data_disks = raid_disks - conf->max_degraded;
	sector_t new_sector = sh->sector, check;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int algorithm = previous ? conf->prev_algo
				 : conf->algorithm;
	sector_t stripe;
	int chunk_offset;
	sector_t chunk_number;
	int dummy1, dd_idx = i;
	sector_t r_sector;
	struct stripe_head sh2;

	chunk_offset = sector_div(new_sector, sectors_per_chunk);
	stripe = new_sector;

	if (i == sh->pd_idx)
		return 0;
	switch(conf->level) {
	case 4: break;
	case 5:
		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
		case ALGORITHM_RIGHT_ASYMMETRIC:
			if (i > sh->pd_idx)
				i--;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
		case ALGORITHM_RIGHT_SYMMETRIC:
			if (i < sh->pd_idx)
				i += raid_disks;
			i -= (sh->pd_idx + 1);
			break;
		case ALGORITHM_PARITY_0:
			i -= 1;
			break;
		case ALGORITHM_PARITY_N:
			break;
		default:
			BUG();
		}
		break;
	case 6:
		if (i == sh->qd_idx)
			return 0; /* It is the Q disk */
		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
		case ALGORITHM_RIGHT_ASYMMETRIC:
		case ALGORITHM_ROTATING_ZERO_RESTART:
		case ALGORITHM_ROTATING_N_RESTART:
			if (sh->pd_idx == raid_disks-1)
				i--;	/* Q D D D P */
			else if (i > sh->pd_idx)
				i -= 2; /* D D P Q D */
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
		case ALGORITHM_RIGHT_SYMMETRIC:
			if (sh->pd_idx == raid_disks-1)
				i--; /* Q D D D P */
			else {
				/* D D P Q D */
				if (i < sh->pd_idx)
					i += raid_disks;
				i -= (sh->pd_idx + 2);
			}
			break;
		case ALGORITHM_PARITY_0:
			i -= 2;
			break;
		case ALGORITHM_PARITY_N:
			break;
		case ALGORITHM_ROTATING_N_CONTINUE:
			/* Like left_symmetric, but P is before Q */
			if (sh->pd_idx == 0)
				i--;	/* P D D D Q */
			else {
				/* D D Q P D */
				if (i < sh->pd_idx)
					i += raid_disks;
				i -= (sh->pd_idx + 1);
			}
			break;
		case ALGORITHM_LEFT_ASYMMETRIC_6:
		case ALGORITHM_RIGHT_ASYMMETRIC_6:
			if (i > sh->pd_idx)
				i--;
			break;
		case ALGORITHM_LEFT_SYMMETRIC_6:
		case ALGORITHM_RIGHT_SYMMETRIC_6:
			if (i < sh->pd_idx)
				i += data_disks + 1;
			i -= (sh->pd_idx + 1);
			break;
		case ALGORITHM_PARITY_0_6:
			i -= 1;
			break;
		default:
			BUG();
		}
		break;
	}

	chunk_number = stripe * data_disks + i;
	r_sector = chunk_number * sectors_per_chunk + chunk_offset;

	check = raid5_compute_sector(conf, r_sector,
				     previous, &dummy1, &sh2);
	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
		|| sh2.qd_idx != sh->qd_idx) {
		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
		       mdname(conf->mddev));
		return 0;
	}
	return r_sector;
}
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			 int rcw, int expand)
{
	int i, pd_idx = sh->pd_idx, disks = sh->disks;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;

	if (rcw) {
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];

			if (dev->towrite) {
				set_bit(R5_LOCKED, &dev->flags);
				set_bit(R5_Wantdrain, &dev->flags);
				if (!expand)
					clear_bit(R5_UPTODATE, &dev->flags);
				s->locked++;
			}
		}
		/* if we are not expanding this is a proper write request, and
		 * there will be bios with new data to be drained into the
		 * stripe cache
		 */
		if (!expand) {
			if (!s->locked)
				/* False alarm, nothing to do */
				return;
			sh->reconstruct_state = reconstruct_state_drain_run;
			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		} else
			sh->reconstruct_state = reconstruct_state_run;

		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);

		if (s->locked + conf->max_degraded == disks)
			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
				atomic_inc(&conf->pending_full_writes);
	} else {
		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i == pd_idx)
				continue;

			if (dev->towrite &&
			    (test_bit(R5_UPTODATE, &dev->flags) ||
			     test_bit(R5_Wantcompute, &dev->flags))) {
				set_bit(R5_Wantdrain, &dev->flags);
				set_bit(R5_LOCKED, &dev->flags);
				clear_bit(R5_UPTODATE, &dev->flags);
				s->locked++;
			}
		}
		if (!s->locked)
			/* False alarm - nothing to do */
			return;
		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
	}

	/* keep the parity disk(s) locked while asynchronous operations
	 * are in flight
	 */
	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
	s->locked++;

	if (level == 6) {
		int qd_idx = sh->qd_idx;
		struct r5dev *dev = &sh->dev[qd_idx];

		set_bit(R5_LOCKED, &dev->flags);
		clear_bit(R5_UPTODATE, &dev->flags);
		s->locked++;
	}

	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
		__func__, (unsigned long long)sh->sector,
		s->locked, s->ops_request);
}
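/*
 * Summary: schedule_reconstruction() is entered with rcw != 0 for a
 * reconstruct-write (drain new data, then recompute parity from all data
 * blocks) and with rcw == 0 for a read-modify-write (prexor the old data
 * and parity first).  In both cases the parity block(s) are left R5_LOCKED
 * and !R5_UPTODATE until the asynchronous operations complete.
 */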
/*
 * Each stripe/dev can have one or more bios attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
	struct bio **bip;
	struct r5conf *conf = sh->raid_conf;
	int firstwrite = 0;

	pr_debug("adding bi b#%llu to stripe s#%llu\n",
		(unsigned long long)bi->bi_iter.bi_sector,
		(unsigned long long)sh->sector);

	/*
	 * If several bios share a stripe, the bio bi_phys_segments acts as a
	 * reference count to avoid races. The reference count should already be
	 * increased before this function is called (for example, in
	 * make_request()), so other bios sharing this stripe will not free the
	 * stripe. If a stripe is owned by one bio, the stripe lock will
	 * protect it.
	 */
	spin_lock_irq(&sh->stripe_lock);
	if (forwrite) {
		bip = &sh->dev[dd_idx].towrite;
		if (*bip == NULL)
			firstwrite = 1;
	} else
		bip = &sh->dev[dd_idx].toread;
	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
			goto overlap;
		bip = & (*bip)->bi_next;
	}
	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
		goto overlap;

	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
	if (*bip)
		bi->bi_next = *bip;
	*bip = bi;
	raid5_inc_bi_active_stripes(bi);

	if (forwrite) {
		/* check if page is covered */
		sector_t sector = sh->dev[dd_idx].sector;
		for (bi=sh->dev[dd_idx].towrite;
		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
			     bi && bi->bi_iter.bi_sector <= sector;
		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
			if (bio_end_sector(bi) >= sector)
				sector = bio_end_sector(bi);
		}
		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
	}

	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
		(unsigned long long)(*bip)->bi_iter.bi_sector,
		(unsigned long long)sh->sector, dd_idx);
	spin_unlock_irq(&sh->stripe_lock);

	if (conf->mddev->bitmap && firstwrite) {
		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
				  STRIPE_SECTORS, 0);
		sh->bm_seq = conf->seq_flush+1;
		set_bit(STRIPE_BIT_DELAY, &sh->state);
	}
	return 1;

 overlap:
	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
	spin_unlock_irq(&sh->stripe_lock);
	return 0;
}
static void end_reshape(struct r5conf *conf);

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh)
{
	int sectors_per_chunk =
		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
	int dd_idx;
	int chunk_offset = sector_div(stripe, sectors_per_chunk);
	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;

	raid5_compute_sector(conf,
			     stripe * (disks - conf->max_degraded)
			     *sectors_per_chunk + chunk_offset,
			     previous,
			     &dd_idx, sh);
}
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
				struct stripe_head_state *s, int disks,
				struct bio **return_bi)
{
	int i;

	for (i = disks; i--; ) {
		struct bio *bi;
		int bitmap_end = 0;

		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			struct md_rdev *rdev;
			rcu_read_lock();
			rdev = rcu_dereference(conf->disks[i].rdev);
			if (rdev && test_bit(In_sync, &rdev->flags))
				atomic_inc(&rdev->nr_pending);
			else
				rdev = NULL;
			rcu_read_unlock();
			if (rdev) {
				if (!rdev_set_badblocks(
					    rdev,
					    sh->sector,
					    STRIPE_SECTORS, 0))
					md_error(conf->mddev, rdev);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}
		spin_lock_irq(&sh->stripe_lock);
		/* fail all writes first */
		bi = sh->dev[i].towrite;
		sh->dev[i].towrite = NULL;
		spin_unlock_irq(&sh->stripe_lock);
		if (bi)
			bitmap_end = 1;

		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			wake_up(&conf->wait_for_overlap);

		while (bi && bi->bi_iter.bi_sector <
			sh->dev[i].sector + STRIPE_SECTORS) {
			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
			clear_bit(BIO_UPTODATE, &bi->bi_flags);
			if (!raid5_dec_bi_active_stripes(bi)) {
				md_write_end(conf->mddev);
				bi->bi_next = *return_bi;
				*return_bi = bi;
			}
			bi = nextbi;
		}
		if (bitmap_end)
			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
					STRIPE_SECTORS, 0, 0);
		bitmap_end = 0;
		/* and fail all 'written' */
		bi = sh->dev[i].written;
		sh->dev[i].written = NULL;
		if (bi) bitmap_end = 1;
		while (bi && bi->bi_iter.bi_sector <
		       sh->dev[i].sector + STRIPE_SECTORS) {
			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
			clear_bit(BIO_UPTODATE, &bi->bi_flags);
			if (!raid5_dec_bi_active_stripes(bi)) {
				md_write_end(conf->mddev);
				bi->bi_next = *return_bi;
				*return_bi = bi;
			}
			bi = bi2;
		}

		/* fail any reads if this device is non-operational and
		 * the data has not reached the cache yet.
		 */
		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
			spin_lock_irq(&sh->stripe_lock);
			bi = sh->dev[i].toread;
			sh->dev[i].toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
				wake_up(&conf->wait_for_overlap);
			while (bi && bi->bi_iter.bi_sector <
			       sh->dev[i].sector + STRIPE_SECTORS) {
				struct bio *nextbi =
					r5_next_bio(bi, sh->dev[i].sector);
				clear_bit(BIO_UPTODATE, &bi->bi_flags);
				if (!raid5_dec_bi_active_stripes(bi)) {
					bi->bi_next = *return_bi;
					*return_bi = bi;
				}
				bi = nextbi;
			}
		}
		if (bitmap_end)
			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
					STRIPE_SECTORS, 0, 0);
		/* If we were in the middle of a write the parity block might
		 * still be locked - so just clear all R5_LOCKED flags
		 */
		clear_bit(R5_LOCKED, &sh->dev[i].flags);
	}

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);
}
handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
		   struct stripe_head_state *s)
{
	int abort = 0;
	int i;

	clear_bit(STRIPE_SYNCING, &sh->state);
	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
		wake_up(&conf->wait_for_overlap);
	s->syncing = 0;
	s->replacing = 0;
	/* There is nothing more to do for sync/check/repair.
	 * Don't even need to abort as that is handled elsewhere
	 * if needed, and not always wanted e.g. if there is a known
	 * bad block here.
	 * For recover/replace we need to record a bad block on all
	 * non-sync devices, or abort the recovery
	 */
	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
		/* During recovery devices cannot be removed, so
		 * locking and refcounting of rdevs is not needed
		 */
		for (i = 0; i < conf->raid_disks; i++) {
			struct md_rdev *rdev = conf->disks[i].rdev;
			if (rdev
			    && !test_bit(Faulty, &rdev->flags)
			    && !test_bit(In_sync, &rdev->flags)
			    && !rdev_set_badblocks(rdev, sh->sector,
						   STRIPE_SECTORS, 0))
				abort = 1;
			rdev = conf->disks[i].replacement;
			if (rdev
			    && !test_bit(Faulty, &rdev->flags)
			    && !test_bit(In_sync, &rdev->flags)
			    && !rdev_set_badblocks(rdev, sh->sector,
						   STRIPE_SECTORS, 0))
				abort = 1;
		}
		if (abort)
			conf->recovery_disabled =
				conf->mddev->recovery_disabled;
	}
	md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
}
static int want_replace(struct stripe_head *sh, int disk_idx)
{
	struct md_rdev *rdev;
	int rv = 0;

	/* Doing recovery so rcu locking not required */
	rdev = sh->raid_conf->disks[disk_idx].replacement;
	if (rdev
	    && !test_bit(Faulty, &rdev->flags)
	    && !test_bit(In_sync, &rdev->flags)
	    && (rdev->recovery_offset <= sh->sector
		|| rdev->mddev->recovery_cp <= sh->sector))
		rv = 1;

	return rv;
}
/* fetch_block - checks the given member device to see if its data needs
 * to be read or computed to satisfy a request.
 *
 * Returns 1 when no more member devices need to be checked, otherwise returns
 * 0 to tell the loop in handle_stripe_fill to continue
 */
static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
		       int disk_idx, int disks)
{
	struct r5dev *dev = &sh->dev[disk_idx];
	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
				  &sh->dev[s->failed_num[1]] };

	/* is the data in this block needed, and can we get it? */
	if (!test_bit(R5_LOCKED, &dev->flags) &&
	    !test_bit(R5_UPTODATE, &dev->flags) &&
	    (dev->toread ||
	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
	     s->syncing || s->expanding ||
	     (s->replacing && want_replace(sh, disk_idx)) ||
	     (s->failed >= 1 && fdev[0]->toread) ||
	     (s->failed >= 2 && fdev[1]->toread) ||
	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
		/* we would like to get this block, possibly by computing it,
		 * otherwise read it if the backing disk is insync
		 */
		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
		BUG_ON(test_bit(R5_Wantread, &dev->flags));
		if ((s->uptodate == disks - 1) &&
		    (s->failed && (disk_idx == s->failed_num[0] ||
				   disk_idx == s->failed_num[1]))) {
			/* have disk failed, and we're requested to fetch it;
			 * do compute it
			 */
			pr_debug("Computing stripe %llu block %d\n",
			       (unsigned long long)sh->sector, disk_idx);
			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
			set_bit(R5_Wantcompute, &dev->flags);
			sh->ops.target = disk_idx;
			sh->ops.target2 = -1; /* no 2nd target */
			s->req_compute = 1;
			/* Careful: from this point on 'uptodate' is in the eye
			 * of raid_run_ops which services 'compute' operations
			 * before writes. R5_Wantcompute flags a block that will
			 * be R5_UPTODATE by the time it is needed for a
			 * subsequent operation.
			 */
			s->uptodate++;
			return 1;
		} else if (s->uptodate == disks-2 && s->failed >= 2) {
			/* Computing 2-failure is *very* expensive; only
			 * do it if failed >= 2
			 */
			int other;
			for (other = disks; other--; ) {
				if (other == disk_idx)
					continue;
				if (!test_bit(R5_UPTODATE,
					      &sh->dev[other].flags))
					break;
			}
			pr_debug("Computing stripe %llu blocks %d,%d\n",
			       (unsigned long long)sh->sector,
			       disk_idx, other);
			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
			set_bit(R5_Wantcompute, &sh->dev[other].flags);
			sh->ops.target = disk_idx;
			sh->ops.target2 = other;
			s->uptodate += 2;
			s->req_compute = 1;
			return 1;
		} else if (test_bit(R5_Insync, &dev->flags)) {
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantread, &dev->flags);
			s->locked++;
			pr_debug("Reading block %d (sync=%d)\n",
				disk_idx, s->syncing);
		}
	}

	return 0;
}
/**
 * handle_stripe_fill - read or compute data to satisfy pending requests.
 */
static void handle_stripe_fill(struct stripe_head *sh,
			       struct stripe_head_state *s,
			       int disks)
{
	int i;

	/* look for blocks to read/compute, skip this if a compute
	 * is already in flight, or if the stripe contents are in the
	 * midst of changing due to a write
	 */
	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
	    !sh->reconstruct_state)
		for (i = disks; i--; )
			if (fetch_block(sh, s, i, disks))
				break;
	set_bit(STRIPE_HANDLE, &sh->state);
}
/* handle_stripe_clean_event
 * any written block on an uptodate or failed drive can be returned.
 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
 * never LOCKED, so we don't need to test 'failed' directly.
 */
static void handle_stripe_clean_event(struct r5conf *conf,
	struct stripe_head *sh, int disks, struct bio **return_bi)
{
	int i;
	struct r5dev *dev;
	int discard_pending = 0;

	for (i = disks; i--; )
		if (sh->dev[i].written) {
			dev = &sh->dev[i];
			if (!test_bit(R5_LOCKED, &dev->flags) &&
			    (test_bit(R5_UPTODATE, &dev->flags) ||
			     test_bit(R5_Discard, &dev->flags))) {
				/* We can return any write requests */
				struct bio *wbi, *wbi2;
				pr_debug("Return write for disc %d\n", i);
				if (test_and_clear_bit(R5_Discard, &dev->flags))
					clear_bit(R5_UPTODATE, &dev->flags);
				wbi = dev->written;
				dev->written = NULL;
				while (wbi && wbi->bi_iter.bi_sector <
					dev->sector + STRIPE_SECTORS) {
					wbi2 = r5_next_bio(wbi, dev->sector);
					if (!raid5_dec_bi_active_stripes(wbi)) {
						md_write_end(conf->mddev);
						wbi->bi_next = *return_bi;
						*return_bi = wbi;
					}
					wbi = wbi2;
				}
				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
						STRIPE_SECTORS,
						!test_bit(STRIPE_DEGRADED, &sh->state),
						0);
			} else if (test_bit(R5_Discard, &dev->flags))
				discard_pending = 1;
		}
	if (!discard_pending &&
	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
		if (sh->qd_idx >= 0) {
			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
		}
		/* now that discard is done we can proceed with any sync */
		clear_bit(STRIPE_DISCARD, &sh->state);
		/*
		 * SCSI discard will change some bio fields and the stripe has
		 * no updated data, so remove it from hash list and the stripe
		 * will be reinitialized
		 */
		spin_lock_irq(&conf->device_lock);
		remove_hash(sh);
		spin_unlock_irq(&conf->device_lock);
		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
			set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);
}
static void handle_stripe_dirtying(struct r5conf *conf,
				   struct stripe_head *sh,
				   struct stripe_head_state *s,
				   int disks)
{
	int rmw = 0, rcw = 0, i;
	sector_t recovery_cp = conf->mddev->recovery_cp;

	/* RAID6 requires 'rcw' in current implementation.
	 * Otherwise, check whether resync is now happening or should start.
	 * If yes, then the array is dirty (after unclean shutdown or
	 * initial creation), so parity in some stripes might be inconsistent.
	 * In this case, we need to always do reconstruct-write, to ensure
	 * that in case of drive failure or read-error correction, we
	 * generate correct data from the parity.
	 */
	if (conf->max_degraded == 2 ||
	    (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
		/* Calculate the real rcw later - for now make it
		 * look like rcw is cheaper
		 */
		rcw = 1; rmw = 2;
		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
			 conf->max_degraded, (unsigned long long)recovery_cp,
			 (unsigned long long)sh->sector);
	} else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
		struct r5dev *dev = &sh->dev[i];
		if ((dev->towrite || i == sh->pd_idx) &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rmw++;
			else
				rmw += 2*disks;  /* cannot read it */
		}
		/* Would I have to read this buffer for reconstruct_write */
		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rcw++;
			else
				rcw += 2*disks;
		}
	}
	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
		(unsigned long long)sh->sector, rmw, rcw);
	set_bit(STRIPE_HANDLE, &sh->state);
	if (rmw < rcw && rmw > 0) {
		/* prefer read-modify-write, but need to get some data */
		if (conf->mddev->queue)
			blk_add_trace_msg(conf->mddev->queue,
					  "raid5 rmw %llu %d",
					  (unsigned long long)sh->sector, rmw);
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if ((dev->towrite || i == sh->pd_idx) &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
			    !(test_bit(R5_UPTODATE, &dev->flags) ||
			      test_bit(R5_Wantcompute, &dev->flags)) &&
			    test_bit(R5_Insync, &dev->flags)) {
				if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
					pr_debug("Read_old block "
						 "%d for r-m-w\n", i);
					set_bit(R5_LOCKED, &dev->flags);
					set_bit(R5_Wantread, &dev->flags);
					s->locked++;
				} else {
					set_bit(STRIPE_DELAYED, &sh->state);
					set_bit(STRIPE_HANDLE, &sh->state);
				}
			}
		}
	}
	if (rcw <= rmw && rcw > 0) {
		/* want reconstruct write, but need to get some data */
		int qread = 0;
		rcw = 0;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
			    i != sh->pd_idx && i != sh->qd_idx &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
			    !(test_bit(R5_UPTODATE, &dev->flags) ||
			      test_bit(R5_Wantcompute, &dev->flags))) {
				rcw++;
				if (!test_bit(R5_Insync, &dev->flags))
					continue; /* it's a failed drive */
				if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
					pr_debug("Read_old block "
						"%d for Reconstruct\n", i);
					set_bit(R5_LOCKED, &dev->flags);
					set_bit(R5_Wantread, &dev->flags);
					s->locked++;
					qread++;
				} else {
					set_bit(STRIPE_DELAYED, &sh->state);
					set_bit(STRIPE_HANDLE, &sh->state);
				}
			}
		}
		if (rcw && conf->mddev->queue)
			blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
					  (unsigned long long)sh->sector,
					  rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
	}
	/* now if nothing is locked, and if we have enough data,
	 * we can start a write request
	 */
	/* since handle_stripe can be called at any time we need to handle the
	 * case where a compute block operation has been submitted and then a
	 * subsequent call wants to start a write request.  raid_run_ops only
	 * handles the case where compute block and reconstruct are requested
	 * simultaneously.  If this is not the case then new writes need to be
	 * held off until the compute completes.
	 */
	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
		schedule_reconstruction(sh, s, rcw == 0, 0);
}
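/*
 * Illustration of the rmw/rcw costing above (hypothetical numbers, assuming
 * whole-block writes and a cold stripe cache): on a 5-drive RAID-5, dirtying
 * one data block costs rmw = 2 pre-reads (old data plus parity) versus
 * rcw = 3 (the three untouched data blocks), so read-modify-write is chosen;
 * a write covering three of the four data blocks costs rmw = 4 versus
 * rcw = 1, so reconstruct-write wins.
 */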
static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
				struct stripe_head_state *s, int disks)
{
	struct r5dev *dev = NULL;

	set_bit(STRIPE_HANDLE, &sh->state);

	switch (sh->check_state) {
	case check_state_idle:
		/* start a new check operation if there are no failures */
		if (s->failed == 0) {
			BUG_ON(s->uptodate != disks);
			sh->check_state = check_state_run;
			set_bit(STRIPE_OP_CHECK, &s->ops_request);
			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
			s->uptodate--;
			break;
		}
		dev = &sh->dev[s->failed_num[0]];
		/* fall through */
	case check_state_compute_result:
		sh->check_state = check_state_idle;
		if (!dev)
			dev = &sh->dev[sh->pd_idx];

		/* check that a write has not made the stripe insync */
		if (test_bit(STRIPE_INSYNC, &sh->state))
			break;

		/* either failed parity check, or recovery is happening */
		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
		BUG_ON(s->uptodate != disks);

		set_bit(R5_LOCKED, &dev->flags);
		s->locked++;
		set_bit(R5_Wantwrite, &dev->flags);

		clear_bit(STRIPE_DEGRADED, &sh->state);
		set_bit(STRIPE_INSYNC, &sh->state);
		break;
	case check_state_run:
		break; /* we will be called again upon completion */
	case check_state_check_result:
		sh->check_state = check_state_idle;

		/* if a failure occurred during the check operation, leave
		 * STRIPE_INSYNC not set and let the stripe be handled again
		 */
		if (s->failed)
			break;

		/* handle a successful check operation, if parity is correct
		 * we are done.  Otherwise update the mismatch count and repair
		 * parity if !MD_RECOVERY_CHECK
		 */
		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
			/* parity is correct (on disc,
			 * not in buffer any more)
			 */
			set_bit(STRIPE_INSYNC, &sh->state);
		else {
			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
				/* don't try to repair!! */
				set_bit(STRIPE_INSYNC, &sh->state);
			else {
				sh->check_state = check_state_compute_run;
				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
				set_bit(R5_Wantcompute,
					&sh->dev[sh->pd_idx].flags);
				sh->ops.target = sh->pd_idx;
				sh->ops.target2 = -1;
				s->uptodate++;
			}
		}
		break;
	case check_state_compute_run:
		break;
	default:
		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
		       __func__, sh->check_state,
		       (unsigned long long) sh->sector);
		BUG();
	}
}
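/*
 * check_state walk-through for the RAID-5 case above: idle -> check_state_run
 * (P is xor-checked, destroying its buffer) -> check_state_check_result.  A
 * mismatch bumps resync_mismatches and, unless MD_RECOVERY_CHECK requested a
 * read-only pass, goes through check_state_compute_run/compute_result to
 * regenerate parity and write it back.
 */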
static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
				  struct stripe_head_state *s,
				  int disks)
{
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct r5dev *dev;

	set_bit(STRIPE_HANDLE, &sh->state);

	BUG_ON(s->failed > 2);

	/* Want to check and possibly repair P and Q.
	 * However there could be one 'failed' device, in which
	 * case we can only check one of them, possibly using the
	 * other to generate missing data
	 */

	switch (sh->check_state) {
	case check_state_idle:
		/* start a new check operation if there are < 2 failures */
		if (s->failed == s->q_failed) {
			/* The only possible failed device holds Q, so it
			 * makes sense to check P (If anything else were failed,
			 * we would have used P to recreate it).
			 */
			sh->check_state = check_state_run;
		}
		if (!s->q_failed && s->failed < 2) {
			/* Q is not failed, and we didn't use it to generate
			 * anything, so it makes sense to check it
			 */
			if (sh->check_state == check_state_run)
				sh->check_state = check_state_run_pq;
			else
				sh->check_state = check_state_run_q;
		}

		/* discard potentially stale zero_sum_result */
		sh->ops.zero_sum_result = 0;

		if (sh->check_state == check_state_run) {
			/* async_xor_zero_sum destroys the contents of P */
			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
			s->uptodate--;
		}
		if (sh->check_state >= check_state_run &&
		    sh->check_state <= check_state_run_pq) {
			/* async_syndrome_zero_sum preserves P and Q, so
			 * no need to mark them !uptodate here
			 */
			set_bit(STRIPE_OP_CHECK, &s->ops_request);
			break;
		}

		/* we have 2-disk failure */
		BUG_ON(s->failed != 2);
		/* fall through */
	case check_state_compute_result:
		sh->check_state = check_state_idle;

		/* check that a write has not made the stripe insync */
		if (test_bit(STRIPE_INSYNC, &sh->state))
			break;

		/* now write out any block on a failed drive,
		 * or P or Q if they were recomputed
		 */
		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
		if (s->failed == 2) {
			dev = &sh->dev[s->failed_num[1]];
			s->locked++;
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
		}
		if (s->failed >= 1) {
			dev = &sh->dev[s->failed_num[0]];
			s->locked++;
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
		}
		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
			dev = &sh->dev[pd_idx];
			s->locked++;
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
		}
		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
			dev = &sh->dev[qd_idx];
			s->locked++;
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
		}
		clear_bit(STRIPE_DEGRADED, &sh->state);

		set_bit(STRIPE_INSYNC, &sh->state);
		break;
	case check_state_run:
	case check_state_run_q:
	case check_state_run_pq:
		break; /* we will be called again upon completion */
	case check_state_check_result:
		sh->check_state = check_state_idle;

		/* handle a successful check operation, if parity is correct
		 * we are done.  Otherwise update the mismatch count and repair
		 * parity if !MD_RECOVERY_CHECK
		 */
		if (sh->ops.zero_sum_result == 0) {
			/* both parities are correct */
			if (!s->failed)
				set_bit(STRIPE_INSYNC, &sh->state);
			else {
				/* in contrast to the raid5 case we can validate
				 * parity, but still have a failure to write
				 * back
				 */
				sh->check_state = check_state_compute_result;
				/* Returning at this point means that we may go
				 * off and bring p and/or q uptodate again so
				 * we make sure to check zero_sum_result again
				 * to verify if p or q need writeback
				 */
			}
		} else {
			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
				/* don't try to repair!! */
				set_bit(STRIPE_INSYNC, &sh->state);
			else {
				int *target = &sh->ops.target;

				sh->ops.target = -1;
				sh->ops.target2 = -1;
				sh->check_state = check_state_compute_run;
				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
					set_bit(R5_Wantcompute,
						&sh->dev[pd_idx].flags);
					*target = pd_idx;
					target = &sh->ops.target2;
					s->uptodate++;
				}
				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
					set_bit(R5_Wantcompute,
						&sh->dev[qd_idx].flags);
					*target = qd_idx;
					s->uptodate++;
				}
			}
		}
		break;
	case check_state_compute_run:
		break;
	default:
		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
		       __func__, sh->check_state,
		       (unsigned long long) sh->sector);
		BUG();
	}
}
static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
{
	int i;

	/* We have read all the blocks in this stripe and now we need to
	 * copy some of them into a target stripe for expand.
	 */
	struct dma_async_tx_descriptor *tx = NULL;
	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
	for (i = 0; i < sh->disks; i++)
		if (i != sh->pd_idx && i != sh->qd_idx) {
			int dd_idx, j;
			struct stripe_head *sh2;
			struct async_submit_ctl submit;

			sector_t bn = compute_blocknr(sh, i, 1);
			sector_t s = raid5_compute_sector(conf, bn, 0,
							  &dd_idx, NULL);
			sh2 = get_active_stripe(conf, s, 0, 1, 1);
			if (sh2 == NULL)
				/* so far only the early blocks of this stripe
				 * have been requested.  When later blocks
				 * get requested, we will try again
				 */
				continue;
			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
			    test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
				/* must have already done this block */
				release_stripe(sh2);
				continue;
			}

			/* place all the copies on one channel */
			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
			tx = async_memcpy(sh2->dev[dd_idx].page,
					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
					  &submit);

			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
			for (j = 0; j < conf->raid_disks; j++)
				if (j != sh2->pd_idx &&
				    j != sh2->qd_idx &&
				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
					break;
			if (j == conf->raid_disks) {
				set_bit(STRIPE_EXPAND_READY, &sh2->state);
				set_bit(STRIPE_HANDLE, &sh2->state);
			}
			release_stripe(sh2);

		}
	/* done submitting copies, wait for them to complete */
	async_tx_quiesce(&tx);
}
/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 */

static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks;
	struct r5dev *dev;
	int i;
	int do_recovery = 0;

	memset(s, 0, sizeof(*s));

	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
	s->failed_num[0] = -1;
	s->failed_num[1] = -1;

	/* Now to look around and see what can be done */
	rcu_read_lock();
	for (i=disks; i--; ) {
		struct md_rdev *rdev;
		sector_t first_bad;
		int bad_sectors;
		int is_bad = 0;

		dev = &sh->dev[i];

		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
			 i, dev->flags,
			 dev->toread, dev->towrite, dev->written);
		/* maybe we can reply to a read
		 *
		 * new wantfill requests are only permitted while
		 * ops_complete_biofill is guaranteed to be inactive
		 */
		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
			set_bit(R5_Wantfill, &dev->flags);

		/* now count some things */
		if (test_bit(R5_LOCKED, &dev->flags))
			s->locked++;
		if (test_bit(R5_UPTODATE, &dev->flags))
			s->uptodate++;
		if (test_bit(R5_Wantcompute, &dev->flags)) {
			s->compute++;
			BUG_ON(s->compute > 2);
		}

		if (test_bit(R5_Wantfill, &dev->flags))
			s->to_fill++;
		else if (dev->toread)
			s->to_read++;
		if (dev->towrite) {
			s->to_write++;
			if (!test_bit(R5_OVERWRITE, &dev->flags))
				s->non_overwrite++;
		}
		if (dev->written)
			s->written++;
		/* Prefer to use the replacement for reads, but only
		 * if it is recovered enough and has no bad blocks.
		 */
		rdev = rcu_dereference(conf->disks[i].replacement);
		if (rdev && !test_bit(Faulty, &rdev->flags) &&
		    rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
		    !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
				 &first_bad, &bad_sectors))
			set_bit(R5_ReadRepl, &dev->flags);
		else {
			if (rdev)
				set_bit(R5_NeedReplace, &dev->flags);
			rdev = rcu_dereference(conf->disks[i].rdev);
			clear_bit(R5_ReadRepl, &dev->flags);
		}
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev) {
			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					     &first_bad, &bad_sectors);
			if (s->blocked_rdev == NULL
			    && (test_bit(Blocked, &rdev->flags)
				|| is_bad < 0)) {
				if (is_bad < 0)
					set_bit(BlockedBadBlocks,
						&rdev->flags);
				s->blocked_rdev = rdev;
				atomic_inc(&rdev->nr_pending);
			}
		}
		clear_bit(R5_Insync, &dev->flags);
		if (!rdev)
			/* Not in-sync */;
		else if (is_bad) {
			/* also not in-sync */
			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
			    test_bit(R5_UPTODATE, &dev->flags)) {
				/* treat as in-sync, but with a read error
				 * which we can now try to correct
				 */
				set_bit(R5_Insync, &dev->flags);
				set_bit(R5_ReadError, &dev->flags);
			}
		} else if (test_bit(In_sync, &rdev->flags))
			set_bit(R5_Insync, &dev->flags);
		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
			/* in sync if before recovery_offset */
			set_bit(R5_Insync, &dev->flags);
		else if (test_bit(R5_UPTODATE, &dev->flags) &&
			 test_bit(R5_Expanded, &dev->flags))
			/* If we've reshaped into here, we assume it is Insync.
			 * We will shortly update recovery_offset to make
			 * it official.
			 */
			set_bit(R5_Insync, &dev->flags);

		if (test_bit(R5_WriteError, &dev->flags)) {
			/* This flag does not apply to '.replacement'
			 * only to .rdev, so make sure to check that */
			struct md_rdev *rdev2 = rcu_dereference(
				conf->disks[i].rdev);
			if (rdev2 == rdev)
				clear_bit(R5_Insync, &dev->flags);
			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
				s->handle_bad_blocks = 1;
				atomic_inc(&rdev2->nr_pending);
			} else
				clear_bit(R5_WriteError, &dev->flags);
		}
		if (test_bit(R5_MadeGood, &dev->flags)) {
			/* This flag does not apply to '.replacement'
			 * only to .rdev, so make sure to check that */
			struct md_rdev *rdev2 = rcu_dereference(
				conf->disks[i].rdev);
			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
				s->handle_bad_blocks = 1;
				atomic_inc(&rdev2->nr_pending);
			} else
				clear_bit(R5_MadeGood, &dev->flags);
		}
		if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
			struct md_rdev *rdev2 = rcu_dereference(
				conf->disks[i].replacement);
			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
				s->handle_bad_blocks = 1;
				atomic_inc(&rdev2->nr_pending);
			} else
				clear_bit(R5_MadeGoodRepl, &dev->flags);
		}
		if (!test_bit(R5_Insync, &dev->flags)) {
			/* The ReadError flag will just be confusing now */
			clear_bit(R5_ReadError, &dev->flags);
			clear_bit(R5_ReWrite, &dev->flags);
		}
		if (test_bit(R5_ReadError, &dev->flags))
			clear_bit(R5_Insync, &dev->flags);
		if (!test_bit(R5_Insync, &dev->flags)) {
			if (s->failed < 2)
				s->failed_num[s->failed] = i;
			s->failed++;
			if (rdev && !test_bit(Faulty, &rdev->flags))
				do_recovery = 1;
		}
	}
	if (test_bit(STRIPE_SYNCING, &sh->state)) {
		/* If there is a failed device being replaced,
		 *     we must be recovering.
		 * else if we are after recovery_cp, we must be syncing
		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
		 * else we can only be replacing
		 * sync and recovery both need to read all devices, and so
		 * use the same flag.
		 */
		if (do_recovery ||
		    sh->sector >= conf->mddev->recovery_cp ||
		    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
			s->syncing = 1;
		else
			s->replacing = 1;
	}
	rcu_read_unlock();
}
static void handle_stripe(struct stripe_head *sh)
{
	struct stripe_head_state s;
	struct r5conf *conf = sh->raid_conf;
	int i;
	int prexor;
	int disks = sh->disks;
	struct r5dev *pdev, *qdev;

	clear_bit(STRIPE_HANDLE, &sh->state);
	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
		/* already being handled, ensure it gets handled
		 * again when current action finishes */
		set_bit(STRIPE_HANDLE, &sh->state);
		return;
	}

	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
		spin_lock(&sh->stripe_lock);
		/* Cannot process 'sync' concurrently with 'discard' */
		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
			set_bit(STRIPE_SYNCING, &sh->state);
			clear_bit(STRIPE_INSYNC, &sh->state);
			clear_bit(STRIPE_REPLACED, &sh->state);
		}
		spin_unlock(&sh->stripe_lock);
	}
	clear_bit(STRIPE_DELAYED, &sh->state);

	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
		"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
	       (unsigned long long)sh->sector, sh->state,
	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
	       sh->check_state, sh->reconstruct_state);

	analyse_stripe(sh, &s);

	if (s.handle_bad_blocks) {
		set_bit(STRIPE_HANDLE, &sh->state);
		goto finish;
	}

	if (unlikely(s.blocked_rdev)) {
		if (s.syncing || s.expanding || s.expanded ||
		    s.replacing || s.to_write || s.written) {
			set_bit(STRIPE_HANDLE, &sh->state);
			goto finish;
		}
		/* There is nothing for the blocked_rdev to block */
		rdev_dec_pending(s.blocked_rdev, conf->mddev);
		s.blocked_rdev = NULL;
	}

	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
	}

	pr_debug("locked=%d uptodate=%d to_read=%d"
	       " to_write=%d failed=%d failed_num=%d,%d\n",
	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
	       s.failed_num[0], s.failed_num[1]);
	/* check if the array has lost more than max_degraded devices and,
	 * if so, some requests might need to be failed.
	 */
	if (s.failed > conf->max_degraded) {
		sh->check_state = 0;
		sh->reconstruct_state = 0;
		if (s.to_read+s.to_write+s.written)
			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
		if (s.syncing + s.replacing)
			handle_failed_sync(conf, sh, &s);
	}

	/* Now we check to see if any write operations have recently
	 * completed
	 */
	prexor = 0;
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
		prexor = 1;
	if (sh->reconstruct_state == reconstruct_state_drain_result ||
	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
		sh->reconstruct_state = reconstruct_state_idle;

		/* All the 'written' buffers and the parity block are ready to
		 * be written back to disk
		 */
		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
		       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
		BUG_ON(sh->qd_idx >= 0 &&
		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_LOCKED, &dev->flags) &&
				(i == sh->pd_idx || i == sh->qd_idx ||
				 dev->written)) {
				pr_debug("Writing block %d\n", i);
				set_bit(R5_Wantwrite, &dev->flags);
				if (prexor)
					continue;
				if (!test_bit(R5_Insync, &dev->flags) ||
				    ((i == sh->pd_idx || i == sh->qd_idx) &&
				     s.failed == 0))
					set_bit(STRIPE_INSYNC, &sh->state);
			}
		}
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			s.dec_preread_active = 1;
	}

	/*
	 * might be able to return some write requests if the parity blocks
	 * are safe, or on a failed drive
	 */
	pdev = &sh->dev[sh->pd_idx];
	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
	qdev = &sh->dev[sh->qd_idx];
	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
		|| conf->level < 6;

	if (s.written &&
	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
			     && !test_bit(R5_LOCKED, &pdev->flags)
			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
				 test_bit(R5_Discard, &pdev->flags))))) &&
	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
			     && !test_bit(R5_LOCKED, &qdev->flags)
			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
				 test_bit(R5_Discard, &qdev->flags))))))
		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);

	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests
	 * or to load a block that is being partially written.
	 */
	if (s.to_read || s.non_overwrite
	    || (conf->level == 6 && s.to_write && s.failed)
	    || (s.syncing && (s.uptodate + s.compute < disks))
	    || s.replacing
	    || s.expanding)
		handle_stripe_fill(sh, &s, disks);

	/* Now to consider new write requests and what else, if anything
	 * should be read.  We do not handle new writes when:
	 * 1/ A 'write' operation (copy+xor) is already in flight.
	 * 2/ A 'check' operation is in flight, as it may clobber the parity
	 *    block.
	 */
	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
		handle_stripe_dirtying(conf, sh, &s, disks);

	/* maybe we need to check and possibly fix the parity for this stripe
	 * Any reads will already have been scheduled, so we just see if enough
	 * data is available.  The parity check is held off while parity
	 * dependent operations are in flight.
	 */
	if (sh->check_state ||
	    (s.syncing && s.locked == 0 &&
	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
	     !test_bit(STRIPE_INSYNC, &sh->state))) {
		if (conf->level == 6)
			handle_parity_checks6(conf, sh, &s, disks);
		else
			handle_parity_checks5(conf, sh, &s, disks);
	}

	if ((s.replacing || s.syncing) && s.locked == 0
	    && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
	    && !test_bit(STRIPE_REPLACED, &sh->state)) {
		/* Write out to replacement devices where possible */
		for (i = 0; i < conf->raid_disks; i++)
			if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
				WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
				set_bit(R5_WantReplace, &sh->dev[i].flags);
				set_bit(R5_LOCKED, &sh->dev[i].flags);
				s.locked++;
			}
		if (s.replacing)
			set_bit(STRIPE_INSYNC, &sh->state);
		set_bit(STRIPE_REPLACED, &sh->state);
	}
	if ((s.syncing || s.replacing) && s.locked == 0 &&
	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
	    test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
		clear_bit(STRIPE_SYNCING, &sh->state);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
			wake_up(&conf->wait_for_overlap);
	}

	/* If the failed drives are just a ReadError, then we might need
	 * to progress the repair/check process
	 */
	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
		for (i = 0; i < s.failed; i++) {
			struct r5dev *dev = &sh->dev[s.failed_num[i]];
			if (test_bit(R5_ReadError, &dev->flags)
			    && !test_bit(R5_LOCKED, &dev->flags)
			    && test_bit(R5_UPTODATE, &dev->flags)
				) {
				if (!test_bit(R5_ReWrite, &dev->flags)) {
					set_bit(R5_Wantwrite, &dev->flags);
					set_bit(R5_ReWrite, &dev->flags);
					set_bit(R5_LOCKED, &dev->flags);
					s.locked++;
				} else {
					/* let's read it back */
					set_bit(R5_Wantread, &dev->flags);
					set_bit(R5_LOCKED, &dev->flags);
					s.locked++;
				}
			}
		}

	/* Finish reconstruct operations initiated by the expansion process */
	if (sh->reconstruct_state == reconstruct_state_result) {
		struct stripe_head *sh_src
			= get_active_stripe(conf, sh->sector, 1, 1, 1);
		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
			/* sh cannot be written until sh_src has been read.
			 * so arrange for sh to be delayed a little
			 */
			set_bit(STRIPE_DELAYED, &sh->state);
			set_bit(STRIPE_HANDLE, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
					      &sh_src->state))
				atomic_inc(&conf->preread_active_stripes);
			release_stripe(sh_src);
			goto finish;
		}
		if (sh_src)
			release_stripe(sh_src);

		sh->reconstruct_state = reconstruct_state_idle;
		clear_bit(STRIPE_EXPANDING, &sh->state);
		for (i = conf->raid_disks; i--; ) {
			set_bit(R5_Wantwrite, &sh->dev[i].flags);
			set_bit(R5_LOCKED, &sh->dev[i].flags);
			s.locked++;
		}
	}

	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
	    !sh->reconstruct_state) {
		/* Need to write out all blocks after computing parity */
		sh->disks = conf->raid_disks;
		stripe_set_idx(sh->sector, conf, 0, sh);
		schedule_reconstruction(sh, &s, 1, 1);
	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
		clear_bit(STRIPE_EXPAND_READY, &sh->state);
		atomic_dec(&conf->reshape_stripes);
		wake_up(&conf->wait_for_overlap);
		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
	}

	if (s.expanding && s.locked == 0 &&
	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
		handle_stripe_expansion(conf, sh);

finish:
	/* wait for this device to become unblocked */
	if (unlikely(s.blocked_rdev)) {
		if (conf->mddev->external)
			md_wait_for_blocked_rdev(s.blocked_rdev,
						 conf->mddev);
		else
			/* Internal metadata will immediately
			 * be written by raid5d, so we don't
			 * need to wait here.
			 */
			rdev_dec_pending(s.blocked_rdev,
					 conf->mddev);
	}

	if (s.handle_bad_blocks)
		for (i = disks; i--; ) {
			struct md_rdev *rdev;
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
				/* We own a safe reference to the rdev */
				rdev = conf->disks[i].rdev;
				if (!rdev_set_badblocks(rdev, sh->sector,
							STRIPE_SECTORS, 0))
					md_error(conf->mddev, rdev);
				rdev_dec_pending(rdev, conf->mddev);
			}
			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
				rdev = conf->disks[i].rdev;
				rdev_clear_badblocks(rdev, sh->sector,
						     STRIPE_SECTORS, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
				rdev = conf->disks[i].replacement;
				if (!rdev)
					/* rdev have been moved down */
					rdev = conf->disks[i].rdev;
				rdev_clear_badblocks(rdev, sh->sector,
						     STRIPE_SECTORS, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}

	if (s.ops_request)
		raid_run_ops(sh, s.ops_request);

	ops_run_io(sh, &s);

	if (s.dec_preread_active) {
		/* We delay this until after ops_run_io so that if make_request
		 * is waiting on a flush, it won't continue until the writes
		 * have actually been submitted.
		 */
		atomic_dec(&conf->preread_active_stripes);
		if (atomic_read(&conf->preread_active_stripes) <
		    IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);
	}

	return_io(s.return_bi);

	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
static void raid5_activate_delayed(struct r5conf *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->hold_list);
			raid5_wakeup_stripe_thread(sh);
		}
	}
}
static void activate_bit_delay(struct r5conf *conf,
	struct list_head *temp_inactive_list)
{
	/* device_lock is held */
	struct list_head head;
	list_add(&head, &conf->bitmap_list);
	list_del_init(&conf->bitmap_list);
	while (!list_empty(&head)) {
		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
		int hash;
		list_del_init(&sh->lru);
		atomic_inc(&sh->count);
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
	}
}
int md_raid5_congested(struct mddev *mddev, int bits)
{
	struct r5conf *conf = mddev->private;

	/* No difference between reads and writes.  Just check
	 * how busy the stripe_cache is
	 */
	if (conf->inactive_blocked)
		return 1;
	if (atomic_read(&conf->empty_inactive_list_nr))
		return 1;

	return 0;
}
EXPORT_SYMBOL_GPL(md_raid5_congested);

static int raid5_congested(void *data, int bits)
{
	struct mddev *mddev = data;

	return mddev_congested(mddev, bits) ||
		md_raid5_congested(mddev, bits);
}
/* We want read requests to align with chunks where possible,
 * but write requests don't need to.
 */
static int raid5_mergeable_bvec(struct request_queue *q,
				struct bvec_merge_data *bvm,
				struct bio_vec *biovec)
{
	struct mddev *mddev = q->queuedata;
	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
	int max;
	unsigned int chunk_sectors = mddev->chunk_sectors;
	unsigned int bio_sectors = bvm->bi_size >> 9;

	if ((bvm->bi_rw & 1) == WRITE)
		return biovec->bv_len; /* always allow writes to be mergeable */

	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
		chunk_sectors = mddev->new_chunk_sectors;
	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0) max = 0;
	if (max <= biovec->bv_len && bio_sectors == 0)
		return biovec->bv_len;
	else
		return max;
}
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
	sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
	unsigned int chunk_sectors = mddev->chunk_sectors;
	unsigned int bio_sectors = bio_sectors(bio);

	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
		chunk_sectors = mddev->new_chunk_sectors;
	return  chunk_sectors >=
		((sector & (chunk_sectors - 1)) + bio_sectors);
}
/*
 *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
 *  later sampled by raid5d.
 */
static void add_bio_to_retry(struct bio *bi, struct r5conf *conf)
{
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);

	bi->bi_next = conf->retry_read_aligned_list;
	conf->retry_read_aligned_list = bi;

	spin_unlock_irqrestore(&conf->device_lock, flags);
	md_wakeup_thread(conf->mddev->thread);
}
static struct bio *remove_bio_from_retry(struct r5conf *conf)
{
	struct bio *bi;

	bi = conf->retry_read_aligned;
	if (bi) {
		conf->retry_read_aligned = NULL;
		return bi;
	}
	bi = conf->retry_read_aligned_list;
	if (bi) {
		conf->retry_read_aligned_list = bi->bi_next;
		bi->bi_next = NULL;
		/*
		 * this sets the active stripe count to 1 and the processed
		 * stripe count to zero (upper 8 bits)
		 */
		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
	}

	return bi;
}
4147 * The "raid5_align_endio" should check if the read succeeded and if it
4148 * did, call bio_endio on the original bio (having bio_put the new bio
4150 * If the read failed..
4152 static void raid5_align_endio(struct bio
*bi
, int error
)
4154 struct bio
* raid_bi
= bi
->bi_private
;
4155 struct mddev
*mddev
;
4156 struct r5conf
*conf
;
4157 int uptodate
= test_bit(BIO_UPTODATE
, &bi
->bi_flags
);
4158 struct md_rdev
*rdev
;
4162 rdev
= (void*)raid_bi
->bi_next
;
4163 raid_bi
->bi_next
= NULL
;
4164 mddev
= rdev
->mddev
;
4165 conf
= mddev
->private;
4167 rdev_dec_pending(rdev
, conf
->mddev
);
4169 if (!error
&& uptodate
) {
4170 trace_block_bio_complete(bdev_get_queue(raid_bi
->bi_bdev
),
4172 bio_endio(raid_bi
, 0);
4173 if (atomic_dec_and_test(&conf
->active_aligned_reads
))
4174 wake_up(&conf
->wait_for_stripe
);
4179 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
4181 add_bio_to_retry(raid_bi
, conf
);
static int bio_fits_rdev(struct bio *bi)
{
	struct request_queue *q = bdev_get_queue(bi->bi_bdev);

	if (bio_sectors(bi) > queue_max_sectors(q))
		return 0;
	blk_recount_segments(q, bi);
	if (bi->bi_phys_segments > queue_max_segments(q))
		return 0;

	if (q->merge_bvec_fn)
		/* it's too hard to apply the merge_bvec_fn at this stage,
		 * just give up
		 */
		return 0;

	return 1;
}
static int chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
	struct r5conf *conf = mddev->private;
	int dd_idx;
	struct bio *align_bi;
	struct md_rdev *rdev;
	sector_t end_sector;

	if (!in_chunk_boundary(mddev, raid_bio)) {
		pr_debug("chunk_aligned_read : non aligned\n");
		return 0;
	}
	/*
	 * use bio_clone_mddev to make a copy of the bio
	 */
	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
	if (!align_bi)
		return 0;
	/*
	 *   set bi_end_io to a new function, and set bi_private to the
	 *     original bio.
	 */
	align_bi->bi_end_io  = raid5_align_endio;
	align_bi->bi_private = raid_bio;
	/*
	 *	compute position
	 */
	align_bi->bi_iter.bi_sector =
		raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
				     0, &dd_idx, NULL);

	end_sector = bio_end_sector(align_bi);
	rcu_read_lock();
	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
	if (!rdev || test_bit(Faulty, &rdev->flags) ||
	    rdev->recovery_offset < end_sector) {
		rdev = rcu_dereference(conf->disks[dd_idx].rdev);
		if (rdev &&
		    (test_bit(Faulty, &rdev->flags) ||
		    !(test_bit(In_sync, &rdev->flags) ||
		      rdev->recovery_offset >= end_sector)))
			rdev = NULL;
	}
	if (rdev) {
		sector_t first_bad;
		int bad_sectors;

		atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();
		raid_bio->bi_next = (void *)rdev;
		align_bi->bi_bdev = rdev->bdev;
		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);

		if (!bio_fits_rdev(align_bi) ||
		    is_badblock(rdev, align_bi->bi_iter.bi_sector,
				bio_sectors(align_bi),
				&first_bad, &bad_sectors)) {
			/* too big in some way, or has a known bad block */
			bio_put(align_bi);
			rdev_dec_pending(rdev, mddev);
			return 0;
		}

		/* No reshape active, so we can trust rdev->data_offset */
		align_bi->bi_iter.bi_sector += rdev->data_offset;

		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0,
				    conf->device_lock);
		atomic_inc(&conf->active_aligned_reads);
		spin_unlock_irq(&conf->device_lock);

		if (mddev->gendisk)
			trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
					      align_bi, disk_devt(mddev->gendisk),
					      raid_bio->bi_iter.bi_sector);
		generic_make_request(align_bi);
		return 1;
	} else {
		rcu_read_unlock();
		bio_put(align_bi);
		return 0;
	}
}
4290 /* __get_priority_stripe - get the next stripe to process
4292 * Full stripe writes are allowed to pass preread active stripes up until
4293 * the bypass_threshold is exceeded. In general the bypass_count
4294 * increments when the handle_list is handled before the hold_list; however, it
4295 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
4296 * stripe with in flight i/o. The bypass_count will be reset when the
4297 * head of the hold_list has changed, i.e. the head was promoted to the
4300 static struct stripe_head
*__get_priority_stripe(struct r5conf
*conf
, int group
)
4302 struct stripe_head
*sh
= NULL
, *tmp
;
4303 struct list_head
*handle_list
= NULL
;
4304 struct r5worker_group
*wg
= NULL
;
4306 if (conf
->worker_cnt_per_group
== 0) {
4307 handle_list
= &conf
->handle_list
;
4308 } else if (group
!= ANY_GROUP
) {
4309 handle_list
= &conf
->worker_groups
[group
].handle_list
;
4310 wg
= &conf
->worker_groups
[group
];
4313 for (i
= 0; i
< conf
->group_cnt
; i
++) {
4314 handle_list
= &conf
->worker_groups
[i
].handle_list
;
4315 wg
= &conf
->worker_groups
[i
];
4316 if (!list_empty(handle_list
))
4321 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
4323 list_empty(handle_list
) ? "empty" : "busy",
4324 list_empty(&conf
->hold_list
) ? "empty" : "busy",
4325 atomic_read(&conf
->pending_full_writes
), conf
->bypass_count
);
4327 if (!list_empty(handle_list
)) {
4328 sh
= list_entry(handle_list
->next
, typeof(*sh
), lru
);
4330 if (list_empty(&conf
->hold_list
))
4331 conf
->bypass_count
= 0;
4332 else if (!test_bit(STRIPE_IO_STARTED
, &sh
->state
)) {
4333 if (conf
->hold_list
.next
== conf
->last_hold
)
4334 conf
->bypass_count
++;
4336 conf
->last_hold
= conf
->hold_list
.next
;
4337 conf
->bypass_count
-= conf
->bypass_threshold
;
4338 if (conf
->bypass_count
< 0)
4339 conf
->bypass_count
= 0;
4342 } else if (!list_empty(&conf
->hold_list
) &&
4343 ((conf
->bypass_threshold
&&
4344 conf
->bypass_count
> conf
->bypass_threshold
) ||
4345 atomic_read(&conf
->pending_full_writes
) == 0)) {
4347 list_for_each_entry(tmp
, &conf
->hold_list
, lru
) {
4348 if (conf
->worker_cnt_per_group
== 0 ||
4349 group
== ANY_GROUP
||
4350 !cpu_online(tmp
->cpu
) ||
4351 cpu_to_group(tmp
->cpu
) == group
) {
4358 conf
->bypass_count
-= conf
->bypass_threshold
;
4359 if (conf
->bypass_count
< 0)
4360 conf
->bypass_count
= 0;
4372 list_del_init(&sh
->lru
);
4373 atomic_inc(&sh
->count
);
4374 BUG_ON(atomic_read(&sh
->count
) != 1);
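/*
 * Illustrative sketch, not part of the driver: the bypass accounting that
 * __get_priority_stripe() implements, reduced to plain integers.  The
 * sketch_ names and the standalone form are invented for illustration; the
 * group-affinity handling of the real function is left out.
 */
struct sketch_sched {
	int bypass_count;	/* times handle_list was served ahead of hold_list */
	int bypass_threshold;	/* the preread_bypass_threshold tunable */
	int pending_full_writes;
};

/* Returns 1 when a held (preread-active) stripe may be promoted. */
static int sketch_may_bypass(const struct sketch_sched *s,
			     int handle_list_empty, int hold_list_empty)
{
	if (!handle_list_empty)
		return 0;	/* the normal queue always wins while non-empty */
	if (hold_list_empty)
		return 0;
	return (s->bypass_threshold && s->bypass_count > s->bypass_threshold) ||
	       s->pending_full_writes == 0;
}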
struct raid5_plug_cb {
	struct blk_plug_cb	cb;
	struct list_head	list;
	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
};
static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
{
	struct raid5_plug_cb *cb = container_of(
		blk_cb, struct raid5_plug_cb, cb);
	struct stripe_head *sh;
	struct mddev *mddev = cb->cb.data;
	struct r5conf *conf = mddev->private;
	int cnt = 0;
	int hash;

	if (cb->list.next && !list_empty(&cb->list)) {
		spin_lock_irq(&conf->device_lock);
		while (!list_empty(&cb->list)) {
			sh = list_first_entry(&cb->list, struct stripe_head, lru);
			list_del_init(&sh->lru);
			/*
			 * avoid race release_stripe_plug() sees
			 * STRIPE_ON_UNPLUG_LIST clear but the stripe
			 * is still in our list
			 */
			smp_mb__before_clear_bit();
			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
			/*
			 * STRIPE_ON_RELEASE_LIST could be set here. In that
			 * case, the count is always > 1 here
			 */
			hash = sh->hash_lock_index;
			__release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
			cnt++;
		}
		spin_unlock_irq(&conf->device_lock);
	}
	release_inactive_stripe_list(conf, cb->temp_inactive_list,
				     NR_STRIPE_HASH_LOCKS);
	if (mddev->queue)
		trace_block_unplug(mddev->queue, cnt, !from_schedule);
	kfree(cb);
}
static void release_stripe_plug(struct mddev *mddev,
				struct stripe_head *sh)
{
	struct blk_plug_cb *blk_cb = blk_check_plugged(
		raid5_unplug, mddev,
		sizeof(struct raid5_plug_cb));
	struct raid5_plug_cb *cb;

	if (!blk_cb) {
		release_stripe(sh);
		return;
	}

	cb = container_of(blk_cb, struct raid5_plug_cb, cb);

	if (cb->list.next == NULL) {
		int i;
		INIT_LIST_HEAD(&cb->list);
		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
			INIT_LIST_HEAD(cb->temp_inactive_list + i);
	}

	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
		list_add_tail(&sh->lru, &cb->list);
	else
		release_stripe(sh);
}
4451 static void make_discard_request(struct mddev
*mddev
, struct bio
*bi
)
4453 struct r5conf
*conf
= mddev
->private;
4454 sector_t logical_sector
, last_sector
;
4455 struct stripe_head
*sh
;
4459 if (mddev
->reshape_position
!= MaxSector
)
4460 /* Skip discard while reshape is happening */
4463 logical_sector
= bi
->bi_iter
.bi_sector
& ~((sector_t
)STRIPE_SECTORS
-1);
4464 last_sector
= bi
->bi_iter
.bi_sector
+ (bi
->bi_iter
.bi_size
>>9);
4467 bi
->bi_phys_segments
= 1; /* over-loaded to count active stripes */
4469 stripe_sectors
= conf
->chunk_sectors
*
4470 (conf
->raid_disks
- conf
->max_degraded
);
4471 logical_sector
= DIV_ROUND_UP_SECTOR_T(logical_sector
,
4473 sector_div(last_sector
, stripe_sectors
);
4475 logical_sector
*= conf
->chunk_sectors
;
4476 last_sector
*= conf
->chunk_sectors
;
4478 for (; logical_sector
< last_sector
;
4479 logical_sector
+= STRIPE_SECTORS
) {
4483 sh
= get_active_stripe(conf
, logical_sector
, 0, 0, 0);
4484 prepare_to_wait(&conf
->wait_for_overlap
, &w
,
4485 TASK_UNINTERRUPTIBLE
);
4486 set_bit(R5_Overlap
, &sh
->dev
[sh
->pd_idx
].flags
);
4487 if (test_bit(STRIPE_SYNCING
, &sh
->state
)) {
4492 clear_bit(R5_Overlap
, &sh
->dev
[sh
->pd_idx
].flags
);
4493 spin_lock_irq(&sh
->stripe_lock
);
4494 for (d
= 0; d
< conf
->raid_disks
; d
++) {
4495 if (d
== sh
->pd_idx
|| d
== sh
->qd_idx
)
4497 if (sh
->dev
[d
].towrite
|| sh
->dev
[d
].toread
) {
4498 set_bit(R5_Overlap
, &sh
->dev
[d
].flags
);
4499 spin_unlock_irq(&sh
->stripe_lock
);
4505 set_bit(STRIPE_DISCARD
, &sh
->state
);
4506 finish_wait(&conf
->wait_for_overlap
, &w
);
4507 for (d
= 0; d
< conf
->raid_disks
; d
++) {
4508 if (d
== sh
->pd_idx
|| d
== sh
->qd_idx
)
4510 sh
->dev
[d
].towrite
= bi
;
4511 set_bit(R5_OVERWRITE
, &sh
->dev
[d
].flags
);
4512 raid5_inc_bi_active_stripes(bi
);
4514 spin_unlock_irq(&sh
->stripe_lock
);
4515 if (conf
->mddev
->bitmap
) {
4517 d
< conf
->raid_disks
- conf
->max_degraded
;
4519 bitmap_startwrite(mddev
->bitmap
,
4523 sh
->bm_seq
= conf
->seq_flush
+ 1;
4524 set_bit(STRIPE_BIT_DELAY
, &sh
->state
);
4527 set_bit(STRIPE_HANDLE
, &sh
->state
);
4528 clear_bit(STRIPE_DELAYED
, &sh
->state
);
4529 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
))
4530 atomic_inc(&conf
->preread_active_stripes
);
4531 release_stripe_plug(mddev
, sh
);
4534 remaining
= raid5_dec_bi_active_stripes(bi
);
4535 if (remaining
== 0) {
4536 md_write_end(mddev
);
4541 static void make_request(struct mddev
*mddev
, struct bio
* bi
)
4543 struct r5conf
*conf
= mddev
->private;
4545 sector_t new_sector
;
4546 sector_t logical_sector
, last_sector
;
4547 struct stripe_head
*sh
;
4548 const int rw
= bio_data_dir(bi
);
4553 if (unlikely(bi
->bi_rw
& REQ_FLUSH
)) {
4554 md_flush_request(mddev
, bi
);
4558 md_write_start(mddev
, bi
);
4561 mddev
->reshape_position
== MaxSector
&&
4562 chunk_aligned_read(mddev
,bi
))
4565 if (unlikely(bi
->bi_rw
& REQ_DISCARD
)) {
4566 make_discard_request(mddev
, bi
);
4570 logical_sector
= bi
->bi_iter
.bi_sector
& ~((sector_t
)STRIPE_SECTORS
-1);
4571 last_sector
= bio_end_sector(bi
);
4573 bi
->bi_phys_segments
= 1; /* over-loaded to count active stripes */
4575 prepare_to_wait(&conf
->wait_for_overlap
, &w
, TASK_UNINTERRUPTIBLE
);
4576 for (;logical_sector
< last_sector
; logical_sector
+= STRIPE_SECTORS
) {
4582 seq
= read_seqcount_begin(&conf
->gen_lock
);
4585 prepare_to_wait(&conf
->wait_for_overlap
, &w
,
4586 TASK_UNINTERRUPTIBLE
);
4587 if (unlikely(conf
->reshape_progress
!= MaxSector
)) {
4588 /* spinlock is needed as reshape_progress may be
4589 * 64bit on a 32bit platform, and so it might be
4590 * possible to see a half-updated value
4591 * Of course reshape_progress could change after
4592 * the lock is dropped, so once we get a reference
4593 * to the stripe that we think it is, we will have
4596 spin_lock_irq(&conf
->device_lock
);
4597 if (mddev
->reshape_backwards
4598 ? logical_sector
< conf
->reshape_progress
4599 : logical_sector
>= conf
->reshape_progress
) {
4602 if (mddev
->reshape_backwards
4603 ? logical_sector
< conf
->reshape_safe
4604 : logical_sector
>= conf
->reshape_safe
) {
4605 spin_unlock_irq(&conf
->device_lock
);
4611 spin_unlock_irq(&conf
->device_lock
);
4614 new_sector
= raid5_compute_sector(conf
, logical_sector
,
4617 pr_debug("raid456: make_request, sector %llu logical %llu\n",
4618 (unsigned long long)new_sector
,
4619 (unsigned long long)logical_sector
);
4621 sh
= get_active_stripe(conf
, new_sector
, previous
,
4622 (bi
->bi_rw
&RWA_MASK
), 0);
4624 if (unlikely(previous
)) {
4625 /* expansion might have moved on while waiting for a
4626 * stripe, so we must do the range check again.
4627 * Expansion could still move past after this
4628 * test, but as we are holding a reference to
4629 * 'sh', we know that if that happens,
4630 * STRIPE_EXPANDING will get set and the expansion
4631 * won't proceed until we finish with the stripe.
4634 spin_lock_irq(&conf
->device_lock
);
4635 if (mddev
->reshape_backwards
4636 ? logical_sector
>= conf
->reshape_progress
4637 : logical_sector
< conf
->reshape_progress
)
4638 /* mismatch, need to try again */
4640 spin_unlock_irq(&conf
->device_lock
);
4648 if (read_seqcount_retry(&conf
->gen_lock
, seq
)) {
4649 /* Might have got the wrong stripe_head
4657 logical_sector
>= mddev
->suspend_lo
&&
4658 logical_sector
< mddev
->suspend_hi
) {
4660 /* As the suspend_* range is controlled by
4661 * userspace, we want an interruptible
4664 flush_signals(current
);
4665 prepare_to_wait(&conf
->wait_for_overlap
,
4666 &w
, TASK_INTERRUPTIBLE
);
4667 if (logical_sector
>= mddev
->suspend_lo
&&
4668 logical_sector
< mddev
->suspend_hi
) {
4675 if (test_bit(STRIPE_EXPANDING
, &sh
->state
) ||
4676 !add_stripe_bio(sh
, bi
, dd_idx
, rw
)) {
4677 /* Stripe is busy expanding or
4678 * add failed due to overlap. Flush everything
4681 md_wakeup_thread(mddev
->thread
);
4687 set_bit(STRIPE_HANDLE
, &sh
->state
);
4688 clear_bit(STRIPE_DELAYED
, &sh
->state
);
4689 if ((bi
->bi_rw
& REQ_SYNC
) &&
4690 !test_and_set_bit(STRIPE_PREREAD_ACTIVE
, &sh
->state
))
4691 atomic_inc(&conf
->preread_active_stripes
);
4692 release_stripe_plug(mddev
, sh
);
4694 /* cannot get stripe for read-ahead, just give-up */
4695 clear_bit(BIO_UPTODATE
, &bi
->bi_flags
);
4699 finish_wait(&conf
->wait_for_overlap
, &w
);
4701 remaining
= raid5_dec_bi_active_stripes(bi
);
4702 if (remaining
== 0) {
4705 md_write_end(mddev
);
4707 trace_block_bio_complete(bdev_get_queue(bi
->bi_bdev
),
static sector_t
raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
4715 static sector_t
reshape_request(struct mddev
*mddev
, sector_t sector_nr
, int *skipped
)
4717 /* reshaping is quite different to recovery/resync so it is
4718 * handled quite separately ... here.
4720 * On each call to sync_request, we gather one chunk worth of
4721 * destination stripes and flag them as expanding.
4722 * Then we find all the source stripes and request reads.
4723 * As the reads complete, handle_stripe will copy the data
4724 * into the destination stripe and release that stripe.
4726 struct r5conf
*conf
= mddev
->private;
4727 struct stripe_head
*sh
;
4728 sector_t first_sector
, last_sector
;
4729 int raid_disks
= conf
->previous_raid_disks
;
4730 int data_disks
= raid_disks
- conf
->max_degraded
;
4731 int new_data_disks
= conf
->raid_disks
- conf
->max_degraded
;
4734 sector_t writepos
, readpos
, safepos
;
4735 sector_t stripe_addr
;
4736 int reshape_sectors
;
4737 struct list_head stripes
;
4739 if (sector_nr
== 0) {
4740 /* If restarting in the middle, skip the initial sectors */
4741 if (mddev
->reshape_backwards
&&
4742 conf
->reshape_progress
< raid5_size(mddev
, 0, 0)) {
4743 sector_nr
= raid5_size(mddev
, 0, 0)
4744 - conf
->reshape_progress
;
4745 } else if (!mddev
->reshape_backwards
&&
4746 conf
->reshape_progress
> 0)
4747 sector_nr
= conf
->reshape_progress
;
4748 sector_div(sector_nr
, new_data_disks
);
4750 mddev
->curr_resync_completed
= sector_nr
;
4751 sysfs_notify(&mddev
->kobj
, NULL
, "sync_completed");
4757 /* We need to process a full chunk at a time.
4758 * If old and new chunk sizes differ, we need to process the
4761 if (mddev
->new_chunk_sectors
> mddev
->chunk_sectors
)
4762 reshape_sectors
= mddev
->new_chunk_sectors
;
4764 reshape_sectors
= mddev
->chunk_sectors
;
4766 /* We update the metadata at least every 10 seconds, or when
4767 * the data about to be copied would over-write the source of
4768 * the data at the front of the range. i.e. one new_stripe
4769 * along from reshape_progress new_maps to after where
4770 * reshape_safe old_maps to
4772 writepos
= conf
->reshape_progress
;
4773 sector_div(writepos
, new_data_disks
);
4774 readpos
= conf
->reshape_progress
;
4775 sector_div(readpos
, data_disks
);
4776 safepos
= conf
->reshape_safe
;
4777 sector_div(safepos
, data_disks
);
4778 if (mddev
->reshape_backwards
) {
4779 writepos
-= min_t(sector_t
, reshape_sectors
, writepos
);
4780 readpos
+= reshape_sectors
;
4781 safepos
+= reshape_sectors
;
4783 writepos
+= reshape_sectors
;
4784 readpos
-= min_t(sector_t
, reshape_sectors
, readpos
);
4785 safepos
-= min_t(sector_t
, reshape_sectors
, safepos
);
4788 /* Having calculated the 'writepos' possibly use it
4789 * to set 'stripe_addr' which is where we will write to.
4791 if (mddev
->reshape_backwards
) {
4792 BUG_ON(conf
->reshape_progress
== 0);
4793 stripe_addr
= writepos
;
4794 BUG_ON((mddev
->dev_sectors
&
4795 ~((sector_t
)reshape_sectors
- 1))
4796 - reshape_sectors
- stripe_addr
4799 BUG_ON(writepos
!= sector_nr
+ reshape_sectors
);
4800 stripe_addr
= sector_nr
;
4803 /* 'writepos' is the most advanced device address we might write.
4804 * 'readpos' is the least advanced device address we might read.
4805 * 'safepos' is the least address recorded in the metadata as having
4807 * If there is a min_offset_diff, these are adjusted either by
4808 * increasing the safepos/readpos if diff is negative, or
4809 * increasing writepos if diff is positive.
4810 * If 'readpos' is then behind 'writepos', there is no way that we can
4811 * ensure safety in the face of a crash - that must be done by userspace
4812 * making a backup of the data. So in that case there is no particular
4813 * rush to update metadata.
4814 * Otherwise if 'safepos' is behind 'writepos', then we really need to
4815 * update the metadata to advance 'safepos' to match 'readpos' so that
4816 * we can be safe in the event of a crash.
4817 * So we insist on updating metadata if safepos is behind writepos and
4818 * readpos is beyond writepos.
4819 * In any case, update the metadata every 10 seconds.
4820 * Maybe that number should be configurable, but I'm not sure it is
4821 * worth it.... maybe it could be a multiple of safemode_delay???
4823 if (conf
->min_offset_diff
< 0) {
4824 safepos
+= -conf
->min_offset_diff
;
4825 readpos
+= -conf
->min_offset_diff
;
4827 writepos
+= conf
->min_offset_diff
;
4829 if ((mddev
->reshape_backwards
4830 ? (safepos
> writepos
&& readpos
< writepos
)
4831 : (safepos
< writepos
&& readpos
> writepos
)) ||
4832 time_after(jiffies
, conf
->reshape_checkpoint
+ 10*HZ
)) {
4833 /* Cannot proceed until we've updated the superblock... */
4834 wait_event(conf
->wait_for_overlap
,
4835 atomic_read(&conf
->reshape_stripes
)==0
4836 || test_bit(MD_RECOVERY_INTR
, &mddev
->recovery
));
4837 if (atomic_read(&conf
->reshape_stripes
) != 0)
4839 mddev
->reshape_position
= conf
->reshape_progress
;
4840 mddev
->curr_resync_completed
= sector_nr
;
4841 conf
->reshape_checkpoint
= jiffies
;
4842 set_bit(MD_CHANGE_DEVS
, &mddev
->flags
);
4843 md_wakeup_thread(mddev
->thread
);
4844 wait_event(mddev
->sb_wait
, mddev
->flags
== 0 ||
4845 test_bit(MD_RECOVERY_INTR
, &mddev
->recovery
));
4846 if (test_bit(MD_RECOVERY_INTR
, &mddev
->recovery
))
4848 spin_lock_irq(&conf
->device_lock
);
4849 conf
->reshape_safe
= mddev
->reshape_position
;
4850 spin_unlock_irq(&conf
->device_lock
);
4851 wake_up(&conf
->wait_for_overlap
);
4852 sysfs_notify(&mddev
->kobj
, NULL
, "sync_completed");
4855 INIT_LIST_HEAD(&stripes
);
4856 for (i
= 0; i
< reshape_sectors
; i
+= STRIPE_SECTORS
) {
4858 int skipped_disk
= 0;
4859 sh
= get_active_stripe(conf
, stripe_addr
+i
, 0, 0, 1);
4860 set_bit(STRIPE_EXPANDING
, &sh
->state
);
4861 atomic_inc(&conf
->reshape_stripes
);
4862 /* If any of this stripe is beyond the end of the old
4863 * array, then we need to zero those blocks
4865 for (j
=sh
->disks
; j
--;) {
4867 if (j
== sh
->pd_idx
)
4869 if (conf
->level
== 6 &&
4872 s
= compute_blocknr(sh
, j
, 0);
4873 if (s
< raid5_size(mddev
, 0, 0)) {
4877 memset(page_address(sh
->dev
[j
].page
), 0, STRIPE_SIZE
);
4878 set_bit(R5_Expanded
, &sh
->dev
[j
].flags
);
4879 set_bit(R5_UPTODATE
, &sh
->dev
[j
].flags
);
4881 if (!skipped_disk
) {
4882 set_bit(STRIPE_EXPAND_READY
, &sh
->state
);
4883 set_bit(STRIPE_HANDLE
, &sh
->state
);
4885 list_add(&sh
->lru
, &stripes
);
4887 spin_lock_irq(&conf
->device_lock
);
4888 if (mddev
->reshape_backwards
)
4889 conf
->reshape_progress
-= reshape_sectors
* new_data_disks
;
4891 conf
->reshape_progress
+= reshape_sectors
* new_data_disks
;
4892 spin_unlock_irq(&conf
->device_lock
);
4893 /* Ok, those stripe are ready. We can start scheduling
4894 * reads on the source stripes.
4895 * The source stripes are determined by mapping the first and last
4896 * block on the destination stripes.
4899 raid5_compute_sector(conf
, stripe_addr
*(new_data_disks
),
4902 raid5_compute_sector(conf
, ((stripe_addr
+reshape_sectors
)
4903 * new_data_disks
- 1),
4905 if (last_sector
>= mddev
->dev_sectors
)
4906 last_sector
= mddev
->dev_sectors
- 1;
4907 while (first_sector
<= last_sector
) {
4908 sh
= get_active_stripe(conf
, first_sector
, 1, 0, 1);
4909 set_bit(STRIPE_EXPAND_SOURCE
, &sh
->state
);
4910 set_bit(STRIPE_HANDLE
, &sh
->state
);
4912 first_sector
+= STRIPE_SECTORS
;
4914 /* Now that the sources are clearly marked, we can release
4915 * the destination stripes
4917 while (!list_empty(&stripes
)) {
4918 sh
= list_entry(stripes
.next
, struct stripe_head
, lru
);
4919 list_del_init(&sh
->lru
);
4922 /* If this takes us to the resync_max point where we have to pause,
4923 * then we need to write out the superblock.
4925 sector_nr
+= reshape_sectors
;
4926 if ((sector_nr
- mddev
->curr_resync_completed
) * 2
4927 >= mddev
->resync_max
- mddev
->curr_resync_completed
) {
4928 /* Cannot proceed until we've updated the superblock... */
4929 wait_event(conf
->wait_for_overlap
,
4930 atomic_read(&conf
->reshape_stripes
) == 0
4931 || test_bit(MD_RECOVERY_INTR
, &mddev
->recovery
));
4932 if (atomic_read(&conf
->reshape_stripes
) != 0)
4934 mddev
->reshape_position
= conf
->reshape_progress
;
4935 mddev
->curr_resync_completed
= sector_nr
;
4936 conf
->reshape_checkpoint
= jiffies
;
4937 set_bit(MD_CHANGE_DEVS
, &mddev
->flags
);
4938 md_wakeup_thread(mddev
->thread
);
4939 wait_event(mddev
->sb_wait
,
4940 !test_bit(MD_CHANGE_DEVS
, &mddev
->flags
)
4941 || test_bit(MD_RECOVERY_INTR
, &mddev
->recovery
));
4942 if (test_bit(MD_RECOVERY_INTR
, &mddev
->recovery
))
4944 spin_lock_irq(&conf
->device_lock
);
4945 conf
->reshape_safe
= mddev
->reshape_position
;
4946 spin_unlock_irq(&conf
->device_lock
);
4947 wake_up(&conf
->wait_for_overlap
);
4948 sysfs_notify(&mddev
->kobj
, NULL
, "sync_completed");
4951 return reshape_sectors
;
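/*
 * Illustrative sketch, not part of the driver: how writepos/readpos/safepos
 * relate for a forward (growing) reshape, using made-up numbers.  The
 * sketch_ name and the constants are assumptions for the example only; the
 * per-device conversion follows the divisions shown above.
 */
static void sketch_reshape_positions(void)
{
	unsigned long long reshape_progress = 1024;	/* array sectors already reshaped */
	unsigned long long reshape_safe = 512;		/* last checkpoint recorded in the metadata */
	unsigned int reshape_sectors = 128;		/* one chunk handled per pass */
	unsigned int data_disks = 3, new_data_disks = 4;

	/* per-device addresses, as computed above for the forward case */
	unsigned long long writepos = reshape_progress / new_data_disks + reshape_sectors; /* 384 */
	unsigned long long readpos  = reshape_progress / data_disks - reshape_sectors;     /* 213 */
	unsigned long long safepos  = reshape_safe / data_disks - reshape_sectors;         /* 42  */

	/*
	 * Here safepos (42) is behind writepos (384) but readpos (213) is not
	 * beyond writepos, so only the 10 second timer forces a superblock
	 * update; once readpos also moves past writepos the metadata must be
	 * written before any further destination stripes are touched.
	 */
	(void)writepos; (void)readpos; (void)safepos;
}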
4954 /* FIXME go_faster isn't used */
4955 static inline sector_t
sync_request(struct mddev
*mddev
, sector_t sector_nr
, int *skipped
, int go_faster
)
4957 struct r5conf
*conf
= mddev
->private;
4958 struct stripe_head
*sh
;
4959 sector_t max_sector
= mddev
->dev_sectors
;
4960 sector_t sync_blocks
;
4961 int still_degraded
= 0;
4964 if (sector_nr
>= max_sector
) {
4965 /* just being told to finish up .. nothing much to do */
4967 if (test_bit(MD_RECOVERY_RESHAPE
, &mddev
->recovery
)) {
4972 if (mddev
->curr_resync
< max_sector
) /* aborted */
4973 bitmap_end_sync(mddev
->bitmap
, mddev
->curr_resync
,
4975 else /* completed sync */
4977 bitmap_close_sync(mddev
->bitmap
);
4982 /* Allow raid5_quiesce to complete */
4983 wait_event(conf
->wait_for_overlap
, conf
->quiesce
!= 2);
4985 if (test_bit(MD_RECOVERY_RESHAPE
, &mddev
->recovery
))
4986 return reshape_request(mddev
, sector_nr
, skipped
);
4988 /* No need to check resync_max as we never do more than one
4989 * stripe, and as resync_max will always be on a chunk boundary,
4990 * if the check in md_do_sync didn't fire, there is no chance
4991 * of overstepping resync_max here
4994 /* if there is too many failed drives and we are trying
4995 * to resync, then assert that we are finished, because there is
4996 * nothing we can do.
4998 if (mddev
->degraded
>= conf
->max_degraded
&&
4999 test_bit(MD_RECOVERY_SYNC
, &mddev
->recovery
)) {
5000 sector_t rv
= mddev
->dev_sectors
- sector_nr
;
5004 if (!test_bit(MD_RECOVERY_REQUESTED
, &mddev
->recovery
) &&
5006 !bitmap_start_sync(mddev
->bitmap
, sector_nr
, &sync_blocks
, 1) &&
5007 sync_blocks
>= STRIPE_SECTORS
) {
5008 /* we can skip this block, and probably more */
5009 sync_blocks
/= STRIPE_SECTORS
;
5011 return sync_blocks
* STRIPE_SECTORS
; /* keep things rounded to whole stripes */
5014 bitmap_cond_end_sync(mddev
->bitmap
, sector_nr
);
5016 sh
= get_active_stripe(conf
, sector_nr
, 0, 1, 0);
5018 sh
= get_active_stripe(conf
, sector_nr
, 0, 0, 0);
5019 /* make sure we don't swamp the stripe cache if someone else
5020 * is trying to get access
5022 schedule_timeout_uninterruptible(1);
5024 /* Need to check if array will still be degraded after recovery/resync
5025 * We don't need to check the 'failed' flag as when that gets set,
5028 for (i
= 0; i
< conf
->raid_disks
; i
++)
5029 if (conf
->disks
[i
].rdev
== NULL
)
5032 bitmap_start_sync(mddev
->bitmap
, sector_nr
, &sync_blocks
, still_degraded
);
5034 set_bit(STRIPE_SYNC_REQUESTED
, &sh
->state
);
5039 return STRIPE_SECTORS
;
5042 static int retry_aligned_read(struct r5conf
*conf
, struct bio
*raid_bio
)
5044 /* We may not be able to submit a whole bio at once as there
5045 * may not be enough stripe_heads available.
5046 * We cannot pre-allocate enough stripe_heads as we may need
5047 * more than exist in the cache (if we allow ever large chunks).
5048 * So we do one stripe head at a time and record in
5049 * ->bi_hw_segments how many have been done.
5051 * We *know* that this entire raid_bio is in one chunk, so
5052 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
5054 struct stripe_head
*sh
;
5056 sector_t sector
, logical_sector
, last_sector
;
5061 logical_sector
= raid_bio
->bi_iter
.bi_sector
&
5062 ~((sector_t
)STRIPE_SECTORS
-1);
5063 sector
= raid5_compute_sector(conf
, logical_sector
,
5065 last_sector
= bio_end_sector(raid_bio
);
5067 for (; logical_sector
< last_sector
;
5068 logical_sector
+= STRIPE_SECTORS
,
5069 sector
+= STRIPE_SECTORS
,
5072 if (scnt
< raid5_bi_processed_stripes(raid_bio
))
5073 /* already done this stripe */
5076 sh
= get_active_stripe(conf
, sector
, 0, 1, 0);
5079 /* failed to get a stripe - must wait */
5080 raid5_set_bi_processed_stripes(raid_bio
, scnt
);
5081 conf
->retry_read_aligned
= raid_bio
;
5085 if (!add_stripe_bio(sh
, raid_bio
, dd_idx
, 0)) {
5087 raid5_set_bi_processed_stripes(raid_bio
, scnt
);
5088 conf
->retry_read_aligned
= raid_bio
;
5092 set_bit(R5_ReadNoMerge
, &sh
->dev
[dd_idx
].flags
);
5097 remaining
= raid5_dec_bi_active_stripes(raid_bio
);
5098 if (remaining
== 0) {
5099 trace_block_bio_complete(bdev_get_queue(raid_bio
->bi_bdev
),
5101 bio_endio(raid_bio
, 0);
5103 if (atomic_dec_and_test(&conf
->active_aligned_reads
))
5104 wake_up(&conf
->wait_for_stripe
);
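/*
 * Illustrative sketch, not part of the driver: the two counters that the
 * raid5_*_bi_*_stripes() helpers (see raid5.h) multiplex onto one word of
 * the bio while an aligned read is retried stripe by stripe.  The exact bit
 * split below is an assumption for the example; the idea is simply "active
 * references in the low half, stripes already processed in the high half".
 */
#define SKETCH_PROCESSED_SHIFT	16
static inline unsigned int sketch_active_stripes(unsigned int packed)
{
	return packed & ((1u << SKETCH_PROCESSED_SHIFT) - 1);
}
static inline unsigned int sketch_processed_stripes(unsigned int packed)
{
	return packed >> SKETCH_PROCESSED_SHIFT;
}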
5108 static int handle_active_stripes(struct r5conf
*conf
, int group
,
5109 struct r5worker
*worker
,
5110 struct list_head
*temp_inactive_list
)
5112 struct stripe_head
*batch
[MAX_STRIPE_BATCH
], *sh
;
5113 int i
, batch_size
= 0, hash
;
5114 bool release_inactive
= false;
5116 while (batch_size
< MAX_STRIPE_BATCH
&&
5117 (sh
= __get_priority_stripe(conf
, group
)) != NULL
)
5118 batch
[batch_size
++] = sh
;
5120 if (batch_size
== 0) {
5121 for (i
= 0; i
< NR_STRIPE_HASH_LOCKS
; i
++)
5122 if (!list_empty(temp_inactive_list
+ i
))
5124 if (i
== NR_STRIPE_HASH_LOCKS
)
5126 release_inactive
= true;
5128 spin_unlock_irq(&conf
->device_lock
);
5130 release_inactive_stripe_list(conf
, temp_inactive_list
,
5131 NR_STRIPE_HASH_LOCKS
);
5133 if (release_inactive
) {
5134 spin_lock_irq(&conf
->device_lock
);
5138 for (i
= 0; i
< batch_size
; i
++)
5139 handle_stripe(batch
[i
]);
5143 spin_lock_irq(&conf
->device_lock
);
5144 for (i
= 0; i
< batch_size
; i
++) {
5145 hash
= batch
[i
]->hash_lock_index
;
5146 __release_stripe(conf
, batch
[i
], &temp_inactive_list
[hash
]);
5151 static void raid5_do_work(struct work_struct
*work
)
5153 struct r5worker
*worker
= container_of(work
, struct r5worker
, work
);
5154 struct r5worker_group
*group
= worker
->group
;
5155 struct r5conf
*conf
= group
->conf
;
5156 int group_id
= group
- conf
->worker_groups
;
5158 struct blk_plug plug
;
5160 pr_debug("+++ raid5worker active\n");
5162 blk_start_plug(&plug
);
5164 spin_lock_irq(&conf
->device_lock
);
5166 int batch_size
, released
;
5168 released
= release_stripe_list(conf
, worker
->temp_inactive_list
);
5170 batch_size
= handle_active_stripes(conf
, group_id
, worker
,
5171 worker
->temp_inactive_list
);
5172 worker
->working
= false;
5173 if (!batch_size
&& !released
)
5175 handled
+= batch_size
;
5177 pr_debug("%d stripes handled\n", handled
);
5179 spin_unlock_irq(&conf
->device_lock
);
5180 blk_finish_plug(&plug
);
5182 pr_debug("--- raid5worker inactive\n");
5186 * This is our raid5 kernel thread.
5188 * We scan the hash table for stripes which can be handled now.
5189 * During the scan, completed stripes are saved for us by the interrupt
5190 * handler, so that they will not have to wait for our next wakeup.
5192 static void raid5d(struct md_thread
*thread
)
5194 struct mddev
*mddev
= thread
->mddev
;
5195 struct r5conf
*conf
= mddev
->private;
5197 struct blk_plug plug
;
5199 pr_debug("+++ raid5d active\n");
5201 md_check_recovery(mddev
);
5203 blk_start_plug(&plug
);
5205 spin_lock_irq(&conf
->device_lock
);
5208 int batch_size
, released
;
5210 released
= release_stripe_list(conf
, conf
->temp_inactive_list
);
5213 !list_empty(&conf
->bitmap_list
)) {
5214 /* Now is a good time to flush some bitmap updates */
5216 spin_unlock_irq(&conf
->device_lock
);
5217 bitmap_unplug(mddev
->bitmap
);
5218 spin_lock_irq(&conf
->device_lock
);
5219 conf
->seq_write
= conf
->seq_flush
;
5220 activate_bit_delay(conf
, conf
->temp_inactive_list
);
5222 raid5_activate_delayed(conf
);
5224 while ((bio
= remove_bio_from_retry(conf
))) {
5226 spin_unlock_irq(&conf
->device_lock
);
5227 ok
= retry_aligned_read(conf
, bio
);
5228 spin_lock_irq(&conf
->device_lock
);
5234 batch_size
= handle_active_stripes(conf
, ANY_GROUP
, NULL
,
5235 conf
->temp_inactive_list
);
5236 if (!batch_size
&& !released
)
5238 handled
+= batch_size
;
5240 if (mddev
->flags
& ~(1<<MD_CHANGE_PENDING
)) {
5241 spin_unlock_irq(&conf
->device_lock
);
5242 md_check_recovery(mddev
);
5243 spin_lock_irq(&conf
->device_lock
);
5246 pr_debug("%d stripes handled\n", handled
);
5248 spin_unlock_irq(&conf
->device_lock
);
5250 async_tx_issue_pending_all();
5251 blk_finish_plug(&plug
);
5253 pr_debug("--- raid5d inactive\n");
static ssize_t
raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
{
	struct r5conf *conf = mddev->private;
	if (conf)
		return sprintf(page, "%d\n", conf->max_nr_stripes);
	else
		return 0;
}
int
raid5_set_cache_size(struct mddev *mddev, int size)
{
	struct r5conf *conf = mddev->private;
	int err;
	int hash;

	if (size <= 16 || size > 32768)
		return -EINVAL;
	hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
	while (size < conf->max_nr_stripes) {
		if (drop_one_stripe(conf, hash))
			conf->max_nr_stripes--;
		else
			break;
		hash--;
		if (hash < 0)
			hash = NR_STRIPE_HASH_LOCKS - 1;
	}
	err = md_allow_write(mddev);
	if (err)
		return err;
	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
	while (size > conf->max_nr_stripes) {
		if (grow_one_stripe(conf, hash))
			conf->max_nr_stripes++;
		else
			break;
		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
	}
	return 0;
}
EXPORT_SYMBOL(raid5_set_cache_size);
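/*
 * Illustrative sketch, not part of the driver: the bucket rotation the two
 * loops above rely on.  Stripes are spread over NR_STRIPE_HASH_LOCKS
 * inactive lists, so growing walks the buckets forwards and shrinking walks
 * them backwards; either way a change of N stripes touches each bucket
 * about N / NR_STRIPE_HASH_LOCKS times and the lists stay balanced.  The
 * sketch_ helpers are invented names for the example.
 */
static inline int sketch_next_grow_bucket(int hash, int nr_buckets)
{
	return (hash + 1) % nr_buckets;
}
static inline int sketch_next_shrink_bucket(int hash, int nr_buckets)
{
	return hash ? hash - 1 : nr_buckets - 1;
}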
static ssize_t
raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
{
	struct r5conf *conf = mddev->private;
	unsigned long new;
	int err;

	if (len >= PAGE_SIZE)
		return -EINVAL;
	if (!conf)
		return 0;

	if (kstrtoul(page, 10, &new))
		return -EINVAL;
	err = raid5_set_cache_size(mddev, new);
	if (err)
		return err;
	return len;
}

static struct md_sysfs_entry
raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
				raid5_show_stripe_cache_size,
				raid5_store_stripe_cache_size);
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
{
	struct r5conf *conf = mddev->private;
	if (conf)
		return sprintf(page, "%d\n", conf->bypass_threshold);
	else
		return 0;
}

static ssize_t
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
{
	struct r5conf *conf = mddev->private;
	unsigned long new;
	if (len >= PAGE_SIZE)
		return -EINVAL;
	if (!conf)
		return 0;

	if (kstrtoul(page, 10, &new))
		return -EINVAL;
	if (new > conf->max_nr_stripes)
		return -EINVAL;
	conf->bypass_threshold = new;
	return len;
}

static struct md_sysfs_entry
raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
					S_IRUGO | S_IWUSR,
					raid5_show_preread_threshold,
					raid5_store_preread_threshold);
static ssize_t
stripe_cache_active_show(struct mddev *mddev, char *page)
{
	struct r5conf *conf = mddev->private;
	if (conf)
		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
	else
		return 0;
}

static struct md_sysfs_entry
raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
static ssize_t
raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
{
	struct r5conf *conf = mddev->private;
	if (conf)
		return sprintf(page, "%d\n", conf->worker_cnt_per_group);
	else
		return 0;
}
static int alloc_thread_groups(struct r5conf *conf, int cnt,
			       int *group_cnt,
			       int *worker_cnt_per_group,
			       struct r5worker_group **worker_groups);
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
{
	struct r5conf *conf = mddev->private;
	unsigned long new;
	int err;
	struct r5worker_group *new_groups, *old_groups;
	int group_cnt, worker_cnt_per_group;

	if (len >= PAGE_SIZE)
		return -EINVAL;
	if (!conf)
		return -ENODEV;

	if (kstrtoul(page, 10, &new))
		return -EINVAL;

	if (new == conf->worker_cnt_per_group)
		return len;

	mddev_suspend(mddev);

	old_groups = conf->worker_groups;
	if (old_groups)
		flush_workqueue(raid5_wq);

	err = alloc_thread_groups(conf, new,
				  &group_cnt, &worker_cnt_per_group,
				  &new_groups);
	if (!err) {
		spin_lock_irq(&conf->device_lock);
		conf->group_cnt = group_cnt;
		conf->worker_cnt_per_group = worker_cnt_per_group;
		conf->worker_groups = new_groups;
		spin_unlock_irq(&conf->device_lock);

		if (old_groups)
			kfree(old_groups[0].workers);
		kfree(old_groups);
	}

	mddev_resume(mddev);

	if (err)
		return err;
	return len;
}

static struct md_sysfs_entry
raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
				raid5_show_group_thread_cnt,
				raid5_store_group_thread_cnt);
static struct attribute *raid5_attrs[] =  {
	&raid5_stripecache_size.attr,
	&raid5_stripecache_active.attr,
	&raid5_preread_bypass_threshold.attr,
	&raid5_group_thread_cnt.attr,
	NULL,
};
static struct attribute_group raid5_attrs_group = {
	.name = NULL,
	.attrs = raid5_attrs,
};
static int alloc_thread_groups(struct r5conf *conf, int cnt,
			       int *group_cnt,
			       int *worker_cnt_per_group,
			       struct r5worker_group **worker_groups)
{
	int i, j, k;
	ssize_t size;
	struct r5worker *workers;

	*worker_cnt_per_group = cnt;
	if (cnt == 0) {
		*group_cnt = 0;
		*worker_groups = NULL;
		return 0;
	}
	*group_cnt = num_possible_nodes();
	size = sizeof(struct r5worker) * cnt;
	workers = kzalloc(size * *group_cnt, GFP_NOIO);
	*worker_groups = kzalloc(sizeof(struct r5worker_group) *
				*group_cnt, GFP_NOIO);
	if (!*worker_groups || !workers) {
		kfree(workers);
		kfree(*worker_groups);
		return -ENOMEM;
	}

	for (i = 0; i < *group_cnt; i++) {
		struct r5worker_group *group;

		group = &(*worker_groups)[i];
		INIT_LIST_HEAD(&group->handle_list);
		group->conf = conf;
		group->workers = workers + i * cnt;

		for (j = 0; j < cnt; j++) {
			struct r5worker *worker = group->workers + j;
			worker->group = group;
			INIT_WORK(&worker->work, raid5_do_work);

			for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
				INIT_LIST_HEAD(worker->temp_inactive_list + k);
		}
	}

	return 0;
}
static void free_thread_groups(struct r5conf *conf)
{
	if (conf->worker_groups)
		kfree(conf->worker_groups[0].workers);
	kfree(conf->worker_groups);
	conf->worker_groups = NULL;
}
static sector_t
raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	struct r5conf *conf = mddev->private;

	if (!sectors)
		sectors = mddev->dev_sectors;
	if (!raid_disks)
		/* size is defined by the smallest of previous and new size */
		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);

	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
	return sectors * (raid_disks - conf->max_degraded);
}
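/*
 * Illustrative sketch, not part of the driver: the capacity rule raid5_size()
 * applies, with example numbers.  The sketch_ name is invented; only one
 * chunk mask is applied here, whereas the real function masks with both the
 * old and the new chunk size.
 */
static inline unsigned long long sketch_array_sectors(unsigned long long member_sectors,
						      unsigned int chunk_sectors,
						      int raid_disks, int max_degraded)
{
	/* round each member down to a whole number of chunks, then drop parity space */
	member_sectors &= ~((unsigned long long)chunk_sectors - 1);
	return member_sectors * (raid_disks - max_degraded);
}
/*
 * Example: a 5-device RAID-6 (max_degraded == 2) exports three members'
 * worth of data sectors; the same five members as RAID-5 (max_degraded == 1)
 * would export four.
 */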
static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
	safe_put_page(percpu->spare_page);
	kfree(percpu->scribble);
	percpu->spare_page = NULL;
	percpu->scribble = NULL;
}

static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
{
	if (conf->level == 6 && !percpu->spare_page)
		percpu->spare_page = alloc_page(GFP_KERNEL);
	if (!percpu->scribble)
		percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);

	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
		free_scratch_buffer(conf, percpu);
		return -ENOMEM;
	}

	return 0;
}
static void raid5_free_percpu(struct r5conf *conf)
{
	unsigned long cpu;

	if (!conf->percpu)
		return;

#ifdef CONFIG_HOTPLUG_CPU
	unregister_cpu_notifier(&conf->cpu_notify);
#endif

	get_online_cpus();
	for_each_possible_cpu(cpu)
		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
	put_online_cpus();

	free_percpu(conf->percpu);
}

static void free_conf(struct r5conf *conf)
{
	free_thread_groups(conf);
	shrink_stripes(conf);
	raid5_free_percpu(conf);
	kfree(conf->disks);
	kfree(conf->stripe_hashtbl);
	kfree(conf);
}
5573 #ifdef CONFIG_HOTPLUG_CPU
5574 static int raid456_cpu_notify(struct notifier_block
*nfb
, unsigned long action
,
5577 struct r5conf
*conf
= container_of(nfb
, struct r5conf
, cpu_notify
);
5578 long cpu
= (long)hcpu
;
5579 struct raid5_percpu
*percpu
= per_cpu_ptr(conf
->percpu
, cpu
);
5582 case CPU_UP_PREPARE
:
5583 case CPU_UP_PREPARE_FROZEN
:
5584 if (alloc_scratch_buffer(conf
, percpu
)) {
5585 pr_err("%s: failed memory allocation for cpu%ld\n",
5587 return notifier_from_errno(-ENOMEM
);
5591 case CPU_DEAD_FROZEN
:
5592 free_scratch_buffer(conf
, per_cpu_ptr(conf
->percpu
, cpu
));
5601 static int raid5_alloc_percpu(struct r5conf
*conf
)
5606 conf
->percpu
= alloc_percpu(struct raid5_percpu
);
5610 #ifdef CONFIG_HOTPLUG_CPU
5611 conf
->cpu_notify
.notifier_call
= raid456_cpu_notify
;
5612 conf
->cpu_notify
.priority
= 0;
5613 err
= register_cpu_notifier(&conf
->cpu_notify
);
5619 for_each_present_cpu(cpu
) {
5620 err
= alloc_scratch_buffer(conf
, per_cpu_ptr(conf
->percpu
, cpu
));
5622 pr_err("%s: failed memory allocation for cpu%ld\n",
5632 static struct r5conf
*setup_conf(struct mddev
*mddev
)
5634 struct r5conf
*conf
;
5635 int raid_disk
, memory
, max_disks
;
5636 struct md_rdev
*rdev
;
5637 struct disk_info
*disk
;
5640 int group_cnt
, worker_cnt_per_group
;
5641 struct r5worker_group
*new_group
;
5643 if (mddev
->new_level
!= 5
5644 && mddev
->new_level
!= 4
5645 && mddev
->new_level
!= 6) {
5646 printk(KERN_ERR
"md/raid:%s: raid level not set to 4/5/6 (%d)\n",
5647 mdname(mddev
), mddev
->new_level
);
5648 return ERR_PTR(-EIO
);
5650 if ((mddev
->new_level
== 5
5651 && !algorithm_valid_raid5(mddev
->new_layout
)) ||
5652 (mddev
->new_level
== 6
5653 && !algorithm_valid_raid6(mddev
->new_layout
))) {
5654 printk(KERN_ERR
"md/raid:%s: layout %d not supported\n",
5655 mdname(mddev
), mddev
->new_layout
);
5656 return ERR_PTR(-EIO
);
5658 if (mddev
->new_level
== 6 && mddev
->raid_disks
< 4) {
5659 printk(KERN_ERR
"md/raid:%s: not enough configured devices (%d, minimum 4)\n",
5660 mdname(mddev
), mddev
->raid_disks
);
5661 return ERR_PTR(-EINVAL
);
5664 if (!mddev
->new_chunk_sectors
||
5665 (mddev
->new_chunk_sectors
<< 9) % PAGE_SIZE
||
5666 !is_power_of_2(mddev
->new_chunk_sectors
)) {
5667 printk(KERN_ERR
"md/raid:%s: invalid chunk size %d\n",
5668 mdname(mddev
), mddev
->new_chunk_sectors
<< 9);
5669 return ERR_PTR(-EINVAL
);
5672 conf
= kzalloc(sizeof(struct r5conf
), GFP_KERNEL
);
5675 /* Don't enable multi-threading by default*/
5676 if (!alloc_thread_groups(conf
, 0, &group_cnt
, &worker_cnt_per_group
,
5678 conf
->group_cnt
= group_cnt
;
5679 conf
->worker_cnt_per_group
= worker_cnt_per_group
;
5680 conf
->worker_groups
= new_group
;
5683 spin_lock_init(&conf
->device_lock
);
5684 seqcount_init(&conf
->gen_lock
);
5685 init_waitqueue_head(&conf
->wait_for_stripe
);
5686 init_waitqueue_head(&conf
->wait_for_overlap
);
5687 INIT_LIST_HEAD(&conf
->handle_list
);
5688 INIT_LIST_HEAD(&conf
->hold_list
);
5689 INIT_LIST_HEAD(&conf
->delayed_list
);
5690 INIT_LIST_HEAD(&conf
->bitmap_list
);
5691 init_llist_head(&conf
->released_stripes
);
5692 atomic_set(&conf
->active_stripes
, 0);
5693 atomic_set(&conf
->preread_active_stripes
, 0);
5694 atomic_set(&conf
->active_aligned_reads
, 0);
5695 conf
->bypass_threshold
= BYPASS_THRESHOLD
;
5696 conf
->recovery_disabled
= mddev
->recovery_disabled
- 1;
5698 conf
->raid_disks
= mddev
->raid_disks
;
5699 if (mddev
->reshape_position
== MaxSector
)
5700 conf
->previous_raid_disks
= mddev
->raid_disks
;
5702 conf
->previous_raid_disks
= mddev
->raid_disks
- mddev
->delta_disks
;
5703 max_disks
= max(conf
->raid_disks
, conf
->previous_raid_disks
);
5704 conf
->scribble_len
= scribble_len(max_disks
);
5706 conf
->disks
= kzalloc(max_disks
* sizeof(struct disk_info
),
5711 conf
->mddev
= mddev
;
5713 if ((conf
->stripe_hashtbl
= kzalloc(PAGE_SIZE
, GFP_KERNEL
)) == NULL
)
5716 /* We init hash_locks[0] separately to that it can be used
5717 * as the reference lock in the spin_lock_nest_lock() call
5718 * in lock_all_device_hash_locks_irq in order to convince
5719 * lockdep that we know what we are doing.
5721 spin_lock_init(conf
->hash_locks
);
5722 for (i
= 1; i
< NR_STRIPE_HASH_LOCKS
; i
++)
5723 spin_lock_init(conf
->hash_locks
+ i
);
5725 for (i
= 0; i
< NR_STRIPE_HASH_LOCKS
; i
++)
5726 INIT_LIST_HEAD(conf
->inactive_list
+ i
);
5728 for (i
= 0; i
< NR_STRIPE_HASH_LOCKS
; i
++)
5729 INIT_LIST_HEAD(conf
->temp_inactive_list
+ i
);
5731 conf
->level
= mddev
->new_level
;
5732 if (raid5_alloc_percpu(conf
) != 0)
5735 pr_debug("raid456: run(%s) called.\n", mdname(mddev
));
5737 rdev_for_each(rdev
, mddev
) {
5738 raid_disk
= rdev
->raid_disk
;
5739 if (raid_disk
>= max_disks
5742 disk
= conf
->disks
+ raid_disk
;
5744 if (test_bit(Replacement
, &rdev
->flags
)) {
5745 if (disk
->replacement
)
5747 disk
->replacement
= rdev
;
5754 if (test_bit(In_sync
, &rdev
->flags
)) {
5755 char b
[BDEVNAME_SIZE
];
5756 printk(KERN_INFO
"md/raid:%s: device %s operational as raid"
5758 mdname(mddev
), bdevname(rdev
->bdev
, b
), raid_disk
);
5759 } else if (rdev
->saved_raid_disk
!= raid_disk
)
5760 /* Cannot rely on bitmap to complete recovery */
5764 conf
->chunk_sectors
= mddev
->new_chunk_sectors
;
5765 conf
->level
= mddev
->new_level
;
5766 if (conf
->level
== 6)
5767 conf
->max_degraded
= 2;
5769 conf
->max_degraded
= 1;
5770 conf
->algorithm
= mddev
->new_layout
;
5771 conf
->reshape_progress
= mddev
->reshape_position
;
5772 if (conf
->reshape_progress
!= MaxSector
) {
5773 conf
->prev_chunk_sectors
= mddev
->chunk_sectors
;
5774 conf
->prev_algo
= mddev
->layout
;
5777 memory
= conf
->max_nr_stripes
* (sizeof(struct stripe_head
) +
5778 max_disks
* ((sizeof(struct bio
) + PAGE_SIZE
))) / 1024;
5779 atomic_set(&conf
->empty_inactive_list_nr
, NR_STRIPE_HASH_LOCKS
);
5780 if (grow_stripes(conf
, NR_STRIPES
)) {
5782 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5783 mdname(mddev
), memory
);
5786 printk(KERN_INFO
"md/raid:%s: allocated %dkB\n",
5787 mdname(mddev
), memory
);
5789 sprintf(pers_name
, "raid%d", mddev
->new_level
);
5790 conf
->thread
= md_register_thread(raid5d
, mddev
, pers_name
);
5791 if (!conf
->thread
) {
5793 "md/raid:%s: couldn't allocate thread.\n",
5803 return ERR_PTR(-EIO
);
5805 return ERR_PTR(-ENOMEM
);
static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
{
	switch (algo) {
	case ALGORITHM_PARITY_0:
		if (raid_disk < max_degraded)
			return 1;
		break;
	case ALGORITHM_PARITY_N:
		if (raid_disk >= raid_disks - max_degraded)
			return 1;
		break;
	case ALGORITHM_PARITY_0_6:
		if (raid_disk == 0 ||
		    raid_disk == raid_disks - 1)
			return 1;
		break;
	case ALGORITHM_LEFT_ASYMMETRIC_6:
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
	case ALGORITHM_LEFT_SYMMETRIC_6:
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		if (raid_disk == raid_disks - 1)
			return 1;
	}
	return 0;
}
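/*
 * Illustrative example, not part of the driver: what only_parity() reports
 * for a 6-device array (raid_disks == 6) with max_degraded == 2:
 *
 *	only_parity(0, ALGORITHM_PARITY_0, 6, 2) == 1        (disks 0 and 1)
 *	only_parity(5, ALGORITHM_PARITY_N, 6, 2) == 1        (disks 4 and 5)
 *	only_parity(5, ALGORITHM_LEFT_SYMMETRIC_6, 6, 2) == 1 (dedicated Q drive)
 *	only_parity(2, ALGORITHM_LEFT_SYMMETRIC, 6, 2) == 0   (rotating layout, disk also holds data)
 *
 * run() uses this to decide whether a device that is not fully recovered can
 * only affect parity blocks rather than data.
 */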
5835 static int run(struct mddev
*mddev
)
5837 struct r5conf
*conf
;
5838 int working_disks
= 0;
5839 int dirty_parity_disks
= 0;
5840 struct md_rdev
*rdev
;
5841 sector_t reshape_offset
= 0;
5843 long long min_offset_diff
= 0;
5846 if (mddev
->recovery_cp
!= MaxSector
)
5847 printk(KERN_NOTICE
"md/raid:%s: not clean"
5848 " -- starting background reconstruction\n",
5851 rdev_for_each(rdev
, mddev
) {
5853 if (rdev
->raid_disk
< 0)
5855 diff
= (rdev
->new_data_offset
- rdev
->data_offset
);
5857 min_offset_diff
= diff
;
5859 } else if (mddev
->reshape_backwards
&&
5860 diff
< min_offset_diff
)
5861 min_offset_diff
= diff
;
5862 else if (!mddev
->reshape_backwards
&&
5863 diff
> min_offset_diff
)
5864 min_offset_diff
= diff
;
5867 if (mddev
->reshape_position
!= MaxSector
) {
5868 /* Check that we can continue the reshape.
5869 * Difficulties arise if the stripe we would write to
5870 * next is at or after the stripe we would read from next.
5871 * For a reshape that changes the number of devices, this
5872 * is only possible for a very short time, and mdadm makes
5873 * sure that time appears to have past before assembling
5874 * the array. So we fail if that time hasn't passed.
5875 * For a reshape that keeps the number of devices the same
5876 * mdadm must be monitoring the reshape can keeping the
5877 * critical areas read-only and backed up. It will start
5878 * the array in read-only mode, so we check for that.
5880 sector_t here_new
, here_old
;
5882 int max_degraded
= (mddev
->level
== 6 ? 2 : 1);
5884 if (mddev
->new_level
!= mddev
->level
) {
5885 printk(KERN_ERR
"md/raid:%s: unsupported reshape "
5886 "required - aborting.\n",
5890 old_disks
= mddev
->raid_disks
- mddev
->delta_disks
;
5891 /* reshape_position must be on a new-stripe boundary, and one
5892 * further up in new geometry must map after here in old
5895 here_new
= mddev
->reshape_position
;
5896 if (sector_div(here_new
, mddev
->new_chunk_sectors
*
5897 (mddev
->raid_disks
- max_degraded
))) {
5898 printk(KERN_ERR
"md/raid:%s: reshape_position not "
5899 "on a stripe boundary\n", mdname(mddev
));
5902 reshape_offset
= here_new
* mddev
->new_chunk_sectors
;
5903 /* here_new is the stripe we will write to */
5904 here_old
= mddev
->reshape_position
;
5905 sector_div(here_old
, mddev
->chunk_sectors
*
5906 (old_disks
-max_degraded
));
5907 /* here_old is the first stripe that we might need to read
5909 if (mddev
->delta_disks
== 0) {
5910 if ((here_new
* mddev
->new_chunk_sectors
!=
5911 here_old
* mddev
->chunk_sectors
)) {
5912 printk(KERN_ERR
"md/raid:%s: reshape position is"
5913 " confused - aborting\n", mdname(mddev
));
5916 /* We cannot be sure it is safe to start an in-place
5917 * reshape. It is only safe if user-space is monitoring
5918 * and taking constant backups.
5919 * mdadm always starts a situation like this in
5920 * readonly mode so it can take control before
5921 * allowing any writes. So just check for that.
5923 if (abs(min_offset_diff
) >= mddev
->chunk_sectors
&&
5924 abs(min_offset_diff
) >= mddev
->new_chunk_sectors
)
5925 /* not really in-place - so OK */;
5926 else if (mddev
->ro
== 0) {
5927 printk(KERN_ERR
"md/raid:%s: in-place reshape "
5928 "must be started in read-only mode "
5933 } else if (mddev
->reshape_backwards
5934 ? (here_new
* mddev
->new_chunk_sectors
+ min_offset_diff
<=
5935 here_old
* mddev
->chunk_sectors
)
5936 : (here_new
* mddev
->new_chunk_sectors
>=
5937 here_old
* mddev
->chunk_sectors
+ (-min_offset_diff
))) {
5938 /* Reading from the same stripe as writing to - bad */
5939 printk(KERN_ERR
"md/raid:%s: reshape_position too early for "
5940 "auto-recovery - aborting.\n",
5944 printk(KERN_INFO
"md/raid:%s: reshape will continue\n",
5946 /* OK, we should be able to continue; */
5948 BUG_ON(mddev
->level
!= mddev
->new_level
);
5949 BUG_ON(mddev
->layout
!= mddev
->new_layout
);
5950 BUG_ON(mddev
->chunk_sectors
!= mddev
->new_chunk_sectors
);
5951 BUG_ON(mddev
->delta_disks
!= 0);
5954 if (mddev
->private == NULL
)
5955 conf
= setup_conf(mddev
);
5957 conf
= mddev
->private;
5960 return PTR_ERR(conf
);
5962 conf
->min_offset_diff
= min_offset_diff
;
5963 mddev
->thread
= conf
->thread
;
5964 conf
->thread
= NULL
;
5965 mddev
->private = conf
;
5967 for (i
= 0; i
< conf
->raid_disks
&& conf
->previous_raid_disks
;
5969 rdev
= conf
->disks
[i
].rdev
;
5970 if (!rdev
&& conf
->disks
[i
].replacement
) {
5971 /* The replacement is all we have yet */
5972 rdev
= conf
->disks
[i
].replacement
;
5973 conf
->disks
[i
].replacement
= NULL
;
5974 clear_bit(Replacement
, &rdev
->flags
);
5975 conf
->disks
[i
].rdev
= rdev
;
5979 if (conf
->disks
[i
].replacement
&&
5980 conf
->reshape_progress
!= MaxSector
) {
5981 /* replacements and reshape simply do not mix. */
5982 printk(KERN_ERR
"md: cannot handle concurrent "
5983 "replacement and reshape.\n");
5986 if (test_bit(In_sync
, &rdev
->flags
)) {
5990 /* This disc is not fully in-sync. However if it
5991 * just stored parity (beyond the recovery_offset),
5992 * when we don't need to be concerned about the
5993 * array being dirty.
5994 * When reshape goes 'backwards', we never have
5995 * partially completed devices, so we only need
5996 * to worry about reshape going forwards.
5998 /* Hack because v0.91 doesn't store recovery_offset properly. */
5999 if (mddev
->major_version
== 0 &&
6000 mddev
->minor_version
> 90)
6001 rdev
->recovery_offset
= reshape_offset
;
6003 if (rdev
->recovery_offset
< reshape_offset
) {
6004 /* We need to check old and new layout */
6005 if (!only_parity(rdev
->raid_disk
,
6008 conf
->max_degraded
))
6011 if (!only_parity(rdev
->raid_disk
,
6013 conf
->previous_raid_disks
,
6014 conf
->max_degraded
))
6016 dirty_parity_disks
++;
6020 * 0 for a fully functional array, 1 or 2 for a degraded array.
6022 mddev
->degraded
= calc_degraded(conf
);
6024 if (has_failed(conf
)) {
6025 printk(KERN_ERR
"md/raid:%s: not enough operational devices"
6026 " (%d/%d failed)\n",
6027 mdname(mddev
), mddev
->degraded
, conf
->raid_disks
);
6031 /* device size must be a multiple of chunk size */
6032 mddev
->dev_sectors
&= ~(mddev
->chunk_sectors
- 1);
6033 mddev
->resync_max_sectors
= mddev
->dev_sectors
;
6035 if (mddev
->degraded
> dirty_parity_disks
&&
6036 mddev
->recovery_cp
!= MaxSector
) {
6037 if (mddev
->ok_start_degraded
)
6039 "md/raid:%s: starting dirty degraded array"
6040 " - data corruption possible.\n",
6044 "md/raid:%s: cannot start dirty degraded array.\n",
6050 if (mddev
->degraded
== 0)
6051 printk(KERN_INFO
"md/raid:%s: raid level %d active with %d out of %d"
6052 " devices, algorithm %d\n", mdname(mddev
), conf
->level
,
6053 mddev
->raid_disks
-mddev
->degraded
, mddev
->raid_disks
,
6056 printk(KERN_ALERT
"md/raid:%s: raid level %d active with %d"
6057 " out of %d devices, algorithm %d\n",
6058 mdname(mddev
), conf
->level
,
6059 mddev
->raid_disks
- mddev
->degraded
,
6060 mddev
->raid_disks
, mddev
->new_layout
);
6062 print_raid5_conf(conf
);
6064 if (conf
->reshape_progress
!= MaxSector
) {
6065 conf
->reshape_safe
= conf
->reshape_progress
;
6066 atomic_set(&conf
->reshape_stripes
, 0);
6067 clear_bit(MD_RECOVERY_SYNC
, &mddev
->recovery
);
6068 clear_bit(MD_RECOVERY_CHECK
, &mddev
->recovery
);
6069 set_bit(MD_RECOVERY_RESHAPE
, &mddev
->recovery
);
6070 set_bit(MD_RECOVERY_RUNNING
, &mddev
->recovery
);
6071 mddev
->sync_thread
= md_register_thread(md_do_sync
, mddev
,
6076 /* Ok, everything is just fine now */
6077 if (mddev
->to_remove
== &raid5_attrs_group
)
6078 mddev
->to_remove
= NULL
;
6079 else if (mddev
->kobj
.sd
&&
6080 sysfs_create_group(&mddev
->kobj
, &raid5_attrs_group
))
6082 "raid5: failed to create sysfs attributes for %s\n",
6084 md_set_array_sectors(mddev
, raid5_size(mddev
, 0, 0));
6088 bool discard_supported
= true;
6089 /* read-ahead size must cover two whole stripes, which
6090 * is 2 * (datadisks) * chunksize where 'n' is the
6091 * number of raid devices
6093 int data_disks
= conf
->previous_raid_disks
- conf
->max_degraded
;
6094 int stripe
= data_disks
*
6095 ((mddev
->chunk_sectors
<< 9) / PAGE_SIZE
);
6096 if (mddev
->queue
->backing_dev_info
.ra_pages
< 2 * stripe
)
6097 mddev
->queue
->backing_dev_info
.ra_pages
= 2 * stripe
;
6099 blk_queue_merge_bvec(mddev
->queue
, raid5_mergeable_bvec
);
6101 mddev
->queue
->backing_dev_info
.congested_data
= mddev
;
6102 mddev
->queue
->backing_dev_info
.congested_fn
= raid5_congested
;
6104 chunk_size
= mddev
->chunk_sectors
<< 9;
6105 blk_queue_io_min(mddev
->queue
, chunk_size
);
6106 blk_queue_io_opt(mddev
->queue
, chunk_size
*
6107 (conf
->raid_disks
- conf
->max_degraded
));
6108 mddev
->queue
->limits
.raid_partial_stripes_expensive
= 1;
6110 * We can only discard a whole stripe. It doesn't make sense to
6111 * discard data disk but write parity disk
6113 stripe
= stripe
* PAGE_SIZE
;
6114 /* Round up to power of 2, as discard handling
6115 * currently assumes that */
6116 while ((stripe
-1) & stripe
)
6117 stripe
= (stripe
| (stripe
-1)) + 1;
6118 mddev
->queue
->limits
.discard_alignment
= stripe
;
6119 mddev
->queue
->limits
.discard_granularity
= stripe
;
6121 * unaligned part of discard request will be ignored, so can't
6122 * guarantee discard_zerors_data
6124 mddev
->queue
->limits
.discard_zeroes_data
= 0;
6126 blk_queue_max_write_same_sectors(mddev
->queue
, 0);
6128 rdev_for_each(rdev
, mddev
) {
6129 disk_stack_limits(mddev
->gendisk
, rdev
->bdev
,
6130 rdev
->data_offset
<< 9);
6131 disk_stack_limits(mddev
->gendisk
, rdev
->bdev
,
6132 rdev
->new_data_offset
<< 9);
6134 * discard_zeroes_data is required, otherwise data
6135 * could be lost. Consider a scenario: discard a stripe
6136 * (the stripe could be inconsistent if
6137 * discard_zeroes_data is 0); write one disk of the
6138 * stripe (the stripe could be inconsistent again
6139 * depending on which disks are used to calculate
6140 * parity); the disk is broken; The stripe data of this
6143 if (!blk_queue_discard(bdev_get_queue(rdev
->bdev
)) ||
6144 !bdev_get_queue(rdev
->bdev
)->
6145 limits
.discard_zeroes_data
)
6146 discard_supported
= false;
6149 if (discard_supported
&&
6150 mddev
->queue
->limits
.max_discard_sectors
>= stripe
&&
6151 mddev
->queue
->limits
.discard_granularity
>= stripe
)
6152 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD
,
6155 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD
,
6161 md_unregister_thread(&mddev
->thread
);
6162 print_raid5_conf(conf
);
6164 mddev
->private = NULL
;
6165 printk(KERN_ALERT
"md/raid:%s: failed to run raid set.\n", mdname(mddev
));
static int stop(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	md_unregister_thread(&mddev->thread);
	if (mddev->queue)
		mddev->queue->backing_dev_info.congested_fn = NULL;
	free_conf(conf);
	mddev->private = NULL;
	mddev->to_remove = &raid5_attrs_group;
	return 0;
}
static void status(struct seq_file *seq, struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	int i;

	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
		mddev->chunk_sectors / 2, mddev->layout);
	seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf(seq, "%s",
			   conf->disks[i].rdev &&
			   test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
	seq_printf(seq, "]");
}
static void print_raid5_conf (struct r5conf *conf)
{
	int i;
	struct disk_info *tmp;

	printk(KERN_DEBUG "RAID conf printout:\n");
	if (!conf) {
		printk("(conf==NULL)\n");
		return;
	}
	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
	       conf->raid_disks,
	       conf->raid_disks - conf->mddev->degraded);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->disks + i;
		if (tmp->rdev)
			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
			       i, !test_bit(Faulty, &tmp->rdev->flags),
			       bdevname(tmp->rdev->bdev, b));
	}
}

static int raid5_spare_active(struct mddev *mddev)
{
	int i;
	struct r5conf *conf = mddev->private;
	struct disk_info *tmp;
	int count = 0;
	unsigned long flags;

	for (i = 0; i < conf->raid_disks; i++) {
		tmp = conf->disks + i;
		if (tmp->replacement
		    && tmp->replacement->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->replacement->flags)
		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
			/* Replacement has just become active. */
			if (!tmp->rdev
			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
				count++;
			if (tmp->rdev) {
				/* Replaced device not technically faulty,
				 * but we need to be sure it gets removed
				 * and never re-added.
				 */
				set_bit(Faulty, &tmp->rdev->flags);
				sysfs_notify_dirent_safe(
					tmp->rdev->sysfs_state);
			}
			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
		} else if (tmp->rdev
		    && tmp->rdev->recovery_offset == MaxSector
		    && !test_bit(Faulty, &tmp->rdev->flags)
		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			count++;
			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
		}
	}
	spin_lock_irqsave(&conf->device_lock, flags);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	print_raid5_conf(conf);
	return count;
}

static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r5conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct md_rdev **rdevp;
	struct disk_info *p = conf->disks + number;

	print_raid5_conf(conf);
	if (rdev == p->rdev)
		rdevp = &p->rdev;
	else if (rdev == p->replacement)
		rdevp = &p->replacement;
	else
		return 0;

	if (number >= conf->raid_disks &&
	    conf->reshape_progress == MaxSector)
		clear_bit(In_sync, &rdev->flags);

	if (test_bit(In_sync, &rdev->flags) ||
	    atomic_read(&rdev->nr_pending)) {
		err = -EBUSY;
		goto abort;
	}
	/* Only remove non-faulty devices if recovery
	 * isn't possible.
	 */
	if (!test_bit(Faulty, &rdev->flags) &&
	    mddev->recovery_disabled != conf->recovery_disabled &&
	    !has_failed(conf) &&
	    (!p->replacement || p->replacement == rdev) &&
	    number < conf->raid_disks) {
		err = -EBUSY;
		goto abort;
	}
	*rdevp = NULL;
	synchronize_rcu();
	if (atomic_read(&rdev->nr_pending)) {
		/* lost the race, try later */
		err = -EBUSY;
		*rdevp = rdev;
	} else if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither - if they are careful
			   */
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/* We might have just removed the Replacement as faulty -
		 * clear the bit just in case
		 */
		clear_bit(WantReplacement, &rdev->flags);
abort:

	print_raid5_conf(conf);
	return err;
}

static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r5conf *conf = mddev->private;
	int err = -EEXIST;
	int disk;
	struct disk_info *p;
	int first = 0;
	int last = conf->raid_disks - 1;

	if (mddev->recovery_disabled == conf->recovery_disabled)
		return -EBUSY;

	if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
		return -EINVAL;

	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
	if (rdev->saved_raid_disk >= 0 &&
	    rdev->saved_raid_disk >= first &&
	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
		first = rdev->saved_raid_disk;

	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (p->rdev == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			if (rdev->saved_raid_disk != disk)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			goto out;
		}
	}
	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (test_bit(WantReplacement, &p->rdev->flags) &&
		    p->replacement == NULL) {
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			conf->fullsync = 1;
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}
	}
out:
	print_raid5_conf(conf);
	return err;
}

static int raid5_resize(struct mddev *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
	sector_t newsize;

	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
	if (mddev->external_size &&
	    mddev->array_sectors > newsize)
		return -EINVAL;
	if (mddev->bitmap) {
		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
		if (ret)
			return ret;
	}
	md_set_array_sectors(mddev, newsize);
	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
	if (sectors > mddev->dev_sectors &&
	    mddev->recovery_cp > mddev->dev_sectors) {
		mddev->recovery_cp = mddev->dev_sectors;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->dev_sectors = sectors;
	mddev->resync_max_sectors = sectors;
	return 0;
}
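
/*
 * Illustration (not part of the driver): raid5_resize() above relies on
 * chunk_sectors being a power of two, so masking with ~(chunk_sectors - 1)
 * rounds the requested size down to a whole number of chunks.  For example,
 * with 512KiB chunks (chunk_sectors == 1024):
 *
 *	sector_t sectors = 1000000;
 *	sectors &= ~((sector_t)1024 - 1);	// 1000000 & ~1023 == 999424
 *
 * i.e. 976 complete chunks; the trailing partial chunk is simply dropped.
 */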

static int check_stripe_cache(struct mddev *mddev)
{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If we require 4 times, then the default 256 4K stripe_heads will
	 * allow for chunk sizes up to 256K, which is probably OK.
	 * If the chunk size is greater, user-space should request more
	 * stripe_heads first.
	 */
	struct r5conf *conf = mddev->private;

	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->max_nr_stripes ||
	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->max_nr_stripes) {
		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
		       mdname(mddev),
		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
			/ STRIPE_SIZE) * 4);
		return 0;
	}
	return 1;
}
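
/*
 * Worked example (illustration only): with the default STRIPE_SIZE of one
 * 4KiB page and max_nr_stripes == 256, check_stripe_cache() accepts chunks
 * where 4 * (chunk_bytes / 4KiB) <= 256, i.e. chunks up to 256KiB.  A 1MiB
 * chunk would need 4 * (1MiB / 4KiB) = 1024 stripe_heads, so user-space
 * would have to raise the stripe cache first (e.g. via the
 * /sys/block/mdX/md/stripe_cache_size attribute) before such a reshape
 * is allowed.
 */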

static int check_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (mddev->delta_disks == 0 &&
	    mddev->new_layout == mddev->layout &&
	    mddev->new_chunk_sectors == mddev->chunk_sectors)
		return 0; /* nothing to do */
	if (has_failed(conf))
		return -EINVAL;
	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
	}

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	return resize_stripes(conf, (conf->previous_raid_disks
				     + mddev->delta_disks));
}

static int raid5_start_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	unsigned long flags;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (has_failed(conf))
		return -EINVAL;

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
	}

	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of the array_size
	 * attribute.
	 */
	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
	    < mddev->array_sectors) {
		printk(KERN_ERR "md/raid:%s: array size must be reduced "
		       "before number of disks\n", mdname(mddev));
		return -EINVAL;
	}

	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	write_seqcount_begin(&conf->gen_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
	conf->prev_chunk_sectors = conf->chunk_sectors;
	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->prev_algo = conf->algorithm;
	conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
	smp_mb();
	if (mddev->reshape_backwards)
		conf->reshape_progress = raid5_size(mddev, 0, 0);
	else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	write_seqcount_end(&conf->gen_lock);
	spin_unlock_irq(&conf->device_lock);

	/* Now make sure any requests that proceeded on the assumption
	 * the reshape wasn't running - like Discard or Read - have
	 * completed.
	 */
	mddev_suspend(mddev);
	mddev_resume(mddev);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		write_seqcount_begin(&conf->gen_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		mddev->new_chunk_sectors =
			conf->chunk_sectors = conf->prev_chunk_sectors;
		mddev->new_layout = conf->algorithm = conf->prev_algo;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->generation--;
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		write_seqcount_end(&conf->gen_lock);
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{
	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (number of data disks) * chunksize
		 */
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}
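
/*
 * Worked example (illustration only) of the read-ahead rule above: a RAID6
 * array of 6 devices (4 data disks) with 128KiB chunks and 4KiB pages has
 * stripe = 4 * (128KiB / 4KiB) = 128 pages per full stripe, so ra_pages is
 * raised to at least 2 * 128 = 256 pages (1MiB) if it is currently smaller.
 */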

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;

			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch (state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_cmd(conf->wait_for_stripe,
			       atomic_read(&conf->active_stripes) == 0 &&
			       atomic_read(&conf->active_aligned_reads) == 0,
			       unlock_all_device_hash_locks_irq(conf),
			       lock_all_device_hash_locks_irq(conf));
		conf->quiesce = 1;
		unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		lock_all_device_hash_locks_irq(conf);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		unlock_all_device_hash_locks_irq(conf);
		break;
	}
}
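
/*
 * Sketch (not driver code) of how the quiesce states above are typically
 * driven by the md core around a suspend/resume cycle; the exact call
 * sites live in md.c and the sequence here is only illustrative:
 *
 *	mddev->pers->quiesce(mddev, 1);	// drain and block new writes
 *	// ... array is quiet: safe to update metadata, suspend, etc. ...
 *	mddev->pers->quiesce(mddev, 0);	// re-enable writes, wake waiters
 *
 * State 2 is the narrower "resume for a suspend" case: it only wakes
 * wait_for_overlap so that a paused resync/reshape can make progress again.
 */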

static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}
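
/*
 * Worked example (illustration only) of the chunk-size search above: the
 * loop starts at 64KiB (chunksect == 128 sectors) and halves it until it
 * divides array_sectors exactly.  For a RAID1 of 1000204 sectors:
 *
 *	128 -> 64 -> 32 -> 16 -> 8 -> 4		(1000204 % 4 == 0, stop)
 *
 * 4 sectors is 2KiB, which is below STRIPE_SIZE (one page), so this
 * particular array would be rejected with -EINVAL rather than converted.
 */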

static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}

static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation -
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}
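
/*
 * Illustration (not part of the driver): a new chunk is accepted above only
 * if it is a power of two, at least one page, and divides the array size.
 * For example, on a 2-drive raid5 of 2097152 sectors (1GiB) with 4KiB pages
 * (PAGE_SIZE >> 9 == 8):
 *
 *	new_chunk = 1024 (512KiB): power of two, >= 8, divides array -> ok
 *	new_chunk =  768 (384KiB): not a power of two               -> -EINVAL
 *	new_chunk =    4 (2KiB):   smaller than one page            -> -EINVAL
 *
 * Because this array has only two drives, a valid change is applied
 * immediately; larger arrays just record it for a later reshape pass.
 */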

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}

static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};

static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");