2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 completely rewritten, based on the MD driver code from Marc Zyngier
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
11 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
12 - kmod support by: Cyrus Durgin
13 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
14 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 - lots of fixes and improvements to the RAID1/RAID5 and generic
17 RAID code (such as request based resynchronization):
19 Neil Brown <neilb@cse.unsw.edu.au>.
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2, or (at your option)
26 You should have received a copy of the GNU General Public License
27 (for example /usr/src/linux/COPYING); if not, write to the Free
28 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31 #include <linux/module.h>
32 #include <linux/config.h>
33 #include <linux/sysctl.h>
34 #include <linux/raid/md.h>
35 #include <linux/raid/xor.h>
36 #include <linux/devfs_fs_kernel.h>
38 #include <linux/init.h>
41 #include <linux/kmod.h>
44 #define __KERNEL_SYSCALLS__
45 #include <linux/unistd.h>
47 #include <asm/unaligned.h>
49 extern asmlinkage
int sys_sched_yield(void);
50 extern asmlinkage
long sys_setsid(void);
52 #define MAJOR_NR MD_MAJOR
55 #include <linux/blk.h>
59 # define dprintk(x...) printk(x)
61 # define dprintk(x...) do { } while(0)
64 static mdk_personality_t
*pers
[MAX_PERSONALITY
];
67 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
68 * is 100 KB/sec, so the extra system load does not show up that much.
69 * Increase it if you want to have more _guaranteed_ speed. Note that
70 * the RAID driver will use the maximum available bandwidth if the IO
71 * subsystem is idle. There is also an 'absolute maximum' reconstruction
72 * speed limit - in case reconstruction slows down your system despite
75 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
78 static int sysctl_speed_limit_min
= 100;
79 static int sysctl_speed_limit_max
= 100000;
81 static struct ctl_table_header
*raid_table_header
;
83 static ctl_table raid_table
[] = {
84 {DEV_RAID_SPEED_LIMIT_MIN
, "speed_limit_min",
85 &sysctl_speed_limit_min
, sizeof(int), 0644, NULL
, &proc_dointvec
},
86 {DEV_RAID_SPEED_LIMIT_MAX
, "speed_limit_max",
87 &sysctl_speed_limit_max
, sizeof(int), 0644, NULL
, &proc_dointvec
},
91 static ctl_table raid_dir_table
[] = {
92 {DEV_RAID
, "raid", NULL
, 0, 0555, raid_table
},
96 static ctl_table raid_root_table
[] = {
97 {CTL_DEV
, "dev", NULL
, 0, 0555, raid_dir_table
},
102 * these have to be allocated separately because external
103 * subsystems want to have a pre-defined structure
105 struct hd_struct md_hd_struct
[MAX_MD_DEVS
];
106 static int md_blocksizes
[MAX_MD_DEVS
];
107 static int md_hardsect_sizes
[MAX_MD_DEVS
];
108 static int md_maxreadahead
[MAX_MD_DEVS
];
109 static mdk_thread_t
*md_recovery_thread
;
111 int md_size
[MAX_MD_DEVS
];
113 extern struct block_device_operations md_fops
;
114 static devfs_handle_t devfs_handle
;
116 static struct gendisk md_gendisk
=
124 nr_real
: MAX_MD_DEVS
,
131 * Enables to iterate over all existing md arrays
133 static MD_LIST_HEAD(all_mddevs
);
136 * The mapping between kdev and mddev is not necessarily a simple
137 * one! Eg. HSM uses several sub-devices to implement Logical
138 * Volumes. All these sub-devices map to the same mddev.
140 dev_mapping_t mddev_map
[MAX_MD_DEVS
];
142 void add_mddev_mapping (mddev_t
* mddev
, kdev_t dev
, void *data
)
144 unsigned int minor
= MINOR(dev
);
146 if (MAJOR(dev
) != MD_MAJOR
) {
150 if (mddev_map
[minor
].mddev
!= NULL
) {
154 mddev_map
[minor
].mddev
= mddev
;
155 mddev_map
[minor
].data
= data
;
158 void del_mddev_mapping (mddev_t
* mddev
, kdev_t dev
)
160 unsigned int minor
= MINOR(dev
);
162 if (MAJOR(dev
) != MD_MAJOR
) {
166 if (mddev_map
[minor
].mddev
!= mddev
) {
170 mddev_map
[minor
].mddev
= NULL
;
171 mddev_map
[minor
].data
= NULL
;
174 static int md_make_request (request_queue_t
*q
, int rw
, struct buffer_head
* bh
)
176 mddev_t
*mddev
= kdev_to_mddev(bh
->b_rdev
);
178 if (mddev
&& mddev
->pers
)
179 return mddev
->pers
->make_request(mddev
, rw
, bh
);
186 static mddev_t
* alloc_mddev (kdev_t dev
)
190 if (MAJOR(dev
) != MD_MAJOR
) {
194 mddev
= (mddev_t
*) kmalloc(sizeof(*mddev
), GFP_KERNEL
);
198 memset(mddev
, 0, sizeof(*mddev
));
200 mddev
->__minor
= MINOR(dev
);
201 init_MUTEX(&mddev
->reconfig_sem
);
202 init_MUTEX(&mddev
->recovery_sem
);
203 init_MUTEX(&mddev
->resync_sem
);
204 MD_INIT_LIST_HEAD(&mddev
->disks
);
205 MD_INIT_LIST_HEAD(&mddev
->all_mddevs
);
208 * The 'base' mddev is the one with data NULL.
209 * personalities can create additional mddevs
212 add_mddev_mapping(mddev
, dev
, 0);
213 md_list_add(&mddev
->all_mddevs
, &all_mddevs
);
220 struct gendisk
* find_gendisk (kdev_t dev
)
222 struct gendisk
*tmp
= gendisk_head
;
224 while (tmp
!= NULL
) {
225 if (tmp
->major
== MAJOR(dev
))
232 mdk_rdev_t
* find_rdev_nr(mddev_t
*mddev
, int nr
)
235 struct md_list_head
*tmp
;
237 ITERATE_RDEV(mddev
,rdev
,tmp
) {
238 if (rdev
->desc_nr
== nr
)
244 mdk_rdev_t
* find_rdev(mddev_t
* mddev
, kdev_t dev
)
246 struct md_list_head
*tmp
;
249 ITERATE_RDEV(mddev
,rdev
,tmp
) {
250 if (rdev
->dev
== dev
)
256 static MD_LIST_HEAD(device_names
);
258 char * partition_name (kdev_t dev
)
261 static char nomem
[] = "<nomem>";
263 struct md_list_head
*tmp
= device_names
.next
;
265 while (tmp
!= &device_names
) {
266 dname
= md_list_entry(tmp
, dev_name_t
, list
);
267 if (dname
->dev
== dev
)
272 dname
= (dev_name_t
*) kmalloc(sizeof(*dname
), GFP_KERNEL
);
277 * ok, add this new device name to the list
279 hd
= find_gendisk (dev
);
282 dname
->name
= disk_name (hd
, MINOR(dev
), dname
->namebuf
);
284 sprintf (dname
->namebuf
, "[dev %s]", kdevname(dev
));
285 dname
->name
= dname
->namebuf
;
289 MD_INIT_LIST_HEAD(&dname
->list
);
290 md_list_add(&dname
->list
, &device_names
);
295 static unsigned int calc_dev_sboffset (kdev_t dev
, mddev_t
*mddev
,
298 unsigned int size
= 0;
300 if (blk_size
[MAJOR(dev
)])
301 size
= blk_size
[MAJOR(dev
)][MINOR(dev
)];
303 size
= MD_NEW_SIZE_BLOCKS(size
);
307 static unsigned int calc_dev_size (kdev_t dev
, mddev_t
*mddev
, int persistent
)
311 size
= calc_dev_sboffset(dev
, mddev
, persistent
);
316 if (mddev
->sb
->chunk_size
)
317 size
&= ~(mddev
->sb
->chunk_size
/1024 - 1);
321 static unsigned int zoned_raid_size (mddev_t
*mddev
)
325 struct md_list_head
*tmp
;
332 * do size and offset calculations.
334 mask
= ~(mddev
->sb
->chunk_size
/1024 - 1);
336 ITERATE_RDEV(mddev
,rdev
,tmp
) {
338 md_size
[mdidx(mddev
)] += rdev
->size
;
344 * We check whether all devices are numbered from 0 to nb_dev-1. The
345 * order is guaranteed even after device name changes.
347 * Some personalities (raid0, linear) use this. Personalities that
348 * provide data have to be able to deal with loss of individual
349 * disks, so they do their checking themselves.
351 int md_check_ordering (mddev_t
*mddev
)
355 struct md_list_head
*tmp
;
358 * First, all devices must be fully functional
360 ITERATE_RDEV(mddev
,rdev
,tmp
) {
362 printk("md: md%d's device %s faulty, aborting.\n",
363 mdidx(mddev
), partition_name(rdev
->dev
));
369 ITERATE_RDEV(mddev
,rdev
,tmp
) {
372 if (c
!= mddev
->nb_dev
) {
376 if (mddev
->nb_dev
!= mddev
->sb
->raid_disks
) {
377 printk("md: md%d, array needs %d disks, has %d, aborting.\n",
378 mdidx(mddev
), mddev
->sb
->raid_disks
, mddev
->nb_dev
);
382 * Now the numbering check
384 for (i
= 0; i
< mddev
->nb_dev
; i
++) {
386 ITERATE_RDEV(mddev
,rdev
,tmp
) {
387 if (rdev
->desc_nr
== i
)
391 printk("md: md%d, missing disk #%d, aborting.\n",
396 printk("md: md%d, too many disks #%d, aborting.\n",
406 static void remove_descriptor (mdp_disk_t
*disk
, mdp_super_t
*sb
)
408 if (disk_active(disk
)) {
411 if (disk_spare(disk
)) {
421 mark_disk_removed(disk
);
424 #define BAD_MAGIC KERN_ERR \
425 "md: invalid raid superblock magic on %s\n"
427 #define BAD_MINOR KERN_ERR \
428 "md: %s: invalid raid minor (%x)\n"
430 #define OUT_OF_MEM KERN_ALERT \
431 "md: out of memory.\n"
433 #define NO_SB KERN_ERR \
434 "md: disabled device %s, could not read superblock.\n"
436 #define BAD_CSUM KERN_WARNING \
437 "md: invalid superblock checksum on %s\n"
439 static int alloc_array_sb (mddev_t
* mddev
)
446 mddev
->sb
= (mdp_super_t
*) __get_free_page (GFP_KERNEL
);
449 md_clear_page(mddev
->sb
);
453 static int alloc_disk_sb (mdk_rdev_t
* rdev
)
458 rdev
->sb
= (mdp_super_t
*) __get_free_page(GFP_KERNEL
);
463 md_clear_page(rdev
->sb
);
468 static void free_disk_sb (mdk_rdev_t
* rdev
)
471 free_page((unsigned long) rdev
->sb
);
481 static void mark_rdev_faulty (mdk_rdev_t
* rdev
)
491 static int read_disk_sb (mdk_rdev_t
* rdev
)
494 struct buffer_head
*bh
= NULL
;
495 kdev_t dev
= rdev
->dev
;
497 unsigned long sb_offset
;
505 * Calculate the position of the superblock,
506 * it's at the end of the disk
508 sb_offset
= calc_dev_sboffset(rdev
->dev
, rdev
->mddev
, 1);
509 rdev
->sb_offset
= sb_offset
;
510 printk("(read) %s's sb offset: %ld", partition_name(dev
), sb_offset
);
512 set_blocksize (dev
, MD_SB_BYTES
);
513 bh
= bread (dev
, sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
516 sb
= (mdp_super_t
*) bh
->b_data
;
517 memcpy (rdev
->sb
, sb
, MD_SB_BYTES
);
519 printk (NO_SB
,partition_name(rdev
->dev
));
522 printk(" [events: %08lx]\n", (unsigned long)rdev
->sb
->events_lo
);
530 static unsigned int calc_sb_csum (mdp_super_t
* sb
)
532 unsigned int disk_csum
, csum
;
534 disk_csum
= sb
->sb_csum
;
536 csum
= csum_partial((void *)sb
, MD_SB_BYTES
, 0);
537 sb
->sb_csum
= disk_csum
;
542 * Check one RAID superblock for generic plausibility
545 static int check_disk_sb (mdk_rdev_t
* rdev
)
556 if (sb
->md_magic
!= MD_SB_MAGIC
) {
557 printk (BAD_MAGIC
, partition_name(rdev
->dev
));
561 if (sb
->md_minor
>= MAX_MD_DEVS
) {
562 printk (BAD_MINOR
, partition_name(rdev
->dev
),
567 if (calc_sb_csum(sb
) != sb
->sb_csum
)
568 printk(BAD_CSUM
, partition_name(rdev
->dev
));
574 static kdev_t
dev_unit(kdev_t dev
)
577 struct gendisk
*hd
= find_gendisk(dev
);
581 mask
= ~((1 << hd
->minor_shift
) - 1);
583 return MKDEV(MAJOR(dev
), MINOR(dev
) & mask
);
586 static mdk_rdev_t
* match_dev_unit(mddev_t
*mddev
, kdev_t dev
)
588 struct md_list_head
*tmp
;
591 ITERATE_RDEV(mddev
,rdev
,tmp
)
592 if (dev_unit(rdev
->dev
) == dev_unit(dev
))
598 static int match_mddev_units(mddev_t
*mddev1
, mddev_t
*mddev2
)
600 struct md_list_head
*tmp
;
603 ITERATE_RDEV(mddev1
,rdev
,tmp
)
604 if (match_dev_unit(mddev2
, rdev
->dev
))
610 static MD_LIST_HEAD(all_raid_disks
);
611 static MD_LIST_HEAD(pending_raid_disks
);
613 static void bind_rdev_to_array (mdk_rdev_t
* rdev
, mddev_t
* mddev
)
615 mdk_rdev_t
*same_pdev
;
621 same_pdev
= match_dev_unit(mddev
, rdev
->dev
);
624 "md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
625 " protection against single-disk failure might be compromised.\n",
626 mdidx(mddev
), partition_name(rdev
->dev
),
627 partition_name(same_pdev
->dev
));
629 md_list_add(&rdev
->same_set
, &mddev
->disks
);
632 printk("bind<%s,%d>\n", partition_name(rdev
->dev
), mddev
->nb_dev
);
635 static void unbind_rdev_from_array (mdk_rdev_t
* rdev
)
641 md_list_del(&rdev
->same_set
);
642 MD_INIT_LIST_HEAD(&rdev
->same_set
);
643 rdev
->mddev
->nb_dev
--;
644 printk("unbind<%s,%d>\n", partition_name(rdev
->dev
),
645 rdev
->mddev
->nb_dev
);
650 * prevent the device from being mounted, repartitioned or
651 * otherwise reused by a RAID array (or any other kernel
652 * subsystem), by opening the device. [simply getting an
653 * inode is not enough, the SCSI module usage code needs
654 * an explicit open() on the device]
656 static int lock_rdev (mdk_rdev_t
*rdev
)
661 * First insert a dummy inode.
665 rdev
->inode
= get_empty_inode();
669 * we don't care about any other fields
671 rdev
->inode
->i_dev
= rdev
->inode
->i_rdev
= rdev
->dev
;
672 insert_inode_hash(rdev
->inode
);
674 memset(&rdev
->filp
, 0, sizeof(rdev
->filp
));
675 rdev
->filp
.f_mode
= 3; /* read write */
679 static void unlock_rdev (mdk_rdev_t
*rdev
)
687 static void export_rdev (mdk_rdev_t
* rdev
)
689 printk("export_rdev(%s)\n",partition_name(rdev
->dev
));
694 md_list_del(&rdev
->all
);
695 MD_INIT_LIST_HEAD(&rdev
->all
);
696 if (rdev
->pending
.next
!= &rdev
->pending
) {
697 printk("(%s was pending)\n",partition_name(rdev
->dev
));
698 md_list_del(&rdev
->pending
);
699 MD_INIT_LIST_HEAD(&rdev
->pending
);
706 static void kick_rdev_from_array (mdk_rdev_t
* rdev
)
708 unbind_rdev_from_array(rdev
);
712 static void export_array (mddev_t
*mddev
)
714 struct md_list_head
*tmp
;
716 mdp_super_t
*sb
= mddev
->sb
;
720 free_page((unsigned long) sb
);
723 ITERATE_RDEV(mddev
,rdev
,tmp
) {
728 kick_rdev_from_array(rdev
);
734 static void free_mddev (mddev_t
*mddev
)
742 md_size
[mdidx(mddev
)] = 0;
743 md_hd_struct
[mdidx(mddev
)].nr_sects
= 0;
746 * Make sure nobody else is using this mddev
747 * (careful, we rely on the global kernel lock here)
749 while (md_atomic_read(&mddev
->resync_sem
.count
) != 1)
751 while (md_atomic_read(&mddev
->recovery_sem
.count
) != 1)
754 del_mddev_mapping(mddev
, MKDEV(MD_MAJOR
, mdidx(mddev
)));
755 md_list_del(&mddev
->all_mddevs
);
756 MD_INIT_LIST_HEAD(&mddev
->all_mddevs
);
766 static void print_desc(mdp_disk_t
*desc
)
768 printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc
->number
,
769 partition_name(MKDEV(desc
->major
,desc
->minor
)),
770 desc
->major
,desc
->minor
,desc
->raid_disk
,desc
->state
);
773 static void print_sb(mdp_super_t
*sb
)
777 printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
778 sb
->major_version
, sb
->minor_version
, sb
->patch_version
,
779 sb
->set_uuid0
, sb
->set_uuid1
, sb
->set_uuid2
, sb
->set_uuid3
,
781 printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb
->level
,
782 sb
->size
, sb
->nr_disks
, sb
->raid_disks
, sb
->md_minor
,
783 sb
->layout
, sb
->chunk_size
);
784 printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
785 sb
->utime
, sb
->state
, sb
->active_disks
, sb
->working_disks
,
786 sb
->failed_disks
, sb
->spare_disks
,
787 sb
->sb_csum
, (unsigned long)sb
->events_lo
);
789 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
792 desc
= sb
->disks
+ i
;
793 printk(" D %2d: ", i
);
797 print_desc(&sb
->this_disk
);
801 static void print_rdev(mdk_rdev_t
*rdev
)
803 printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
804 partition_name(rdev
->dev
), partition_name(rdev
->old_dev
),
805 rdev
->size
, rdev
->faulty
, rdev
->desc_nr
);
807 printk("rdev superblock:\n");
810 printk("no rdev superblock!\n");
813 void md_print_devices (void)
815 struct md_list_head
*tmp
, *tmp2
;
820 printk(" **********************************\n");
821 printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
822 printk(" **********************************\n");
823 ITERATE_MDDEV(mddev
,tmp
) {
824 printk("md%d: ", mdidx(mddev
));
826 ITERATE_RDEV(mddev
,rdev
,tmp2
)
827 printk("<%s>", partition_name(rdev
->dev
));
830 printk(" array superblock:\n");
833 printk(" no array superblock.\n");
835 ITERATE_RDEV(mddev
,rdev
,tmp2
)
838 printk(" **********************************\n");
842 static int sb_equal ( mdp_super_t
*sb1
, mdp_super_t
*sb2
)
845 mdp_super_t
*tmp1
, *tmp2
;
847 tmp1
= kmalloc(sizeof(*tmp1
),GFP_KERNEL
);
848 tmp2
= kmalloc(sizeof(*tmp2
),GFP_KERNEL
);
850 if (!tmp1
|| !tmp2
) {
859 * nr_disks is not constant
864 if (memcmp(tmp1
, tmp2
, MD_SB_GENERIC_CONSTANT_WORDS
* 4))
878 static int uuid_equal(mdk_rdev_t
*rdev1
, mdk_rdev_t
*rdev2
)
880 if ( (rdev1
->sb
->set_uuid0
== rdev2
->sb
->set_uuid0
) &&
881 (rdev1
->sb
->set_uuid1
== rdev2
->sb
->set_uuid1
) &&
882 (rdev1
->sb
->set_uuid2
== rdev2
->sb
->set_uuid2
) &&
883 (rdev1
->sb
->set_uuid3
== rdev2
->sb
->set_uuid3
))
890 static mdk_rdev_t
* find_rdev_all (kdev_t dev
)
892 struct md_list_head
*tmp
;
895 tmp
= all_raid_disks
.next
;
896 while (tmp
!= &all_raid_disks
) {
897 rdev
= md_list_entry(tmp
, mdk_rdev_t
, all
);
898 if (rdev
->dev
== dev
)
905 #define GETBLK_FAILED KERN_ERR \
906 "md: getblk failed for device %s\n"
908 static int write_disk_sb(mdk_rdev_t
* rdev
)
910 struct buffer_head
*bh
;
912 unsigned long sb_offset
, size
;
923 if (rdev
->sb
->md_magic
!= MD_SB_MAGIC
) {
929 sb_offset
= calc_dev_sboffset(dev
, rdev
->mddev
, 1);
930 if (rdev
->sb_offset
!= sb_offset
) {
931 printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev
), rdev
->sb_offset
, sb_offset
);
935 * If the disk went offline meanwhile and it's just a spare, then
936 * its size has changed to zero silently, and the MD code does
937 * not yet know that it's faulty.
939 size
= calc_dev_size(dev
, rdev
->mddev
, 1);
940 if (size
!= rdev
->size
) {
941 printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev
), rdev
->size
, size
);
945 printk("(write) %s's sb offset: %ld\n", partition_name(dev
), sb_offset
);
947 set_blocksize(dev
, MD_SB_BYTES
);
948 bh
= getblk(dev
, sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
950 printk(GETBLK_FAILED
, partition_name(dev
));
953 memset(bh
->b_data
,0,bh
->b_size
);
954 sb
= (mdp_super_t
*) bh
->b_data
;
955 memcpy(sb
, rdev
->sb
, MD_SB_BYTES
);
957 mark_buffer_uptodate(bh
, 1);
958 mark_buffer_dirty(bh
);
959 ll_rw_block(WRITE
, 1, &bh
);
968 static void set_this_disk(mddev_t
*mddev
, mdk_rdev_t
*rdev
)
973 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
974 desc
= mddev
->sb
->disks
+ i
;
976 if (disk_faulty(desc
)) {
977 if (MKDEV(desc
->major
,desc
->minor
) == rdev
->dev
)
982 if (MKDEV(desc
->major
,desc
->minor
) == rdev
->dev
) {
983 rdev
->sb
->this_disk
= *desc
;
984 rdev
->desc_nr
= desc
->number
;
995 static int sync_sbs(mddev_t
* mddev
)
999 struct md_list_head
*tmp
;
1001 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1006 set_this_disk(mddev
, rdev
);
1007 sb
->sb_csum
= calc_sb_csum(sb
);
1012 int md_update_sb(mddev_t
* mddev
)
1014 int first
, err
, count
= 100;
1015 struct md_list_head
*tmp
;
1019 mddev
->sb
->utime
= CURRENT_TIME
;
1020 if ((++mddev
->sb
->events_lo
)==0)
1021 ++mddev
->sb
->events_hi
;
1023 if ((mddev
->sb
->events_lo
|mddev
->sb
->events_hi
)==0) {
1025 * oops, this 64-bit counter should never wrap.
1026 * Either we are in around ~1 trillion A.C., assuming
1027 * 1 reboot per second, or we have a bug:
1030 mddev
->sb
->events_lo
= mddev
->sb
->events_hi
= 0xffffffff;
1035 * do not write anything to disk if using
1036 * nonpersistent superblocks
1038 if (mddev
->sb
->not_persistent
)
1041 printk(KERN_INFO
"md: updating md%d RAID superblock on device\n",
1046 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1052 printk("(skipping faulty ");
1053 printk("%s ", partition_name(rdev
->dev
));
1054 if (!rdev
->faulty
) {
1055 printk("[events: %08lx]",
1056 (unsigned long)rdev
->sb
->events_lo
);
1057 err
+= write_disk_sb(rdev
);
1063 printk("errors occured during superblock update, repeating\n");
1066 printk("excessive errors occured during superblock update, exiting\n");
1072 * Import a device. If 'on_disk', then sanity check the superblock
1074 * mark the device faulty if:
1076 * - the device is nonexistent (zero size)
1077 * - the device has no valid superblock
1079 * a faulty rdev _never_ has rdev->sb set.
1081 static int md_import_device (kdev_t newdev
, int on_disk
)
1087 if (find_rdev_all(newdev
))
1090 rdev
= (mdk_rdev_t
*) kmalloc(sizeof(*rdev
), GFP_KERNEL
);
1092 printk("could not alloc mem for %s!\n", partition_name(newdev
));
1095 memset(rdev
, 0, sizeof(*rdev
));
1097 if (get_super(newdev
)) {
1098 printk("md: can not import %s, has active inodes!\n",
1099 partition_name(newdev
));
1104 if ((err
= alloc_disk_sb(rdev
)))
1108 if (lock_rdev(rdev
)) {
1109 printk("md: could not lock %s, zero-size? Marking faulty.\n",
1110 partition_name(newdev
));
1118 if (blk_size
[MAJOR(newdev
)])
1119 size
= blk_size
[MAJOR(newdev
)][MINOR(newdev
)];
1121 printk("md: %s has zero size, marking faulty!\n",
1122 partition_name(newdev
));
1128 if ((err
= read_disk_sb(rdev
))) {
1129 printk("md: could not read %s's sb, not importing!\n",
1130 partition_name(newdev
));
1133 if ((err
= check_disk_sb(rdev
))) {
1134 printk("md: %s has invalid sb, not importing!\n",
1135 partition_name(newdev
));
1139 rdev
->old_dev
= MKDEV(rdev
->sb
->this_disk
.major
,
1140 rdev
->sb
->this_disk
.minor
);
1141 rdev
->desc_nr
= rdev
->sb
->this_disk
.number
;
1143 md_list_add(&rdev
->all
, &all_raid_disks
);
1144 MD_INIT_LIST_HEAD(&rdev
->pending
);
1146 if (rdev
->faulty
&& rdev
->sb
)
1161 * Check a full RAID array for plausibility
1164 #define INCONSISTENT KERN_ERR \
1165 "md: fatal superblock inconsistency in %s -- removing from array\n"
1167 #define OUT_OF_DATE KERN_ERR \
1168 "md: superblock update time inconsistency -- using the most recent one\n"
1170 #define OLD_VERSION KERN_ALERT \
1171 "md: md%d: unsupported raid array version %d.%d.%d\n"
1173 #define NOT_CLEAN_IGNORE KERN_ERR \
1174 "md: md%d: raid array is not clean -- starting background reconstruction\n"
1176 #define UNKNOWN_LEVEL KERN_ERR \
1177 "md: md%d: unsupported raid level %d\n"
1179 static int analyze_sbs (mddev_t
* mddev
)
1181 int out_of_date
= 0, i
;
1182 struct md_list_head
*tmp
, *tmp2
;
1183 mdk_rdev_t
*rdev
, *rdev2
, *freshest
;
1187 * Verify the RAID superblock on each real device
1189 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1198 if (check_disk_sb(rdev
))
1203 * The superblock constant part has to be the same
1204 * for all disks in the array.
1208 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1213 if (!sb_equal(sb
, rdev
->sb
)) {
1214 printk (INCONSISTENT
, partition_name(rdev
->dev
));
1215 kick_rdev_from_array(rdev
);
1221 * OK, we have all disks and the array is ready to run. Let's
1222 * find the freshest superblock, that one will be the superblock
1223 * that represents the whole array.
1226 if (alloc_array_sb(mddev
))
1231 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1234 * if the checksum is invalid, use the superblock
1235 * only as a last resort. (decrease its age by
1238 if (calc_sb_csum(rdev
->sb
) != rdev
->sb
->sb_csum
) {
1239 if (rdev
->sb
->events_lo
|| rdev
->sb
->events_hi
)
1240 if ((rdev
->sb
->events_lo
--)==0)
1241 rdev
->sb
->events_hi
--;
1244 printk("%s's event counter: %08lx\n", partition_name(rdev
->dev
),
1245 (unsigned long)rdev
->sb
->events_lo
);
1251 * Find the newest superblock version
1253 ev1
= md_event(rdev
->sb
);
1254 ev2
= md_event(freshest
->sb
);
1262 printk(OUT_OF_DATE
);
1263 printk("freshest: %s\n", partition_name(freshest
->dev
));
1265 memcpy (sb
, freshest
->sb
, sizeof(*sb
));
1268 * at this point we have picked the 'best' superblock
1269 * from all available superblocks.
1270 * now we validate this superblock and kick out possibly
1273 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1275 * Kick all non-fresh devices faulty
1278 ev1
= md_event(rdev
->sb
);
1282 printk("md: kicking non-fresh %s from array!\n",
1283 partition_name(rdev
->dev
));
1284 kick_rdev_from_array(rdev
);
1290 * Fix up changed device names ... but only if this disk has a
1291 * recent update time. Use faulty checksum ones too.
1293 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1294 __u64 ev1
, ev2
, ev3
;
1295 if (rdev
->faulty
) { /* REMOVEME */
1299 ev1
= md_event(rdev
->sb
);
1303 if ((rdev
->dev
!= rdev
->old_dev
) &&
1304 ((ev1
== ev2
) || (ev1
== ev3
))) {
1307 printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev
->old_dev
), partition_name(rdev
->dev
));
1308 if (rdev
->desc_nr
== -1) {
1312 desc
= &sb
->disks
[rdev
->desc_nr
];
1313 if (rdev
->old_dev
!= MKDEV(desc
->major
, desc
->minor
)) {
1317 desc
->major
= MAJOR(rdev
->dev
);
1318 desc
->minor
= MINOR(rdev
->dev
);
1319 desc
= &rdev
->sb
->this_disk
;
1320 desc
->major
= MAJOR(rdev
->dev
);
1321 desc
->minor
= MINOR(rdev
->dev
);
1326 * Remove unavailable and faulty devices ...
1328 * note that if an array becomes completely unrunnable due to
1329 * missing devices, we do not write the superblock back, so the
1330 * administrator has a chance to fix things up. The removal thus
1331 * only happens if it's nonfatal to the contents of the array.
1333 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
1338 desc
= sb
->disks
+ i
;
1339 dev
= MKDEV(desc
->major
, desc
->minor
);
1342 * We kick faulty devices/descriptors immediately.
1344 if (disk_faulty(desc
)) {
1346 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1347 if (rdev
->desc_nr
!= desc
->number
)
1349 printk("md%d: kicking faulty %s!\n",
1350 mdidx(mddev
),partition_name(rdev
->dev
));
1351 kick_rdev_from_array(rdev
);
1356 if (dev
== MKDEV(0,0))
1358 printk("md%d: removing former faulty %s!\n",
1359 mdidx(mddev
), partition_name(dev
));
1361 remove_descriptor(desc
, sb
);
1365 if (dev
== MKDEV(0,0))
1368 * Is this device present in the rdev ring?
1371 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1372 if (rdev
->desc_nr
== desc
->number
) {
1380 printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev
), partition_name(dev
));
1381 remove_descriptor(desc
, sb
);
1385 * Double check whether all devices mentioned in the
1386 * superblock are in the rdev ring.
1388 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
1392 desc
= sb
->disks
+ i
;
1393 dev
= MKDEV(desc
->major
, desc
->minor
);
1395 if (dev
== MKDEV(0,0))
1398 if (disk_faulty(desc
)) {
1403 rdev
= find_rdev(mddev
, dev
);
1411 * Do a final reality check.
1413 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1414 if (rdev
->desc_nr
== -1) {
1419 * is the desc_nr unique?
1421 ITERATE_RDEV(mddev
,rdev2
,tmp2
) {
1422 if ((rdev2
!= rdev
) &&
1423 (rdev2
->desc_nr
== rdev
->desc_nr
)) {
1429 * is the device unique?
1431 ITERATE_RDEV(mddev
,rdev2
,tmp2
) {
1432 if ((rdev2
!= rdev
) &&
1433 (rdev2
->dev
== rdev
->dev
)) {
1441 * Check if we can support this RAID array
1443 if (sb
->major_version
!= MD_MAJOR_VERSION
||
1444 sb
->minor_version
> MD_MINOR_VERSION
) {
1446 printk (OLD_VERSION
, mdidx(mddev
), sb
->major_version
,
1447 sb
->minor_version
, sb
->patch_version
);
1451 if ((sb
->state
!= (1 << MD_SB_CLEAN
)) && ((sb
->level
== 1) ||
1452 (sb
->level
== 4) || (sb
->level
== 5)))
1453 printk (NOT_CLEAN_IGNORE
, mdidx(mddev
));
1465 static int device_size_calculation (mddev_t
* mddev
)
1467 int data_disks
= 0, persistent
;
1468 unsigned int readahead
;
1469 mdp_super_t
*sb
= mddev
->sb
;
1470 struct md_list_head
*tmp
;
1474 * Do device size calculation. Bail out if too small.
1475 * (we have to do this after having validated chunk_size,
1476 * because device size has to be modulo chunk_size)
1478 persistent
= !mddev
->sb
->not_persistent
;
1479 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1486 rdev
->size
= calc_dev_size(rdev
->dev
, mddev
, persistent
);
1487 if (rdev
->size
< sb
->chunk_size
/ 1024) {
1488 printk (KERN_WARNING
1489 "Dev %s smaller than chunk_size: %ldk < %dk\n",
1490 partition_name(rdev
->dev
),
1491 rdev
->size
, sb
->chunk_size
/ 1024);
1496 switch (sb
->level
) {
1504 zoned_raid_size(mddev
);
1508 zoned_raid_size(mddev
);
1509 data_disks
= sb
->raid_disks
;
1516 data_disks
= sb
->raid_disks
-1;
1519 printk (UNKNOWN_LEVEL
, mdidx(mddev
), sb
->level
);
1522 if (!md_size
[mdidx(mddev
)])
1523 md_size
[mdidx(mddev
)] = sb
->size
* data_disks
;
1525 readahead
= MD_READAHEAD
;
1526 if ((sb
->level
== 0) || (sb
->level
== 4) || (sb
->level
== 5)) {
1527 readahead
= (mddev
->sb
->chunk_size
>>PAGE_SHIFT
) * 4 * data_disks
;
1528 if (readahead
< data_disks
* (MAX_SECTORS
>>(PAGE_SHIFT
-9))*2)
1529 readahead
= data_disks
* (MAX_SECTORS
>>(PAGE_SHIFT
-9))*2;
1531 if (sb
->level
== -3)
1534 md_maxreadahead
[mdidx(mddev
)] = readahead
;
1536 printk(KERN_INFO
"md%d: max total readahead window set to %ldk\n",
1537 mdidx(mddev
), readahead
*(PAGE_SIZE
/1024));
1540 "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1541 mdidx(mddev
), data_disks
, readahead
/data_disks
*(PAGE_SIZE
/1024));
1548 #define TOO_BIG_CHUNKSIZE KERN_ERR \
1549 "too big chunk_size: %d > %d\n"
1551 #define TOO_SMALL_CHUNKSIZE KERN_ERR \
1552 "too small chunk_size: %d < %ld\n"
1554 #define BAD_CHUNKSIZE KERN_ERR \
1555 "no chunksize specified, see 'man raidtab'\n"
1557 static int do_md_run (mddev_t
* mddev
)
1561 struct md_list_head
*tmp
;
1565 if (!mddev
->nb_dev
) {
1574 * Resize disks to align partitions size on a given
1577 md_size
[mdidx(mddev
)] = 0;
1580 * Analyze all RAID superblock(s)
1582 if (analyze_sbs(mddev
)) {
1587 chunk_size
= mddev
->sb
->chunk_size
;
1588 pnum
= level_to_pers(mddev
->sb
->level
);
1590 mddev
->param
.chunk_size
= chunk_size
;
1591 mddev
->param
.personality
= pnum
;
1593 if (chunk_size
> MAX_CHUNK_SIZE
) {
1594 printk(TOO_BIG_CHUNKSIZE
, chunk_size
, MAX_CHUNK_SIZE
);
1598 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1600 if ( (1 << ffz(~chunk_size
)) != chunk_size
) {
1604 if (chunk_size
< PAGE_SIZE
) {
1605 printk(TOO_SMALL_CHUNKSIZE
, chunk_size
, PAGE_SIZE
);
1609 if (pnum
>= MAX_PERSONALITY
) {
1614 if ((pnum
!= RAID1
) && (pnum
!= LINEAR
) && !chunk_size
) {
1616 * 'default chunksize' in the old md code used to
1617 * be PAGE_SIZE, baaad.
1618 * we abort here to be on the safe side. We don't
1619 * want to continue the bad practice.
1621 printk(BAD_CHUNKSIZE
);
1628 char module_name
[80];
1629 sprintf (module_name
, "md-personality-%d", pnum
);
1630 request_module (module_name
);
1636 if (device_size_calculation(mddev
))
1640 * Drop all container device buffers, from now on
1641 * the only valid external interface is through the md
1643 * Also find largest hardsector size
1645 md_hardsect_sizes
[mdidx(mddev
)] = 512;
1646 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1649 fsync_dev(rdev
->dev
);
1650 invalidate_buffers(rdev
->dev
);
1651 if (get_hardsect_size(rdev
->dev
)
1652 > md_hardsect_sizes
[mdidx(mddev
)])
1653 md_hardsect_sizes
[mdidx(mddev
)] =
1654 get_hardsect_size(rdev
->dev
);
1656 md_blocksizes
[mdidx(mddev
)] = 1024;
1657 if (md_blocksizes
[mdidx(mddev
)] < md_hardsect_sizes
[mdidx(mddev
)])
1658 md_blocksizes
[mdidx(mddev
)] = md_hardsect_sizes
[mdidx(mddev
)];
1659 mddev
->pers
= pers
[pnum
];
1661 err
= mddev
->pers
->run(mddev
);
1663 printk("pers->run() failed ...\n");
1668 mddev
->sb
->state
&= ~(1 << MD_SB_CLEAN
);
1669 md_update_sb(mddev
);
1672 * md_size has units of 1K blocks, which are
1673 * twice as large as sectors.
1675 md_hd_struct
[mdidx(mddev
)].start_sect
= 0;
1676 md_hd_struct
[mdidx(mddev
)].nr_sects
= md_size
[mdidx(mddev
)] << 1;
1678 read_ahead
[MD_MAJOR
] = 1024;
1682 #undef TOO_BIG_CHUNKSIZE
1683 #undef BAD_CHUNKSIZE
1685 #define OUT(x) do { err = (x); goto out; } while (0)
1687 static int restart_array (mddev_t
*mddev
)
1692 * Complain if it has no devices
1702 set_device_ro(mddev_to_kdev(mddev
), 0);
1705 "md%d switched to read-write mode.\n", mdidx(mddev
));
1707 * Kick recovery or resync if necessary
1709 md_recover_arrays();
1710 if (mddev
->pers
->restart_resync
)
1711 mddev
->pers
->restart_resync(mddev
);
1719 #define STILL_MOUNTED KERN_WARNING \
1720 "md: md%d still mounted.\n"
1722 static int do_md_stop (mddev_t
* mddev
, int ro
)
1724 int err
= 0, resync_interrupted
= 0;
1725 kdev_t dev
= mddev_to_kdev(mddev
);
1727 if (!ro
&& get_super(dev
)) {
1728 printk (STILL_MOUNTED
, mdidx(mddev
));
1734 * It is safe to call stop here, it only frees private
1735 * data. Also, it tells us if a device is unstoppable
1736 * (eg. resyncing is in progress)
1738 if (mddev
->pers
->stop_resync
)
1739 if (mddev
->pers
->stop_resync(mddev
))
1740 resync_interrupted
= 1;
1742 if (mddev
->recovery_running
)
1743 md_interrupt_thread(md_recovery_thread
);
1746 * This synchronizes with signal delivery to the
1747 * resync or reconstruction thread. It also nicely
1748 * hangs the process if some reconstruction has not
1751 down(&mddev
->recovery_sem
);
1752 up(&mddev
->recovery_sem
);
1755 * sync and invalidate buffers because we cannot kill the
1756 * main thread with valid IO transfers still around.
1757 * the kernel lock protects us from new requests being
1758 * added after invalidate_buffers().
1760 fsync_dev (mddev_to_kdev(mddev
));
1762 invalidate_buffers (dev
);
1770 set_device_ro(dev
, 0);
1771 if (mddev
->pers
->stop(mddev
)) {
1773 set_device_ro(dev
, 1);
1781 * mark it clean only if there was no resync
1784 if (!mddev
->recovery_running
&& !resync_interrupted
) {
1785 printk("marking sb clean...\n");
1786 mddev
->sb
->state
|= 1 << MD_SB_CLEAN
;
1788 md_update_sb(mddev
);
1791 set_device_ro(dev
, 1);
1795 * Free resources if final stop
1798 printk (KERN_INFO
"md%d stopped.\n", mdidx(mddev
));
1803 "md%d switched to read-only mode.\n", mdidx(mddev
));
1811 * We have to safely support old arrays too.
1813 int detect_old_array (mdp_super_t
*sb
)
1815 if (sb
->major_version
> 0)
1817 if (sb
->minor_version
>= 90)
1824 static void autorun_array (mddev_t
*mddev
)
1827 struct md_list_head
*tmp
;
1830 if (mddev
->disks
.prev
== &mddev
->disks
) {
1835 printk("running: ");
1837 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1838 printk("<%s>", partition_name(rdev
->dev
));
1842 err
= do_md_run (mddev
);
1844 printk("do_md_run() returned %d\n", err
);
1846 * prevent the writeback of an unrunnable array
1848 mddev
->sb_dirty
= 0;
1849 do_md_stop (mddev
, 0);
1854 * lets try to run arrays based on all disks that have arrived
1855 * until now. (those are in the ->pending list)
1857 * the method: pick the first pending disk, collect all disks with
1858 * the same UUID, remove all from the pending list and put them into
1859 * the 'same_array' list. Then order this list based on superblock
1860 * update time (freshest comes first), kick out 'old' disks and
1861 * compare superblocks. If everything's fine then run it.
1863 static void autorun_devices (void)
1865 struct md_list_head candidates
;
1866 struct md_list_head
*tmp
;
1867 mdk_rdev_t
*rdev0
, *rdev
;
1872 printk("autorun ...\n");
1873 while (pending_raid_disks
.next
!= &pending_raid_disks
) {
1874 rdev0
= md_list_entry(pending_raid_disks
.next
,
1875 mdk_rdev_t
, pending
);
1877 printk("considering %s ...\n", partition_name(rdev0
->dev
));
1878 MD_INIT_LIST_HEAD(&candidates
);
1879 ITERATE_RDEV_PENDING(rdev
,tmp
) {
1880 if (uuid_equal(rdev0
, rdev
)) {
1881 if (!sb_equal(rdev0
->sb
, rdev
->sb
)) {
1882 printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev
->dev
), partition_name(rdev0
->dev
));
1885 printk(" adding %s ...\n", partition_name(rdev
->dev
));
1886 md_list_del(&rdev
->pending
);
1887 md_list_add(&rdev
->pending
, &candidates
);
1891 * now we have a set of devices, with all of them having
1892 * mostly sane superblocks. It's time to allocate the
1895 md_kdev
= MKDEV(MD_MAJOR
, rdev0
->sb
->md_minor
);
1896 mddev
= kdev_to_mddev(md_kdev
);
1898 printk("md%d already running, cannot run %s\n",
1899 mdidx(mddev
), partition_name(rdev0
->dev
));
1900 ITERATE_RDEV_GENERIC(candidates
,pending
,rdev
,tmp
)
1904 mddev
= alloc_mddev(md_kdev
);
1905 printk("created md%d\n", mdidx(mddev
));
1906 ITERATE_RDEV_GENERIC(candidates
,pending
,rdev
,tmp
) {
1907 bind_rdev_to_array(rdev
, mddev
);
1908 md_list_del(&rdev
->pending
);
1909 MD_INIT_LIST_HEAD(&rdev
->pending
);
1911 autorun_array(mddev
);
1913 printk("... autorun DONE.\n");
1917 * import RAID devices based on one partition
1918 * if possible, the array gets run as well.
1921 #define BAD_VERSION KERN_ERR \
1922 "md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
1924 #define OUT_OF_MEM KERN_ALERT \
1925 "md: out of memory.\n"
1927 #define NO_DEVICE KERN_ERR \
1928 "md: disabled device %s\n"
1930 #define AUTOADD_FAILED KERN_ERR \
1931 "md: auto-adding devices to md%d FAILED (error %d).\n"
1933 #define AUTOADD_FAILED_USED KERN_ERR \
1934 "md: cannot auto-add device %s to md%d, already used.\n"
1936 #define AUTORUN_FAILED KERN_ERR \
1937 "md: auto-running md%d FAILED (error %d).\n"
1939 #define MDDEV_BUSY KERN_ERR \
1940 "md: cannot auto-add to md%d, already running.\n"
1942 #define AUTOADDING KERN_INFO \
1943 "md: auto-adding devices to md%d, based on %s's superblock.\n"
1945 #define AUTORUNNING KERN_INFO \
1946 "md: auto-running md%d.\n"
1948 static int autostart_array (kdev_t startdev
)
1950 int err
= -EINVAL
, i
;
1951 mdp_super_t
*sb
= NULL
;
1952 mdk_rdev_t
*start_rdev
= NULL
, *rdev
;
1954 if (md_import_device(startdev
, 1)) {
1955 printk("could not import %s!\n", partition_name(startdev
));
1959 start_rdev
= find_rdev_all(startdev
);
1964 if (start_rdev
->faulty
) {
1965 printk("can not autostart based on faulty %s!\n",
1966 partition_name(startdev
));
1969 md_list_add(&start_rdev
->pending
, &pending_raid_disks
);
1971 sb
= start_rdev
->sb
;
1973 err
= detect_old_array(sb
);
1975 printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
1979 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
1983 desc
= sb
->disks
+ i
;
1984 dev
= MKDEV(desc
->major
, desc
->minor
);
1986 if (dev
== MKDEV(0,0))
1988 if (dev
== startdev
)
1990 if (md_import_device(dev
, 1)) {
1991 printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev
));
1994 rdev
= find_rdev_all(dev
);
1999 md_list_add(&rdev
->pending
, &pending_raid_disks
);
2003 * possibly return codes
2010 export_rdev(start_rdev
);
2017 #undef AUTOADD_FAILED_USED
2018 #undef AUTOADD_FAILED
2019 #undef AUTORUN_FAILED
2027 } raid_setup_args md__initdata
= { 0, 0 };
2029 void md_setup_drive(void) md__init
;
2032 * Searches all registered partitions for autorun RAID arrays
2035 #ifdef CONFIG_AUTODETECT_RAID
2036 static int detected_devices
[128] md__initdata
= { 0, };
2037 static int dev_cnt
=0;
2038 void md_autodetect_dev(kdev_t dev
)
2040 if (dev_cnt
>= 0 && dev_cnt
< 127)
2041 detected_devices
[dev_cnt
++] = dev
;
2045 int md__init
md_run_setup(void)
2047 #ifdef CONFIG_AUTODETECT_RAID
2051 if (raid_setup_args
.noautodetect
)
2052 printk(KERN_INFO
"skipping autodetection of RAID arrays\n");
2055 printk(KERN_INFO
"autodetecting RAID arrays\n");
2057 for (i
=0; i
<dev_cnt
; i
++) {
2058 kdev_t dev
= detected_devices
[i
];
2060 if (md_import_device(dev
,1)) {
2061 printk(KERN_ALERT
"could not import %s!\n",
2062 partition_name(dev
));
2068 rdev
= find_rdev_all(dev
);
2077 md_list_add(&rdev
->pending
, &pending_raid_disks
);
2083 dev_cnt
= -1; /* make sure further calls to md_autodetect_dev are ignored */
2085 #ifdef CONFIG_MD_BOOT
2091 static int get_version (void * arg
)
2095 ver
.major
= MD_MAJOR_VERSION
;
2096 ver
.minor
= MD_MINOR_VERSION
;
2097 ver
.patchlevel
= MD_PATCHLEVEL_VERSION
;
2099 if (md_copy_to_user(arg
, &ver
, sizeof(ver
)))
2105 #define SET_FROM_SB(x) info.x = mddev->sb->x
2106 static int get_array_info (mddev_t
* mddev
, void * arg
)
2108 mdu_array_info_t info
;
2113 SET_FROM_SB(major_version
);
2114 SET_FROM_SB(minor_version
);
2115 SET_FROM_SB(patch_version
);
2119 SET_FROM_SB(nr_disks
);
2120 SET_FROM_SB(raid_disks
);
2121 SET_FROM_SB(md_minor
);
2122 SET_FROM_SB(not_persistent
);
2126 SET_FROM_SB(active_disks
);
2127 SET_FROM_SB(working_disks
);
2128 SET_FROM_SB(failed_disks
);
2129 SET_FROM_SB(spare_disks
);
2131 SET_FROM_SB(layout
);
2132 SET_FROM_SB(chunk_size
);
2134 if (md_copy_to_user(arg
, &info
, sizeof(info
)))
2141 #define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2142 static int get_disk_info (mddev_t
* mddev
, void * arg
)
2144 mdu_disk_info_t info
;
2150 if (md_copy_from_user(&info
, arg
, sizeof(info
)))
2154 if (nr
>= mddev
->sb
->nr_disks
)
2159 SET_FROM_SB(raid_disk
);
2162 if (md_copy_to_user(arg
, &info
, sizeof(info
)))
2169 #define SET_SB(x) mddev->sb->disks[nr].x = info->x
2171 static int add_new_disk (mddev_t
* mddev
, mdu_disk_info_t
*info
)
2173 int err
, size
, persistent
;
2177 dev
= MKDEV(info
->major
,info
->minor
);
2179 if (find_rdev_all(dev
)) {
2180 printk("device %s already used in a RAID array!\n",
2181 partition_name(dev
));
2185 /* expecting a device which has a superblock */
2186 err
= md_import_device(dev
, 1);
2188 printk("md error, md_import_device returned %d\n", err
);
2191 rdev
= find_rdev_all(dev
);
2196 if (mddev
->nb_dev
) {
2197 mdk_rdev_t
*rdev0
= md_list_entry(mddev
->disks
.next
,
2198 mdk_rdev_t
, same_set
);
2199 if (!uuid_equal(rdev0
, rdev
)) {
2200 printk("md: %s has different UUID to %s\n", partition_name(rdev
->dev
), partition_name(rdev0
->dev
));
2204 if (!sb_equal(rdev0
->sb
, rdev
->sb
)) {
2205 printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev
->dev
), partition_name(rdev0
->dev
));
2210 bind_rdev_to_array(rdev
, mddev
);
2215 if (nr
>= mddev
->sb
->nr_disks
)
2224 if ((info
->state
& (1<<MD_DISK_FAULTY
))==0) {
2225 err
= md_import_device (dev
, 0);
2227 printk("md: error, md_import_device() returned %d\n", err
);
2230 rdev
= find_rdev_all(dev
);
2236 rdev
->old_dev
= dev
;
2237 rdev
->desc_nr
= info
->number
;
2239 bind_rdev_to_array(rdev
, mddev
);
2241 persistent
= !mddev
->sb
->not_persistent
;
2243 printk("nonpersistent superblock ...\n");
2244 if (!mddev
->sb
->chunk_size
)
2245 printk("no chunksize?\n");
2247 size
= calc_dev_size(dev
, mddev
, persistent
);
2248 rdev
->sb_offset
= calc_dev_sboffset(dev
, mddev
, persistent
);
2250 if (!mddev
->sb
->size
|| (mddev
->sb
->size
> size
))
2251 mddev
->sb
->size
= size
;
2255 * sync all other superblocks with the main superblock
2263 static int hot_remove_disk (mddev_t
* mddev
, kdev_t dev
)
2272 printk("trying to remove %s from md%d ... \n",
2273 partition_name(dev
), mdidx(mddev
));
2275 if (!mddev
->pers
->diskop
) {
2276 printk("md%d: personality does not support diskops!\n",
2281 rdev
= find_rdev(mddev
, dev
);
2285 if (rdev
->desc_nr
== -1) {
2289 disk
= &mddev
->sb
->disks
[rdev
->desc_nr
];
2290 if (disk_active(disk
))
2292 if (disk_removed(disk
)) {
2297 err
= mddev
->pers
->diskop(mddev
, &disk
, DISKOP_HOT_REMOVE_DISK
);
2305 remove_descriptor(disk
, mddev
->sb
);
2306 kick_rdev_from_array(rdev
);
2307 mddev
->sb_dirty
= 1;
2308 md_update_sb(mddev
);
2312 printk("cannot remove active disk %s from md%d ... \n",
2313 partition_name(dev
), mdidx(mddev
));
2317 static int hot_add_disk (mddev_t
* mddev
, kdev_t dev
)
2319 int i
, err
, persistent
;
2327 printk("trying to hot-add %s to md%d ... \n",
2328 partition_name(dev
), mdidx(mddev
));
2330 if (!mddev
->pers
->diskop
) {
2331 printk("md%d: personality does not support diskops!\n",
2336 persistent
= !mddev
->sb
->not_persistent
;
2337 size
= calc_dev_size(dev
, mddev
, persistent
);
2339 if (size
< mddev
->sb
->size
) {
2340 printk("md%d: disk size %d blocks < array size %d\n",
2341 mdidx(mddev
), size
, mddev
->sb
->size
);
2345 rdev
= find_rdev(mddev
, dev
);
2349 err
= md_import_device (dev
, 0);
2351 printk("md: error, md_import_device() returned %d\n", err
);
2354 rdev
= find_rdev_all(dev
);
2360 printk("md: can not hot-add faulty %s disk to md%d!\n",
2361 partition_name(dev
), mdidx(mddev
));
2365 bind_rdev_to_array(rdev
, mddev
);
2368 * The rest should better be atomic, we can have disk failures
2369 * noticed in interrupt contexts ...
2371 rdev
->old_dev
= dev
;
2373 rdev
->sb_offset
= calc_dev_sboffset(dev
, mddev
, persistent
);
2375 disk
= mddev
->sb
->disks
+ mddev
->sb
->raid_disks
;
2376 for (i
= mddev
->sb
->raid_disks
; i
< MD_SB_DISKS
; i
++) {
2377 disk
= mddev
->sb
->disks
+ i
;
2379 if (!disk
->major
&& !disk
->minor
)
2381 if (disk_removed(disk
))
2384 if (i
== MD_SB_DISKS
) {
2385 printk("md%d: can not hot-add to full array!\n", mdidx(mddev
));
2387 goto abort_unbind_export
;
2390 if (disk_removed(disk
)) {
2394 if (disk
->number
!= i
) {
2397 goto abort_unbind_export
;
2403 disk
->raid_disk
= disk
->number
;
2404 disk
->major
= MAJOR(dev
);
2405 disk
->minor
= MINOR(dev
);
2407 if (mddev
->pers
->diskop(mddev
, &disk
, DISKOP_HOT_ADD_DISK
)) {
2410 goto abort_unbind_export
;
2413 mark_disk_spare(disk
);
2414 mddev
->sb
->nr_disks
++;
2415 mddev
->sb
->spare_disks
++;
2416 mddev
->sb
->working_disks
++;
2418 mddev
->sb_dirty
= 1;
2420 md_update_sb(mddev
);
2423 * Kick recovery, maybe this spare has to be added to the
2424 * array immediately.
2426 md_recover_arrays();
2430 abort_unbind_export
:
2431 unbind_rdev_from_array(rdev
);
2438 #define SET_SB(x) mddev->sb->x = info->x
2439 static int set_array_info (mddev_t
* mddev
, mdu_array_info_t
*info
)
2442 if (alloc_array_sb(mddev
))
2445 mddev
->sb
->major_version
= MD_MAJOR_VERSION
;
2446 mddev
->sb
->minor_version
= MD_MINOR_VERSION
;
2447 mddev
->sb
->patch_version
= MD_PATCHLEVEL_VERSION
;
2448 mddev
->sb
->ctime
= CURRENT_TIME
;
2455 SET_SB(not_persistent
);
2458 SET_SB(active_disks
);
2459 SET_SB(working_disks
);
2460 SET_SB(failed_disks
);
2461 SET_SB(spare_disks
);
2466 mddev
->sb
->md_magic
= MD_SB_MAGIC
;
2469 * Generate a 128 bit UUID
2471 get_random_bytes(&mddev
->sb
->set_uuid0
, 4);
2472 get_random_bytes(&mddev
->sb
->set_uuid1
, 4);
2473 get_random_bytes(&mddev
->sb
->set_uuid2
, 4);
2474 get_random_bytes(&mddev
->sb
->set_uuid3
, 4);
2480 static int set_disk_info (mddev_t
* mddev
, void * arg
)
2486 static int clear_array (mddev_t
* mddev
)
2492 static int write_raid_info (mddev_t
* mddev
)
2498 static int protect_array (mddev_t
* mddev
)
2504 static int unprotect_array (mddev_t
* mddev
)
2510 static int set_disk_faulty (mddev_t
*mddev
, kdev_t dev
)
2514 fsync_dev(mddev_to_kdev(mddev
));
2515 ret
= md_error(mddev_to_kdev(mddev
), dev
);
2519 static int md_ioctl (struct inode
*inode
, struct file
*file
,
2520 unsigned int cmd
, unsigned long arg
)
2524 struct hd_geometry
*loc
= (struct hd_geometry
*) arg
;
2525 mddev_t
*mddev
= NULL
;
2528 if (!md_capable_admin())
2531 dev
= inode
->i_rdev
;
2533 if (minor
>= MAX_MD_DEVS
)
2537 * Commands dealing with the RAID driver but not any
2543 err
= get_version((void *)arg
);
2546 case PRINT_RAID_DEBUG
:
2551 case BLKGETSIZE
: /* Return device size */
2556 err
= md_put_user(md_hd_struct
[minor
].nr_sects
,
2562 invalidate_buffers(dev
);
2570 read_ahead
[MAJOR(dev
)] = arg
;
2578 err
= md_put_user (read_ahead
[
2579 MAJOR(dev
)], (long *) arg
);
2585 * Commands creating/starting a new array:
2588 mddev
= kdev_to_mddev(dev
);
2592 case SET_ARRAY_INFO
:
2595 printk("array md%d already exists!\n",
2604 case SET_ARRAY_INFO
:
2605 mddev
= alloc_mddev(dev
);
2611 * alloc_mddev() should possibly self-lock.
2613 err
= lock_mddev(mddev
);
2615 printk("ioctl, reason %d, cmd %d\n", err
, cmd
);
2620 printk("array md%d already has a superblock!\n",
2626 mdu_array_info_t info
;
2627 if (md_copy_from_user(&info
, (void*)arg
, sizeof(info
))) {
2631 err
= set_array_info(mddev
, &info
);
2633 printk("couldnt set array info. %d\n", err
);
2641 * possibly make it lock the array ...
2643 err
= autostart_array((kdev_t
)arg
);
2645 printk("autostart %s failed!\n",
2646 partition_name((kdev_t
)arg
));
2655 * Commands querying/configuring an existing array:
2662 err
= lock_mddev(mddev
);
2664 printk("ioctl lock interrupted, reason %d, cmd %d\n",err
, cmd
);
2667 /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2668 if (!mddev
->sb
&& cmd
!= ADD_NEW_DISK
&& cmd
!= STOP_ARRAY
&& cmd
!= RUN_ARRAY
) {
2674 * Commands even a read-only array can execute:
2678 case GET_ARRAY_INFO
:
2679 err
= get_array_info(mddev
, (void *)arg
);
2683 err
= get_disk_info(mddev
, (void *)arg
);
2686 case RESTART_ARRAY_RW
:
2687 err
= restart_array(mddev
);
2691 if (!(err
= do_md_stop (mddev
, 0)))
2696 err
= do_md_stop (mddev
, 1);
2700 * We have a problem here : there is no easy way to give a CHS
2701 * virtual geometry. We currently pretend that we have a 2 heads
2702 * 4 sectors (with a BIG number of cylinders...). This drives
2703 * dosfs just mad... ;-)
2710 err
= md_put_user (2, (char *) &loc
->heads
);
2713 err
= md_put_user (4, (char *) &loc
->sectors
);
2716 err
= md_put_user (md_hd_struct
[mdidx(mddev
)].nr_sects
/8,
2717 (short *) &loc
->cylinders
);
2720 err
= md_put_user (md_hd_struct
[minor
].start_sect
,
2721 (long *) &loc
->start
);
2726 * The remaining ioctls are changing the state of the
2727 * superblock, so we do not allow read-only arrays
2738 err
= clear_array(mddev
);
2743 mdu_disk_info_t info
;
2744 if (md_copy_from_user(&info
, (void*)arg
, sizeof(info
)))
2747 err
= add_new_disk(mddev
, &info
);
2750 case HOT_REMOVE_DISK
:
2751 err
= hot_remove_disk(mddev
, (kdev_t
)arg
);
2755 err
= hot_add_disk(mddev
, (kdev_t
)arg
);
2759 err
= set_disk_info(mddev
, (void *)arg
);
2762 case WRITE_RAID_INFO
:
2763 err
= write_raid_info(mddev
);
2766 case UNPROTECT_ARRAY
:
2767 err
= unprotect_array(mddev
);
2771 err
= protect_array(mddev
);
2774 case SET_DISK_FAULTY
:
2775 err
= set_disk_faulty(mddev
, (kdev_t
)arg
);
2780 /* The data is never used....
2782 err = md_copy_from_user(¶m, (mdu_param_t *)arg,
2787 err
= do_md_run (mddev
);
2789 * we have to clean up the mess if
2790 * the array cannot be run for some
2794 mddev
->sb_dirty
= 0;
2795 if (!do_md_stop (mddev
, 0))
2802 printk(KERN_WARNING
"%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current
->comm
, current
->pid
);
2810 unlock_mddev(mddev
);
2820 static int md_open (struct inode
*inode
, struct file
*file
)
2828 static struct block_device_operations md_fops
=
2835 int md_thread(void * arg
)
2837 mdk_thread_t
*thread
= arg
;
2847 sprintf(current
->comm
, thread
->name
);
2850 thread
->tsk
= current
;
2853 * md_thread is a 'system-thread', it's priority should be very
2854 * high. We avoid resource deadlocks individually in each
2855 * raid personality. (RAID5 does preallocation) We also use RR and
2856 * the very same RT priority as kswapd, thus we will never get
2857 * into a priority inversion deadlock.
2859 * we definitely have to have equal or higher priority than
2860 * bdflush, otherwise bdflush will deadlock if there are too
2861 * many dirty RAID5 blocks.
2863 current
->policy
= SCHED_OTHER
;
2864 current
->nice
= -20;
2865 // md_unlock_kernel();
2870 DECLARE_WAITQUEUE(wait
, current
);
2872 add_wait_queue(&thread
->wqueue
, &wait
);
2873 set_task_state(current
, TASK_INTERRUPTIBLE
);
2874 if (!test_bit(THREAD_WAKEUP
, &thread
->flags
)) {
2875 dprintk("thread %p went to sleep.\n", thread
);
2877 dprintk("thread %p woke up.\n", thread
);
2879 current
->state
= TASK_RUNNING
;
2880 remove_wait_queue(&thread
->wqueue
, &wait
);
2881 clear_bit(THREAD_WAKEUP
, &thread
->flags
);
2884 thread
->run(thread
->data
);
2885 run_task_queue(&tq_disk
);
2888 if (md_signal_pending(current
)) {
2889 printk("%8s(%d) flushing signals.\n", current
->comm
,
2898 void md_wakeup_thread(mdk_thread_t
*thread
)
2900 dprintk("waking up MD thread %p.\n", thread
);
2901 set_bit(THREAD_WAKEUP
, &thread
->flags
);
2902 wake_up(&thread
->wqueue
);
2905 mdk_thread_t
*md_register_thread (void (*run
) (void *),
2906 void *data
, const char *name
)
2908 mdk_thread_t
*thread
;
2910 DECLARE_MUTEX_LOCKED(sem
);
2912 thread
= (mdk_thread_t
*) kmalloc
2913 (sizeof(mdk_thread_t
), GFP_KERNEL
);
2917 memset(thread
, 0, sizeof(mdk_thread_t
));
2918 md_init_waitqueue_head(&thread
->wqueue
);
2922 thread
->data
= data
;
2923 thread
->name
= name
;
2924 ret
= kernel_thread(md_thread
, thread
, 0);
2933 void md_interrupt_thread (mdk_thread_t
*thread
)
2939 printk("interrupting MD-thread pid %d\n", thread
->tsk
->pid
);
2940 send_sig(SIGKILL
, thread
->tsk
, 1);
2943 void md_unregister_thread (mdk_thread_t
*thread
)
2945 DECLARE_MUTEX_LOCKED(sem
);
2949 thread
->name
= NULL
;
2954 md_interrupt_thread(thread
);
2958 void md_recover_arrays (void)
2960 if (!md_recovery_thread
) {
2964 md_wakeup_thread(md_recovery_thread
);
2968 int md_error (kdev_t dev
, kdev_t rdev
)
2974 mddev
= kdev_to_mddev(dev
);
2975 /* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
2981 rrdev
= find_rdev(mddev
, rdev
);
2982 mark_rdev_faulty(rrdev
);
2984 * if recovery was running, stop it now.
2986 if (mddev
->pers
->stop_resync
)
2987 mddev
->pers
->stop_resync(mddev
);
2988 if (mddev
->recovery_running
)
2989 md_interrupt_thread(md_recovery_thread
);
2990 if (mddev
->pers
->error_handler
) {
2991 rc
= mddev
->pers
->error_handler(mddev
, rdev
);
2992 md_recover_arrays();
2998 static int status_unused (char * page
)
3002 struct md_list_head
*tmp
;
3004 sz
+= sprintf(page
+ sz
, "unused devices: ");
3006 ITERATE_RDEV_ALL(rdev
,tmp
) {
3007 if (!rdev
->same_set
.next
&& !rdev
->same_set
.prev
) {
3009 * The device is not yet used by any array.
3012 sz
+= sprintf(page
+ sz
, "%s ",
3013 partition_name(rdev
->dev
));
3017 sz
+= sprintf(page
+ sz
, "<none>");
3019 sz
+= sprintf(page
+ sz
, "\n");
3024 static int status_resync (char * page
, mddev_t
* mddev
)
3027 unsigned long max_blocks
, resync
, res
, dt
, db
, rt
;
3029 resync
= mddev
->curr_resync
- atomic_read(&mddev
->recovery_active
);
3030 max_blocks
= mddev
->sb
->size
;
3033 * Should not happen.
3039 res
= (resync
/1024)*1000/(max_blocks
/1024 + 1);
3041 int i
, x
= res
/50, y
= 20-x
;
3042 sz
+= sprintf(page
+ sz
, "[");
3043 for (i
= 0; i
< x
; i
++)
3044 sz
+= sprintf(page
+ sz
, "=");
3045 sz
+= sprintf(page
+ sz
, ">");
3046 for (i
= 0; i
< y
; i
++)
3047 sz
+= sprintf(page
+ sz
, ".");
3048 sz
+= sprintf(page
+ sz
, "] ");
3050 if (!mddev
->recovery_running
)
3054 sz
+= sprintf(page
+ sz
, " resync =%3lu.%lu%% (%lu/%lu)",
3055 res
/10, res
% 10, resync
, max_blocks
);
3060 sz
+= sprintf(page
+ sz
, " recovery =%3lu.%lu%% (%lu/%lu)",
3061 res
/10, res
% 10, resync
, max_blocks
);
3064 * We do not want to overflow, so the order of operands and
3065 * the * 100 / 100 trick are important. We do a +1 to be
3066 * safe against division by zero. We only estimate anyway.
3068 * dt: time from mark until now
3069 * db: blocks written from mark until now
3070 * rt: remaining time
3072 dt
= ((jiffies
- mddev
->resync_mark
) / HZ
);
3074 db
= resync
- mddev
->resync_mark_cnt
;
3075 rt
= (dt
* ((max_blocks
-resync
) / (db
/100+1)))/100;
3077 sz
+= sprintf(page
+ sz
, " finish=%lu.%lumin", rt
/ 60, (rt
% 60)/6);
3079 sz
+= sprintf(page
+ sz
, " speed=%ldK/sec", db
/dt
);
3084 static int md_status_read_proc(char *page
, char **start
, off_t off
,
3085 int count
, int *eof
, void *data
)
3087 int sz
= 0, j
, size
;
3088 struct md_list_head
*tmp
, *tmp2
;
3092 sz
+= sprintf(page
+ sz
, "Personalities : ");
3093 for (j
= 0; j
< MAX_PERSONALITY
; j
++)
3095 sz
+= sprintf(page
+sz
, "[%s] ", pers
[j
]->name
);
3097 sz
+= sprintf(page
+sz
, "\n");
3100 sz
+= sprintf(page
+sz
, "read_ahead ");
3101 if (read_ahead
[MD_MAJOR
] == INT_MAX
)
3102 sz
+= sprintf(page
+sz
, "not set\n");
3104 sz
+= sprintf(page
+sz
, "%d sectors\n", read_ahead
[MD_MAJOR
]);
3106 ITERATE_MDDEV(mddev
,tmp
) {
3107 sz
+= sprintf(page
+ sz
, "md%d : %sactive", mdidx(mddev
),
3108 mddev
->pers
? "" : "in");
3111 sz
+= sprintf(page
+ sz
, " (read-only)");
3112 sz
+= sprintf(page
+ sz
, " %s", mddev
->pers
->name
);
3116 ITERATE_RDEV(mddev
,rdev
,tmp2
) {
3117 sz
+= sprintf(page
+ sz
, " %s[%d]",
3118 partition_name(rdev
->dev
), rdev
->desc_nr
);
3120 sz
+= sprintf(page
+ sz
, "(F)");
3126 if (mddev
->nb_dev
) {
3128 sz
+= sprintf(page
+ sz
, "\n %d blocks",
3129 md_size
[mdidx(mddev
)]);
3131 sz
+= sprintf(page
+ sz
, "\n %d blocks", size
);
3135 sz
+= sprintf(page
+sz
, "\n");
3139 sz
+= mddev
->pers
->status (page
+sz
, mddev
);
3141 sz
+= sprintf(page
+sz
, "\n ");
3142 if (mddev
->curr_resync
) {
3143 sz
+= status_resync (page
+sz
, mddev
);
3145 if (md_atomic_read(&mddev
->resync_sem
.count
) != 1)
3146 sz
+= sprintf(page
+ sz
, " resync=DELAYED");
3148 sz
+= sprintf(page
+ sz
, "\n");
3150 sz
+= status_unused (page
+ sz
);
3155 int register_md_personality (int pnum
, mdk_personality_t
*p
)
3157 if (pnum
>= MAX_PERSONALITY
)
3164 printk(KERN_INFO
"%s personality registered\n", p
->name
);
3168 int unregister_md_personality (int pnum
)
3170 if (pnum
>= MAX_PERSONALITY
)
3173 printk(KERN_INFO
"%s personality unregistered\n", pers
[pnum
]->name
);
3178 static mdp_disk_t
*get_spare(mddev_t
*mddev
)
3180 mdp_super_t
*sb
= mddev
->sb
;
3183 struct md_list_head
*tmp
;
3185 ITERATE_RDEV(mddev
,rdev
,tmp
) {
3192 disk
= &sb
->disks
[rdev
->desc_nr
];
3193 if (disk_faulty(disk
)) {
3197 if (disk_active(disk
))
3204 static unsigned int sync_io
[DK_MAX_MAJOR
][DK_MAX_DISK
];
3205 void md_sync_acct(kdev_t dev
, unsigned long nr_sectors
)
3207 unsigned int major
= MAJOR(dev
);
3210 index
= disk_index(dev
);
3211 if ((index
>= DK_MAX_DISK
) || (major
>= DK_MAX_MAJOR
))
3214 sync_io
[major
][index
] += nr_sectors
;
3217 static int is_mddev_idle (mddev_t
*mddev
)
3220 struct md_list_head
*tmp
;
3222 unsigned long curr_events
;
3225 ITERATE_RDEV(mddev
,rdev
,tmp
) {
3226 int major
= MAJOR(rdev
->dev
);
3227 int idx
= disk_index(rdev
->dev
);
3229 if ((idx
>= DK_MAX_DISK
) || (major
>= DK_MAX_MAJOR
))
3232 curr_events
= kstat
.dk_drive_rblk
[major
][idx
] +
3233 kstat
.dk_drive_wblk
[major
][idx
] ;
3234 curr_events
-= sync_io
[major
][idx
];
3235 // printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3236 if (curr_events
!= rdev
->last_events
) {
3237 // printk("!I(%ld)", curr_events - rdev->last_events);
3238 rdev
->last_events
= curr_events
;
3245 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait
);
3247 void md_done_sync(mddev_t
*mddev
, int blocks
, int ok
)
3249 /* another "blocks" (1K) blocks have been synced */
3250 atomic_sub(blocks
, &mddev
->recovery_active
);
3251 wake_up(&mddev
->recovery_wait
);
3253 // stop recovery, signal do_sync ....
3257 #define SYNC_MARKS 10
3258 #define SYNC_MARK_STEP (3*HZ)
3259 int md_do_sync(mddev_t
*mddev
, mdp_disk_t
*spare
)
3262 unsigned int max_blocks
, currspeed
,
3263 j
, window
, err
, serialize
;
3264 kdev_t read_disk
= mddev_to_kdev(mddev
);
3265 unsigned long mark
[SYNC_MARKS
];
3266 unsigned long mark_cnt
[SYNC_MARKS
];
3268 struct md_list_head
*tmp
;
3269 unsigned long last_check
;
3272 err
= down_interruptible(&mddev
->resync_sem
);
3278 ITERATE_MDDEV(mddev2
,tmp
) {
3279 if (mddev2
== mddev
)
3281 if (mddev2
->curr_resync
&& match_mddev_units(mddev
,mddev2
)) {
3282 printk(KERN_INFO
"md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev
), mdidx(mddev2
));
3288 interruptible_sleep_on(&resync_wait
);
3289 if (md_signal_pending(current
)) {
3297 mddev
->curr_resync
= 1;
3299 max_blocks
= mddev
->sb
->size
;
3301 printk(KERN_INFO
"md: syncing RAID array md%d\n", mdidx(mddev
));
3302 printk(KERN_INFO
"md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3303 sysctl_speed_limit_min
);
3304 printk(KERN_INFO
"md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max
);
3307 * Resync has low priority.
3311 is_mddev_idle(mddev
); /* this also initializes IO event counters */
3312 for (m
= 0; m
< SYNC_MARKS
; m
++) {
3317 mddev
->resync_mark
= mark
[last_mark
];
3318 mddev
->resync_mark_cnt
= mark_cnt
[last_mark
];
3321 * Tune reconstruction:
3323 window
= MAX_READAHEAD
*(PAGE_SIZE
/1024);
3324 printk(KERN_INFO
"md: using %dk window, over a total of %d blocks.\n",window
,max_blocks
);
3326 atomic_set(&mddev
->recovery_active
, 0);
3327 init_waitqueue_head(&mddev
->recovery_wait
);
3329 for (j
= 0; j
< max_blocks
;) {
3332 blocks
= mddev
->pers
->sync_request(mddev
, j
);
3338 atomic_add(blocks
, &mddev
->recovery_active
);
3340 mddev
->curr_resync
= j
;
3342 if (last_check
+ window
> j
)
3345 run_task_queue(&tq_disk
); //??
3347 if (jiffies
>= mark
[last_mark
] + SYNC_MARK_STEP
) {
3349 int next
= (last_mark
+1) % SYNC_MARKS
;
3351 mddev
->resync_mark
= mark
[next
];
3352 mddev
->resync_mark_cnt
= mark_cnt
[next
];
3353 mark
[next
] = jiffies
;
3354 mark_cnt
[next
] = j
- atomic_read(&mddev
->recovery_active
);
3359 if (md_signal_pending(current
)) {
3361 * got a signal, exit.
3363 mddev
->curr_resync
= 0;
3364 printk("md_do_sync() got signal ... exiting\n");
3371 * this loop exits only if either when we are slower than
3372 * the 'hard' speed limit, or the system was IO-idle for
3374 * the system might be non-idle CPU-wise, but we only care
3375 * about not overloading the IO subsystem. (things like an
3376 * e2fsck being done on the RAID array should execute fast)
3379 if (md_need_resched(current
))
3382 currspeed
= (j
-mddev
->resync_mark_cnt
)/((jiffies
-mddev
->resync_mark
)/HZ
+1) +1;
3384 if (currspeed
> sysctl_speed_limit_min
) {
3387 if ((currspeed
> sysctl_speed_limit_max
) ||
3388 !is_mddev_idle(mddev
)) {
3389 current
->state
= TASK_INTERRUPTIBLE
;
3390 md_schedule_timeout(HZ
/4);
3391 if (!md_signal_pending(current
))
3395 current
->nice
= -20;
3397 fsync_dev(read_disk
);
3398 printk(KERN_INFO
"md: md%d: sync done.\n",mdidx(mddev
));
3401 * this also signals 'finished resyncing' to md_stop
3404 wait_event(mddev
->recovery_wait
, atomic_read(&mddev
->recovery_active
)==0);
3405 up(&mddev
->resync_sem
);
3407 mddev
->curr_resync
= 0;
3408 wake_up(&resync_wait
);
3414 * This is a kernel thread which syncs a spare disk with the active array
3416 * the amount of foolproofing might seem to be a tad excessive, but an
3417 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3418 * of my root partition with the first 0.5 gigs of my /home partition ... so
3419 * i'm a bit nervous ;)
3421 void md_do_recovery (void *data
)
3427 struct md_list_head
*tmp
;
3429 printk(KERN_INFO
"md: recovery thread got woken up ...\n");
3431 ITERATE_MDDEV(mddev
,tmp
) {
3435 if (mddev
->recovery_running
)
3437 if (sb
->active_disks
== sb
->raid_disks
)
3439 if (!sb
->spare_disks
) {
3440 printk(KERN_ERR
"md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev
));
3444 * now here we get the spare and resync it.
3446 if ((spare
= get_spare(mddev
)) == NULL
)
3448 printk(KERN_INFO
"md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev
), partition_name(MKDEV(spare
->major
,spare
->minor
)));
3449 if (!mddev
->pers
->diskop
)
3451 if (mddev
->pers
->diskop(mddev
, &spare
, DISKOP_SPARE_WRITE
))
3453 down(&mddev
->recovery_sem
);
3454 mddev
->recovery_running
= 1;
3455 err
= md_do_sync(mddev
, spare
);
3457 printk(KERN_INFO
"md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev
), partition_name(MKDEV(spare
->major
,spare
->minor
)));
3458 if (!disk_faulty(spare
)) {
3459 mddev
->pers
->diskop(mddev
,&spare
,DISKOP_SPARE_INACTIVE
);
3460 mark_disk_faulty(spare
);
3461 mark_disk_nonsync(spare
);
3462 mark_disk_inactive(spare
);
3464 sb
->working_disks
--;
3468 if (disk_faulty(spare
))
3469 mddev
->pers
->diskop(mddev
, &spare
,
3470 DISKOP_SPARE_INACTIVE
);
3471 if (err
== -EINTR
|| err
== -ENOMEM
) {
3473 * Recovery got interrupted, or ran out of mem ...
3474 * signal back that we have finished using the array.
3476 mddev
->pers
->diskop(mddev
, &spare
,
3477 DISKOP_SPARE_INACTIVE
);
3478 up(&mddev
->recovery_sem
);
3479 mddev
->recovery_running
= 0;
3482 mddev
->recovery_running
= 0;
3483 up(&mddev
->recovery_sem
);
3485 if (!disk_faulty(spare
)) {
3487 * the SPARE_ACTIVE diskop possibly changes the
3490 mddev
->pers
->diskop(mddev
, &spare
, DISKOP_SPARE_ACTIVE
);
3491 mark_disk_sync(spare
);
3492 mark_disk_active(spare
);
3496 mddev
->sb_dirty
= 1;
3497 md_update_sb(mddev
);
3500 printk(KERN_INFO
"md: recovery thread finished ...\n");
3504 int md_notify_reboot(struct notifier_block
*this,
3505 unsigned long code
, void *x
)
3507 struct md_list_head
*tmp
;
3510 if ((code
== MD_SYS_DOWN
) || (code
== MD_SYS_HALT
)
3511 || (code
== MD_SYS_POWER_OFF
)) {
3513 printk(KERN_INFO
"stopping all md devices.\n");
3515 ITERATE_MDDEV(mddev
,tmp
)
3516 do_md_stop (mddev
, 1);
3518 * certain more exotic SCSI devices are known to be
3519 * volatile wrt too early system reboots. While the
3520 * right place to handle this issue is the given
3521 * driver, we do want to have a safe RAID driver ...
3528 struct notifier_block md_notifier
= {
3534 static int md__init
raid_setup(char *str
)
3538 len
= strlen(str
) + 1;
3542 char *comma
= strchr(str
+pos
, ',');
3545 wlen
= (comma
-str
)-pos
;
3546 else wlen
= (len
-1)-pos
;
3548 if (strncmp(str
, "noautodetect", wlen
) == 0)
3549 raid_setup_args
.noautodetect
= 1;
3552 raid_setup_args
.set
= 1;
3555 __setup("raid=", raid_setup
);
3557 static void md_geninit (void)
3561 for(i
= 0; i
< MAX_MD_DEVS
; i
++) {
3562 md_blocksizes
[i
] = 1024;
3564 md_hardsect_sizes
[i
] = 512;
3565 md_maxreadahead
[i
] = MD_READAHEAD
;
3566 register_disk(&md_gendisk
, MKDEV(MAJOR_NR
,i
), 1, &md_fops
, 0);
3568 blksize_size
[MAJOR_NR
] = md_blocksizes
;
3569 blk_size
[MAJOR_NR
] = md_size
;
3570 max_readahead
[MAJOR_NR
] = md_maxreadahead
;
3571 hardsect_size
[MAJOR_NR
] = md_hardsect_sizes
;
3573 printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t
));
3575 #ifdef CONFIG_PROC_FS
3576 create_proc_read_entry("mdstat", 0, NULL
, md_status_read_proc
, NULL
);
3579 void hsm_init (void);
3580 void translucent_init (void);
3581 void linear_init (void);
3582 void raid0_init (void);
3583 void raid1_init (void);
3584 void raid5_init (void);
3586 int md__init
md_init (void)
3588 static char * name
= "mdrecoveryd";
3590 printk (KERN_INFO
"md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
3591 MD_MAJOR_VERSION
, MD_MINOR_VERSION
,
3592 MD_PATCHLEVEL_VERSION
, MAX_MD_DEVS
, MAX_REAL
);
3594 if (devfs_register_blkdev (MAJOR_NR
, "md", &md_fops
))
3596 printk (KERN_ALERT
"Unable to get major %d for md\n", MAJOR_NR
);
3599 devfs_handle
= devfs_mk_dir (NULL
, "md", NULL
);
3600 devfs_register_series (devfs_handle
, "%u",MAX_MD_DEVS
,DEVFS_FL_DEFAULT
,
3601 MAJOR_NR
, 0, S_IFBLK
| S_IRUSR
| S_IWUSR
,
3604 /* forward all md request to md_make_request */
3605 blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR
), md_make_request
);
3608 read_ahead
[MAJOR_NR
] = INT_MAX
;
3609 md_gendisk
.next
= gendisk_head
;
3611 gendisk_head
= &md_gendisk
;
3613 md_recovery_thread
= md_register_thread(md_do_recovery
, NULL
, name
);
3614 if (!md_recovery_thread
)
3615 printk(KERN_ALERT
"bug: couldn't allocate md_recovery_thread\n");
3617 md_register_reboot_notifier(&md_notifier
);
3618 raid_table_header
= register_sysctl_table(raid_root_table
, 1);
3620 #ifdef CONFIG_MD_LINEAR
3623 #ifdef CONFIG_MD_RAID0
3626 #ifdef CONFIG_MD_RAID1
3629 #ifdef CONFIG_MD_RAID5
3636 #ifdef CONFIG_MD_BOOT
3637 #define MAX_MD_BOOT_DEVS 8
3640 int pers
[MAX_MD_BOOT_DEVS
];
3641 int chunk
[MAX_MD_BOOT_DEVS
];
3642 kdev_t devices
[MAX_MD_BOOT_DEVS
][MAX_REAL
];
3643 } md_setup_args md__initdata
= { 0, };
/*
 * Parse the command-line parameters given to our kernel, but do not
 * actually try to invoke the MD device now; that is handled by
 * md_setup_drive after the low-level disk drivers have initialised.
 *
 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
 *             assigns the task of parsing integer arguments to the
 *             invoked program now).  Added ability to initialise all
 *             the MD devices (by specifying multiple "md=" lines)
 *             instead of just one.  -- KTK
 * 18May2000:  Added support for persistent-superblock arrays:
 *             md=n,0,factor,fault,device-list   uses RAID0 for device n
 *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
 *             md=n,device-list reads a RAID superblock from the devices
 *             elements in device-list are read by name_to_kdev_t so can be
 *             a hex number or something like /dev/hda1 /dev/sdb
 */
3662 extern kdev_t
name_to_kdev_t(char *line
) md__init
;
3663 static int md__init
md_setup(char *str
)
3665 int minor
, level
, factor
, fault
, i
=0;
3667 char *devnames
, *pername
= "";
3669 if(get_option(&str
, &minor
) != 2) { /* MD Number */
3670 printk("md: Too few arguments supplied to md=.\n");
3673 if (minor
>= MAX_MD_BOOT_DEVS
) {
3674 printk ("md: Minor device number too high.\n");
3676 } else if (md_setup_args
.set
& (1 << minor
)) {
3677 printk ("md: Warning - md=%d,... has been specified twice;\n"
3678 " will discard the first definition.\n", minor
);
3680 switch(get_option(&str
, &level
)) { /* RAID Personality */
3681 case 2: /* could be 0 or -1.. */
3682 if (level
== 0 || level
== -1) {
3683 if (get_option(&str
, &factor
) != 2 || /* Chunk Size */
3684 get_option(&str
, &fault
) != 2) {
3685 printk("md: Too few arguments supplied to md=.\n");
3688 md_setup_args
.pers
[minor
] = level
;
3689 md_setup_args
.chunk
[minor
] = 1 << (factor
+12);
3700 printk ("md: The kernel has not been configured for raid%d"
3701 " support!\n", level
);
3704 md_setup_args
.pers
[minor
] = level
;
3708 case 1: /* the first device is numeric */
3709 md_setup_args
.devices
[minor
][i
++] = level
;
3712 md_setup_args
.pers
[minor
] = 0;
3713 pername
="super-block";
3716 for (; i
<MAX_REAL
&& str
; i
++) {
3717 if ((device
= name_to_kdev_t(str
))) {
3718 md_setup_args
.devices
[minor
][i
] = device
;
3720 printk ("md: Unknown device name, %s.\n", str
);
3723 if ((str
= strchr(str
, ',')) != NULL
)
3727 printk ("md: No devices specified for md%d?\n", minor
);
3731 printk ("md: Will configure md%d (%s) from %s, below.\n",
3732 minor
, pername
, devnames
);
3733 md_setup_args
.devices
[minor
][i
] = (kdev_t
) 0;
3734 md_setup_args
.set
|= (1 << minor
);
3738 void md__init
md_setup_drive(void)
3744 for (minor
= 0; minor
< MAX_MD_BOOT_DEVS
; minor
++) {
3745 mdu_disk_info_t dinfo
;
3747 if (!(md_setup_args
.set
& (1 << minor
)))
3749 printk("md: Loading md%d.\n", minor
);
3750 if (mddev_map
[minor
].mddev
) {
3751 printk(".. md%d already autodetected - use raid=noautodetect\n", minor
);
3754 mddev
= alloc_mddev(MKDEV(MD_MAJOR
,minor
));
3755 if (md_setup_args
.pers
[minor
]) {
3756 /* non-persistent */
3757 mdu_array_info_t ainfo
;
3758 ainfo
.level
= pers_to_level(md_setup_args
.pers
[minor
]);
3761 ainfo
.raid_disks
=0;
3762 ainfo
.md_minor
=minor
;
3763 ainfo
.not_persistent
= 1;
3765 ainfo
.state
= MD_SB_CLEAN
;
3766 ainfo
.active_disks
= 0;
3767 ainfo
.working_disks
= 0;
3768 ainfo
.failed_disks
= 0;
3769 ainfo
.spare_disks
= 0;
3771 ainfo
.chunk_size
= md_setup_args
.chunk
[minor
];
3772 err
= set_array_info(mddev
, &ainfo
);
3773 for (i
=0; !err
&& (dev
= md_setup_args
.devices
[minor
][i
]); i
++) {
3775 dinfo
.raid_disk
= i
;
3776 dinfo
.state
= (1<<MD_DISK_ACTIVE
)|(1<<MD_DISK_SYNC
);
3777 dinfo
.major
= MAJOR(dev
);
3778 dinfo
.minor
= MINOR(dev
);
3779 mddev
->sb
->nr_disks
++;
3780 mddev
->sb
->raid_disks
++;
3781 mddev
->sb
->active_disks
++;
3782 mddev
->sb
->working_disks
++;
3783 err
= add_new_disk (mddev
, &dinfo
);
3787 for (i
= 0; (dev
= md_setup_args
.devices
[minor
][i
]); i
++) {
3788 dinfo
.major
= MAJOR(dev
);
3789 dinfo
.minor
= MINOR(dev
);
3790 add_new_disk (mddev
, &dinfo
);
3794 err
= do_md_run(mddev
);
3796 mddev
->sb_dirty
= 0;
3797 do_md_stop(mddev
, 0);
3798 printk("md: starting md%d failed\n", minor
);
3803 __setup("md=", md_setup
);
/* Module entry point: defer to the common initialiser.
 * NOTE(review): body reconstructed — the extraction dropped it. */
int init_module (void)
{
	return md_init();
}
3812 static void free_device_names(void)
3814 while (device_names
.next
!= &device_names
) {
3815 struct list_head
*tmp
= device_names
.next
;
3822 void cleanup_module (void)
3824 struct gendisk
**gendisk_ptr
;
3826 md_unregister_thread(md_recovery_thread
);
3827 devfs_unregister(devfs_handle
);
3829 devfs_unregister_blkdev(MAJOR_NR
,"md");
3830 unregister_reboot_notifier(&md_notifier
);
3831 unregister_sysctl_table(raid_table_header
);
3832 #ifdef CONFIG_PROC_FS
3833 remove_proc_entry("mdstat", NULL
);
3836 gendisk_ptr
= &gendisk_head
;
3837 while (*gendisk_ptr
) {
3838 if (*gendisk_ptr
== &md_gendisk
) {
3839 *gendisk_ptr
= md_gendisk
.next
;
3842 gendisk_ptr
= & (*gendisk_ptr
)->next
;
3844 blk_dev
[MAJOR_NR
].queue
= NULL
;
3845 blksize_size
[MAJOR_NR
] = NULL
;
3846 blk_size
[MAJOR_NR
] = NULL
;
3847 max_readahead
[MAJOR_NR
] = NULL
;
3848 hardsect_size
[MAJOR_NR
] = NULL
;
3850 free_device_names();
/* Built-in initialisation hooks and symbols exported to the
 * personality modules (raid0/raid1/raid5/linear).
 * NOTE(review): the #endif closing CONFIG_AUTODETECT_RAID was missing
 * from the extraction and has been restored. */
__initcall(md_init);
#ifdef CONFIG_AUTODETECT_RAID
__initcall(md_run_setup);
#endif

MD_EXPORT_SYMBOL(md_size);
MD_EXPORT_SYMBOL(register_md_personality);
MD_EXPORT_SYMBOL(unregister_md_personality);
MD_EXPORT_SYMBOL(partition_name);
MD_EXPORT_SYMBOL(md_error);
MD_EXPORT_SYMBOL(md_do_sync);
MD_EXPORT_SYMBOL(md_sync_acct);
MD_EXPORT_SYMBOL(md_done_sync);
MD_EXPORT_SYMBOL(md_recover_arrays);
MD_EXPORT_SYMBOL(md_register_thread);
MD_EXPORT_SYMBOL(md_unregister_thread);
MD_EXPORT_SYMBOL(md_update_sb);
MD_EXPORT_SYMBOL(md_wakeup_thread);
MD_EXPORT_SYMBOL(md_print_devices);
MD_EXPORT_SYMBOL(find_rdev_nr);
MD_EXPORT_SYMBOL(md_interrupt_thread);
MD_EXPORT_SYMBOL(mddev_map);
MD_EXPORT_SYMBOL(md_check_ordering);