2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 completely rewritten, based on the MD driver code from Marc Zyngier
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
11 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
12 - kmod support by: Cyrus Durgin
13 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
14 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 - lots of fixes and improvements to the RAID1/RAID5 and generic
17 RAID code (such as request based resynchronization):
19 Neil Brown <neilb@cse.unsw.edu.au>.
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2, or (at your option)
26 You should have received a copy of the GNU General Public License
27 (for example /usr/src/linux/COPYING); if not, write to the Free
28 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31 #include <linux/module.h>
32 #include <linux/config.h>
33 #include <linux/raid/md.h>
34 #include <linux/raid/xor.h>
35 #include <linux/devfs_fs_kernel.h>
38 #include <linux/kmod.h>
41 #define __KERNEL_SYSCALLS__
42 #include <linux/unistd.h>
44 #include <asm/unaligned.h>
46 extern asmlinkage
int sys_sched_yield(void);
47 extern asmlinkage
long sys_setsid(void);
49 #define MAJOR_NR MD_MAJOR
52 #include <linux/blk.h>
56 # define dprintk(x...) printk(x)
58 # define dprintk(x...) do { } while(0)
61 static mdk_personality_t
*pers
[MAX_PERSONALITY
] = {NULL
, };
64 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
65 * is 100 KB/sec, so the extra system load does not show up that much.
66 * Increase it if you want to have more _guaranteed_ speed. Note that
67 the RAID driver will use the maximum available bandwidth if the IO
68 * subsystem is idle. There is also an 'absolute maximum' reconstruction
69 * speed limit - in case reconstruction slows down your system despite
72 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
75 static int sysctl_speed_limit_min
= 100;
76 static int sysctl_speed_limit_max
= 100000;
78 static struct ctl_table_header
*raid_table_header
;
80 static ctl_table raid_table
[] = {
81 {DEV_RAID_SPEED_LIMIT_MIN
, "speed_limit_min",
82 &sysctl_speed_limit_min
, sizeof(int), 0644, NULL
, &proc_dointvec
},
83 {DEV_RAID_SPEED_LIMIT_MAX
, "speed_limit_max",
84 &sysctl_speed_limit_max
, sizeof(int), 0644, NULL
, &proc_dointvec
},
88 static ctl_table raid_dir_table
[] = {
89 {DEV_RAID
, "raid", NULL
, 0, 0555, raid_table
},
93 static ctl_table raid_root_table
[] = {
94 {CTL_DEV
, "dev", NULL
, 0, 0555, raid_dir_table
},
99 * these have to be allocated separately because external
100 * subsystems want to have a pre-defined structure
102 struct hd_struct md_hd_struct
[MAX_MD_DEVS
];
103 static int md_blocksizes
[MAX_MD_DEVS
];
104 static int md_hardsect_sizes
[MAX_MD_DEVS
];
105 static int md_maxreadahead
[MAX_MD_DEVS
];
106 static mdk_thread_t
*md_recovery_thread
= NULL
;
108 int md_size
[MAX_MD_DEVS
] = {0, };
110 extern struct block_device_operations md_fops
;
111 static devfs_handle_t devfs_handle
= NULL
;
113 static struct gendisk md_gendisk
=
121 nr_real
: MAX_MD_DEVS
,
128 * Enables to iterate over all existing md arrays
130 static MD_LIST_HEAD(all_mddevs
);
133 * The mapping between kdev and mddev is not necessarily a simple
134 * one! Eg. HSM uses several sub-devices to implement Logical
135 * Volumes. All these sub-devices map to the same mddev.
137 dev_mapping_t mddev_map
[MAX_MD_DEVS
] = { {NULL
, 0}, };
/*
 * add_mddev_mapping(): record the minor -> mddev association in the global
 * mddev_map[] table, together with an opaque per-mapping cookie ('data').
 * NOTE(review): this chunk is a line-mangled extraction with lines dropped
 * (the guard-clause bodies are missing); code is kept byte-identical and
 * only comments were added.
 */
139 void add_mddev_mapping (mddev_t
* mddev
, kdev_t dev
, void *data
)
/* Table index is the device's minor number. */
141 unsigned int minor
= MINOR(dev
);
/* Sanity check: only MD-major devices may be mapped (error body missing here). */
143 if (MAJOR(dev
) != MD_MAJOR
) {
/* Refuse to overwrite an already-occupied slot (error body missing here). */
147 if (mddev_map
[minor
].mddev
!= NULL
) {
/* Install the mapping and its private data pointer. */
151 mddev_map
[minor
].mddev
= mddev
;
152 mddev_map
[minor
].data
= data
;
/*
 * del_mddev_mapping(): clear the minor -> mddev association previously
 * installed by add_mddev_mapping().
 * NOTE(review): garbled extraction; the guard-clause bodies are missing.
 * Code kept byte-identical, comments only.
 */
155 void del_mddev_mapping (mddev_t
* mddev
, kdev_t dev
)
/* Table index is the device's minor number. */
157 unsigned int minor
= MINOR(dev
);
/* Sanity check: only MD-major devices live in mddev_map (error body missing). */
159 if (MAJOR(dev
) != MD_MAJOR
) {
/* The slot must currently belong to the caller's mddev (error body missing). */
163 if (mddev_map
[minor
].mddev
!= mddev
) {
/* Clear both the mddev pointer and the private data cookie. */
167 mddev_map
[minor
].mddev
= NULL
;
168 mddev_map
[minor
].data
= NULL
;
/*
 * md_make_request(): block-layer entry point for MD devices. Looks up the
 * owning mddev from the buffer_head's rdev and forwards the request to the
 * active personality's make_request() handler.
 * NOTE(review): the fallback path taken when there is no mddev/personality
 * is not visible in this extraction. Code kept byte-identical.
 */
171 static int md_make_request (request_queue_t
*q
, int rw
, struct buffer_head
* bh
)
/* Resolve the mddev that owns the target device of this buffer_head. */
173 mddev_t
*mddev
= kdev_to_mddev(bh
->b_rdev
);
/* Only dispatch if the array exists and a personality is attached. */
175 if (mddev
&& mddev
->pers
)
176 return mddev
->pers
->make_request(mddev
, rw
, bh
);
183 static mddev_t
* alloc_mddev (kdev_t dev
)
187 if (MAJOR(dev
) != MD_MAJOR
) {
191 mddev
= (mddev_t
*) kmalloc(sizeof(*mddev
), GFP_KERNEL
);
195 memset(mddev
, 0, sizeof(*mddev
));
197 mddev
->__minor
= MINOR(dev
);
198 init_MUTEX(&mddev
->reconfig_sem
);
199 init_MUTEX(&mddev
->recovery_sem
);
200 init_MUTEX(&mddev
->resync_sem
);
201 MD_INIT_LIST_HEAD(&mddev
->disks
);
202 MD_INIT_LIST_HEAD(&mddev
->all_mddevs
);
205 * The 'base' mddev is the one with data NULL.
206 * personalities can create additional mddevs
209 add_mddev_mapping(mddev
, dev
, 0);
210 md_list_add(&mddev
->all_mddevs
, &all_mddevs
);
217 struct gendisk
* find_gendisk (kdev_t dev
)
219 struct gendisk
*tmp
= gendisk_head
;
221 while (tmp
!= NULL
) {
222 if (tmp
->major
== MAJOR(dev
))
/*
 * find_rdev_nr(): scan the array's rdev list for the member whose
 * descriptor number matches 'nr'.
 * NOTE(review): the return statements are missing from this extraction
 * (presumably returns the match, else NULL -- verify against full source).
 * Code kept byte-identical.
 */
229 mdk_rdev_t
* find_rdev_nr(mddev_t
*mddev
, int nr
)
232 struct md_list_head
*tmp
;
/* Walk every rdev attached to this mddev. */
234 ITERATE_RDEV(mddev
,rdev
,tmp
) {
/* Match on the superblock descriptor index. */
235 if (rdev
->desc_nr
== nr
)
/*
 * find_rdev(): scan the array's rdev list for the member backed by the
 * given kdev_t device.
 * NOTE(review): the return statements are missing from this extraction
 * (presumably returns the match, else NULL -- verify against full source).
 * Code kept byte-identical.
 */
241 mdk_rdev_t
* find_rdev(mddev_t
* mddev
, kdev_t dev
)
243 struct md_list_head
*tmp
;
/* Walk every rdev attached to this mddev. */
246 ITERATE_RDEV(mddev
,rdev
,tmp
) {
/* Match on the underlying device number. */
247 if (rdev
->dev
== dev
)
253 static MD_LIST_HEAD(device_names
);
255 char * partition_name (kdev_t dev
)
258 static char nomem
[] = "<nomem>";
260 struct md_list_head
*tmp
= device_names
.next
;
262 while (tmp
!= &device_names
) {
263 dname
= md_list_entry(tmp
, dev_name_t
, list
);
264 if (dname
->dev
== dev
)
269 dname
= (dev_name_t
*) kmalloc(sizeof(*dname
), GFP_KERNEL
);
274 * ok, add this new device name to the list
276 hd
= find_gendisk (dev
);
279 dname
->name
= disk_name (hd
, MINOR(dev
), dname
->namebuf
);
281 sprintf (dname
->namebuf
, "[dev %s]", kdevname(dev
));
282 dname
->name
= dname
->namebuf
;
286 MD_INIT_LIST_HEAD(&dname
->list
);
287 md_list_add(&dname
->list
, &device_names
);
292 static unsigned int calc_dev_sboffset (kdev_t dev
, mddev_t
*mddev
,
295 unsigned int size
= 0;
297 if (blk_size
[MAJOR(dev
)])
298 size
= blk_size
[MAJOR(dev
)][MINOR(dev
)];
300 size
= MD_NEW_SIZE_BLOCKS(size
);
304 static unsigned int calc_dev_size (kdev_t dev
, mddev_t
*mddev
, int persistent
)
308 size
= calc_dev_sboffset(dev
, mddev
, persistent
);
313 if (mddev
->sb
->chunk_size
)
314 size
&= ~(mddev
->sb
->chunk_size
/1024 - 1);
318 static unsigned int zoned_raid_size (mddev_t
*mddev
)
322 struct md_list_head
*tmp
;
329 * do size and offset calculations.
331 mask
= ~(mddev
->sb
->chunk_size
/1024 - 1);
333 ITERATE_RDEV(mddev
,rdev
,tmp
) {
335 md_size
[mdidx(mddev
)] += rdev
->size
;
341 * We check whether all devices are numbered from 0 to nb_dev-1. The
342 * order is guaranteed even after device name changes.
344 * Some personalities (raid0, linear) use this. Personalities that
345 * provide data have to be able to deal with loss of individual
346 * disks, so they do their checking themselves.
348 int md_check_ordering (mddev_t
*mddev
)
352 struct md_list_head
*tmp
;
355 * First, all devices must be fully functional
357 ITERATE_RDEV(mddev
,rdev
,tmp
) {
359 printk("md: md%d's device %s faulty, aborting.\n",
360 mdidx(mddev
), partition_name(rdev
->dev
));
366 ITERATE_RDEV(mddev
,rdev
,tmp
) {
369 if (c
!= mddev
->nb_dev
) {
373 if (mddev
->nb_dev
!= mddev
->sb
->raid_disks
) {
374 printk("md: md%d, array needs %d disks, has %d, aborting.\n",
375 mdidx(mddev
), mddev
->sb
->raid_disks
, mddev
->nb_dev
);
379 * Now the numbering check
381 for (i
= 0; i
< mddev
->nb_dev
; i
++) {
383 ITERATE_RDEV(mddev
,rdev
,tmp
) {
384 if (rdev
->desc_nr
== i
)
388 printk("md: md%d, missing disk #%d, aborting.\n",
393 printk("md: md%d, too many disks #%d, aborting.\n",
403 static void remove_descriptor (mdp_disk_t
*disk
, mdp_super_t
*sb
)
405 if (disk_active(disk
)) {
408 if (disk_spare(disk
)) {
418 mark_disk_removed(disk
);
421 #define BAD_MAGIC KERN_ERR \
422 "md: invalid raid superblock magic on %s\n"
424 #define BAD_MINOR KERN_ERR \
425 "md: %s: invalid raid minor (%x)\n"
427 #define OUT_OF_MEM KERN_ALERT \
428 "md: out of memory.\n"
430 #define NO_SB KERN_ERR \
431 "md: disabled device %s, could not read superblock.\n"
433 #define BAD_CSUM KERN_WARNING \
434 "md: invalid superblock checksum on %s\n"
436 static int alloc_array_sb (mddev_t
* mddev
)
443 mddev
->sb
= (mdp_super_t
*) __get_free_page (GFP_KERNEL
);
446 md_clear_page(mddev
->sb
);
450 static int alloc_disk_sb (mdk_rdev_t
* rdev
)
455 rdev
->sb
= (mdp_super_t
*) __get_free_page(GFP_KERNEL
);
460 md_clear_page(rdev
->sb
);
465 static void free_disk_sb (mdk_rdev_t
* rdev
)
468 free_page((unsigned long) rdev
->sb
);
478 static void mark_rdev_faulty (mdk_rdev_t
* rdev
)
488 static int read_disk_sb (mdk_rdev_t
* rdev
)
491 struct buffer_head
*bh
= NULL
;
492 kdev_t dev
= rdev
->dev
;
494 unsigned long sb_offset
;
502 * Calculate the position of the superblock,
503 * it's at the end of the disk
505 sb_offset
= calc_dev_sboffset(rdev
->dev
, rdev
->mddev
, 1);
506 rdev
->sb_offset
= sb_offset
;
507 printk("(read) %s's sb offset: %ld", partition_name(dev
), sb_offset
);
509 set_blocksize (dev
, MD_SB_BYTES
);
510 bh
= bread (dev
, sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
513 sb
= (mdp_super_t
*) bh
->b_data
;
514 memcpy (rdev
->sb
, sb
, MD_SB_BYTES
);
516 printk (NO_SB
,partition_name(rdev
->dev
));
519 printk(" [events: %08lx]\n", (unsigned long)rdev
->sb
->events_lo
);
/*
 * calc_sb_csum(): compute the checksum of an MD superblock with
 * csum_partial() over MD_SB_BYTES. The on-disk checksum field is saved
 * first and restored afterwards so the superblock is left unmodified.
 * NOTE(review): the line clearing sb->sb_csum before checksumming and the
 * final 'return csum;' are missing from this extraction -- confirm against
 * the full source. Code kept byte-identical.
 */
527 static unsigned int calc_sb_csum (mdp_super_t
* sb
)
529 unsigned int disk_csum
, csum
;
/* Save the stored checksum so it can be restored after the computation. */
531 disk_csum
= sb
->sb_csum
;
/* Checksum the whole superblock image. */
533 csum
= csum_partial((void *)sb
, MD_SB_BYTES
, 0);
/* Put the original on-disk checksum back in place. */
534 sb
->sb_csum
= disk_csum
;
539 * Check one RAID superblock for generic plausibility
542 static int check_disk_sb (mdk_rdev_t
* rdev
)
553 if (sb
->md_magic
!= MD_SB_MAGIC
) {
554 printk (BAD_MAGIC
, partition_name(rdev
->dev
));
558 if (sb
->md_minor
>= MAX_MD_DEVS
) {
559 printk (BAD_MINOR
, partition_name(rdev
->dev
),
564 if (calc_sb_csum(sb
) != sb
->sb_csum
)
565 printk(BAD_CSUM
, partition_name(rdev
->dev
));
/*
 * dev_unit(): map a (possibly partition) kdev_t to the kdev_t of its whole
 * physical disk by masking off the per-partition minor bits, using the
 * gendisk's minor_shift.
 * NOTE(review): the declaration of 'mask' and any NULL check on 'hd' are
 * missing from this extraction. Code kept byte-identical.
 */
571 static kdev_t
dev_unit(kdev_t dev
)
/* Find the gendisk describing this device's major. */
574 struct gendisk
*hd
= find_gendisk(dev
);
/* All-ones above the partition bits: selects the unit (whole-disk) minor. */
578 mask
= ~((1 << hd
->minor_shift
) - 1);
580 return MKDEV(MAJOR(dev
), MINOR(dev
) & mask
);
/*
 * match_dev_unit(): search the array for an rdev that lives on the same
 * physical disk as 'dev' (comparison via dev_unit(), which strips the
 * partition bits).
 * NOTE(review): return statements are missing from this extraction
 * (presumably returns the matching rdev, else NULL). Code kept byte-identical.
 */
583 static mdk_rdev_t
* match_dev_unit(mddev_t
*mddev
, kdev_t dev
)
585 struct md_list_head
*tmp
;
/* Walk every rdev of this array. */
588 ITERATE_RDEV(mddev
,rdev
,tmp
)
/* Same whole-disk unit => same physical spindle. */
589 if (dev_unit(rdev
->dev
) == dev_unit(dev
))
/*
 * match_mddev_units(): determine whether two arrays share any physical
 * disk, by checking each rdev of mddev1 against mddev2 via match_dev_unit().
 * NOTE(review): return statements are missing from this extraction.
 * Code kept byte-identical.
 */
595 static int match_mddev_units(mddev_t
*mddev1
, mddev_t
*mddev2
)
597 struct md_list_head
*tmp
;
/* Walk every rdev of the first array. */
600 ITERATE_RDEV(mddev1
,rdev
,tmp
)
/* Does the second array have a member on the same physical disk? */
601 if (match_dev_unit(mddev2
, rdev
->dev
))
607 static MD_LIST_HEAD(all_raid_disks
);
608 static MD_LIST_HEAD(pending_raid_disks
);
610 static void bind_rdev_to_array (mdk_rdev_t
* rdev
, mddev_t
* mddev
)
612 mdk_rdev_t
*same_pdev
;
618 same_pdev
= match_dev_unit(mddev
, rdev
->dev
);
621 "md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
622 " protection against single-disk failure might be compromised.\n",
623 mdidx(mddev
), partition_name(rdev
->dev
),
624 partition_name(same_pdev
->dev
));
626 md_list_add(&rdev
->same_set
, &mddev
->disks
);
629 printk("bind<%s,%d>\n", partition_name(rdev
->dev
), mddev
->nb_dev
);
632 static void unbind_rdev_from_array (mdk_rdev_t
* rdev
)
638 md_list_del(&rdev
->same_set
);
639 MD_INIT_LIST_HEAD(&rdev
->same_set
);
640 rdev
->mddev
->nb_dev
--;
641 printk("unbind<%s,%d>\n", partition_name(rdev
->dev
),
642 rdev
->mddev
->nb_dev
);
647 * prevent the device from being mounted, repartitioned or
648 * otherwise reused by a RAID array (or any other kernel
649 * subsystem), by opening the device. [simply getting an
650 * inode is not enough, the SCSI module usage code needs
651 * an explicit open() on the device]
653 static int lock_rdev (mdk_rdev_t
*rdev
)
658 * First insert a dummy inode.
662 rdev
->inode
= get_empty_inode();
666 * we dont care about any other fields
668 rdev
->inode
->i_dev
= rdev
->inode
->i_rdev
= rdev
->dev
;
669 insert_inode_hash(rdev
->inode
);
671 memset(&rdev
->filp
, 0, sizeof(rdev
->filp
));
672 rdev
->filp
.f_mode
= 3; /* read write */
676 static void unlock_rdev (mdk_rdev_t
*rdev
)
684 static void export_rdev (mdk_rdev_t
* rdev
)
686 printk("export_rdev(%s)\n",partition_name(rdev
->dev
));
691 md_list_del(&rdev
->all
);
692 MD_INIT_LIST_HEAD(&rdev
->all
);
693 if (rdev
->pending
.next
!= &rdev
->pending
) {
694 printk("(%s was pending)\n",partition_name(rdev
->dev
));
695 md_list_del(&rdev
->pending
);
696 MD_INIT_LIST_HEAD(&rdev
->pending
);
703 static void kick_rdev_from_array (mdk_rdev_t
* rdev
)
705 unbind_rdev_from_array(rdev
);
709 static void export_array (mddev_t
*mddev
)
711 struct md_list_head
*tmp
;
713 mdp_super_t
*sb
= mddev
->sb
;
717 free_page((unsigned long) sb
);
720 ITERATE_RDEV(mddev
,rdev
,tmp
) {
725 kick_rdev_from_array(rdev
);
731 static void free_mddev (mddev_t
*mddev
)
739 md_size
[mdidx(mddev
)] = 0;
740 md_hd_struct
[mdidx(mddev
)].nr_sects
= 0;
743 * Make sure nobody else is using this mddev
744 * (careful, we rely on the global kernel lock here)
746 while (md_atomic_read(&mddev
->resync_sem
.count
) != 1)
748 while (md_atomic_read(&mddev
->recovery_sem
.count
) != 1)
751 del_mddev_mapping(mddev
, MKDEV(MD_MAJOR
, mdidx(mddev
)));
752 md_list_del(&mddev
->all_mddevs
);
753 MD_INIT_LIST_HEAD(&mddev
->all_mddevs
);
763 static void print_desc(mdp_disk_t
*desc
)
765 printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc
->number
,
766 partition_name(MKDEV(desc
->major
,desc
->minor
)),
767 desc
->major
,desc
->minor
,desc
->raid_disk
,desc
->state
);
770 static void print_sb(mdp_super_t
*sb
)
774 printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
775 sb
->major_version
, sb
->minor_version
, sb
->patch_version
,
776 sb
->set_uuid0
, sb
->set_uuid1
, sb
->set_uuid2
, sb
->set_uuid3
,
778 printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb
->level
,
779 sb
->size
, sb
->nr_disks
, sb
->raid_disks
, sb
->md_minor
,
780 sb
->layout
, sb
->chunk_size
);
781 printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
782 sb
->utime
, sb
->state
, sb
->active_disks
, sb
->working_disks
,
783 sb
->failed_disks
, sb
->spare_disks
,
784 sb
->sb_csum
, (unsigned long)sb
->events_lo
);
786 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
789 desc
= sb
->disks
+ i
;
790 printk(" D %2d: ", i
);
794 print_desc(&sb
->this_disk
);
798 static void print_rdev(mdk_rdev_t
*rdev
)
800 printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
801 partition_name(rdev
->dev
), partition_name(rdev
->old_dev
),
802 rdev
->size
, rdev
->faulty
, rdev
->desc_nr
);
804 printk("rdev superblock:\n");
807 printk("no rdev superblock!\n");
810 void md_print_devices (void)
812 struct md_list_head
*tmp
, *tmp2
;
817 printk(" **********************************\n");
818 printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
819 printk(" **********************************\n");
820 ITERATE_MDDEV(mddev
,tmp
) {
821 printk("md%d: ", mdidx(mddev
));
823 ITERATE_RDEV(mddev
,rdev
,tmp2
)
824 printk("<%s>", partition_name(rdev
->dev
));
827 printk(" array superblock:\n");
830 printk(" no array superblock.\n");
832 ITERATE_RDEV(mddev
,rdev
,tmp2
)
835 printk(" **********************************\n");
839 static int sb_equal ( mdp_super_t
*sb1
, mdp_super_t
*sb2
)
842 mdp_super_t
*tmp1
, *tmp2
;
844 tmp1
= kmalloc(sizeof(*tmp1
),GFP_KERNEL
);
845 tmp2
= kmalloc(sizeof(*tmp2
),GFP_KERNEL
);
847 if (!tmp1
|| !tmp2
) {
856 * nr_disks is not constant
861 if (memcmp(tmp1
, tmp2
, MD_SB_GENERIC_CONSTANT_WORDS
* 4))
/*
 * uuid_equal(): compare the array-set UUIDs stored in two rdevs'
 * superblocks. The UUID is held as four 32-bit words (set_uuid0..3); all
 * four must match.
 * NOTE(review): the return statements are missing from this extraction
 * (presumably 1 on match, 0 otherwise). Code kept byte-identical.
 */
875 static int uuid_equal(mdk_rdev_t
*rdev1
, mdk_rdev_t
*rdev2
)
/* All four UUID words must be identical. */
877 if ( (rdev1
->sb
->set_uuid0
== rdev2
->sb
->set_uuid0
) &&
878 (rdev1
->sb
->set_uuid1
== rdev2
->sb
->set_uuid1
) &&
879 (rdev1
->sb
->set_uuid2
== rdev2
->sb
->set_uuid2
) &&
880 (rdev1
->sb
->set_uuid3
== rdev2
->sb
->set_uuid3
))
/*
 * find_rdev_all(): search the GLOBAL list of all known raid disks
 * (all_raid_disks) -- not a single array's list -- for the rdev backed by
 * device 'dev'.
 * NOTE(review): the loop-advance and return statements are missing from
 * this extraction (presumably returns the match, else NULL).
 * Code kept byte-identical.
 */
887 static mdk_rdev_t
* find_rdev_all (kdev_t dev
)
889 struct md_list_head
*tmp
;
/* Start at the head of the global all_raid_disks list. */
892 tmp
= all_raid_disks
.next
;
/* Manual list walk until we wrap back to the head. */
893 while (tmp
!= &all_raid_disks
) {
/* Recover the containing rdev from its 'all' list node. */
894 rdev
= md_list_entry(tmp
, mdk_rdev_t
, all
);
895 if (rdev
->dev
== dev
)
902 #define GETBLK_FAILED KERN_ERR \
903 "md: getblk failed for device %s\n"
905 static int write_disk_sb(mdk_rdev_t
* rdev
)
907 struct buffer_head
*bh
;
909 unsigned long sb_offset
, size
;
920 if (rdev
->sb
->md_magic
!= MD_SB_MAGIC
) {
926 sb_offset
= calc_dev_sboffset(dev
, rdev
->mddev
, 1);
927 if (rdev
->sb_offset
!= sb_offset
) {
928 printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev
), rdev
->sb_offset
, sb_offset
);
932 * If the disk went offline meanwhile and it's just a spare, then
933 * its size has changed to zero silently, and the MD code does
934 * not yet know that it's faulty.
936 size
= calc_dev_size(dev
, rdev
->mddev
, 1);
937 if (size
!= rdev
->size
) {
938 printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev
), rdev
->size
, size
);
942 printk("(write) %s's sb offset: %ld\n", partition_name(dev
), sb_offset
);
944 set_blocksize(dev
, MD_SB_BYTES
);
945 bh
= getblk(dev
, sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
947 printk(GETBLK_FAILED
, partition_name(dev
));
950 memset(bh
->b_data
,0,bh
->b_size
);
951 sb
= (mdp_super_t
*) bh
->b_data
;
952 memcpy(sb
, rdev
->sb
, MD_SB_BYTES
);
954 mark_buffer_uptodate(bh
, 1);
955 mark_buffer_dirty(bh
);
956 ll_rw_block(WRITE
, 1, &bh
);
965 static void set_this_disk(mddev_t
*mddev
, mdk_rdev_t
*rdev
)
970 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
971 desc
= mddev
->sb
->disks
+ i
;
973 if (disk_faulty(desc
)) {
974 if (MKDEV(desc
->major
,desc
->minor
) == rdev
->dev
)
979 if (MKDEV(desc
->major
,desc
->minor
) == rdev
->dev
) {
980 rdev
->sb
->this_disk
= *desc
;
981 rdev
->desc_nr
= desc
->number
;
992 static int sync_sbs(mddev_t
* mddev
)
996 struct md_list_head
*tmp
;
998 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1003 set_this_disk(mddev
, rdev
);
1004 sb
->sb_csum
= calc_sb_csum(sb
);
1009 int md_update_sb(mddev_t
* mddev
)
1011 int first
, err
, count
= 100;
1012 struct md_list_head
*tmp
;
1016 mddev
->sb
->utime
= CURRENT_TIME
;
1017 if ((++mddev
->sb
->events_lo
)==0)
1018 ++mddev
->sb
->events_hi
;
1020 if ((mddev
->sb
->events_lo
|mddev
->sb
->events_hi
)==0) {
1022 * oops, this 64-bit counter should never wrap.
1023 * Either we are in around ~1 trillion A.C., assuming
1024 * 1 reboot per second, or we have a bug:
1027 mddev
->sb
->events_lo
= mddev
->sb
->events_hi
= 0xffffffff;
1032 * do not write anything to disk if using
1033 * nonpersistent superblocks
1035 if (mddev
->sb
->not_persistent
)
1038 printk(KERN_INFO
"md: updating md%d RAID superblock on device\n",
1043 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1049 printk("(skipping faulty ");
1050 printk("%s ", partition_name(rdev
->dev
));
1051 if (!rdev
->faulty
) {
1052 printk("[events: %08lx]",
1053 (unsigned long)rdev
->sb
->events_lo
);
1054 err
+= write_disk_sb(rdev
);
1060 printk("errors occured during superblock update, repeating\n");
1063 printk("excessive errors occured during superblock update, exiting\n");
1069 * Import a device. If 'on_disk', then sanity check the superblock
1071 * mark the device faulty if:
1073 * - the device is nonexistent (zero size)
1074 * - the device has no valid superblock
1076 * a faulty rdev _never_ has rdev->sb set.
1078 static int md_import_device (kdev_t newdev
, int on_disk
)
1084 if (find_rdev_all(newdev
))
1087 rdev
= (mdk_rdev_t
*) kmalloc(sizeof(*rdev
), GFP_KERNEL
);
1089 printk("could not alloc mem for %s!\n", partition_name(newdev
));
1092 memset(rdev
, 0, sizeof(*rdev
));
1094 if (get_super(newdev
)) {
1095 printk("md: can not import %s, has active inodes!\n",
1096 partition_name(newdev
));
1101 if ((err
= alloc_disk_sb(rdev
)))
1105 if (lock_rdev(rdev
)) {
1106 printk("md: could not lock %s, zero-size? Marking faulty.\n",
1107 partition_name(newdev
));
1115 if (blk_size
[MAJOR(newdev
)])
1116 size
= blk_size
[MAJOR(newdev
)][MINOR(newdev
)];
1118 printk("md: %s has zero size, marking faulty!\n",
1119 partition_name(newdev
));
1125 if ((err
= read_disk_sb(rdev
))) {
1126 printk("md: could not read %s's sb, not importing!\n",
1127 partition_name(newdev
));
1130 if ((err
= check_disk_sb(rdev
))) {
1131 printk("md: %s has invalid sb, not importing!\n",
1132 partition_name(newdev
));
1136 rdev
->old_dev
= MKDEV(rdev
->sb
->this_disk
.major
,
1137 rdev
->sb
->this_disk
.minor
);
1138 rdev
->desc_nr
= rdev
->sb
->this_disk
.number
;
1140 md_list_add(&rdev
->all
, &all_raid_disks
);
1141 MD_INIT_LIST_HEAD(&rdev
->pending
);
1143 if (rdev
->faulty
&& rdev
->sb
)
1158 * Check a full RAID array for plausibility
1161 #define INCONSISTENT KERN_ERR \
1162 "md: fatal superblock inconsistency in %s -- removing from array\n"
1164 #define OUT_OF_DATE KERN_ERR \
1165 "md: superblock update time inconsistency -- using the most recent one\n"
1167 #define OLD_VERSION KERN_ALERT \
1168 "md: md%d: unsupported raid array version %d.%d.%d\n"
1170 #define NOT_CLEAN_IGNORE KERN_ERR \
1171 "md: md%d: raid array is not clean -- starting background reconstruction\n"
1173 #define UNKNOWN_LEVEL KERN_ERR \
1174 "md: md%d: unsupported raid level %d\n"
1176 static int analyze_sbs (mddev_t
* mddev
)
1178 int out_of_date
= 0, i
;
1179 struct md_list_head
*tmp
, *tmp2
;
1180 mdk_rdev_t
*rdev
, *rdev2
, *freshest
;
1184 * Verify the RAID superblock on each real device
1186 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1195 if (check_disk_sb(rdev
))
1200 * The superblock constant part has to be the same
1201 * for all disks in the array.
1205 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1210 if (!sb_equal(sb
, rdev
->sb
)) {
1211 printk (INCONSISTENT
, partition_name(rdev
->dev
));
1212 kick_rdev_from_array(rdev
);
1218 * OK, we have all disks and the array is ready to run. Let's
1219 * find the freshest superblock, that one will be the superblock
1220 * that represents the whole array.
1223 if (alloc_array_sb(mddev
))
1228 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1231 * if the checksum is invalid, use the superblock
1232 * only as a last resort. (decrease its age by
1235 if (calc_sb_csum(rdev
->sb
) != rdev
->sb
->sb_csum
) {
1236 if (rdev
->sb
->events_lo
|| rdev
->sb
->events_hi
)
1237 if ((rdev
->sb
->events_lo
--)==0)
1238 rdev
->sb
->events_hi
--;
1241 printk("%s's event counter: %08lx\n", partition_name(rdev
->dev
),
1242 (unsigned long)rdev
->sb
->events_lo
);
1248 * Find the newest superblock version
1250 ev1
= md_event(rdev
->sb
);
1251 ev2
= md_event(freshest
->sb
);
1259 printk(OUT_OF_DATE
);
1260 printk("freshest: %s\n", partition_name(freshest
->dev
));
1262 memcpy (sb
, freshest
->sb
, sizeof(*sb
));
1265 * at this point we have picked the 'best' superblock
1266 * from all available superblocks.
1267 * now we validate this superblock and kick out possibly
1270 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1272 * Kick all non-fresh devices faulty
1275 ev1
= md_event(rdev
->sb
);
1279 printk("md: kicking non-fresh %s from array!\n",
1280 partition_name(rdev
->dev
));
1281 kick_rdev_from_array(rdev
);
1287 * Fix up changed device names ... but only if this disk has a
1288 * recent update time. Use faulty checksum ones too.
1290 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1291 __u64 ev1
, ev2
, ev3
;
1292 if (rdev
->faulty
) { /* REMOVEME */
1296 ev1
= md_event(rdev
->sb
);
1300 if ((rdev
->dev
!= rdev
->old_dev
) &&
1301 ((ev1
== ev2
) || (ev1
== ev3
))) {
1304 printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev
->old_dev
), partition_name(rdev
->dev
));
1305 if (rdev
->desc_nr
== -1) {
1309 desc
= &sb
->disks
[rdev
->desc_nr
];
1310 if (rdev
->old_dev
!= MKDEV(desc
->major
, desc
->minor
)) {
1314 desc
->major
= MAJOR(rdev
->dev
);
1315 desc
->minor
= MINOR(rdev
->dev
);
1316 desc
= &rdev
->sb
->this_disk
;
1317 desc
->major
= MAJOR(rdev
->dev
);
1318 desc
->minor
= MINOR(rdev
->dev
);
1323 * Remove unavailable and faulty devices ...
1325 * note that if an array becomes completely unrunnable due to
1326 * missing devices, we do not write the superblock back, so the
1327 * administrator has a chance to fix things up. The removal thus
1328 * only happens if it's nonfatal to the contents of the array.
1330 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
1335 desc
= sb
->disks
+ i
;
1336 dev
= MKDEV(desc
->major
, desc
->minor
);
1339 * We kick faulty devices/descriptors immediately.
1341 if (disk_faulty(desc
)) {
1343 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1344 if (rdev
->desc_nr
!= desc
->number
)
1346 printk("md%d: kicking faulty %s!\n",
1347 mdidx(mddev
),partition_name(rdev
->dev
));
1348 kick_rdev_from_array(rdev
);
1353 if (dev
== MKDEV(0,0))
1355 printk("md%d: removing former faulty %s!\n",
1356 mdidx(mddev
), partition_name(dev
));
1358 remove_descriptor(desc
, sb
);
1362 if (dev
== MKDEV(0,0))
1365 * Is this device present in the rdev ring?
1368 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1369 if (rdev
->desc_nr
== desc
->number
) {
1377 printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev
), partition_name(dev
));
1378 remove_descriptor(desc
, sb
);
1382 * Double check whether all devices mentioned in the
1383 * superblock are in the rdev ring.
1385 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
1389 desc
= sb
->disks
+ i
;
1390 dev
= MKDEV(desc
->major
, desc
->minor
);
1392 if (dev
== MKDEV(0,0))
1395 if (disk_faulty(desc
)) {
1400 rdev
= find_rdev(mddev
, dev
);
1408 * Do a final reality check.
1410 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1411 if (rdev
->desc_nr
== -1) {
1416 * is the desc_nr unique?
1418 ITERATE_RDEV(mddev
,rdev2
,tmp2
) {
1419 if ((rdev2
!= rdev
) &&
1420 (rdev2
->desc_nr
== rdev
->desc_nr
)) {
1426 * is the device unique?
1428 ITERATE_RDEV(mddev
,rdev2
,tmp2
) {
1429 if ((rdev2
!= rdev
) &&
1430 (rdev2
->dev
== rdev
->dev
)) {
1438 * Check if we can support this RAID array
1440 if (sb
->major_version
!= MD_MAJOR_VERSION
||
1441 sb
->minor_version
> MD_MINOR_VERSION
) {
1443 printk (OLD_VERSION
, mdidx(mddev
), sb
->major_version
,
1444 sb
->minor_version
, sb
->patch_version
);
1448 if ((sb
->state
!= (1 << MD_SB_CLEAN
)) && ((sb
->level
== 1) ||
1449 (sb
->level
== 4) || (sb
->level
== 5)))
1450 printk (NOT_CLEAN_IGNORE
, mdidx(mddev
));
1462 static int device_size_calculation (mddev_t
* mddev
)
1464 int data_disks
= 0, persistent
;
1465 unsigned int readahead
;
1466 mdp_super_t
*sb
= mddev
->sb
;
1467 struct md_list_head
*tmp
;
1471 * Do device size calculation. Bail out if too small.
1472 * (we have to do this after having validated chunk_size,
1473 * because device size has to be modulo chunk_size)
1475 persistent
= !mddev
->sb
->not_persistent
;
1476 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1483 rdev
->size
= calc_dev_size(rdev
->dev
, mddev
, persistent
);
1484 if (rdev
->size
< sb
->chunk_size
/ 1024) {
1485 printk (KERN_WARNING
1486 "Dev %s smaller than chunk_size: %ldk < %dk\n",
1487 partition_name(rdev
->dev
),
1488 rdev
->size
, sb
->chunk_size
/ 1024);
1493 switch (sb
->level
) {
1501 zoned_raid_size(mddev
);
1505 zoned_raid_size(mddev
);
1506 data_disks
= sb
->raid_disks
;
1513 data_disks
= sb
->raid_disks
-1;
1516 printk (UNKNOWN_LEVEL
, mdidx(mddev
), sb
->level
);
1519 if (!md_size
[mdidx(mddev
)])
1520 md_size
[mdidx(mddev
)] = sb
->size
* data_disks
;
1522 readahead
= MD_READAHEAD
;
1523 if ((sb
->level
== 0) || (sb
->level
== 4) || (sb
->level
== 5)) {
1524 readahead
= (mddev
->sb
->chunk_size
>>PAGE_SHIFT
) * 4 * data_disks
;
1525 if (readahead
< data_disks
* (MAX_SECTORS
>>(PAGE_SHIFT
-9))*2)
1526 readahead
= data_disks
* (MAX_SECTORS
>>(PAGE_SHIFT
-9))*2;
1528 if (sb
->level
== -3)
1531 md_maxreadahead
[mdidx(mddev
)] = readahead
;
1533 printk(KERN_INFO
"md%d: max total readahead window set to %ldk\n",
1534 mdidx(mddev
), readahead
*(PAGE_SIZE
/1024));
1537 "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1538 mdidx(mddev
), data_disks
, readahead
/data_disks
*(PAGE_SIZE
/1024));
1545 #define TOO_BIG_CHUNKSIZE KERN_ERR \
1546 "too big chunk_size: %d > %d\n"
1548 #define TOO_SMALL_CHUNKSIZE KERN_ERR \
1549 "too small chunk_size: %d < %ld\n"
1551 #define BAD_CHUNKSIZE KERN_ERR \
1552 "no chunksize specified, see 'man raidtab'\n"
1554 static int do_md_run (mddev_t
* mddev
)
1558 struct md_list_head
*tmp
;
1562 if (!mddev
->nb_dev
) {
1571 * Resize disks to align partitions size on a given
1574 md_size
[mdidx(mddev
)] = 0;
1577 * Analyze all RAID superblock(s)
1579 if (analyze_sbs(mddev
)) {
1584 chunk_size
= mddev
->sb
->chunk_size
;
1585 pnum
= level_to_pers(mddev
->sb
->level
);
1587 mddev
->param
.chunk_size
= chunk_size
;
1588 mddev
->param
.personality
= pnum
;
1590 if (chunk_size
> MAX_CHUNK_SIZE
) {
1591 printk(TOO_BIG_CHUNKSIZE
, chunk_size
, MAX_CHUNK_SIZE
);
1595 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1597 if ( (1 << ffz(~chunk_size
)) != chunk_size
) {
1601 if (chunk_size
< PAGE_SIZE
) {
1602 printk(TOO_SMALL_CHUNKSIZE
, chunk_size
, PAGE_SIZE
);
1606 if (pnum
>= MAX_PERSONALITY
) {
1611 if ((pnum
!= RAID1
) && (pnum
!= LINEAR
) && !chunk_size
) {
1613 * 'default chunksize' in the old md code used to
1614 * be PAGE_SIZE, baaad.
1615 * we abort here to be on the safe side. We dont
1616 * want to continue the bad practice.
1618 printk(BAD_CHUNKSIZE
);
1625 char module_name
[80];
1626 sprintf (module_name
, "md-personality-%d", pnum
);
1627 request_module (module_name
);
1633 if (device_size_calculation(mddev
))
1637 * Drop all container device buffers, from now on
1638 * the only valid external interface is through the md
1640 * Also find largest hardsector size
1642 md_hardsect_sizes
[mdidx(mddev
)] = 512;
1643 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1646 fsync_dev(rdev
->dev
);
1647 invalidate_buffers(rdev
->dev
);
1648 if (get_hardsect_size(rdev
->dev
)
1649 > md_hardsect_sizes
[mdidx(mddev
)])
1650 md_hardsect_sizes
[mdidx(mddev
)] =
1651 get_hardsect_size(rdev
->dev
);
1653 md_blocksizes
[mdidx(mddev
)] = 1024;
1654 if (md_blocksizes
[mdidx(mddev
)] < md_hardsect_sizes
[mdidx(mddev
)])
1655 md_blocksizes
[mdidx(mddev
)] = md_hardsect_sizes
[mdidx(mddev
)];
1656 mddev
->pers
= pers
[pnum
];
1658 err
= mddev
->pers
->run(mddev
);
1660 printk("pers->run() failed ...\n");
1665 mddev
->sb
->state
&= ~(1 << MD_SB_CLEAN
);
1666 md_update_sb(mddev
);
1669 * md_size has units of 1K blocks, which are
1670 * twice as large as sectors.
1672 md_hd_struct
[mdidx(mddev
)].start_sect
= 0;
1673 md_hd_struct
[mdidx(mddev
)].nr_sects
= md_size
[mdidx(mddev
)] << 1;
1675 read_ahead
[MD_MAJOR
] = 1024;
1679 #undef TOO_BIG_CHUNKSIZE
1680 #undef BAD_CHUNKSIZE
1682 #define OUT(x) do { err = (x); goto out; } while (0)
1684 static int restart_array (mddev_t
*mddev
)
1689 * Complain if it has no devices
1699 set_device_ro(mddev_to_kdev(mddev
), 0);
1702 "md%d switched to read-write mode.\n", mdidx(mddev
));
1704 * Kick recovery or resync if necessary
1706 md_recover_arrays();
1707 if (mddev
->pers
->restart_resync
)
1708 mddev
->pers
->restart_resync(mddev
);
1716 #define STILL_MOUNTED KERN_WARNING \
1717 "md: md%d still mounted.\n"
1719 static int do_md_stop (mddev_t
* mddev
, int ro
)
1721 int err
= 0, resync_interrupted
= 0;
1722 kdev_t dev
= mddev_to_kdev(mddev
);
1724 if (!ro
&& get_super(dev
)) {
1725 printk (STILL_MOUNTED
, mdidx(mddev
));
1731 * It is safe to call stop here, it only frees private
1732 * data. Also, it tells us if a device is unstoppable
1733 * (eg. resyncing is in progress)
1735 if (mddev
->pers
->stop_resync
)
1736 if (mddev
->pers
->stop_resync(mddev
))
1737 resync_interrupted
= 1;
1739 if (mddev
->recovery_running
)
1740 md_interrupt_thread(md_recovery_thread
);
1743 * This synchronizes with signal delivery to the
1744 * resync or reconstruction thread. It also nicely
1745 * hangs the process if some reconstruction has not
1748 down(&mddev
->recovery_sem
);
1749 up(&mddev
->recovery_sem
);
1752 * sync and invalidate buffers because we cannot kill the
1753 * main thread with valid IO transfers still around.
1754 * the kernel lock protects us from new requests being
1755 * added after invalidate_buffers().
1757 fsync_dev (mddev_to_kdev(mddev
));
1759 invalidate_buffers (dev
);
1767 set_device_ro(dev
, 0);
1768 if (mddev
->pers
->stop(mddev
)) {
1770 set_device_ro(dev
, 1);
1778 * mark it clean only if there was no resync
1781 if (!mddev
->recovery_running
&& !resync_interrupted
) {
1782 printk("marking sb clean...\n");
1783 mddev
->sb
->state
|= 1 << MD_SB_CLEAN
;
1785 md_update_sb(mddev
);
1788 set_device_ro(dev
, 1);
1792 * Free resources if final stop
1795 printk (KERN_INFO
"md%d stopped.\n", mdidx(mddev
));
1800 "md%d switched to read-only mode.\n", mdidx(mddev
));
1808 * We have to safely support old arrays too.
1810 int detect_old_array (mdp_super_t
*sb
)
1812 if (sb
->major_version
> 0)
1814 if (sb
->minor_version
>= 90)
1821 static void autorun_array (mddev_t
*mddev
)
1824 struct md_list_head
*tmp
;
1827 if (mddev
->disks
.prev
== &mddev
->disks
) {
1832 printk("running: ");
1834 ITERATE_RDEV(mddev
,rdev
,tmp
) {
1835 printk("<%s>", partition_name(rdev
->dev
));
1839 err
= do_md_run (mddev
);
1841 printk("do_md_run() returned %d\n", err
);
1843 * prevent the writeback of an unrunnable array
1845 mddev
->sb_dirty
= 0;
1846 do_md_stop (mddev
, 0);
1851 * lets try to run arrays based on all disks that have arrived
1852 * until now. (those are in the ->pending list)
1854 * the method: pick the first pending disk, collect all disks with
1855 * the same UUID, remove all from the pending list and put them into
1856 * the 'same_array' list. Then order this list based on superblock
1857 * update time (freshest comes first), kick out 'old' disks and
1858 * compare superblocks. If everything's fine then run it.
1860 static void autorun_devices (void)
1862 struct md_list_head candidates
;
1863 struct md_list_head
*tmp
;
1864 mdk_rdev_t
*rdev0
, *rdev
;
1869 printk("autorun ...\n");
1870 while (pending_raid_disks
.next
!= &pending_raid_disks
) {
1871 rdev0
= md_list_entry(pending_raid_disks
.next
,
1872 mdk_rdev_t
, pending
);
1874 printk("considering %s ...\n", partition_name(rdev0
->dev
));
1875 MD_INIT_LIST_HEAD(&candidates
);
1876 ITERATE_RDEV_PENDING(rdev
,tmp
) {
1877 if (uuid_equal(rdev0
, rdev
)) {
1878 if (!sb_equal(rdev0
->sb
, rdev
->sb
)) {
1879 printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev
->dev
), partition_name(rdev0
->dev
));
1882 printk(" adding %s ...\n", partition_name(rdev
->dev
));
1883 md_list_del(&rdev
->pending
);
1884 md_list_add(&rdev
->pending
, &candidates
);
1888 * now we have a set of devices, with all of them having
1889 * mostly sane superblocks. It's time to allocate the
1892 md_kdev
= MKDEV(MD_MAJOR
, rdev0
->sb
->md_minor
);
1893 mddev
= kdev_to_mddev(md_kdev
);
1895 printk("md%d already running, cannot run %s\n",
1896 mdidx(mddev
), partition_name(rdev0
->dev
));
1897 ITERATE_RDEV_GENERIC(candidates
,pending
,rdev
,tmp
)
1901 mddev
= alloc_mddev(md_kdev
);
1902 printk("created md%d\n", mdidx(mddev
));
1903 ITERATE_RDEV_GENERIC(candidates
,pending
,rdev
,tmp
) {
1904 bind_rdev_to_array(rdev
, mddev
);
1905 md_list_del(&rdev
->pending
);
1906 MD_INIT_LIST_HEAD(&rdev
->pending
);
1908 autorun_array(mddev
);
1910 printk("... autorun DONE.\n");
1914 * import RAID devices based on one partition
1915 * if possible, the array gets run as well.
1918 #define BAD_VERSION KERN_ERR \
1919 "md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
1921 #define OUT_OF_MEM KERN_ALERT \
1922 "md: out of memory.\n"
1924 #define NO_DEVICE KERN_ERR \
1925 "md: disabled device %s\n"
1927 #define AUTOADD_FAILED KERN_ERR \
1928 "md: auto-adding devices to md%d FAILED (error %d).\n"
1930 #define AUTOADD_FAILED_USED KERN_ERR \
1931 "md: cannot auto-add device %s to md%d, already used.\n"
1933 #define AUTORUN_FAILED KERN_ERR \
1934 "md: auto-running md%d FAILED (error %d).\n"
1936 #define MDDEV_BUSY KERN_ERR \
1937 "md: cannot auto-add to md%d, already running.\n"
1939 #define AUTOADDING KERN_INFO \
1940 "md: auto-adding devices to md%d, based on %s's superblock.\n"
1942 #define AUTORUNNING KERN_INFO \
1943 "md: auto-running md%d.\n"
1945 static int autostart_array (kdev_t startdev
)
1947 int err
= -EINVAL
, i
;
1948 mdp_super_t
*sb
= NULL
;
1949 mdk_rdev_t
*start_rdev
= NULL
, *rdev
;
1951 if (md_import_device(startdev
, 1)) {
1952 printk("could not import %s!\n", partition_name(startdev
));
1956 start_rdev
= find_rdev_all(startdev
);
1961 if (start_rdev
->faulty
) {
1962 printk("can not autostart based on faulty %s!\n",
1963 partition_name(startdev
));
1966 md_list_add(&start_rdev
->pending
, &pending_raid_disks
);
1968 sb
= start_rdev
->sb
;
1970 err
= detect_old_array(sb
);
1972 printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
1976 for (i
= 0; i
< MD_SB_DISKS
; i
++) {
1980 desc
= sb
->disks
+ i
;
1981 dev
= MKDEV(desc
->major
, desc
->minor
);
1983 if (dev
== MKDEV(0,0))
1985 if (dev
== startdev
)
1987 if (md_import_device(dev
, 1)) {
1988 printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev
));
1991 rdev
= find_rdev_all(dev
);
1996 md_list_add(&rdev
->pending
, &pending_raid_disks
);
2000 * possibly return codes
2007 export_rdev(start_rdev
);
2014 #undef AUTOADD_FAILED_USED
2015 #undef AUTOADD_FAILED
2016 #undef AUTORUN_FAILED
2024 } raid_setup_args md__initdata
= { 0, 0 };
2026 void md_setup_drive(void) md__init
;
2029 * Searches all registered partitions for autorun RAID arrays
2032 #ifdef CONFIG_AUTODETECT_RAID
2033 static int detected_devices
[128] md__initdata
;
2034 static int dev_cnt
=0;
2035 void md_autodetect_dev(kdev_t dev
)
2037 if (dev_cnt
>= 0 && dev_cnt
< 127)
2038 detected_devices
[dev_cnt
++] = dev
;
2042 void md__init
md_run_setup(void)
2044 #ifdef CONFIG_AUTODETECT_RAID
2048 if (raid_setup_args
.noautodetect
)
2049 printk(KERN_INFO
"skipping autodetection of RAID arrays\n");
2052 printk(KERN_INFO
"autodetecting RAID arrays\n");
2054 for (i
=0; i
<dev_cnt
; i
++) {
2055 kdev_t dev
= detected_devices
[i
];
2057 if (md_import_device(dev
,1)) {
2058 printk(KERN_ALERT
"could not import %s!\n",
2059 partition_name(dev
));
2065 rdev
= find_rdev_all(dev
);
2074 md_list_add(&rdev
->pending
, &pending_raid_disks
);
2080 dev_cnt
= -1; /* make sure further calls to md_autodetect_dev are ignored */
2082 #ifdef CONFIG_MD_BOOT
2088 static int get_version (void * arg
)
2092 ver
.major
= MD_MAJOR_VERSION
;
2093 ver
.minor
= MD_MINOR_VERSION
;
2094 ver
.patchlevel
= MD_PATCHLEVEL_VERSION
;
2096 if (md_copy_to_user(arg
, &ver
, sizeof(ver
)))
2102 #define SET_FROM_SB(x) info.x = mddev->sb->x
2103 static int get_array_info (mddev_t
* mddev
, void * arg
)
2105 mdu_array_info_t info
;
2110 SET_FROM_SB(major_version
);
2111 SET_FROM_SB(minor_version
);
2112 SET_FROM_SB(patch_version
);
2116 SET_FROM_SB(nr_disks
);
2117 SET_FROM_SB(raid_disks
);
2118 SET_FROM_SB(md_minor
);
2119 SET_FROM_SB(not_persistent
);
2123 SET_FROM_SB(active_disks
);
2124 SET_FROM_SB(working_disks
);
2125 SET_FROM_SB(failed_disks
);
2126 SET_FROM_SB(spare_disks
);
2128 SET_FROM_SB(layout
);
2129 SET_FROM_SB(chunk_size
);
2131 if (md_copy_to_user(arg
, &info
, sizeof(info
)))
2138 #define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2139 static int get_disk_info (mddev_t
* mddev
, void * arg
)
2141 mdu_disk_info_t info
;
2147 if (md_copy_from_user(&info
, arg
, sizeof(info
)))
2151 if (nr
>= mddev
->sb
->nr_disks
)
2156 SET_FROM_SB(raid_disk
);
2159 if (md_copy_to_user(arg
, &info
, sizeof(info
)))
2166 #define SET_SB(x) mddev->sb->disks[nr].x = info->x
2168 static int add_new_disk (mddev_t
* mddev
, mdu_disk_info_t
*info
)
2170 int err
, size
, persistent
;
2174 dev
= MKDEV(info
->major
,info
->minor
);
2176 if (find_rdev_all(dev
)) {
2177 printk("device %s already used in a RAID array!\n",
2178 partition_name(dev
));
2182 /* expecting a device which has a superblock */
2183 err
= md_import_device(dev
, 1);
2185 printk("md error, md_import_device returned %d\n", err
);
2188 rdev
= find_rdev_all(dev
);
2193 if (mddev
->nb_dev
) {
2194 mdk_rdev_t
*rdev0
= md_list_entry(mddev
->disks
.next
,
2195 mdk_rdev_t
, same_set
);
2196 if (!uuid_equal(rdev0
, rdev
)) {
2197 printk("md: %s has different UUID to %s\n", partition_name(rdev
->dev
), partition_name(rdev0
->dev
));
2201 if (!sb_equal(rdev0
->sb
, rdev
->sb
)) {
2202 printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev
->dev
), partition_name(rdev0
->dev
));
2207 bind_rdev_to_array(rdev
, mddev
);
2212 if (nr
>= mddev
->sb
->nr_disks
)
2221 if ((info
->state
& (1<<MD_DISK_FAULTY
))==0) {
2222 err
= md_import_device (dev
, 0);
2224 printk("md: error, md_import_device() returned %d\n", err
);
2227 rdev
= find_rdev_all(dev
);
2233 rdev
->old_dev
= dev
;
2234 rdev
->desc_nr
= info
->number
;
2236 bind_rdev_to_array(rdev
, mddev
);
2238 persistent
= !mddev
->sb
->not_persistent
;
2240 printk("nonpersistent superblock ...\n");
2241 if (!mddev
->sb
->chunk_size
)
2242 printk("no chunksize?\n");
2244 size
= calc_dev_size(dev
, mddev
, persistent
);
2245 rdev
->sb_offset
= calc_dev_sboffset(dev
, mddev
, persistent
);
2247 if (!mddev
->sb
->size
|| (mddev
->sb
->size
> size
))
2248 mddev
->sb
->size
= size
;
2252 * sync all other superblocks with the main superblock
2260 static int hot_remove_disk (mddev_t
* mddev
, kdev_t dev
)
2269 printk("trying to remove %s from md%d ... \n",
2270 partition_name(dev
), mdidx(mddev
));
2272 if (!mddev
->pers
->diskop
) {
2273 printk("md%d: personality does not support diskops!\n",
2278 rdev
= find_rdev(mddev
, dev
);
2282 if (rdev
->desc_nr
== -1) {
2286 disk
= &mddev
->sb
->disks
[rdev
->desc_nr
];
2287 if (disk_active(disk
))
2289 if (disk_removed(disk
)) {
2294 err
= mddev
->pers
->diskop(mddev
, &disk
, DISKOP_HOT_REMOVE_DISK
);
2302 remove_descriptor(disk
, mddev
->sb
);
2303 kick_rdev_from_array(rdev
);
2304 mddev
->sb_dirty
= 1;
2305 md_update_sb(mddev
);
2309 printk("cannot remove active disk %s from md%d ... \n",
2310 partition_name(dev
), mdidx(mddev
));
2314 static int hot_add_disk (mddev_t
* mddev
, kdev_t dev
)
2316 int i
, err
, persistent
;
2324 printk("trying to hot-add %s to md%d ... \n",
2325 partition_name(dev
), mdidx(mddev
));
2327 if (!mddev
->pers
->diskop
) {
2328 printk("md%d: personality does not support diskops!\n",
2333 persistent
= !mddev
->sb
->not_persistent
;
2334 size
= calc_dev_size(dev
, mddev
, persistent
);
2336 if (size
< mddev
->sb
->size
) {
2337 printk("md%d: disk size %d blocks < array size %d\n",
2338 mdidx(mddev
), size
, mddev
->sb
->size
);
2342 rdev
= find_rdev(mddev
, dev
);
2346 err
= md_import_device (dev
, 0);
2348 printk("md: error, md_import_device() returned %d\n", err
);
2351 rdev
= find_rdev_all(dev
);
2357 printk("md: can not hot-add faulty %s disk to md%d!\n",
2358 partition_name(dev
), mdidx(mddev
));
2362 bind_rdev_to_array(rdev
, mddev
);
2365 * The rest should better be atomic, we can have disk failures
2366 * noticed in interrupt contexts ...
2368 rdev
->old_dev
= dev
;
2370 rdev
->sb_offset
= calc_dev_sboffset(dev
, mddev
, persistent
);
2372 disk
= mddev
->sb
->disks
+ mddev
->sb
->raid_disks
;
2373 for (i
= mddev
->sb
->raid_disks
; i
< MD_SB_DISKS
; i
++) {
2374 disk
= mddev
->sb
->disks
+ i
;
2376 if (!disk
->major
&& !disk
->minor
)
2378 if (disk_removed(disk
))
2381 if (i
== MD_SB_DISKS
) {
2382 printk("md%d: can not hot-add to full array!\n", mdidx(mddev
));
2384 goto abort_unbind_export
;
2387 if (disk_removed(disk
)) {
2391 if (disk
->number
!= i
) {
2394 goto abort_unbind_export
;
2400 disk
->raid_disk
= disk
->number
;
2401 disk
->major
= MAJOR(dev
);
2402 disk
->minor
= MINOR(dev
);
2404 if (mddev
->pers
->diskop(mddev
, &disk
, DISKOP_HOT_ADD_DISK
)) {
2407 goto abort_unbind_export
;
2410 mark_disk_spare(disk
);
2411 mddev
->sb
->nr_disks
++;
2412 mddev
->sb
->spare_disks
++;
2413 mddev
->sb
->working_disks
++;
2415 mddev
->sb_dirty
= 1;
2417 md_update_sb(mddev
);
2420 * Kick recovery, maybe this spare has to be added to the
2421 * array immediately.
2423 md_recover_arrays();
2427 abort_unbind_export
:
2428 unbind_rdev_from_array(rdev
);
2435 #define SET_SB(x) mddev->sb->x = info->x
2436 static int set_array_info (mddev_t
* mddev
, mdu_array_info_t
*info
)
2439 if (alloc_array_sb(mddev
))
2442 mddev
->sb
->major_version
= MD_MAJOR_VERSION
;
2443 mddev
->sb
->minor_version
= MD_MINOR_VERSION
;
2444 mddev
->sb
->patch_version
= MD_PATCHLEVEL_VERSION
;
2445 mddev
->sb
->ctime
= CURRENT_TIME
;
2452 SET_SB(not_persistent
);
2455 SET_SB(active_disks
);
2456 SET_SB(working_disks
);
2457 SET_SB(failed_disks
);
2458 SET_SB(spare_disks
);
2463 mddev
->sb
->md_magic
= MD_SB_MAGIC
;
2466 * Generate a 128 bit UUID
2468 get_random_bytes(&mddev
->sb
->set_uuid0
, 4);
2469 get_random_bytes(&mddev
->sb
->set_uuid1
, 4);
2470 get_random_bytes(&mddev
->sb
->set_uuid2
, 4);
2471 get_random_bytes(&mddev
->sb
->set_uuid3
, 4);
2477 static int set_disk_info (mddev_t
* mddev
, void * arg
)
2483 static int clear_array (mddev_t
* mddev
)
2489 static int write_raid_info (mddev_t
* mddev
)
2495 static int protect_array (mddev_t
* mddev
)
2501 static int unprotect_array (mddev_t
* mddev
)
2507 static int set_disk_faulty (mddev_t
*mddev
, kdev_t dev
)
2511 fsync_dev(mddev_to_kdev(mddev
));
2512 ret
= md_error(mddev_to_kdev(mddev
), dev
);
2516 static int md_ioctl (struct inode
*inode
, struct file
*file
,
2517 unsigned int cmd
, unsigned long arg
)
2521 struct hd_geometry
*loc
= (struct hd_geometry
*) arg
;
2522 mddev_t
*mddev
= NULL
;
2525 if (!md_capable_admin())
2528 dev
= inode
->i_rdev
;
2530 if (minor
>= MAX_MD_DEVS
)
2534 * Commands dealing with the RAID driver but not any
2540 err
= get_version((void *)arg
);
2543 case PRINT_RAID_DEBUG
:
2548 case BLKGETSIZE
: /* Return device size */
2553 err
= md_put_user(md_hd_struct
[minor
].nr_sects
,
2559 invalidate_buffers(dev
);
2567 read_ahead
[MAJOR(dev
)] = arg
;
2575 err
= md_put_user (read_ahead
[
2576 MAJOR(dev
)], (long *) arg
);
2582 * Commands creating/starting a new array:
2585 mddev
= kdev_to_mddev(dev
);
2589 case SET_ARRAY_INFO
:
2592 printk("array md%d already exists!\n",
2601 case SET_ARRAY_INFO
:
2602 mddev
= alloc_mddev(dev
);
2608 * alloc_mddev() should possibly self-lock.
2610 err
= lock_mddev(mddev
);
2612 printk("ioctl, reason %d, cmd %d\n", err
, cmd
);
2617 printk("array md%d already has a superblock!\n",
2623 mdu_array_info_t info
;
2624 if (md_copy_from_user(&info
, (void*)arg
, sizeof(info
))) {
2628 err
= set_array_info(mddev
, &info
);
2630 printk("couldnt set array info. %d\n", err
);
2638 * possibly make it lock the array ...
2640 err
= autostart_array((kdev_t
)arg
);
2642 printk("autostart %s failed!\n",
2643 partition_name((kdev_t
)arg
));
2652 * Commands querying/configuring an existing array:
2659 err
= lock_mddev(mddev
);
2661 printk("ioctl lock interrupted, reason %d, cmd %d\n",err
, cmd
);
2664 /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2665 if (!mddev
->sb
&& cmd
!= ADD_NEW_DISK
&& cmd
!= STOP_ARRAY
&& cmd
!= RUN_ARRAY
) {
2671 * Commands even a read-only array can execute:
2675 case GET_ARRAY_INFO
:
2676 err
= get_array_info(mddev
, (void *)arg
);
2680 err
= get_disk_info(mddev
, (void *)arg
);
2683 case RESTART_ARRAY_RW
:
2684 err
= restart_array(mddev
);
2688 if (!(err
= do_md_stop (mddev
, 0)))
2693 err
= do_md_stop (mddev
, 1);
2697 * We have a problem here : there is no easy way to give a CHS
2698 * virtual geometry. We currently pretend that we have a 2 heads
2699 * 4 sectors (with a BIG number of cylinders...). This drives
2700 * dosfs just mad... ;-)
2707 err
= md_put_user (2, (char *) &loc
->heads
);
2710 err
= md_put_user (4, (char *) &loc
->sectors
);
2713 err
= md_put_user (md_hd_struct
[mdidx(mddev
)].nr_sects
/8,
2714 (short *) &loc
->cylinders
);
2717 err
= md_put_user (md_hd_struct
[minor
].start_sect
,
2718 (long *) &loc
->start
);
2723 * The remaining ioctls are changing the state of the
2724 * superblock, so we do not allow read-only arrays
2735 err
= clear_array(mddev
);
2740 mdu_disk_info_t info
;
2741 if (md_copy_from_user(&info
, (void*)arg
, sizeof(info
)))
2744 err
= add_new_disk(mddev
, &info
);
2747 case HOT_REMOVE_DISK
:
2748 err
= hot_remove_disk(mddev
, (kdev_t
)arg
);
2752 err
= hot_add_disk(mddev
, (kdev_t
)arg
);
2756 err
= set_disk_info(mddev
, (void *)arg
);
2759 case WRITE_RAID_INFO
:
2760 err
= write_raid_info(mddev
);
2763 case UNPROTECT_ARRAY
:
2764 err
= unprotect_array(mddev
);
2768 err
= protect_array(mddev
);
2771 case SET_DISK_FAULTY
:
2772 err
= set_disk_faulty(mddev
, (kdev_t
)arg
);
2777 /* The data is never used....
2779 err = md_copy_from_user(¶m, (mdu_param_t *)arg,
2784 err
= do_md_run (mddev
);
2786 * we have to clean up the mess if
2787 * the array cannot be run for some
2791 mddev
->sb_dirty
= 0;
2792 if (!do_md_stop (mddev
, 0))
2799 printk(KERN_WARNING
"%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current
->comm
, current
->pid
);
2807 unlock_mddev(mddev
);
2817 static int md_open (struct inode
*inode
, struct file
*file
)
2825 static struct block_device_operations md_fops
=
2832 int md_thread(void * arg
)
2834 mdk_thread_t
*thread
= arg
;
2838 exit_files(current
);
2845 sprintf(current
->comm
, thread
->name
);
2848 thread
->tsk
= current
;
2851 * md_thread is a 'system-thread', it's priority should be very
2852 * high. We avoid resource deadlocks individually in each
2853 * raid personality. (RAID5 does preallocation) We also use RR and
2854 * the very same RT priority as kswapd, thus we will never get
2855 * into a priority inversion deadlock.
2857 * we definitely have to have equal or higher priority than
2858 * bdflush, otherwise bdflush will deadlock if there are too
2859 * many dirty RAID5 blocks.
2861 current
->policy
= SCHED_OTHER
;
2862 current
->nice
= -20;
2863 // md_unlock_kernel();
2868 DECLARE_WAITQUEUE(wait
, current
);
2870 add_wait_queue(&thread
->wqueue
, &wait
);
2871 set_task_state(current
, TASK_INTERRUPTIBLE
);
2872 if (!test_bit(THREAD_WAKEUP
, &thread
->flags
)) {
2873 dprintk("thread %p went to sleep.\n", thread
);
2875 dprintk("thread %p woke up.\n", thread
);
2877 current
->state
= TASK_RUNNING
;
2878 remove_wait_queue(&thread
->wqueue
, &wait
);
2879 clear_bit(THREAD_WAKEUP
, &thread
->flags
);
2882 thread
->run(thread
->data
);
2883 run_task_queue(&tq_disk
);
2886 if (md_signal_pending(current
)) {
2887 printk("%8s(%d) flushing signals.\n", current
->comm
,
2896 void md_wakeup_thread(mdk_thread_t
*thread
)
2898 dprintk("waking up MD thread %p.\n", thread
);
2899 set_bit(THREAD_WAKEUP
, &thread
->flags
);
2900 wake_up(&thread
->wqueue
);
2903 mdk_thread_t
*md_register_thread (void (*run
) (void *),
2904 void *data
, const char *name
)
2906 mdk_thread_t
*thread
;
2908 DECLARE_MUTEX_LOCKED(sem
);
2910 thread
= (mdk_thread_t
*) kmalloc
2911 (sizeof(mdk_thread_t
), GFP_KERNEL
);
2915 memset(thread
, 0, sizeof(mdk_thread_t
));
2916 md_init_waitqueue_head(&thread
->wqueue
);
2920 thread
->data
= data
;
2921 thread
->name
= name
;
2922 ret
= kernel_thread(md_thread
, thread
, 0);
2931 void md_interrupt_thread (mdk_thread_t
*thread
)
2937 printk("interrupting MD-thread pid %d\n", thread
->tsk
->pid
);
2938 send_sig(SIGKILL
, thread
->tsk
, 1);
2941 void md_unregister_thread (mdk_thread_t
*thread
)
2943 DECLARE_MUTEX_LOCKED(sem
);
2947 thread
->name
= NULL
;
2952 md_interrupt_thread(thread
);
2956 void md_recover_arrays (void)
2958 if (!md_recovery_thread
) {
2962 md_wakeup_thread(md_recovery_thread
);
2966 int md_error (kdev_t dev
, kdev_t rdev
)
2972 mddev
= kdev_to_mddev(dev
);
2973 /* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
2979 rrdev
= find_rdev(mddev
, rdev
);
2980 mark_rdev_faulty(rrdev
);
2982 * if recovery was running, stop it now.
2984 if (mddev
->pers
->stop_resync
)
2985 mddev
->pers
->stop_resync(mddev
);
2986 if (mddev
->recovery_running
)
2987 md_interrupt_thread(md_recovery_thread
);
2988 if (mddev
->pers
->error_handler
) {
2989 rc
= mddev
->pers
->error_handler(mddev
, rdev
);
2990 md_recover_arrays();
2996 static int status_unused (char * page
)
3000 struct md_list_head
*tmp
;
3002 sz
+= sprintf(page
+ sz
, "unused devices: ");
3004 ITERATE_RDEV_ALL(rdev
,tmp
) {
3005 if (!rdev
->same_set
.next
&& !rdev
->same_set
.prev
) {
3007 * The device is not yet used by any array.
3010 sz
+= sprintf(page
+ sz
, "%s ",
3011 partition_name(rdev
->dev
));
3015 sz
+= sprintf(page
+ sz
, "<none>");
3017 sz
+= sprintf(page
+ sz
, "\n");
3022 static int status_resync (char * page
, mddev_t
* mddev
)
3025 unsigned long max_blocks
, resync
, res
, dt
, db
, rt
;
3027 resync
= mddev
->curr_resync
- atomic_read(&mddev
->recovery_active
);
3028 max_blocks
= mddev
->sb
->size
;
3031 * Should not happen.
3037 res
= (resync
/1024)*1000/(max_blocks
/1024 + 1);
3039 int i
, x
= res
/50, y
= 20-x
;
3040 sz
+= sprintf(page
+ sz
, "[");
3041 for (i
= 0; i
< x
; i
++)
3042 sz
+= sprintf(page
+ sz
, "=");
3043 sz
+= sprintf(page
+ sz
, ">");
3044 for (i
= 0; i
< y
; i
++)
3045 sz
+= sprintf(page
+ sz
, ".");
3046 sz
+= sprintf(page
+ sz
, "] ");
3048 if (!mddev
->recovery_running
)
3052 sz
+= sprintf(page
+ sz
, " resync =%3lu.%lu%% (%lu/%lu)",
3053 res
/10, res
% 10, resync
, max_blocks
);
3058 sz
+= sprintf(page
+ sz
, " recovery =%3lu.%lu%% (%lu/%lu)",
3059 res
/10, res
% 10, resync
, max_blocks
);
3062 * We do not want to overflow, so the order of operands and
3063 * the * 100 / 100 trick are important. We do a +1 to be
3064 * safe against division by zero. We only estimate anyway.
3066 * dt: time from mark until now
3067 * db: blocks written from mark until now
3068 * rt: remaining time
3070 dt
= ((jiffies
- mddev
->resync_mark
) / HZ
);
3072 db
= resync
- mddev
->resync_mark_cnt
;
3073 rt
= (dt
* ((max_blocks
-resync
) / (db
/100+1)))/100;
3075 sz
+= sprintf(page
+ sz
, " finish=%lu.%lumin", rt
/ 60, (rt
% 60)/6);
3077 sz
+= sprintf(page
+ sz
, " speed=%ldK/sec", db
/dt
);
3082 static int md_status_read_proc(char *page
, char **start
, off_t off
,
3083 int count
, int *eof
, void *data
)
3085 int sz
= 0, j
, size
;
3086 struct md_list_head
*tmp
, *tmp2
;
3090 sz
+= sprintf(page
+ sz
, "Personalities : ");
3091 for (j
= 0; j
< MAX_PERSONALITY
; j
++)
3093 sz
+= sprintf(page
+sz
, "[%s] ", pers
[j
]->name
);
3095 sz
+= sprintf(page
+sz
, "\n");
3098 sz
+= sprintf(page
+sz
, "read_ahead ");
3099 if (read_ahead
[MD_MAJOR
] == INT_MAX
)
3100 sz
+= sprintf(page
+sz
, "not set\n");
3102 sz
+= sprintf(page
+sz
, "%d sectors\n", read_ahead
[MD_MAJOR
]);
3104 ITERATE_MDDEV(mddev
,tmp
) {
3105 sz
+= sprintf(page
+ sz
, "md%d : %sactive", mdidx(mddev
),
3106 mddev
->pers
? "" : "in");
3109 sz
+= sprintf(page
+ sz
, " (read-only)");
3110 sz
+= sprintf(page
+ sz
, " %s", mddev
->pers
->name
);
3114 ITERATE_RDEV(mddev
,rdev
,tmp2
) {
3115 sz
+= sprintf(page
+ sz
, " %s[%d]",
3116 partition_name(rdev
->dev
), rdev
->desc_nr
);
3118 sz
+= sprintf(page
+ sz
, "(F)");
3124 if (mddev
->nb_dev
) {
3126 sz
+= sprintf(page
+ sz
, "\n %d blocks",
3127 md_size
[mdidx(mddev
)]);
3129 sz
+= sprintf(page
+ sz
, "\n %d blocks", size
);
3133 sz
+= sprintf(page
+sz
, "\n");
3137 sz
+= mddev
->pers
->status (page
+sz
, mddev
);
3139 sz
+= sprintf(page
+sz
, "\n ");
3140 if (mddev
->curr_resync
) {
3141 sz
+= status_resync (page
+sz
, mddev
);
3143 if (md_atomic_read(&mddev
->resync_sem
.count
) != 1)
3144 sz
+= sprintf(page
+ sz
, " resync=DELAYED");
3146 sz
+= sprintf(page
+ sz
, "\n");
3148 sz
+= status_unused (page
+ sz
);
3153 int register_md_personality (int pnum
, mdk_personality_t
*p
)
3155 if (pnum
>= MAX_PERSONALITY
)
3162 printk(KERN_INFO
"%s personality registered\n", p
->name
);
3166 int unregister_md_personality (int pnum
)
3168 if (pnum
>= MAX_PERSONALITY
)
3171 printk(KERN_INFO
"%s personality unregistered\n", pers
[pnum
]->name
);
3176 static mdp_disk_t
*get_spare(mddev_t
*mddev
)
3178 mdp_super_t
*sb
= mddev
->sb
;
3181 struct md_list_head
*tmp
;
3183 ITERATE_RDEV(mddev
,rdev
,tmp
) {
3190 disk
= &sb
->disks
[rdev
->desc_nr
];
3191 if (disk_faulty(disk
)) {
3195 if (disk_active(disk
))
3202 static unsigned int sync_io
[DK_MAX_MAJOR
][DK_MAX_DISK
];
3203 void md_sync_acct(kdev_t dev
, unsigned long nr_sectors
)
3205 unsigned int major
= MAJOR(dev
);
3208 index
= disk_index(dev
);
3209 if ((index
>= DK_MAX_DISK
) || (major
>= DK_MAX_MAJOR
))
3212 sync_io
[major
][index
] += nr_sectors
;
3215 static int is_mddev_idle (mddev_t
*mddev
)
3218 struct md_list_head
*tmp
;
3220 unsigned long curr_events
;
3223 ITERATE_RDEV(mddev
,rdev
,tmp
) {
3224 int major
= MAJOR(rdev
->dev
);
3225 int idx
= disk_index(rdev
->dev
);
3227 if ((idx
>= DK_MAX_DISK
) || (major
>= DK_MAX_MAJOR
))
3230 curr_events
= kstat
.dk_drive_rblk
[major
][idx
] +
3231 kstat
.dk_drive_wblk
[major
][idx
] ;
3232 curr_events
-= sync_io
[major
][idx
];
3233 // printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3234 if (curr_events
!= rdev
->last_events
) {
3235 // printk("!I(%ld)", curr_events - rdev->last_events);
3236 rdev
->last_events
= curr_events
;
3243 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait
);
3245 void md_done_sync(mddev_t
*mddev
, int blocks
, int ok
)
3247 /* another "blocks" (1K) blocks have been synced */
3248 atomic_sub(blocks
, &mddev
->recovery_active
);
3249 wake_up(&mddev
->recovery_wait
);
3251 // stop recovery, signal do_sync ....
3255 #define SYNC_MARKS 10
3256 #define SYNC_MARK_STEP (3*HZ)
3257 int md_do_sync(mddev_t
*mddev
, mdp_disk_t
*spare
)
3260 unsigned int max_blocks
, currspeed
,
3261 j
, window
, err
, serialize
;
3262 kdev_t read_disk
= mddev_to_kdev(mddev
);
3263 unsigned long mark
[SYNC_MARKS
];
3264 unsigned long mark_cnt
[SYNC_MARKS
];
3266 struct md_list_head
*tmp
;
3267 unsigned long last_check
;
3270 err
= down_interruptible(&mddev
->resync_sem
);
3276 ITERATE_MDDEV(mddev2
,tmp
) {
3277 if (mddev2
== mddev
)
3279 if (mddev2
->curr_resync
&& match_mddev_units(mddev
,mddev2
)) {
3280 printk(KERN_INFO
"md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev
), mdidx(mddev2
));
3286 interruptible_sleep_on(&resync_wait
);
3287 if (md_signal_pending(current
)) {
3295 mddev
->curr_resync
= 1;
3297 max_blocks
= mddev
->sb
->size
;
3299 printk(KERN_INFO
"md: syncing RAID array md%d\n", mdidx(mddev
));
3300 printk(KERN_INFO
"md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3301 sysctl_speed_limit_min
);
3302 printk(KERN_INFO
"md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max
);
3305 * Resync has low priority.
3309 is_mddev_idle(mddev
); /* this also initializes IO event counters */
3310 for (m
= 0; m
< SYNC_MARKS
; m
++) {
3315 mddev
->resync_mark
= mark
[last_mark
];
3316 mddev
->resync_mark_cnt
= mark_cnt
[last_mark
];
3319 * Tune reconstruction:
3321 window
= MAX_READAHEAD
*(PAGE_SIZE
/1024);
3322 printk(KERN_INFO
"md: using %dk window, over a total of %d blocks.\n",window
,max_blocks
);
3324 atomic_set(&mddev
->recovery_active
, 0);
3325 init_waitqueue_head(&mddev
->recovery_wait
);
3327 for (j
= 0; j
< max_blocks
;) {
3330 blocks
= mddev
->pers
->sync_request(mddev
, j
);
3336 atomic_add(blocks
, &mddev
->recovery_active
);
3338 mddev
->curr_resync
= j
;
3340 if (last_check
+ window
> j
)
3343 run_task_queue(&tq_disk
); //??
3345 if (jiffies
>= mark
[last_mark
] + SYNC_MARK_STEP
) {
3347 int next
= (last_mark
+1) % SYNC_MARKS
;
3349 mddev
->resync_mark
= mark
[next
];
3350 mddev
->resync_mark_cnt
= mark_cnt
[next
];
3351 mark
[next
] = jiffies
;
3352 mark_cnt
[next
] = j
- atomic_read(&mddev
->recovery_active
);
3357 if (md_signal_pending(current
)) {
3359 * got a signal, exit.
3361 mddev
->curr_resync
= 0;
3362 printk("md_do_sync() got signal ... exiting\n");
3369 * this loop exits only if either when we are slower than
3370 * the 'hard' speed limit, or the system was IO-idle for
3372 * the system might be non-idle CPU-wise, but we only care
3373 * about not overloading the IO subsystem. (things like an
3374 * e2fsck being done on the RAID array should execute fast)
3377 if (md_need_resched(current
))
3380 currspeed
= (j
-mddev
->resync_mark_cnt
)/((jiffies
-mddev
->resync_mark
)/HZ
+1) +1;
3382 if (currspeed
> sysctl_speed_limit_min
) {
3385 if ((currspeed
> sysctl_speed_limit_max
) ||
3386 !is_mddev_idle(mddev
)) {
3387 current
->state
= TASK_INTERRUPTIBLE
;
3388 md_schedule_timeout(HZ
/4);
3389 if (!md_signal_pending(current
))
3393 current
->nice
= -20;
3395 fsync_dev(read_disk
);
3396 printk(KERN_INFO
"md: md%d: sync done.\n",mdidx(mddev
));
3399 * this also signals 'finished resyncing' to md_stop
3402 wait_event(mddev
->recovery_wait
, atomic_read(&mddev
->recovery_active
)==0);
3403 up(&mddev
->resync_sem
);
3405 mddev
->curr_resync
= 0;
3406 wake_up(&resync_wait
);
3412 * This is a kernel thread which syncs a spare disk with the active array
3414 * the amount of foolproofing might seem to be a tad excessive, but an
3415 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3416 * of my root partition with the first 0.5 gigs of my /home partition ... so
3417 * i'm a bit nervous ;)
3419 void md_do_recovery (void *data
)
3425 struct md_list_head
*tmp
;
3427 printk(KERN_INFO
"md: recovery thread got woken up ...\n");
3429 ITERATE_MDDEV(mddev
,tmp
) {
3433 if (mddev
->recovery_running
)
3435 if (sb
->active_disks
== sb
->raid_disks
)
3437 if (!sb
->spare_disks
) {
3438 printk(KERN_ERR
"md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev
));
3442 * now here we get the spare and resync it.
3444 if ((spare
= get_spare(mddev
)) == NULL
)
3446 printk(KERN_INFO
"md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev
), partition_name(MKDEV(spare
->major
,spare
->minor
)));
3447 if (!mddev
->pers
->diskop
)
3449 if (mddev
->pers
->diskop(mddev
, &spare
, DISKOP_SPARE_WRITE
))
3451 down(&mddev
->recovery_sem
);
3452 mddev
->recovery_running
= 1;
3453 err
= md_do_sync(mddev
, spare
);
3455 printk(KERN_INFO
"md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev
), partition_name(MKDEV(spare
->major
,spare
->minor
)));
3456 if (!disk_faulty(spare
)) {
3457 mddev
->pers
->diskop(mddev
,&spare
,DISKOP_SPARE_INACTIVE
);
3458 mark_disk_faulty(spare
);
3459 mark_disk_nonsync(spare
);
3460 mark_disk_inactive(spare
);
3462 sb
->working_disks
--;
3466 if (disk_faulty(spare
))
3467 mddev
->pers
->diskop(mddev
, &spare
,
3468 DISKOP_SPARE_INACTIVE
);
3469 if (err
== -EINTR
|| err
== -ENOMEM
) {
3471 * Recovery got interrupted, or ran out of mem ...
3472 * signal back that we have finished using the array.
3474 mddev
->pers
->diskop(mddev
, &spare
,
3475 DISKOP_SPARE_INACTIVE
);
3476 up(&mddev
->recovery_sem
);
3477 mddev
->recovery_running
= 0;
3480 mddev
->recovery_running
= 0;
3481 up(&mddev
->recovery_sem
);
3483 if (!disk_faulty(spare
)) {
3485 * the SPARE_ACTIVE diskop possibly changes the
3488 mddev
->pers
->diskop(mddev
, &spare
, DISKOP_SPARE_ACTIVE
);
3489 mark_disk_sync(spare
);
3490 mark_disk_active(spare
);
3494 mddev
->sb_dirty
= 1;
3495 md_update_sb(mddev
);
3498 printk(KERN_INFO
"md: recovery thread finished ...\n");
3502 int md_notify_reboot(struct notifier_block
*this,
3503 unsigned long code
, void *x
)
3505 struct md_list_head
*tmp
;
3508 if ((code
== MD_SYS_DOWN
) || (code
== MD_SYS_HALT
)
3509 || (code
== MD_SYS_POWER_OFF
)) {
3511 printk(KERN_INFO
"stopping all md devices.\n");
3513 ITERATE_MDDEV(mddev
,tmp
)
3514 do_md_stop (mddev
, 1);
3516 * certain more exotic SCSI devices are known to be
3517 * volatile wrt too early system reboots. While the
3518 * right place to handle this issue is the given
3519 * driver, we do want to have a safe RAID driver ...
3526 struct notifier_block md_notifier
= {
3532 static int md__init
raid_setup(char *str
)
3536 len
= strlen(str
) + 1;
3540 char *comma
= strchr(str
+pos
, ',');
3543 wlen
= (comma
-str
)-pos
;
3544 else wlen
= (len
-1)-pos
;
3546 if (strncmp(str
, "noautodetect", wlen
) == 0)
3547 raid_setup_args
.noautodetect
= 1;
3550 raid_setup_args
.set
= 1;
3553 __setup("raid=", raid_setup
);
3555 static void md_geninit (void)
3559 for(i
= 0; i
< MAX_MD_DEVS
; i
++) {
3560 md_blocksizes
[i
] = 1024;
3562 md_hardsect_sizes
[i
] = 512;
3563 md_maxreadahead
[i
] = MD_READAHEAD
;
3564 register_disk(&md_gendisk
, MKDEV(MAJOR_NR
,i
), 1, &md_fops
, 0);
3566 blksize_size
[MAJOR_NR
] = md_blocksizes
;
3567 blk_size
[MAJOR_NR
] = md_size
;
3568 max_readahead
[MAJOR_NR
] = md_maxreadahead
;
3569 hardsect_size
[MAJOR_NR
] = md_hardsect_sizes
;
3571 printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t
));
3573 #ifdef CONFIG_PROC_FS
3574 create_proc_read_entry("mdstat", 0, NULL
, md_status_read_proc
, NULL
);
/* Forward declarations of the per-personality init hooks (defined in
 * the respective personality modules, called from md_init below). */
void hsm_init (void);
void translucent_init (void);
void linear_init (void);
void raid0_init (void);
void raid1_init (void);
void raid5_init (void);
3584 int md__init
md_init (void)
3586 static char * name
= "mdrecoveryd";
3588 printk (KERN_INFO
"md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
3589 MD_MAJOR_VERSION
, MD_MINOR_VERSION
,
3590 MD_PATCHLEVEL_VERSION
, MAX_MD_DEVS
, MAX_REAL
);
3592 if (devfs_register_blkdev (MAJOR_NR
, "md", &md_fops
))
3594 printk (KERN_ALERT
"Unable to get major %d for md\n", MAJOR_NR
);
3597 devfs_handle
= devfs_mk_dir (NULL
, "md", NULL
);
3598 devfs_register_series (devfs_handle
, "%u",MAX_MD_DEVS
,DEVFS_FL_DEFAULT
,
3599 MAJOR_NR
, 0, S_IFBLK
| S_IRUSR
| S_IWUSR
,
3602 /* forward all md request to md_make_request */
3603 blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR
), md_make_request
);
3606 read_ahead
[MAJOR_NR
] = INT_MAX
;
3607 md_gendisk
.next
= gendisk_head
;
3609 gendisk_head
= &md_gendisk
;
3611 md_recovery_thread
= md_register_thread(md_do_recovery
, NULL
, name
);
3612 if (!md_recovery_thread
)
3613 printk(KERN_ALERT
"bug: couldn't allocate md_recovery_thread\n");
3615 md_register_reboot_notifier(&md_notifier
);
3616 raid_table_header
= register_sysctl_table(raid_root_table
, 1);
3618 #ifdef CONFIG_MD_LINEAR
3621 #ifdef CONFIG_MD_RAID0
3624 #ifdef CONFIG_MD_RAID1
3627 #ifdef CONFIG_MD_RAID5
3634 #ifdef CONFIG_MD_BOOT
3635 #define MAX_MD_BOOT_DEVS 8
3638 int pers
[MAX_MD_BOOT_DEVS
];
3639 int chunk
[MAX_MD_BOOT_DEVS
];
3640 kdev_t devices
[MAX_MD_BOOT_DEVS
][MAX_REAL
];
3641 } md_setup_args md__initdata
;
/*
 * Parse the command-line parameters given our kernel, but do not
 * actually try to invoke the MD device now; that is handled by
 * md_setup_drive after the low-level disk drivers have initialised.
 *
 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
 *             assigns the task of parsing integer arguments to the
 *             invoked program now).  Added ability to initialise all
 *             the MD devices (by specifying multiple "md=" lines)
 *             instead of just one.  -- KTK
 * 18May2000:  Added support for persistent-superblock arrays:
 *             md=n,0,factor,fault,device-list   uses RAID0 for device n
 *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
 *             md=n,device-list reads a RAID superblock from the devices
 *             elements in device-list are read by name_to_kdev_t so can be
 *             a hex number or something like /dev/hda1 /dev/sdb
 */
3660 extern kdev_t
name_to_kdev_t(char *line
) md__init
;
3661 static int md__init
md_setup(char *str
)
3663 int minor
, level
, factor
, fault
, i
=0;
3665 char *devnames
, *pername
= "";
3667 if(get_option(&str
, &minor
) != 2) { /* MD Number */
3668 printk("md: Too few arguments supplied to md=.\n");
3671 if (minor
>= MAX_MD_BOOT_DEVS
) {
3672 printk ("md: Minor device number too high.\n");
3674 } else if (md_setup_args
.set
& (1 << minor
)) {
3675 printk ("md: Warning - md=%d,... has been specified twice;\n"
3676 " will discard the first definition.\n", minor
);
3678 switch(get_option(&str
, &level
)) { /* RAID Personality */
3679 case 2: /* could be 0 or -1.. */
3680 if (level
== 0 || level
== -1) {
3681 if (get_option(&str
, &factor
) != 2 || /* Chunk Size */
3682 get_option(&str
, &fault
) != 2) {
3683 printk("md: Too few arguments supplied to md=.\n");
3686 md_setup_args
.pers
[minor
] = level
;
3687 md_setup_args
.chunk
[minor
] = 1 << (factor
+12);
3698 printk ("md: The kernel has not been configured for raid%d"
3699 " support!\n", level
);
3702 md_setup_args
.pers
[minor
] = level
;
3706 case 1: /* the first device is numeric */
3707 md_setup_args
.devices
[minor
][i
++] = level
;
3710 md_setup_args
.pers
[minor
] = 0;
3711 pername
="super-block";
3714 for (; i
<MAX_REAL
&& str
; i
++) {
3715 if ((device
= name_to_kdev_t(str
))) {
3716 md_setup_args
.devices
[minor
][i
] = device
;
3718 printk ("md: Unknown device name, %s.\n", str
);
3721 if ((str
= strchr(str
, ',')) != NULL
)
3725 printk ("md: No devices specified for md%d?\n", minor
);
3729 printk ("md: Will configure md%d (%s) from %s, below.\n",
3730 minor
, pername
, devnames
);
3731 md_setup_args
.devices
[minor
][i
] = (kdev_t
) 0;
3732 md_setup_args
.set
|= (1 << minor
);
3736 void md__init
md_setup_drive(void)
3742 for (minor
= 0; minor
< MAX_MD_BOOT_DEVS
; minor
++) {
3743 mdu_disk_info_t dinfo
;
3745 if (!(md_setup_args
.set
& (1 << minor
)))
3747 printk("md: Loading md%d.\n", minor
);
3748 mddev
= alloc_mddev(MKDEV(MD_MAJOR
,minor
));
3749 if (md_setup_args
.pers
[minor
]) {
3750 /* non-persistent */
3751 mdu_array_info_t ainfo
;
3752 ainfo
.level
= pers_to_level(md_setup_args
.pers
[minor
]);
3755 ainfo
.raid_disks
=0;
3756 ainfo
.md_minor
=minor
;
3757 ainfo
.not_persistent
= 1;
3759 ainfo
.state
= MD_SB_CLEAN
;
3760 ainfo
.active_disks
= 0;
3761 ainfo
.working_disks
= 0;
3762 ainfo
.failed_disks
= 0;
3763 ainfo
.spare_disks
= 0;
3765 ainfo
.chunk_size
= md_setup_args
.chunk
[minor
];
3766 err
= set_array_info(mddev
, &ainfo
);
3767 for (i
=0; !err
&& (dev
= md_setup_args
.devices
[minor
][i
]); i
++) {
3769 dinfo
.raid_disk
= i
;
3770 dinfo
.state
= (1<<MD_DISK_ACTIVE
)|(1<<MD_DISK_SYNC
);
3771 dinfo
.major
= MAJOR(dev
);
3772 dinfo
.minor
= MINOR(dev
);
3773 mddev
->sb
->nr_disks
++;
3774 mddev
->sb
->raid_disks
++;
3775 mddev
->sb
->active_disks
++;
3776 mddev
->sb
->working_disks
++;
3777 err
= add_new_disk (mddev
, &dinfo
);
3781 for (i
= 0; (dev
= md_setup_args
.devices
[minor
][i
]); i
++) {
3782 dinfo
.major
= MAJOR(dev
);
3783 dinfo
.minor
= MINOR(dev
);
3784 add_new_disk (mddev
, &dinfo
);
3788 err
= do_md_run(mddev
);
3790 mddev
->sb_dirty
= 0;
3791 do_md_stop(mddev
, 0);
3792 printk("md: starting md%d failed\n", minor
);
3797 __setup("md=", md_setup
);
/* Module entry point: just defer to the common initialisation path. */
int init_module (void)
{
	return md_init();
}
3806 static void free_device_names(void)
3808 while (device_names
.next
!= &device_names
) {
3809 struct list_head
*tmp
= device_names
.next
;
3816 void cleanup_module (void)
3818 struct gendisk
**gendisk_ptr
;
3820 md_unregister_thread(md_recovery_thread
);
3821 devfs_unregister(devfs_handle
);
3823 devfs_unregister_blkdev(MAJOR_NR
,"md");
3824 unregister_reboot_notifier(&md_notifier
);
3825 unregister_sysctl_table(raid_table_header
);
3826 #ifdef CONFIG_PROC_FS
3827 remove_proc_entry("mdstat", NULL
);
3830 gendisk_ptr
= &gendisk_head
;
3831 while (*gendisk_ptr
) {
3832 if (*gendisk_ptr
== &md_gendisk
) {
3833 *gendisk_ptr
= md_gendisk
.next
;
3836 gendisk_ptr
= & (*gendisk_ptr
)->next
;
3838 blk_dev
[MAJOR_NR
].queue
= NULL
;
3839 blksize_size
[MAJOR_NR
] = NULL
;
3840 blk_size
[MAJOR_NR
] = NULL
;
3841 max_readahead
[MAJOR_NR
] = NULL
;
3842 hardsect_size
[MAJOR_NR
] = NULL
;
3844 free_device_names();
/* Symbols exported for the RAID personality modules. */
MD_EXPORT_SYMBOL(md_size);
MD_EXPORT_SYMBOL(register_md_personality);
MD_EXPORT_SYMBOL(unregister_md_personality);
MD_EXPORT_SYMBOL(partition_name);
MD_EXPORT_SYMBOL(md_error);
MD_EXPORT_SYMBOL(md_do_sync);
MD_EXPORT_SYMBOL(md_sync_acct);
MD_EXPORT_SYMBOL(md_done_sync);
MD_EXPORT_SYMBOL(md_recover_arrays);
MD_EXPORT_SYMBOL(md_register_thread);
MD_EXPORT_SYMBOL(md_unregister_thread);
MD_EXPORT_SYMBOL(md_update_sb);
MD_EXPORT_SYMBOL(md_wakeup_thread);
MD_EXPORT_SYMBOL(md_print_devices);
MD_EXPORT_SYMBOL(find_rdev_nr);
MD_EXPORT_SYMBOL(md_interrupt_thread);
MD_EXPORT_SYMBOL(mddev_map);
MD_EXPORT_SYMBOL(md_check_ordering);
);