/* [davej-history.git] drivers/block/md.c (pre5) */
/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/raid/md.h>
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#include <asm/unaligned.h>

extern asmlinkage int sys_sched_yield(void);
extern asmlinkage long sys_setsid(void);

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>

#define DEBUG 0
#if DEBUG
# define dprintk(x...) printk(x)
#else
# define dprintk(x...) do { } while(0)
#endif
static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 100 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */
static int sysctl_speed_limit_min = 100;
static int sysctl_speed_limit_max = 100000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
	{DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
	{0}
};

static ctl_table raid_dir_table[] = {
	{DEV_RAID, "raid", NULL, 0, 0555, raid_table},
	{0}
};

static ctl_table raid_root_table[] = {
	{CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
	{0}
};
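
/*
 * Editorial note (not in the original source): the three nested tables
 * above register as dev -> raid -> {speed_limit_min,speed_limit_max},
 * so once raid_table_header is registered the knobs are tunable from
 * userspace, e.g.:
 *
 *	echo  5000 > /proc/sys/dev/raid/speed_limit_min
 *	echo 50000 > /proc/sys/dev/raid/speed_limit_max
 */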
/*
 * these have to be allocated separately because external
 * subsystems want to have a pre-defined structure
 */
struct hd_struct md_hd_struct[MAX_MD_DEVS];
static int md_blocksizes[MAX_MD_DEVS];
static int md_hardsect_sizes[MAX_MD_DEVS];
static int md_maxreadahead[MAX_MD_DEVS];
static mdk_thread_t *md_recovery_thread = NULL;

int md_size[MAX_MD_DEVS] = {0, };

extern struct block_device_operations md_fops;
static devfs_handle_t devfs_handle = NULL;

static struct gendisk md_gendisk=
{
	major: MD_MAJOR,
	major_name: "md",
	minor_shift: 0,
	max_p: 1,
	part: md_hd_struct,
	sizes: md_size,
	nr_real: MAX_MD_DEVS,
	real_devices: NULL,
	next: NULL,
	fops: &md_fops,
};

/*
 * Enables iteration over all existing md arrays
 */
static MD_LIST_HEAD(all_mddevs);

/*
 * The mapping between kdev and mddev is not necessarily a simple
 * one! Eg. HSM uses several sub-devices to implement Logical
 * Volumes. All these sub-devices map to the same mddev.
 */
dev_mapping_t mddev_map[MAX_MD_DEVS] = { {NULL, 0}, };
void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
{
	unsigned int minor = MINOR(dev);

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return;
	}
	if (mddev_map[minor].mddev != NULL) {
		MD_BUG();
		return;
	}
	mddev_map[minor].mddev = mddev;
	mddev_map[minor].data = data;
}

void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
{
	unsigned int minor = MINOR(dev);

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return;
	}
	if (mddev_map[minor].mddev != mddev) {
		MD_BUG();
		return;
	}
	mddev_map[minor].mddev = NULL;
	mddev_map[minor].data = NULL;
}

static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
{
	mddev_t *mddev = kdev_to_mddev(bh->b_rdev);

	if (mddev && mddev->pers)
		return mddev->pers->make_request(mddev, rw, bh);
	else {
		buffer_IO_error(bh);
		return -1;
	}
}
static mddev_t * alloc_mddev (kdev_t dev)
{
	mddev_t *mddev;

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return 0;
	}
	mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
	if (!mddev)
		return NULL;

	memset(mddev, 0, sizeof(*mddev));

	mddev->__minor = MINOR(dev);
	init_MUTEX(&mddev->reconfig_sem);
	init_MUTEX(&mddev->recovery_sem);
	init_MUTEX(&mddev->resync_sem);
	MD_INIT_LIST_HEAD(&mddev->disks);
	MD_INIT_LIST_HEAD(&mddev->all_mddevs);

	/*
	 * The 'base' mddev is the one with data NULL.
	 * personalities can create additional mddevs
	 * if necessary.
	 */
	add_mddev_mapping(mddev, dev, 0);
	md_list_add(&mddev->all_mddevs, &all_mddevs);

	MOD_INC_USE_COUNT;

	return mddev;
}
struct gendisk * find_gendisk (kdev_t dev)
{
	struct gendisk *tmp = gendisk_head;

	while (tmp != NULL) {
		if (tmp->major == MAJOR(dev))
			return (tmp);
		tmp = tmp->next;
	}
	return (NULL);
}

mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == nr)
			return rdev;
	}
	return NULL;
}

mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->dev == dev)
			return rdev;
	}
	return NULL;
}
static MD_LIST_HEAD(device_names);

char * partition_name (kdev_t dev)
{
	struct gendisk *hd;
	static char nomem [] = "<nomem>";
	dev_name_t *dname;
	struct md_list_head *tmp = device_names.next;

	while (tmp != &device_names) {
		dname = md_list_entry(tmp, dev_name_t, list);
		if (dname->dev == dev)
			return dname->name;
		tmp = tmp->next;
	}

	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);

	if (!dname)
		return nomem;
	/*
	 * ok, add this new device name to the list
	 */
	hd = find_gendisk (dev);
	dname->name = NULL;
	if (hd)
		dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
	if (!dname->name) {
		sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
		dname->name = dname->namebuf;
	}

	dname->dev = dev;
	MD_INIT_LIST_HEAD(&dname->list);
	md_list_add(&dname->list, &device_names);

	return dname->name;
}
static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
						int persistent)
{
	unsigned int size = 0;

	if (blk_size[MAJOR(dev)])
		size = blk_size[MAJOR(dev)][MINOR(dev)];
	if (persistent)
		size = MD_NEW_SIZE_BLOCKS(size);
	return size;
}

static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
{
	unsigned int size;

	size = calc_dev_sboffset(dev, mddev, persistent);
	if (!mddev->sb) {
		MD_BUG();
		return size;
	}
	if (mddev->sb->chunk_size)
		size &= ~(mddev->sb->chunk_size/1024 - 1);
	return size;
}

static unsigned int zoned_raid_size (mddev_t *mddev)
{
	unsigned int mask;
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	if (!mddev->sb) {
		MD_BUG();
		return -EINVAL;
	}
	/*
	 * do size and offset calculations.
	 */
	mask = ~(mddev->sb->chunk_size/1024 - 1);

	ITERATE_RDEV(mddev,rdev,tmp) {
		rdev->size &= mask;
		md_size[mdidx(mddev)] += rdev->size;
	}
	return 0;
}
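
/*
 * Editorial worked example (not in the original source): with a 64KB
 * chunk_size, chunk_size/1024 is 64 blocks, so the mask above is ~63
 * and a 100003-block component is rounded down to 99968 blocks - each
 * rdev is truncated to a whole number of chunks before being summed
 * into md_size[]. The masking trick only works because chunk_size is
 * validated elsewhere to be a power of two.
 */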
/*
 * We check whether all devices are numbered from 0 to nb_dev-1. The
 * order is guaranteed even after device name changes.
 *
 * Some personalities (raid0, linear) use this. Personalities that
 * provide data have to be able to deal with loss of individual
 * disks, so they do their checking themselves.
 */
int md_check_ordering (mddev_t *mddev)
{
	int i, c;
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;

	/*
	 * First, all devices must be fully functional
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			printk("md: md%d's device %s faulty, aborting.\n",
				mdidx(mddev), partition_name(rdev->dev));
			goto abort;
		}
	}

	c = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		c++;
	}
	if (c != mddev->nb_dev) {
		MD_BUG();
		goto abort;
	}
	if (mddev->nb_dev != mddev->sb->raid_disks) {
		printk("md: md%d, array needs %d disks, has %d, aborting.\n",
			mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
		goto abort;
	}
	/*
	 * Now the numbering check
	 */
	for (i = 0; i < mddev->nb_dev; i++) {
		c = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == i)
				c++;
		}
		if (!c) {
			printk("md: md%d, missing disk #%d, aborting.\n",
				mdidx(mddev), i);
			goto abort;
		}
		if (c > 1) {
			printk("md: md%d, too many disks #%d, aborting.\n",
				mdidx(mddev), i);
			goto abort;
		}
	}
	return 0;
abort:
	return 1;
}
static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
{
	if (disk_active(disk)) {
		sb->working_disks--;
	} else {
		if (disk_spare(disk)) {
			sb->spare_disks--;
			sb->working_disks--;
		} else {
			sb->failed_disks--;
		}
	}
	sb->nr_disks--;
	disk->major = 0;
	disk->minor = 0;
	mark_disk_removed(disk);
}

#define BAD_MAGIC KERN_ERR \
"md: invalid raid superblock magic on %s\n"

#define BAD_MINOR KERN_ERR \
"md: %s: invalid raid minor (%x)\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_SB KERN_ERR \
"md: disabled device %s, could not read superblock.\n"

#define BAD_CSUM KERN_WARNING \
"md: invalid superblock checksum on %s\n"
static int alloc_array_sb (mddev_t * mddev)
{
	if (mddev->sb) {
		MD_BUG();
		return 0;
	}

	mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
	if (!mddev->sb)
		return -ENOMEM;
	md_clear_page(mddev->sb);
	return 0;
}

static int alloc_disk_sb (mdk_rdev_t * rdev)
{
	if (rdev->sb)
		MD_BUG();

	rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
	if (!rdev->sb) {
		printk (OUT_OF_MEM);
		return -EINVAL;
	}
	md_clear_page(rdev->sb);

	return 0;
}

static void free_disk_sb (mdk_rdev_t * rdev)
{
	if (rdev->sb) {
		free_page((unsigned long) rdev->sb);
		rdev->sb = NULL;
		rdev->sb_offset = 0;
		rdev->size = 0;
	} else {
		if (!rdev->faulty)
			MD_BUG();
	}
}

static void mark_rdev_faulty (mdk_rdev_t * rdev)
{
	if (!rdev) {
		MD_BUG();
		return;
	}
	free_disk_sb(rdev);
	rdev->faulty = 1;
}
static int read_disk_sb (mdk_rdev_t * rdev)
{
	int ret = -EINVAL;
	struct buffer_head *bh = NULL;
	kdev_t dev = rdev->dev;
	mdp_super_t *sb;
	unsigned long sb_offset;

	if (!rdev->sb) {
		MD_BUG();
		goto abort;
	}

	/*
	 * Calculate the position of the superblock,
	 * it's at the end of the disk
	 */
	sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
	rdev->sb_offset = sb_offset;
	printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset);
	fsync_dev(dev);
	set_blocksize (dev, MD_SB_BYTES);
	bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);

	if (bh) {
		sb = (mdp_super_t *) bh->b_data;
		memcpy (rdev->sb, sb, MD_SB_BYTES);
	} else {
		printk (NO_SB,partition_name(rdev->dev));
		goto abort;
	}
	printk(" [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
	ret = 0;
abort:
	if (bh)
		brelse (bh);
	return ret;
}

static unsigned int calc_sb_csum (mdp_super_t * sb)
{
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
	sb->sb_csum = disk_csum;
	return csum;
}
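
/*
 * Editorial note (not in the original source): the on-disk checksum was
 * computed with sb_csum zeroed, so verification must do the same. The
 * function saves the stored value, zeroes the field, checksums the full
 * MD_SB_BYTES and restores the field, which is why callers can simply
 * test "calc_sb_csum(sb) != sb->sb_csum" to detect corruption.
 */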
/*
 * Check one RAID superblock for generic plausibility
 */

static int check_disk_sb (mdk_rdev_t * rdev)
{
	mdp_super_t *sb;
	int ret = -EINVAL;

	sb = rdev->sb;
	if (!sb) {
		MD_BUG();
		goto abort;
	}

	if (sb->md_magic != MD_SB_MAGIC) {
		printk (BAD_MAGIC, partition_name(rdev->dev));
		goto abort;
	}

	if (sb->md_minor >= MAX_MD_DEVS) {
		printk (BAD_MINOR, partition_name(rdev->dev),
			sb->md_minor);
		goto abort;
	}

	if (calc_sb_csum(sb) != sb->sb_csum)
		printk(BAD_CSUM, partition_name(rdev->dev));
	ret = 0;
abort:
	return ret;
}

static kdev_t dev_unit(kdev_t dev)
{
	unsigned int mask;
	struct gendisk *hd = find_gendisk(dev);

	if (!hd)
		return 0;
	mask = ~((1 << hd->minor_shift) - 1);

	return MKDEV(MAJOR(dev), MINOR(dev) & mask);
}

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp)
		if (dev_unit(rdev->dev) == dev_unit(dev))
			return rdev;

	return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev1,rdev,tmp)
		if (match_dev_unit(mddev2, rdev->dev))
			return 1;

	return 0;
}
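
/*
 * Editorial worked example (not in the original source): IDE disks use
 * minor_shift 6 (64 minors per unit), so dev_unit() masks with ~63 and
 * both hda1 (3,1) and hda2 (3,2) collapse to the whole-disk unit (3,0).
 * That is how match_dev_unit() catches two array members that are
 * partitions of the same physical disk.
 */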
static MD_LIST_HEAD(all_raid_disks);
static MD_LIST_HEAD(pending_raid_disks);

static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;

	if (rdev->mddev) {
		MD_BUG();
		return;
	}
	same_pdev = match_dev_unit(mddev, rdev->dev);
	if (same_pdev)
		printk( KERN_WARNING
"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
"     protection against single-disk failure might be compromised.\n",
			mdidx(mddev), partition_name(rdev->dev),
				partition_name(same_pdev->dev));

	md_list_add(&rdev->same_set, &mddev->disks);
	rdev->mddev = mddev;
	mddev->nb_dev++;
	printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
}

static void unbind_rdev_from_array (mdk_rdev_t * rdev)
{
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	md_list_del(&rdev->same_set);
	MD_INIT_LIST_HEAD(&rdev->same_set);
	rdev->mddev->nb_dev--;
	printk("unbind<%s,%d>\n", partition_name(rdev->dev),
						rdev->mddev->nb_dev);
	rdev->mddev = NULL;
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by opening the device. [simply getting an
 * inode is not enough, the SCSI module usage code needs
 * an explicit open() on the device]
 */
static int lock_rdev (mdk_rdev_t *rdev)
{
	int err = 0;

	/*
	 * First insert a dummy inode.
	 */
	if (rdev->inode)
		MD_BUG();
	rdev->inode = get_empty_inode();
	if (!rdev->inode)
		return -ENOMEM;
	/*
	 * we don't care about any other fields
	 */
	rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
	insert_inode_hash(rdev->inode);

	memset(&rdev->filp, 0, sizeof(rdev->filp));
	rdev->filp.f_mode = 3; /* read write */
	return err;
}

static void unlock_rdev (mdk_rdev_t *rdev)
{
	if (!rdev->inode)
		MD_BUG();
	iput(rdev->inode);
	rdev->inode = NULL;
}

static void export_rdev (mdk_rdev_t * rdev)
{
	printk("export_rdev(%s)\n",partition_name(rdev->dev));
	if (rdev->mddev)
		MD_BUG();
	unlock_rdev(rdev);
	free_disk_sb(rdev);
	md_list_del(&rdev->all);
	MD_INIT_LIST_HEAD(&rdev->all);
	if (rdev->pending.next != &rdev->pending) {
		printk("(%s was pending)\n",partition_name(rdev->dev));
		md_list_del(&rdev->pending);
		MD_INIT_LIST_HEAD(&rdev->pending);
	}
	rdev->dev = 0;
	rdev->faulty = 0;
	kfree(rdev);
}

static void kick_rdev_from_array (mdk_rdev_t * rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
static void export_array (mddev_t *mddev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;
	mdp_super_t *sb = mddev->sb;

	if (mddev->sb) {
		mddev->sb = NULL;
		free_page((unsigned long) sb);
	}

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!rdev->mddev) {
			MD_BUG();
			continue;
		}
		kick_rdev_from_array(rdev);
	}
	if (mddev->nb_dev)
		MD_BUG();
}

static void free_mddev (mddev_t *mddev)
{
	if (!mddev) {
		MD_BUG();
		return;
	}

	export_array(mddev);
	md_size[mdidx(mddev)] = 0;
	md_hd_struct[mdidx(mddev)].nr_sects = 0;

	/*
	 * Make sure nobody else is using this mddev
	 * (careful, we rely on the global kernel lock here)
	 */
	while (md_atomic_read(&mddev->resync_sem.count) != 1)
		schedule();
	while (md_atomic_read(&mddev->recovery_sem.count) != 1)
		schedule();

	del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
	md_list_del(&mddev->all_mddevs);
	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
	kfree(mddev);
	MOD_DEC_USE_COUNT;
}
#undef BAD_CSUM
#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_SB

static void print_desc(mdp_disk_t *desc)
{
	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
		partition_name(MKDEV(desc->major,desc->minor)),
		desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
	int i;

	printk("  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
		sb->major_version, sb->minor_version, sb->patch_version,
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk("     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
		sb->layout, sb->chunk_size);
	printk("     UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)sb->events_lo);

	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		printk("     D %2d: ", i);
		print_desc(desc);
	}
	printk("     THIS: ");
	print_desc(&sb->this_disk);
}

static void print_rdev(mdk_rdev_t *rdev)
{
	printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
		partition_name(rdev->dev), partition_name(rdev->old_dev),
		rdev->size, rdev->faulty, rdev->desc_nr);
	if (rdev->sb) {
		printk("rdev superblock:\n");
		print_sb(rdev->sb);
	} else
		printk("no rdev superblock!\n");
}

void md_print_devices (void)
{
	struct md_list_head *tmp, *tmp2;
	mdk_rdev_t *rdev;
	mddev_t *mddev;

	printk("\n");
	printk("	**********************************\n");
	printk("	* <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("	**********************************\n");
	ITERATE_MDDEV(mddev,tmp) {
		printk("md%d: ", mdidx(mddev));

		ITERATE_RDEV(mddev,rdev,tmp2)
			printk("<%s>", partition_name(rdev->dev));

		if (mddev->sb) {
			printk(" array superblock:\n");
			print_sb(mddev->sb);
		} else
			printk(" no array superblock.\n");

		ITERATE_RDEV(mddev,rdev,tmp2)
			print_rdev(rdev);
	}
	printk("	**********************************\n");
	printk("\n");
}
static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
		ret = 0;
	else
		ret = 1;

abort:
	if (tmp1)
		kfree(tmp1);
	if (tmp2)
		kfree(tmp2);

	return ret;
}

static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
{
	if (	(rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
		(rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
		(rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
		(rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
		return 1;

	return 0;
}

static mdk_rdev_t * find_rdev_all (kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	tmp = all_raid_disks.next;
	while (tmp != &all_raid_disks) {
		rdev = md_list_entry(tmp, mdk_rdev_t, all);
		if (rdev->dev == dev)
			return rdev;
		tmp = tmp->next;
	}
	return NULL;
}
#define GETBLK_FAILED KERN_ERR \
"md: getblk failed for device %s\n"

static int write_disk_sb(mdk_rdev_t * rdev)
{
	struct buffer_head *bh;
	kdev_t dev;
	unsigned long sb_offset, size;
	mdp_super_t *sb;

	if (!rdev->sb) {
		MD_BUG();
		return -1;
	}
	if (rdev->faulty) {
		MD_BUG();
		return -1;
	}
	if (rdev->sb->md_magic != MD_SB_MAGIC) {
		MD_BUG();
		return -1;
	}

	dev = rdev->dev;
	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
	if (rdev->sb_offset != sb_offset) {
		printk("%s's sb offset has changed from %ld to %ld, skipping\n",
			partition_name(dev), rdev->sb_offset, sb_offset);
		goto skip;
	}
	/*
	 * If the disk went offline meanwhile and it's just a spare, then
	 * its size has changed to zero silently, and the MD code does
	 * not yet know that it's faulty.
	 */
	size = calc_dev_size(dev, rdev->mddev, 1);
	if (size != rdev->size) {
		printk("%s's size has changed from %ld to %ld since import, skipping\n",
			partition_name(dev), rdev->size, size);
		goto skip;
	}

	printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
	fsync_dev(dev);
	set_blocksize(dev, MD_SB_BYTES);
	bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
	if (!bh) {
		printk(GETBLK_FAILED, partition_name(dev));
		return 1;
	}
	memset(bh->b_data,0,bh->b_size);
	sb = (mdp_super_t *) bh->b_data;
	memcpy(sb, rdev->sb, MD_SB_BYTES);

	mark_buffer_uptodate(bh, 1);
	mark_buffer_dirty(bh);
	ll_rw_block(WRITE, 1, &bh);
	wait_on_buffer(bh);
	brelse(bh);
	fsync_dev(dev);
skip:
	return 0;
}
#undef GETBLK_FAILED
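
/*
 * Editorial note (not in the original source): both the read and the
 * write path derive the superblock position from calc_dev_sboffset(),
 * whose MD_NEW_SIZE_BLOCKS() macro (linux/raid/md_p.h) rounds the
 * device size down to the reserved-area granularity and subtracts one
 * reserved block, i.e. the superblock lives in the last aligned 64K of
 * the device. A changed offset above therefore means the underlying
 * device itself changed size since import.
 */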
static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	int i, ok = 0;
	mdp_disk_t *desc;

	for (i = 0; i < MD_SB_DISKS; i++) {
		desc = mddev->sb->disks + i;
#if 0
		if (disk_faulty(desc)) {
			if (MKDEV(desc->major,desc->minor) == rdev->dev)
				ok = 1;
			continue;
		}
#endif
		if (MKDEV(desc->major,desc->minor) == rdev->dev) {
			rdev->sb->this_disk = *desc;
			rdev->desc_nr = desc->number;
			ok = 1;
			break;
		}
	}

	if (!ok) {
		MD_BUG();
	}
}

static int sync_sbs(mddev_t * mddev)
{
	mdk_rdev_t *rdev;
	mdp_super_t *sb;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		sb = rdev->sb;
		*sb = *mddev->sb;
		set_this_disk(mddev, rdev);
		sb->sb_csum = calc_sb_csum(sb);
	}
	return 0;
}
int md_update_sb(mddev_t * mddev)
{
	int first, err, count = 100;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

repeat:
	mddev->sb->utime = CURRENT_TIME;
	if ((++mddev->sb->events_lo)==0)
		++mddev->sb->events_hi;

	if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
	}
	sync_sbs(mddev);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (mddev->sb->not_persistent)
		return 0;

	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
					mdidx(mddev));

	first = 1;
	err = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!first)
			printk(", ");
		first = 0;
		if (rdev->faulty)
			printk("(skipping faulty ");
		printk("%s ", partition_name(rdev->dev));
		if (!rdev->faulty) {
			printk("[events: %08lx]",
				(unsigned long)rdev->sb->events_lo);
			err += write_disk_sb(rdev);
		} else
			printk(")\n");
	}
	printk(".\n");
	if (err) {
		printk("errors occurred during superblock update, repeating\n");
		if (--count)
			goto repeat;
		printk("excessive errors occurred during superblock update, exiting\n");
	}
	return 0;
}
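
/*
 * Editorial note (not in the original source): events_lo/events_hi form
 * one 64-bit event counter stored as two 32-bit superblock words, and
 * the increment above is plain carry arithmetic - bump the low word,
 * and when it wraps to zero bump the high word. Both words reading
 * zero afterwards can only mean the full 64-bit counter wrapped, hence
 * the MD_BUG() and the pin to 0xffffffff.
 */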
/*
 * Import a device. If 'on_disk', then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static int md_import_device (kdev_t newdev, int on_disk)
{
	int err;
	mdk_rdev_t *rdev;
	unsigned int size;

	if (find_rdev_all(newdev))
		return -EEXIST;

	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk("could not alloc mem for %s!\n", partition_name(newdev));
		return -ENOMEM;
	}
	memset(rdev, 0, sizeof(*rdev));

	if (get_super(newdev)) {
		printk("md: can not import %s, has active inodes!\n",
			partition_name(newdev));
		err = -EBUSY;
		goto abort_free;
	}

	if ((err = alloc_disk_sb(rdev)))
		goto abort_free;

	rdev->dev = newdev;
	if (lock_rdev(rdev)) {
		printk("md: could not lock %s, zero-size? Marking faulty.\n",
			partition_name(newdev));
		err = -EINVAL;
		goto abort_free;
	}
	rdev->desc_nr = -1;
	rdev->faulty = 0;

	size = 0;
	if (blk_size[MAJOR(newdev)])
		size = blk_size[MAJOR(newdev)][MINOR(newdev)];
	if (!size) {
		printk("md: %s has zero size, marking faulty!\n",
				partition_name(newdev));
		err = -EINVAL;
		goto abort_free;
	}

	if (on_disk) {
		if ((err = read_disk_sb(rdev))) {
			printk("md: could not read %s's sb, not importing!\n",
					partition_name(newdev));
			goto abort_free;
		}
		if ((err = check_disk_sb(rdev))) {
			printk("md: %s has invalid sb, not importing!\n",
					partition_name(newdev));
			goto abort_free;
		}
		rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
					rdev->sb->this_disk.minor);
		rdev->desc_nr = rdev->sb->this_disk.number;
	}
	md_list_add(&rdev->all, &all_raid_disks);
	MD_INIT_LIST_HEAD(&rdev->pending);

	if (rdev->faulty && rdev->sb)
		free_disk_sb(rdev);
	return 0;

abort_free:
	if (rdev->sb) {
		if (rdev->inode)
			unlock_rdev(rdev);
		free_disk_sb(rdev);
	}
	kfree(rdev);
	return err;
}
/*
 * Check a full RAID array for plausibility
 */

#define INCONSISTENT KERN_ERR \
"md: fatal superblock inconsistency in %s -- removing from array\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: md%d: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: md%d: raid array is not clean -- starting background reconstruction\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: md%d: unsupported raid level %d\n"

static int analyze_sbs (mddev_t * mddev)
{
	int out_of_date = 0, i;
	struct md_list_head *tmp, *tmp2;
	mdk_rdev_t *rdev, *rdev2, *freshest;
	mdp_super_t *sb;

	/*
	 * Verify the RAID superblock on each real device
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			MD_BUG();
			goto abort;
		}
		if (!rdev->sb) {
			MD_BUG();
			goto abort;
		}
		if (check_disk_sb(rdev))
			goto abort;
	}

	/*
	 * The superblock constant part has to be the same
	 * for all disks in the array.
	 */
	sb = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!sb) {
			sb = rdev->sb;
			continue;
		}
		if (!sb_equal(sb, rdev->sb)) {
			printk (INCONSISTENT, partition_name(rdev->dev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * OK, we have all disks and the array is ready to run. Let's
	 * find the freshest superblock, that one will be the superblock
	 * that represents the whole array.
	 */
	if (!mddev->sb)
		if (alloc_array_sb(mddev))
			goto abort;
	sb = mddev->sb;
	freshest = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2;
		/*
		 * if the checksum is invalid, use the superblock
		 * only as a last resort. (decrease its age by
		 * one event)
		 */
		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
			if (rdev->sb->events_lo || rdev->sb->events_hi)
				if ((rdev->sb->events_lo--)==0)
					rdev->sb->events_hi--;
		}

		printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
			(unsigned long)rdev->sb->events_lo);
		if (!freshest) {
			freshest = rdev;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		ev1 = md_event(rdev->sb);
		ev2 = md_event(freshest->sb);
		if (ev1 != ev2) {
			out_of_date = 1;
			if (ev1 > ev2)
				freshest = rdev;
		}
	}
	if (out_of_date) {
		printk(OUT_OF_DATE);
		printk("freshest: %s\n", partition_name(freshest->dev));
	}
	memcpy (sb, freshest->sb, sizeof(*sb));

	/*
	 * at this point we have picked the 'best' superblock
	 * from all available superblocks.
	 * now we validate this superblock and kick out possibly
	 * failed disks.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		/*
		 * Kick all non-fresh devices faulty
		 */
		__u64 ev1, ev2;
		ev1 = md_event(rdev->sb);
		ev2 = md_event(sb);
		++ev1;
		if (ev1 < ev2) {
			printk("md: kicking non-fresh %s from array!\n",
						partition_name(rdev->dev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * Fix up changed device names ... but only if this disk has a
	 * recent update time. Use faulty checksum ones too.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2, ev3;
		if (rdev->faulty) { /* REMOVEME */
			MD_BUG();
			goto abort;
		}
		ev1 = md_event(rdev->sb);
		ev2 = md_event(sb);
		ev3 = ev2;
		--ev3;
		if ((rdev->dev != rdev->old_dev) &&
			((ev1 == ev2) || (ev1 == ev3))) {
			mdp_disk_t *desc;

			printk("md: device name has changed from %s to %s since last import!\n",
				partition_name(rdev->old_dev), partition_name(rdev->dev));
			if (rdev->desc_nr == -1) {
				MD_BUG();
				goto abort;
			}
			desc = &sb->disks[rdev->desc_nr];
			if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
				MD_BUG();
				goto abort;
			}
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
			desc = &rdev->sb->this_disk;
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
		}
	}

	/*
	 * Remove unavailable and faulty devices ...
	 *
	 * note that if an array becomes completely unrunnable due to
	 * missing devices, we do not write the superblock back, so the
	 * administrator has a chance to fix things up. The removal thus
	 * only happens if it's nonfatal to the contents of the array.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		int found;
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		/*
		 * We kick faulty devices/descriptors immediately.
		 */
		if (disk_faulty(desc)) {
			found = 0;
			ITERATE_RDEV(mddev,rdev,tmp) {
				if (rdev->desc_nr != desc->number)
					continue;
				printk("md%d: kicking faulty %s!\n",
					mdidx(mddev),partition_name(rdev->dev));
				kick_rdev_from_array(rdev);
				found = 1;
				break;
			}
			if (!found) {
				if (dev == MKDEV(0,0))
					continue;
				printk("md%d: removing former faulty %s!\n",
					mdidx(mddev), partition_name(dev));
			}
			remove_descriptor(desc, sb);
			continue;
		}

		if (dev == MKDEV(0,0))
			continue;
		/*
		 * Is this device present in the rdev ring?
		 */
		found = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == desc->number) {
				found = 1;
				break;
			}
		}
		if (found)
			continue;

		printk("md%d: former device %s is unavailable, removing from array!\n",
			mdidx(mddev), partition_name(dev));
		remove_descriptor(desc, sb);
	}

	/*
	 * Double check whether all devices mentioned in the
	 * superblock are in the rdev ring.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		if (dev == MKDEV(0,0))
			continue;

		if (disk_faulty(desc)) {
			MD_BUG();
			goto abort;
		}

		rdev = find_rdev(mddev, dev);
		if (!rdev) {
			MD_BUG();
			goto abort;
		}
	}

	/*
	 * Do a final reality check.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == -1) {
			MD_BUG();
			goto abort;
		}
		/*
		 * is the desc_nr unique?
		 */
		ITERATE_RDEV(mddev,rdev2,tmp2) {
			if ((rdev2 != rdev) &&
					(rdev2->desc_nr == rdev->desc_nr)) {
				MD_BUG();
				goto abort;
			}
		}
		/*
		 * is the device unique?
		 */
		ITERATE_RDEV(mddev,rdev2,tmp2) {
			if ((rdev2 != rdev) &&
					(rdev2->dev == rdev->dev)) {
				MD_BUG();
				goto abort;
			}
		}
	}

	/*
	 * Check if we can support this RAID array
	 */
	if (sb->major_version != MD_MAJOR_VERSION ||
			sb->minor_version > MD_MINOR_VERSION) {

		printk (OLD_VERSION, mdidx(mddev), sb->major_version,
				sb->minor_version, sb->patch_version);
		goto abort;
	}

	if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
			(sb->level == 4) || (sb->level == 5)))
		printk (NOT_CLEAN_IGNORE, mdidx(mddev));

	return 0;
abort:
	return 1;
}

#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef OLD_LEVEL
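
/*
 * Editorial example of the freshness rule above (not in the original
 * source): if the chosen superblock carries event count 42, a member at
 * 42 or 41 is kept (ev1+1 >= ev2), while a member at 40 or older is
 * kicked as non-fresh. The one-event slack tolerates a crash landing
 * between the first and the last superblock write of a single update.
 */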
static int device_size_calculation (mddev_t * mddev)
{
	int data_disks = 0, persistent;
	unsigned int readahead;
	mdp_super_t *sb = mddev->sb;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	/*
	 * Do device size calculation. Bail out if too small.
	 * (we have to do this after having validated chunk_size,
	 * because device size has to be modulo chunk_size)
	 */
	persistent = !mddev->sb->not_persistent;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		if (rdev->size) {
			MD_BUG();
			continue;
		}
		rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
		if (rdev->size < sb->chunk_size / 1024) {
			printk (KERN_WARNING
				"Dev %s smaller than chunk_size: %ldk < %dk\n",
				partition_name(rdev->dev),
				rdev->size, sb->chunk_size / 1024);
			return -EINVAL;
		}
	}

	switch (sb->level) {
		case -3:
			data_disks = 1;
			break;
		case -2:
			data_disks = 1;
			break;
		case -1:
			zoned_raid_size(mddev);
			data_disks = 1;
			break;
		case 0:
			zoned_raid_size(mddev);
			data_disks = sb->raid_disks;
			break;
		case 1:
			data_disks = 1;
			break;
		case 4:
		case 5:
			data_disks = sb->raid_disks-1;
			break;
		default:
			printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
			goto abort;
	}
	if (!md_size[mdidx(mddev)])
		md_size[mdidx(mddev)] = sb->size * data_disks;

	readahead = MD_READAHEAD;
	if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
		readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
			readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
	} else {
		if (sb->level == -3)
			readahead = 0;
	}
	md_maxreadahead[mdidx(mddev)] = readahead;

	printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
		mdidx(mddev), readahead*(PAGE_SIZE/1024));

	printk(KERN_INFO
		"md%d: %d data-disks, max readahead per data-disk: %ldk\n",
			mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
	return 0;
abort:
	return 1;
}
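
/*
 * Editorial example (not in the original source): for a 4-disk array,
 * md_size becomes sb->size * 4 for raid0 (every disk carries data),
 * sb->size * 3 for raid4/5 (one disk's worth of capacity goes to
 * parity) and sb->size * 1 for raid1 (every disk holds a full copy).
 * The negative levels are the non-striped personalities of this era of
 * the driver - linear (-1) plus the HSM (-2) and translucent (-3)
 * placeholders - for which linear's size comes from zoned_raid_size()
 * instead.
 */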
#define TOO_BIG_CHUNKSIZE KERN_ERR \
"too big chunk_size: %d > %d\n"

#define TOO_SMALL_CHUNKSIZE KERN_ERR \
"too small chunk_size: %d < %ld\n"

#define BAD_CHUNKSIZE KERN_ERR \
"no chunksize specified, see 'man raidtab'\n"

static int do_md_run (mddev_t * mddev)
{
	int pnum, err;
	int chunk_size;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	if (!mddev->nb_dev) {
		MD_BUG();
		return -EINVAL;
	}

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Resize disks to align partitions size on a given
	 * chunk size.
	 */
	md_size[mdidx(mddev)] = 0;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (analyze_sbs(mddev)) {
		MD_BUG();
		return -EINVAL;
	}

	chunk_size = mddev->sb->chunk_size;
	pnum = level_to_pers(mddev->sb->level);

	mddev->param.chunk_size = chunk_size;
	mddev->param.personality = pnum;

	if (chunk_size > MAX_CHUNK_SIZE) {
		printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
		return -EINVAL;
	}
	/*
	 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
	 */
	if ( (1 << ffz(~chunk_size)) != chunk_size) {
		MD_BUG();
		return -EINVAL;
	}
	if (chunk_size < PAGE_SIZE) {
		printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
		return -EINVAL;
	}

	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
		/*
		 * 'default chunksize' in the old md code used to
		 * be PAGE_SIZE, baaad.
		 * we abort here to be on the safe side. We don't
		 * want to continue the bad practice.
		 */
		printk(BAD_CHUNKSIZE);
		return -EINVAL;
	}

	if (!pers[pnum])
	{
#ifdef CONFIG_KMOD
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
			return -EINVAL;
	}

	if (device_size_calculation(mddev))
		return -EINVAL;

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 * Also find largest hardsector size
	 */
	md_hardsect_sizes[mdidx(mddev)] = 512;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		fsync_dev(rdev->dev);
		invalidate_buffers(rdev->dev);
		if (get_hardsect_size(rdev->dev)
			> md_hardsect_sizes[mdidx(mddev)])
			md_hardsect_sizes[mdidx(mddev)] =
				get_hardsect_size(rdev->dev);
	}
	md_blocksizes[mdidx(mddev)] = 1024;
	if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
		md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
	mddev->pers = pers[pnum];

	err = mddev->pers->run(mddev);
	if (err) {
		printk("pers->run() failed ...\n");
		mddev->pers = NULL;
		return -EINVAL;
	}

	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
	md_update_sb(mddev);

	/*
	 * md_size has units of 1K blocks, which are
	 * twice as large as sectors.
	 */
	md_hd_struct[mdidx(mddev)].start_sect = 0;
	md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;

	read_ahead[MD_MAJOR] = 1024;
	return (0);
}

#undef TOO_BIG_CHUNKSIZE
#undef BAD_CHUNKSIZE
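
/*
 * Editorial note on the power-of-two test above (not in the original
 * source): ffz(~x) returns the position of the lowest set bit of x, so
 * (1 << ffz(~x)) == x exactly when x has a single bit set. E.g.
 * x = 0x10000 (64K) passes, while x = 0x18000 (96K) yields
 * 1 << 15 = 0x8000 != x and is rejected.
 */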
#define OUT(x) do { err = (x); goto out; } while (0)

static int restart_array (mddev_t *mddev)
{
	int err = 0;

	/*
	 * Complain if it has no devices
	 */
	if (!mddev->nb_dev)
		OUT(-ENXIO);

	if (mddev->pers) {
		if (!mddev->ro)
			OUT(-EBUSY);

		mddev->ro = 0;
		set_device_ro(mddev_to_kdev(mddev), 0);

		printk (KERN_INFO
			"md%d switched to read-write mode.\n", mdidx(mddev));
		/*
		 * Kick recovery or resync if necessary
		 */
		md_recover_arrays();
		if (mddev->pers->restart_resync)
			mddev->pers->restart_resync(mddev);
	} else
		err = -EINVAL;

out:
	return err;
}
#define STILL_MOUNTED KERN_WARNING \
"md: md%d still mounted.\n"

static int do_md_stop (mddev_t * mddev, int ro)
{
	int err = 0, resync_interrupted = 0;
	kdev_t dev = mddev_to_kdev(mddev);

	if (!ro && get_super(dev)) {
		printk (STILL_MOUNTED, mdidx(mddev));
		OUT(-EBUSY);
	}

	if (mddev->pers) {
		/*
		 * It is safe to call stop here, it only frees private
		 * data. Also, it tells us if a device is unstoppable
		 * (eg. resyncing is in progress)
		 */
		if (mddev->pers->stop_resync)
			if (mddev->pers->stop_resync(mddev))
				resync_interrupted = 1;

		if (mddev->recovery_running)
			md_interrupt_thread(md_recovery_thread);

		/*
		 * This synchronizes with signal delivery to the
		 * resync or reconstruction thread. It also nicely
		 * hangs the process if some reconstruction has not
		 * finished.
		 */
		down(&mddev->recovery_sem);
		up(&mddev->recovery_sem);

		/*
		 * sync and invalidate buffers because we cannot kill the
		 * main thread with valid IO transfers still around.
		 * the kernel lock protects us from new requests being
		 * added after invalidate_buffers().
		 */
		fsync_dev (mddev_to_kdev(mddev));
		fsync_dev (dev);
		invalidate_buffers (dev);

		if (ro) {
			if (mddev->ro)
				OUT(-ENXIO);
			mddev->ro = 1;
		} else {
			if (mddev->ro)
				set_device_ro(dev, 0);
			if (mddev->pers->stop(mddev)) {
				if (mddev->ro)
					set_device_ro(dev, 1);
				OUT(-EBUSY);
			}
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (mddev->sb) {
			/*
			 * mark it clean only if there was no resync
			 * interrupted.
			 */
			if (!mddev->recovery_running && !resync_interrupted) {
				printk("marking sb clean...\n");
				mddev->sb->state |= 1 << MD_SB_CLEAN;
			}
			md_update_sb(mddev);
		}
		if (ro)
			set_device_ro(dev, 1);
	}

	/*
	 * Free resources if final stop
	 */
	if (!ro) {
		printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
		free_mddev(mddev);

	} else
		printk (KERN_INFO
			"md%d switched to read-only mode.\n", mdidx(mddev));
out:
	return err;
}

#undef OUT
/*
 * We have to safely support old arrays too.
 */
int detect_old_array (mdp_super_t *sb)
{
	if (sb->major_version > 0)
		return 0;
	if (sb->minor_version >= 90)
		return 0;

	return -EINVAL;
}


static void autorun_array (mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;
	int err;

	if (mddev->disks.prev == &mddev->disks) {
		MD_BUG();
		return;
	}

	printk("running: ");

	ITERATE_RDEV(mddev,rdev,tmp) {
		printk("<%s>", partition_name(rdev->dev));
	}
	printk("\nnow!\n");

	err = do_md_run (mddev);
	if (err) {
		printk("do_md_run() returned %d\n", err);
		/*
		 * prevent the writeback of an unrunnable array
		 */
		mddev->sb_dirty = 0;
		do_md_stop (mddev, 0);
	}
}
/*
 * let's try to run arrays based on all disks that have arrived
 * until now. (those are in the ->pending list)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 */
static void autorun_devices (void)
{
	struct md_list_head candidates;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev0, *rdev;
	mddev_t *mddev;
	kdev_t md_kdev;


	printk("autorun ...\n");
	while (pending_raid_disks.next != &pending_raid_disks) {
		rdev0 = md_list_entry(pending_raid_disks.next,
					 mdk_rdev_t, pending);

		printk("considering %s ...\n", partition_name(rdev0->dev));
		MD_INIT_LIST_HEAD(&candidates);
		ITERATE_RDEV_PENDING(rdev,tmp) {
			if (uuid_equal(rdev0, rdev)) {
				if (!sb_equal(rdev0->sb, rdev->sb)) {
					printk("%s has same UUID as %s, but superblocks differ ...\n",
						partition_name(rdev->dev), partition_name(rdev0->dev));
					continue;
				}
				printk("  adding %s ...\n", partition_name(rdev->dev));
				md_list_del(&rdev->pending);
				md_list_add(&rdev->pending, &candidates);
			}
		}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
		mddev = kdev_to_mddev(md_kdev);
		if (mddev) {
			printk("md%d already running, cannot run %s\n",
				mdidx(mddev), partition_name(rdev0->dev));
			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
				export_rdev(rdev);
			continue;
		}
		mddev = alloc_mddev(md_kdev);
		printk("created md%d\n", mdidx(mddev));
		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
			bind_rdev_to_array(rdev, mddev);
			md_list_del(&rdev->pending);
			MD_INIT_LIST_HEAD(&rdev->pending);
		}
		autorun_array(mddev);
	}
	printk("... autorun DONE.\n");
}
/*
 * import RAID devices based on one partition
 * if possible, the array gets run as well.
 */

#define BAD_VERSION KERN_ERR \
"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define AUTOADD_FAILED KERN_ERR \
"md: auto-adding devices to md%d FAILED (error %d).\n"

#define AUTOADD_FAILED_USED KERN_ERR \
"md: cannot auto-add device %s to md%d, already used.\n"

#define AUTORUN_FAILED KERN_ERR \
"md: auto-running md%d FAILED (error %d).\n"

#define MDDEV_BUSY KERN_ERR \
"md: cannot auto-add to md%d, already running.\n"

#define AUTOADDING KERN_INFO \
"md: auto-adding devices to md%d, based on %s's superblock.\n"

#define AUTORUNNING KERN_INFO \
"md: auto-running md%d.\n"

static int autostart_array (kdev_t startdev)
{
	int err = -EINVAL, i;
	mdp_super_t *sb = NULL;
	mdk_rdev_t *start_rdev = NULL, *rdev;

	if (md_import_device(startdev, 1)) {
		printk("could not import %s!\n", partition_name(startdev));
		goto abort;
	}

	start_rdev = find_rdev_all(startdev);
	if (!start_rdev) {
		MD_BUG();
		goto abort;
	}
	if (start_rdev->faulty) {
		printk("can not autostart based on faulty %s!\n",
						partition_name(startdev));
		goto abort;
	}
	md_list_add(&start_rdev->pending, &pending_raid_disks);

	sb = start_rdev->sb;

	err = detect_old_array(sb);
	if (err) {
		printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
		goto abort;
	}

	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		if (dev == MKDEV(0,0))
			continue;
		if (dev == startdev)
			continue;
		if (md_import_device(dev, 1)) {
			printk("could not import %s, trying to run array nevertheless.\n",
				partition_name(dev));
			continue;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			goto abort;
		}
		md_list_add(&rdev->pending, &pending_raid_disks);
	}

	/*
	 * possibly return codes
	 */
	autorun_devices();
	return 0;

abort:
	if (start_rdev)
		export_rdev(start_rdev);
	return err;
}

#undef BAD_VERSION
#undef OUT_OF_MEM
#undef NO_DEVICE
#undef AUTOADD_FAILED_USED
#undef AUTOADD_FAILED
#undef AUTORUN_FAILED
#undef AUTOADDING
#undef AUTORUNNING
struct {
	int set;
	int noautodetect;

} raid_setup_args md__initdata = { 0, 0 };

void md_setup_drive(void) md__init;

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
#ifdef CONFIG_AUTODETECT_RAID
static int detected_devices[128] md__initdata;
static int dev_cnt=0;
void md_autodetect_dev(kdev_t dev)
{
	if (dev_cnt >= 0 && dev_cnt < 127)
		detected_devices[dev_cnt++] = dev;
}
#endif

void md__init md_run_setup(void)
{
#ifdef CONFIG_AUTODETECT_RAID
	mdk_rdev_t *rdev;
	int i;

	if (raid_setup_args.noautodetect)
		printk(KERN_INFO "skipping autodetection of RAID arrays\n");
	else {

		printk(KERN_INFO "autodetecting RAID arrays\n");

		for (i=0; i<dev_cnt; i++) {
			kdev_t dev = detected_devices[i];

			if (md_import_device(dev,1)) {
				printk(KERN_ALERT "could not import %s!\n",
					partition_name(dev));
				continue;
			}
			/*
			 * Sanity checks:
			 */
			rdev = find_rdev_all(dev);
			if (!rdev) {
				MD_BUG();
				continue;
			}
			if (rdev->faulty) {
				MD_BUG();
				continue;
			}
			md_list_add(&rdev->pending, &pending_raid_disks);
		}

		autorun_devices();
	}

	dev_cnt = -1; /* make sure further calls to md_autodetect_dev are ignored */
#endif
#ifdef CONFIG_MD_BOOT
	md_setup_drive();
#endif
}
static int get_version (void * arg)
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (md_copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}
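
/*
 * Editorial sketch of the userspace side (not part of this file): this
 * is roughly how a tool such as raidtools would query the driver
 * version through the RAID_VERSION ioctl handled above.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	mdu_version_t ver;
 *	int fd = open("/dev/md0", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, RAID_VERSION, &ver) == 0)
 *		printf("md driver %d.%d.%d\n",
 *		       ver.major, ver.minor, ver.patchlevel);
 */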
#define SET_FROM_SB(x) info.x = mddev->sb->x
static int get_array_info (mddev_t * mddev, void * arg)
{
	mdu_array_info_t info;

	if (!mddev->sb)
		return -EINVAL;

	SET_FROM_SB(major_version);
	SET_FROM_SB(minor_version);
	SET_FROM_SB(patch_version);
	SET_FROM_SB(ctime);
	SET_FROM_SB(level);
	SET_FROM_SB(size);
	SET_FROM_SB(nr_disks);
	SET_FROM_SB(raid_disks);
	SET_FROM_SB(md_minor);
	SET_FROM_SB(not_persistent);

	SET_FROM_SB(utime);
	SET_FROM_SB(state);
	SET_FROM_SB(active_disks);
	SET_FROM_SB(working_disks);
	SET_FROM_SB(failed_disks);
	SET_FROM_SB(spare_disks);

	SET_FROM_SB(layout);
	SET_FROM_SB(chunk_size);

	if (md_copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
#undef SET_FROM_SB

#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
static int get_disk_info (mddev_t * mddev, void * arg)
{
	mdu_disk_info_t info;
	unsigned int nr;

	if (!mddev->sb)
		return -EINVAL;

	if (md_copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	nr = info.number;
	if (nr >= mddev->sb->nr_disks)
		return -EINVAL;

	SET_FROM_SB(major);
	SET_FROM_SB(minor);
	SET_FROM_SB(raid_disk);
	SET_FROM_SB(state);

	if (md_copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
#undef SET_FROM_SB
#define SET_SB(x) mddev->sb->disks[nr].x = info->x

static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info)
{
	int err, size, persistent;
	mdk_rdev_t *rdev;
	unsigned int nr;
	kdev_t dev;
	dev = MKDEV(info->major,info->minor);

	if (find_rdev_all(dev)) {
		printk("device %s already used in a RAID array!\n",
			partition_name(dev));
		return -EBUSY;
	}
	if (!mddev->sb) {
		/* expecting a device which has a superblock */
		err = md_import_device(dev, 1);
		if (err) {
			printk("md error, md_import_device returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}
		if (mddev->nb_dev) {
			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			if (!uuid_equal(rdev0, rdev)) {
				printk("md: %s has different UUID to %s\n",
					partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
			if (!sb_equal(rdev0->sb, rdev->sb)) {
				printk("md: %s has same UUID but different superblock to %s\n",
					partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		bind_rdev_to_array(rdev, mddev);
		return 0;
	}

	nr = info->number;
	if (nr >= mddev->sb->nr_disks)
		return -EINVAL;

	SET_SB(number);
	SET_SB(major);
	SET_SB(minor);
	SET_SB(raid_disk);
	SET_SB(state);

	if ((info->state & (1<<MD_DISK_FAULTY))==0) {
		err = md_import_device (dev, 0);
		if (err) {
			printk("md: error, md_import_device() returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}

		rdev->old_dev = dev;
		rdev->desc_nr = info->number;

		bind_rdev_to_array(rdev, mddev);

		persistent = !mddev->sb->not_persistent;
		if (!persistent)
			printk("nonpersistent superblock ...\n");
		if (!mddev->sb->chunk_size)
			printk("no chunksize?\n");

		size = calc_dev_size(dev, mddev, persistent);
		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);

		if (!mddev->sb->size || (mddev->sb->size > size))
			mddev->sb->size = size;
	}

	/*
	 * sync all other superblocks with the main superblock
	 */
	sync_sbs(mddev);

	return 0;
}
#undef SET_SB
static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
{
	int err;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk("trying to remove %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk("md%d: personality does not support diskops!\n",
								mdidx(mddev));
		return -EINVAL;
	}

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}
	disk = &mddev->sb->disks[rdev->desc_nr];
	if (disk_active(disk))
		goto busy;
	if (disk_removed(disk)) {
		MD_BUG();
		return -EINVAL;
	}

	err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
	if (err == -EBUSY)
		goto busy;
	if (err) {
		MD_BUG();
		return -EINVAL;
	}

	remove_descriptor(disk, mddev->sb);
	kick_rdev_from_array(rdev);
	mddev->sb_dirty = 1;
	md_update_sb(mddev);

	return 0;
busy:
	printk("cannot remove active disk %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));
	return -EBUSY;
}
static int hot_add_disk (mddev_t * mddev, kdev_t dev)
{
	int i, err, persistent;
	unsigned int size;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk("trying to hot-add %s to md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk("md%d: personality does not support diskops!\n",
								mdidx(mddev));
		return -EINVAL;
	}

	persistent = !mddev->sb->not_persistent;
	size = calc_dev_size(dev, mddev, persistent);

	if (size < mddev->sb->size) {
		printk("md%d: disk size %d blocks < array size %d\n",
				mdidx(mddev), size, mddev->sb->size);
		return -ENOSPC;
	}

	rdev = find_rdev(mddev, dev);
	if (rdev)
		return -EBUSY;

	err = md_import_device (dev, 0);
	if (err) {
		printk("md: error, md_import_device() returned %d\n", err);
		return -EINVAL;
	}
	rdev = find_rdev_all(dev);
	if (!rdev) {
		MD_BUG();
		return -EINVAL;
	}
	if (rdev->faulty) {
		printk("md: can not hot-add faulty %s disk to md%d!\n",
				partition_name(dev), mdidx(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	bind_rdev_to_array(rdev, mddev);

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */
	rdev->old_dev = dev;
	rdev->size = size;
	rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);

	disk = mddev->sb->disks + mddev->sb->raid_disks;
	for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
		disk = mddev->sb->disks + i;

		if (!disk->major && !disk->minor)
			break;
		if (disk_removed(disk))
			break;
	}
	if (i == MD_SB_DISKS) {
		printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
		err = -EBUSY;
		goto abort_unbind_export;
	}

	if (disk_removed(disk)) {
		/*
		 * reuse slot
		 */
		if (disk->number != i) {
			MD_BUG();
			err = -EINVAL;
			goto abort_unbind_export;
		}
	} else {
		disk->number = i;
	}

	disk->raid_disk = disk->number;
	disk->major = MAJOR(dev);
	disk->minor = MINOR(dev);

	if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
		MD_BUG();
		err = -EINVAL;
		goto abort_unbind_export;
	}

	mark_disk_spare(disk);
	mddev->sb->nr_disks++;
	mddev->sb->spare_disks++;
	mddev->sb->working_disks++;

	mddev->sb_dirty = 1;

	md_update_sb(mddev);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	md_recover_arrays();

	return 0;

abort_unbind_export:
	unbind_rdev_from_array(rdev);

abort_export:
	export_rdev(rdev);
	return err;
}
2435 #define SET_SB(x) mddev->sb->x = info->x
2436 static int set_array_info (mddev_t * mddev, mdu_array_info_t *info)
2439 if (alloc_array_sb(mddev))
2440 return -ENOMEM;
2442 mddev->sb->major_version = MD_MAJOR_VERSION;
2443 mddev->sb->minor_version = MD_MINOR_VERSION;
2444 mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2445 mddev->sb->ctime = CURRENT_TIME;
2447 SET_SB(level);
2448 SET_SB(size);
2449 SET_SB(nr_disks);
2450 SET_SB(raid_disks);
2451 SET_SB(md_minor);
2452 SET_SB(not_persistent);
2454 SET_SB(state);
2455 SET_SB(active_disks);
2456 SET_SB(working_disks);
2457 SET_SB(failed_disks);
2458 SET_SB(spare_disks);
2460 SET_SB(layout);
2461 SET_SB(chunk_size);
2463 mddev->sb->md_magic = MD_SB_MAGIC;
2466 * Generate a 128 bit UUID
2468 get_random_bytes(&mddev->sb->set_uuid0, 4);
2469 get_random_bytes(&mddev->sb->set_uuid1, 4);
2470 get_random_bytes(&mddev->sb->set_uuid2, 4);
2471 get_random_bytes(&mddev->sb->set_uuid3, 4);
2473 return 0;
2475 #undef SET_SB
2477 static int set_disk_info (mddev_t * mddev, void * arg)
2479 printk("not yet");
2480 return -EINVAL;
2483 static int clear_array (mddev_t * mddev)
2485 printk("not yet");
2486 return -EINVAL;
2489 static int write_raid_info (mddev_t * mddev)
2491 printk("not yet");
2492 return -EINVAL;
2495 static int protect_array (mddev_t * mddev)
printk("md: protect_array() not yet implemented\n");
2498 return -EINVAL;
2501 static int unprotect_array (mddev_t * mddev)
printk("md: unprotect_array() not yet implemented\n");
2504 return -EINVAL;
2507 static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
2509 int ret;
2511 fsync_dev(mddev_to_kdev(mddev));
2512 ret = md_error(mddev_to_kdev(mddev), dev);
2513 return ret;
2516 static int md_ioctl (struct inode *inode, struct file *file,
2517 unsigned int cmd, unsigned long arg)
2519 unsigned int minor;
2520 int err = 0;
2521 struct hd_geometry *loc = (struct hd_geometry *) arg;
2522 mddev_t *mddev = NULL;
2523 kdev_t dev;
2525 if (!md_capable_admin())
2526 return -EACCES;
2528 dev = inode->i_rdev;
2529 minor = MINOR(dev);
2530 if (minor >= MAX_MD_DEVS)
2531 return -EINVAL;
2534 * Commands dealing with the RAID driver but not any
2535 * particular array:
2537 switch (cmd)
2539 case RAID_VERSION:
2540 err = get_version((void *)arg);
2541 goto done;
2543 case PRINT_RAID_DEBUG:
2544 err = 0;
2545 md_print_devices();
2546 goto done_unlock;
2548 case BLKGETSIZE: /* Return device size */
2549 if (!arg) {
2550 err = -EINVAL;
2551 goto abort;
2553 err = md_put_user(md_hd_struct[minor].nr_sects,
2554 (long *) arg);
2555 goto done;
2557 case BLKFLSBUF:
2558 fsync_dev(dev);
2559 invalidate_buffers(dev);
2560 goto done;
2562 case BLKRASET:
2563 if (arg > 0xff) {
2564 err = -EINVAL;
2565 goto abort;
2567 read_ahead[MAJOR(dev)] = arg;
2568 goto done;
2570 case BLKRAGET:
2571 if (!arg) {
2572 err = -EINVAL;
2573 goto abort;
err = md_put_user(read_ahead[MAJOR(dev)], (long *) arg);
2577 goto done;
2578 default:
2582 * Commands creating/starting a new array:
2585 mddev = kdev_to_mddev(dev);
2587 switch (cmd)
2589 case SET_ARRAY_INFO:
2590 case START_ARRAY:
2591 if (mddev) {
2592 printk("array md%d already exists!\n",
2593 mdidx(mddev));
2594 err = -EEXIST;
2595 goto abort;
2597 default:
2599 switch (cmd)
2601 case SET_ARRAY_INFO:
2602 mddev = alloc_mddev(dev);
2603 if (!mddev) {
2604 err = -ENOMEM;
2605 goto abort;
2608 * alloc_mddev() should possibly self-lock.
2610 err = lock_mddev(mddev);
2611 if (err) {
2612 printk("ioctl, reason %d, cmd %d\n", err, cmd);
2613 goto abort;
2616 if (mddev->sb) {
2617 printk("array md%d already has a superblock!\n",
2618 mdidx(mddev));
2619 err = -EBUSY;
2620 goto abort_unlock;
2622 if (arg) {
2623 mdu_array_info_t info;
2624 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2625 err = -EFAULT;
2626 goto abort_unlock;
2628 err = set_array_info(mddev, &info);
2629 if (err) {
2630 printk("couldnt set array info. %d\n", err);
2631 goto abort_unlock;
2634 goto done_unlock;
2636 case START_ARRAY:
2638 * possibly make it lock the array ...
2640 err = autostart_array((kdev_t)arg);
2641 if (err) {
2642 printk("autostart %s failed!\n",
2643 partition_name((kdev_t)arg));
2644 goto abort;
2646 goto done;
2648 default:
2652 * Commands querying/configuring an existing array:
2655 if (!mddev) {
2656 err = -ENODEV;
2657 goto abort;
2659 err = lock_mddev(mddev);
2660 if (err) {
2661 printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2662 goto abort;
2664 /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2665 if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2666 err = -ENODEV;
2667 goto abort_unlock;
2671 * Commands even a read-only array can execute:
2673 switch (cmd)
2675 case GET_ARRAY_INFO:
2676 err = get_array_info(mddev, (void *)arg);
2677 goto done_unlock;
2679 case GET_DISK_INFO:
2680 err = get_disk_info(mddev, (void *)arg);
2681 goto done_unlock;
2683 case RESTART_ARRAY_RW:
2684 err = restart_array(mddev);
2685 goto done_unlock;
2687 case STOP_ARRAY:
2688 if (!(err = do_md_stop (mddev, 0)))
2689 mddev = NULL;
2690 goto done_unlock;
2692 case STOP_ARRAY_RO:
2693 err = do_md_stop (mddev, 1);
2694 goto done_unlock;
* We have a problem here: there is no easy way to give a CHS
* virtual geometry. We currently pretend to have 2 heads and
* 4 sectors (with a BIG number of cylinders...). This drives
* dosfs just mad... ;-)
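*
* Worked example (hypothetical array size): with 2 heads and 4
* sectors per track, a 1 GB array of 2097152 sectors shows up
* as 2097152/8 = 262144 cylinders, which is why nr_sects is
* divided by 8 below.
*/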
2702 case HDIO_GETGEO:
2703 if (!loc) {
2704 err = -EINVAL;
2705 goto abort_unlock;
2707 err = md_put_user (2, (char *) &loc->heads);
2708 if (err)
2709 goto abort_unlock;
2710 err = md_put_user (4, (char *) &loc->sectors);
2711 if (err)
2712 goto abort_unlock;
2713 err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2714 (short *) &loc->cylinders);
2715 if (err)
2716 goto abort_unlock;
2717 err = md_put_user (md_hd_struct[minor].start_sect,
2718 (long *) &loc->start);
2719 goto done_unlock;
2723 * The remaining ioctls are changing the state of the
2724 * superblock, so we do not allow read-only arrays
2725 * here:
2727 if (mddev->ro) {
2728 err = -EROFS;
2729 goto abort_unlock;
2732 switch (cmd)
2734 case CLEAR_ARRAY:
2735 err = clear_array(mddev);
2736 goto done_unlock;
2738 case ADD_NEW_DISK:
2740 mdu_disk_info_t info;
2741 if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2742 err = -EFAULT;
2743 else
2744 err = add_new_disk(mddev, &info);
2745 goto done_unlock;
2747 case HOT_REMOVE_DISK:
2748 err = hot_remove_disk(mddev, (kdev_t)arg);
2749 goto done_unlock;
2751 case HOT_ADD_DISK:
2752 err = hot_add_disk(mddev, (kdev_t)arg);
2753 goto done_unlock;
2755 case SET_DISK_INFO:
2756 err = set_disk_info(mddev, (void *)arg);
2757 goto done_unlock;
2759 case WRITE_RAID_INFO:
2760 err = write_raid_info(mddev);
2761 goto done_unlock;
2763 case UNPROTECT_ARRAY:
2764 err = unprotect_array(mddev);
2765 goto done_unlock;
2767 case PROTECT_ARRAY:
2768 err = protect_array(mddev);
2769 goto done_unlock;
2771 case SET_DISK_FAULTY:
2772 err = set_disk_faulty(mddev, (kdev_t)arg);
2773 goto done_unlock;
2775 case RUN_ARRAY:
2777 /* The data is never used....
2778 mdu_param_t param;
2779 err = md_copy_from_user(&param, (mdu_param_t *)arg,
2780 sizeof(param));
2781 if (err)
2782 goto abort_unlock;
2784 err = do_md_run (mddev);
2786 * we have to clean up the mess if
2787 * the array cannot be run for some
2788 * reason ...
2790 if (err) {
2791 mddev->sb_dirty = 0;
2792 if (!do_md_stop (mddev, 0))
2793 mddev = NULL;
2795 goto done_unlock;
2798 default:
printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ioctls.\n", current->comm, current->pid);
2800 err = -EINVAL;
2801 goto abort_unlock;
2804 done_unlock:
2805 abort_unlock:
2806 if (mddev)
2807 unlock_mddev(mddev);
2809 return err;
2810 done:
2811 if (err)
2812 printk("huh12?\n");
2813 abort:
2814 return err;
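/*
* User-space view of the ioctl interface handled above -- a
* hedged sketch, not part of the driver; the device name and
* field values are made up. A raidtools-style program drives
* an array roughly like:
*
* int fd = open("/dev/md0", O_RDWR);
* mdu_array_info_t info = { ... };
* mdu_disk_info_t dinfo = { ... };
* ioctl(fd, SET_ARRAY_INFO, &info); // create the superblock
* ioctl(fd, ADD_NEW_DISK, &dinfo); // once per component disk
* ioctl(fd, RUN_ARRAY, 0); // start it (arg unused)
*
* GET_ARRAY_INFO/GET_DISK_INFO query a running array, and
* STOP_ARRAY shuts it down.
*/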
2817 static int md_open (struct inode *inode, struct file *file)
2820 * Always succeed
2822 return (0);
2825 static struct block_device_operations md_fops=
2827 open: md_open,
2828 ioctl: md_ioctl,
2832 int md_thread(void * arg)
2834 mdk_thread_t *thread = arg;
2836 md_lock_kernel();
2837 exit_mm(current);
2838 exit_files(current);
2839 exit_fs(current);
2842 * Detach thread
2844 daemonize();
sprintf(current->comm, "%s", thread->name);
2846 md_init_signals();
2847 md_flush_signals();
2848 thread->tsk = current;
* md_thread is a 'system-thread', its priority should be very
* high. We avoid resource deadlocks individually in each
* raid personality. (RAID5 does preallocation.) The thread runs
* at the highest non-realtime priority (nice -20, set below),
* so it cannot get into a priority inversion deadlock.
*
* we definitely have to have equal or higher priority than
* bdflush, otherwise bdflush will deadlock if there are too
* many dirty RAID5 blocks.
2861 current->policy = SCHED_OTHER;
2862 current->nice = -20;
2863 // md_unlock_kernel();
2865 up(thread->sem);
2867 for (;;) {
2868 DECLARE_WAITQUEUE(wait, current);
2870 add_wait_queue(&thread->wqueue, &wait);
2871 set_task_state(current, TASK_INTERRUPTIBLE);
2872 if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2873 dprintk("thread %p went to sleep.\n", thread);
2874 schedule();
2875 dprintk("thread %p woke up.\n", thread);
2877 current->state = TASK_RUNNING;
2878 remove_wait_queue(&thread->wqueue, &wait);
2879 clear_bit(THREAD_WAKEUP, &thread->flags);
2881 if (thread->run) {
2882 thread->run(thread->data);
2883 run_task_queue(&tq_disk);
2884 } else
2885 break;
2886 if (md_signal_pending(current)) {
2887 printk("%8s(%d) flushing signals.\n", current->comm,
2888 current->pid);
2889 md_flush_signals();
2892 up(thread->sem);
2893 return 0;
2896 void md_wakeup_thread(mdk_thread_t *thread)
2898 dprintk("waking up MD thread %p.\n", thread);
2899 set_bit(THREAD_WAKEUP, &thread->flags);
2900 wake_up(&thread->wqueue);
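/*
* md_register_thread() below synchronizes with the new thread
* through a locked semaphore: the caller down()s it right after
* kernel_thread() and is only woken once md_thread() has
* daemonized and stored its task_struct, so a non-NULL return
* always means a fully set-up thread.
*/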
2903 mdk_thread_t *md_register_thread (void (*run) (void *),
2904 void *data, const char *name)
2906 mdk_thread_t *thread;
2907 int ret;
2908 DECLARE_MUTEX_LOCKED(sem);
thread = (mdk_thread_t *) kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
2912 if (!thread)
2913 return NULL;
2915 memset(thread, 0, sizeof(mdk_thread_t));
2916 md_init_waitqueue_head(&thread->wqueue);
2918 thread->sem = &sem;
2919 thread->run = run;
2920 thread->data = data;
2921 thread->name = name;
2922 ret = kernel_thread(md_thread, thread, 0);
2923 if (ret < 0) {
2924 kfree(thread);
2925 return NULL;
2927 down(&sem);
2928 return thread;
2931 void md_interrupt_thread (mdk_thread_t *thread)
2933 if (!thread->tsk) {
2934 MD_BUG();
2935 return;
2937 printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2938 send_sig(SIGKILL, thread->tsk, 1);
2941 void md_unregister_thread (mdk_thread_t *thread)
2943 DECLARE_MUTEX_LOCKED(sem);
2945 thread->sem = &sem;
2946 thread->run = NULL;
2947 thread->name = NULL;
2948 if (!thread->tsk) {
2949 MD_BUG();
2950 return;
2952 md_interrupt_thread(thread);
2953 down(&sem);
2956 void md_recover_arrays (void)
2958 if (!md_recovery_thread) {
2959 MD_BUG();
2960 return;
2962 md_wakeup_thread(md_recovery_thread);
2966 int md_error (kdev_t dev, kdev_t rdev)
2968 mddev_t *mddev;
2969 mdk_rdev_t * rrdev;
2970 int rc;
2972 mddev = kdev_to_mddev(dev);
2973 /* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
2975 if (!mddev) {
2976 MD_BUG();
2977 return 0;
2979 rrdev = find_rdev(mddev, rdev);
2980 mark_rdev_faulty(rrdev);
2982 * if recovery was running, stop it now.
2984 if (mddev->pers->stop_resync)
2985 mddev->pers->stop_resync(mddev);
2986 if (mddev->recovery_running)
2987 md_interrupt_thread(md_recovery_thread);
2988 if (mddev->pers->error_handler) {
2989 rc = mddev->pers->error_handler(mddev, rdev);
2990 md_recover_arrays();
2991 return rc;
2993 return 0;
2996 static int status_unused (char * page)
2998 int sz = 0, i = 0;
2999 mdk_rdev_t *rdev;
3000 struct md_list_head *tmp;
3002 sz += sprintf(page + sz, "unused devices: ");
3004 ITERATE_RDEV_ALL(rdev,tmp) {
3005 if (!rdev->same_set.next && !rdev->same_set.prev) {
3007 * The device is not yet used by any array.
3009 i++;
3010 sz += sprintf(page + sz, "%s ",
3011 partition_name(rdev->dev));
3014 if (!i)
3015 sz += sprintf(page + sz, "<none>");
3017 sz += sprintf(page + sz, "\n");
3018 return sz;
3022 static int status_resync (char * page, mddev_t * mddev)
3024 int sz = 0;
3025 unsigned long max_blocks, resync, res, dt, db, rt;
3027 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
3028 max_blocks = mddev->sb->size;
3031 * Should not happen.
3033 if (!max_blocks) {
3034 MD_BUG();
3035 return 0;
3037 res = (resync/1024)*1000/(max_blocks/1024 + 1);
3039 int i, x = res/50, y = 20-x;
3040 sz += sprintf(page + sz, "[");
3041 for (i = 0; i < x; i++)
3042 sz += sprintf(page + sz, "=");
3043 sz += sprintf(page + sz, ">");
3044 for (i = 0; i < y; i++)
3045 sz += sprintf(page + sz, ".");
3046 sz += sprintf(page + sz, "] ");
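/*
* Worked example (hypothetical progress): at 42.7% done, res is
* 427 per mille, so x = 427/50 = 8 and y = 12, and the bar
* prints as [========>............] -- 8 '=', one '>', 12 '.'.
*/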
3048 if (!mddev->recovery_running)
3050 * true resync
3052 sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
3053 res/10, res % 10, resync, max_blocks);
3054 else
3056 * recovery ...
3058 sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
3059 res/10, res % 10, resync, max_blocks);
3062 * We do not want to overflow, so the order of operands and
3063 * the * 100 / 100 trick are important. We do a +1 to be
3064 * safe against division by zero. We only estimate anyway.
3066 * dt: time from mark until now
3067 * db: blocks written from mark until now
3068 * rt: remaining time
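*
* Worked example (hypothetical numbers): dt = 30 seconds,
* db = 60000 blocks synced in that time (2000 KB/sec), and
* 1200000 blocks still to go: rt = (30 * (1200000/601))/100
* = 598 seconds, close to the exact 600, with every
* intermediate product fitting easily in an unsigned long.
*/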
3070 dt = ((jiffies - mddev->resync_mark) / HZ);
3071 if (!dt) dt++;
3072 db = resync - mddev->resync_mark_cnt;
3073 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3075 sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3077 sz += sprintf(page + sz, " speed=%ldK/sec", db/dt);
3079 return sz;
3082 static int md_status_read_proc(char *page, char **start, off_t off,
3083 int count, int *eof, void *data)
3085 int sz = 0, j, size;
3086 struct md_list_head *tmp, *tmp2;
3087 mdk_rdev_t *rdev;
3088 mddev_t *mddev;
3090 sz += sprintf(page + sz, "Personalities : ");
3091 for (j = 0; j < MAX_PERSONALITY; j++)
3092 if (pers[j])
3093 sz += sprintf(page+sz, "[%s] ", pers[j]->name);
3095 sz += sprintf(page+sz, "\n");
3098 sz += sprintf(page+sz, "read_ahead ");
3099 if (read_ahead[MD_MAJOR] == INT_MAX)
3100 sz += sprintf(page+sz, "not set\n");
3101 else
3102 sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
3104 ITERATE_MDDEV(mddev,tmp) {
3105 sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
3106 mddev->pers ? "" : "in");
3107 if (mddev->pers) {
3108 if (mddev->ro)
3109 sz += sprintf(page + sz, " (read-only)");
3110 sz += sprintf(page + sz, " %s", mddev->pers->name);
3113 size = 0;
3114 ITERATE_RDEV(mddev,rdev,tmp2) {
3115 sz += sprintf(page + sz, " %s[%d]",
3116 partition_name(rdev->dev), rdev->desc_nr);
3117 if (rdev->faulty) {
3118 sz += sprintf(page + sz, "(F)");
3119 continue;
3121 size += rdev->size;
3124 if (mddev->nb_dev) {
3125 if (mddev->pers)
3126 sz += sprintf(page + sz, "\n %d blocks",
3127 md_size[mdidx(mddev)]);
3128 else
3129 sz += sprintf(page + sz, "\n %d blocks", size);
3132 if (!mddev->pers) {
3133 sz += sprintf(page+sz, "\n");
3134 continue;
3137 sz += mddev->pers->status (page+sz, mddev);
3139 sz += sprintf(page+sz, "\n ");
3140 if (mddev->curr_resync) {
3141 sz += status_resync (page+sz, mddev);
3142 } else {
3143 if (md_atomic_read(&mddev->resync_sem.count) != 1)
3144 sz += sprintf(page + sz, " resync=DELAYED");
3146 sz += sprintf(page + sz, "\n");
3148 sz += status_unused (page + sz);
3150 return sz;
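/*
* Sample of the /proc/mdstat output assembled above, for a
* hypothetical two-disk RAID1 that is resyncing (all numbers
* illustrative only):
*
* Personalities : [raid1]
* read_ahead 1024 sectors
* md0 : active raid1 sdb1[1] sda1[0]
* 2097152 blocks [2/2] [UU]
* [====>................] resync = 24.9% (524288/2097152) finish=2.6min speed=10000K/sec
* unused devices: <none>
*/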
3153 int register_md_personality (int pnum, mdk_personality_t *p)
3155 if (pnum >= MAX_PERSONALITY)
3156 return -EINVAL;
3158 if (pers[pnum])
3159 return -EBUSY;
3161 pers[pnum] = p;
3162 printk(KERN_INFO "%s personality registered\n", p->name);
3163 return 0;
3166 int unregister_md_personality (int pnum)
3168 if (pnum >= MAX_PERSONALITY)
3169 return -EINVAL;
if (!pers[pnum])
return -EINVAL;
printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
3172 pers[pnum] = NULL;
3173 return 0;
3176 static mdp_disk_t *get_spare(mddev_t *mddev)
3178 mdp_super_t *sb = mddev->sb;
3179 mdp_disk_t *disk;
3180 mdk_rdev_t *rdev;
3181 struct md_list_head *tmp;
3183 ITERATE_RDEV(mddev,rdev,tmp) {
3184 if (rdev->faulty)
3185 continue;
3186 if (!rdev->sb) {
3187 MD_BUG();
3188 continue;
3190 disk = &sb->disks[rdev->desc_nr];
3191 if (disk_faulty(disk)) {
3192 MD_BUG();
3193 continue;
3195 if (disk_active(disk))
3196 continue;
3197 return disk;
3199 return NULL;
3202 static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
3203 void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
3205 unsigned int major = MAJOR(dev);
3206 unsigned int index;
3208 index = disk_index(dev);
3209 if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3210 return;
3212 sync_io[major][index] += nr_sectors;
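/*
* md_sync_acct() feeds the idle heuristic below: for every
* component disk, is_mddev_idle() takes the kstat read+write
* block counts, subtracts the I/O we generated ourselves
* (sync_io), and compares the remainder with the value seen on
* the previous call -- any change means some other user is
* hitting the disk, and md_do_sync() then throttles back
* towards sysctl_speed_limit_min.
*/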
3215 static int is_mddev_idle (mddev_t *mddev)
3217 mdk_rdev_t * rdev;
3218 struct md_list_head *tmp;
3219 int idle;
3220 unsigned long curr_events;
3222 idle = 1;
3223 ITERATE_RDEV(mddev,rdev,tmp) {
3224 int major = MAJOR(rdev->dev);
3225 int idx = disk_index(rdev->dev);
3227 if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3228 continue;
3230 curr_events = kstat.dk_drive_rblk[major][idx] +
3231 kstat.dk_drive_wblk[major][idx] ;
3232 curr_events -= sync_io[major][idx];
3233 // printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3234 if (curr_events != rdev->last_events) {
3235 // printk("!I(%ld)", curr_events - rdev->last_events);
3236 rdev->last_events = curr_events;
3237 idle = 0;
3240 return idle;
3243 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3245 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3247 /* another "blocks" (1K) blocks have been synced */
3248 atomic_sub(blocks, &mddev->recovery_active);
3249 wake_up(&mddev->recovery_wait);
3250 if (!ok) {
3251 // stop recovery, signal do_sync ....
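/*
* mddev->recovery_active counts blocks that have been handed to
* the personality's sync_request() but not yet acknowledged
* here; md_do_sync() adds to it as requests are issued and
* finally wait_event()s for it to drain to zero before
* declaring the resync complete.
*/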
3255 #define SYNC_MARKS 10
3256 #define SYNC_MARK_STEP (3*HZ)
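/*
* The mark ring below holds SYNC_MARKS snapshots taken
* SYNC_MARK_STEP apart, so resync_mark/resync_mark_cnt always
* describe the state from roughly 30 seconds ago and the speed
* shown in /proc/mdstat is a sliding average over that window.
*/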
3257 int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
3259 mddev_t *mddev2;
3260 unsigned int max_blocks, currspeed,
3261 j, window, err, serialize;
3262 kdev_t read_disk = mddev_to_kdev(mddev);
3263 unsigned long mark[SYNC_MARKS];
3264 unsigned long mark_cnt[SYNC_MARKS];
3265 int last_mark,m;
3266 struct md_list_head *tmp;
3267 unsigned long last_check;
3270 err = down_interruptible(&mddev->resync_sem);
3271 if (err)
3272 goto out_nolock;
3274 recheck:
3275 serialize = 0;
3276 ITERATE_MDDEV(mddev2,tmp) {
3277 if (mddev2 == mddev)
3278 continue;
3279 if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
3280 printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
3281 serialize = 1;
3282 break;
3285 if (serialize) {
3286 interruptible_sleep_on(&resync_wait);
3287 if (md_signal_pending(current)) {
3288 md_flush_signals();
3289 err = -EINTR;
3290 goto out;
3292 goto recheck;
3295 mddev->curr_resync = 1;
3297 max_blocks = mddev->sb->size;
3299 printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
3300 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3301 sysctl_speed_limit_min);
printk(KERN_INFO "md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max);
3305 * Resync has low priority.
3307 current->nice = 19;
3309 is_mddev_idle(mddev); /* this also initializes IO event counters */
3310 for (m = 0; m < SYNC_MARKS; m++) {
3311 mark[m] = jiffies;
3312 mark_cnt[m] = 0;
3314 last_mark = 0;
3315 mddev->resync_mark = mark[last_mark];
3316 mddev->resync_mark_cnt = mark_cnt[last_mark];
3319 * Tune reconstruction:
3321 window = MAX_READAHEAD*(PAGE_SIZE/1024);
3322 printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks);
3324 atomic_set(&mddev->recovery_active, 0);
3325 init_waitqueue_head(&mddev->recovery_wait);
3326 last_check = 0;
3327 for (j = 0; j < max_blocks;) {
3328 int blocks;
3330 blocks = mddev->pers->sync_request(mddev, j);
3332 if (blocks < 0) {
3333 err = blocks;
3334 goto out;
3336 atomic_add(blocks, &mddev->recovery_active);
3337 j += blocks;
3338 mddev->curr_resync = j;
3340 if (last_check + window > j)
3341 continue;
last_check = j;
run_task_queue(&tq_disk); //??
3345 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
3346 /* step marks */
3347 int next = (last_mark+1) % SYNC_MARKS;
3349 mddev->resync_mark = mark[next];
3350 mddev->resync_mark_cnt = mark_cnt[next];
3351 mark[next] = jiffies;
3352 mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3353 last_mark = next;
3357 if (md_signal_pending(current)) {
3359 * got a signal, exit.
3361 mddev->curr_resync = 0;
3362 printk("md_do_sync() got signal ... exiting\n");
3363 md_flush_signals();
3364 err = -EINTR;
3365 goto out;
* this loop only exits when either we are slower than
3370 * the 'hard' speed limit, or the system was IO-idle for
3371 * a jiffy.
3372 * the system might be non-idle CPU-wise, but we only care
3373 * about not overloading the IO subsystem. (things like an
3374 * e2fsck being done on the RAID array should execute fast)
3376 repeat:
3377 if (md_need_resched(current))
3378 schedule();
3380 currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1;
3382 if (currspeed > sysctl_speed_limit_min) {
3383 current->nice = 19;
3385 if ((currspeed > sysctl_speed_limit_max) ||
3386 !is_mddev_idle(mddev)) {
3387 current->state = TASK_INTERRUPTIBLE;
3388 md_schedule_timeout(HZ/4);
3389 if (!md_signal_pending(current))
3390 goto repeat;
3392 } else
3393 current->nice = -20;
3395 fsync_dev(read_disk);
3396 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3397 err = 0;
3399 * this also signals 'finished resyncing' to md_stop
3401 out:
3402 wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
3403 up(&mddev->resync_sem);
3404 out_nolock:
3405 mddev->curr_resync = 0;
3406 wake_up(&resync_wait);
3407 return err;
3412 * This is a kernel thread which syncs a spare disk with the active array
3414 * the amount of foolproofing might seem to be a tad excessive, but an
3415 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3416 * of my root partition with the first 0.5 gigs of my /home partition ... so
* I'm a bit nervous ;)
3419 void md_do_recovery (void *data)
3421 int err;
3422 mddev_t *mddev;
3423 mdp_super_t *sb;
3424 mdp_disk_t *spare;
3425 struct md_list_head *tmp;
3427 printk(KERN_INFO "md: recovery thread got woken up ...\n");
3428 restart:
3429 ITERATE_MDDEV(mddev,tmp) {
3430 sb = mddev->sb;
3431 if (!sb)
3432 continue;
3433 if (mddev->recovery_running)
3434 continue;
3435 if (sb->active_disks == sb->raid_disks)
3436 continue;
3437 if (!sb->spare_disks) {
3438 printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
3439 continue;
3442 * now here we get the spare and resync it.
3444 if ((spare = get_spare(mddev)) == NULL)
3445 continue;
3446 printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3447 if (!mddev->pers->diskop)
3448 continue;
3449 if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
3450 continue;
3451 down(&mddev->recovery_sem);
3452 mddev->recovery_running = 1;
3453 err = md_do_sync(mddev, spare);
3454 if (err == -EIO) {
3455 printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3456 if (!disk_faulty(spare)) {
3457 mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
3458 mark_disk_faulty(spare);
3459 mark_disk_nonsync(spare);
3460 mark_disk_inactive(spare);
3461 sb->spare_disks--;
3462 sb->working_disks--;
3463 sb->failed_disks++;
3465 } else
3466 if (disk_faulty(spare))
3467 mddev->pers->diskop(mddev, &spare,
3468 DISKOP_SPARE_INACTIVE);
3469 if (err == -EINTR || err == -ENOMEM) {
3471 * Recovery got interrupted, or ran out of mem ...
3472 * signal back that we have finished using the array.
3474 mddev->pers->diskop(mddev, &spare,
3475 DISKOP_SPARE_INACTIVE);
3476 up(&mddev->recovery_sem);
3477 mddev->recovery_running = 0;
3478 continue;
3479 } else {
3480 mddev->recovery_running = 0;
3481 up(&mddev->recovery_sem);
3483 if (!disk_faulty(spare)) {
3485 * the SPARE_ACTIVE diskop possibly changes the
3486 * pointer too
3488 mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
3489 mark_disk_sync(spare);
3490 mark_disk_active(spare);
3491 sb->active_disks++;
3492 sb->spare_disks--;
3494 mddev->sb_dirty = 1;
3495 md_update_sb(mddev);
3496 goto restart;
3498 printk(KERN_INFO "md: recovery thread finished ...\n");
3502 int md_notify_reboot(struct notifier_block *this,
3503 unsigned long code, void *x)
3505 struct md_list_head *tmp;
3506 mddev_t *mddev;
3508 if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3509 || (code == MD_SYS_POWER_OFF)) {
printk(KERN_INFO "md: stopping all md devices.\n");
3513 ITERATE_MDDEV(mddev,tmp)
3514 do_md_stop (mddev, 1);
3516 * certain more exotic SCSI devices are known to be
3517 * volatile wrt too early system reboots. While the
3518 * right place to handle this issue is the given
3519 * driver, we do want to have a safe RAID driver ...
3521 md_mdelay(1000*1);
3523 return NOTIFY_DONE;
3526 struct notifier_block md_notifier = {
3527 md_notify_reboot,
3528 NULL,
3531 #ifndef MODULE
3532 static int md__init raid_setup(char *str)
3534 int len, pos;
3536 len = strlen(str) + 1;
3537 pos = 0;
3539 while (pos < len) {
3540 char *comma = strchr(str+pos, ',');
3541 int wlen;
3542 if (comma)
3543 wlen = (comma-str)-pos;
3544 else wlen = (len-1)-pos;
if (strncmp(str+pos, "noautodetect", wlen) == 0)
3547 raid_setup_args.noautodetect = 1;
3548 pos += wlen+1;
3550 raid_setup_args.set = 1;
3551 return 1;
3553 __setup("raid=", raid_setup);
3554 #endif
3555 static void md_geninit (void)
3557 int i;
3559 for(i = 0; i < MAX_MD_DEVS; i++) {
3560 md_blocksizes[i] = 1024;
3561 md_size[i] = 0;
3562 md_hardsect_sizes[i] = 512;
3563 md_maxreadahead[i] = MD_READAHEAD;
3564 register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0);
3566 blksize_size[MAJOR_NR] = md_blocksizes;
3567 blk_size[MAJOR_NR] = md_size;
3568 max_readahead[MAJOR_NR] = md_maxreadahead;
3569 hardsect_size[MAJOR_NR] = md_hardsect_sizes;
3571 printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3573 #ifdef CONFIG_PROC_FS
3574 create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
3575 #endif
3577 void hsm_init (void);
3578 void translucent_init (void);
3579 void linear_init (void);
3580 void raid0_init (void);
3581 void raid1_init (void);
3582 void raid5_init (void);
3584 int md__init md_init (void)
3586 static char * name = "mdrecoveryd";
3588 printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
3589 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3590 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
3592 if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
3594 printk (KERN_ALERT "Unable to get major %d for md\n", MAJOR_NR);
3595 return (-1);
3597 devfs_handle = devfs_mk_dir (NULL, "md", NULL);
3598 devfs_register_series (devfs_handle, "%u",MAX_MD_DEVS,DEVFS_FL_DEFAULT,
3599 MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR,
3600 &md_fops, NULL);
/* forward all md requests to md_make_request */
3603 blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
3606 read_ahead[MAJOR_NR] = INT_MAX;
3607 md_gendisk.next = gendisk_head;
3609 gendisk_head = &md_gendisk;
3611 md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3612 if (!md_recovery_thread)
3613 printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
3615 md_register_reboot_notifier(&md_notifier);
3616 raid_table_header = register_sysctl_table(raid_root_table, 1);
3618 #ifdef CONFIG_MD_LINEAR
3619 linear_init ();
3620 #endif
3621 #ifdef CONFIG_MD_RAID0
3622 raid0_init ();
3623 #endif
3624 #ifdef CONFIG_MD_RAID1
3625 raid1_init ();
3626 #endif
3627 #ifdef CONFIG_MD_RAID5
3628 raid5_init ();
3629 #endif
3630 md_geninit();
3631 return (0);
3634 #ifdef CONFIG_MD_BOOT
3635 #define MAX_MD_BOOT_DEVS 8
3636 struct {
3637 unsigned long set;
3638 int pers[MAX_MD_BOOT_DEVS];
3639 int chunk[MAX_MD_BOOT_DEVS];
3640 kdev_t devices[MAX_MD_BOOT_DEVS][MAX_REAL];
3641 } md_setup_args md__initdata;
* Parse the command-line parameters given to our kernel, but do
* not actually try to invoke the MD device now; that is handled
* by md_setup_drive after the low-level disk drivers have
* initialised.
3648 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3649 * assigns the task of parsing integer arguments to the
3650 * invoked program now). Added ability to initialise all
3651 * the MD devices (by specifying multiple "md=" lines)
3652 * instead of just one. -- KTK
* 18May2000: Added support for persistent-superblock arrays:
3654 * md=n,0,factor,fault,device-list uses RAID0 for device n
3655 * md=n,-1,factor,fault,device-list uses LINEAR for device n
3656 * md=n,device-list reads a RAID superblock from the devices
3657 * elements in device-list are read by name_to_kdev_t so can be
3658 * a hex number or something like /dev/hda1 /dev/sdb
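*
* Hypothetical examples of the syntax above (device names are
* made up):
*
* md=0,0,4,0,/dev/hda1,/dev/hdb1 RAID0 md0 from two disks,
* chunk size 1 << (4+12) = 64k
* md=1,-1,0,0,/dev/hdc1,/dev/hdd1 LINEAR md1
* md=2,/dev/sde1,/dev/sdf1 md2 assembled from the RAID
* superblocks on the devices
*/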
3660 extern kdev_t name_to_kdev_t(char *line) md__init;
3661 static int md__init md_setup(char *str)
3663 int minor, level, factor, fault, i=0;
3664 kdev_t device;
3665 char *devnames, *pername = "";
3667 if(get_option(&str, &minor) != 2) { /* MD Number */
3668 printk("md: Too few arguments supplied to md=.\n");
3669 return 0;
3671 if (minor >= MAX_MD_BOOT_DEVS) {
3672 printk ("md: Minor device number too high.\n");
3673 return 0;
3674 } else if (md_setup_args.set & (1 << minor)) {
3675 printk ("md: Warning - md=%d,... has been specified twice;\n"
3676 " will discard the first definition.\n", minor);
3678 switch(get_option(&str, &level)) { /* RAID Personality */
3679 case 2: /* could be 0 or -1.. */
3680 if (level == 0 || level == -1) {
3681 if (get_option(&str, &factor) != 2 || /* Chunk Size */
3682 get_option(&str, &fault) != 2) {
3683 printk("md: Too few arguments supplied to md=.\n");
3684 return 0;
3686 md_setup_args.pers[minor] = level;
3687 md_setup_args.chunk[minor] = 1 << (factor+12);
3688 switch(level) {
3689 case -1:
3690 level = LINEAR;
3691 pername = "linear";
3692 break;
3693 case 0:
3694 level = RAID0;
3695 pername = "raid0";
3696 break;
3697 default:
3698 printk ("md: The kernel has not been configured for raid%d"
3699 " support!\n", level);
3700 return 0;
3702 md_setup_args.pers[minor] = level;
3703 break;
3705 /* FALL THROUGH */
3706 case 1: /* the first device is numeric */
3707 md_setup_args.devices[minor][i++] = level;
3708 /* FALL THROUGH */
3709 case 0:
3710 md_setup_args.pers[minor] = 0;
3711 pername="super-block";
3713 devnames = str;
3714 for (; i<MAX_REAL && str; i++) {
3715 if ((device = name_to_kdev_t(str))) {
3716 md_setup_args.devices[minor][i] = device;
3717 } else {
3718 printk ("md: Unknown device name, %s.\n", str);
3719 return 0;
3721 if ((str = strchr(str, ',')) != NULL)
3722 str++;
3724 if (!i) {
3725 printk ("md: No devices specified for md%d?\n", minor);
3726 return 0;
3729 printk ("md: Will configure md%d (%s) from %s, below.\n",
3730 minor, pername, devnames);
3731 md_setup_args.devices[minor][i] = (kdev_t) 0;
3732 md_setup_args.set |= (1 << minor);
3733 return 1;
3736 void md__init md_setup_drive(void)
3738 int minor, i;
3739 kdev_t dev;
mddev_t *mddev;
3742 for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) {
3743 mdu_disk_info_t dinfo;
3744 int err=0;
3745 if (!(md_setup_args.set & (1 << minor)))
3746 continue;
3747 printk("md: Loading md%d.\n", minor);
3748 mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3749 if (md_setup_args.pers[minor]) {
3750 /* non-persistent */
3751 mdu_array_info_t ainfo;
3752 ainfo.level = pers_to_level(md_setup_args.pers[minor]);
3753 ainfo.size = 0;
ainfo.nr_disks = 0;
ainfo.raid_disks = 0;
ainfo.md_minor = minor;
3757 ainfo.not_persistent = 1;
3759 ainfo.state = MD_SB_CLEAN;
3760 ainfo.active_disks = 0;
3761 ainfo.working_disks = 0;
3762 ainfo.failed_disks = 0;
3763 ainfo.spare_disks = 0;
3764 ainfo.layout = 0;
3765 ainfo.chunk_size = md_setup_args.chunk[minor];
3766 err = set_array_info(mddev, &ainfo);
3767 for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) {
3768 dinfo.number = i;
3769 dinfo.raid_disk = i;
3770 dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
3771 dinfo.major = MAJOR(dev);
3772 dinfo.minor = MINOR(dev);
3773 mddev->sb->nr_disks++;
3774 mddev->sb->raid_disks++;
3775 mddev->sb->active_disks++;
3776 mddev->sb->working_disks++;
3777 err = add_new_disk (mddev, &dinfo);
3779 } else {
3780 /* persistent */
3781 for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) {
3782 dinfo.major = MAJOR(dev);
3783 dinfo.minor = MINOR(dev);
3784 add_new_disk (mddev, &dinfo);
3787 if (!err)
3788 err = do_md_run(mddev);
3789 if (err) {
3790 mddev->sb_dirty = 0;
3791 do_md_stop(mddev, 0);
3792 printk("md: starting md%d failed\n", minor);
3797 __setup("md=", md_setup);
3798 #endif
3800 #ifdef MODULE
3801 int init_module (void)
3803 return md_init();
3806 static void free_device_names(void)
3808 while (device_names.next != &device_names) {
3809 struct list_head *tmp = device_names.next;
3810 list_del(tmp);
3811 kfree(tmp);
3816 void cleanup_module (void)
3818 struct gendisk **gendisk_ptr;
3820 md_unregister_thread(md_recovery_thread);
3821 devfs_unregister(devfs_handle);
3823 devfs_unregister_blkdev(MAJOR_NR,"md");
3824 unregister_reboot_notifier(&md_notifier);
3825 unregister_sysctl_table(raid_table_header);
3826 #ifdef CONFIG_PROC_FS
3827 remove_proc_entry("mdstat", NULL);
3828 #endif
3830 gendisk_ptr = &gendisk_head;
3831 while (*gendisk_ptr) {
3832 if (*gendisk_ptr == &md_gendisk) {
3833 *gendisk_ptr = md_gendisk.next;
3834 break;
3836 gendisk_ptr = & (*gendisk_ptr)->next;
3838 blk_dev[MAJOR_NR].queue = NULL;
3839 blksize_size[MAJOR_NR] = NULL;
3840 blk_size[MAJOR_NR] = NULL;
3841 max_readahead[MAJOR_NR] = NULL;
3842 hardsect_size[MAJOR_NR] = NULL;
3844 free_device_names();
3847 #endif
3849 MD_EXPORT_SYMBOL(md_size);
3850 MD_EXPORT_SYMBOL(register_md_personality);
3851 MD_EXPORT_SYMBOL(unregister_md_personality);
3852 MD_EXPORT_SYMBOL(partition_name);
3853 MD_EXPORT_SYMBOL(md_error);
3854 MD_EXPORT_SYMBOL(md_do_sync);
3855 MD_EXPORT_SYMBOL(md_sync_acct);
3856 MD_EXPORT_SYMBOL(md_done_sync);
3857 MD_EXPORT_SYMBOL(md_recover_arrays);
3858 MD_EXPORT_SYMBOL(md_register_thread);
3859 MD_EXPORT_SYMBOL(md_unregister_thread);
3860 MD_EXPORT_SYMBOL(md_update_sb);
3861 MD_EXPORT_SYMBOL(md_wakeup_thread);
3862 MD_EXPORT_SYMBOL(md_print_devices);
3863 MD_EXPORT_SYMBOL(find_rdev_nr);
3864 MD_EXPORT_SYMBOL(md_interrupt_thread);
3865 MD_EXPORT_SYMBOL(mddev_map);
3866 MD_EXPORT_SYMBOL(md_check_ordering);