Linux 2.4.0-test10pre4
[davej-history.git] / drivers / md / md.c
blob: c89314c788efbf59cdaa34c08cd8221b8039f874
1 /*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 completely rewritten, based on the MD driver code from Marc Zyngier
7 Changes:
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
11 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
12 - kmod support by: Cyrus Durgin
13 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
14 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 - lots of fixes and improvements to the RAID1/RAID5 and generic
17 RAID code (such as request based resynchronization):
19 Neil Brown <neilb@cse.unsw.edu.au>.
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2, or (at your option)
24 any later version.
26 You should have received a copy of the GNU General Public License
27 (for example /usr/src/linux/COPYING); if not, write to the Free
28 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31 #include <linux/module.h>
32 #include <linux/config.h>
33 #include <linux/raid/md.h>
34 #include <linux/raid/xor.h>
35 #include <linux/devfs_fs_kernel.h>
37 #include <linux/init.h>
39 #ifdef CONFIG_KMOD
40 #include <linux/kmod.h>
41 #endif
43 #define __KERNEL_SYSCALLS__
44 #include <linux/unistd.h>
46 #include <asm/unaligned.h>
48 extern asmlinkage int sys_sched_yield(void);
49 extern asmlinkage long sys_setsid(void);
51 #define MAJOR_NR MD_MAJOR
52 #define MD_DRIVER
54 #include <linux/blk.h>
56 #define DEBUG 0
57 #if DEBUG
58 # define dprintk(x...) printk(x)
59 #else
60 # define dprintk(x...) do { } while(0)
61 #endif
63 static mdk_personality_t *pers[MAX_PERSONALITY];
66 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
67 * is 100 KB/sec, so the extra system load does not show up that much.
68 * Increase it if you want to have more _guaranteed_ speed. Note that
69 * the RAID driver will use the maximum available bandwidth if the IO
70 * subsystem is idle. There is also an 'absolute maximum' reconstruction
71 * speed limit - in case reconstruction slows down your system despite
72 * idle IO detection.
74 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
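 *
 * For example (illustrative only, assuming procfs is mounted at /proc),
 * the two limits registered below can be tuned at runtime with:
 *
 *	echo 1000   > /proc/sys/dev/raid/speed_limit_min
 *	echo 200000 > /proc/sys/dev/raid/speed_limit_max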
77 static int sysctl_speed_limit_min = 100;
78 static int sysctl_speed_limit_max = 100000;
80 static struct ctl_table_header *raid_table_header;
82 static ctl_table raid_table[] = {
83 {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
84 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
85 {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
86 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
87 {0}
90 static ctl_table raid_dir_table[] = {
91 {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
92 {0}
95 static ctl_table raid_root_table[] = {
96 {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
97 {0}
101 * these have to be allocated separately because external
102 * subsystems want to have a pre-defined structure
104 struct hd_struct md_hd_struct[MAX_MD_DEVS];
105 static int md_blocksizes[MAX_MD_DEVS];
106 static int md_hardsect_sizes[MAX_MD_DEVS];
107 static int md_maxreadahead[MAX_MD_DEVS];
108 static mdk_thread_t *md_recovery_thread;
110 int md_size[MAX_MD_DEVS];
112 extern struct block_device_operations md_fops;
113 static devfs_handle_t devfs_handle;
115 static struct gendisk md_gendisk=
117 major: MD_MAJOR,
118 major_name: "md",
119 minor_shift: 0,
120 max_p: 1,
121 part: md_hd_struct,
122 sizes: md_size,
123 nr_real: MAX_MD_DEVS,
124 real_devices: NULL,
125 next: NULL,
126 fops: &md_fops,
130 * Enables iteration over all existing md arrays
132 static MD_LIST_HEAD(all_mddevs);
135 * The mapping between kdev and mddev is not necessarily a simple
136 * one! Eg. HSM uses several sub-devices to implement Logical
137 * Volumes. All these sub-devices map to the same mddev.
139 dev_mapping_t mddev_map[MAX_MD_DEVS];
141 void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
143 unsigned int minor = MINOR(dev);
145 if (MAJOR(dev) != MD_MAJOR) {
146 MD_BUG();
147 return;
149 if (mddev_map[minor].mddev != NULL) {
150 MD_BUG();
151 return;
153 mddev_map[minor].mddev = mddev;
154 mddev_map[minor].data = data;
157 void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
159 unsigned int minor = MINOR(dev);
161 if (MAJOR(dev) != MD_MAJOR) {
162 MD_BUG();
163 return;
165 if (mddev_map[minor].mddev != mddev) {
166 MD_BUG();
167 return;
169 mddev_map[minor].mddev = NULL;
170 mddev_map[minor].data = NULL;
173 static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
175 mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
177 if (mddev && mddev->pers)
178 return mddev->pers->make_request(mddev, rw, bh);
179 else {
180 buffer_IO_error(bh);
181 return -1;
185 static mddev_t * alloc_mddev (kdev_t dev)
187 mddev_t *mddev;
189 if (MAJOR(dev) != MD_MAJOR) {
190 MD_BUG();
191 return NULL;
193 mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
194 if (!mddev)
195 return NULL;
197 memset(mddev, 0, sizeof(*mddev));
199 mddev->__minor = MINOR(dev);
200 init_MUTEX(&mddev->reconfig_sem);
201 init_MUTEX(&mddev->recovery_sem);
202 init_MUTEX(&mddev->resync_sem);
203 MD_INIT_LIST_HEAD(&mddev->disks);
204 MD_INIT_LIST_HEAD(&mddev->all_mddevs);
207 * The 'base' mddev is the one with data NULL.
208 * personalities can create additional mddevs
209 * if necessary.
211 add_mddev_mapping(mddev, dev, 0);
212 md_list_add(&mddev->all_mddevs, &all_mddevs);
214 MOD_INC_USE_COUNT;
216 return mddev;
219 struct gendisk * find_gendisk (kdev_t dev)
221 struct gendisk *tmp = gendisk_head;
223 while (tmp != NULL) {
224 if (tmp->major == MAJOR(dev))
225 return (tmp);
226 tmp = tmp->next;
228 return (NULL);
231 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
233 mdk_rdev_t * rdev;
234 struct md_list_head *tmp;
236 ITERATE_RDEV(mddev,rdev,tmp) {
237 if (rdev->desc_nr == nr)
238 return rdev;
240 return NULL;
243 mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
245 struct md_list_head *tmp;
246 mdk_rdev_t *rdev;
248 ITERATE_RDEV(mddev,rdev,tmp) {
249 if (rdev->dev == dev)
250 return rdev;
252 return NULL;
255 static MD_LIST_HEAD(device_names);
257 char * partition_name (kdev_t dev)
259 struct gendisk *hd;
260 static char nomem [] = "<nomem>";
261 dev_name_t *dname;
262 struct md_list_head *tmp = device_names.next;
264 while (tmp != &device_names) {
265 dname = md_list_entry(tmp, dev_name_t, list);
266 if (dname->dev == dev)
267 return dname->name;
268 tmp = tmp->next;
271 dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
273 if (!dname)
274 return nomem;
276 * ok, add this new device name to the list
278 hd = find_gendisk (dev);
279 dname->name = NULL;
280 if (hd)
281 dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
282 if (!dname->name) {
283 sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
284 dname->name = dname->namebuf;
287 dname->dev = dev;
288 MD_INIT_LIST_HEAD(&dname->list);
289 md_list_add(&dname->list, &device_names);
291 return dname->name;
294 static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
295 int persistent)
297 unsigned int size = 0;
299 if (blk_size[MAJOR(dev)])
300 size = blk_size[MAJOR(dev)][MINOR(dev)];
301 if (persistent)
302 size = MD_NEW_SIZE_BLOCKS(size);
303 return size;
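/*
 * Worked example (illustrative, assuming the usual 64KB reservation,
 * i.e. MD_RESERVED_BLOCKS == 64 with 1KB blocks): for a device of
 * 10000 blocks, MD_NEW_SIZE_BLOCKS(10000) = (10000 & ~63) - 64 = 9920,
 * so a persistent superblock lives in the last 64KB-aligned 64KB
 * of the device.
 */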
306 static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
308 unsigned int size;
310 size = calc_dev_sboffset(dev, mddev, persistent);
311 if (!mddev->sb) {
312 MD_BUG();
313 return size;
315 if (mddev->sb->chunk_size)
316 size &= ~(mddev->sb->chunk_size/1024 - 1);
317 return size;
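/*
 * Illustrative example: with a 64KB chunk_size the mask above is
 * ~(64 - 1), so e.g. a 1000 block device size is rounded down to
 * 960 blocks; every device contributes a whole number of chunks.
 */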
320 static unsigned int zoned_raid_size (mddev_t *mddev)
322 unsigned int mask;
323 mdk_rdev_t * rdev;
324 struct md_list_head *tmp;
326 if (!mddev->sb) {
327 MD_BUG();
328 return -EINVAL;
331 * do size and offset calculations.
333 mask = ~(mddev->sb->chunk_size/1024 - 1);
335 ITERATE_RDEV(mddev,rdev,tmp) {
336 rdev->size &= mask;
337 md_size[mdidx(mddev)] += rdev->size;
339 return 0;
343 * We check whether all devices are numbered from 0 to nb_dev-1. The
344 * order is guaranteed even after device name changes.
346 * Some personalities (raid0, linear) use this. Personalities that
347 * provide data have to be able to deal with loss of individual
348 * disks, so they do their checking themselves.
350 int md_check_ordering (mddev_t *mddev)
352 int i, c;
353 mdk_rdev_t *rdev;
354 struct md_list_head *tmp;
357 * First, all devices must be fully functional
359 ITERATE_RDEV(mddev,rdev,tmp) {
360 if (rdev->faulty) {
361 printk("md: md%d's device %s faulty, aborting.\n",
362 mdidx(mddev), partition_name(rdev->dev));
363 goto abort;
367 c = 0;
368 ITERATE_RDEV(mddev,rdev,tmp) {
369 c++;
371 if (c != mddev->nb_dev) {
372 MD_BUG();
373 goto abort;
375 if (mddev->nb_dev != mddev->sb->raid_disks) {
376 printk("md: md%d, array needs %d disks, has %d, aborting.\n",
377 mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
378 goto abort;
381 * Now the numbering check
383 for (i = 0; i < mddev->nb_dev; i++) {
384 c = 0;
385 ITERATE_RDEV(mddev,rdev,tmp) {
386 if (rdev->desc_nr == i)
387 c++;
389 if (!c) {
390 printk("md: md%d, missing disk #%d, aborting.\n",
391 mdidx(mddev), i);
392 goto abort;
394 if (c > 1) {
395 printk("md: md%d, too many disks #%d, aborting.\n",
396 mdidx(mddev), i);
397 goto abort;
400 return 0;
401 abort:
402 return 1;
405 static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
407 if (disk_active(disk)) {
408 sb->working_disks--;
409 } else {
410 if (disk_spare(disk)) {
411 sb->spare_disks--;
412 sb->working_disks--;
413 } else {
414 sb->failed_disks--;
417 sb->nr_disks--;
418 disk->major = 0;
419 disk->minor = 0;
420 mark_disk_removed(disk);
423 #define BAD_MAGIC KERN_ERR \
424 "md: invalid raid superblock magic on %s\n"
426 #define BAD_MINOR KERN_ERR \
427 "md: %s: invalid raid minor (%x)\n"
429 #define OUT_OF_MEM KERN_ALERT \
430 "md: out of memory.\n"
432 #define NO_SB KERN_ERR \
433 "md: disabled device %s, could not read superblock.\n"
435 #define BAD_CSUM KERN_WARNING \
436 "md: invalid superblock checksum on %s\n"
438 static int alloc_array_sb (mddev_t * mddev)
440 if (mddev->sb) {
441 MD_BUG();
442 return 0;
445 mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
446 if (!mddev->sb)
447 return -ENOMEM;
448 md_clear_page(mddev->sb);
449 return 0;
452 static int alloc_disk_sb (mdk_rdev_t * rdev)
454 if (rdev->sb)
455 MD_BUG();
457 rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
458 if (!rdev->sb) {
459 printk (OUT_OF_MEM);
460 return -ENOMEM;
462 md_clear_page(rdev->sb);
464 return 0;
467 static void free_disk_sb (mdk_rdev_t * rdev)
469 if (rdev->sb) {
470 free_page((unsigned long) rdev->sb);
471 rdev->sb = NULL;
472 rdev->sb_offset = 0;
473 rdev->size = 0;
474 } else {
475 if (!rdev->faulty)
476 MD_BUG();
480 static void mark_rdev_faulty (mdk_rdev_t * rdev)
482 if (!rdev) {
483 MD_BUG();
484 return;
486 free_disk_sb(rdev);
487 rdev->faulty = 1;
490 static int read_disk_sb (mdk_rdev_t * rdev)
492 int ret = -EINVAL;
493 struct buffer_head *bh = NULL;
494 kdev_t dev = rdev->dev;
495 mdp_super_t *sb;
496 unsigned long sb_offset;
498 if (!rdev->sb) {
499 MD_BUG();
500 goto abort;
504 * Calculate the position of the superblock,
505 * it's at the end of the disk
507 sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
508 rdev->sb_offset = sb_offset;
509 printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset);
510 fsync_dev(dev);
511 set_blocksize (dev, MD_SB_BYTES);
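/*
 * sb_offset is in 1KB blocks; set_blocksize() above switched the
 * device to MD_SB_BYTES (4KB) blocks, hence the division by
 * MD_SB_BLOCKS in the bread() below.
 */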
512 bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
514 if (bh) {
515 sb = (mdp_super_t *) bh->b_data;
516 memcpy (rdev->sb, sb, MD_SB_BYTES);
517 } else {
518 printk (NO_SB,partition_name(rdev->dev));
519 goto abort;
521 printk(" [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
522 ret = 0;
523 abort:
524 if (bh)
525 brelse (bh);
526 return ret;
529 static unsigned int calc_sb_csum (mdp_super_t * sb)
531 unsigned int disk_csum, csum;
533 disk_csum = sb->sb_csum;
534 sb->sb_csum = 0;
535 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
536 sb->sb_csum = disk_csum;
537 return csum;
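/*
 * Note on the scheme above: sb_csum is zeroed while csum_partial()
 * runs over the full MD_SB_BYTES and restored afterwards, so the
 * stored checksum always covers the superblock with its own csum
 * field treated as zero.
 */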
541 * Check one RAID superblock for generic plausibility
544 static int check_disk_sb (mdk_rdev_t * rdev)
546 mdp_super_t *sb;
547 int ret = -EINVAL;
549 sb = rdev->sb;
550 if (!sb) {
551 MD_BUG();
552 goto abort;
555 if (sb->md_magic != MD_SB_MAGIC) {
556 printk (BAD_MAGIC, partition_name(rdev->dev));
557 goto abort;
560 if (sb->md_minor >= MAX_MD_DEVS) {
561 printk (BAD_MINOR, partition_name(rdev->dev),
562 sb->md_minor);
563 goto abort;
566 if (calc_sb_csum(sb) != sb->sb_csum)
567 printk(BAD_CSUM, partition_name(rdev->dev));
568 ret = 0;
569 abort:
570 return ret;
573 static kdev_t dev_unit(kdev_t dev)
575 unsigned int mask;
576 struct gendisk *hd = find_gendisk(dev);
578 if (!hd)
579 return 0;
580 mask = ~((1 << hd->minor_shift) - 1);
582 return MKDEV(MAJOR(dev), MINOR(dev) & mask);
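/*
 * Example (hypothetical device numbers): for a driver with
 * minor_shift == 4, i.e. 16 minors per disk, the partitions (8,1)
 * and (8,3) both reduce to the whole-disk device (8,0), so
 * match_dev_unit() below treats them as the same physical disk.
 */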
585 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
587 struct md_list_head *tmp;
588 mdk_rdev_t *rdev;
590 ITERATE_RDEV(mddev,rdev,tmp)
591 if (dev_unit(rdev->dev) == dev_unit(dev))
592 return rdev;
594 return NULL;
597 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
599 struct md_list_head *tmp;
600 mdk_rdev_t *rdev;
602 ITERATE_RDEV(mddev1,rdev,tmp)
603 if (match_dev_unit(mddev2, rdev->dev))
604 return 1;
606 return 0;
609 static MD_LIST_HEAD(all_raid_disks);
610 static MD_LIST_HEAD(pending_raid_disks);
612 static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
614 mdk_rdev_t *same_pdev;
616 if (rdev->mddev) {
617 MD_BUG();
618 return;
620 same_pdev = match_dev_unit(mddev, rdev->dev);
621 if (same_pdev)
622 printk( KERN_WARNING
623 "md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
624 " protection against single-disk failure might be compromised.\n",
625 mdidx(mddev), partition_name(rdev->dev),
626 partition_name(same_pdev->dev));
628 md_list_add(&rdev->same_set, &mddev->disks);
629 rdev->mddev = mddev;
630 mddev->nb_dev++;
631 printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
634 static void unbind_rdev_from_array (mdk_rdev_t * rdev)
636 if (!rdev->mddev) {
637 MD_BUG();
638 return;
640 md_list_del(&rdev->same_set);
641 MD_INIT_LIST_HEAD(&rdev->same_set);
642 rdev->mddev->nb_dev--;
643 printk("unbind<%s,%d>\n", partition_name(rdev->dev),
644 rdev->mddev->nb_dev);
645 rdev->mddev = NULL;
649 * prevent the device from being mounted, repartitioned or
650 * otherwise reused by a RAID array (or any other kernel
651 * subsystem), by opening the device. [simply getting an
652 * inode is not enough, the SCSI module usage code needs
653 * an explicit open() on the device]
655 static int lock_rdev (mdk_rdev_t *rdev)
657 int err = 0;
660 * First insert a dummy inode.
662 if (rdev->inode)
663 MD_BUG();
664 rdev->inode = get_empty_inode();
665 if (!rdev->inode)
666 return -ENOMEM;
668 * we don't care about any other fields
670 rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
671 insert_inode_hash(rdev->inode);
673 memset(&rdev->filp, 0, sizeof(rdev->filp));
674 rdev->filp.f_mode = 3; /* read write */
675 return err;
678 static void unlock_rdev (mdk_rdev_t *rdev)
680 if (!rdev->inode)
681 MD_BUG();
682 iput(rdev->inode);
683 rdev->inode = NULL;
686 static void export_rdev (mdk_rdev_t * rdev)
688 printk("export_rdev(%s)\n",partition_name(rdev->dev));
689 if (rdev->mddev)
690 MD_BUG();
691 unlock_rdev(rdev);
692 free_disk_sb(rdev);
693 md_list_del(&rdev->all);
694 MD_INIT_LIST_HEAD(&rdev->all);
695 if (rdev->pending.next != &rdev->pending) {
696 printk("(%s was pending)\n",partition_name(rdev->dev));
697 md_list_del(&rdev->pending);
698 MD_INIT_LIST_HEAD(&rdev->pending);
700 rdev->dev = 0;
701 rdev->faulty = 0;
702 kfree(rdev);
705 static void kick_rdev_from_array (mdk_rdev_t * rdev)
707 unbind_rdev_from_array(rdev);
708 export_rdev(rdev);
711 static void export_array (mddev_t *mddev)
713 struct md_list_head *tmp;
714 mdk_rdev_t *rdev;
715 mdp_super_t *sb = mddev->sb;
717 if (mddev->sb) {
718 mddev->sb = NULL;
719 free_page((unsigned long) sb);
722 ITERATE_RDEV(mddev,rdev,tmp) {
723 if (!rdev->mddev) {
724 MD_BUG();
725 continue;
727 kick_rdev_from_array(rdev);
729 if (mddev->nb_dev)
730 MD_BUG();
733 static void free_mddev (mddev_t *mddev)
735 if (!mddev) {
736 MD_BUG();
737 return;
740 export_array(mddev);
741 md_size[mdidx(mddev)] = 0;
742 md_hd_struct[mdidx(mddev)].nr_sects = 0;
745 * Make sure nobody else is using this mddev
746 * (careful, we rely on the global kernel lock here)
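 *
 * (a semaphore count of 1 means it is free, so these loops simply
 * wait for any resync/recovery user of this mddev to let go)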
748 while (md_atomic_read(&mddev->resync_sem.count) != 1)
749 schedule();
750 while (md_atomic_read(&mddev->recovery_sem.count) != 1)
751 schedule();
753 del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
754 md_list_del(&mddev->all_mddevs);
755 MD_INIT_LIST_HEAD(&mddev->all_mddevs);
756 kfree(mddev);
757 MOD_DEC_USE_COUNT;
760 #undef BAD_CSUM
761 #undef BAD_MAGIC
762 #undef OUT_OF_MEM
763 #undef NO_SB
765 static void print_desc(mdp_disk_t *desc)
767 printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
768 partition_name(MKDEV(desc->major,desc->minor)),
769 desc->major,desc->minor,desc->raid_disk,desc->state);
772 static void print_sb(mdp_super_t *sb)
774 int i;
776 printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
777 sb->major_version, sb->minor_version, sb->patch_version,
778 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
779 sb->ctime);
780 printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
781 sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
782 sb->layout, sb->chunk_size);
783 printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
784 sb->utime, sb->state, sb->active_disks, sb->working_disks,
785 sb->failed_disks, sb->spare_disks,
786 sb->sb_csum, (unsigned long)sb->events_lo);
788 for (i = 0; i < MD_SB_DISKS; i++) {
789 mdp_disk_t *desc;
791 desc = sb->disks + i;
792 printk(" D %2d: ", i);
793 print_desc(desc);
795 printk(" THIS: ");
796 print_desc(&sb->this_disk);
800 static void print_rdev(mdk_rdev_t *rdev)
802 printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
803 partition_name(rdev->dev), partition_name(rdev->old_dev),
804 rdev->size, rdev->faulty, rdev->desc_nr);
805 if (rdev->sb) {
806 printk("rdev superblock:\n");
807 print_sb(rdev->sb);
808 } else
809 printk("no rdev superblock!\n");
812 void md_print_devices (void)
814 struct md_list_head *tmp, *tmp2;
815 mdk_rdev_t *rdev;
816 mddev_t *mddev;
818 printk("\n");
819 printk(" **********************************\n");
820 printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
821 printk(" **********************************\n");
822 ITERATE_MDDEV(mddev,tmp) {
823 printk("md%d: ", mdidx(mddev));
825 ITERATE_RDEV(mddev,rdev,tmp2)
826 printk("<%s>", partition_name(rdev->dev));
828 if (mddev->sb) {
829 printk(" array superblock:\n");
830 print_sb(mddev->sb);
831 } else
832 printk(" no array superblock.\n");
834 ITERATE_RDEV(mddev,rdev,tmp2)
835 print_rdev(rdev);
837 printk(" **********************************\n");
838 printk("\n");
841 static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
843 int ret;
844 mdp_super_t *tmp1, *tmp2;
846 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
847 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
849 if (!tmp1 || !tmp2) {
850 ret = 0;
851 goto abort;
854 *tmp1 = *sb1;
855 *tmp2 = *sb2;
858 * nr_disks is not constant
860 tmp1->nr_disks = 0;
861 tmp2->nr_disks = 0;
863 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
864 ret = 0;
865 else
866 ret = 1;
868 abort:
869 if (tmp1)
870 kfree(tmp1);
871 if (tmp2)
872 kfree(tmp2);
874 return ret;
877 static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
879 if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
880 (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
881 (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
882 (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
884 return 1;
886 return 0;
889 static mdk_rdev_t * find_rdev_all (kdev_t dev)
891 struct md_list_head *tmp;
892 mdk_rdev_t *rdev;
894 tmp = all_raid_disks.next;
895 while (tmp != &all_raid_disks) {
896 rdev = md_list_entry(tmp, mdk_rdev_t, all);
897 if (rdev->dev == dev)
898 return rdev;
899 tmp = tmp->next;
901 return NULL;
904 #define GETBLK_FAILED KERN_ERR \
905 "md: getblk failed for device %s\n"
907 static int write_disk_sb(mdk_rdev_t * rdev)
909 struct buffer_head *bh;
910 kdev_t dev;
911 unsigned long sb_offset, size;
912 mdp_super_t *sb;
914 if (!rdev->sb) {
915 MD_BUG();
916 return -1;
918 if (rdev->faulty) {
919 MD_BUG();
920 return -1;
922 if (rdev->sb->md_magic != MD_SB_MAGIC) {
923 MD_BUG();
924 return -1;
927 dev = rdev->dev;
928 sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
929 if (rdev->sb_offset != sb_offset) {
930 printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
931 goto skip;
934 * If the disk went offline meanwhile and it's just a spare, then
935 * its size has changed to zero silently, and the MD code does
936 * not yet know that it's faulty.
938 size = calc_dev_size(dev, rdev->mddev, 1);
939 if (size != rdev->size) {
940 printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev), rdev->size, size);
941 goto skip;
944 printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
945 fsync_dev(dev);
946 set_blocksize(dev, MD_SB_BYTES);
947 bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
948 if (!bh) {
949 printk(GETBLK_FAILED, partition_name(dev));
950 return 1;
952 memset(bh->b_data,0,bh->b_size);
953 sb = (mdp_super_t *) bh->b_data;
954 memcpy(sb, rdev->sb, MD_SB_BYTES);
956 mark_buffer_uptodate(bh, 1);
957 mark_buffer_dirty(bh);
958 ll_rw_block(WRITE, 1, &bh);
959 wait_on_buffer(bh);
960 brelse(bh);
961 fsync_dev(dev);
962 skip:
963 return 0;
965 #undef GETBLK_FAILED
967 static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
969 int i, ok = 0;
970 mdp_disk_t *desc;
972 for (i = 0; i < MD_SB_DISKS; i++) {
973 desc = mddev->sb->disks + i;
974 #if 0
975 if (disk_faulty(desc)) {
976 if (MKDEV(desc->major,desc->minor) == rdev->dev)
977 ok = 1;
978 continue;
980 #endif
981 if (MKDEV(desc->major,desc->minor) == rdev->dev) {
982 rdev->sb->this_disk = *desc;
983 rdev->desc_nr = desc->number;
984 ok = 1;
985 break;
989 if (!ok) {
990 MD_BUG();
994 static int sync_sbs(mddev_t * mddev)
996 mdk_rdev_t *rdev;
997 mdp_super_t *sb;
998 struct md_list_head *tmp;
1000 ITERATE_RDEV(mddev,rdev,tmp) {
1001 if (rdev->faulty)
1002 continue;
1003 sb = rdev->sb;
1004 *sb = *mddev->sb;
1005 set_this_disk(mddev, rdev);
1006 sb->sb_csum = calc_sb_csum(sb);
1008 return 0;
1011 int md_update_sb(mddev_t * mddev)
1013 int first, err, count = 100;
1014 struct md_list_head *tmp;
1015 mdk_rdev_t *rdev;
1017 repeat:
1018 mddev->sb->utime = CURRENT_TIME;
1019 if ((++mddev->sb->events_lo)==0)
1020 ++mddev->sb->events_hi;
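/*
 * events_lo/events_hi together form a single 64-bit event counter:
 * a carry out of the low word bumps the high word, and both words
 * reading zero afterwards means the full counter just wrapped.
 */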
1022 if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
1024 * oops, this 64-bit counter should never wrap.
1025 * Either we are somewhere around the year 1 trillion A.D., assuming
1026 * 1 reboot per second, or we have a bug:
1028 MD_BUG();
1029 mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
1031 sync_sbs(mddev);
1034 * do not write anything to disk if using
1035 * nonpersistent superblocks
1037 if (mddev->sb->not_persistent)
1038 return 0;
1040 printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
1041 mdidx(mddev));
1043 first = 1;
1044 err = 0;
1045 ITERATE_RDEV(mddev,rdev,tmp) {
1046 if (!first)
1047 printk(", ");
1048 first = 0;
1050 if (rdev->faulty)
1051 printk("(skipping faulty ");
1052 printk("%s ", partition_name(rdev->dev));
1053 if (!rdev->faulty) {
1054 printk("[events: %08lx]",
1055 (unsigned long)rdev->sb->events_lo);
1056 err += write_disk_sb(rdev);
1057 } else
1058 printk(")\n");
1060 printk(".\n");
1061 if (err) {
1062 printk("errors occured during superblock update, repeating\n");
1063 if (--count)
1064 goto repeat;
1065 printk("excessive errors occured during superblock update, exiting\n");
1067 return 0;
1071 * Import a device. If 'on_disk', then sanity check the superblock
1073 * mark the device faulty if:
1075 * - the device is nonexistent (zero size)
1076 * - the device has no valid superblock
1078 * a faulty rdev _never_ has rdev->sb set.
1080 static int md_import_device (kdev_t newdev, int on_disk)
1082 int err;
1083 mdk_rdev_t *rdev;
1084 unsigned int size;
1086 if (find_rdev_all(newdev))
1087 return -EEXIST;
1089 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1090 if (!rdev) {
1091 printk("could not alloc mem for %s!\n", partition_name(newdev));
1092 return -ENOMEM;
1094 memset(rdev, 0, sizeof(*rdev));
1096 if (get_super(newdev)) {
1097 printk("md: can not import %s, has active inodes!\n",
1098 partition_name(newdev));
1099 err = -EBUSY;
1100 goto abort_free;
1103 if ((err = alloc_disk_sb(rdev)))
1104 goto abort_free;
1106 rdev->dev = newdev;
1107 if (lock_rdev(rdev)) {
1108 printk("md: could not lock %s, zero-size? Marking faulty.\n",
1109 partition_name(newdev));
1110 err = -EINVAL;
1111 goto abort_free;
1113 rdev->desc_nr = -1;
1114 rdev->faulty = 0;
1116 size = 0;
1117 if (blk_size[MAJOR(newdev)])
1118 size = blk_size[MAJOR(newdev)][MINOR(newdev)];
1119 if (!size) {
1120 printk("md: %s has zero size, marking faulty!\n",
1121 partition_name(newdev));
1122 err = -EINVAL;
1123 goto abort_free;
1126 if (on_disk) {
1127 if ((err = read_disk_sb(rdev))) {
1128 printk("md: could not read %s's sb, not importing!\n",
1129 partition_name(newdev));
1130 goto abort_free;
1132 if ((err = check_disk_sb(rdev))) {
1133 printk("md: %s has invalid sb, not importing!\n",
1134 partition_name(newdev));
1135 goto abort_free;
1138 rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
1139 rdev->sb->this_disk.minor);
1140 rdev->desc_nr = rdev->sb->this_disk.number;
1142 md_list_add(&rdev->all, &all_raid_disks);
1143 MD_INIT_LIST_HEAD(&rdev->pending);
1145 if (rdev->faulty && rdev->sb)
1146 free_disk_sb(rdev);
1147 return 0;
1149 abort_free:
1150 if (rdev->sb) {
1151 if (rdev->inode)
1152 unlock_rdev(rdev);
1153 free_disk_sb(rdev);
1155 kfree(rdev);
1156 return err;
1160 * Check a full RAID array for plausibility
1163 #define INCONSISTENT KERN_ERR \
1164 "md: fatal superblock inconsistency in %s -- removing from array\n"
1166 #define OUT_OF_DATE KERN_ERR \
1167 "md: superblock update time inconsistency -- using the most recent one\n"
1169 #define OLD_VERSION KERN_ALERT \
1170 "md: md%d: unsupported raid array version %d.%d.%d\n"
1172 #define NOT_CLEAN_IGNORE KERN_ERR \
1173 "md: md%d: raid array is not clean -- starting background reconstruction\n"
1175 #define UNKNOWN_LEVEL KERN_ERR \
1176 "md: md%d: unsupported raid level %d\n"
1178 static int analyze_sbs (mddev_t * mddev)
1180 int out_of_date = 0, i;
1181 struct md_list_head *tmp, *tmp2;
1182 mdk_rdev_t *rdev, *rdev2, *freshest;
1183 mdp_super_t *sb;
1186 * Verify the RAID superblock on each real device
1188 ITERATE_RDEV(mddev,rdev,tmp) {
1189 if (rdev->faulty) {
1190 MD_BUG();
1191 goto abort;
1193 if (!rdev->sb) {
1194 MD_BUG();
1195 goto abort;
1197 if (check_disk_sb(rdev))
1198 goto abort;
1202 * The superblock constant part has to be the same
1203 * for all disks in the array.
1205 sb = NULL;
1207 ITERATE_RDEV(mddev,rdev,tmp) {
1208 if (!sb) {
1209 sb = rdev->sb;
1210 continue;
1212 if (!sb_equal(sb, rdev->sb)) {
1213 printk (INCONSISTENT, partition_name(rdev->dev));
1214 kick_rdev_from_array(rdev);
1215 continue;
1220 * OK, we have all disks and the array is ready to run. Let's
1221 * find the freshest superblock, that one will be the superblock
1222 * that represents the whole array.
1224 if (!mddev->sb)
1225 if (alloc_array_sb(mddev))
1226 goto abort;
1227 sb = mddev->sb;
1228 freshest = NULL;
1230 ITERATE_RDEV(mddev,rdev,tmp) {
1231 __u64 ev1, ev2;
1233 * if the checksum is invalid, use the superblock
1234 * only as a last resort. (decrease its age by
1235 * one event)
1237 if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
1238 if (rdev->sb->events_lo || rdev->sb->events_hi)
1239 if ((rdev->sb->events_lo--)==0)
1240 rdev->sb->events_hi--;
1243 printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
1244 (unsigned long)rdev->sb->events_lo);
1245 if (!freshest) {
1246 freshest = rdev;
1247 continue;
1250 * Find the newest superblock version
1252 ev1 = md_event(rdev->sb);
1253 ev2 = md_event(freshest->sb);
1254 if (ev1 != ev2) {
1255 out_of_date = 1;
1256 if (ev1 > ev2)
1257 freshest = rdev;
1260 if (out_of_date) {
1261 printk(OUT_OF_DATE);
1262 printk("freshest: %s\n", partition_name(freshest->dev));
1264 memcpy (sb, freshest->sb, sizeof(*sb));
1267 * at this point we have picked the 'best' superblock
1268 * from all available superblocks.
1269 * now we validate this superblock and kick out possibly
1270 * failed disks.
1272 ITERATE_RDEV(mddev,rdev,tmp) {
1274 * Kick all non-fresh devices faulty
1276 __u64 ev1, ev2;
1277 ev1 = md_event(rdev->sb);
1278 ev2 = md_event(sb);
1279 ++ev1;
1280 if (ev1 < ev2) {
1281 printk("md: kicking non-fresh %s from array!\n",
1282 partition_name(rdev->dev));
1283 kick_rdev_from_array(rdev);
1284 continue;
1289 * Fix up changed device names ... but only if this disk has a
1290 * recent update time. Superblocks with a bad checksum are used too.
1292 ITERATE_RDEV(mddev,rdev,tmp) {
1293 __u64 ev1, ev2, ev3;
1294 if (rdev->faulty) { /* REMOVEME */
1295 MD_BUG();
1296 goto abort;
1298 ev1 = md_event(rdev->sb);
1299 ev2 = md_event(sb);
1300 ev3 = ev2;
1301 --ev3;
1302 if ((rdev->dev != rdev->old_dev) &&
1303 ((ev1 == ev2) || (ev1 == ev3))) {
1304 mdp_disk_t *desc;
1306 printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
1307 if (rdev->desc_nr == -1) {
1308 MD_BUG();
1309 goto abort;
1311 desc = &sb->disks[rdev->desc_nr];
1312 if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
1313 MD_BUG();
1314 goto abort;
1316 desc->major = MAJOR(rdev->dev);
1317 desc->minor = MINOR(rdev->dev);
1318 desc = &rdev->sb->this_disk;
1319 desc->major = MAJOR(rdev->dev);
1320 desc->minor = MINOR(rdev->dev);
1325 * Remove unavailable and faulty devices ...
1327 * note that if an array becomes completely unrunnable due to
1328 * missing devices, we do not write the superblock back, so the
1329 * administrator has a chance to fix things up. The removal thus
1330 * only happens if it's nonfatal to the contents of the array.
1332 for (i = 0; i < MD_SB_DISKS; i++) {
1333 int found;
1334 mdp_disk_t *desc;
1335 kdev_t dev;
1337 desc = sb->disks + i;
1338 dev = MKDEV(desc->major, desc->minor);
1341 * We kick faulty devices/descriptors immediately.
1343 if (disk_faulty(desc)) {
1344 found = 0;
1345 ITERATE_RDEV(mddev,rdev,tmp) {
1346 if (rdev->desc_nr != desc->number)
1347 continue;
1348 printk("md%d: kicking faulty %s!\n",
1349 mdidx(mddev),partition_name(rdev->dev));
1350 kick_rdev_from_array(rdev);
1351 found = 1;
1352 break;
1354 if (!found) {
1355 if (dev == MKDEV(0,0))
1356 continue;
1357 printk("md%d: removing former faulty %s!\n",
1358 mdidx(mddev), partition_name(dev));
1360 remove_descriptor(desc, sb);
1361 continue;
1364 if (dev == MKDEV(0,0))
1365 continue;
1367 * Is this device present in the rdev ring?
1369 found = 0;
1370 ITERATE_RDEV(mddev,rdev,tmp) {
1371 if (rdev->desc_nr == desc->number) {
1372 found = 1;
1373 break;
1376 if (found)
1377 continue;
1379 printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
1380 remove_descriptor(desc, sb);
1384 * Double check whether all devices mentioned in the
1385 * superblock are in the rdev ring.
1387 for (i = 0; i < MD_SB_DISKS; i++) {
1388 mdp_disk_t *desc;
1389 kdev_t dev;
1391 desc = sb->disks + i;
1392 dev = MKDEV(desc->major, desc->minor);
1394 if (dev == MKDEV(0,0))
1395 continue;
1397 if (disk_faulty(desc)) {
1398 MD_BUG();
1399 goto abort;
1402 rdev = find_rdev(mddev, dev);
1403 if (!rdev) {
1404 MD_BUG();
1405 goto abort;
1410 * Do a final reality check.
1412 ITERATE_RDEV(mddev,rdev,tmp) {
1413 if (rdev->desc_nr == -1) {
1414 MD_BUG();
1415 goto abort;
1418 * is the desc_nr unique?
1420 ITERATE_RDEV(mddev,rdev2,tmp2) {
1421 if ((rdev2 != rdev) &&
1422 (rdev2->desc_nr == rdev->desc_nr)) {
1423 MD_BUG();
1424 goto abort;
1428 * is the device unique?
1430 ITERATE_RDEV(mddev,rdev2,tmp2) {
1431 if ((rdev2 != rdev) &&
1432 (rdev2->dev == rdev->dev)) {
1433 MD_BUG();
1434 goto abort;
1440 * Check if we can support this RAID array
1442 if (sb->major_version != MD_MAJOR_VERSION ||
1443 sb->minor_version > MD_MINOR_VERSION) {
1445 printk (OLD_VERSION, mdidx(mddev), sb->major_version,
1446 sb->minor_version, sb->patch_version);
1447 goto abort;
1450 if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
1451 (sb->level == 4) || (sb->level == 5)))
1452 printk (NOT_CLEAN_IGNORE, mdidx(mddev));
1454 return 0;
1455 abort:
1456 return 1;
1459 #undef INCONSISTENT
1460 #undef OUT_OF_DATE
1461 #undef OLD_VERSION
1462 #undef OLD_LEVEL
1464 static int device_size_calculation (mddev_t * mddev)
1466 int data_disks = 0, persistent;
1467 unsigned int readahead;
1468 mdp_super_t *sb = mddev->sb;
1469 struct md_list_head *tmp;
1470 mdk_rdev_t *rdev;
1473 * Do device size calculation. Bail out if too small.
1474 * (we have to do this after having validated chunk_size,
1475 * because device size has to be modulo chunk_size)
1477 persistent = !mddev->sb->not_persistent;
1478 ITERATE_RDEV(mddev,rdev,tmp) {
1479 if (rdev->faulty)
1480 continue;
1481 if (rdev->size) {
1482 MD_BUG();
1483 continue;
1485 rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
1486 if (rdev->size < sb->chunk_size / 1024) {
1487 printk (KERN_WARNING
1488 "Dev %s smaller than chunk_size: %ldk < %dk\n",
1489 partition_name(rdev->dev),
1490 rdev->size, sb->chunk_size / 1024);
1491 return -EINVAL;
1495 switch (sb->level) {
1496 case -3:
1497 data_disks = 1;
1498 break;
1499 case -2:
1500 data_disks = 1;
1501 break;
1502 case -1:
1503 zoned_raid_size(mddev);
1504 data_disks = 1;
1505 break;
1506 case 0:
1507 zoned_raid_size(mddev);
1508 data_disks = sb->raid_disks;
1509 break;
1510 case 1:
1511 data_disks = 1;
1512 break;
1513 case 4:
1514 case 5:
1515 data_disks = sb->raid_disks-1;
1516 break;
1517 default:
1518 printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
1519 goto abort;
1521 if (!md_size[mdidx(mddev)])
1522 md_size[mdidx(mddev)] = sb->size * data_disks;
1524 readahead = MD_READAHEAD;
1525 if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
1526 readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
1527 if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
1528 readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
1529 } else {
1530 if (sb->level == -3)
1531 readahead = 0;
1533 md_maxreadahead[mdidx(mddev)] = readahead;
1535 printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
1536 mdidx(mddev), readahead*(PAGE_SIZE/1024));
1538 printk(KERN_INFO
1539 "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1540 mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
1541 return 0;
1542 abort:
1543 return 1;
1547 #define TOO_BIG_CHUNKSIZE KERN_ERR \
1548 "too big chunk_size: %d > %d\n"
1550 #define TOO_SMALL_CHUNKSIZE KERN_ERR \
1551 "too small chunk_size: %d < %ld\n"
1553 #define BAD_CHUNKSIZE KERN_ERR \
1554 "no chunksize specified, see 'man raidtab'\n"
1556 static int do_md_run (mddev_t * mddev)
1558 int pnum, err;
1559 int chunk_size;
1560 struct md_list_head *tmp;
1561 mdk_rdev_t *rdev;
1564 if (!mddev->nb_dev) {
1565 MD_BUG();
1566 return -EINVAL;
1569 if (mddev->pers)
1570 return -EBUSY;
1573 * Resize disks to align partitions size on a given
1574 * chunk size.
1576 md_size[mdidx(mddev)] = 0;
1579 * Analyze all RAID superblock(s)
1581 if (analyze_sbs(mddev)) {
1582 MD_BUG();
1583 return -EINVAL;
1586 chunk_size = mddev->sb->chunk_size;
1587 pnum = level_to_pers(mddev->sb->level);
1589 mddev->param.chunk_size = chunk_size;
1590 mddev->param.personality = pnum;
1592 if (chunk_size > MAX_CHUNK_SIZE) {
1593 printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
1594 return -EINVAL;
1597 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
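 *
 * (ffz(~chunk_size) is the index of the lowest set bit of chunk_size,
 * so "1 << ffz(~chunk_size) == chunk_size" holds exactly for powers
 * of two, e.g. for 4096 and 65536 but not for 24576)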
1599 if ( (1 << ffz(~chunk_size)) != chunk_size) {
1600 MD_BUG();
1601 return -EINVAL;
1603 if (chunk_size < PAGE_SIZE) {
1604 printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
1605 return -EINVAL;
1608 if (pnum >= MAX_PERSONALITY) {
1609 MD_BUG();
1610 return -EINVAL;
1613 if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
1615 * 'default chunksize' in the old md code used to
1616 * be PAGE_SIZE, baaad.
1617 * we abort here to be on the safe side. We don't
1618 * want to continue the bad practice.
1620 printk(BAD_CHUNKSIZE);
1621 return -EINVAL;
1624 if (!pers[pnum])
1626 #ifdef CONFIG_KMOD
1627 char module_name[80];
1628 sprintf (module_name, "md-personality-%d", pnum);
1629 request_module (module_name);
1630 if (!pers[pnum])
1631 #endif
1632 return -EINVAL;
1635 if (device_size_calculation(mddev))
1636 return -EINVAL;
1639 * Drop all container device buffers, from now on
1640 * the only valid external interface is through the md
1641 * device.
1642 * Also find largest hardsector size
1644 md_hardsect_sizes[mdidx(mddev)] = 512;
1645 ITERATE_RDEV(mddev,rdev,tmp) {
1646 if (rdev->faulty)
1647 continue;
1648 fsync_dev(rdev->dev);
1649 invalidate_buffers(rdev->dev);
1650 if (get_hardsect_size(rdev->dev)
1651 > md_hardsect_sizes[mdidx(mddev)])
1652 md_hardsect_sizes[mdidx(mddev)] =
1653 get_hardsect_size(rdev->dev);
1655 md_blocksizes[mdidx(mddev)] = 1024;
1656 if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
1657 md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
1658 mddev->pers = pers[pnum];
1660 err = mddev->pers->run(mddev);
1661 if (err) {
1662 printk("pers->run() failed ...\n");
1663 mddev->pers = NULL;
1664 return -EINVAL;
1667 mddev->sb->state &= ~(1 << MD_SB_CLEAN);
1668 md_update_sb(mddev);
1671 * md_size has units of 1K blocks, which are
1672 * twice as large as sectors.
1674 md_hd_struct[mdidx(mddev)].start_sect = 0;
1675 md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
1677 read_ahead[MD_MAJOR] = 1024;
1678 return (0);
1681 #undef TOO_BIG_CHUNKSIZE
1682 #undef BAD_CHUNKSIZE
1684 #define OUT(x) do { err = (x); goto out; } while (0)
1686 static int restart_array (mddev_t *mddev)
1688 int err = 0;
1691 * Complain if it has no devices
1693 if (!mddev->nb_dev)
1694 OUT(-ENXIO);
1696 if (mddev->pers) {
1697 if (!mddev->ro)
1698 OUT(-EBUSY);
1700 mddev->ro = 0;
1701 set_device_ro(mddev_to_kdev(mddev), 0);
1703 printk (KERN_INFO
1704 "md%d switched to read-write mode.\n", mdidx(mddev));
1706 * Kick recovery or resync if necessary
1708 md_recover_arrays();
1709 if (mddev->pers->restart_resync)
1710 mddev->pers->restart_resync(mddev);
1711 } else
1712 err = -EINVAL;
1714 out:
1715 return err;
1718 #define STILL_MOUNTED KERN_WARNING \
1719 "md: md%d still mounted.\n"
1721 static int do_md_stop (mddev_t * mddev, int ro)
1723 int err = 0, resync_interrupted = 0;
1724 kdev_t dev = mddev_to_kdev(mddev);
1726 if (!ro && get_super(dev)) {
1727 printk (STILL_MOUNTED, mdidx(mddev));
1728 OUT(-EBUSY);
1731 if (mddev->pers) {
1733 * It is safe to call stop here, it only frees private
1734 * data. Also, it tells us if a device is unstoppable
1735 * (eg. resyncing is in progress)
1737 if (mddev->pers->stop_resync)
1738 if (mddev->pers->stop_resync(mddev))
1739 resync_interrupted = 1;
1741 if (mddev->recovery_running)
1742 md_interrupt_thread(md_recovery_thread);
1745 * This synchronizes with signal delivery to the
1746 * resync or reconstruction thread. It also nicely
1747 * hangs the process if some reconstruction has not
1748 * finished.
1750 down(&mddev->recovery_sem);
1751 up(&mddev->recovery_sem);
1754 * sync and invalidate buffers because we cannot kill the
1755 * main thread with valid IO transfers still around.
1756 * the kernel lock protects us from new requests being
1757 * added after invalidate_buffers().
1759 fsync_dev (mddev_to_kdev(mddev));
1760 fsync_dev (dev);
1761 invalidate_buffers (dev);
1763 if (ro) {
1764 if (mddev->ro)
1765 OUT(-ENXIO);
1766 mddev->ro = 1;
1767 } else {
1768 if (mddev->ro)
1769 set_device_ro(dev, 0);
1770 if (mddev->pers->stop(mddev)) {
1771 if (mddev->ro)
1772 set_device_ro(dev, 1);
1773 OUT(-EBUSY);
1775 if (mddev->ro)
1776 mddev->ro = 0;
1778 if (mddev->sb) {
1780 * mark it clean only if no resync was
1781 * interrupted.
1783 if (!mddev->recovery_running && !resync_interrupted) {
1784 printk("marking sb clean...\n");
1785 mddev->sb->state |= 1 << MD_SB_CLEAN;
1787 md_update_sb(mddev);
1789 if (ro)
1790 set_device_ro(dev, 1);
1794 * Free resources if final stop
1796 if (!ro) {
1797 printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
1798 free_mddev(mddev);
1800 } else
1801 printk (KERN_INFO
1802 "md%d switched to read-only mode.\n", mdidx(mddev));
1803 out:
1804 return err;
1807 #undef OUT
1810 * We have to safely support old arrays too.
1812 int detect_old_array (mdp_super_t *sb)
1814 if (sb->major_version > 0)
1815 return 0;
1816 if (sb->minor_version >= 90)
1817 return 0;
1819 return -EINVAL;
1823 static void autorun_array (mddev_t *mddev)
1825 mdk_rdev_t *rdev;
1826 struct md_list_head *tmp;
1827 int err;
1829 if (mddev->disks.prev == &mddev->disks) {
1830 MD_BUG();
1831 return;
1834 printk("running: ");
1836 ITERATE_RDEV(mddev,rdev,tmp) {
1837 printk("<%s>", partition_name(rdev->dev));
1839 printk("\nnow!\n");
1841 err = do_md_run (mddev);
1842 if (err) {
1843 printk("do_md_run() returned %d\n", err);
1845 * prevent the writeback of an unrunnable array
1847 mddev->sb_dirty = 0;
1848 do_md_stop (mddev, 0);
1853 * let's try to run arrays based on all disks that have arrived
1854 * until now. (those are in the ->pending list)
1856 * the method: pick the first pending disk, collect all disks with
1857 * the same UUID, remove all from the pending list and put them into
1858 * the 'same_array' list. Then order this list based on superblock
1859 * update time (freshest comes first), kick out 'old' disks and
1860 * compare superblocks. If everything's fine then run it.
1862 static void autorun_devices (void)
1864 struct md_list_head candidates;
1865 struct md_list_head *tmp;
1866 mdk_rdev_t *rdev0, *rdev;
1867 mddev_t *mddev;
1868 kdev_t md_kdev;
1871 printk("autorun ...\n");
1872 while (pending_raid_disks.next != &pending_raid_disks) {
1873 rdev0 = md_list_entry(pending_raid_disks.next,
1874 mdk_rdev_t, pending);
1876 printk("considering %s ...\n", partition_name(rdev0->dev));
1877 MD_INIT_LIST_HEAD(&candidates);
1878 ITERATE_RDEV_PENDING(rdev,tmp) {
1879 if (uuid_equal(rdev0, rdev)) {
1880 if (!sb_equal(rdev0->sb, rdev->sb)) {
1881 printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
1882 continue;
1884 printk(" adding %s ...\n", partition_name(rdev->dev));
1885 md_list_del(&rdev->pending);
1886 md_list_add(&rdev->pending, &candidates);
1890 * now we have a set of devices, with all of them having
1891 * mostly sane superblocks. It's time to allocate the
1892 * mddev.
1894 md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
1895 mddev = kdev_to_mddev(md_kdev);
1896 if (mddev) {
1897 printk("md%d already running, cannot run %s\n",
1898 mdidx(mddev), partition_name(rdev0->dev));
1899 ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
1900 export_rdev(rdev);
1901 continue;
1903 mddev = alloc_mddev(md_kdev);
1904 printk("created md%d\n", mdidx(mddev));
1905 ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
1906 bind_rdev_to_array(rdev, mddev);
1907 md_list_del(&rdev->pending);
1908 MD_INIT_LIST_HEAD(&rdev->pending);
1910 autorun_array(mddev);
1912 printk("... autorun DONE.\n");
1916 * import RAID devices based on one partition
1917 * if possible, the array gets run as well.
1920 #define BAD_VERSION KERN_ERR \
1921 "md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
1923 #define OUT_OF_MEM KERN_ALERT \
1924 "md: out of memory.\n"
1926 #define NO_DEVICE KERN_ERR \
1927 "md: disabled device %s\n"
1929 #define AUTOADD_FAILED KERN_ERR \
1930 "md: auto-adding devices to md%d FAILED (error %d).\n"
1932 #define AUTOADD_FAILED_USED KERN_ERR \
1933 "md: cannot auto-add device %s to md%d, already used.\n"
1935 #define AUTORUN_FAILED KERN_ERR \
1936 "md: auto-running md%d FAILED (error %d).\n"
1938 #define MDDEV_BUSY KERN_ERR \
1939 "md: cannot auto-add to md%d, already running.\n"
1941 #define AUTOADDING KERN_INFO \
1942 "md: auto-adding devices to md%d, based on %s's superblock.\n"
1944 #define AUTORUNNING KERN_INFO \
1945 "md: auto-running md%d.\n"
1947 static int autostart_array (kdev_t startdev)
1949 int err = -EINVAL, i;
1950 mdp_super_t *sb = NULL;
1951 mdk_rdev_t *start_rdev = NULL, *rdev;
1953 if (md_import_device(startdev, 1)) {
1954 printk("could not import %s!\n", partition_name(startdev));
1955 goto abort;
1958 start_rdev = find_rdev_all(startdev);
1959 if (!start_rdev) {
1960 MD_BUG();
1961 goto abort;
1963 if (start_rdev->faulty) {
1964 printk("can not autostart based on faulty %s!\n",
1965 partition_name(startdev));
1966 goto abort;
1968 md_list_add(&start_rdev->pending, &pending_raid_disks);
1970 sb = start_rdev->sb;
1972 err = detect_old_array(sb);
1973 if (err) {
1974 printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
1975 goto abort;
1978 for (i = 0; i < MD_SB_DISKS; i++) {
1979 mdp_disk_t *desc;
1980 kdev_t dev;
1982 desc = sb->disks + i;
1983 dev = MKDEV(desc->major, desc->minor);
1985 if (dev == MKDEV(0,0))
1986 continue;
1987 if (dev == startdev)
1988 continue;
1989 if (md_import_device(dev, 1)) {
1990 printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
1991 continue;
1993 rdev = find_rdev_all(dev);
1994 if (!rdev) {
1995 MD_BUG();
1996 goto abort;
1998 md_list_add(&rdev->pending, &pending_raid_disks);
2002 * possibly return error codes here
2004 autorun_devices();
2005 return 0;
2007 abort:
2008 if (start_rdev)
2009 export_rdev(start_rdev);
2010 return err;
2013 #undef BAD_VERSION
2014 #undef OUT_OF_MEM
2015 #undef NO_DEVICE
2016 #undef AUTOADD_FAILED_USED
2017 #undef AUTOADD_FAILED
2018 #undef AUTORUN_FAILED
2019 #undef AUTOADDING
2020 #undef AUTORUNNING
2022 struct {
2023 int set;
2024 int noautodetect;
2026 } raid_setup_args md__initdata = { 0, 0 };
2028 void md_setup_drive(void) md__init;
2031 * Searches all registered partitions for autorun RAID arrays
2032 * at boot time.
2034 #ifdef CONFIG_AUTODETECT_RAID
2035 static int detected_devices[128] md__initdata = { 0, };
2036 static int dev_cnt=0;
2037 void md_autodetect_dev(kdev_t dev)
2039 if (dev_cnt >= 0 && dev_cnt < 127)
2040 detected_devices[dev_cnt++] = dev;
2042 #endif
2044 int md__init md_run_setup(void)
2046 #ifdef CONFIG_AUTODETECT_RAID
2047 mdk_rdev_t *rdev;
2048 int i;
2050 if (raid_setup_args.noautodetect)
2051 printk(KERN_INFO "skipping autodetection of RAID arrays\n");
2052 else {
2054 printk(KERN_INFO "autodetecting RAID arrays\n");
2056 for (i=0; i<dev_cnt; i++) {
2057 kdev_t dev = detected_devices[i];
2059 if (md_import_device(dev,1)) {
2060 printk(KERN_ALERT "could not import %s!\n",
2061 partition_name(dev));
2062 continue;
2065 * Sanity checks:
2067 rdev = find_rdev_all(dev);
2068 if (!rdev) {
2069 MD_BUG();
2070 continue;
2072 if (rdev->faulty) {
2073 MD_BUG();
2074 continue;
2076 md_list_add(&rdev->pending, &pending_raid_disks);
2079 autorun_devices();
2082 dev_cnt = -1; /* make sure further calls to md_autodetect_dev are ignored */
2083 #endif
2084 #ifdef CONFIG_MD_BOOT
2085 md_setup_drive();
2086 #endif
2087 return 0;
2090 static int get_version (void * arg)
2092 mdu_version_t ver;
2094 ver.major = MD_MAJOR_VERSION;
2095 ver.minor = MD_MINOR_VERSION;
2096 ver.patchlevel = MD_PATCHLEVEL_VERSION;
2098 if (md_copy_to_user(arg, &ver, sizeof(ver)))
2099 return -EFAULT;
2101 return 0;
2104 #define SET_FROM_SB(x) info.x = mddev->sb->x
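/* e.g. SET_FROM_SB(level) expands to: info.level = mddev->sb->level */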
2105 static int get_array_info (mddev_t * mddev, void * arg)
2107 mdu_array_info_t info;
2109 if (!mddev->sb)
2110 return -EINVAL;
2112 SET_FROM_SB(major_version);
2113 SET_FROM_SB(minor_version);
2114 SET_FROM_SB(patch_version);
2115 SET_FROM_SB(ctime);
2116 SET_FROM_SB(level);
2117 SET_FROM_SB(size);
2118 SET_FROM_SB(nr_disks);
2119 SET_FROM_SB(raid_disks);
2120 SET_FROM_SB(md_minor);
2121 SET_FROM_SB(not_persistent);
2123 SET_FROM_SB(utime);
2124 SET_FROM_SB(state);
2125 SET_FROM_SB(active_disks);
2126 SET_FROM_SB(working_disks);
2127 SET_FROM_SB(failed_disks);
2128 SET_FROM_SB(spare_disks);
2130 SET_FROM_SB(layout);
2131 SET_FROM_SB(chunk_size);
2133 if (md_copy_to_user(arg, &info, sizeof(info)))
2134 return -EFAULT;
2136 return 0;
2138 #undef SET_FROM_SB
2140 #define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2141 static int get_disk_info (mddev_t * mddev, void * arg)
2143 mdu_disk_info_t info;
2144 unsigned int nr;
2146 if (!mddev->sb)
2147 return -EINVAL;
2149 if (md_copy_from_user(&info, arg, sizeof(info)))
2150 return -EFAULT;
2152 nr = info.number;
2153 if (nr >= mddev->sb->nr_disks)
2154 return -EINVAL;
2156 SET_FROM_SB(major);
2157 SET_FROM_SB(minor);
2158 SET_FROM_SB(raid_disk);
2159 SET_FROM_SB(state);
2161 if (md_copy_to_user(arg, &info, sizeof(info)))
2162 return -EFAULT;
2164 return 0;
2166 #undef SET_FROM_SB
2168 #define SET_SB(x) mddev->sb->disks[nr].x = info->x
2170 static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info)
2172 int err, size, persistent;
2173 mdk_rdev_t *rdev;
2174 unsigned int nr;
2175 kdev_t dev;
2176 dev = MKDEV(info->major,info->minor);
2178 if (find_rdev_all(dev)) {
2179 printk("device %s already used in a RAID array!\n",
2180 partition_name(dev));
2181 return -EBUSY;
2183 if (!mddev->sb) {
2184 /* expecting a device which has a superblock */
2185 err = md_import_device(dev, 1);
2186 if (err) {
2187 printk("md error, md_import_device returned %d\n", err);
2188 return -EINVAL;
2190 rdev = find_rdev_all(dev);
2191 if (!rdev) {
2192 MD_BUG();
2193 return -EINVAL;
2195 if (mddev->nb_dev) {
2196 mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
2197 mdk_rdev_t, same_set);
2198 if (!uuid_equal(rdev0, rdev)) {
2199 printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2200 export_rdev(rdev);
2201 return -EINVAL;
2203 if (!sb_equal(rdev0->sb, rdev->sb)) {
2204 printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2205 export_rdev(rdev);
2206 return -EINVAL;
2209 bind_rdev_to_array(rdev, mddev);
2210 return 0;
2213 nr = info->number;
2214 if (nr >= mddev->sb->nr_disks)
2215 return -EINVAL;
2217 SET_SB(number);
2218 SET_SB(major);
2219 SET_SB(minor);
2220 SET_SB(raid_disk);
2221 SET_SB(state);
2223 if ((info->state & (1<<MD_DISK_FAULTY))==0) {
2224 err = md_import_device (dev, 0);
2225 if (err) {
2226 printk("md: error, md_import_device() returned %d\n", err);
2227 return -EINVAL;
2229 rdev = find_rdev_all(dev);
2230 if (!rdev) {
2231 MD_BUG();
2232 return -EINVAL;
2235 rdev->old_dev = dev;
2236 rdev->desc_nr = info->number;
2238 bind_rdev_to_array(rdev, mddev);
2240 persistent = !mddev->sb->not_persistent;
2241 if (!persistent)
2242 printk("nonpersistent superblock ...\n");
2243 if (!mddev->sb->chunk_size)
2244 printk("no chunksize?\n");
2246 size = calc_dev_size(dev, mddev, persistent);
2247 rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2249 if (!mddev->sb->size || (mddev->sb->size > size))
2250 mddev->sb->size = size;
2254 * sync all other superblocks with the main superblock
2256 sync_sbs(mddev);
2258 return 0;
2260 #undef SET_SB
2262 static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
2264 int err;
2265 mdk_rdev_t *rdev;
2266 mdp_disk_t *disk;
2268 if (!mddev->pers)
2269 return -ENODEV;
2271 printk("trying to remove %s from md%d ... \n",
2272 partition_name(dev), mdidx(mddev));
2274 if (!mddev->pers->diskop) {
2275 printk("md%d: personality does not support diskops!\n",
2276 mdidx(mddev));
2277 return -EINVAL;
2280 rdev = find_rdev(mddev, dev);
2281 if (!rdev)
2282 return -ENXIO;
2284 if (rdev->desc_nr == -1) {
2285 MD_BUG();
2286 return -EINVAL;
2288 disk = &mddev->sb->disks[rdev->desc_nr];
2289 if (disk_active(disk))
2290 goto busy;
2291 if (disk_removed(disk)) {
2292 MD_BUG();
2293 return -EINVAL;
2296 err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
2297 if (err == -EBUSY)
2298 goto busy;
2299 if (err) {
2300 MD_BUG();
2301 return -EINVAL;
2304 remove_descriptor(disk, mddev->sb);
2305 kick_rdev_from_array(rdev);
2306 mddev->sb_dirty = 1;
2307 md_update_sb(mddev);
2309 return 0;
2310 busy:
2311 printk("cannot remove active disk %s from md%d ... \n",
2312 partition_name(dev), mdidx(mddev));
2313 return -EBUSY;
2316 static int hot_add_disk (mddev_t * mddev, kdev_t dev)
2318 int i, err, persistent;
2319 unsigned int size;
2320 mdk_rdev_t *rdev;
2321 mdp_disk_t *disk;
2323 if (!mddev->pers)
2324 return -ENODEV;
2326 printk("trying to hot-add %s to md%d ... \n",
2327 partition_name(dev), mdidx(mddev));
2329 if (!mddev->pers->diskop) {
2330 printk("md%d: personality does not support diskops!\n",
2331 mdidx(mddev));
2332 return -EINVAL;
2335 persistent = !mddev->sb->not_persistent;
2336 size = calc_dev_size(dev, mddev, persistent);
2338 if (size < mddev->sb->size) {
2339 printk("md%d: disk size %d blocks < array size %d\n",
2340 mdidx(mddev), size, mddev->sb->size);
2341 return -ENOSPC;
2344 rdev = find_rdev(mddev, dev);
2345 if (rdev)
2346 return -EBUSY;
2348 err = md_import_device (dev, 0);
2349 if (err) {
2350 printk("md: error, md_import_device() returned %d\n", err);
2351 return -EINVAL;
2353 rdev = find_rdev_all(dev);
2354 if (!rdev) {
2355 MD_BUG();
2356 return -EINVAL;
2358 if (rdev->faulty) {
2359 printk("md: can not hot-add faulty %s disk to md%d!\n",
2360 partition_name(dev), mdidx(mddev));
2361 err = -EINVAL;
2362 goto abort_export;
2364 bind_rdev_to_array(rdev, mddev);
2367 * The rest should better be atomic, we can have disk failures
2368 * noticed in interrupt contexts ...
2370 rdev->old_dev = dev;
2371 rdev->size = size;
2372 rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2374 disk = mddev->sb->disks + mddev->sb->raid_disks;
2375 for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
2376 disk = mddev->sb->disks + i;
2378 if (!disk->major && !disk->minor)
2379 break;
2380 if (disk_removed(disk))
2381 break;
2383 if (i == MD_SB_DISKS) {
2384 printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
2385 err = -EBUSY;
2386 goto abort_unbind_export;
2389 if (disk_removed(disk)) {
2391 * reuse slot
2393 if (disk->number != i) {
2394 MD_BUG();
2395 err = -EINVAL;
2396 goto abort_unbind_export;
2398 } else {
2399 disk->number = i;
2402 disk->raid_disk = disk->number;
2403 disk->major = MAJOR(dev);
2404 disk->minor = MINOR(dev);
2406 if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
2407 MD_BUG();
2408 err = -EINVAL;
2409 goto abort_unbind_export;
2412 mark_disk_spare(disk);
2413 mddev->sb->nr_disks++;
2414 mddev->sb->spare_disks++;
2415 mddev->sb->working_disks++;
2417 mddev->sb_dirty = 1;
2419 md_update_sb(mddev);
2422 * Kick recovery, maybe this spare has to be added to the
2423 * array immediately.
2425 md_recover_arrays();
2427 return 0;
2429 abort_unbind_export:
2430 unbind_rdev_from_array(rdev);
2432 abort_export:
2433 export_rdev(rdev);
2434 return err;
2437 #define SET_SB(x) mddev->sb->x = info->x
2438 static int set_array_info (mddev_t * mddev, mdu_array_info_t *info)
2441 if (alloc_array_sb(mddev))
2442 return -ENOMEM;
2444 mddev->sb->major_version = MD_MAJOR_VERSION;
2445 mddev->sb->minor_version = MD_MINOR_VERSION;
2446 mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2447 mddev->sb->ctime = CURRENT_TIME;
2449 SET_SB(level);
2450 SET_SB(size);
2451 SET_SB(nr_disks);
2452 SET_SB(raid_disks);
2453 SET_SB(md_minor);
2454 SET_SB(not_persistent);
2456 SET_SB(state);
2457 SET_SB(active_disks);
2458 SET_SB(working_disks);
2459 SET_SB(failed_disks);
2460 SET_SB(spare_disks);
2462 SET_SB(layout);
2463 SET_SB(chunk_size);
2465 mddev->sb->md_magic = MD_SB_MAGIC;
2468 * Generate a 128 bit UUID
2470 get_random_bytes(&mddev->sb->set_uuid0, 4);
2471 get_random_bytes(&mddev->sb->set_uuid1, 4);
2472 get_random_bytes(&mddev->sb->set_uuid2, 4);
2473 get_random_bytes(&mddev->sb->set_uuid3, 4);
2475 return 0;
2477 #undef SET_SB
2479 static int set_disk_info (mddev_t * mddev, void * arg)
2480 {
2481 printk("not yet");
2482 return -EINVAL;
2483 }
2485 static int clear_array (mddev_t * mddev)
2486 {
2487 printk("not yet");
2488 return -EINVAL;
2489 }
2491 static int write_raid_info (mddev_t * mddev)
2492 {
2493 printk("not yet");
2494 return -EINVAL;
2495 }
2497 static int protect_array (mddev_t * mddev)
2498 {
2499 printk("not yet");
2500 return -EINVAL;
2501 }
2503 static int unprotect_array (mddev_t * mddev)
2504 {
2505 printk("not yet");
2506 return -EINVAL;
2507 }
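/*
 * SET_DISK_FAULTY support: flush outstanding IO on the array, then
 * feed the failure into md_error() exactly as an IO-detected disk
 * failure would be.
 */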
2509 static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
2511 int ret;
2513 fsync_dev(mddev_to_kdev(mddev));
2514 ret = md_error(mddev_to_kdev(mddev), dev);
2515 return ret;
2518 static int md_ioctl (struct inode *inode, struct file *file,
2519 unsigned int cmd, unsigned long arg)
2521 unsigned int minor;
2522 int err = 0;
2523 struct hd_geometry *loc = (struct hd_geometry *) arg;
2524 mddev_t *mddev = NULL;
2525 kdev_t dev;
2527 if (!md_capable_admin())
2528 return -EACCES;
2530 dev = inode->i_rdev;
2531 minor = MINOR(dev);
2532 if (minor >= MAX_MD_DEVS)
2533 return -EINVAL;
2535 /*
2536 * Commands dealing with the RAID driver but not any
2537 * particular array:
2538 */
2539 switch (cmd)
2540 {
2541 case RAID_VERSION:
2542 err = get_version((void *)arg);
2543 goto done;
2545 case PRINT_RAID_DEBUG:
2546 err = 0;
2547 md_print_devices();
2548 goto done_unlock;
2550 case BLKGETSIZE: /* Return device size */
2551 if (!arg) {
2552 err = -EINVAL;
2553 goto abort;
2554 }
2555 err = md_put_user(md_hd_struct[minor].nr_sects,
2556 (long *) arg);
2557 goto done;
2559 case BLKFLSBUF:
2560 fsync_dev(dev);
2561 invalidate_buffers(dev);
2562 goto done;
2564 case BLKRASET:
2565 if (arg > 0xff) {
2566 err = -EINVAL;
2567 goto abort;
2568 }
2569 read_ahead[MAJOR(dev)] = arg;
2570 goto done;
2572 case BLKRAGET:
2573 if (!arg) {
2574 err = -EINVAL;
2575 goto abort;
2576 }
2577 err = md_put_user (read_ahead[
2578 MAJOR(dev)], (long *) arg);
2579 goto done;
2580 default:;
2581 }
2583 /*
2584 * Commands creating/starting a new array:
2585 */
2587 mddev = kdev_to_mddev(dev);
2589 switch (cmd)
2590 {
2591 case SET_ARRAY_INFO:
2592 case START_ARRAY:
2593 if (mddev) {
2594 printk("array md%d already exists!\n",
2595 mdidx(mddev));
2596 err = -EEXIST;
2597 goto abort;
2598 }
2599 default:;
2600 }
2601 switch (cmd)
2602 {
2603 case SET_ARRAY_INFO:
2604 mddev = alloc_mddev(dev);
2605 if (!mddev) {
2606 err = -ENOMEM;
2607 goto abort;
2608 }
2609 /*
2610 * alloc_mddev() should possibly self-lock.
2611 */
2612 err = lock_mddev(mddev);
2613 if (err) {
2614 printk("ioctl, reason %d, cmd %d\n", err, cmd);
2615 goto abort;
2616 }
2618 if (mddev->sb) {
2619 printk("array md%d already has a superblock!\n",
2620 mdidx(mddev));
2621 err = -EBUSY;
2622 goto abort_unlock;
2623 }
2624 if (arg) {
2625 mdu_array_info_t info;
2626 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2627 err = -EFAULT;
2628 goto abort_unlock;
2629 }
2630 err = set_array_info(mddev, &info);
2631 if (err) {
2632 printk("couldn't set array info: %d\n", err);
2633 goto abort_unlock;
2634 }
2635 }
2636 goto done_unlock;
2638 case START_ARRAY:
2639 /*
2640 * possibly make it lock the array ...
2641 */
2642 err = autostart_array((kdev_t)arg);
2643 if (err) {
2644 printk("autostart %s failed!\n",
2645 partition_name((kdev_t)arg));
2646 goto abort;
2647 }
2648 goto done;
2650 default:;
2651 }
2653 /*
2654 * Commands querying/configuring an existing array:
2655 */
2657 if (!mddev) {
2658 err = -ENODEV;
2659 goto abort;
2660 }
2661 err = lock_mddev(mddev);
2662 if (err) {
2663 printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2664 goto abort;
2665 }
2666 /* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
2667 if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2668 err = -ENODEV;
2669 goto abort_unlock;
2670 }
2672 /*
2673 * Commands even a read-only array can execute:
2674 */
2675 switch (cmd)
2676 {
2677 case GET_ARRAY_INFO:
2678 err = get_array_info(mddev, (void *)arg);
2679 goto done_unlock;
2681 case GET_DISK_INFO:
2682 err = get_disk_info(mddev, (void *)arg);
2683 goto done_unlock;
2685 case RESTART_ARRAY_RW:
2686 err = restart_array(mddev);
2687 goto done_unlock;
2689 case STOP_ARRAY:
2690 if (!(err = do_md_stop (mddev, 0)))
2691 mddev = NULL;
2692 goto done_unlock;
2694 case STOP_ARRAY_RO:
2695 err = do_md_stop (mddev, 1);
2696 goto done_unlock;
2698 /*
2699 * We have a problem here: there is no easy way to give a CHS
2700 * virtual geometry. We currently pretend that we have a 2-head,
2701 * 4-sector geometry (with a BIG number of cylinders...). This
2702 * drives dosfs just mad... ;-)
2703 */
2704 case HDIO_GETGEO:
2705 if (!loc) {
2706 err = -EINVAL;
2707 goto abort_unlock;
2709 err = md_put_user (2, (char *) &loc->heads);
2710 if (err)
2711 goto abort_unlock;
2712 err = md_put_user (4, (char *) &loc->sectors);
2713 if (err)
2714 goto abort_unlock;
2715 err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2716 (short *) &loc->cylinders);
2717 if (err)
2718 goto abort_unlock;
2719 err = md_put_user (md_hd_struct[minor].start_sect,
2720 (long *) &loc->start);
2721 goto done_unlock;
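/*
 * Worked example of the fake geometry above: with 2 heads and 4
 * sectors, cylinders = nr_sects/8, so a 100 MB array (204800 sectors)
 * reports 2/4/25600. The cylinder count goes through a 16-bit field,
 * so very large arrays will wrap.
 */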
2724 /*
2725 * The remaining ioctls change the state of the
2726 * superblock, so we do not allow read-only arrays
2727 * here:
2728 */
2729 if (mddev->ro) {
2730 err = -EROFS;
2731 goto abort_unlock;
2732 }
2734 switch (cmd)
2735 {
2736 case CLEAR_ARRAY:
2737 err = clear_array(mddev);
2738 goto done_unlock;
2740 case ADD_NEW_DISK:
2741 {
2742 mdu_disk_info_t info;
2743 if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2744 err = -EFAULT;
2745 else
2746 err = add_new_disk(mddev, &info);
2747 goto done_unlock;
2748 }
2749 case HOT_REMOVE_DISK:
2750 err = hot_remove_disk(mddev, (kdev_t)arg);
2751 goto done_unlock;
2753 case HOT_ADD_DISK:
2754 err = hot_add_disk(mddev, (kdev_t)arg);
2755 goto done_unlock;
2757 case SET_DISK_INFO:
2758 err = set_disk_info(mddev, (void *)arg);
2759 goto done_unlock;
2761 case WRITE_RAID_INFO:
2762 err = write_raid_info(mddev);
2763 goto done_unlock;
2765 case UNPROTECT_ARRAY:
2766 err = unprotect_array(mddev);
2767 goto done_unlock;
2769 case PROTECT_ARRAY:
2770 err = protect_array(mddev);
2771 goto done_unlock;
2773 case SET_DISK_FAULTY:
2774 err = set_disk_faulty(mddev, (kdev_t)arg);
2775 goto done_unlock;
2777 case RUN_ARRAY:
2778 {
2779 /* The data is never used....
2780 mdu_param_t param;
2781 err = md_copy_from_user(&param, (mdu_param_t *)arg,
2782 sizeof(param));
2783 if (err)
2784 goto abort_unlock;
2785 */
2786 err = do_md_run (mddev);
2787 /*
2788 * we have to clean up the mess if
2789 * the array cannot be run for some
2790 * reason ...
2791 */
2792 if (err) {
2793 mddev->sb_dirty = 0;
2794 if (!do_md_stop (mddev, 0))
2795 mddev = NULL;
2796 }
2797 goto done_unlock;
2798 }
2800 default:
2801 printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ioctls.\n", current->comm, current->pid);
2802 err = -EINVAL;
2803 goto abort_unlock;
2804 }
2806 done_unlock:
2807 abort_unlock:
2808 if (mddev)
2809 unlock_mddev(mddev);
2811 return err;
2812 done:
2813 if (err)
2814 printk("huh12?\n");
2815 abort:
2816 return err;
2817 }
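/*
 * The query side, seen from userspace: a minimal sketch (device path
 * illustrative, error handling elided):
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	int fd = open("/dev/md0", O_RDONLY);
 *	mdu_array_info_t info;
 *	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *		printf("level %d, %d raid disks, %d active\n",
 *		       info.level, info.raid_disks, info.active_disks);
 */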
2819 static int md_open (struct inode *inode, struct file *file)
2820 {
2821 /*
2822 * Always succeed
2823 */
2824 return (0);
2825 }
2827 static struct block_device_operations md_fops=
2828 {
2829 open: md_open,
2830 ioctl: md_ioctl,
2831 };
2834 int md_thread(void * arg)
2836 mdk_thread_t *thread = arg;
2838 md_lock_kernel();
2839 exit_mm(current);
2840 exit_files(current);
2841 exit_fs(current);
2843 /*
2844 * Detach thread
2845 */
2846 daemonize();
2847 sprintf(current->comm, thread->name);
2848 md_init_signals();
2849 md_flush_signals();
2850 thread->tsk = current;
2852 /*
2853 * md_thread is a 'system-thread': its priority should be very
2854 * high. We avoid resource deadlocks individually in each
2855 * raid personality. (RAID5 does preallocation) We also use RR and
2856 * the very same RT priority as kswapd, thus we will never get
2857 * into a priority inversion deadlock.
2858 *
2859 * we definitely have to have equal or higher priority than
2860 * bdflush, otherwise bdflush will deadlock if there are too
2861 * many dirty RAID5 blocks.
2862 */
2863 current->policy = SCHED_OTHER;
2864 current->nice = -20;
2865 // md_unlock_kernel();
2867 up(thread->sem);
2869 for (;;) {
2870 DECLARE_WAITQUEUE(wait, current);
2872 add_wait_queue(&thread->wqueue, &wait);
2873 set_task_state(current, TASK_INTERRUPTIBLE);
2874 if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2875 dprintk("thread %p went to sleep.\n", thread);
2876 schedule();
2877 dprintk("thread %p woke up.\n", thread);
2879 current->state = TASK_RUNNING;
2880 remove_wait_queue(&thread->wqueue, &wait);
2881 clear_bit(THREAD_WAKEUP, &thread->flags);
2883 if (thread->run) {
2884 thread->run(thread->data);
2885 run_task_queue(&tq_disk);
2886 } else
2887 break;
2888 if (md_signal_pending(current)) {
2889 printk("%8s(%d) flushing signals.\n", current->comm,
2890 current->pid);
2891 md_flush_signals();
2892 }
2893 }
2894 up(thread->sem);
2895 return 0;
2896 }
2898 void md_wakeup_thread(mdk_thread_t *thread)
2900 dprintk("waking up MD thread %p.\n", thread);
2901 set_bit(THREAD_WAKEUP, &thread->flags);
2902 wake_up(&thread->wqueue);
2905 mdk_thread_t *md_register_thread (void (*run) (void *),
2906 void *data, const char *name)
2908 mdk_thread_t *thread;
2909 int ret;
2910 DECLARE_MUTEX_LOCKED(sem);
2912 thread = (mdk_thread_t *) kmalloc
2913 (sizeof(mdk_thread_t), GFP_KERNEL);
2914 if (!thread)
2915 return NULL;
2917 memset(thread, 0, sizeof(mdk_thread_t));
2918 md_init_waitqueue_head(&thread->wqueue);
2920 thread->sem = &sem;
2921 thread->run = run;
2922 thread->data = data;
2923 thread->name = name;
2924 ret = kernel_thread(md_thread, thread, 0);
2925 if (ret < 0) {
2926 kfree(thread);
2927 return NULL;
2929 down(&sem);
2930 return thread;
2931 }
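/*
 * md_register_thread/md_wakeup_thread/md_unregister_thread form a tiny
 * worker-thread API for the personalities. A minimal sketch (the
 * callback and its name are hypothetical):
 *
 *	static void my_worker (void *data)
 *	{
 *		... do one chunk of work per wakeup ...
 *	}
 *
 *	mdk_thread_t *t = md_register_thread(my_worker, mddev, "myraidd");
 *	md_wakeup_thread(t);		// my_worker(data) runs once
 *	md_unregister_thread(t);	// clears ->run, kills and waits
 */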
2933 void md_interrupt_thread (mdk_thread_t *thread)
2935 if (!thread->tsk) {
2936 MD_BUG();
2937 return;
2939 printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2940 send_sig(SIGKILL, thread->tsk, 1);
2943 void md_unregister_thread (mdk_thread_t *thread)
2945 DECLARE_MUTEX_LOCKED(sem);
2947 thread->sem = &sem;
2948 thread->run = NULL;
2949 thread->name = NULL;
2950 if (!thread->tsk) {
2951 MD_BUG();
2952 return;
2954 md_interrupt_thread(thread);
2955 down(&sem);
2958 void md_recover_arrays (void)
2960 if (!md_recovery_thread) {
2961 MD_BUG();
2962 return;
2963 }
2964 md_wakeup_thread(md_recovery_thread);
2965 }
2968 int md_error (kdev_t dev, kdev_t rdev)
2969 {
2970 mddev_t *mddev;
2971 mdk_rdev_t * rrdev;
2972 int rc;
2974 mddev = kdev_to_mddev(dev);
2975 /* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
2977 if (!mddev) {
2978 MD_BUG();
2979 return 0;
2980 }
2981 rrdev = find_rdev(mddev, rdev);
2982 mark_rdev_faulty(rrdev);
2983 /*
2984 * if recovery was running, stop it now.
2985 */
2986 if (mddev->pers->stop_resync)
2987 mddev->pers->stop_resync(mddev);
2988 if (mddev->recovery_running)
2989 md_interrupt_thread(md_recovery_thread);
2990 if (mddev->pers->error_handler) {
2991 rc = mddev->pers->error_handler(mddev, rdev);
2992 md_recover_arrays();
2993 return rc;
2994 }
2995 return 0;
2996 }
2998 static int status_unused (char * page)
3000 int sz = 0, i = 0;
3001 mdk_rdev_t *rdev;
3002 struct md_list_head *tmp;
3004 sz += sprintf(page + sz, "unused devices: ");
3006 ITERATE_RDEV_ALL(rdev,tmp) {
3007 if (!rdev->same_set.next && !rdev->same_set.prev) {
3008 /*
3009 * The device is not yet used by any array.
3010 */
3011 i++;
3012 sz += sprintf(page + sz, "%s ",
3013 partition_name(rdev->dev));
3016 if (!i)
3017 sz += sprintf(page + sz, "<none>");
3019 sz += sprintf(page + sz, "\n");
3020 return sz;
3024 static int status_resync (char * page, mddev_t * mddev)
3026 int sz = 0;
3027 unsigned long max_blocks, resync, res, dt, db, rt;
3029 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
3030 max_blocks = mddev->sb->size;
3032 /*
3033 * Should not happen.
3034 */
3035 if (!max_blocks) {
3036 MD_BUG();
3037 return 0;
3039 res = (resync/1024)*1000/(max_blocks/1024 + 1);
3040 {
3041 int i, x = res/50, y = 20-x;
3042 sz += sprintf(page + sz, "[");
3043 for (i = 0; i < x; i++)
3044 sz += sprintf(page + sz, "=");
3045 sz += sprintf(page + sz, ">");
3046 for (i = 0; i < y; i++)
3047 sz += sprintf(page + sz, ".");
3048 sz += sprintf(page + sz, "] ");
3049 }
3050 if (!mddev->recovery_running)
3051 /*
3052 * true resync
3053 */
3054 sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
3055 res/10, res % 10, resync, max_blocks);
3056 else
3057 /*
3058 * recovery ...
3059 */
3060 sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
3061 res/10, res % 10, resync, max_blocks);
3063 /*
3064 * We do not want to overflow, so the order of operands and
3065 * the * 100 / 100 trick are important. We do a +1 to be
3066 * safe against division by zero. We only estimate anyway.
3067 *
3068 * dt: time from mark until now
3069 * db: blocks written from mark until now
3070 * rt: remaining time
3071 */
3072 dt = ((jiffies - mddev->resync_mark) / HZ);
3073 if (!dt) dt++;
3074 db = resync - mddev->resync_mark_cnt;
3075 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3077 sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3079 sz += sprintf(page + sz, " speed=%ldK/sec", db/dt);
3081 return sz;
3082 }
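/*
 * The resulting /proc/mdstat fragment looks like this (values are
 * illustrative; all counts are 1K blocks):
 *
 *	[=====>...............]  resync =27.3% (1396152/5116416)
 *	finish=12.1min speed=5120K/sec
 */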
3084 static int md_status_read_proc(char *page, char **start, off_t off,
3085 int count, int *eof, void *data)
3087 int sz = 0, j, size;
3088 struct md_list_head *tmp, *tmp2;
3089 mdk_rdev_t *rdev;
3090 mddev_t *mddev;
3092 sz += sprintf(page + sz, "Personalities : ");
3093 for (j = 0; j < MAX_PERSONALITY; j++)
3094 if (pers[j])
3095 sz += sprintf(page+sz, "[%s] ", pers[j]->name);
3097 sz += sprintf(page+sz, "\n");
3100 sz += sprintf(page+sz, "read_ahead ");
3101 if (read_ahead[MD_MAJOR] == INT_MAX)
3102 sz += sprintf(page+sz, "not set\n");
3103 else
3104 sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
3106 ITERATE_MDDEV(mddev,tmp) {
3107 sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
3108 mddev->pers ? "" : "in");
3109 if (mddev->pers) {
3110 if (mddev->ro)
3111 sz += sprintf(page + sz, " (read-only)");
3112 sz += sprintf(page + sz, " %s", mddev->pers->name);
3115 size = 0;
3116 ITERATE_RDEV(mddev,rdev,tmp2) {
3117 sz += sprintf(page + sz, " %s[%d]",
3118 partition_name(rdev->dev), rdev->desc_nr);
3119 if (rdev->faulty) {
3120 sz += sprintf(page + sz, "(F)");
3121 continue;
3123 size += rdev->size;
3126 if (mddev->nb_dev) {
3127 if (mddev->pers)
3128 sz += sprintf(page + sz, "\n %d blocks",
3129 md_size[mdidx(mddev)]);
3130 else
3131 sz += sprintf(page + sz, "\n %d blocks", size);
3134 if (!mddev->pers) {
3135 sz += sprintf(page+sz, "\n");
3136 continue;
3139 sz += mddev->pers->status (page+sz, mddev);
3141 sz += sprintf(page+sz, "\n ");
3142 if (mddev->curr_resync) {
3143 sz += status_resync (page+sz, mddev);
3144 } else {
3145 if (md_atomic_read(&mddev->resync_sem.count) != 1)
3146 sz += sprintf(page + sz, " resync=DELAYED");
3148 sz += sprintf(page + sz, "\n");
3150 sz += status_unused (page + sz);
3152 return sz;
3153 }
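/*
 * Altogether, /proc/mdstat output looks roughly like the following
 * (values illustrative; the personality's own status() method appends
 * its detail, e.g. "[2/2] [UU]" for raid1):
 *
 *	Personalities : [raid1]
 *	read_ahead 1024 sectors
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      8891328 blocks [2/2] [UU]
 *	unused devices: <none>
 */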
3155 int register_md_personality (int pnum, mdk_personality_t *p)
3157 if (pnum >= MAX_PERSONALITY)
3158 return -EINVAL;
3160 if (pers[pnum])
3161 return -EBUSY;
3163 pers[pnum] = p;
3164 printk(KERN_INFO "%s personality registered\n", p->name);
3165 return 0;
3168 int unregister_md_personality (int pnum)
3170 if (pnum >= MAX_PERSONALITY)
3171 return -EINVAL;
3173 printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
3174 pers[pnum] = NULL;
3175 return 0;
3176 }
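/*
 * A personality module hooks itself in with the pair above; a minimal
 * sketch (MYRAID and the structure contents are hypothetical):
 *
 *	static mdk_personality_t myraid_personality = { ... };
 *
 *	int myraid_init (void)
 *	{
 *		return register_md_personality(MYRAID, &myraid_personality);
 *	}
 *
 *	void myraid_cleanup (void)
 *	{
 *		unregister_md_personality(MYRAID);
 *	}
 */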
3178 static mdp_disk_t *get_spare(mddev_t *mddev)
3180 mdp_super_t *sb = mddev->sb;
3181 mdp_disk_t *disk;
3182 mdk_rdev_t *rdev;
3183 struct md_list_head *tmp;
3185 ITERATE_RDEV(mddev,rdev,tmp) {
3186 if (rdev->faulty)
3187 continue;
3188 if (!rdev->sb) {
3189 MD_BUG();
3190 continue;
3192 disk = &sb->disks[rdev->desc_nr];
3193 if (disk_faulty(disk)) {
3194 MD_BUG();
3195 continue;
3197 if (disk_active(disk))
3198 continue;
3199 return disk;
3201 return NULL;
3204 static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
3205 void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
3207 unsigned int major = MAJOR(dev);
3208 unsigned int index;
3210 index = disk_index(dev);
3211 if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3212 return;
3214 sync_io[major][index] += nr_sectors;
3217 static int is_mddev_idle (mddev_t *mddev)
3219 mdk_rdev_t * rdev;
3220 struct md_list_head *tmp;
3221 int idle;
3222 unsigned long curr_events;
3224 idle = 1;
3225 ITERATE_RDEV(mddev,rdev,tmp) {
3226 int major = MAJOR(rdev->dev);
3227 int idx = disk_index(rdev->dev);
3229 if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3230 continue;
3232 curr_events = kstat.dk_drive_rblk[major][idx] +
3233 kstat.dk_drive_wblk[major][idx] ;
3234 curr_events -= sync_io[major][idx];
3235 // printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3236 if (curr_events != rdev->last_events) {
3237 // printk("!I(%ld)", curr_events - rdev->last_events);
3238 rdev->last_events = curr_events;
3239 idle = 0;
3242 return idle;
3243 }
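/*
 * Why this works: kstat's dk_drive_rblk/dk_drive_wblk count every
 * block moved on a component disk, while sync_io[][] (fed by
 * md_sync_acct above) counts only resync-generated IO. If the
 * difference changed since the last poll, somebody other than the
 * resync is using the disks, so the array is reported non-idle and
 * md_do_sync throttles down to the guaranteed minimum speed.
 */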
3245 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3247 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3249 /* another "blocks" (1K) blocks have been synced */
3250 atomic_sub(blocks, &mddev->recovery_active);
3251 wake_up(&mddev->recovery_wait);
3252 if (!ok) {
3253 // stop recovery, signal do_sync ....
3254 }
3255 }
3257 #define SYNC_MARKS 10
3258 #define SYNC_MARK_STEP (3*HZ)
3259 int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
3261 mddev_t *mddev2;
3262 unsigned int max_blocks, currspeed,
3263 j, window, err, serialize;
3264 kdev_t read_disk = mddev_to_kdev(mddev);
3265 unsigned long mark[SYNC_MARKS];
3266 unsigned long mark_cnt[SYNC_MARKS];
3267 int last_mark,m;
3268 struct md_list_head *tmp;
3269 unsigned long last_check;
3272 err = down_interruptible(&mddev->resync_sem);
3273 if (err)
3274 goto out_nolock;
3276 recheck:
3277 serialize = 0;
3278 ITERATE_MDDEV(mddev2,tmp) {
3279 if (mddev2 == mddev)
3280 continue;
3281 if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
3282 printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
3283 serialize = 1;
3284 break;
3287 if (serialize) {
3288 interruptible_sleep_on(&resync_wait);
3289 if (md_signal_pending(current)) {
3290 md_flush_signals();
3291 err = -EINTR;
3292 goto out;
3294 goto recheck;
3297 mddev->curr_resync = 1;
3299 max_blocks = mddev->sb->size;
3301 printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
3302 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3303 sysctl_speed_limit_min);
3304 printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max);
3306 /*
3307 * Resync has low priority.
3308 */
3309 current->nice = 19;
3311 is_mddev_idle(mddev); /* this also initializes IO event counters */
3312 for (m = 0; m < SYNC_MARKS; m++) {
3313 mark[m] = jiffies;
3314 mark_cnt[m] = 0;
3316 last_mark = 0;
3317 mddev->resync_mark = mark[last_mark];
3318 mddev->resync_mark_cnt = mark_cnt[last_mark];
3320 /*
3321 * Tune reconstruction:
3322 */
3323 window = MAX_READAHEAD*(PAGE_SIZE/1024);
3324 printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks);
3326 atomic_set(&mddev->recovery_active, 0);
3327 init_waitqueue_head(&mddev->recovery_wait);
3328 last_check = 0;
3329 for (j = 0; j < max_blocks;) {
3330 int blocks;
3332 blocks = mddev->pers->sync_request(mddev, j);
3334 if (blocks < 0) {
3335 err = blocks;
3336 goto out;
3337 }
3338 atomic_add(blocks, &mddev->recovery_active);
3339 j += blocks;
3340 mddev->curr_resync = j;
3342 if (last_check + window > j)
3343 continue;
3345 run_task_queue(&tq_disk); //??
3347 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
3348 /* step marks */
3349 int next = (last_mark+1) % SYNC_MARKS;
3351 mddev->resync_mark = mark[next];
3352 mddev->resync_mark_cnt = mark_cnt[next];
3353 mark[next] = jiffies;
3354 mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3355 last_mark = next;
3356 }
3359 if (md_signal_pending(current)) {
3360 /*
3361 * got a signal, exit.
3362 */
3363 mddev->curr_resync = 0;
3364 printk("md_do_sync() got signal ... exiting\n");
3365 md_flush_signals();
3366 err = -EINTR;
3367 goto out;
3368 }
3370 /*
3371 * this loop exits only when we are slower than
3372 * the 'hard' speed limit, or when the system was IO-idle for
3373 * a jiffy.
3374 * the system might be non-idle CPU-wise, but we only care
3375 * about not overloading the IO subsystem. (things like an
3376 * e2fsck being done on the RAID array should execute fast)
3377 */
3378 repeat:
3379 if (md_need_resched(current))
3380 schedule();
3382 currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1;
3384 if (currspeed > sysctl_speed_limit_min) {
3385 current->nice = 19;
3387 if ((currspeed > sysctl_speed_limit_max) ||
3388 !is_mddev_idle(mddev)) {
3389 current->state = TASK_INTERRUPTIBLE;
3390 md_schedule_timeout(HZ/4);
3391 if (!md_signal_pending(current))
3392 goto repeat;
3393 }
3394 } else
3395 current->nice = -20;
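/*
 * Concretely: currspeed is 1K blocks per second, averaged over the
 * mark window (at most SYNC_MARKS*SYNC_MARK_STEP = 30 seconds). With
 * the default sysctl limits above, resync naps in HZ/4 steps while
 * faster than 100000 KB/sec or while other IO is in flight, and only
 * reclaims full priority at or below 100 KB/sec.
 */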
3397 fsync_dev(read_disk);
3398 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3399 err = 0;
3400 /*
3401 * this also signals 'finished resyncing' to md_stop
3402 */
3403 out:
3404 wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
3405 up(&mddev->resync_sem);
3406 out_nolock:
3407 mddev->curr_resync = 0;
3408 wake_up(&resync_wait);
3409 return err;
3410 }
3413 /*
3414 * This is a kernel thread which syncs a spare disk with the active array.
3415 *
3416 * the amount of foolproofing might seem to be a tad excessive, but an
3417 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3418 * of my root partition with the first 0.5 gigs of my /home partition ... so
3419 * I'm a bit nervous ;)
3420 */
3421 void md_do_recovery (void *data)
3423 int err;
3424 mddev_t *mddev;
3425 mdp_super_t *sb;
3426 mdp_disk_t *spare;
3427 struct md_list_head *tmp;
3429 printk(KERN_INFO "md: recovery thread got woken up ...\n");
3430 restart:
3431 ITERATE_MDDEV(mddev,tmp) {
3432 sb = mddev->sb;
3433 if (!sb)
3434 continue;
3435 if (mddev->recovery_running)
3436 continue;
3437 if (sb->active_disks == sb->raid_disks)
3438 continue;
3439 if (!sb->spare_disks) {
3440 printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
3441 continue;
3442 }
3443 /*
3444 * now here we get the spare and resync it.
3445 */
3446 if ((spare = get_spare(mddev)) == NULL)
3447 continue;
3448 printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3449 if (!mddev->pers->diskop)
3450 continue;
3451 if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
3452 continue;
3453 down(&mddev->recovery_sem);
3454 mddev->recovery_running = 1;
3455 err = md_do_sync(mddev, spare);
3456 if (err == -EIO) {
3457 printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3458 if (!disk_faulty(spare)) {
3459 mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
3460 mark_disk_faulty(spare);
3461 mark_disk_nonsync(spare);
3462 mark_disk_inactive(spare);
3463 sb->spare_disks--;
3464 sb->working_disks--;
3465 sb->failed_disks++;
3466 }
3467 } else
3468 if (disk_faulty(spare))
3469 mddev->pers->diskop(mddev, &spare,
3470 DISKOP_SPARE_INACTIVE);
3471 if (err == -EINTR || err == -ENOMEM) {
3472 /*
3473 * Recovery got interrupted, or ran out of mem ...
3474 * signal back that we have finished using the array.
3475 */
3476 mddev->pers->diskop(mddev, &spare,
3477 DISKOP_SPARE_INACTIVE);
3478 up(&mddev->recovery_sem);
3479 mddev->recovery_running = 0;
3480 continue;
3481 } else {
3482 mddev->recovery_running = 0;
3483 up(&mddev->recovery_sem);
3484 }
3485 if (!disk_faulty(spare)) {
3486 /*
3487 * the SPARE_ACTIVE diskop possibly changes the
3488 * pointer too
3489 */
3490 mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
3491 mark_disk_sync(spare);
3492 mark_disk_active(spare);
3493 sb->active_disks++;
3494 sb->spare_disks--;
3495 }
3496 mddev->sb_dirty = 1;
3497 md_update_sb(mddev);
3498 goto restart;
3499 }
3500 printk(KERN_INFO "md: recovery thread finished ...\n");
3501 }
3504 int md_notify_reboot(struct notifier_block *this,
3505 unsigned long code, void *x)
3506 {
3507 struct md_list_head *tmp;
3508 mddev_t *mddev;
3510 if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3511 || (code == MD_SYS_POWER_OFF)) {
3513 printk(KERN_INFO "stopping all md devices.\n");
3515 ITERATE_MDDEV(mddev,tmp)
3516 do_md_stop (mddev, 1);
3517 /*
3518 * certain more exotic SCSI devices are known to be
3519 * volatile wrt too early system reboots. While the
3520 * right place to handle this issue is the given
3521 * driver, we do want to have a safe RAID driver ...
3522 */
3523 md_mdelay(1000*1);
3524 }
3525 return NOTIFY_DONE;
3526 }
3528 struct notifier_block md_notifier = {
3529 md_notify_reboot,
3530 NULL,
3531 0
3532 };
3533 #ifndef MODULE
3534 static int md__init raid_setup(char *str)
3536 int len, pos;
3538 len = strlen(str) + 1;
3539 pos = 0;
3541 while (pos < len) {
3542 char *comma = strchr(str+pos, ',');
3543 int wlen;
3544 if (comma)
3545 wlen = (comma-str)-pos;
3546 else wlen = (len-1)-pos;
3548 if (strncmp(str+pos, "noautodetect", wlen) == 0)
3549 raid_setup_args.noautodetect = 1;
3550 pos += wlen+1;
3552 raid_setup_args.set = 1;
3553 return 1;
3554 }
3555 __setup("raid=", raid_setup);
3556 #endif
3557 static void md_geninit (void)
3559 int i;
3561 for(i = 0; i < MAX_MD_DEVS; i++) {
3562 md_blocksizes[i] = 1024;
3563 md_size[i] = 0;
3564 md_hardsect_sizes[i] = 512;
3565 md_maxreadahead[i] = MD_READAHEAD;
3566 register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0);
3567 }
3568 blksize_size[MAJOR_NR] = md_blocksizes;
3569 blk_size[MAJOR_NR] = md_size;
3570 max_readahead[MAJOR_NR] = md_maxreadahead;
3571 hardsect_size[MAJOR_NR] = md_hardsect_sizes;
3573 printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3575 #ifdef CONFIG_PROC_FS
3576 create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
3577 #endif
3578 }
3579 void hsm_init (void);
3580 void translucent_init (void);
3581 void linear_init (void);
3582 void raid0_init (void);
3583 void raid1_init (void);
3584 void raid5_init (void);
3586 int md__init md_init (void)
3588 static char * name = "mdrecoveryd";
3590 printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
3591 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3592 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
3594 if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
3595 {
3596 printk (KERN_ALERT "Unable to get major %d for md\n", MAJOR_NR);
3597 return (-1);
3598 }
3599 devfs_handle = devfs_mk_dir (NULL, "md", NULL);
3600 devfs_register_series (devfs_handle, "%u",MAX_MD_DEVS,DEVFS_FL_DEFAULT,
3601 MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR,
3602 &md_fops, NULL);
3604 /* forward all md request to md_make_request */
3605 blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
3608 read_ahead[MAJOR_NR] = INT_MAX;
3609 md_gendisk.next = gendisk_head;
3611 gendisk_head = &md_gendisk;
3613 md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3614 if (!md_recovery_thread)
3615 printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
3617 md_register_reboot_notifier(&md_notifier);
3618 raid_table_header = register_sysctl_table(raid_root_table, 1);
3620 #ifdef CONFIG_MD_LINEAR
3621 linear_init ();
3622 #endif
3623 #ifdef CONFIG_MD_RAID0
3624 raid0_init ();
3625 #endif
3626 #ifdef CONFIG_MD_RAID1
3627 raid1_init ();
3628 #endif
3629 #ifdef CONFIG_MD_RAID5
3630 raid5_init ();
3631 #endif
3632 md_geninit();
3633 return (0);
3634 }
3636 #ifdef CONFIG_MD_BOOT
3637 #define MAX_MD_BOOT_DEVS 8
3638 struct {
3639 unsigned long set;
3640 int pers[MAX_MD_BOOT_DEVS];
3641 int chunk[MAX_MD_BOOT_DEVS];
3642 kdev_t devices[MAX_MD_BOOT_DEVS][MAX_REAL];
3643 } md_setup_args md__initdata = { 0, };
3645 /*
3646 * Parse the command-line parameters given to our kernel, but do not
3647 * actually try to invoke the MD device now; that is handled by
3648 * md_setup_drive after the low-level disk drivers have initialised.
3649 *
3650 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3651 * assigns the task of parsing integer arguments to the
3652 * invoked program now). Added ability to initialise all
3653 * the MD devices (by specifying multiple "md=" lines)
3654 * instead of just one. -- KTK
3655 * 18May2000: Added support for persistent-superblock arrays:
3656 * md=n,0,factor,fault,device-list uses RAID0 for device n
3657 * md=n,-1,factor,fault,device-list uses LINEAR for device n
3658 * md=n,device-list reads a RAID superblock from the devices
3659 * elements in device-list are read by name_to_kdev_t so can be
3660 * a hex number or something like /dev/hda1 /dev/sdb
3661 */
3662 extern kdev_t name_to_kdev_t(char *line) md__init;
3663 static int md__init md_setup(char *str)
3664 {
3665 int minor, level, factor, fault, i=0;
3666 kdev_t device;
3667 char *devnames, *pername = "";
3669 if(get_option(&str, &minor) != 2) { /* MD Number */
3670 printk("md: Too few arguments supplied to md=.\n");
3671 return 0;
3673 if (minor >= MAX_MD_BOOT_DEVS) {
3674 printk ("md: Minor device number too high.\n");
3675 return 0;
3676 } else if (md_setup_args.set & (1 << minor)) {
3677 printk ("md: Warning - md=%d,... has been specified twice;\n"
3678 " will discard the first definition.\n", minor);
3680 switch(get_option(&str, &level)) { /* RAID Personality */
3681 case 2: /* could be 0 or -1.. */
3682 if (level == 0 || level == -1) {
3683 if (get_option(&str, &factor) != 2 || /* Chunk Size */
3684 get_option(&str, &fault) != 2) {
3685 printk("md: Too few arguments supplied to md=.\n");
3686 return 0;
3688 md_setup_args.pers[minor] = level;
3689 md_setup_args.chunk[minor] = 1 << (factor+12);
3690 switch(level) {
3691 case -1:
3692 level = LINEAR;
3693 pername = "linear";
3694 break;
3695 case 0:
3696 level = RAID0;
3697 pername = "raid0";
3698 break;
3699 default:
3700 printk ("md: The kernel has not been configured for raid%d"
3701 " support!\n", level);
3702 return 0;
3704 md_setup_args.pers[minor] = level;
3705 break;
3707 /* FALL THROUGH */
3708 case 1: /* the first device is numeric */
3709 md_setup_args.devices[minor][i++] = level;
3710 /* FALL THROUGH */
3711 case 0:
3712 md_setup_args.pers[minor] = 0;
3713 pername="super-block";
3715 devnames = str;
3716 for (; i<MAX_REAL && str; i++) {
3717 if ((device = name_to_kdev_t(str))) {
3718 md_setup_args.devices[minor][i] = device;
3719 } else {
3720 printk ("md: Unknown device name, %s.\n", str);
3721 return 0;
3723 if ((str = strchr(str, ',')) != NULL)
3724 str++;
3726 if (!i) {
3727 printk ("md: No devices specified for md%d?\n", minor);
3728 return 0;
3731 printk ("md: Will configure md%d (%s) from %s, below.\n",
3732 minor, pername, devnames);
3733 md_setup_args.devices[minor][i] = (kdev_t) 0;
3734 md_setup_args.set |= (1 << minor);
3735 return 1;
3736 }
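/*
 * Illustrative boot lines accepted by the parser above:
 *
 *	md=0,0,4,0,/dev/hda1,/dev/hdb1	RAID0 md0, chunk 1<<(4+12) = 64K
 *	md=1,-1,0,0,/dev/hdc1,/dev/hdd1	LINEAR md1
 *	md=2,/dev/sda1,/dev/sdb1	md2 from persistent superblocks
 */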
3738 void md__init md_setup_drive(void)
3740 int minor, i;
3741 kdev_t dev;
3742 mddev_t*mddev;
3744 for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) {
3745 mdu_disk_info_t dinfo;
3746 int err=0;
3747 if (!(md_setup_args.set & (1 << minor)))
3748 continue;
3749 printk("md: Loading md%d.\n", minor);
3750 if (mddev_map[minor].mddev) {
3751 printk(".. md%d already autodetected - use raid=noautodetect\n", minor);
3752 continue;
3753 }
3754 mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3755 if (md_setup_args.pers[minor]) {
3756 /* non-persistent */
3757 mdu_array_info_t ainfo;
3758 ainfo.level = pers_to_level(md_setup_args.pers[minor]);
3759 ainfo.size = 0;
3760 ainfo.nr_disks =0;
3761 ainfo.raid_disks =0;
3762 ainfo.md_minor =minor;
3763 ainfo.not_persistent = 1;
3765 ainfo.state = MD_SB_CLEAN;
3766 ainfo.active_disks = 0;
3767 ainfo.working_disks = 0;
3768 ainfo.failed_disks = 0;
3769 ainfo.spare_disks = 0;
3770 ainfo.layout = 0;
3771 ainfo.chunk_size = md_setup_args.chunk[minor];
3772 err = set_array_info(mddev, &ainfo);
3773 for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) {
3774 dinfo.number = i;
3775 dinfo.raid_disk = i;
3776 dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
3777 dinfo.major = MAJOR(dev);
3778 dinfo.minor = MINOR(dev);
3779 mddev->sb->nr_disks++;
3780 mddev->sb->raid_disks++;
3781 mddev->sb->active_disks++;
3782 mddev->sb->working_disks++;
3783 err = add_new_disk (mddev, &dinfo);
3784 }
3785 } else {
3786 /* persistent */
3787 for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) {
3788 dinfo.major = MAJOR(dev);
3789 dinfo.minor = MINOR(dev);
3790 add_new_disk (mddev, &dinfo);
3791 }
3792 }
3793 if (!err)
3794 err = do_md_run(mddev);
3795 if (err) {
3796 mddev->sb_dirty = 0;
3797 do_md_stop(mddev, 0);
3798 printk("md: starting md%d failed\n", minor);
3803 __setup("md=", md_setup);
3804 #endif
3806 #ifdef MODULE
3807 int init_module (void)
3809 return md_init();
3810 }
3812 static void free_device_names(void)
3813 {
3814 while (device_names.next != &device_names) {
3815 struct list_head *tmp = device_names.next;
3816 list_del(tmp);
3817 kfree(tmp);
3818 }
3819 }
3822 void cleanup_module (void)
3823 {
3824 struct gendisk **gendisk_ptr;
3826 md_unregister_thread(md_recovery_thread);
3827 devfs_unregister(devfs_handle);
3829 devfs_unregister_blkdev(MAJOR_NR,"md");
3830 unregister_reboot_notifier(&md_notifier);
3831 unregister_sysctl_table(raid_table_header);
3832 #ifdef CONFIG_PROC_FS
3833 remove_proc_entry("mdstat", NULL);
3834 #endif
3836 gendisk_ptr = &gendisk_head;
3837 while (*gendisk_ptr) {
3838 if (*gendisk_ptr == &md_gendisk) {
3839 *gendisk_ptr = md_gendisk.next;
3840 break;
3841 }
3842 gendisk_ptr = & (*gendisk_ptr)->next;
3843 }
3844 blk_dev[MAJOR_NR].queue = NULL;
3845 blksize_size[MAJOR_NR] = NULL;
3846 blk_size[MAJOR_NR] = NULL;
3847 max_readahead[MAJOR_NR] = NULL;
3848 hardsect_size[MAJOR_NR] = NULL;
3850 free_device_names();
3851 }
3853 #endif
3855 __initcall(md_init);
3856 #ifdef CONFIG_AUTODETECT_RAID
3857 __initcall(md_run_setup);
3858 #endif
3860 MD_EXPORT_SYMBOL(md_size);
3861 MD_EXPORT_SYMBOL(register_md_personality);
3862 MD_EXPORT_SYMBOL(unregister_md_personality);
3863 MD_EXPORT_SYMBOL(partition_name);
3864 MD_EXPORT_SYMBOL(md_error);
3865 MD_EXPORT_SYMBOL(md_do_sync);
3866 MD_EXPORT_SYMBOL(md_sync_acct);
3867 MD_EXPORT_SYMBOL(md_done_sync);
3868 MD_EXPORT_SYMBOL(md_recover_arrays);
3869 MD_EXPORT_SYMBOL(md_register_thread);
3870 MD_EXPORT_SYMBOL(md_unregister_thread);
3871 MD_EXPORT_SYMBOL(md_update_sb);
3872 MD_EXPORT_SYMBOL(md_wakeup_thread);
3873 MD_EXPORT_SYMBOL(md_print_devices);
3874 MD_EXPORT_SYMBOL(find_rdev_nr);
3875 MD_EXPORT_SYMBOL(md_interrupt_thread);
3876 MD_EXPORT_SYMBOL(mddev_map);
3877 MD_EXPORT_SYMBOL(md_check_ordering);