/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/config.h>
#include <linux/raid/md.h>
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#include <asm/unaligned.h>

extern asmlinkage int sys_sched_yield(void);
extern asmlinkage long sys_setsid(void);

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>

#define DEBUG 0
#if DEBUG
# define dprintk(x...) printk(x)
#else
# define dprintk(x...) do { } while(0)
#endif
static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, };

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 100 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */

static int sysctl_speed_limit_min = 100;
static int sysctl_speed_limit_max = 100000;
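
/*
 * The tables below wire the two limits into the sysctl tree as
 * /proc/sys/dev/raid/speed_limit_{min,max}: raid_root_table provides
 * the "dev" directory, raid_dir_table the "raid" subdirectory, and
 * raid_table the two integer entries, each read and written through
 * proc_dointvec().
 */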
static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
	{DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
	{0}
};

static ctl_table raid_dir_table[] = {
	{DEV_RAID, "raid", NULL, 0, 0555, raid_table},
	{0}
};

static ctl_table raid_root_table[] = {
	{CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
	{0}
};

/*
 * these have to be allocated separately because external
 * subsystems want to have a pre-defined structure
 */
struct hd_struct md_hd_struct[MAX_MD_DEVS];
static int md_blocksizes[MAX_MD_DEVS];
static int md_maxreadahead[MAX_MD_DEVS];
static mdk_thread_t *md_recovery_thread = NULL;

int md_size[MAX_MD_DEVS] = {0, };

extern struct block_device_operations md_fops;
static devfs_handle_t devfs_handle = NULL;

static struct gendisk md_gendisk=
{
	major: MD_MAJOR,
	major_name: "md",
	minor_shift: 0,
	max_p: 1,
	part: md_hd_struct,
	sizes: md_size,
	nr_real: MAX_MD_DEVS,
	real_devices: NULL,
	next: NULL,
	fops: &md_fops,
};
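
/*
 * 2.4-style queue plugging: the first md_plug_device() call on a
 * plugged cycle sets the queue's plug bit, unplugs the component
 * queues, and schedules md_unplug_device() on the tq_disk task queue;
 * when tq_disk runs, md_unplug_device() clears the plug bit and
 * unplugs the component queues again so batched requests move on.
 */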
void md_plug_device (request_queue_t *mdqueue, kdev_t dev)
{
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;
	request_queue_t *q;
	mddev_t *mddev;

	if (!md_test_and_set_bit(0, (atomic_t *)&mdqueue->plugged)) {
		mddev = kdev_to_mddev(dev);
		ITERATE_RDEV(mddev,rdev,tmp) {
			q = blk_get_queue(rdev->dev);
			generic_unplug_device(q);
		}
		queue_task(&mdqueue->plug_tq, &tq_disk);
	}
}

static void md_unplug_device (void * data)
{
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;
	mddev_t *mddev = (mddev_t *)data;
	request_queue_t *mdqueue = &mddev->queue, *q;

	clear_bit(0, (atomic_t *)&mdqueue->plugged);
	ITERATE_RDEV(mddev,rdev,tmp) {
		q = blk_get_queue(rdev->dev);
		generic_unplug_device(q);
	}
}

/*
 * Enables us to iterate over all existing md arrays
 */
static MD_LIST_HEAD(all_mddevs);

/*
 * The mapping between kdev and mddev is not necessarily a simple
 * one! Eg. HSM uses several sub-devices to implement Logical
 * Volumes. All these sub-devices map to the same mddev.
 */
dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, };
void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
{
	unsigned int minor = MINOR(dev);

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return;
	}
	if (mddev_map[minor].mddev != NULL) {
		MD_BUG();
		return;
	}
	mddev_map[minor].mddev = mddev;
	mddev_map[minor].data = data;
}

void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
{
	unsigned int minor = MINOR(dev);

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return;
	}
	if (mddev_map[minor].mddev != mddev) {
		MD_BUG();
		return;
	}
	mddev_map[minor].mddev = NULL;
	mddev_map[minor].data = NULL;
}

static request_queue_t *md_get_queue (kdev_t dev)
{
	mddev_t *mddev = kdev_to_mddev(dev);

	if (!mddev)
		return NULL;
	return &mddev->queue;
}
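
/*
 * md never uses a conventional request queue: md_make_request() below
 * routes each buffer_head directly to the personality's own
 * make_request method, so do_md_request() should never be reached and
 * BUG()s if the block layer ever calls it.
 */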
static void do_md_request (request_queue_t * q)
{
	printk(KERN_ALERT "Got md request, not good...");
	BUG();
	return;
}

static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
{
	mddev_t *mddev = kdev_to_mddev(bh->b_rdev);

	if (mddev && mddev->pers)
		return mddev->pers->make_request(q, mddev, rw, bh);
	else {
		buffer_IO_error(bh);
		return -1;
	}
}

static mddev_t * alloc_mddev (kdev_t dev)
{
	request_queue_t *q;
	mddev_t *mddev;

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return 0;
	}
	mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
	if (!mddev)
		return NULL;

	memset(mddev, 0, sizeof(*mddev));

	mddev->__minor = MINOR(dev);
	init_MUTEX(&mddev->reconfig_sem);
	init_MUTEX(&mddev->recovery_sem);
	init_MUTEX(&mddev->resync_sem);
	MD_INIT_LIST_HEAD(&mddev->disks);
	MD_INIT_LIST_HEAD(&mddev->all_mddevs);

	q = &mddev->queue;
	blk_init_queue(q, DEVICE_REQUEST);
	blk_queue_pluggable(q, md_plug_device);
	blk_queue_make_request(q, md_make_request);

	q->plug_tq.sync = 0;
	q->plug_tq.routine = &md_unplug_device;
	q->plug_tq.data = mddev;

	/*
	 * The 'base' mddev is the one with data NULL.
	 * personalities can create additional mddevs
	 * if necessary.
	 */
	add_mddev_mapping(mddev, dev, 0);
	md_list_add(&mddev->all_mddevs, &all_mddevs);

	return mddev;
}
struct gendisk * find_gendisk (kdev_t dev)
{
	struct gendisk *tmp = gendisk_head;

	while (tmp != NULL) {
		if (tmp->major == MAJOR(dev))
			return (tmp);
		tmp = tmp->next;
	}
	return (NULL);
}

mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == nr)
			return rdev;
	}
	return NULL;
}

mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->dev == dev)
			return rdev;
	}
	return NULL;
}

static MD_LIST_HEAD(device_names);

char * partition_name (kdev_t dev)
{
	struct gendisk *hd;
	static char nomem [] = "<nomem>";
	dev_name_t *dname;
	struct md_list_head *tmp = device_names.next;

	while (tmp != &device_names) {
		dname = md_list_entry(tmp, dev_name_t, list);
		if (dname->dev == dev)
			return dname->name;
		tmp = tmp->next;
	}

	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);

	if (!dname)
		return nomem;
	/*
	 * ok, add this new device name to the list
	 */
	hd = find_gendisk (dev);
	dname->name = NULL;
	if (hd)
		dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
	if (!dname->name) {
		sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
		dname->name = dname->namebuf;
	}

	dname->dev = dev;
	MD_INIT_LIST_HEAD(&dname->list);
	md_list_add(&dname->list, &device_names);

	return dname->name;
}
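
/*
 * With a persistent (v0.90) superblock the superblock is stored near
 * the end of the device: MD_NEW_SIZE_BLOCKS() (see the MD_RESERVED_*
 * constants in linux/raid/md_p.h) rounds the raw size down so that
 * the last 64KB-aligned chunk is reserved, and that rounded value in
 * 1K blocks serves as both the usable size and the sb offset.
 */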
static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
						int persistent)
{
	unsigned int size = 0;

	if (blk_size[MAJOR(dev)])
		size = blk_size[MAJOR(dev)][MINOR(dev)];
	if (persistent)
		size = MD_NEW_SIZE_BLOCKS(size);
	return size;
}

static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
{
	unsigned int size;

	size = calc_dev_sboffset(dev, mddev, persistent);
	if (!mddev->sb) {
		MD_BUG();
		return size;
	}
	if (mddev->sb->chunk_size)
		size &= ~(mddev->sb->chunk_size/1024 - 1);
	return size;
}
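
/*
 * For the "zoned" personalities (linear and raid0) the array capacity
 * is simply the sum of the component device sizes, each rounded down
 * to a whole number of chunks by the mask below.
 */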
static unsigned int zoned_raid_size (mddev_t *mddev)
{
	unsigned int mask;
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	if (!mddev->sb) {
		MD_BUG();
		return -EINVAL;
	}
	/*
	 * do size and offset calculations.
	 */
	mask = ~(mddev->sb->chunk_size/1024 - 1);

	ITERATE_RDEV(mddev,rdev,tmp) {
		rdev->size &= mask;
		md_size[mdidx(mddev)] += rdev->size;
	}
	return 0;
}
/*
 * We check whether all devices are numbered from 0 to nb_dev-1. The
 * order is guaranteed even after device name changes.
 *
 * Some personalities (raid0, linear) use this. Personalities that
 * provide data have to be able to deal with loss of individual
 * disks, so they do their checking themselves.
 */
int md_check_ordering (mddev_t *mddev)
{
	int i, c;
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;

	/*
	 * First, all devices must be fully functional
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			printk("md: md%d's device %s faulty, aborting.\n",
				mdidx(mddev), partition_name(rdev->dev));
			goto abort;
		}
	}

	c = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		c++;
	}
	if (c != mddev->nb_dev) {
		MD_BUG();
		goto abort;
	}
	if (mddev->nb_dev != mddev->sb->raid_disks) {
		printk("md: md%d, array needs %d disks, has %d, aborting.\n",
			mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
		goto abort;
	}
	/*
	 * Now the numbering check
	 */
	for (i = 0; i < mddev->nb_dev; i++) {
		c = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == i)
				c++;
		}
		if (!c) {
			printk("md: md%d, missing disk #%d, aborting.\n",
				mdidx(mddev), i);
			goto abort;
		}
		if (c > 1) {
			printk("md: md%d, too many disks #%d, aborting.\n",
				mdidx(mddev), i);
			goto abort;
		}
	}
	return 0;
abort:
	return 1;
}
static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
{
	if (disk_active(disk)) {
		sb->working_disks--;
	} else {
		if (disk_spare(disk)) {
			sb->spare_disks--;
			sb->working_disks--;
		} else {
			sb->failed_disks--;
		}
	}
	sb->nr_disks--;
	disk->major = 0;
	disk->minor = 0;
	mark_disk_removed(disk);
}

#define BAD_MAGIC KERN_ERR \
"md: invalid raid superblock magic on %s\n"

#define BAD_MINOR KERN_ERR \
"md: %s: invalid raid minor (%x)\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_SB KERN_ERR \
"md: disabled device %s, could not read superblock.\n"

#define BAD_CSUM KERN_WARNING \
"md: invalid superblock checksum on %s\n"

static int alloc_array_sb (mddev_t * mddev)
{
	if (mddev->sb) {
		MD_BUG();
		return 0;
	}

	mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
	if (!mddev->sb)
		return -ENOMEM;
	md_clear_page((unsigned long)mddev->sb);
	return 0;
}

static int alloc_disk_sb (mdk_rdev_t * rdev)
{
	if (rdev->sb)
		MD_BUG();

	rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
	if (!rdev->sb) {
		printk (OUT_OF_MEM);
		return -EINVAL;
	}
	md_clear_page((unsigned long)rdev->sb);

	return 0;
}

static void free_disk_sb (mdk_rdev_t * rdev)
{
	if (rdev->sb) {
		free_page((unsigned long) rdev->sb);
		rdev->sb = NULL;
		rdev->sb_offset = 0;
		rdev->size = 0;
	} else {
		if (!rdev->faulty)
			MD_BUG();
	}
}

static void mark_rdev_faulty (mdk_rdev_t * rdev)
{
	if (!rdev) {
		MD_BUG();
		return;
	}
	free_disk_sb(rdev);
	rdev->faulty = 1;
}
static int read_disk_sb (mdk_rdev_t * rdev)
{
	int ret = -EINVAL;
	struct buffer_head *bh = NULL;
	kdev_t dev = rdev->dev;
	mdp_super_t *sb;
	unsigned long sb_offset;

	if (!rdev->sb) {
		MD_BUG();
		goto abort;
	}

	/*
	 * Calculate the position of the superblock,
	 * it's at the end of the disk
	 */
	sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
	rdev->sb_offset = sb_offset;
	printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset);
	fsync_dev(dev);
	set_blocksize (dev, MD_SB_BYTES);
	bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);

	if (bh) {
		sb = (mdp_super_t *) bh->b_data;
		memcpy (rdev->sb, sb, MD_SB_BYTES);
	} else {
		printk (NO_SB,partition_name(rdev->dev));
		goto abort;
	}
	printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events));
	ret = 0;
abort:
	if (bh)
		brelse (bh);
	return ret;
}
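
/*
 * The checksum covers the whole 4K superblock (MD_SB_BYTES) with the
 * sb_csum field itself temporarily zeroed, so a freshly computed
 * value is directly comparable with the one read from disk;
 * csum_partial() is the kernel's one's-complement (IP) checksum
 * helper.
 */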
static unsigned int calc_sb_csum (mdp_super_t * sb)
{
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
	sb->sb_csum = disk_csum;
	return csum;
}

/*
 * Check one RAID superblock for generic plausibility
 */

static int check_disk_sb (mdk_rdev_t * rdev)
{
	mdp_super_t *sb;
	int ret = -EINVAL;

	sb = rdev->sb;
	if (!sb) {
		MD_BUG();
		goto abort;
	}

	if (sb->md_magic != MD_SB_MAGIC) {
		printk (BAD_MAGIC, partition_name(rdev->dev));
		goto abort;
	}

	if (sb->md_minor >= MAX_MD_DEVS) {
		printk (BAD_MINOR, partition_name(rdev->dev),
			sb->md_minor);
		goto abort;
	}

	if (calc_sb_csum(sb) != sb->sb_csum)
		printk(BAD_CSUM, partition_name(rdev->dev));
	ret = 0;
abort:
	return ret;
}
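
/*
 * dev_unit() masks the partition bits out of the minor number (the
 * gendisk's minor_shift says how many there are), yielding a kdev_t
 * that identifies the whole physical disk; bind_rdev_to_array() uses
 * this to warn when two array members share one disk.
 */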
static kdev_t dev_unit(kdev_t dev)
{
	unsigned int mask;
	struct gendisk *hd = find_gendisk(dev);

	if (!hd)
		return 0;
	mask = ~((1 << hd->minor_shift) - 1);

	return MKDEV(MAJOR(dev), MINOR(dev) & mask);
}

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp)
		if (dev_unit(rdev->dev) == dev_unit(dev))
			return rdev;

	return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev1,rdev,tmp)
		if (match_dev_unit(mddev2, rdev->dev))
			return 1;

	return 0;
}

static MD_LIST_HEAD(all_raid_disks);
static MD_LIST_HEAD(pending_raid_disks);

static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;

	if (rdev->mddev) {
		MD_BUG();
		return;
	}
	same_pdev = match_dev_unit(mddev, rdev->dev);
	if (same_pdev)
		printk( KERN_WARNING
"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
"     protection against single-disk failure might be compromised.\n",
			mdidx(mddev), partition_name(rdev->dev),
			partition_name(same_pdev->dev));

	md_list_add(&rdev->same_set, &mddev->disks);
	rdev->mddev = mddev;
	mddev->nb_dev++;
	printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
}

static void unbind_rdev_from_array (mdk_rdev_t * rdev)
{
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	md_list_del(&rdev->same_set);
	MD_INIT_LIST_HEAD(&rdev->same_set);
	rdev->mddev->nb_dev--;
	printk("unbind<%s,%d>\n", partition_name(rdev->dev),
		rdev->mddev->nb_dev);
	rdev->mddev = NULL;
}
/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by opening the device. [simply getting an
 * inode is not enough, the SCSI module usage code needs
 * an explicit open() on the device]
 */
static int lock_rdev (mdk_rdev_t *rdev)
{
	int err = 0;

	/*
	 * First insert a dummy inode.
	 */
	if (rdev->inode)
		MD_BUG();
	rdev->inode = get_empty_inode();
	if (!rdev->inode)
		return -ENOMEM;
	/*
	 * we don't care about any other fields
	 */
	rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev;
	insert_inode_hash(rdev->inode);

	memset(&rdev->filp, 0, sizeof(rdev->filp));
	rdev->filp.f_mode = 3; /* read write */
	return err;
}
static void unlock_rdev (mdk_rdev_t *rdev)
{
	if (!rdev->inode)
		MD_BUG();
	iput(rdev->inode);
	rdev->inode = NULL;
}

static void export_rdev (mdk_rdev_t * rdev)
{
	printk("export_rdev(%s)\n",partition_name(rdev->dev));
	if (rdev->mddev)
		MD_BUG();
	unlock_rdev(rdev);
	free_disk_sb(rdev);
	md_list_del(&rdev->all);
	MD_INIT_LIST_HEAD(&rdev->all);
	if (rdev->pending.next != &rdev->pending) {
		printk("(%s was pending)\n",partition_name(rdev->dev));
		md_list_del(&rdev->pending);
		MD_INIT_LIST_HEAD(&rdev->pending);
	}
	rdev->dev = 0;
	rdev->faulty = 0;
	kfree(rdev);
}

static void kick_rdev_from_array (mdk_rdev_t * rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}

static void export_array (mddev_t *mddev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;
	mdp_super_t *sb = mddev->sb;

	if (mddev->sb) {
		mddev->sb = NULL;
		free_page((unsigned long) sb);
	}

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!rdev->mddev) {
			MD_BUG();
			continue;
		}
		kick_rdev_from_array(rdev);
	}
	if (mddev->nb_dev)
		MD_BUG();
}

static void free_mddev (mddev_t *mddev)
{
	if (!mddev) {
		MD_BUG();
		return;
	}

	export_array(mddev);
	md_size[mdidx(mddev)] = 0;
	md_hd_struct[mdidx(mddev)].nr_sects = 0;

	/*
	 * Make sure nobody else is using this mddev
	 * (careful, we rely on the global kernel lock here)
	 */
	while (md_atomic_read(&mddev->resync_sem.count) != 1)
		schedule();
	while (md_atomic_read(&mddev->recovery_sem.count) != 1)
		schedule();

	del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
	md_list_del(&mddev->all_mddevs);
	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
	kfree(mddev);
}
#undef BAD_CSUM
#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_SB

static void print_desc(mdp_disk_t *desc)
{
	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
		partition_name(MKDEV(desc->major,desc->minor)),
		desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb(mdp_super_t *sb)
{
	int i;

	printk("  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
		sb->major_version, sb->minor_version, sb->patch_version,
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk("     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
		sb->layout, sb->chunk_size);
	printk("     UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)get_unaligned(&sb->events));

	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		printk("     D %2d: ", i);
		print_desc(desc);
	}
	printk("     THIS: ");
	print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
	printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
		partition_name(rdev->dev), partition_name(rdev->old_dev),
		rdev->size, rdev->faulty, rdev->desc_nr);
	if (rdev->sb) {
		printk("rdev superblock:\n");
		print_sb(rdev->sb);
	} else
		printk("no rdev superblock!\n");
}

void md_print_devices (void)
{
	struct md_list_head *tmp, *tmp2;
	mdk_rdev_t *rdev;
	mddev_t *mddev;

	printk("\n");
	printk("	**********************************\n");
	printk("	* <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("	**********************************\n");
	ITERATE_MDDEV(mddev,tmp) {
		printk("md%d: ", mdidx(mddev));

		ITERATE_RDEV(mddev,rdev,tmp2)
			printk("<%s>", partition_name(rdev->dev));

		if (mddev->sb) {
			printk(" array superblock:\n");
			print_sb(mddev->sb);
		} else
			printk(" no array superblock.\n");

		ITERATE_RDEV(mddev,rdev,tmp2)
			print_rdev(rdev);
	}
	printk("	**********************************\n");
	printk("\n");
}
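
/*
 * Two superblocks are considered equal if their generic constant
 * section matches (the first MD_SB_GENERIC_CONSTANT_WORDS 32-bit
 * words); nr_disks is zeroed in the copies first because it may
 * legitimately differ between otherwise identical members.
 */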
static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
		ret = 0;
	else
		ret = 1;

abort:
	if (tmp1)
		kfree(tmp1);
	if (tmp2)
		kfree(tmp2);

	return ret;
}

static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
{
	if (	(rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
		(rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
		(rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
		(rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))

		return 1;

	return 0;
}

static mdk_rdev_t * find_rdev_all (kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	tmp = all_raid_disks.next;
	while (tmp != &all_raid_disks) {
		rdev = md_list_entry(tmp, mdk_rdev_t, all);
		if (rdev->dev == dev)
			return rdev;
		tmp = tmp->next;
	}
	return NULL;
}
#define GETBLK_FAILED KERN_ERR \
"md: getblk failed for device %s\n"

static int write_disk_sb(mdk_rdev_t * rdev)
{
	struct buffer_head *bh;
	kdev_t dev;
	unsigned long sb_offset, size;
	mdp_super_t *sb;

	if (!rdev->sb) {
		MD_BUG();
		return -1;
	}
	if (rdev->faulty) {
		MD_BUG();
		return -1;
	}
	if (rdev->sb->md_magic != MD_SB_MAGIC) {
		MD_BUG();
		return -1;
	}

	dev = rdev->dev;
	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
	if (rdev->sb_offset != sb_offset) {
		printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
		goto skip;
	}
	/*
	 * If the disk went offline meanwhile and it's just a spare, then
	 * its size has changed to zero silently, and the MD code does
	 * not yet know that it's faulty.
	 */
	size = calc_dev_size(dev, rdev->mddev, 1);
	if (size != rdev->size) {
		printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev), rdev->size, size);
		goto skip;
	}

	printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
	fsync_dev(dev);
	set_blocksize(dev, MD_SB_BYTES);
	bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
	if (!bh) {
		printk(GETBLK_FAILED, partition_name(dev));
		return 1;
	}
	memset(bh->b_data,0,bh->b_size);
	sb = (mdp_super_t *) bh->b_data;
	memcpy(sb, rdev->sb, MD_SB_BYTES);

	mark_buffer_uptodate(bh, 1);
	mark_buffer_dirty(bh, 1);
	ll_rw_block(WRITE, 1, &bh);
	wait_on_buffer(bh);
	brelse(bh);
	fsync_dev(dev);
skip:
	return 0;
}
#undef GETBLK_FAILED
static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	int i, ok = 0;
	mdp_disk_t *desc;

	for (i = 0; i < MD_SB_DISKS; i++) {
		desc = mddev->sb->disks + i;
#if 0
		if (disk_faulty(desc)) {
			if (MKDEV(desc->major,desc->minor) == rdev->dev)
				ok = 1;
			continue;
		}
#endif
		if (MKDEV(desc->major,desc->minor) == rdev->dev) {
			rdev->sb->this_disk = *desc;
			rdev->desc_nr = desc->number;
			ok = 1;
			break;
		}
	}

	if (!ok) {
		MD_BUG();
	}
}

static int sync_sbs(mddev_t * mddev)
{
	mdk_rdev_t *rdev;
	mdp_super_t *sb;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		sb = rdev->sb;
		*sb = *mddev->sb;
		set_this_disk(mddev, rdev);
		sb->sb_csum = calc_sb_csum(sb);
	}
	return 0;
}
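
/*
 * md_update_sb() stamps the superblock with the current time, bumps
 * the 64-bit event counter, and rewrites the superblock of every
 * member; analyze_sbs() relies on that counter to pick the freshest
 * member, so it must only ever increase. Failed writes are retried
 * (up to the 'count' limit below) before giving up.
 */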
int md_update_sb(mddev_t * mddev)
{
	int first, err, count = 100;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;
	__u64 ev;

repeat:
	mddev->sb->utime = CURRENT_TIME;
	ev = get_unaligned(&mddev->sb->events);
	++ev;
	put_unaligned(ev,&mddev->sb->events);
	if (ev == (__u64)0) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		--ev;
		put_unaligned(ev,&mddev->sb->events);
	}
	sync_sbs(mddev);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (mddev->sb->not_persistent)
		return 0;

	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
					mdidx(mddev));

	first = 1;
	err = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!first)
			printk(", ");
		first = 0;
		if (rdev->faulty)
			printk("(skipping faulty ");
		printk("%s ", partition_name(rdev->dev));
		if (!rdev->faulty) {
			printk("[events: %08lx]",
				(unsigned long)get_unaligned(&rdev->sb->events));
			err += write_disk_sb(rdev);
		} else
			printk(")\n");
	}
	printk(".\n");
	if (err) {
		printk("errors occurred during superblock update, repeating\n");
		if (--count)
			goto repeat;
		printk("excessive errors occurred during superblock update, exiting\n");
	}
	return 0;
}
/*
 * Import a device. If 'on_disk', then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static int md_import_device (kdev_t newdev, int on_disk)
{
	int err;
	mdk_rdev_t *rdev;
	unsigned int size;

	if (find_rdev_all(newdev))
		return -EEXIST;

	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk("could not alloc mem for %s!\n", partition_name(newdev));
		return -ENOMEM;
	}
	memset(rdev, 0, sizeof(*rdev));

	if (get_super(newdev)) {
		printk("md: can not import %s, has active inodes!\n",
			partition_name(newdev));
		err = -EBUSY;
		goto abort_free;
	}

	if ((err = alloc_disk_sb(rdev)))
		goto abort_free;

	rdev->dev = newdev;
	if (lock_rdev(rdev)) {
		printk("md: could not lock %s, zero-size? Marking faulty.\n",
			partition_name(newdev));
		err = -EINVAL;
		goto abort_free;
	}
	rdev->desc_nr = -1;
	rdev->faulty = 0;

	size = 0;
	if (blk_size[MAJOR(newdev)])
		size = blk_size[MAJOR(newdev)][MINOR(newdev)];
	if (!size) {
		printk("md: %s has zero size, marking faulty!\n",
			partition_name(newdev));
		err = -EINVAL;
		goto abort_free;
	}

	if (on_disk) {
		if ((err = read_disk_sb(rdev))) {
			printk("md: could not read %s's sb, not importing!\n",
				partition_name(newdev));
			goto abort_free;
		}
		if ((err = check_disk_sb(rdev))) {
			printk("md: %s has invalid sb, not importing!\n",
				partition_name(newdev));
			goto abort_free;
		}

		rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
					rdev->sb->this_disk.minor);
		rdev->desc_nr = rdev->sb->this_disk.number;
	}
	md_list_add(&rdev->all, &all_raid_disks);
	MD_INIT_LIST_HEAD(&rdev->pending);

	if (rdev->faulty && rdev->sb)
		free_disk_sb(rdev);
	return 0;

abort_free:
	if (rdev->sb) {
		if (rdev->inode)
			unlock_rdev(rdev);
		free_disk_sb(rdev);
	}
	kfree(rdev);
	return err;
}

/*
 * Check a full RAID array for plausibility
 */

#define INCONSISTENT KERN_ERR \
"md: fatal superblock inconsistency in %s -- removing from array\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: md%d: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: md%d: raid array is not clean -- starting background reconstruction\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: md%d: unsupported raid level %d\n"
static int analyze_sbs (mddev_t * mddev)
{
	int out_of_date = 0, i;
	struct md_list_head *tmp, *tmp2;
	mdk_rdev_t *rdev, *rdev2, *freshest;
	mdp_super_t *sb;

	/*
	 * Verify the RAID superblock on each real device
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			MD_BUG();
			goto abort;
		}
		if (!rdev->sb) {
			MD_BUG();
			goto abort;
		}
		if (check_disk_sb(rdev))
			goto abort;
	}

	/*
	 * The superblock constant part has to be the same
	 * for all disks in the array.
	 */
	sb = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!sb) {
			sb = rdev->sb;
			continue;
		}
		if (!sb_equal(sb, rdev->sb)) {
			printk (INCONSISTENT, partition_name(rdev->dev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * OK, we have all disks and the array is ready to run. Let's
	 * find the freshest superblock, that one will be the superblock
	 * that represents the whole array.
	 */
	if (!mddev->sb)
		if (alloc_array_sb(mddev))
			goto abort;
	sb = mddev->sb;
	freshest = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2;
		/*
		 * if the checksum is invalid, use the superblock
		 * only as a last resort. (decrease its age by
		 * one event)
		 */
		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
			__u64 ev = get_unaligned(&rdev->sb->events);
			if (ev != (__u64)0) {
				--ev;
				put_unaligned(ev,&rdev->sb->events);
			}
		}

		printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
			(unsigned long)get_unaligned(&rdev->sb->events));
		if (!freshest) {
			freshest = rdev;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		ev1 = get_unaligned(&rdev->sb->events);
		ev2 = get_unaligned(&freshest->sb->events);
		if (ev1 != ev2) {
			out_of_date = 1;
			if (ev1 > ev2)
				freshest = rdev;
		}
	}
	if (out_of_date) {
		printk(OUT_OF_DATE);
		printk("freshest: %s\n", partition_name(freshest->dev));
	}
	memcpy (sb, freshest->sb, sizeof(*sb));
	/*
	 * at this point we have picked the 'best' superblock
	 * from all available superblocks.
	 * now we validate this superblock and kick out possibly
	 * failed disks.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		/*
		 * Kick all non-fresh devices faulty
		 */
		__u64 ev1, ev2;
		ev1 = get_unaligned(&rdev->sb->events);
		ev2 = get_unaligned(&sb->events);
		++ev1;
		if (ev1 < ev2) {
			printk("md: kicking non-fresh %s from array!\n",
				partition_name(rdev->dev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * Fix up changed device names ... but only if this disk has a
	 * recent update time. Use faulty checksum ones too.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2, ev3;
		if (rdev->faulty) { /* REMOVEME */
			MD_BUG();
			goto abort;
		}
		ev1 = get_unaligned(&rdev->sb->events);
		ev2 = get_unaligned(&sb->events);
		ev3 = ev2;
		--ev3;
		if ((rdev->dev != rdev->old_dev) &&
			((ev1 == ev2) || (ev1 == ev3))) {
			mdp_disk_t *desc;

			printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
			if (rdev->desc_nr == -1) {
				MD_BUG();
				goto abort;
			}
			desc = &sb->disks[rdev->desc_nr];
			if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
				MD_BUG();
				goto abort;
			}
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
			desc = &rdev->sb->this_disk;
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
		}
	}

	/*
	 * Remove unavailable and faulty devices ...
	 *
	 * note that if an array becomes completely unrunnable due to
	 * missing devices, we do not write the superblock back, so the
	 * administrator has a chance to fix things up. The removal thus
	 * only happens if it's nonfatal to the contents of the array.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		int found;
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		/*
		 * We kick faulty devices/descriptors immediately.
		 */
		if (disk_faulty(desc)) {
			found = 0;
			ITERATE_RDEV(mddev,rdev,tmp) {
				if (rdev->desc_nr != desc->number)
					continue;
				printk("md%d: kicking faulty %s!\n",
					mdidx(mddev),partition_name(rdev->dev));
				kick_rdev_from_array(rdev);
				found = 1;
				break;
			}
			if (!found) {
				if (dev == MKDEV(0,0))
					continue;
				printk("md%d: removing former faulty %s!\n",
					mdidx(mddev), partition_name(dev));
			}
			remove_descriptor(desc, sb);
			continue;
		}

		if (dev == MKDEV(0,0))
			continue;
		/*
		 * Is this device present in the rdev ring?
		 */
		found = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == desc->number) {
				found = 1;
				break;
			}
		}
		if (found)
			continue;

		printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
		remove_descriptor(desc, sb);
	}
	/*
	 * Double check whether all devices mentioned in the
	 * superblock are in the rdev ring.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		if (dev == MKDEV(0,0))
			continue;

		if (disk_faulty(desc)) {
			MD_BUG();
			goto abort;
		}

		rdev = find_rdev(mddev, dev);
		if (!rdev) {
			MD_BUG();
			goto abort;
		}
	}

	/*
	 * Do a final reality check.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == -1) {
			MD_BUG();
			goto abort;
		}
		/*
		 * is the desc_nr unique?
		 */
		ITERATE_RDEV(mddev,rdev2,tmp2) {
			if ((rdev2 != rdev) &&
					(rdev2->desc_nr == rdev->desc_nr)) {
				MD_BUG();
				goto abort;
			}
		}
		/*
		 * is the device unique?
		 */
		ITERATE_RDEV(mddev,rdev2,tmp2) {
			if ((rdev2 != rdev) &&
					(rdev2->dev == rdev->dev)) {
				MD_BUG();
				goto abort;
			}
		}
	}

	/*
	 * Check if we can support this RAID array
	 */
	if (sb->major_version != MD_MAJOR_VERSION ||
			sb->minor_version > MD_MINOR_VERSION) {

		printk (OLD_VERSION, mdidx(mddev), sb->major_version,
				sb->minor_version, sb->patch_version);
		goto abort;
	}

	if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
			(sb->level == 4) || (sb->level == 5)))
		printk (NOT_CLEAN_IGNORE, mdidx(mddev));

	return 0;
abort:
	return 1;
}
#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef OLD_LEVEL

static int device_size_calculation (mddev_t * mddev)
{
	int data_disks = 0, persistent;
	unsigned int readahead;
	mdp_super_t *sb = mddev->sb;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	/*
	 * Do device size calculation. Bail out if too small.
	 * (we have to do this after having validated chunk_size,
	 * because device size has to be modulo chunk_size)
	 */
	persistent = !mddev->sb->not_persistent;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		if (rdev->size) {
			MD_BUG();
			continue;
		}
		rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
		if (rdev->size < sb->chunk_size / 1024) {
			printk (KERN_WARNING
				"Dev %s smaller than chunk_size: %ldk < %dk\n",
				partition_name(rdev->dev),
				rdev->size, sb->chunk_size / 1024);
			return -EINVAL;
		}
	}

	switch (sb->level) {
		case -3:
			data_disks = 1;
			break;
		case -2:
			data_disks = 1;
			break;
		case -1:
			zoned_raid_size(mddev);
			data_disks = 1;
			break;
		case 0:
			zoned_raid_size(mddev);
			data_disks = sb->raid_disks;
			break;
		case 1:
			data_disks = 1;
			break;
		case 4:
		case 5:
			data_disks = sb->raid_disks-1;
			break;
		default:
			printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
			goto abort;
	}
	if (!md_size[mdidx(mddev)])
		md_size[mdidx(mddev)] = sb->size * data_disks;

	readahead = MD_READAHEAD;
	if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5))
		readahead = mddev->sb->chunk_size * 4 * data_disks;
	if (readahead < data_disks * MAX_SECTORS*512*2)
		readahead = data_disks * MAX_SECTORS*512*2;
	else {
		if (sb->level == -3)
			readahead = 0;
	}
	md_maxreadahead[mdidx(mddev)] = readahead;

	printk(KERN_INFO "md%d: max total readahead window set to %dk\n",
		mdidx(mddev), readahead/1024);

	printk(KERN_INFO
		"md%d: %d data-disks, max readahead per data-disk: %dk\n",
			mdidx(mddev), data_disks, readahead/data_disks/1024);
	return 0;
abort:
	return 1;
}


#define TOO_BIG_CHUNKSIZE KERN_ERR \
"too big chunk_size: %d > %d\n"

#define TOO_SMALL_CHUNKSIZE KERN_ERR \
"too small chunk_size: %d < %ld\n"

#define BAD_CHUNKSIZE KERN_ERR \
"no chunksize specified, see 'man raidtab'\n"
static int do_md_run (mddev_t * mddev)
{
	int pnum, err;
	int chunk_size;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;


	if (!mddev->nb_dev) {
		MD_BUG();
		return -EINVAL;
	}

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Resize disks to align partitions size on a given
	 * chunk size.
	 */
	md_size[mdidx(mddev)] = 0;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (analyze_sbs(mddev)) {
		MD_BUG();
		return -EINVAL;
	}

	chunk_size = mddev->sb->chunk_size;
	pnum = level_to_pers(mddev->sb->level);

	mddev->param.chunk_size = chunk_size;
	mddev->param.personality = pnum;

	if (chunk_size > MAX_CHUNK_SIZE) {
		printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
		return -EINVAL;
	}
	/*
	 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
	 */
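	/*
	 * (ffz(~x) is the bit index of the lowest set bit in x, so
	 * 1 << ffz(~chunk_size) isolates chunk_size's lowest set bit;
	 * it equals chunk_size only when chunk_size is a power of two.)
	 */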
	if ( (1 << ffz(~chunk_size)) != chunk_size) {
		MD_BUG();
		return -EINVAL;
	}
	if (chunk_size < PAGE_SIZE) {
		printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
		return -EINVAL;
	}

	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
		/*
		 * 'default chunksize' in the old md code used to
		 * be PAGE_SIZE, baaad.
		 * we abort here to be on the safe side. We don't
		 * want to continue the bad practice.
		 */
		printk(BAD_CHUNKSIZE);
		return -EINVAL;
	}

	if (!pers[pnum])
	{
#ifdef CONFIG_KMOD
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
			return -EINVAL;
	}

	if (device_size_calculation(mddev))
		return -EINVAL;

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		fsync_dev(rdev->dev);
		invalidate_buffers(rdev->dev);
	}

	mddev->pers = pers[pnum];

	err = mddev->pers->run(mddev);
	if (err) {
		printk("pers->run() failed ...\n");
		mddev->pers = NULL;
		return -EINVAL;
	}

	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
	md_update_sb(mddev);

	/*
	 * md_size has units of 1K blocks, which are
	 * twice as large as sectors.
	 */
	md_hd_struct[mdidx(mddev)].start_sect = 0;
	md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;

	read_ahead[MD_MAJOR] = 1024;
	return (0);
}

#undef TOO_BIG_CHUNKSIZE
#undef BAD_CHUNKSIZE
#define OUT(x) do { err = (x); goto out; } while (0)

static int restart_array (mddev_t *mddev)
{
	int err = 0;

	/*
	 * Complain if it has no devices
	 */
	if (!mddev->nb_dev)
		OUT(-ENXIO);

	if (mddev->pers) {
		if (!mddev->ro)
			OUT(-EBUSY);

		mddev->ro = 0;
		set_device_ro(mddev_to_kdev(mddev), 0);

		printk (KERN_INFO
			"md%d switched to read-write mode.\n", mdidx(mddev));
		/*
		 * Kick recovery or resync if necessary
		 */
		md_recover_arrays();
		if (mddev->pers->restart_resync)
			mddev->pers->restart_resync(mddev);
	} else
		err = -EINVAL;

out:
	return err;
}

#define STILL_MOUNTED KERN_WARNING \
"md: md%d still mounted.\n"

static int do_md_stop (mddev_t * mddev, int ro)
{
	int err = 0, resync_interrupted = 0;
	kdev_t dev = mddev_to_kdev(mddev);

	if (!ro && get_super(dev)) {
		printk (STILL_MOUNTED, mdidx(mddev));
		OUT(-EBUSY);
	}

	if (mddev->pers) {
		/*
		 * It is safe to call stop here, it only frees private
		 * data. Also, it tells us if a device is unstoppable
		 * (eg. resyncing is in progress)
		 */
		if (mddev->pers->stop_resync)
			if (mddev->pers->stop_resync(mddev))
				resync_interrupted = 1;

		if (mddev->recovery_running)
			md_interrupt_thread(md_recovery_thread);

		/*
		 * This synchronizes with signal delivery to the
		 * resync or reconstruction thread. It also nicely
		 * hangs the process if some reconstruction has not
		 * finished.
		 */
		down(&mddev->recovery_sem);
		up(&mddev->recovery_sem);

		/*
		 * sync and invalidate buffers because we cannot kill the
		 * main thread with valid IO transfers still around.
		 * the kernel lock protects us from new requests being
		 * added after invalidate_buffers().
		 */
		fsync_dev (mddev_to_kdev(mddev));
		fsync_dev (dev);
		invalidate_buffers (dev);

		if (ro) {
			if (mddev->ro)
				OUT(-ENXIO);
			mddev->ro = 1;
		} else {
			if (mddev->ro)
				set_device_ro(dev, 0);
			if (mddev->pers->stop(mddev)) {
				if (mddev->ro)
					set_device_ro(dev, 1);
				OUT(-EBUSY);
			}
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (mddev->sb) {
			/*
			 * mark it clean only if there was no resync
			 * interrupted.
			 */
			if (!mddev->recovery_running && !resync_interrupted) {
				printk("marking sb clean...\n");
				mddev->sb->state |= 1 << MD_SB_CLEAN;
			}
			md_update_sb(mddev);
		}
		if (ro)
			set_device_ro(dev, 1);
	}

	/*
	 * Free resources if final stop
	 */
	if (!ro) {
		printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
		free_mddev(mddev);

	} else
		printk (KERN_INFO
			"md%d switched to read-only mode.\n", mdidx(mddev));
out:
	return err;
}

#undef OUT

/*
 * We have to safely support old arrays too.
 */
int detect_old_array (mdp_super_t *sb)
{
	if (sb->major_version > 0)
		return 0;
	if (sb->minor_version >= 90)
		return 0;

	return -EINVAL;
}
static void autorun_array (mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;
	int err;

	if (mddev->disks.prev == &mddev->disks) {
		MD_BUG();
		return;
	}

	printk("running: ");

	ITERATE_RDEV(mddev,rdev,tmp) {
		printk("<%s>", partition_name(rdev->dev));
	}
	printk("\nnow!\n");

	err = do_md_run (mddev);
	if (err) {
		printk("do_md_run() returned %d\n", err);
		/*
		 * prevent the writeback of an unrunnable array
		 */
		mddev->sb_dirty = 0;
		do_md_stop (mddev, 0);
	}
}

/*
 * let's try to run arrays based on all disks that have arrived
 * until now. (those are in the ->pending list)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 */
static void autorun_devices (void)
{
	struct md_list_head candidates;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev0, *rdev;
	mddev_t *mddev;
	kdev_t md_kdev;


	printk("autorun ...\n");
	while (pending_raid_disks.next != &pending_raid_disks) {
		rdev0 = md_list_entry(pending_raid_disks.next,
					 mdk_rdev_t, pending);

		printk("considering %s ...\n", partition_name(rdev0->dev));
		MD_INIT_LIST_HEAD(&candidates);
		ITERATE_RDEV_PENDING(rdev,tmp) {
			if (uuid_equal(rdev0, rdev)) {
				if (!sb_equal(rdev0->sb, rdev->sb)) {
					printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
					continue;
				}
				printk("  adding %s ...\n", partition_name(rdev->dev));
				md_list_del(&rdev->pending);
				md_list_add(&rdev->pending, &candidates);
			}
		}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
		mddev = kdev_to_mddev(md_kdev);
		if (mddev) {
			printk("md%d already running, cannot run %s\n",
				mdidx(mddev), partition_name(rdev0->dev));
			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
				export_rdev(rdev);
			continue;
		}
		mddev = alloc_mddev(md_kdev);
		printk("created md%d\n", mdidx(mddev));
		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
			bind_rdev_to_array(rdev, mddev);
			md_list_del(&rdev->pending);
			MD_INIT_LIST_HEAD(&rdev->pending);
		}
		autorun_array(mddev);
	}
	printk("... autorun DONE.\n");
}
/*
 * import RAID devices based on one partition
 * if possible, the array gets run as well.
 */

#define BAD_VERSION KERN_ERR \
"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define AUTOADD_FAILED KERN_ERR \
"md: auto-adding devices to md%d FAILED (error %d).\n"

#define AUTOADD_FAILED_USED KERN_ERR \
"md: cannot auto-add device %s to md%d, already used.\n"

#define AUTORUN_FAILED KERN_ERR \
"md: auto-running md%d FAILED (error %d).\n"

#define MDDEV_BUSY KERN_ERR \
"md: cannot auto-add to md%d, already running.\n"

#define AUTOADDING KERN_INFO \
"md: auto-adding devices to md%d, based on %s's superblock.\n"

#define AUTORUNNING KERN_INFO \
"md: auto-running md%d.\n"

static int autostart_array (kdev_t startdev)
{
	int err = -EINVAL, i;
	mdp_super_t *sb = NULL;
	mdk_rdev_t *start_rdev = NULL, *rdev;

	if (md_import_device(startdev, 1)) {
		printk("could not import %s!\n", partition_name(startdev));
		goto abort;
	}

	start_rdev = find_rdev_all(startdev);
	if (!start_rdev) {
		MD_BUG();
		goto abort;
	}
	if (start_rdev->faulty) {
		printk("can not autostart based on faulty %s!\n",
			partition_name(startdev));
		goto abort;
	}
	md_list_add(&start_rdev->pending, &pending_raid_disks);

	sb = start_rdev->sb;

	err = detect_old_array(sb);
	if (err) {
		printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
		goto abort;
	}

	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		if (dev == MKDEV(0,0))
			continue;
		if (dev == startdev)
			continue;
		if (md_import_device(dev, 1)) {
			printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
			continue;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			goto abort;
		}
		md_list_add(&rdev->pending, &pending_raid_disks);
	}

	/*
	 * possibly return codes
	 */
	autorun_devices();
	return 0;

abort:
	if (start_rdev)
		export_rdev(start_rdev);
	return err;
}

#undef BAD_VERSION
#undef OUT_OF_MEM
#undef NO_DEVICE
#undef AUTOADD_FAILED_USED
#undef AUTOADD_FAILED
#undef AUTORUN_FAILED
#undef AUTOADDING
#undef AUTORUNNING
struct {
	int set;
	int noautodetect;
} raid_setup_args md__initdata = { 0, 0 };

void md_setup_drive(void) md__init;

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
#ifdef CONFIG_AUTODETECT_RAID
static int detected_devices[128] md__initdata;
static int dev_cnt md__initdata=0;
void md__init md_autodetect_dev(kdev_t dev)
{
	if (dev_cnt < 127)
		detected_devices[dev_cnt++] = dev;
}
#endif

void md__init md_run_setup(void)
{
#ifdef CONFIG_AUTODETECT_RAID
	mdk_rdev_t *rdev;
	int i;

	if (raid_setup_args.noautodetect) {
		printk(KERN_INFO "skipping autodetection of RAID arrays\n");
		return;
	}
	printk(KERN_INFO "autodetecting RAID arrays\n");

	for (i=0; i<dev_cnt; i++) {
		kdev_t dev = detected_devices[i];

		if (md_import_device(dev,1)) {
			printk(KERN_ALERT "could not import %s!\n",
				partition_name(dev));
			continue;
		}
		/*
		 * Sanity checks:
		 */
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			continue;
		}
		if (rdev->faulty) {
			MD_BUG();
			continue;
		}
		md_list_add(&rdev->pending, &pending_raid_disks);
	}

	autorun_devices();
#endif
#ifdef CONFIG_MD_BOOT
	md_setup_drive();
#endif
}
static int get_version (void * arg)
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (md_copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}
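
/*
 * The SET_FROM_SB()/SET_SB() macros below shuttle identically-named
 * fields between the on-disk superblock and the mdu_*_info_t ioctl
 * structures, relying on the two layouts using the same field names.
 */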
#define SET_FROM_SB(x) info.x = mddev->sb->x
static int get_array_info (mddev_t * mddev, void * arg)
{
	mdu_array_info_t info;

	if (!mddev->sb)
		return -EINVAL;

	SET_FROM_SB(major_version);
	SET_FROM_SB(minor_version);
	SET_FROM_SB(patch_version);
	SET_FROM_SB(ctime);
	SET_FROM_SB(level);
	SET_FROM_SB(size);
	SET_FROM_SB(nr_disks);
	SET_FROM_SB(raid_disks);
	SET_FROM_SB(md_minor);
	SET_FROM_SB(not_persistent);

	SET_FROM_SB(utime);
	SET_FROM_SB(state);
	SET_FROM_SB(active_disks);
	SET_FROM_SB(working_disks);
	SET_FROM_SB(failed_disks);
	SET_FROM_SB(spare_disks);

	SET_FROM_SB(layout);
	SET_FROM_SB(chunk_size);

	if (md_copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
#undef SET_FROM_SB

#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
static int get_disk_info (mddev_t * mddev, void * arg)
{
	mdu_disk_info_t info;
	unsigned int nr;

	if (!mddev->sb)
		return -EINVAL;

	if (md_copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	nr = info.number;
	if (nr >= mddev->sb->nr_disks)
		return -EINVAL;

	SET_FROM_SB(major);
	SET_FROM_SB(minor);
	SET_FROM_SB(raid_disk);
	SET_FROM_SB(state);

	if (md_copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
#undef SET_FROM_SB
#define SET_SB(x) mddev->sb->disks[nr].x = info->x

static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info)
{
	int err, size, persistent;
	mdk_rdev_t *rdev;
	unsigned int nr;
	kdev_t dev;
	dev = MKDEV(info->major,info->minor);

	if (find_rdev_all(dev)) {
		printk("device %s already used in a RAID array!\n",
			partition_name(dev));
		return -EBUSY;
	}
	if (!mddev->sb) {
		/* expecting a device which has a superblock */
		err = md_import_device(dev, 1);
		if (err) {
			printk("md error, md_import_device returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}
		if (mddev->nb_dev) {
			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			if (!uuid_equal(rdev0, rdev)) {
				printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
			if (!sb_equal(rdev0->sb, rdev->sb)) {
				printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		bind_rdev_to_array(rdev, mddev);
		return 0;
	}

	nr = info->number;
	if (nr >= mddev->sb->nr_disks)
		return -EINVAL;

	SET_SB(number);
	SET_SB(major);
	SET_SB(minor);
	SET_SB(raid_disk);
	SET_SB(state);

	if ((info->state & (1<<MD_DISK_FAULTY))==0) {
		err = md_import_device (dev, 0);
		if (err) {
			printk("md: error, md_import_device() returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}

		rdev->old_dev = dev;
		rdev->desc_nr = info->number;

		bind_rdev_to_array(rdev, mddev);

		persistent = !mddev->sb->not_persistent;
		if (!persistent)
			printk("nonpersistent superblock ...\n");
		if (!mddev->sb->chunk_size)
			printk("no chunksize?\n");

		size = calc_dev_size(dev, mddev, persistent);
		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);

		if (!mddev->sb->size || (mddev->sb->size > size))
			mddev->sb->size = size;
	}

	/*
	 * sync all other superblocks with the main superblock
	 */
	sync_sbs(mddev);

	return 0;
}
#undef SET_SB
static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
{
	int err;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk("trying to remove %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk("md%d: personality does not support diskops!\n",
			mdidx(mddev));
		return -EINVAL;
	}

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}
	disk = &mddev->sb->disks[rdev->desc_nr];
	if (disk_active(disk))
		goto busy;
	if (disk_removed(disk)) {
		MD_BUG();
		return -EINVAL;
	}

	err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
	if (err == -EBUSY)
		goto busy;
	if (err) {
		MD_BUG();
		return -EINVAL;
	}

	remove_descriptor(disk, mddev->sb);
	kick_rdev_from_array(rdev);
	mddev->sb_dirty = 1;
	md_update_sb(mddev);

	return 0;
busy:
	printk("cannot remove active disk %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));
	return -EBUSY;
}

static int hot_add_disk (mddev_t * mddev, kdev_t dev)
{
	int i, err, persistent;
	unsigned int size;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk("trying to hot-add %s to md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk("md%d: personality does not support diskops!\n",
			mdidx(mddev));
		return -EINVAL;
	}

	persistent = !mddev->sb->not_persistent;
	size = calc_dev_size(dev, mddev, persistent);

	if (size < mddev->sb->size) {
		printk("md%d: disk size %d blocks < array size %d\n",
			mdidx(mddev), size, mddev->sb->size);
		return -ENOSPC;
	}

	rdev = find_rdev(mddev, dev);
	if (rdev)
		return -EBUSY;

	err = md_import_device (dev, 0);
	if (err) {
		printk("md: error, md_import_device() returned %d\n", err);
		return -EINVAL;
	}
	rdev = find_rdev_all(dev);
	if (!rdev) {
		MD_BUG();
		return -EINVAL;
	}
	if (rdev->faulty) {
		printk("md: can not hot-add faulty %s disk to md%d!\n",
			partition_name(dev), mdidx(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	bind_rdev_to_array(rdev, mddev);

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */
	rdev->old_dev = dev;
	rdev->size = size;
	rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);

	disk = mddev->sb->disks + mddev->sb->raid_disks;
	for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
		disk = mddev->sb->disks + i;

		if (!disk->major && !disk->minor)
			break;
		if (disk_removed(disk))
			break;
	}
	if (i == MD_SB_DISKS) {
		printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
		err = -EBUSY;
		goto abort_unbind_export;
	}

	if (disk_removed(disk)) {
		/*
		 * reuse slot
		 */
		if (disk->number != i) {
			MD_BUG();
			err = -EINVAL;
			goto abort_unbind_export;
		}
	} else {
		disk->number = i;
	}

	disk->raid_disk = disk->number;
	disk->major = MAJOR(dev);
	disk->minor = MINOR(dev);

	if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
		MD_BUG();
		err = -EINVAL;
		goto abort_unbind_export;
	}

	mark_disk_spare(disk);
	mddev->sb->nr_disks++;
	mddev->sb->spare_disks++;
	mddev->sb->working_disks++;

	mddev->sb_dirty = 1;

	md_update_sb(mddev);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	md_recover_arrays();

	return 0;

abort_unbind_export:
	unbind_rdev_from_array(rdev);

abort_export:
	export_rdev(rdev);
	return err;
}
2480 #define SET_SB(x) mddev->sb->x = info->x
2481 static int set_array_info (mddev_t * mddev, mdu_array_info_t *info)
2484 if (alloc_array_sb(mddev))
2485 return -ENOMEM;
2487 mddev->sb->major_version = MD_MAJOR_VERSION;
2488 mddev->sb->minor_version = MD_MINOR_VERSION;
2489 mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2490 mddev->sb->ctime = CURRENT_TIME;
2492 SET_SB(level);
2493 SET_SB(size);
2494 SET_SB(nr_disks);
2495 SET_SB(raid_disks);
2496 SET_SB(md_minor);
2497 SET_SB(not_persistent);
2499 SET_SB(state);
2500 SET_SB(active_disks);
2501 SET_SB(working_disks);
2502 SET_SB(failed_disks);
2503 SET_SB(spare_disks);
2505 SET_SB(layout);
2506 SET_SB(chunk_size);
2508 mddev->sb->md_magic = MD_SB_MAGIC;
2511 * Generate a 128 bit UUID
2513 get_random_bytes(&mddev->sb->set_uuid0, 4);
2514 get_random_bytes(&mddev->sb->set_uuid1, 4);
2515 get_random_bytes(&mddev->sb->set_uuid2, 4);
2516 get_random_bytes(&mddev->sb->set_uuid3, 4);
2518 return 0;
2519 }
2520 #undef SET_SB
2522 static int set_disk_info (mddev_t * mddev, void * arg)
2523 {
2524 printk("not yet\n");
2525 return -EINVAL;
2526 }
2528 static int clear_array (mddev_t * mddev)
2529 {
2530 printk("not yet\n");
2531 return -EINVAL;
2532 }
2534 static int write_raid_info (mddev_t * mddev)
2535 {
2536 printk("not yet\n");
2537 return -EINVAL;
2538 }
2540 static int protect_array (mddev_t * mddev)
2541 {
2542 printk("not yet\n");
2543 return -EINVAL;
2544 }
2546 static int unprotect_array (mddev_t * mddev)
2547 {
2548 printk("not yet\n");
2549 return -EINVAL;
2550 }
2552 static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
2553 {
2554 int ret;
2556 fsync_dev(mddev_to_kdev(mddev));
2557 ret = md_error(mddev_to_kdev(mddev), dev);
2558 return ret;
2559 }
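/*
* For reference, a rough user-space sketch of driving this ioctl
* interface (illustrative only, error handling omitted; the types and
* ioctl numbers come from <linux/raid/md_u.h>):
*
*	int fd = open("/dev/md0", O_RDWR);
*	mdu_array_info_t info;
*	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
*		printf("level %d, %d raid disks\n",
*			info.level, info.raid_disks);
*	ioctl(fd, HOT_ADD_DISK, (unsigned long)dev);
*
* where dev is the raw (major,minor) device number of the new disk.
*/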
2561 static int md_ioctl (struct inode *inode, struct file *file,
2562 unsigned int cmd, unsigned long arg)
2563 {
2564 unsigned int minor;
2565 int err = 0;
2566 struct hd_geometry *loc = (struct hd_geometry *) arg;
2567 mddev_t *mddev = NULL;
2568 kdev_t dev;
2570 if (!md_capable_admin())
2571 return -EACCES;
2573 dev = inode->i_rdev;
2574 minor = MINOR(dev);
2575 if (minor >= MAX_MD_DEVS)
2576 return -EINVAL;
2578 /*
2579 * Commands dealing with the RAID driver but not any
2580 * particular array:
2581 */
2582 switch (cmd)
2583 {
2584 case RAID_VERSION:
2585 err = get_version((void *)arg);
2586 goto done;
2588 case PRINT_RAID_DEBUG:
2589 err = 0;
2590 md_print_devices();
2591 goto done_unlock;
2593 case BLKGETSIZE: /* Return device size */
2594 if (!arg) {
2595 err = -EINVAL;
2596 goto abort;
2597 }
2598 err = md_put_user(md_hd_struct[minor].nr_sects,
2599 (long *) arg);
2600 goto done;
2602 case BLKFLSBUF:
2603 fsync_dev(dev);
2604 invalidate_buffers(dev);
2605 goto done;
2607 case BLKRASET:
2608 if (arg > 0xff) {
2609 err = -EINVAL;
2610 goto abort;
2611 }
2612 read_ahead[MAJOR(dev)] = arg;
2613 goto done;
2615 case BLKRAGET:
2616 if (!arg) {
2617 err = -EINVAL;
2618 goto abort;
2619 }
2620 err = md_put_user (read_ahead[
2621 MAJOR(dev)], (long *) arg);
2622 goto done;
2623 default:;
2624 }
2626 /*
2627 * Commands creating/starting a new array:
2628 */
2630 mddev = kdev_to_mddev(dev);
2632 switch (cmd)
2633 {
2634 case SET_ARRAY_INFO:
2635 case START_ARRAY:
2636 if (mddev) {
2637 printk("array md%d already exists!\n",
2638 mdidx(mddev));
2639 err = -EEXIST;
2640 goto abort;
2641 }
2642 default:;
2643 }
2644 switch (cmd)
2645 {
2646 case SET_ARRAY_INFO:
2647 mddev = alloc_mddev(dev);
2648 if (!mddev) {
2649 err = -ENOMEM;
2650 goto abort;
2651 }
2652 /*
2653 * alloc_mddev() should possibly self-lock.
2654 */
2655 err = lock_mddev(mddev);
2656 if (err) {
2657 printk("ioctl, reason %d, cmd %d\n", err, cmd);
2658 goto abort;
2659 }
2661 if (mddev->sb) {
2662 printk("array md%d already has a superblock!\n",
2663 mdidx(mddev));
2664 err = -EBUSY;
2665 goto abort_unlock;
2666 }
2667 if (arg) {
2668 mdu_array_info_t info;
2669 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2670 err = -EFAULT;
2671 goto abort_unlock;
2672 }
2673 err = set_array_info(mddev, &info);
2674 if (err) {
2675 printk("couldnt set array info. %d\n", err);
2676 goto abort_unlock;
2679 goto done_unlock;
2681 case START_ARRAY:
2682 /*
2683 * possibly make it lock the array ...
2684 */
2685 err = autostart_array((kdev_t)arg);
2686 if (err) {
2687 printk("autostart %s failed!\n",
2688 partition_name((kdev_t)arg));
2689 goto abort;
2690 }
2691 goto done;
2693 default:;
2694 }
2696 /*
2697 * Commands querying/configuring an existing array:
2698 */
2700 if (!mddev) {
2701 err = -ENODEV;
2702 goto abort;
2703 }
2704 err = lock_mddev(mddev);
2705 if (err) {
2706 printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2707 goto abort;
2708 }
2709 /* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
2710 if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2711 err = -ENODEV;
2712 goto abort_unlock;
2713 }
2715 /*
2716 * Commands even a read-only array can execute:
2717 */
2718 switch (cmd)
2719 {
2720 case GET_ARRAY_INFO:
2721 err = get_array_info(mddev, (void *)arg);
2722 goto done_unlock;
2724 case GET_DISK_INFO:
2725 err = get_disk_info(mddev, (void *)arg);
2726 goto done_unlock;
2728 case RESTART_ARRAY_RW:
2729 err = restart_array(mddev);
2730 goto done_unlock;
2732 case STOP_ARRAY:
2733 err = do_md_stop (mddev, 0);
2734 if (err)
2735 goto done_unlock;
2736 else
2737 goto done;
2739 case STOP_ARRAY_RO:
2740 err = do_md_stop (mddev, 1);
2741 goto done_unlock;
2743 /*
2744 * We have a problem here: there is no easy way to give a CHS virtual
2745 * geometry. We currently pretend a 2-head, 4-sector geometry (one
2746 * cylinder = 8 sectors, hence the nr_sects/8 below) with a BIG
2747 * number of cylinders... This drives dosfs just mad... ;-)
2748 */
2749 case HDIO_GETGEO:
2750 if (!loc) {
2751 err = -EINVAL;
2752 goto abort_unlock;
2753 }
2754 err = md_put_user (2, (char *) &loc->heads);
2755 if (err)
2756 goto abort_unlock;
2757 err = md_put_user (4, (char *) &loc->sectors);
2758 if (err)
2759 goto abort_unlock;
2760 err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2761 (short *) &loc->cylinders);
2762 if (err)
2763 goto abort_unlock;
2764 err = md_put_user (md_hd_struct[minor].start_sect,
2765 (long *) &loc->start);
2766 goto done_unlock;
2767 }
2769 /*
2770 * The remaining ioctls are changing the state of the
2771 * superblock, so we do not allow read-only arrays
2772 * here:
2773 */
2774 if (mddev->ro) {
2775 err = -EROFS;
2776 goto abort_unlock;
2777 }
2779 switch (cmd)
2780 {
2781 case CLEAR_ARRAY:
2782 err = clear_array(mddev);
2783 goto done_unlock;
2785 case ADD_NEW_DISK:
2786 {
2787 mdu_disk_info_t info;
2788 if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2789 err = -EFAULT;
2790 else
2791 err = add_new_disk(mddev, &info);
2792 goto done_unlock;
2793 }
2794 case HOT_REMOVE_DISK:
2795 err = hot_remove_disk(mddev, (kdev_t)arg);
2796 goto done_unlock;
2798 case HOT_ADD_DISK:
2799 err = hot_add_disk(mddev, (kdev_t)arg);
2800 goto done_unlock;
2802 case SET_DISK_INFO:
2803 err = set_disk_info(mddev, (void *)arg);
2804 goto done_unlock;
2806 case WRITE_RAID_INFO:
2807 err = write_raid_info(mddev);
2808 goto done_unlock;
2810 case UNPROTECT_ARRAY:
2811 err = unprotect_array(mddev);
2812 goto done_unlock;
2814 case PROTECT_ARRAY:
2815 err = protect_array(mddev);
2816 goto done_unlock;
2818 case SET_DISK_FAULTY:
2819 err = set_disk_faulty(mddev, (kdev_t)arg);
2820 goto done_unlock;
2822 case RUN_ARRAY:
2823 {
2824 /* The data is never used....
2825 mdu_param_t param;
2826 err = md_copy_from_user(&param, (mdu_param_t *)arg,
2827 sizeof(param));
2828 if (err)
2829 goto abort_unlock;
2830 */
2831 err = do_md_run (mddev);
2832 /*
2833 * we have to clean up the mess if
2834 * the array cannot be run for some
2835 * reason ...
2836 */
2837 if (err) {
2838 mddev->sb_dirty = 0;
2839 do_md_stop (mddev, 0);
2840 }
2841 goto done_unlock;
2842 }
2844 default:
2845 printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ioctls.\n", current->comm, current->pid);
2846 err = -EINVAL;
2847 goto abort_unlock;
2848 }
2850 done_unlock:
2851 abort_unlock:
2852 if (mddev)
2853 unlock_mddev(mddev);
2854 else
2855 printk("huh11?\n");
2857 return err;
2858 done:
2859 if (err)
2860 printk("md_ioctl: 'done' path reached with error %d!\n", err);
2861 abort:
2862 return err;
2863 }
2865 static int md_open (struct inode *inode, struct file *file)
2866 {
2867 /*
2868 * Always succeed
2869 */
2870 return (0);
2871 }
2873 static struct block_device_operations md_fops=
2874 {
2875 open: md_open,
2876 ioctl: md_ioctl,
2877 };
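/*
* md_thread() is the body of every MD kernel thread: it detaches from
* its parent, then loops waiting for THREAD_WAKEUP and runs
* thread->run() on every wakeup; clearing ->run (as done by
* md_unregister_thread) makes it break out and signal thread->sem.
*/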
2880 int md_thread(void * arg)
2881 {
2882 mdk_thread_t *thread = arg;
2884 md_lock_kernel();
2885 exit_mm(current);
2886 exit_files(current);
2887 exit_fs(current);
2889 /*
2890 * Detach thread
2891 */
2892 sys_setsid();
2893 sprintf(current->comm, "%s", thread->name);
2894 md_init_signals();
2895 md_flush_signals();
2896 thread->tsk = current;
2898 /*
2899 * md_thread is a 'system-thread', its priority should be very
2900 * high. We avoid resource deadlocks individually in each raid
2901 * personality. (RAID5 does preallocation) The thread runs at an
2902 * elevated priority (see below), so we do not get into a
2903 * priority inversion deadlock.
2904 *
2905 * we definitely have to have equal or higher priority than
2906 * bdflush, otherwise bdflush will deadlock if there are too
2907 * many dirty RAID5 blocks.
2908 */
2909 current->policy = SCHED_OTHER;
2910 current->priority = 40;
2911 // md_unlock_kernel();
2913 up(thread->sem);
2915 for (;;) {
2916 DECLARE_WAITQUEUE(wait, current);
2918 add_wait_queue(&thread->wqueue, &wait);
2919 set_task_state(current, TASK_INTERRUPTIBLE);
2920 if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2921 dprintk("thread %p went to sleep.\n", thread);
2922 schedule();
2923 dprintk("thread %p woke up.\n", thread);
2924 }
2925 current->state = TASK_RUNNING;
2926 remove_wait_queue(&thread->wqueue, &wait);
2927 clear_bit(THREAD_WAKEUP, &thread->flags);
2929 if (thread->run) {
2930 thread->run(thread->data);
2931 run_task_queue(&tq_disk);
2932 } else
2933 break;
2934 if (md_signal_pending(current)) {
2935 printk("%8s(%d) flushing signals.\n", current->comm,
2936 current->pid);
2937 md_flush_signals();
2938 }
2939 }
2940 up(thread->sem);
2941 return 0;
2942 }
2944 void md_wakeup_thread(mdk_thread_t *thread)
2945 {
2946 dprintk("waking up MD thread %p.\n", thread);
2947 set_bit(THREAD_WAKEUP, &thread->flags);
2948 wake_up(&thread->wqueue);
2949 }
2951 mdk_thread_t *md_register_thread (void (*run) (void *),
2952 void *data, const char *name)
2953 {
2954 mdk_thread_t *thread;
2955 int ret;
2956 DECLARE_MUTEX_LOCKED(sem);
2958 thread = (mdk_thread_t *) kmalloc
2959 (sizeof(mdk_thread_t), GFP_KERNEL);
2960 if (!thread)
2961 return NULL;
2963 memset(thread, 0, sizeof(mdk_thread_t));
2964 md_init_waitqueue_head(&thread->wqueue);
2966 thread->sem = &sem;
2967 thread->run = run;
2968 thread->data = data;
2969 thread->name = name;
2970 ret = kernel_thread(md_thread, thread, 0);
2971 if (ret < 0) {
2972 kfree(thread);
2973 return NULL;
2974 }
2975 down(&sem);
2976 return thread;
2977 }
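/*
* Typical usage, as done for the recovery daemon in md_init() below:
*
*	mdk_thread_t *t;
*
*	t = md_register_thread(md_do_recovery, NULL, "mdrecoveryd");
*	if (t)
*		md_wakeup_thread(t);
*
* md_register_thread() only returns once the new thread is really up,
* because it sleeps on the on-stack semaphore that md_thread() ups.
*/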
2979 void md_interrupt_thread (mdk_thread_t *thread)
2980 {
2981 if (!thread->tsk) {
2982 MD_BUG();
2983 return;
2984 }
2985 printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2986 send_sig(SIGKILL, thread->tsk, 1);
2987 }
2989 void md_unregister_thread (mdk_thread_t *thread)
2990 {
2991 DECLARE_MUTEX_LOCKED(sem);
2993 thread->sem = &sem;
2994 thread->run = NULL;
2995 thread->name = NULL;
2996 if (!thread->tsk) {
2997 MD_BUG();
2998 return;
2999 }
3000 md_interrupt_thread(thread);
3001 down(&sem);
3002 }
3004 void md_recover_arrays (void)
3005 {
3006 if (!md_recovery_thread) {
3007 MD_BUG();
3008 return;
3009 }
3010 md_wakeup_thread(md_recovery_thread);
3011 }
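/*
* md_error() is the central entry point for failing a component device:
* it marks the rdev faulty, aborts any resync in progress, gives the
* personality's error_handler a chance to keep the array alive and then
* wakes the recovery thread.
*/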
3014 int md_error (kdev_t dev, kdev_t rdev)
3015 {
3016 mddev_t *mddev;
3017 mdk_rdev_t *rrdev;
3018 int rc;
3020 mddev = kdev_to_mddev(dev);
3021 /* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
3022 */
3023 if (!mddev) {
3024 MD_BUG();
3025 return 0;
3026 }
3027 rrdev = find_rdev(mddev, rdev);
3028 mark_rdev_faulty(rrdev);
3029 /*
3030 * if recovery was running, stop it now.
3031 */
3032 if (mddev->pers->stop_resync)
3033 mddev->pers->stop_resync(mddev);
3034 if (mddev->recovery_running)
3035 md_interrupt_thread(md_recovery_thread);
3036 if (mddev->pers->error_handler) {
3037 rc = mddev->pers->error_handler(mddev, rdev);
3038 md_recover_arrays();
3039 return rc;
3040 }
3041 return 0;
3042 }
3044 static int status_unused (char * page)
3045 {
3046 int sz = 0, i = 0;
3047 mdk_rdev_t *rdev;
3048 struct md_list_head *tmp;
3050 sz += sprintf(page + sz, "unused devices: ");
3052 ITERATE_RDEV_ALL(rdev,tmp) {
3053 if (!rdev->same_set.next && !rdev->same_set.prev) {
3054 /*
3055 * The device is not yet used by any array.
3056 */
3057 i++;
3058 sz += sprintf(page + sz, "%s ",
3059 partition_name(rdev->dev));
3060 }
3061 }
3062 if (!i)
3063 sz += sprintf(page + sz, "<none>");
3065 sz += sprintf(page + sz, "\n");
3066 return sz;
3067 }
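/*
* status_resync() renders the "[======>........]" progress bar plus the
* percentage, block counts, estimated finish time and current speed
* shown in /proc/mdstat while a resync/recovery is running.
*/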
3070 static int status_resync (char * page, mddev_t * mddev)
3071 {
3072 int sz = 0;
3073 unsigned long max_blocks, resync, res, dt, db, rt;
3075 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
3076 max_blocks = mddev->sb->size;
3078 /*
3079 * Should not happen.
3080 */
3081 if (!max_blocks) {
3082 MD_BUG();
3083 return 0;
3084 }
3085 res = (resync/1024)*1000/(max_blocks/1024 + 1);
3086 {
3087 int i, x = res/50, y = 20-x;
3088 sz += sprintf(page + sz, "[");
3089 for (i = 0; i < x; i++)
3090 sz += sprintf(page + sz, "=");
3091 sz += sprintf(page + sz, ">");
3092 for (i = 0; i < y; i++)
3093 sz += sprintf(page + sz, ".");
3094 sz += sprintf(page + sz, "] ");
3095 }
3096 if (!mddev->recovery_running)
3097 /*
3098 * true resync
3099 */
3100 sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
3101 res/10, res % 10, resync, max_blocks);
3102 else
3103 /*
3104 * recovery ...
3105 */
3106 sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
3107 res/10, res % 10, resync, max_blocks);
3109 /*
3110 * We do not want to overflow, so the order of operands and
3111 * the * 100 / 100 trick are important. We do a +1 to be
3112 * safe against division by zero. We only estimate anyway.
3113 *
3114 * dt: time from mark until now
3115 * db: blocks written from mark until now
3116 * rt: remaining time
3117 */
3118 dt = ((jiffies - mddev->resync_mark) / HZ);
3119 if (!dt) dt++;
3120 db = resync - mddev->resync_mark_cnt;
3121 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
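/*
* Worked example: with dt = 100 seconds, db = 5000 blocks since the
* mark and 100000 blocks left, rt = (100 * (100000/51))/100 = 1960
* seconds, displayed below as "finish=32.6min".
*/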
3123 sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3125 sz += sprintf(page + sz, " speed=%ldK/sec", db/dt);
3127 return sz;
3128 }
3130 static int md_status_read_proc(char *page, char **start, off_t off,
3131 int count, int *eof, void *data)
3132 {
3133 int sz = 0, j, size;
3134 struct md_list_head *tmp, *tmp2;
3135 mdk_rdev_t *rdev;
3136 mddev_t *mddev;
3138 sz += sprintf(page + sz, "Personalities : ");
3139 for (j = 0; j < MAX_PERSONALITY; j++)
3140 if (pers[j])
3141 sz += sprintf(page+sz, "[%s] ", pers[j]->name);
3143 sz += sprintf(page+sz, "\n");
3146 sz += sprintf(page+sz, "read_ahead ");
3147 if (read_ahead[MD_MAJOR] == INT_MAX)
3148 sz += sprintf(page+sz, "not set\n");
3149 else
3150 sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
3152 ITERATE_MDDEV(mddev,tmp) {
3153 sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
3154 mddev->pers ? "" : "in");
3155 if (mddev->pers) {
3156 if (mddev->ro)
3157 sz += sprintf(page + sz, " (read-only)");
3158 sz += sprintf(page + sz, " %s", mddev->pers->name);
3159 }
3161 size = 0;
3162 ITERATE_RDEV(mddev,rdev,tmp2) {
3163 sz += sprintf(page + sz, " %s[%d]",
3164 partition_name(rdev->dev), rdev->desc_nr);
3165 if (rdev->faulty) {
3166 sz += sprintf(page + sz, "(F)");
3167 continue;
3168 }
3169 size += rdev->size;
3170 }
3172 if (mddev->nb_dev) {
3173 if (mddev->pers)
3174 sz += sprintf(page + sz, "\n %d blocks",
3175 md_size[mdidx(mddev)]);
3176 else
3177 sz += sprintf(page + sz, "\n %d blocks", size);
3178 }
3180 if (!mddev->pers) {
3181 sz += sprintf(page+sz, "\n");
3182 continue;
3183 }
3185 sz += mddev->pers->status (page+sz, mddev);
3187 sz += sprintf(page+sz, "\n ");
3188 if (mddev->curr_resync) {
3189 sz += status_resync (page+sz, mddev);
3190 } else {
3191 if (md_atomic_read(&mddev->resync_sem.count) != 1)
3192 sz += sprintf(page + sz, " resync=DELAYED");
3193 }
3194 sz += sprintf(page + sz, "\n");
3195 }
3196 sz += status_unused (page + sz);
3198 return sz;
3199 }
3201 int register_md_personality (int pnum, mdk_personality_t *p)
3202 {
3203 if (pnum >= MAX_PERSONALITY)
3204 return -EINVAL;
3206 if (pers[pnum])
3207 return -EBUSY;
3209 pers[pnum] = p;
3210 printk(KERN_INFO "%s personality registered\n", p->name);
3211 return 0;
3212 }
3214 int unregister_md_personality (int pnum)
3215 {
3216 if (pnum >= MAX_PERSONALITY)
3217 return -EINVAL;
3219 printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
3220 pers[pnum] = NULL;
3221 return 0;
3222 }
3224 static mdp_disk_t *get_spare(mddev_t *mddev)
3225 {
3226 mdp_super_t *sb = mddev->sb;
3227 mdp_disk_t *disk;
3228 mdk_rdev_t *rdev;
3229 struct md_list_head *tmp;
3231 ITERATE_RDEV(mddev,rdev,tmp) {
3232 if (rdev->faulty)
3233 continue;
3234 if (!rdev->sb) {
3235 MD_BUG();
3236 continue;
3237 }
3238 disk = &sb->disks[rdev->desc_nr];
3239 if (disk_faulty(disk)) {
3240 MD_BUG();
3241 continue;
3242 }
3243 if (disk_active(disk))
3244 continue;
3245 return disk;
3246 }
3247 return NULL;
3248 }
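/*
* is_mddev_idle() samples the kstat per-disk block counters of every
* component device; if any counter moved since the previous poll the
* array is considered busy. This is what lets a resync soak up the full
* available bandwidth on an otherwise idle system.
*/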
3250 static int is_mddev_idle (mddev_t *mddev)
3251 {
3252 mdk_rdev_t *rdev;
3253 struct md_list_head *tmp;
3254 int idle;
3255 unsigned long curr_events;
3257 idle = 1;
3258 ITERATE_RDEV(mddev,rdev,tmp) {
3259 int major = MAJOR(rdev->dev);
3260 int idx = disk_index(rdev->dev);
3262 curr_events = kstat.dk_drive_rblk[major][idx] +
3263 kstat.dk_drive_wblk[major][idx];
3264 // printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3265 if (curr_events != rdev->last_events) {
3266 // printk("!I(%ld)", curr_events - rdev->last_events);
3267 rdev->last_events = curr_events;
3268 idle = 0;
3269 }
3270 }
3271 return idle;
3272 }
3274 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3276 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3277 {
3278 /* another "blocks" (1K) blocks have been synced */
3279 atomic_sub(blocks, &mddev->recovery_active);
3280 wake_up(&mddev->recovery_wait);
3281 if (!ok) {
3282 // stop recovery, signal do_sync ....
3283 }
3284 }
3286 #define SYNC_MARKS 10
3287 #define SYNC_MARK_STEP (3*HZ)
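/*
* Overview of md_do_sync(): after serializing against arrays that share
* physical units, it walks the array issuing sync_request() chunks.
* Throttling: below speed_limit_min the thread is never slowed down;
* between the two limits it sleeps only while other IO is hitting the
* component disks; above speed_limit_max it always sleeps. The ring of
* SYNC_MARKS (jiffies, block count) marks, stepped every
* SYNC_MARK_STEP, yields the smoothed speed and ETA that
* status_resync() reports.
*/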
3288 int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
3289 {
3290 mddev_t *mddev2;
3291 unsigned int max_blocks, currspeed,
3292 j, window, err, serialize;
3293 kdev_t read_disk = mddev_to_kdev(mddev);
3294 unsigned long mark[SYNC_MARKS];
3295 unsigned long mark_cnt[SYNC_MARKS];
3296 int last_mark,m;
3297 struct md_list_head *tmp;
3298 unsigned long last_check;
3301 err = down_interruptible(&mddev->resync_sem);
3302 if (err)
3303 goto out_nolock;
3305 recheck:
3306 serialize = 0;
3307 ITERATE_MDDEV(mddev2,tmp) {
3308 if (mddev2 == mddev)
3309 continue;
3310 if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
3311 printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
3312 serialize = 1;
3313 break;
3314 }
3315 }
3316 if (serialize) {
3317 interruptible_sleep_on(&resync_wait);
3318 if (md_signal_pending(current)) {
3319 md_flush_signals();
3320 err = -EINTR;
3321 goto out;
3322 }
3323 goto recheck;
3324 }
3326 mddev->curr_resync = 1;
3328 max_blocks = mddev->sb->size;
3330 printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
3331 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3332 sysctl_speed_limit_min);
3333 printk(KERN_INFO "md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max);
3335 /*
3336 * Resync has low priority.
3337 */
3338 current->priority = 1;
3340 is_mddev_idle(mddev); /* this also initializes IO event counters */
3341 for (m = 0; m < SYNC_MARKS; m++) {
3342 mark[m] = jiffies;
3343 mark_cnt[m] = 0;
3344 }
3345 last_mark = 0;
3346 mddev->resync_mark = mark[last_mark];
3347 mddev->resync_mark_cnt = mark_cnt[last_mark];
3349 /*
3350 * Tune reconstruction:
3351 */
3352 window = md_maxreadahead[mdidx(mddev)]/1024;
3353 printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks);
3355 atomic_set(&mddev->recovery_active, 0);
3356 init_waitqueue_head(&mddev->recovery_wait);
3357 last_check = 0;
3358 for (j = 0; j < max_blocks;) {
3359 int blocks;
3361 blocks = mddev->pers->sync_request(mddev, j);
3363 if (blocks < 0) {
3364 err = blocks;
3365 goto out;
3366 }
3367 atomic_add(blocks, &mddev->recovery_active);
3368 j += blocks;
3369 mddev->curr_resync = j;
3371 if (last_check + window > j)
3372 continue;
3374 run_task_queue(&tq_disk); //??
3376 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
3377 /* step marks */
3378 int next = (last_mark+1) % SYNC_MARKS;
3380 mddev->resync_mark = mark[next];
3381 mddev->resync_mark_cnt = mark_cnt[next];
3382 mark[next] = jiffies;
3383 mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3384 last_mark = next;
3385 }
3388 if (md_signal_pending(current)) {
3389 /*
3390 * got a signal, exit.
3391 */
3392 mddev->curr_resync = 0;
3393 printk("md_do_sync() got signal ... exiting\n");
3394 md_flush_signals();
3395 err = -EINTR;
3396 goto out;
3397 }
3399 /*
3400 * this loop exits only when we are slower than
3401 * the 'hard' speed limit, or the system was IO-idle for
3402 * a jiffy.
3403 * the system might be non-idle CPU-wise, but we only care
3404 * about not overloading the IO subsystem. (things like an
3405 * e2fsck being done on the RAID array should execute fast)
3406 */
3407 repeat:
3408 if (md_need_resched(current))
3409 schedule();
3411 currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1;
3413 if (currspeed > sysctl_speed_limit_min) {
3414 current->priority = 1;
3416 if ((currspeed > sysctl_speed_limit_max) ||
3417 !is_mddev_idle(mddev)) {
3418 current->state = TASK_INTERRUPTIBLE;
3419 md_schedule_timeout(HZ/4);
3420 if (!md_signal_pending(current))
3421 goto repeat;
3422 }
3423 } else
3424 current->priority = 40;
3425 }
3426 fsync_dev(read_disk);
3427 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3428 err = 0;
3429 /*
3430 * this also signals 'finished resyncing' to md_stop
3431 */
3432 out:
3433 wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
3434 up(&mddev->resync_sem);
3435 out_nolock:
3436 mddev->curr_resync = 0;
3437 wake_up(&resync_wait);
3438 return err;
3439 }
3442 /*
3443 * This is a kernel thread which syncs a spare disk with the active array.
3444 *
3445 * the amount of foolproofing might seem to be a tad excessive, but an
3446 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3447 * of my root partition with the first 0.5 gigs of my /home partition ... so
3448 * i'm a bit nervous ;)
3449 */
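/*
* The loop below: for every array that is missing active disks, pick a
* spare via get_spare(), let the personality start writing to it
* (DISKOP_SPARE_WRITE), run md_do_sync() and, on success, promote the
* spare with DISKOP_SPARE_ACTIVE; an -EIO from the sync marks the spare
* itself faulty and moves on to the next one.
*/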
3450 void md_do_recovery (void *data)
3451 {
3452 int err;
3453 mddev_t *mddev;
3454 mdp_super_t *sb;
3455 mdp_disk_t *spare;
3456 struct md_list_head *tmp;
3458 printk(KERN_INFO "md: recovery thread got woken up ...\n");
3459 restart:
3460 ITERATE_MDDEV(mddev,tmp) {
3461 sb = mddev->sb;
3462 if (!sb)
3463 continue;
3464 if (mddev->recovery_running)
3465 continue;
3466 if (sb->active_disks == sb->raid_disks)
3467 continue;
3468 if (!sb->spare_disks) {
3469 printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
3470 continue;
3471 }
3472 /*
3473 * now here we get the spare and resync it.
3474 */
3475 if ((spare = get_spare(mddev)) == NULL)
3476 continue;
3477 printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3478 if (!mddev->pers->diskop)
3479 continue;
3480 if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
3481 continue;
3482 down(&mddev->recovery_sem);
3483 mddev->recovery_running = 1;
3484 err = md_do_sync(mddev, spare);
3485 if (err == -EIO) {
3486 printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3487 if (!disk_faulty(spare)) {
3488 mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
3489 mark_disk_faulty(spare);
3490 mark_disk_nonsync(spare);
3491 mark_disk_inactive(spare);
3492 sb->spare_disks--;
3493 sb->working_disks--;
3494 sb->failed_disks++;
3495 }
3496 } else
3497 if (disk_faulty(spare))
3498 mddev->pers->diskop(mddev, &spare,
3499 DISKOP_SPARE_INACTIVE);
3500 if (err == -EINTR || err == -ENOMEM) {
3501 /*
3502 * Recovery got interrupted, or ran out of mem ...
3503 * signal back that we have finished using the array.
3504 */
3505 mddev->pers->diskop(mddev, &spare,
3506 DISKOP_SPARE_INACTIVE);
3507 up(&mddev->recovery_sem);
3508 mddev->recovery_running = 0;
3509 continue;
3510 } else {
3511 mddev->recovery_running = 0;
3512 up(&mddev->recovery_sem);
3513 }
3514 if (!disk_faulty(spare)) {
3515 /*
3516 * the SPARE_ACTIVE diskop possibly changes the
3517 * pointer too
3518 */
3519 mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
3520 mark_disk_sync(spare);
3521 mark_disk_active(spare);
3522 sb->active_disks++;
3523 sb->spare_disks--;
3524 }
3525 mddev->sb_dirty = 1;
3526 md_update_sb(mddev);
3527 goto restart;
3528 }
3529 printk(KERN_INFO "md: recovery thread finished ...\n");
3530 }
3533 int md_notify_reboot(struct notifier_block *this,
3534 unsigned long code, void *x)
3535 {
3536 struct md_list_head *tmp;
3537 mddev_t *mddev;
3539 if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3540 || (code == MD_SYS_POWER_OFF)) {
3542 printk(KERN_INFO "stopping all md devices.\n");
3544 ITERATE_MDDEV(mddev,tmp)
3545 do_md_stop (mddev, 1);
3546 /*
3547 * certain more exotic SCSI devices are known to be
3548 * volatile wrt too early system reboots. While the
3549 * right place to handle this issue is the given
3550 * driver, we do want to have a safe RAID driver ...
3551 */
3552 md_mdelay(1000*1);
3553 }
3554 return NOTIFY_DONE;
3555 }
3557 struct notifier_block md_notifier = {
3558 md_notify_reboot,
3559 NULL,
3560 0
3561 };
3563 void md__init raid_setup(char *str, int *ints)
3564 {
3565 char tmpline[100];
3566 int len, pos, nr, i;
3568 len = strlen(str) + 1;
3569 nr = 0;
3570 pos = 0;
3572 for (i = 0; i < len; i++) {
3573 char c = str[i];
3575 if (c == ',' || !c) {
3576 tmpline[pos] = 0;
3577 if (!strcmp(tmpline,"noautodetect"))
3578 raid_setup_args.noautodetect = 1;
3579 nr++;
3580 pos = 0;
3581 continue;
3582 }
3583 tmpline[pos] = c;
3584 pos++;
3585 }
3586 raid_setup_args.set = 1;
3587 return;
3588 }
3590 static void md_geninit (void)
3591 {
3592 int i;
3594 for(i = 0; i < MAX_MD_DEVS; i++) {
3595 md_blocksizes[i] = 1024;
3596 md_size[i] = 0;
3597 md_maxreadahead[i] = MD_READAHEAD;
3598 register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0);
3599 }
3600 blksize_size[MD_MAJOR] = md_blocksizes;
3601 blk_size[MAJOR_NR] = md_size;
3602 max_readahead[MD_MAJOR] = md_maxreadahead;
3604 printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3606 #ifdef CONFIG_PROC_FS
3607 create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
3608 #endif
3609 }
3610 void hsm_init (void);
3611 void translucent_init (void);
3612 void linear_init (void);
3613 void raid0_init (void);
3614 void raid1_init (void);
3615 void raid5_init (void);
3617 int md__init md_init (void)
3618 {
3619 static char * name = "mdrecoveryd";
3621 printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n",
3622 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3623 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL);
3625 if (devfs_register_blkdev (MD_MAJOR, "md", &md_fops))
3626 {
3627 printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR);
3628 return (-1);
3629 }
3630 devfs_handle = devfs_mk_dir (NULL, "md", 0, NULL);
3631 devfs_register_series (devfs_handle, "%u",MAX_MD_DEVS,DEVFS_FL_DEFAULT,
3632 MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR, 0, 0,
3633 &md_fops, NULL);
3635 blk_dev[MD_MAJOR].queue = md_get_queue;
3637 read_ahead[MD_MAJOR] = INT_MAX;
3638 md_gendisk.next = gendisk_head;
3640 gendisk_head = &md_gendisk;
3642 md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3643 if (!md_recovery_thread)
3644 printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
3646 md_register_reboot_notifier(&md_notifier);
3647 raid_table_header = register_sysctl_table(raid_root_table, 1);
3649 #ifdef CONFIG_MD_LINEAR
3650 linear_init ();
3651 #endif
3652 #ifdef CONFIG_MD_RAID0
3653 raid0_init ();
3654 #endif
3655 #ifdef CONFIG_MD_RAID1
3656 raid1_init ();
3657 #endif
3658 #ifdef CONFIG_MD_RAID5
3659 raid5_init ();
3660 #endif
3661 #if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE)
3662 /*
3663 * pick a XOR routine, runtime.
3664 */
3665 calibrate_xor_block();
3666 #endif
3667 md_geninit();
3668 return (0);
3669 }
3671 #ifdef CONFIG_MD_BOOT
3672 #define MAX_MD_BOOT_DEVS 8
3673 struct {
3674 unsigned long set;
3675 int pers[MAX_MD_BOOT_DEVS];
3676 int chunk[MAX_MD_BOOT_DEVS];
3677 kdev_t devices[MAX_MD_BOOT_DEVS][MAX_REAL];
3678 } md_setup_args md__initdata;
3680 /*
3681 * Parse the command-line parameters given to our kernel, but do not
3682 * actually try to invoke the MD device now; that is handled by
3683 * md_setup_drive after the low-level disk drivers have initialised.
3685 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3686 * assigns the task of parsing integer arguments to the
3687 * invoked program now). Added ability to initialise all
3688 * the MD devices (by specifying multiple "md=" lines)
3689 * instead of just one. -- KTK
3690 * 18May2000: Added support for persistent-superblock arrays:
3691 * md=n,0,factor,fault,device-list uses RAID0 for device n
3692 * md=n,-1,factor,fault,device-list uses LINEAR for device n
3693 * md=n,device-list reads a RAID superblock from the devices
3694 * elements in device-list are read by name_to_kdev_t so can be
3695 * a hex number or something like /dev/hda1 /dev/sdb
3696 */
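/*
* Examples of the syntax above (device names are illustrative):
*	md=0,0,4,0,/dev/sda1,/dev/sdb1	RAID0 md0, chunk 1<<(4+12) = 64k
*	md=1,-1,0,0,/dev/sdc1,/dev/sdd1	LINEAR md1
*	md=2,/dev/sde1,/dev/sdf1	md2 assembled from RAID superblocks
*/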
3697 extern kdev_t name_to_kdev_t(char *line) md__init;
3698 static int md__init md_setup(char *str)
3699 {
3700 int minor, level, factor, fault, i=0;
3701 kdev_t device;
3702 char *devnames, *pername = "";
3704 if(get_option(&str, &minor) != 2) { /* MD Number */
3705 printk("md: Too few arguments supplied to md=.\n");
3706 return 0;
3707 }
3708 if (minor >= MAX_MD_BOOT_DEVS) {
3709 printk ("md: Minor device number too high.\n");
3710 return 0;
3711 } else if (md_setup_args.set & (1 << minor)) {
3712 printk ("md: Warning - md=%d,... has been specified twice;\n"
3713 " will discard the first definition.\n", minor);
3715 switch(get_option(&str, &level)) { /* RAID Personality */
3716 case 2: /* could be 0 or -1.. */
3717 if (level == 0 || level == -1) {
3718 if (get_option(&str, &factor) != 2 || /* Chunk Size */
3719 get_option(&str, &fault) != 2) {
3720 printk("md: Too few arguments supplied to md=.\n");
3721 return 0;
3722 }
3723 md_setup_args.pers[minor] = level;
3724 md_setup_args.chunk[minor] = 1 << (factor+12);
3725 switch(level) {
3726 case -1:
3727 level = LINEAR;
3728 pername = "linear";
3729 break;
3730 case 0:
3731 level = RAID0;
3732 pername = "raid0";
3733 break;
3734 default:
3735 printk ("md: The kernel has not been configured for raid%d"
3736 " support!\n", level);
3737 return 0;
3738 }
3739 md_setup_args.pers[minor] = level;
3740 break;
3741 }
3742 /* FALL THROUGH */
3743 case 1: /* the first device is numeric */
3744 md_setup_args.devices[minor][i++] = level;
3745 /* FALL THROUGH */
3746 case 0:
3747 md_setup_args.pers[minor] = 0;
3748 pername = "super-block";
3749 }
3750 devnames = str;
3751 for (; i<MAX_REAL && str; i++) {
3752 if ((device = name_to_kdev_t(str))) {
3753 md_setup_args.devices[minor][i] = device;
3754 } else {
3755 printk ("md: Unknown device name, %s.\n", str);
3756 return 0;
3757 }
3758 if ((str = strchr(str, ',')) != NULL)
3759 str++;
3760 }
3761 if (!i) {
3762 printk ("md: No devices specified for md%d?\n", minor);
3763 return 0;
3764 }
3766 printk ("md: Will configure md%d (%s) from %s, below.\n",
3767 minor, pername, devnames);
3768 md_setup_args.devices[minor][i] = (kdev_t) 0;
3769 md_setup_args.set |= (1 << minor);
3770 return 1;
3771 }
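/*
* md_setup_drive() replays the parsed md= settings once the low-level
* disk drivers are up: for non-persistent setups it builds the array
* with set_array_info() plus one add_new_disk() per device, for
* persistent ones it just add_new_disk()s each device so the superblock
* is read, and finally do_md_run()s the result.
*/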
3773 void md__init md_setup_drive(void)
3774 {
3775 int minor, i;
3776 kdev_t dev;
3777 mddev_t *mddev;
3779 for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) {
3780 mdu_disk_info_t dinfo;
3781 int err=0;
3782 if (!(md_setup_args.set & (1 << minor)))
3783 continue;
3784 printk("md: Loading md%d.\n", minor);
3785 mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3786 if (md_setup_args.pers[minor]) {
3787 /* non-persistent */
3788 mdu_array_info_t ainfo;
3789 ainfo.level = pers_to_level(md_setup_args.pers[minor]);
3790 ainfo.size = 0;
3791 ainfo.nr_disks = 0;
3792 ainfo.raid_disks = 0;
3793 ainfo.md_minor = minor;
3794 ainfo.not_persistent = 1;
3796 ainfo.state = MD_SB_CLEAN;
3797 ainfo.active_disks = 0;
3798 ainfo.working_disks = 0;
3799 ainfo.failed_disks = 0;
3800 ainfo.spare_disks = 0;
3801 ainfo.layout = 0;
3802 ainfo.chunk_size = md_setup_args.chunk[minor];
3803 err = set_array_info(mddev, &ainfo);
3804 for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) {
3805 dinfo.number = i;
3806 dinfo.raid_disk = i;
3807 dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
3808 dinfo.major = MAJOR(dev);
3809 dinfo.minor = MINOR(dev);
3810 mddev->sb->nr_disks++;
3811 mddev->sb->raid_disks++;
3812 mddev->sb->active_disks++;
3813 mddev->sb->working_disks++;
3814 err = add_new_disk (mddev, &dinfo);
3815 }
3816 } else {
3817 /* persistent */
3818 for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) {
3819 dinfo.major = MAJOR(dev);
3820 dinfo.minor = MINOR(dev);
3821 add_new_disk (mddev, &dinfo);
3822 }
3823 }
3824 if (!err)
3825 err = do_md_run(mddev);
3826 if (err) {
3827 mddev->sb_dirty = 0;
3828 do_md_stop(mddev, 0);
3829 printk("md: starting md%d failed\n", minor);
3834 __setup("md=", md_setup);
3835 #endif
3838 MD_EXPORT_SYMBOL(md_size);
3839 MD_EXPORT_SYMBOL(register_md_personality);
3840 MD_EXPORT_SYMBOL(unregister_md_personality);
3841 MD_EXPORT_SYMBOL(partition_name);
3842 MD_EXPORT_SYMBOL(md_error);
3843 MD_EXPORT_SYMBOL(md_do_sync);
3844 MD_EXPORT_SYMBOL(md_done_sync);
3845 MD_EXPORT_SYMBOL(md_recover_arrays);
3846 MD_EXPORT_SYMBOL(md_register_thread);
3847 MD_EXPORT_SYMBOL(md_unregister_thread);
3848 MD_EXPORT_SYMBOL(md_update_sb);
3849 MD_EXPORT_SYMBOL(md_wakeup_thread);
3850 MD_EXPORT_SYMBOL(md_print_devices);
3851 MD_EXPORT_SYMBOL(find_rdev_nr);
3852 MD_EXPORT_SYMBOL(md_interrupt_thread);
3853 MD_EXPORT_SYMBOL(mddev_map);
3854 MD_EXPORT_SYMBOL(md_check_ordering);