/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1994-96 Marc ZYNGIER
          <zyngier@ufr-info-p7.ibp.fr> or
          <maz@gloups.fdn.fr>

     A lot of inspiration came from hd.c ...

     kerneld support by Boris Tobotras <boris@xtalk.msk.su>
     boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>

     RAID-1/RAID-5 extensions by:
        Ingo Molnar, Miguel de Icaza, Gadi Oxman

     Changes for kmod by:
        Cyrus Durgin

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
 * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
 * the extra system load does not show up that much. Increase it if your
 * system can take more.
 */
#define SPEED_LIMIT 1024
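
/*
 * (The limit is enforced in md_do_sync() below: the resync loop sleeps a
 * jiffy at a time whenever blocksize*j/(elapsed jiffies)*HZ/1024 -- i.e.
 * the running resync rate in KB/sec -- exceeds SPEED_LIMIT.)
 */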
#include <linux/config.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/malloc.h>
#include <linux/mm.h>
#include <linux/md.h>
#include <linux/hdreg.h>
#include <linux/stat.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/smp_lock.h>
#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif
#include <linux/errno.h>
#include <linux/init.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>
#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <asm/atomic.h>

#ifdef CONFIG_MD_BOOT
extern kdev_t name_to_kdev_t(char *line) __init;
#endif

static struct hd_struct md_hd_struct[MAX_MD_DEV];
static int md_blocksizes[MAX_MD_DEV];
int md_maxreadahead[MAX_MD_DEV];
#if SUPPORT_RECONSTRUCTION
static struct md_thread *md_sync_thread = NULL;
#endif /* SUPPORT_RECONSTRUCTION */

int md_size[MAX_MD_DEV]={0, };

static void md_geninit (struct gendisk *);

static struct gendisk md_gendisk=
{
        MD_MAJOR,
        "md",
        0,
        1,
        MAX_MD_DEV,
        md_geninit,
        md_hd_struct,
        md_size,
        MAX_MD_DEV,
        NULL,
        NULL
};

static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
struct md_dev md_dev[MAX_MD_DEV];

int md_thread(void * arg);

static struct gendisk *find_gendisk (kdev_t dev)
{
        struct gendisk *tmp=gendisk_head;

        while (tmp != NULL)
        {
                if (tmp->major==MAJOR(dev))
                        return (tmp);
                tmp=tmp->next;
        }

        return (NULL);
}

char *partition_name (kdev_t dev)
{
        static char name[40];           /* This should be long
                                           enough for a device name! */
        struct gendisk *hd = find_gendisk (dev);

        if (!hd)
        {
                sprintf (name, "[dev %s]", kdevname(dev));
                return (name);
        }

        return disk_name (hd, MINOR(dev), name);  /* routine in genhd.c */
}
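
/*
 * For arrays without a superblock (RAID-0 and linear), derive the
 * geometry directly: each member's size is rounded down to a multiple of
 * the chunk factor and the members are laid out back to back, so
 * devices[i].offset is the running sum of the previous members' sizes.
 */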
static int legacy_raid_sb (int minor, int pnum)
{
        int i, factor;

        factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

        /*****
         * do size and offset calculations.
         */
        for (i=0; i<md_dev[minor].nb_dev; i++) {
                md_dev[minor].devices[i].size &= ~(factor - 1);
                md_size[minor] += md_dev[minor].devices[i].size;
                md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
                                        md_dev[minor].devices[i-1].size) : 0;
        }
        if (pnum == RAID0 >> PERSONALITY_SHIFT)
                md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
        return 0;
}

static void free_sb (struct md_dev *mddev)
{
        int i;
        struct real_dev *realdev;

        if (mddev->sb) {
                free_page((unsigned long) mddev->sb);
                mddev->sb = NULL;
        }
        for (i = 0; i <mddev->nb_dev; i++) {
                realdev = mddev->devices + i;
                if (realdev->sb) {
                        free_page((unsigned long) realdev->sb);
                        realdev->sb = NULL;
                }
        }
}

/*
 * Check one RAID superblock for generic plausibility
 */

#define BAD_MAGIC KERN_ERR \
"md: %s: invalid raid superblock magic (%x) on block %u\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define SUCCESS 0
#define FAILURE -1
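
/*
 * The superblock lives at the end of each member device; the
 * MD_NEW_SIZE_BLOCKS() macro (see linux/md.h) rounds the raw device
 * size down past the reserved tail area to locate it.
 */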
static int analyze_one_sb (struct real_dev * rdev)
{
        int ret = FAILURE;
        struct buffer_head *bh;
        kdev_t dev = rdev->dev;
        md_superblock_t *sb;

        /*
         * Read the superblock, it's at the end of the disk
         */
        rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
        set_blocksize (dev, MD_SB_BYTES);
        bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);

        if (bh) {
                sb = (md_superblock_t *) bh->b_data;
                if (sb->md_magic != MD_SB_MAGIC) {
                        printk (BAD_MAGIC, kdevname(dev),
                                sb->md_magic, rdev->sb_offset);
                        goto abort;
                }
                rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
                if (!rdev->sb) {
                        printk (OUT_OF_MEM);
                        goto abort;
                }
                memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);

                rdev->size = sb->size;
        } else
                printk (NO_DEVICE,kdevname(rdev->dev));
        ret = SUCCESS;
abort:
        if (bh)
                brelse (bh);
        return ret;
}

#undef SUCCESS
#undef FAILURE

#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_DEVICE

/*
 * Check a full RAID array for plausibility
 */

#define INCONSISTENT KERN_ERR \
"md: superblock inconsistency -- run ckraid\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: %s: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN KERN_ERR \
"md: %s: raid array is not clean -- run ckraid\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: %s: raid array is not clean -- reconstructing parity\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: %s: unsupported raid level %d\n"
static int analyze_sbs (int minor, int pnum)
{
        struct md_dev *mddev = md_dev + minor;
        int i, N = mddev->nb_dev, out_of_date = 0;
        struct real_dev * disks = mddev->devices;
        md_superblock_t *sb, *freshest = NULL;

        /*
         * RAID-0 and linear don't use a RAID superblock
         */
        if (pnum == RAID0 >> PERSONALITY_SHIFT ||
                        pnum == LINEAR >> PERSONALITY_SHIFT)
                return legacy_raid_sb (minor, pnum);

        /*
         * Verify the RAID superblock on each real device
         */
        for (i = 0; i < N; i++)
                if (analyze_one_sb(disks+i))
                        goto abort;

        /*
         * The superblock constant part has to be the same
         * for all disks in the array.
         */
        sb = NULL;
        for (i = 0; i < N; i++) {
                if (!disks[i].sb)
                        continue;
                if (!sb) {
                        sb = disks[i].sb;
                        continue;
                }
                if (memcmp(sb,
                           disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
                        printk (INCONSISTENT);
                        goto abort;
                }
        }

        /*
         * OK, we have all disks and the array is ready to run. Let's
         * find the freshest superblock, that one will be the superblock
         * that represents the whole array.
         */
        if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
                goto abort;
        freshest = NULL;
        for (i = 0; i < N; i++) {
                if (!disks[i].sb)
                        continue;
                if (!freshest) {
                        freshest = disks[i].sb;
                        continue;
                }
                /*
                 * Find the newest superblock version
                 */
                if (disks[i].sb->utime != freshest->utime) {
                        out_of_date = 1;
                        if (disks[i].sb->utime > freshest->utime)
                                freshest = disks[i].sb;
                }
        }
        if (out_of_date)
                printk(OUT_OF_DATE);
        memcpy (sb, freshest, sizeof(*freshest));

        /*
         * Check if we can support this RAID array
         */
        if (sb->major_version != MD_MAJOR_VERSION ||
                        sb->minor_version > MD_MINOR_VERSION) {

                printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
                        sb->major_version, sb->minor_version,
                        sb->patch_version);
                goto abort;
        }

        /*
         * We need to add this as a superblock option.
         */
#if SUPPORT_RECONSTRUCTION
        if (sb->state != (1 << MD_SB_CLEAN)) {
                if (sb->level == 1) {
                        printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
                        goto abort;
                } else
                        printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
        }
#else
        if (sb->state != (1 << MD_SB_CLEAN)) {
                printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
                goto abort;
        }
#endif /* SUPPORT_RECONSTRUCTION */

        switch (sb->level) {
                case 1:
                        md_size[minor] = sb->size;
                        md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
                        break;
                case 4:
                case 5:
                        md_size[minor] = sb->size * (sb->raid_disks - 1);
                        md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
                        break;
                default:
                        printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
                                sb->level);
                        goto abort;
        }
        return 0;
abort:
        free_sb(mddev);
        return 1;
}

#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef NOT_CLEAN
#undef NOT_CLEAN_IGNORE
#undef UNKNOWN_LEVEL
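
/*
 * Write the in-core superblock back to every member device.  Each copy
 * receives the array-wide fields plus that member's own row from
 * sb->disks[] in sb->descriptor, so each disk carries its own identity
 * within the array.
 */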
int md_update_sb(int minor)
{
        struct md_dev *mddev = md_dev + minor;
        struct buffer_head *bh;
        md_superblock_t *sb = mddev->sb;
        struct real_dev *realdev;
        kdev_t dev;
        int i;
        u32 sb_offset;

        sb->utime = CURRENT_TIME;
        for (i = 0; i < mddev->nb_dev; i++) {
                realdev = mddev->devices + i;
                if (!realdev->sb)
                        continue;
                dev = realdev->dev;
                sb_offset = realdev->sb_offset;
                set_blocksize(dev, MD_SB_BYTES);
                printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
                bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
                if (bh) {
                        sb = (md_superblock_t *) bh->b_data;
                        memcpy(sb, mddev->sb, MD_SB_BYTES);
                        memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
                        mark_buffer_uptodate(bh, 1);
                        mark_buffer_dirty(bh, 1);
                        ll_rw_block(WRITE, 1, &bh);
                        wait_on_buffer(bh);
                        bforget(bh);
                        fsync_dev(dev);
                        invalidate_buffers(dev);
                } else
                        printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
        }
        return 0;
}
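
/*
 * Start an array: pick the personality from the requested mode (loading
 * it via kmod if needed), round member sizes down to the chunk factor,
 * analyze the superblocks, and finally hand the array over to the
 * personality's run() method.
 */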
static int do_md_run (int minor, int repart)
{
        int pnum, i, min, factor, err;

        if (!md_dev[minor].nb_dev)
                return -EINVAL;

        if (md_dev[minor].pers)
                return -EBUSY;

        md_dev[minor].repartition=repart;

        if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
                        >= MAX_PERSONALITY)
                return -EINVAL;

        /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
        if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
                for (i = 0; i < md_dev [minor].nb_dev; i++)
                        if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
                                return -EINVAL;
        }
        if (!pers[pnum])
        {
#ifdef CONFIG_KMOD
                char module_name[80];
                sprintf (module_name, "md-personality-%d", pnum);
                request_module (module_name);
                if (!pers[pnum])
#endif
                        return -EINVAL;
        }

        factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

        for (i=0; i<md_dev[minor].nb_dev; i++)
                if (md_dev[minor].devices[i].size<min)
                {
                        printk ("Dev %s smaller than %dk, cannot shrink\n",
                                partition_name (md_dev[minor].devices[i].dev), min);
                        return -EINVAL;
                }

        for (i=0; i<md_dev[minor].nb_dev; i++) {
                fsync_dev(md_dev[minor].devices[i].dev);
                invalidate_buffers(md_dev[minor].devices[i].dev);
        }

        /* Resize devices according to the factor. It is used to align
           partitions size on a given chunk size. */
        md_size[minor]=0;

        /*
         * Analyze the raid superblock
         */
        if (analyze_sbs(minor, pnum))
                return -EINVAL;

        md_dev[minor].pers=pers[pnum];

        if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
        {
                md_dev[minor].pers=NULL;
                free_sb(md_dev + minor);
                return (err);
        }

        if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
        {
                md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
                md_update_sb(minor);
        }

        /* FIXME : We assume here we have blocks
           that are twice as large as sectors.
           THIS MAY NOT BE TRUE !!! */
        md_hd_struct[minor].start_sect=0;
        md_hd_struct[minor].nr_sects=md_size[minor]<<1;

        read_ahead[MD_MAJOR] = 128;
        return (0);
}

static int do_md_stop (int minor, struct inode *inode)
{
        int i;

        if (inode->i_count>1 || md_dev[minor].busy>1) {
                /*
                 * ioctl : one open channel
                 */
                printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
                        minor, inode->i_count, md_dev[minor].busy);
                return -EBUSY;
        }

        if (md_dev[minor].pers) {
                /*
                 * It is safe to call stop here, it only frees private
                 * data. Also, it tells us if a device is unstoppable
                 * (eg. resyncing is in progress)
                 */
                if (md_dev[minor].pers->stop (minor, md_dev+minor))
                        return -EBUSY;
                /*
                 * The device won't exist anymore -> flush it now
                 */
                fsync_dev (inode->i_rdev);
                invalidate_buffers (inode->i_rdev);
                if (md_dev[minor].sb) {
                        md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
                        md_update_sb(minor);
                }
        }

        /* Remove locks. */
        if (md_dev[minor].sb)
                free_sb(md_dev + minor);
        for (i=0; i<md_dev[minor].nb_dev; i++)
                clear_inode (md_dev[minor].devices[i].inode);

        md_dev[minor].nb_dev=md_size[minor]=0;
        md_hd_struct[minor].nr_sects=0;
        md_dev[minor].pers=NULL;

        read_ahead[MD_MAJOR] = 128;

        return (0);
}

static int do_md_add (int minor, kdev_t dev)
{
        int i;
        int hot_add=0;
        struct real_dev *realdev;

        if (md_dev[minor].nb_dev==MAX_REAL)
                return -EINVAL;

        if (!fs_may_mount (dev))
                return -EBUSY;

        if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
                printk("md_add(): zero device size, huh, bailing out.\n");
                return -EINVAL;
        }

        if (md_dev[minor].pers) {
                /*
                 * The array is already running, hot-add the drive, or
                 * bail out:
                 */
                if (!md_dev[minor].pers->hot_add_disk)
                        return -EBUSY;
                else
                        hot_add=1;
        }

        /*
         * Careful. We cannot increase nb_dev for a running array.
         */
        i=md_dev[minor].nb_dev;
        realdev = &md_dev[minor].devices[i];
        realdev->dev=dev;

        /* Lock the device by inserting a dummy inode. This doesn't
           smell very good, but I need to be consistent with the
           mount stuff, especially with fs_may_mount. If someone has
           a better idea, please help! */
        realdev->inode=get_empty_inode ();
        realdev->inode->i_dev=dev;      /* don't care about other fields */
        insert_inode_hash (realdev->inode);

        /* Sizes are now rounded at run time */

        /* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/

        realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];

        if (hot_add) {
                /*
                 * Check the superblock for consistency.
                 * The personality itself has to check whether it's getting
                 * added with the proper flags.  The personality has to be
                 * checked too. ;)
                 */
                if (analyze_one_sb (realdev))
                        return -EINVAL;
                /*
                 * hot_add has to bump up nb_dev itself
                 */
                if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
                        /*
                         * FIXME: here we should free up the inode and stuff
                         */
                        printk ("FIXME\n");
                        return -EINVAL;
                }
        } else
                md_dev[minor].nb_dev++;

        printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
        return (0);
}
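
/*
 * ioctl entry point.  Minors with bit 7 set are not arrays at all: they
 * select a personality slot (minor & 0x7f) and the ioctl is forwarded to
 * that personality's own handler.  Plain minors are MD array devices and
 * are handled below.
 */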
static int md_ioctl (struct inode *inode, struct file *file,
                     unsigned int cmd, unsigned long arg)
{
        int minor, err;
        struct hd_geometry *loc = (struct hd_geometry *) arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
            (minor & 0x7f) < MAX_PERSONALITY &&
            pers[minor & 0x7f] &&
            pers[minor & 0x7f]->ioctl)
                return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));

        if (minor >= MAX_MD_DEV)
                return -EINVAL;

        switch (cmd)
        {
                case REGISTER_DEV:
                        return do_md_add (minor, to_kdev_t ((dev_t) arg));

                case START_MD:
                        return do_md_run (minor, (int) arg);

                case STOP_MD:
                        return do_md_stop (minor, inode);

                case BLKGETSIZE:        /* Return device size */
                        if (!arg)
                                return -EINVAL;
                        err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
                        if (err)
                                return err;
                        break;

                case BLKFLSBUF:
                        fsync_dev (inode->i_rdev);
                        invalidate_buffers (inode->i_rdev);
                        break;

                case BLKRASET:
                        if (arg > 0xff)
                                return -EINVAL;
                        read_ahead[MAJOR(inode->i_rdev)] = arg;
                        return 0;

                case BLKRAGET:
                        if (!arg)
                                return -EINVAL;
                        err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
                        if (err)
                                return err;
                        break;

                /* We have a problem here: there is no easy way to report a
                   sensible CHS virtual geometry, so we pretend to have 2
                   heads and 4 sectors per track (with a BIG number of
                   cylinders...).  This drives dosfs just mad... ;-) */
                case HDIO_GETGEO:
                        if (!loc)
                                return -EINVAL;
                        err = put_user (2, (char *) &loc->heads);
                        if (err)
                                return err;
                        err = put_user (4, (char *) &loc->sectors);
                        if (err)
                                return err;
                        err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
                        if (err)
                                return err;
                        err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
                                        (long *) &loc->start);
                        if (err)
                                return err;
                        break;

                RO_IOCTLS(inode->i_rdev,arg);

                default:
                        printk ("Unknown md_ioctl %d\n", cmd);
                        return -EINVAL;
        }

        return (0);
}

static int md_open (struct inode *inode, struct file *file)
{
        int minor=MINOR(inode->i_rdev);

        md_dev[minor].busy++;
        return (0);                     /* Always succeed */
}

static int md_release (struct inode *inode, struct file *file)
{
        int minor=MINOR(inode->i_rdev);

        sync_dev (inode->i_rdev);
        md_dev[minor].busy--;
        return 0;
}

static ssize_t md_read (struct file *file, char *buf, size_t count,
                        loff_t *ppos)
{
        int minor=MINOR(file->f_dentry->d_inode->i_rdev);

        if (!md_dev[minor].pers)        /* Check if device is being run */
                return -ENXIO;

        return block_read(file, buf, count, ppos);
}

static ssize_t md_write (struct file *file, const char *buf,
                         size_t count, loff_t *ppos)
{
        int minor=MINOR(file->f_dentry->d_inode->i_rdev);

        if (!md_dev[minor].pers)        /* Check if device is being run */
                return -ENXIO;

        return block_write(file, buf, count, ppos);
}

static struct file_operations md_fops=
{
        NULL,                   /* llseek */
        md_read,
        md_write,
        NULL,                   /* readdir */
        NULL,                   /* poll */
        md_ioctl,
        NULL,                   /* mmap */
        md_open,
        NULL,                   /* flush */
        md_release,
        block_fsync
};

int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
{
        if ((unsigned int) minor >= MAX_MD_DEV)
        {
                printk ("Bad md device %d\n", minor);
                return (-1);
        }

        if (!md_dev[minor].pers)
        {
                printk ("Oops ! md%d not running, giving up !\n", minor);
                return (-1);
        }

        return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
}
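
/*
 * Personalities that supply their own make_request() (RAID-1/RAID-5)
 * take over I/O submission here: already-locked buffers, clean writes
 * and up-to-date reads are completed on the spot, everything else goes
 * to the personality.  Without a make_request() hook the buffer is
 * simply passed on to the generic make_request().
 */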
int md_make_request (int minor, int rw, struct buffer_head * bh)
{
        if (md_dev [minor].pers->make_request) {
                if (buffer_locked(bh))
                        return 0;
                set_bit(BH_Lock, &bh->b_state);
                if (rw == WRITE || rw == WRITEA) {
                        if (!buffer_dirty(bh)) {
                                bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
                                return 0;
                        }
                }
                if (rw == READ || rw == READA) {
                        if (buffer_uptodate(bh)) {
                                bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
                                return 0;
                        }
                }
                return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
        } else {
                make_request (MAJOR(bh->b_rdev), rw, bh);
                return 0;
        }
}

static void do_md_request (void)
{
        printk ("Got md request, not good...");
        return;
}

void md_wakeup_thread(struct md_thread *thread)
{
        set_bit(THREAD_WAKEUP, &thread->flags);
        wake_up(&thread->wqueue);
}

struct md_thread *md_register_thread (void (*run) (void *), void *data)
{
        struct md_thread *thread = (struct md_thread *)
                kmalloc(sizeof(struct md_thread), GFP_KERNEL);
        int ret;
        struct semaphore sem = MUTEX_LOCKED;

        if (!thread)
                return NULL;

        memset(thread, 0, sizeof(struct md_thread));
        init_waitqueue(&thread->wqueue);

        thread->sem = &sem;
        thread->run = run;
        thread->data = data;
        ret = kernel_thread(md_thread, thread, 0);
        if (ret < 0) {
                kfree(thread);
                return NULL;
        }
        down(&sem);
        return thread;
}

void md_unregister_thread (struct md_thread *thread)
{
        struct semaphore sem = MUTEX_LOCKED;

        thread->sem = &sem;
        thread->run = NULL;
        if (thread->tsk)
                printk("Killing md_thread %d %p %s\n",
                       thread->tsk->pid, thread->tsk, thread->tsk->comm);
        else
                printk("Aiee. md_thread has 0 tsk\n");
        send_sig(SIGKILL, thread->tsk, 1);
        printk("downing on %p\n", &sem);
        down(&sem);
}
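
/*
 * The thread handshake: md_register_thread() blocks on a locked
 * on-stack semaphore until the new kernel thread has set itself up and
 * called up(thread->sem).  md_unregister_thread() reuses the same
 * field: it clears thread->run, sends SIGKILL, and waits for the
 * thread to up() the semaphore on its way out.
 */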
#define SHUTDOWN_SIGS   (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))

int md_thread(void * arg)
{
        struct md_thread *thread = arg;

        lock_kernel();
        exit_mm(current);
        exit_files(current);
        exit_fs(current);

        current->session = 1;
        current->pgrp = 1;
        sprintf(current->comm, "md_thread");
        siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
        thread->tsk = current;
        up(thread->sem);

        for (;;) {
                cli();
                if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
                        do {
                                spin_lock(&current->sigmask_lock);
                                flush_signals(current);
                                spin_unlock(&current->sigmask_lock);
                                interruptible_sleep_on(&thread->wqueue);
                                cli();
                                if (test_bit(THREAD_WAKEUP, &thread->flags))
                                        break;
                                if (!thread->run) {
                                        sti();
                                        up(thread->sem);
                                        return 0;
                                }
                        } while (signal_pending(current));
                }
                sti();
                clear_bit(THREAD_WAKEUP, &thread->flags);
                if (thread->run) {
                        thread->run(thread->data);
                        run_task_queue(&tq_disk);
                }
        }
}

EXPORT_SYMBOL(md_size);
EXPORT_SYMBOL(md_maxreadahead);
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(partition_name);
EXPORT_SYMBOL(md_dev);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_update_sb);
EXPORT_SYMBOL(md_map);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_do_sync);

#ifdef CONFIG_PROC_FS
static struct proc_dir_entry proc_md = {
        PROC_MD, 6, "mdstat",
        S_IFREG | S_IRUGO, 1, 0, 0,
        0, &proc_array_inode_operations,
};
#endif

static void md_geninit (struct gendisk *gdisk)
{
        int i;

        for(i=0;i<MAX_MD_DEV;i++)
        {
                md_blocksizes[i] = 1024;
                md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
                md_gendisk.part[i].start_sect=-1;       /* avoid partition check */
                md_gendisk.part[i].nr_sects=0;
                md_dev[i].pers=NULL;
        }

        blksize_size[MD_MAJOR] = md_blocksizes;
        max_readahead[MD_MAJOR] = md_maxreadahead;

#ifdef CONFIG_PROC_FS
        proc_register(&proc_root, &proc_md);
#endif
}

int md_error (kdev_t mddev, kdev_t rdev)
{
        unsigned int minor = MINOR (mddev);
        int rc;

        if (MAJOR(mddev) != MD_MAJOR || minor >= MAX_MD_DEV)
                panic ("md_error gets unknown device\n");
        if (!md_dev [minor].pers)
                panic ("md_error gets an error for an unknown device\n");
        if (md_dev [minor].pers->error_handler) {
                rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
#if SUPPORT_RECONSTRUCTION
                md_wakeup_thread(md_sync_thread);
#endif /* SUPPORT_RECONSTRUCTION */
                return rc;
        }
        return 0;
}
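
/*
 * /proc/mdstat: one line of registered personalities, the current
 * read_ahead setting, then a line per md device with its personality,
 * members and size in blocks.
 */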
int get_md_status (char *page)
{
        int sz=0, i, j, size;

        sz+=sprintf( page+sz, "Personalities : ");
        for (i=0; i<MAX_PERSONALITY; i++)
                if (pers[i])
                        sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);

        page[sz-1]='\n';

        sz+=sprintf (page+sz, "read_ahead ");
        if (read_ahead[MD_MAJOR]==INT_MAX)
                sz+=sprintf (page+sz, "not set\n");
        else
                sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);

        for (i=0; i<MAX_MD_DEV; i++)
        {
                sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");

                if (md_dev[i].pers)
                        sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);

                size=0;
                for (j=0; j<md_dev[i].nb_dev; j++)
                {
                        sz+=sprintf (page+sz, " %s",
                                     partition_name(md_dev[i].devices[j].dev));
                        size+=md_dev[i].devices[j].size;
                }

                if (md_dev[i].nb_dev) {
                        if (md_dev[i].pers)
                                sz+=sprintf (page+sz, " %d blocks", md_size[i]);
                        else
                                sz+=sprintf (page+sz, " %d blocks", size);
                }

                if (!md_dev[i].pers)
                {
                        sz+=sprintf (page+sz, "\n");
                        continue;
                }

                if (md_dev[i].pers->max_invalid_dev)
                        sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));

                sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
                sz+=sprintf (page+sz, "\n");
        }

        return (sz);
}

int register_md_personality (int p_num, struct md_personality *p)
{
        int i=(p_num >> PERSONALITY_SHIFT);

        if (i >= MAX_PERSONALITY)
                return -EINVAL;

        if (pers[i])
                return -EBUSY;

        pers[i]=p;
        printk ("%s personality registered\n", p->name);
        return 0;
}

int unregister_md_personality (int p_num)
{
        int i=(p_num >> PERSONALITY_SHIFT);

        if (i >= MAX_PERSONALITY)
                return -EINVAL;

        printk ("%s personality unregistered\n", pers[i]->name);
        pers[i]=NULL;
        return 0;
}
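
/*
 * A spare is any member whose descriptor is neither faulty nor active;
 * return the first such descriptor, or NULL when the array has no
 * usable spare.
 */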
static md_descriptor_t *get_spare(struct md_dev *mddev)
{
        int i;
        md_superblock_t *sb = mddev->sb;
        md_descriptor_t *descriptor;
        struct real_dev *realdev;

        for (i = 0; i < mddev->nb_dev; i++) {
                realdev = &mddev->devices[i];
                if (!realdev->sb)
                        continue;
                descriptor = &sb->disks[realdev->sb->descriptor.number];
                if (descriptor->state & (1 << MD_FAULTY_DEVICE))
                        continue;
                if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
                        continue;
                return descriptor;
        }
        return NULL;
}

/*
 * parallel resyncing thread.
 *
 * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
 *        - fix read error handling
 */
int md_do_sync(struct md_dev *mddev)
{
        struct buffer_head *bh;
        int max_blocks, blocksize, curr_bsize, percent=1, j;
        kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
        int major = MAJOR(read_disk), minor = MINOR(read_disk);
        unsigned long starttime;

        blocksize = blksize_size[major][minor];
        max_blocks = blk_size[major][minor] / (blocksize >> 10);

        printk("... resync log\n");
        printk(" ....   mddev->nb_dev: %d\n", mddev->nb_dev);
        printk(" ....   raid array: %s\n", kdevname(read_disk));
        printk(" ....   max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
        printk("md: syncing RAID array %s\n", kdevname(read_disk));

        mddev->busy++;

        starttime=jiffies;
        for (j = 0; j < max_blocks; j++) {

                /*
                 * Be careful. When someone mounts a non-'blocksize'
                 * filesystem then we get the blocksize changed right under
                 * us. Go deal with it transparently, recalculate
                 * 'blocksize', 'j' and 'max_blocks':
                 */
                curr_bsize = blksize_size[major][minor];
                if (curr_bsize != blocksize) {
diff_blocksize:
                        if (curr_bsize > blocksize)
                                /*
                                 * this is safe, rounds downwards.
                                 */
                                j /= curr_bsize/blocksize;
                        else
                                j *= blocksize/curr_bsize;

                        blocksize = curr_bsize;
                        max_blocks = blk_size[major][minor] / (blocksize >> 10);
                }
                if ((bh = breada (read_disk, j, blocksize, j * blocksize,
                                        max_blocks * blocksize)) != NULL) {
                        mark_buffer_dirty(bh, 1);
                        brelse(bh);
                } else {
                        /*
                         * FIXME: Ugly, but set_blocksize() isn't safe ...
                         */
                        curr_bsize = blksize_size[major][minor];
                        if (curr_bsize != blocksize)
                                goto diff_blocksize;

                        /*
                         * It's a real read problem. FIXME, handle this
                         * a better way.
                         */
                        printk ( KERN_ALERT
                                 "read error, stopping reconstruction.\n");
                        mddev->busy--;
                        return 1;
                }

                /*
                 * Let's sleep some if we are faster than our speed limit:
                 */
                while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
                {
                        current->state = TASK_INTERRUPTIBLE;
                        current->timeout = jiffies+1;
                        schedule();
                }

                /*
                 * FIXME: put this status bar thing into /proc
                 */
                if (!(j%(max_blocks/100))) {
                        if (!(percent%10))
                                printk (" %03d%% done.\n",percent);
                        else
                                printk (".");
                        percent++;
                }
        }
        fsync_dev(read_disk);
        printk("md: %s: sync done.\n", kdevname(read_disk));
        mddev->busy--;
        return 0;
}

/*
 * This is a kernel thread which syncs a spare disk with the active array.
 *
 * the amount of foolproofing might seem to be a tad excessive, but an
 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
 * of my root partition with the first 0.5 gigs of my /home partition ... so
 * i'm a bit nervous ;)
 */
void mdsyncd (void *data)
{
        int i;
        struct md_dev *mddev;
        md_superblock_t *sb;
        md_descriptor_t *spare;
        unsigned long flags;

        for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
                if ((sb = mddev->sb) == NULL)
                        continue;
                if (sb->active_disks == sb->raid_disks)
                        continue;
                if (!sb->spare_disks)
                        continue;
                if ((spare = get_spare(mddev)) == NULL)
                        continue;
                if (!mddev->pers->mark_spare)
                        continue;
                if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
                        continue;
                if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
                        mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
                        continue;
                }
                save_flags(flags);
                cli();
                mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
                spare->state |= (1 << MD_SYNC_DEVICE);
                spare->state |= (1 << MD_ACTIVE_DEVICE);
                sb->spare_disks--;
                sb->active_disks++;
                mddev->sb_dirty = 1;
                md_update_sb(mddev - md_dev);
                restore_flags(flags);
        }
}

#ifdef CONFIG_MD_BOOT
struct {
        int set;
        int ints[100];
        char str[100];
} md_setup_args __initdata = {
        0,{0},{0}
};

/* called from init/main.c */
__initfunc(void md_setup(char *str,int *ints))
{
        int i;
        for(i=0;i<=ints[0];i++) {
                md_setup_args.ints[i] = ints[i];
                strcpy(md_setup_args.str, str);
                /* printk ("md: ints[%d]=%d.\n", i, ints[i]);*/
        }
        md_setup_args.set=1;
        return;
}

__initfunc(void do_md_setup(char *str,int *ints))
{
        int minor, pers, factor, fault;
        kdev_t dev;
        int i=1;

        if(ints[0] < 4) {
                printk ("md: Too few arguments (%d).\n", ints[0]);
                return;
        }

        minor=ints[i++];

        if (minor >= MAX_MD_DEV) {
                printk ("md: Minor device number too high.\n");
                return;
        }

        pers = 0;

        switch(ints[i++]) {  /* Raidlevel */
        case -1:
#ifdef CONFIG_MD_LINEAR
                pers = LINEAR;
                printk ("md: Setting up md%d as linear device.\n",minor);
#else
                printk ("md: Linear mode not configured. "
                        "Recompile the kernel with linear mode enabled!\n");
#endif
                break;
        case 0:
                pers = STRIPED;
#ifdef CONFIG_MD_STRIPED
                printk ("md: Setting up md%d as a striped device.\n",minor);
#else
                printk ("md: Striped mode not configured. "
                        "Recompile the kernel with striped mode enabled!\n");
#endif
                break;
        /* not supported yet
        case 1:
                pers = RAID1;
                printk ("md: Setting up md%d as a raid1 device.\n",minor);
                break;
        case 5:
                pers = RAID5;
                printk ("md: Setting up md%d as a raid5 device.\n",minor);
                break;
        */
        default:
                printk ("md: Unknown or unsupported raid level %d.\n", ints[--i]);
                return;
        }

        if(pers) {

                factor=ints[i++];       /* Chunksize */
                fault =ints[i++];       /* Faultlevel */

                pers=pers | factor | (fault << FAULT_SHIFT);

                while( str && (dev = name_to_kdev_t(str))) {
                        do_md_add (minor, dev);
                        if((str = strchr (str, ',')) != NULL)
                                str++;
                }

                do_md_run (minor, pers);
                printk ("md: Loading md%d.\n",minor);
        }
}
#endif

void linear_init (void);
void raid0_init (void);
void raid1_init (void);
void raid5_init (void);

__initfunc(int md_init (void))
{
        printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
                MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
                MAX_MD_DEV, MAX_REAL);

        if (register_blkdev (MD_MAJOR, "md", &md_fops))
        {
                printk ("Unable to get major %d for md\n", MD_MAJOR);
                return (-1);
        }

        blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
        blk_dev[MD_MAJOR].current_request=NULL;
        read_ahead[MD_MAJOR]=INT_MAX;
        memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
        md_gendisk.next=gendisk_head;

        gendisk_head=&md_gendisk;

#if SUPPORT_RECONSTRUCTION
        if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
                printk("md: bug: md_sync_thread == NULL\n");
#endif /* SUPPORT_RECONSTRUCTION */

#ifdef CONFIG_MD_LINEAR
        linear_init ();
#endif
#ifdef CONFIG_MD_STRIPED
        raid0_init ();
#endif
#ifdef CONFIG_MD_MIRRORING
        raid1_init ();
#endif
#ifdef CONFIG_MD_RAID5
        raid5_init ();
#endif
        return (0);
}

#ifdef CONFIG_MD_BOOT
__initfunc(void md_setup_drive(void))
{
        if(md_setup_args.set)
                do_md_setup(md_setup_args.str, md_setup_args.ints);
}
#endif