/*
 * md.c : Multiple Devices driver for Linux
 *	  Copyright (C) 1994-96 Marc ZYNGIER
 *	  <zyngier@ufr-info-p7.ibp.fr> or
 *	  <maz@gloups.fdn.fr>
 *
 * A lot of inspiration came from hd.c ...
 *
 * kerneld support by Boris Tobotras <boris@xtalk.msk.su>
 *
 * RAID-1/RAID-5 extensions by:
 *	Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*
 * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
 * the extra system load does not show up that much. Increase it if your
 * system can take more.
 */
#define SPEED_LIMIT 1024
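
/*
 * For reference (worked example, not original code): md_do_sync() below
 * enforces this limit with
 *
 *	blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT
 *
 * which is just "KB resynced so far / seconds elapsed". E.g. with
 * blocksize == 1024, HZ == 100 and j == 200 blocks done after 10 jiffies,
 * the average is roughly 1024*200/11*100/1024 ~= 1800 KB/sec, so the
 * resync loop sleeps a jiffy at a time until it drops below 1024 KB/sec.
 */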
#include <linux/config.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/malloc.h>
#include <linux/mm.h>
#include <linux/md.h>
#include <linux/hdreg.h>
#include <linux/stat.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/smp_lock.h>
#ifdef CONFIG_KERNELD
#include <linux/kerneld.h>
#endif
#include <linux/errno.h>
#include <linux/init.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>
#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <asm/atomic.h>
static struct hd_struct md_hd_struct[MAX_MD_DEV];
static int md_blocksizes[MAX_MD_DEV];
int md_maxreadahead[MAX_MD_DEV];
static struct md_thread md_threads[MAX_MD_THREADS];
#if SUPPORT_RECONSTRUCTION
static struct md_thread *md_sync_thread = NULL;
#endif /* SUPPORT_RECONSTRUCTION */

int md_size[MAX_MD_DEV]={0, };

static void md_geninit (struct gendisk *);

static struct gendisk md_gendisk=
{
	MD_MAJOR,	/* major number */
	"md",		/* major name */
	0,		/* minor shift: md minors are not partitioned */
	1,		/* one minor per md device */
	MAX_MD_DEV,	/* maximum number of minors */
	md_geninit,	/* init function */
	md_hd_struct,	/* hd_struct array */
	md_size,	/* device sizes */
	MAX_MD_DEV,	/* number of real devices */
	NULL,
	NULL		/* next gendisk */
};

static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
struct md_dev md_dev[MAX_MD_DEV];
static struct gendisk *find_gendisk (kdev_t dev)
{
	struct gendisk *tmp=gendisk_head;

	while (tmp != NULL)
	{
		if (tmp->major==MAJOR(dev))
			return (tmp);
		tmp=tmp->next;
	}
	return (NULL);
}
char *partition_name (kdev_t dev)
{
	static char name[40];	/* This should be long
				   enough for a device name! */
	struct gendisk *hd = find_gendisk (dev);

	if (!hd)
	{
		sprintf (name, "[dev %s]", kdevname(dev));
		return (name);
	}
	return disk_name (hd, MINOR(dev), name);	/* routine in genhd.c */
}
static int legacy_raid_sb (int minor, int pnum)
{
	int i, factor;

	factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

	/*
	 * do size and offset calculations.
	 */
	for (i=0; i<md_dev[minor].nb_dev; i++) {
		md_dev[minor].devices[i].size &= ~(factor - 1);
		md_size[minor] += md_dev[minor].devices[i].size;
		md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
						md_dev[minor].devices[i-1].size) : 0;
	}
	if (pnum == RAID0 >> PERSONALITY_SHIFT)
		md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
	return 0;
}
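
/*
 * Worked example (illustration only): with a chunk factor of 32, i.e. a
 * 32 KB chunk, "size &= ~(factor - 1)" rounds every member device down
 * to a chunk multiple, so a 100030 KB partition contributes 100000 KB.
 * The offsets then stack the rounded devices back to back, which is
 * exactly the address space layout linear and RAID-0 expect.
 */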
static void free_sb (struct md_dev *mddev)
{
	int i;
	struct real_dev *realdev;

	if (mddev->sb) {
		free_page((unsigned long) mddev->sb);
		mddev->sb = NULL;
	}
	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = mddev->devices + i;
		if (realdev->sb) {
			free_page((unsigned long) realdev->sb);
			realdev->sb = NULL;
		}
	}
}
/*
 * Check one RAID superblock for generic plausibility
 */

#define BAD_MAGIC KERN_ERR \
"md: %s: invalid raid superblock magic (%x) on block %u\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define SUCCESS 0
#define FAILURE -1
static int analyze_one_sb (struct real_dev * rdev)
{
	int ret = FAILURE;
	struct buffer_head *bh;
	kdev_t dev = rdev->dev;
	md_superblock_t *sb;

	/*
	 * Read the superblock, it's at the end of the disk
	 */
	rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
	set_blocksize (dev, MD_SB_BYTES);
	bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);

	if (bh) {
		sb = (md_superblock_t *) bh->b_data;
		if (sb->md_magic != MD_SB_MAGIC) {
			printk (BAD_MAGIC, kdevname(dev),
				sb->md_magic, rdev->sb_offset);
			goto abort;
		}
		rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
		if (!rdev->sb) {
			printk (OUT_OF_MEM);
			goto abort;
		}
		memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);

		rdev->size = sb->size;
	} else
		printk (NO_DEVICE,kdevname(rdev->dev));
	/* a missing device is not fatal here: the array may still run degraded */
	ret = SUCCESS;
abort:
	if (bh)
		brelse (bh);
	return ret;
}

#undef SUCCESS
#undef FAILURE

#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_DEVICE
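
/*
 * Superblock placement sketch (for reference; the real macro lives in
 * md.h): the superblock sits in the last 64 KB-aligned chunk of the
 * device, so with the usual MD_RESERVED_BYTES of 64 KB,
 *
 *	rdev->sb_offset = (device_size_in_kb & ~63) - 64;
 *
 * is what MD_NEW_SIZE_BLOCKS() boils down to, and bread() above then
 * fetches that block using a MD_SB_BYTES blocksize.
 */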
/*
 * Check a full RAID array for plausibility
 */

#define INCONSISTENT KERN_ERR \
"md: superblock inconsistency -- run ckraid\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: %s: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN KERN_ERR \
"md: %s: raid array is not clean -- run ckraid\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: %s: raid array is not clean -- reconstructing parity\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: %s: unsupported raid level %d\n"
static int analyze_sbs (int minor, int pnum)
{
	struct md_dev *mddev = md_dev + minor;
	int i, N = mddev->nb_dev, out_of_date = 0;
	struct real_dev * disks = mddev->devices;
	md_superblock_t *sb, *freshest = NULL;

	/*
	 * RAID-0 and linear don't use a RAID superblock
	 */
	if (pnum == RAID0 >> PERSONALITY_SHIFT ||
			pnum == LINEAR >> PERSONALITY_SHIFT)
		return legacy_raid_sb (minor, pnum);

	/*
	 * Verify the RAID superblock on each real device
	 */
	for (i = 0; i < N; i++)
		if (analyze_one_sb(disks+i))
			goto abort;

	/*
	 * The superblock constant part has to be the same
	 * for all disks in the array.
	 */
	sb = NULL;
	for (i = 0; i < N; i++) {
		if (!disks[i].sb)
			continue;
		if (!sb) {
			sb = disks[i].sb;
			continue;
		}
		if (memcmp(sb,
			   disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
			printk (INCONSISTENT);
			goto abort;
		}
	}

	/*
	 * OK, we have all the disks and the array is ready to run. Let's
	 * find the freshest superblock, that one will be the superblock
	 * that represents the whole array.
	 */
	if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
		goto abort;
	freshest = NULL;
	for (i = 0; i < N; i++) {
		if (!disks[i].sb)
			continue;
		if (!freshest) {
			freshest = disks[i].sb;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		if (disks[i].sb->utime != freshest->utime) {
			out_of_date = 1;
			if (disks[i].sb->utime > freshest->utime)
				freshest = disks[i].sb;
		}
	}
	if (out_of_date)
		printk(OUT_OF_DATE);
	memcpy (sb, freshest, sizeof(*freshest));

	/*
	 * Check if we can support this RAID array
	 */
	if (sb->major_version != MD_MAJOR_VERSION ||
			sb->minor_version > MD_MINOR_VERSION) {
		printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
			sb->major_version, sb->minor_version,
			sb->patch_version);
		goto abort;
	}

	/*
	 * We need to add this as a superblock option.
	 */
#if SUPPORT_RECONSTRUCTION
	if (sb->state != (1 << MD_SB_CLEAN)) {
		if (sb->level == 1) {
			printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
			goto abort;
		} else
			printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
	}
#else
	if (sb->state != (1 << MD_SB_CLEAN)) {
		printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}
#endif /* SUPPORT_RECONSTRUCTION */

	switch (sb->level) {
		case 1:
			md_size[minor] = sb->size;
			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
			break;
		case 4:
		case 5:
			md_size[minor] = sb->size * (sb->raid_disks - 1);
			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
			break;
		default:
			printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
				sb->level);
			goto abort;
	}
	return 0;
abort:
	free_sb(mddev);
	return 1;
}
#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef NOT_CLEAN
#undef NOT_CLEAN_IGNORE
#undef UNKNOWN_LEVEL
int md_update_sb(int minor)
{
	struct md_dev *mddev = md_dev + minor;
	struct buffer_head *bh;
	md_superblock_t *sb = mddev->sb;
	struct real_dev *realdev;
	kdev_t dev;
	int i;
	u32 sb_offset;

	sb->utime = CURRENT_TIME;
	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = mddev->devices + i;
		if (!realdev->sb)
			continue;
		dev = realdev->dev;
		sb_offset = realdev->sb_offset;
		set_blocksize(dev, MD_SB_BYTES);
		printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
		bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
		if (bh) {
			sb = (md_superblock_t *) bh->b_data;
			memcpy(sb, mddev->sb, MD_SB_BYTES);
			memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
			mark_buffer_uptodate(bh, 1);
			mark_buffer_dirty(bh, 1);
			ll_rw_block(WRITE, 1, &bh);
			wait_on_buffer(bh);
			bforget(bh);
			fsync_dev(dev);
			invalidate_buffers(dev);
		} else
			printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
	}
	return 0;
}
static int do_md_run (int minor, int repart)
{
	int pnum, i, min, factor, err;

	if (!md_dev[minor].nb_dev)
		return -EINVAL;

	if (md_dev[minor].pers)
		return -EBUSY;

	md_dev[minor].repartition=repart;

	if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
			>= MAX_PERSONALITY)
		return -EINVAL;

	/* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
	if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
		for (i = 0; i < md_dev[minor].nb_dev; i++)
			if (MAJOR (md_dev[minor].devices[i].dev) == MD_MAJOR)
				return -EINVAL;
	}
	if (!pers[pnum])
	{
#ifdef CONFIG_KERNELD
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
			return -EINVAL;
	}

	factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

	for (i=0; i<md_dev[minor].nb_dev; i++)
		if (md_dev[minor].devices[i].size<min)
		{
			printk ("Dev %s smaller than %dk, cannot shrink\n",
				partition_name (md_dev[minor].devices[i].dev), min);
			return -EINVAL;
		}

	for (i=0; i<md_dev[minor].nb_dev; i++) {
		fsync_dev(md_dev[minor].devices[i].dev);
		invalidate_buffers(md_dev[minor].devices[i].dev);
	}

	/* Resize devices according to the factor. It is used to align
	   partition sizes on a given chunk size. */
	md_size[minor]=0;

	/*
	 * Analyze the raid superblock
	 */
	if (analyze_sbs(minor, pnum))
		return -EINVAL;

	md_dev[minor].pers=pers[pnum];

	if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
	{
		md_dev[minor].pers=NULL;
		free_sb(md_dev + minor);
		return (err);
	}

	if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
	{
		md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
		md_update_sb(minor);
	}

	/* FIXME : We assume here we have blocks
	   that are twice as large as sectors.
	   THIS MAY NOT BE TRUE !!! */
	md_hd_struct[minor].start_sect=0;
	md_hd_struct[minor].nr_sects=md_size[minor]<<1;

	read_ahead[MD_MAJOR] = 128;
	return (0);
}
static int do_md_stop (int minor, struct inode *inode)
{
	int i;

	if (inode->i_count>1 || md_dev[minor].busy>1) {
		/*
		 * ioctl : one open channel
		 */
		printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
			minor, inode->i_count, md_dev[minor].busy);
		return -EBUSY;
	}

	if (md_dev[minor].pers) {
		/*
		 * It is safe to call stop here, it only frees private
		 * data. Also, it tells us if a device is unstoppable
		 * (eg. resyncing is in progress)
		 */
		if (md_dev[minor].pers->stop (minor, md_dev+minor))
			return -EBUSY;
		/*
		 * The device won't exist anymore -> flush it now
		 */
		fsync_dev (inode->i_rdev);
		invalidate_buffers (inode->i_rdev);
		if (md_dev[minor].sb) {
			md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
			md_update_sb(minor);
		}
	}

	/* Remove locks. */
	if (md_dev[minor].sb)
		free_sb(md_dev + minor);
	for (i=0; i<md_dev[minor].nb_dev; i++)
		clear_inode (md_dev[minor].devices[i].inode);

	md_dev[minor].nb_dev=md_size[minor]=0;
	md_hd_struct[minor].nr_sects=0;
	md_dev[minor].pers=NULL;

	read_ahead[MD_MAJOR] = 128;

	return (0);
}
static int do_md_add (int minor, kdev_t dev)
{
	int i;
	int hot_add=0;
	struct real_dev *realdev;

	if (md_dev[minor].nb_dev==MAX_REAL)
		return -EINVAL;

	if (!fs_may_mount (dev))
		return -EBUSY;

	if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
		printk("md_add(): zero device size, huh, bailing out.\n");
		return -EINVAL;
	}

	if (md_dev[minor].pers) {
		/*
		 * The array is already running, hot-add the drive, or
		 * bail out:
		 */
		if (!md_dev[minor].pers->hot_add_disk)
			return -EBUSY;
		else
			hot_add=1;
	}

	/*
	 * Careful. We cannot increase nb_dev for a running array.
	 */
	i=md_dev[minor].nb_dev;
	realdev = &md_dev[minor].devices[i];
	realdev->dev=dev;

	/* Lock the device by inserting a dummy inode. This doesn't
	   smell very good, but I need to be consistent with the
	   mount stuff, especially with fs_may_mount. If someone has
	   a better idea, please help! */

	realdev->inode=get_empty_inode ();
	realdev->inode->i_dev=dev;	/* don't care about other fields */
	insert_inode_hash (realdev->inode);

	/* Sizes are now rounded at run time */

/*	md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/

	realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];

	if (hot_add) {
		/*
		 * Check the superblock for consistency.
		 * The personality itself has to check whether it's getting
		 * added with the proper flags ... also, the personality has
		 * to be checked too. ;)
		 */
		if (analyze_one_sb (realdev))
			return -EINVAL;
		/*
		 * hot_add has to bump up nb_dev itself
		 */
		if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
			/*
			 * FIXME: here we should free up the inode and stuff
			 */
			printk ("FIXME\n");
			return -EINVAL;
		}
	} else
		md_dev[minor].nb_dev++;

	printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
	return (0);
}
static int md_ioctl (struct inode *inode, struct file *file,
		     unsigned int cmd, unsigned long arg)
{
	int minor, err;
	struct hd_geometry *loc = (struct hd_geometry *) arg;

	if (!suser())
		return -EACCES;

	if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
	    (minor & 0x7f) < MAX_PERSONALITY &&
	    pers[minor & 0x7f] &&
	    pers[minor & 0x7f]->ioctl)
		return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));

	if (minor >= MAX_MD_DEV)
		return -EINVAL;

	switch (cmd)
	{
		case REGISTER_DEV:
			return do_md_add (minor, to_kdev_t ((dev_t) arg));

		case START_MD:
			return do_md_run (minor, (int) arg);

		case STOP_MD:
			return do_md_stop (minor, inode);

		case BLKGETSIZE:	/* Return device size */
			if (!arg)
				return -EINVAL;
			err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
			if (err)
				return err;
			break;

		case BLKFLSBUF:
			fsync_dev (inode->i_rdev);
			invalidate_buffers (inode->i_rdev);
			break;

		case BLKRASET:
			if (arg > 0xff)
				return -EINVAL;
			read_ahead[MAJOR(inode->i_rdev)] = arg;
			return 0;

		case BLKRAGET:
			if (!arg)
				return -EINVAL;
			err = put_user (read_ahead[MAJOR(inode->i_rdev)], (long *) arg);
			if (err)
				return err;
			break;

		/* We have a problem here: there is no easy way to give a CHS
		   virtual geometry. We currently pretend that we have a
		   2-head, 4-sector geometry (with a BIG number of
		   cylinders...). This drives dosfs just mad... ;-) */

		case HDIO_GETGEO:
			if (!loc)
				return -EINVAL;
			err = put_user (2, (char *) &loc->heads);
			if (err)
				return err;
			err = put_user (4, (char *) &loc->sectors);
			if (err)
				return err;
			err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
			if (err)
				return err;
			err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
					(long *) &loc->start);
			if (err)
				return err;
			break;

		RO_IOCTLS(inode->i_rdev,arg);

		default:
			printk ("Unknown md_ioctl %d\n", cmd);
			return -EINVAL;
	}

	return (0);
}
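
/*
 * Userspace usage sketch (illustration only, not part of the driver):
 * a raid tool builds an array by registering each component device and
 * then starting the personality, roughly:
 *
 *	int fd = open("/dev/md0", O_RDONLY);
 *	ioctl(fd, REGISTER_DEV, dev1);	// dev1/dev2: dev_t numbers of
 *	ioctl(fd, REGISTER_DEV, dev2);	// the component partitions
 *	ioctl(fd, START_MD, arg);	// arg packs personality + factor
 *
 * and STOP_MD tears it back down once nothing else holds the device open.
 */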
static int md_open (struct inode *inode, struct file *file)
{
	int minor=MINOR(inode->i_rdev);

	md_dev[minor].busy++;
	return (0);			/* Always succeed */
}

static int md_release (struct inode *inode, struct file *file)
{
	int minor=MINOR(inode->i_rdev);

	sync_dev (inode->i_rdev);
	md_dev[minor].busy--;
	return 0;
}

static ssize_t md_read (struct file *file, char *buf, size_t count,
			loff_t *ppos)
{
	int minor=MINOR(file->f_dentry->d_inode->i_rdev);

	if (!md_dev[minor].pers)	/* Check if device is being run */
		return -ENXIO;

	return block_read(file, buf, count, ppos);
}

static ssize_t md_write (struct file *file, const char *buf,
			 size_t count, loff_t *ppos)
{
	int minor=MINOR(file->f_dentry->d_inode->i_rdev);

	if (!md_dev[minor].pers)	/* Check if device is being run */
		return -ENXIO;

	return block_write(file, buf, count, ppos);
}
static struct file_operations md_fops=
{
	NULL,			/* llseek */
	md_read,		/* read */
	md_write,		/* write */
	NULL,			/* readdir */
	NULL,			/* poll */
	md_ioctl,		/* ioctl */
	NULL,			/* mmap */
	md_open,		/* open */
	md_release,		/* release */
	block_fsync		/* fsync */
};
int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
{
	if ((unsigned int) minor >= MAX_MD_DEV)
	{
		printk ("Bad md device %d\n", minor);
		return (-1);
	}

	if (!md_dev[minor].pers)
	{
		printk ("Oops ! md%d not running, giving up !\n", minor);
		return (-1);
	}

	return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
}
int md_make_request (int minor, int rw, struct buffer_head * bh)
{
	if (md_dev[minor].pers->make_request) {
		if (buffer_locked(bh))
			return 0;
		set_bit(BH_Lock, &bh->b_state);
		if (rw == WRITE || rw == WRITEA) {
			if (!buffer_dirty(bh)) {
				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
				return 0;
			}
		}
		if (rw == READ || rw == READA) {
			if (buffer_uptodate(bh)) {
				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
				return 0;
			}
		}
		return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
	} else {
		make_request (MAJOR(bh->b_rdev), rw, bh);
		return 0;
	}
}
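
/*
 * Note on the fast paths above (explanatory comment, not original): a
 * WRITE of a buffer that is not dirty and a READ of a buffer that is
 * already up to date are both no-ops, so md completes them immediately
 * via b_end_io() instead of handing them to the personality's
 * make_request() hook.
 */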
static void do_md_request (void)
{
	printk ("Got md request, not good...");
	return;
}
/*
 * We run MAX_MD_THREADS from md_init() and arbitrate between them at run
 * time. This is not so elegant, but how can we use kernel_thread() from
 * within loadable modules?
 */
struct md_thread *md_register_thread (void (*run) (void *), void *data)
{
	int i;

	for (i = 0; i < MAX_MD_THREADS; i++) {
		if (md_threads[i].run == NULL) {
			md_threads[i].run = run;
			md_threads[i].data = data;
			return md_threads + i;
		}
	}
	return NULL;
}
void md_unregister_thread (struct md_thread *thread)
{
	thread->run = NULL;
	thread->data = NULL;
	thread->flags = 0;
}

void md_wakeup_thread(struct md_thread *thread)
{
	set_bit(THREAD_WAKEUP, &thread->flags);
	wake_up(&thread->wqueue);
}
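
/*
 * Typical use of the thread pool (sketch; "myd" is a hypothetical
 * worker function, not defined in this file):
 *
 *	struct md_thread *t = md_register_thread(myd, mddev);
 *	...
 *	md_wakeup_thread(t);		// run myd(mddev) in thread context
 *	...
 *	md_unregister_thread(t);	// free the slot on shutdown
 *
 * md_thread() below never exits, so unregistering merely clears the
 * slot for reuse; the kernel thread itself keeps sleeping.
 */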
EXPORT_SYMBOL(md_size);
EXPORT_SYMBOL(md_maxreadahead);
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(partition_name);
EXPORT_SYMBOL(md_dev);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_update_sb);
EXPORT_SYMBOL(md_map);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_do_sync);
static struct proc_dir_entry proc_md = {
	PROC_MD, 6, "mdstat",
	S_IFREG | S_IRUGO, 1, 0, 0,
	0, &proc_array_inode_operations,
};
static void md_geninit (struct gendisk *gdisk)
{
	int i;

	for(i=0;i<MAX_MD_DEV;i++)
	{
		md_blocksizes[i] = 1024;
		md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
		md_gendisk.part[i].start_sect=-1;	/* avoid partition check */
		md_gendisk.part[i].nr_sects=0;
		md_dev[i].pers=NULL;
	}

	blksize_size[MD_MAJOR] = md_blocksizes;
	max_readahead[MD_MAJOR] = md_maxreadahead;

	proc_register(&proc_root, &proc_md);
}
int md_error (kdev_t mddev, kdev_t rdev)
{
	unsigned int minor = MINOR (mddev);
	int rc;

	/* valid minors are 0..MAX_MD_DEV-1, hence >= */
	if (MAJOR(mddev) != MD_MAJOR || minor >= MAX_MD_DEV)
		panic ("md_error gets unknown device\n");
	if (!md_dev[minor].pers)
		panic ("md_error gets an error for an unknown device\n");
	if (md_dev[minor].pers->error_handler) {
		rc = md_dev[minor].pers->error_handler (md_dev+minor, rdev);
#if SUPPORT_RECONSTRUCTION
		md_wakeup_thread(md_sync_thread);
#endif /* SUPPORT_RECONSTRUCTION */
		return rc;
	}
	return 0;
}
int get_md_status (char *page)
{
	int sz=0, i, j, size;

	sz+=sprintf( page+sz, "Personalities : ");
	for (i=0; i<MAX_PERSONALITY; i++)
		if (pers[i])
			sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);

	page[sz-1]='\n';

	sz+=sprintf (page+sz, "read_ahead ");
	if (read_ahead[MD_MAJOR]==INT_MAX)
		sz+=sprintf (page+sz, "not set\n");
	else
		sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);

	for (i=0; i<MAX_MD_DEV; i++)
	{
		sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");

		if (md_dev[i].pers)
			sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);

		size=0;
		for (j=0; j<md_dev[i].nb_dev; j++)
		{
			sz+=sprintf (page+sz, " %s",
				     partition_name(md_dev[i].devices[j].dev));
			size+=md_dev[i].devices[j].size;
		}

		if (md_dev[i].nb_dev) {
			if (md_dev[i].pers)
				sz+=sprintf (page+sz, " %d blocks", md_size[i]);
			else
				sz+=sprintf (page+sz, " %d blocks", size);
		}

		if (!md_dev[i].pers)
		{
			sz+=sprintf (page+sz, "\n");
			continue;
		}

		if (md_dev[i].pers->max_invalid_dev)
			sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));

		sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
		sz+=sprintf (page+sz, "\n");
	}

	return (sz);
}
int register_md_personality (int p_num, struct md_personality *p)
{
	int i=(p_num >> PERSONALITY_SHIFT);

	if (i >= MAX_PERSONALITY)
		return -EINVAL;

	if (pers[i])
		return -EBUSY;

	pers[i]=p;
	printk ("%s personality registered\n", p->name);
	return 0;
}
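
/*
 * For example (sketch): raid0.c announces itself at init time with
 * something like
 *
 *	register_md_personality (RAID0, &raid0_personality);
 *
 * where the personality number is carried in the high bits of RAID0,
 * hence the PERSONALITY_SHIFT above.
 */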
int unregister_md_personality (int p_num)
{
	int i=(p_num >> PERSONALITY_SHIFT);

	if (i >= MAX_PERSONALITY || !pers[i])	/* nothing registered here */
		return -EINVAL;

	printk ("%s personality unregistered\n", pers[i]->name);
	pers[i]=NULL;
	return 0;
}
int md_thread(void * arg)
{
	struct md_thread *thread = arg;

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "md_thread");

	lock_kernel();
	for (;;) {
		sti();
		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (thread->run) {
			thread->run(thread->data);
			run_task_queue(&tq_disk);
		}
		cli();
		if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
			do {
				spin_lock_irq(&current->sigmask_lock);
				flush_signals(current);
				spin_unlock_irq(&current->sigmask_lock);
				interruptible_sleep_on(&thread->wqueue);
			} while (signal_pending(current));
		}
	}
}
static md_descriptor_t *get_spare(struct md_dev *mddev)
{
	int i;
	md_superblock_t *sb = mddev->sb;
	md_descriptor_t *descriptor;
	struct real_dev *realdev;

	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = &mddev->devices[i];
		if (!realdev->sb)
			continue;
		descriptor = &sb->disks[realdev->sb->descriptor.number];
		if (descriptor->state & (1 << MD_FAULTY_DEVICE))
			continue;
		if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
			continue;
		return descriptor;
	}
	return NULL;
}
/*
 * parallel resyncing thread.
 *
 * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
 *        - fix read error handling
 */
int md_do_sync(struct md_dev *mddev)
{
	struct buffer_head *bh;
	int max_blocks, blocksize, curr_bsize, percent=1, j;
	kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
	int major = MAJOR(read_disk), minor = MINOR(read_disk);
	unsigned long starttime;

	blocksize = blksize_size[major][minor];
	max_blocks = blk_size[major][minor] / (blocksize >> 10);

	printk("... resync log\n");
	printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
	printk(" .... raid array: %s\n", kdevname(read_disk));
	printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
	printk("md: syncing RAID array %s\n", kdevname(read_disk));

	mddev->busy++;

	starttime=jiffies;
	for (j = 0; j < max_blocks; j++) {

		/*
		 * Be careful. When someone mounts a non-'blocksize'
		 * filesystem then we get the blocksize changed right under
		 * us. Go deal with it transparently, recalculate
		 * 'blocksize', 'j' and 'max_blocks':
		 */
		curr_bsize = blksize_size[major][minor];
		if (curr_bsize != blocksize) {
	diff_blocksize:
			if (curr_bsize > blocksize)
				/*
				 * this is safe, rounds downwards.
				 */
				j /= curr_bsize/blocksize;
			else
				j *= blocksize/curr_bsize;

			blocksize = curr_bsize;
			max_blocks = blk_size[major][minor] / (blocksize >> 10);
		}
		if ((bh = breada (read_disk, j, blocksize, j * blocksize,
					max_blocks * blocksize)) != NULL) {
			mark_buffer_dirty(bh, 1);
			brelse(bh);
		} else {
			/*
			 * FIXME: Ugly, but set_blocksize() isn't safe ...
			 */
			curr_bsize = blksize_size[major][minor];
			if (curr_bsize != blocksize)
				goto diff_blocksize;

			/*
			 * It's a real read problem. FIXME, handle this
			 * a better way.
			 */
			printk ( KERN_ALERT
				 "read error, stopping reconstruction.\n");
			mddev->busy--;
			return 1;
		}

		/*
		 * Let's sleep some if we are faster than our speed limit:
		 */
		while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
		{
			current->state = TASK_INTERRUPTIBLE;
			current->timeout = jiffies+1;
			schedule();
		}

		/*
		 * FIXME: put this status bar thing into /proc
		 */
		if (!(j%(max_blocks/100))) {
			if (!(percent%10))
				printk (" %03d%% done.\n",percent);
			else
				printk (".");
			percent++;
		}
	}
	fsync_dev(read_disk);
	printk("md: %s: sync done.\n", kdevname(read_disk));
	mddev->busy--;
	return 0;
}
/*
 * This is a kernel thread which syncs a spare disk with the active array.
 *
 * The amount of foolproofing might seem to be a tad excessive, but an
 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
 * of my root partition with the first 0.5 gigs of my /home partition ... so
 * I'm a bit nervous ;)
 */
void mdsyncd (void *data)
{
	int i;
	struct md_dev *mddev;
	md_superblock_t *sb;
	md_descriptor_t *spare;
	unsigned long flags;

	for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
		if ((sb = mddev->sb) == NULL)
			continue;
		if (sb->active_disks == sb->raid_disks)
			continue;
		if (!sb->spare_disks)
			continue;
		if ((spare = get_spare(mddev)) == NULL)
			continue;
		if (!mddev->pers->mark_spare)
			continue;
		if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
			continue;
		if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
			mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
			continue;
		}
		save_flags(flags);
		cli();
		mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
		spare->state |= (1 << MD_SYNC_DEVICE);
		spare->state |= (1 << MD_ACTIVE_DEVICE);
		sb->spare_disks--;
		sb->active_disks++;
		mddev->sb_dirty = 1;
		md_update_sb(mddev - md_dev);
		restore_flags(flags);
	}
}
void linear_init (void);
void raid0_init (void);
void raid1_init (void);
void raid5_init (void);

__initfunc(int md_init (void))
{
	int i;

	printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
		MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
		MAX_MD_DEV, MAX_REAL);

	if (register_blkdev (MD_MAJOR, "md", &md_fops))
	{
		printk ("Unable to get major %d for md\n", MD_MAJOR);
		return (-1);
	}

	memset(md_threads, 0, MAX_MD_THREADS * sizeof(struct md_thread));
	printk("md: starting %d kernel threads\n", MAX_MD_THREADS);
	for (i = 0; i < MAX_MD_THREADS; i++) {
		md_threads[i].run = NULL;
		init_waitqueue(&md_threads[i].wqueue);
		md_threads[i].flags = 0;
		kernel_thread (md_thread, md_threads + i, 0);
	}

	blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
	blk_dev[MD_MAJOR].current_request=NULL;
	read_ahead[MD_MAJOR]=INT_MAX;
	memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
	md_gendisk.next=gendisk_head;

	gendisk_head=&md_gendisk;

#if SUPPORT_RECONSTRUCTION
	if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
		printk("md: bug: md_sync_thread == NULL\n");
#endif /* SUPPORT_RECONSTRUCTION */

#ifdef CONFIG_MD_LINEAR
	linear_init ();
#endif
#ifdef CONFIG_MD_STRIPED
	raid0_init ();
#endif
#ifdef CONFIG_MD_MIRRORING
	raid1_init ();
#endif
#ifdef CONFIG_MD_RAID5
	raid5_init ();
#endif
	return (0);
}