/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1994-96 Marc ZYNGIER
	  <zyngier@ufr-info-p7.ibp.fr> or
	  <maz@gloups.fdn.fr>

   A lot of inspiration came from hd.c ...

   kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>

   RAID-1/RAID-5 extensions by:
	Ingo Molnar, Miguel de Icaza, Gadi Oxman

   Changes for kmod by:
	Cyrus Durgin

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

/*
 * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
 * the extra system load does not show up that much. Increase it if your
 * system can take more.
 */
#define SPEED_LIMIT 1024
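/*
 * The limit is enforced in md_do_sync() below: the resync loop converts
 * its progress to KB/sec and sleeps a jiffy at a time whenever it runs
 * ahead of SPEED_LIMIT.
 */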
#include <linux/config.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/malloc.h>
#include <linux/mm.h>
#include <linux/md.h>
#include <linux/hdreg.h>
#include <linux/stat.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/smp_lock.h>
#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif
#include <linux/errno.h>
#include <linux/init.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>
#include <linux/blkpg.h>
#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <asm/atomic.h>

#ifdef CONFIG_MD_BOOT
extern kdev_t name_to_kdev_t(char *line) __init;
#endif
static struct hd_struct md_hd_struct[MAX_MD_DEV];
static int md_blocksizes[MAX_MD_DEV];
int md_maxreadahead[MAX_MD_DEV];
#if SUPPORT_RECONSTRUCTION
static struct md_thread *md_sync_thread = NULL;
#endif /* SUPPORT_RECONSTRUCTION */

int md_size[MAX_MD_DEV]={0, };

static void md_geninit (struct gendisk *);

static struct gendisk md_gendisk=
{
	MD_MAJOR,
	"md",
	0,			/* minor_shift */
	1,			/* max_p */
	MAX_MD_DEV,
	md_geninit,
	md_hd_struct,
	md_size,
	MAX_MD_DEV,
	NULL,
	NULL
};

static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
struct md_dev md_dev[MAX_MD_DEV];

int md_thread(void * arg);
static int legacy_raid_sb (int minor, int pnum)
{
	int i, factor;

	factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

	/*****
	 * do size and offset calculations.
	 */
	for (i=0; i<md_dev[minor].nb_dev; i++) {
		md_dev[minor].devices[i].size &= ~(factor - 1);
		md_size[minor] += md_dev[minor].devices[i].size;
		md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
						     md_dev[minor].devices[i-1].size) : 0;
	}
	if (pnum == RAID0 >> PERSONALITY_SHIFT)
		md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
	return 0;
}
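/*
 * Example of the rounding above: if FACTOR_SHIFT yields a 32 KB chunk,
 * factor is 32 and "size &= ~(factor - 1)" trims a 1000 KB component
 * down to 992 KB, so every component is a whole number of chunks and
 * the offsets stack back to back without gaps.
 */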
static void free_sb (struct md_dev *mddev)
{
	int i;
	struct real_dev *realdev;

	if (mddev->sb) {
		free_page((unsigned long) mddev->sb);
		mddev->sb = NULL;
	}
	for (i = 0; i <mddev->nb_dev; i++) {
		realdev = mddev->devices + i;
		if (realdev->sb) {
			free_page((unsigned long) realdev->sb);
			realdev->sb = NULL;
		}
	}
}
/*
 * Check one RAID superblock for generic plausibility
 */

#define BAD_MAGIC KERN_ERR \
"md: %s: invalid raid superblock magic (%x) on block %u\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define SUCCESS 0
#define FAILURE -1
static int analyze_one_sb (struct real_dev * rdev)
{
	int ret = FAILURE;
	struct buffer_head *bh;
	kdev_t dev = rdev->dev;
	md_superblock_t *sb;

	/*
	 * Read the superblock, it's at the end of the disk
	 */
	rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
	set_blocksize (dev, MD_SB_BYTES);
	bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);

	if (bh) {
		sb = (md_superblock_t *) bh->b_data;
		if (sb->md_magic != MD_SB_MAGIC) {
			printk (BAD_MAGIC, kdevname(dev),
				sb->md_magic, rdev->sb_offset);
			goto abort;
		}
		rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
		if (!rdev->sb) {
			printk (OUT_OF_MEM);
			goto abort;
		}
		memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);

		rdev->size = sb->size;
	} else
		printk (NO_DEVICE,kdevname(rdev->dev));
	ret = SUCCESS;
abort:
	if (bh)
		brelse (bh);
	return ret;
}
#undef SUCCESS
#undef FAILURE

#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_DEVICE
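/*
 * A note on the arithmetic above: sb_offset is kept in 1 KB blocks, so
 * after set_blocksize(dev, MD_SB_BYTES) the bread() index is converted
 * with "/ MD_SB_BLOCKS" (the number of 1 KB blocks per MD_SB_BYTES).
 * The offset itself comes from MD_NEW_SIZE_BLOCKS(), which points at
 * the reserved superblock area at the end of the component device.
 */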
/*
 * Check a full RAID array for plausibility
 */

#define INCONSISTENT KERN_ERR \
"md: superblock inconsistency -- run ckraid\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: %s: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN KERN_ERR \
"md: %s: raid array is not clean -- run ckraid\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: %s: raid array is not clean -- reconstructing parity\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: %s: unsupported raid level %d\n"
static int analyze_sbs (int minor, int pnum)
{
	struct md_dev *mddev = md_dev + minor;
	int i, N = mddev->nb_dev, out_of_date = 0;
	struct real_dev * disks = mddev->devices;
	md_superblock_t *sb, *freshest = NULL;

	/*
	 * RAID-0 and linear don't use a RAID superblock
	 */
	if (pnum == RAID0 >> PERSONALITY_SHIFT ||
	    pnum == LINEAR >> PERSONALITY_SHIFT)
		return legacy_raid_sb (minor, pnum);

	/*
	 * Verify the RAID superblock on each real device
	 */
	for (i = 0; i < N; i++)
		if (analyze_one_sb(disks+i))
			goto abort;

	/*
	 * The superblock constant part has to be the same
	 * for all disks in the array.
	 */
	sb = NULL;
	for (i = 0; i < N; i++) {
		if (!disks[i].sb)
			continue;
		if (!sb) {
			sb = disks[i].sb;
			continue;
		}
		if (memcmp(sb,
			   disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
			printk (INCONSISTENT);
			goto abort;
		}
	}

	/*
	 * OK, we have all disks and the array is ready to run. Let's
	 * find the freshest superblock, that one will be the superblock
	 * that represents the whole array.
	 */
	if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
		goto abort;
	freshest = NULL;
	for (i = 0; i < N; i++) {
		if (!disks[i].sb)
			continue;
		if (!freshest) {
			freshest = disks[i].sb;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		if (disks[i].sb->utime != freshest->utime) {
			out_of_date = 1;
			if (disks[i].sb->utime > freshest->utime)
				freshest = disks[i].sb;
		}
	}
	if (out_of_date)
		printk(OUT_OF_DATE);
	memcpy (sb, freshest, sizeof(*freshest));

	/*
	 * Check if we can support this RAID array
	 */
	if (sb->major_version != MD_MAJOR_VERSION ||
	    sb->minor_version > MD_MINOR_VERSION) {
		printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
			sb->major_version, sb->minor_version,
			sb->patch_version);
		goto abort;
	}

	/*
	 * We need to add this as a superblock option.
	 */
#if SUPPORT_RECONSTRUCTION
	if (sb->state != (1 << MD_SB_CLEAN)) {
		if (sb->level == 1) {
			printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
			goto abort;
		} else
			printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
	}
#else
	if (sb->state != (1 << MD_SB_CLEAN)) {
		printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}
#endif /* SUPPORT_RECONSTRUCTION */

	switch (sb->level) {
		case 1:
			md_size[minor] = sb->size;
			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
			break;
		case 4:
		case 5:
			md_size[minor] = sb->size * (sb->raid_disks - 1);
			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
			break;
		default:
			printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
				sb->level);
			goto abort;
	}
	return 0;
abort:
	free_sb(mddev);
	return 1;
}
#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef NOT_CLEAN
#undef NOT_CLEAN_IGNORE
#undef UNKNOWN_LEVEL
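/*
 * The md_size arithmetic in the switch above follows from the layouts:
 * RAID-1 mirrors, so the array is only as large as one member
 * (sb->size KB); RAID-4/5 dedicate one disk's worth of space to parity,
 * so e.g. four 1000 MB members yield a 3000 MB array:
 * sb->size * (raid_disks - 1).
 */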
int md_update_sb(int minor)
{
	struct md_dev *mddev = md_dev + minor;
	struct buffer_head *bh;
	md_superblock_t *sb = mddev->sb;
	struct real_dev *realdev;
	kdev_t dev;
	int i;
	u32 sb_offset;

	sb->utime = CURRENT_TIME;
	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = mddev->devices + i;
		if (!realdev->sb)
			continue;
		dev = realdev->dev;
		sb_offset = realdev->sb_offset;
		set_blocksize(dev, MD_SB_BYTES);
		printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
		bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
		if (bh) {
			sb = (md_superblock_t *) bh->b_data;
			memcpy(sb, mddev->sb, MD_SB_BYTES);
			memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
			mark_buffer_uptodate(bh, 1);
			mark_buffer_dirty(bh, 1);
			ll_rw_block(WRITE, 1, &bh);
			wait_on_buffer(bh);
			bforget(bh);
			fsync_dev(dev);
			invalidate_buffers(dev);
		} else
			printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
	}
	return 0;
}
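/*
 * Note that getblk() is sufficient above (no bread()): the buffer is
 * overwritten in full with MD_SB_BYTES of fresh superblock data, with
 * only the per-device descriptor patched in, so reading the old block
 * first would be wasted work. The bforget()/invalidate_buffers() pair
 * then drops the aliased metadata block from the cache again.
 */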
static int do_md_run (int minor, int repart)
{
	int pnum, i, min, factor, err;

	if (!md_dev[minor].nb_dev)
		return -EINVAL;

	if (md_dev[minor].pers)
		return -EBUSY;

	md_dev[minor].repartition=repart;

	if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
	    >= MAX_PERSONALITY)
		return -EINVAL;

	/* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
	if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
		for (i = 0; i < md_dev [minor].nb_dev; i++)
			if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
				return -EINVAL;
	}
	if (!pers[pnum])
	{
#ifdef CONFIG_KMOD
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
			return -EINVAL;
	}

	factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

	for (i=0; i<md_dev[minor].nb_dev; i++)
		if (md_dev[minor].devices[i].size<min)
		{
			printk ("Dev %s smaller than %dk, cannot shrink\n",
				partition_name (md_dev[minor].devices[i].dev), min);
			return -EINVAL;
		}

	for (i=0; i<md_dev[minor].nb_dev; i++) {
		fsync_dev(md_dev[minor].devices[i].dev);
		invalidate_buffers(md_dev[minor].devices[i].dev);
	}

	/* Resize devices according to the factor. It is used to align
	   partition sizes on a given chunk size. */
	md_size[minor]=0;

	/*
	 * Analyze the raid superblock
	 */
	if (analyze_sbs(minor, pnum))
		return -EINVAL;

	md_dev[minor].pers=pers[pnum];

	if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
	{
		md_dev[minor].pers=NULL;
		free_sb(md_dev + minor);
		return (err);
	}

	if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
	{
		md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
		md_update_sb(minor);
	}

	/* FIXME : We assume here we have blocks
	   that are twice as large as sectors.
	   THIS MAY NOT BE TRUE !!! */
	md_hd_struct[minor].start_sect=0;
	md_hd_struct[minor].nr_sects=md_size[minor]<<1;

	read_ahead[MD_MAJOR] = 128;
	return (0);
}
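/*
 * On the <<1 above: md_size[] is kept in 1 KB blocks while hd_struct
 * wants 512-byte sectors, so a 4096 KB array advertises 8192 sectors.
 * This is exactly the "blocks twice as large as sectors" assumption
 * that the FIXME warns about.
 */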
static int do_md_stop (int minor, struct inode *inode)
{
	int i;

	if (inode->i_count>1 || md_dev[minor].busy>1) {
		/*
		 * ioctl : one open channel
		 */
		printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
			minor, inode->i_count, md_dev[minor].busy);
		return -EBUSY;
	}

	if (md_dev[minor].pers) {
		/*
		 * It is safe to call stop here, it only frees private
		 * data. Also, it tells us if a device is unstoppable
		 * (eg. resyncing is in progress)
		 */
		if (md_dev[minor].pers->stop (minor, md_dev+minor))
			return -EBUSY;
		/*
		 * The device won't exist anymore -> flush it now
		 */
		fsync_dev (inode->i_rdev);
		invalidate_buffers (inode->i_rdev);
		if (md_dev[minor].sb) {
			md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
			md_update_sb(minor);
		}
	}

	/* Remove locks. */
	if (md_dev[minor].sb)
		free_sb(md_dev + minor);
	for (i=0; i<md_dev[minor].nb_dev; i++)
		clear_inode (md_dev[minor].devices[i].inode);

	md_dev[minor].nb_dev=md_size[minor]=0;
	md_hd_struct[minor].nr_sects=0;
	md_dev[minor].pers=NULL;

	read_ahead[MD_MAJOR] = 128;

	return (0);
}
static int do_md_add (int minor, kdev_t dev)
{
	int i;
	int hot_add=0;
	struct real_dev *realdev;

	if (md_dev[minor].nb_dev==MAX_REAL)
		return -EINVAL;

	if (!fs_may_mount (dev))
		return -EBUSY;

	if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
		printk("md_add(): zero device size, huh, bailing out.\n");
		return -EINVAL;
	}

	if (md_dev[minor].pers) {
		/*
		 * The array is already running, hot-add the drive, or
		 * bail out:
		 */
		if (!md_dev[minor].pers->hot_add_disk)
			return -EBUSY;
		else
			hot_add=1;
	}

	/*
	 * Careful. We cannot increase nb_dev for a running array.
	 */
	i=md_dev[minor].nb_dev;
	realdev = &md_dev[minor].devices[i];
	realdev->dev=dev;

	/* Lock the device by inserting a dummy inode. This doesn't
	   smell very good, but I need to be consistent with the
	   mount stuff, especially with fs_may_mount. If someone has
	   a better idea, please help! */
	realdev->inode=get_empty_inode ();
	realdev->inode->i_dev=dev;	/* don't care about other fields */
	insert_inode_hash (realdev->inode);

	/* Sizes are now rounded at run time */

/*	md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/

	realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];

	if (hot_add) {
		/*
		 * Check the superblock for consistency.
		 * The personality itself has to check whether it's getting
		 * added with the proper flags. The personality has to be
		 * checked too. ;)
		 */
		if (analyze_one_sb (realdev))
			return -EINVAL;
		/*
		 * hot_add has to bump up nb_dev itself
		 */
		if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
			/*
			 * FIXME: here we should free up the inode and stuff
			 */
			printk ("FIXME\n");
			return -EINVAL;
		}
	} else
		md_dev[minor].nb_dev++;

	printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
	return (0);
}
static int md_ioctl (struct inode *inode, struct file *file,
		     unsigned int cmd, unsigned long arg)
{
	int minor, err;
	struct hd_geometry *loc = (struct hd_geometry *) arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
	    (minor & 0x7f) < MAX_PERSONALITY &&
	    pers[minor & 0x7f] &&
	    pers[minor & 0x7f]->ioctl)
		return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));

	if (minor >= MAX_MD_DEV)
		return -EINVAL;

	switch (cmd)
	{
		case REGISTER_DEV:
			return do_md_add (minor, to_kdev_t ((dev_t) arg));

		case START_MD:
			return do_md_run (minor, (int) arg);

		case STOP_MD:
			return do_md_stop (minor, inode);

		case BLKGETSIZE:	/* Return device size */
			if (!arg)  return -EINVAL;
			err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
			if (err)
				return err;
			break;

		/* We have a problem here : there is no easy way to give a CHS
		   virtual geometry. We currently pretend that we have a 2-head,
		   4-sector geometry (with a BIG number of cylinders...). This
		   drives dosfs just mad... ;-) */

		case HDIO_GETGEO:
			if (!loc)  return -EINVAL;
			err = put_user (2, (char *) &loc->heads);
			if (err)
				return err;
			err = put_user (4, (char *) &loc->sectors);
			if (err)
				return err;
			err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
			if (err)
				return err;
			err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
					(long *) &loc->start);
			if (err)
				return err;
			break;

		case BLKROSET:
		case BLKROGET:
		case BLKRAGET:
		case BLKRASET:
		case BLKFLSBUF:
			return blk_ioctl(inode->i_rdev, cmd, arg);

		default:
			return -EINVAL;
	}

	return (0);
}
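/*
 * The fake geometry above is at least self-consistent: 2 heads * 4
 * sectors gives 8 sectors per cylinder, so cylinders = nr_sects/8 keeps
 * C*H*S equal to the real capacity, just spread over an absurd number
 * of cylinders.
 */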
static int md_open (struct inode *inode, struct file *file)
{
	int minor=MINOR(inode->i_rdev);

	md_dev[minor].busy++;
	return (0);			/* Always succeed */
}


static int md_release (struct inode *inode, struct file *file)
{
	int minor=MINOR(inode->i_rdev);

	sync_dev (inode->i_rdev);
	md_dev[minor].busy--;
	return 0;
}
static ssize_t md_read (struct file *file, char *buf, size_t count,
			loff_t *ppos)
{
	int minor=MINOR(file->f_dentry->d_inode->i_rdev);

	if (!md_dev[minor].pers)	/* Check if device is being run */
		return -ENXIO;

	return block_read(file, buf, count, ppos);
}

static ssize_t md_write (struct file *file, const char *buf,
			 size_t count, loff_t *ppos)
{
	int minor=MINOR(file->f_dentry->d_inode->i_rdev);

	if (!md_dev[minor].pers)	/* Check if device is being run */
		return -ENXIO;

	return block_write(file, buf, count, ppos);
}
static struct file_operations md_fops=
{
	NULL,			/* llseek */
	md_read,
	md_write,
	NULL,			/* readdir */
	NULL,			/* poll */
	md_ioctl,
	NULL,			/* mmap */
	md_open,
	NULL,			/* flush */
	md_release,
	block_fsync
};
int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
{
	if ((unsigned int) minor >= MAX_MD_DEV)
	{
		printk ("Bad md device %d\n", minor);
		return (-1);
	}

	if (!md_dev[minor].pers)
	{
		printk ("Oops ! md%d not running, giving up !\n", minor);
		return (-1);
	}

	return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
}
int md_make_request (int minor, int rw, struct buffer_head * bh)
{
	if (md_dev [minor].pers->make_request) {
		if (buffer_locked(bh))
			return 0;
		set_bit(BH_Lock, &bh->b_state);
		if (rw == WRITE || rw == WRITEA) {
			if (!buffer_dirty(bh)) {
				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
				return 0;
			}
		}
		if (rw == READ || rw == READA) {
			if (buffer_uptodate(bh)) {
				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
				return 0;
			}
		}
		return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
	} else {
		make_request (MAJOR(bh->b_rdev), rw, bh);
		return 0;
	}
}
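/*
 * Two paths above: personalities with their own make_request() (the
 * RAID-1/RAID-5 style ones) receive the locked buffer head directly and
 * take over I/O scheduling, while for the simple remapping
 * personalities the buffer falls through to the generic make_request()
 * against bh->b_rdev, which the caller is expected to have remapped
 * via md_map().
 */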
static void do_md_request (void)
{
	printk ("Got md request, not good...");
	return;
}

void md_wakeup_thread(struct md_thread *thread)
{
	set_bit(THREAD_WAKEUP, &thread->flags);
	wake_up(&thread->wqueue);
}
struct md_thread *md_register_thread (void (*run) (void *), void *data)
{
	struct md_thread *thread = (struct md_thread *)
		kmalloc(sizeof(struct md_thread), GFP_KERNEL);
	int ret;
	DECLARE_MUTEX_LOCKED(sem);

	if (!thread) return NULL;

	memset(thread, 0, sizeof(struct md_thread));
	init_waitqueue_head(&thread->wqueue);

	thread->sem = &sem;
	thread->run = run;
	thread->data = data;
	ret = kernel_thread(md_thread, thread, 0);
	if (ret < 0) {
		kfree(thread);
		return NULL;
	}
	down(&sem);
	return thread;
}
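/*
 * The on-stack semaphore is the startup handshake: md_thread() calls
 * up(thread->sem) only after it has detached from user resources and
 * stored its task_struct in thread->tsk, so the down(&sem) above cannot
 * return before the new thread is fully initialized.
 */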
void md_unregister_thread (struct md_thread *thread)
{
	DECLARE_MUTEX_LOCKED(sem);

	thread->sem = &sem;
	thread->run = NULL;
	if (thread->tsk)
		printk("Killing md_thread %d %p %s\n",
		       thread->tsk->pid, thread->tsk, thread->tsk->comm);
	else
		printk("Aiee. md_thread has 0 tsk\n");
	send_sig(SIGKILL, thread->tsk, 1);
	printk("downing on %p\n", &sem);
	down(&sem);
}
#define SHUTDOWN_SIGS	(sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))

int md_thread(void * arg)
{
	struct md_thread *thread = arg;

	lock_kernel();
	exit_mm(current);
	exit_files(current);
	exit_fs(current);

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "md_thread");
	siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
	thread->tsk = current;
	up(thread->sem);

	for (;;) {
		cli();
		if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
			do {
				spin_lock(&current->sigmask_lock);
				flush_signals(current);
				spin_unlock(&current->sigmask_lock);
				interruptible_sleep_on(&thread->wqueue);
				cli();
				if (test_bit(THREAD_WAKEUP, &thread->flags))
					break;
				if (!thread->run) {
					sti();
					up(thread->sem);
					return 0;
				}
			} while (signal_pending(current));
		}
		sti();
		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (thread->run) {
			thread->run(thread->data);
			run_task_queue(&tq_disk);
		}
	}
}
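/*
 * Shutdown protocol for the loop above: md_unregister_thread() clears
 * thread->run and sends SIGKILL, the interruptible sleep returns with a
 * signal pending, and the NULL run function makes the thread up() the
 * caller's semaphore and exit.
 */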
EXPORT_SYMBOL(md_size);
EXPORT_SYMBOL(md_maxreadahead);
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_dev);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_update_sb);
EXPORT_SYMBOL(md_map);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_do_sync);
#ifdef CONFIG_PROC_FS
static struct proc_dir_entry proc_md = {
	PROC_MD, 6, "mdstat",
	S_IFREG | S_IRUGO, 1, 0, 0,
	0, &proc_array_inode_operations,
};
#endif
static void md_geninit (struct gendisk *gdisk)
{
	int i;

	for(i=0;i<MAX_MD_DEV;i++)
	{
		md_blocksizes[i] = 1024;
		md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
		md_gendisk.part[i].start_sect=-1; /* avoid partition check */
		md_gendisk.part[i].nr_sects=0;
		md_dev[i].pers=NULL;
	}

	blksize_size[MD_MAJOR] = md_blocksizes;
	max_readahead[MD_MAJOR] = md_maxreadahead;

#ifdef CONFIG_PROC_FS
	proc_register(&proc_root, &proc_md);
#endif
}
int md_error (kdev_t mddev, kdev_t rdev)
{
	unsigned int minor = MINOR (mddev);
	int rc;

	if (MAJOR(mddev) != MD_MAJOR || minor >= MAX_MD_DEV)
		panic ("md_error gets unknown device\n");
	if (!md_dev [minor].pers)
		panic ("md_error gets an error for an unknown device\n");
	if (md_dev [minor].pers->error_handler) {
		rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
#if SUPPORT_RECONSTRUCTION
		md_wakeup_thread(md_sync_thread);
#endif /* SUPPORT_RECONSTRUCTION */
		return rc;
	}
	return 0;
}
int get_md_status (char *page)
{
	int sz=0, i, j, size;

	sz+=sprintf( page+sz, "Personalities : ");
	for (i=0; i<MAX_PERSONALITY; i++)
		if (pers[i])
			sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);

	page[sz-1]='\n';

	sz+=sprintf (page+sz, "read_ahead ");
	if (read_ahead[MD_MAJOR]==INT_MAX)
		sz+=sprintf (page+sz, "not set\n");
	else
		sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);

	for (i=0; i<MAX_MD_DEV; i++)
	{
		sz+=sprintf (page+sz, "md%d : %sactive", i, md_dev[i].pers ? "" : "in");

		if (md_dev[i].pers)
			sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);

		size=0;
		for (j=0; j<md_dev[i].nb_dev; j++)
		{
			sz+=sprintf (page+sz, " %s",
				     partition_name(md_dev[i].devices[j].dev));
			size+=md_dev[i].devices[j].size;
		}

		if (md_dev[i].nb_dev) {
			if (md_dev[i].pers)
				sz+=sprintf (page+sz, " %d blocks", md_size[i]);
			else
				sz+=sprintf (page+sz, " %d blocks", size);
		}

		if (!md_dev[i].pers)
		{
			sz+=sprintf (page+sz, "\n");
			continue;
		}

		if (md_dev[i].pers->max_invalid_dev)
			sz+=sprintf (page+sz, " maxfault=%ld", MAX_FAULT(md_dev+i));

		sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
		sz+=sprintf (page+sz, "\n");
	}

	return (sz);
}
int register_md_personality (int p_num, struct md_personality *p)
{
	int i=(p_num >> PERSONALITY_SHIFT);

	if (i >= MAX_PERSONALITY)
		return -EINVAL;

	if (pers[i])
		return -EBUSY;

	pers[i]=p;
	printk ("%s personality registered\n", p->name);
	return 0;
}

int unregister_md_personality (int p_num)
{
	int i=(p_num >> PERSONALITY_SHIFT);

	if (i >= MAX_PERSONALITY)
		return -EINVAL;

	printk ("%s personality unregistered\n", pers[i]->name);
	pers[i]=NULL;
	return 0;
}
static md_descriptor_t *get_spare(struct md_dev *mddev)
{
	int i;
	md_superblock_t *sb = mddev->sb;
	md_descriptor_t *descriptor;
	struct real_dev *realdev;

	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = &mddev->devices[i];
		if (!realdev->sb)
			continue;
		descriptor = &sb->disks[realdev->sb->descriptor.number];
		if (descriptor->state & (1 << MD_FAULTY_DEVICE))
			continue;
		if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
			continue;
		return descriptor;
	}
	return NULL;
}
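/*
 * A usable spare, per the loop above, is simply a member whose
 * descriptor carries neither MD_FAULTY_DEVICE nor MD_ACTIVE_DEVICE;
 * mdsyncd() below promotes it to active once md_do_sync() has
 * rebuilt it.
 */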
/*
 * parallel resyncing thread.
 *
 * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
 *        - fix read error handling
 */
int md_do_sync(struct md_dev *mddev)
{
	struct buffer_head *bh;
	int max_blocks, blocksize, curr_bsize, percent=1, j;
	kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
	int major = MAJOR(read_disk), minor = MINOR(read_disk);
	unsigned long starttime;

	blocksize = blksize_size[major][minor];
	max_blocks = blk_size[major][minor] / (blocksize >> 10);

	printk("... resync log\n");
	printk(" ....   mddev->nb_dev: %d\n", mddev->nb_dev);
	printk(" ....   raid array: %s\n", kdevname(read_disk));
	printk(" ....   max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
	printk("md: syncing RAID array %s\n", kdevname(read_disk));

	mddev->busy++;

	starttime=jiffies;
	for (j = 0; j < max_blocks; j++) {

		/*
		 * Be careful. When someone mounts a non-'blocksize' filesystem
		 * then we get the blocksize changed right under us. Go deal
		 * with it transparently, recalculate 'blocksize', 'j' and
		 * 'max_blocks':
		 */
		curr_bsize = blksize_size[major][minor];
		if (curr_bsize != blocksize) {
diff_blocksize:
			if (curr_bsize > blocksize)
				/*
				 * this is safe, rounds downwards.
				 */
				j /= curr_bsize/blocksize;
			else
				j *= blocksize/curr_bsize;

			blocksize = curr_bsize;
			max_blocks = blk_size[major][minor] / (blocksize >> 10);
		}
		if ((bh = breada (read_disk, j, blocksize, j * blocksize,
					max_blocks * blocksize)) != NULL) {
			mark_buffer_dirty(bh, 1);
			brelse(bh);
		} else {
			/*
			 * FIXME: Ugly, but set_blocksize() isn't safe ...
			 */
			curr_bsize = blksize_size[major][minor];
			if (curr_bsize != blocksize)
				goto diff_blocksize;

			/*
			 * It's a real read problem. FIXME, handle this
			 * a better way.
			 */
			printk ( KERN_ALERT
				"read error, stopping reconstruction.\n");
			mddev->busy--;
			return 1;
		}

		/*
		 * Let's sleep some if we are faster than our speed limit:
		 */
		while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
		{
			current->state = TASK_INTERRUPTIBLE;
			schedule_timeout(1);
		}

		/*
		 * FIXME: put this status bar thing into /proc
		 */
		if (!(j%(max_blocks/100))) {
			if (!(percent%10))
				printk (" %03d%% done.\n",percent);
			else
				printk (".");
			percent++;
		}
	}
	fsync_dev(read_disk);
	printk("md: %s: sync done.\n", kdevname(read_disk));
	mddev->busy--;
	return 0;
}
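/*
 * On the throttle expression above: blocksize*j is the number of bytes
 * resynced so far; dividing by elapsed jiffies (+1 to avoid a zero
 * divide) and multiplying by HZ gives bytes/sec, and /1024 converts to
 * the KB/sec that SPEED_LIMIT is expressed in. E.g. 1024-byte blocks
 * read at 2048 blocks/sec evaluate to 2048 KB/sec and trigger the sleep.
 */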
/*
 * This is a kernel thread which syncs a spare disk with the active array.
 *
 * the amount of foolproofing might seem to be a tad excessive, but an
 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
 * of my root partition with the first 0.5 gigs of my /home partition ... so
 * i'm a bit nervous ;)
 */
void mdsyncd (void *data)
{
	int i;
	struct md_dev *mddev;
	md_superblock_t *sb;
	md_descriptor_t *spare;
	unsigned long flags;

	for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
		if ((sb = mddev->sb) == NULL)
			continue;
		if (sb->active_disks == sb->raid_disks)
			continue;
		if (!sb->spare_disks)
			continue;
		if ((spare = get_spare(mddev)) == NULL)
			continue;
		if (!mddev->pers->mark_spare)
			continue;
		if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
			continue;
		if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
			mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
			continue;
		}
		save_flags(flags);
		cli();
		mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
		spare->state |= (1 << MD_SYNC_DEVICE);
		spare->state |= (1 << MD_ACTIVE_DEVICE);
		sb->spare_disks--;
		sb->active_disks++;
		mddev->sb_dirty = 1;
		md_update_sb(mddev - md_dev);
		restore_flags(flags);
	}
}
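/*
 * Note the ordering above: the spare is marked SPARE_WRITE before the
 * resync, so a crash mid-rebuild leaves it inactive, and only after
 * md_do_sync() succeeds (and the device did not fault meanwhile) is it
 * promoted to active, with the disk counters and on-disk superblock
 * updated in the same step.
 */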
#ifdef CONFIG_MD_BOOT
struct {
	int set;
	int ints[100];
	char str[100];
} md_setup_args __initdata = {
	0,{0},{0}
};

/* called from init/main.c */
__initfunc(void md_setup(char *str,int *ints))
{
	int i;
	for(i=0;i<=ints[0];i++) {
		md_setup_args.ints[i] = ints[i];
		strcpy(md_setup_args.str, str);
/*		printk ("md: ints[%d]=%d.\n", i, ints[i]);*/
	}
	md_setup_args.set=1;
	return;
}
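/*
 * The arguments saved above are parsed below. The boot line format
 * (see Documentation/md.txt) is:
 *   md=<minor>,<raidlevel>,<chunksizefactor>,<faultlevel>,dev0,dev1,...
 * e.g. md=0,0,4,0,/dev/hdb1,/dev/hdc1 sets up md0 as a striped array.
 */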
__initfunc(void do_md_setup(char *str,int *ints))
{
	int minor, pers, factor, fault;
	kdev_t dev;
	int i=1;

	if(ints[0] < 4) {
		printk ("md: Too few arguments (%d).\n", ints[0]);
		return;
	}

	minor=ints[i++];

	if (minor >= MAX_MD_DEV) {
		printk ("md: Minor device number too high.\n");
		return;
	}

	pers = 0;

	switch(ints[i++]) {  /* Raidlevel */
	case -1:
#ifdef CONFIG_MD_LINEAR
		pers = LINEAR;
		printk ("md: Setting up md%d as linear device.\n",minor);
#else
		printk ("md: Linear mode not configured. "
			"Recompile the kernel with linear mode enabled!\n");
#endif
		break;
	case 0:
		pers = STRIPED;
#ifdef CONFIG_MD_STRIPED
		printk ("md: Setting up md%d as a striped device.\n",minor);
#else
		printk ("md: Striped mode not configured. "
			"Recompile the kernel with striped mode enabled!\n");
#endif
		break;
/* not supported yet
	case 1:
		pers = RAID1;
		printk ("md: Setting up md%d as a raid1 device.\n",minor);
		break;
	case 5:
		pers = RAID5;
		printk ("md: Setting up md%d as a raid5 device.\n",minor);
		break;
*/
	default:
		printk ("md: Unknown or not supported raid level %d.\n", ints[--i]);
		return;
	}

	if(pers) {

		factor=ints[i++]; /* Chunksize  */
		fault =ints[i++]; /* Faultlevel */

		pers=pers | factor | (fault << FAULT_SHIFT);

		while( str && (dev = name_to_kdev_t(str))) {
			do_md_add (minor, dev);
			if((str = strchr (str, ',')) != NULL)
				str++;
		}

		do_md_run (minor, pers);
		printk ("md: Loading md%d.\n",minor);
	}
}
#endif
void linear_init (void);
void raid0_init (void);
void raid1_init (void);
void raid5_init (void);

__initfunc(int md_init (void))
{
	printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
		MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
		MAX_MD_DEV, MAX_REAL);

	if (register_blkdev (MD_MAJOR, "md", &md_fops))
	{
		printk ("Unable to get major %d for md\n", MD_MAJOR);
		return (-1);
	}

	blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
	blk_dev[MD_MAJOR].current_request=NULL;
	read_ahead[MD_MAJOR]=INT_MAX;
	memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
	md_gendisk.next=gendisk_head;

	gendisk_head=&md_gendisk;

#if SUPPORT_RECONSTRUCTION
	if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
		printk("md: bug: md_sync_thread == NULL\n");
#endif /* SUPPORT_RECONSTRUCTION */

#ifdef CONFIG_MD_LINEAR
	linear_init ();
#endif
#ifdef CONFIG_MD_STRIPED
	raid0_init ();
#endif
#ifdef CONFIG_MD_MIRRORING
	raid1_init ();
#endif
#ifdef CONFIG_MD_RAID5
	raid5_init ();
#endif
	return (0);
}
#ifdef CONFIG_MD_BOOT
__initfunc(void md_setup_drive(void))
{
	if(md_setup_args.set)
		do_md_setup(md_setup_args.str, md_setup_args.ints);
}
#endif