/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1994-96 Marc ZYNGIER
	  <zyngier@ufr-info-p7.ibp.fr> or
	  <maz@gloups.fdn.fr>

   A lot of inspiration came from hd.c ...

   kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>

   RAID-1/RAID-5 extensions by:
	Ingo Molnar, Miguel de Icaza, Gadi Oxman

   Changes for kmod by:
	Cyrus Durgin

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
 * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
 * the extra system load does not show up that much. Increase it if your
 * system can take more.
 */
#define SPEED_LIMIT 1024
#include <linux/config.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/malloc.h>
#include <linux/mm.h>
#include <linux/md.h>
#include <linux/hdreg.h>
#include <linux/stat.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/smp_lock.h>
#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif
#include <linux/errno.h>
#include <linux/init.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>
#include <linux/blkpg.h>
#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <asm/atomic.h>
#ifdef CONFIG_MD_BOOT
extern kdev_t name_to_kdev_t(char *line) __init;
#endif
static struct hd_struct md_hd_struct[MAX_MD_DEV];
static int md_blocksizes[MAX_MD_DEV];
int md_maxreadahead[MAX_MD_DEV];
#if SUPPORT_RECONSTRUCTION
static struct md_thread *md_sync_thread = NULL;
#endif /* SUPPORT_RECONSTRUCTION */

int md_size[MAX_MD_DEV]={0, };
static void md_geninit (struct gendisk *);

static struct gendisk md_gendisk=
{
	MD_MAJOR,	/* major */
	"md",		/* major_name */
	0,		/* minor_shift: md devices are not partitioned */
	1,		/* max_p: one "partition" (the whole device) per unit */
	MAX_MD_DEV,	/* max_nr */
	md_geninit,	/* init */
	md_hd_struct,	/* part */
	md_size,	/* sizes */
	MAX_MD_DEV,	/* nr_real */
	NULL,		/* real_devices */
	NULL		/* next */
};
static struct md_personality *pers[MAX_PERSONALITY]={NULL, };
struct md_dev md_dev[MAX_MD_DEV];

int md_thread(void * arg);
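
/*
 * RAID-0 and linear arrays carry no on-disk superblock, so their layout
 * is computed here instead: each member device is rounded down to a
 * multiple of the chunk factor, and the members are simply concatenated
 * (each device's offset is the end of the previous one).
 */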
static int legacy_raid_sb (int minor, int pnum)
{
	int i, factor;

	factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

	/*****
	 * do size and offset calculations.
	 *****/
	for (i=0; i<md_dev[minor].nb_dev; i++) {
		md_dev[minor].devices[i].size &= ~(factor - 1);
		md_size[minor] += md_dev[minor].devices[i].size;
		md_dev[minor].devices[i].offset=i ? (md_dev[minor].devices[i-1].offset +
				md_dev[minor].devices[i-1].size) : 0;
	}
	if (pnum == RAID0 >> PERSONALITY_SHIFT)
		md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev;
	return 0;
}
static void free_sb (struct md_dev *mddev)
{
	int i;
	struct real_dev *realdev;

	if (mddev->sb) {
		free_page((unsigned long) mddev->sb);
		mddev->sb = NULL;
	}
	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = mddev->devices + i;
		if (realdev->sb) {
			free_page((unsigned long) realdev->sb);
			realdev->sb = NULL;
		}
	}
}
/*
 * Check one RAID superblock for generic plausibility
 */
#define BAD_MAGIC KERN_ERR \
"md: %s: invalid raid superblock magic (%x) on block %u\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define SUCCESS 0
#define FAILURE -1
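
/*
 * A note on the superblock location: MD_NEW_SIZE_BLOCKS() rounds the
 * device size down so that sb_offset points into the reserved area at
 * the very end of the device. The buffer cache is switched to an
 * MD_SB_BYTES blocksize before reading, which is why the offset (kept
 * in 1K blocks) is divided by MD_SB_BLOCKS to get a block number in
 * that blocksize.
 */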
static int analyze_one_sb (struct real_dev * rdev)
{
	int ret = FAILURE;
	struct buffer_head *bh;
	kdev_t dev = rdev->dev;
	md_superblock_t *sb;

	/*
	 * Read the superblock, it's at the end of the disk
	 */
	rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]);
	set_blocksize (dev, MD_SB_BYTES);
	bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);

	if (bh) {
		sb = (md_superblock_t *) bh->b_data;
		if (sb->md_magic != MD_SB_MAGIC) {
			printk (BAD_MAGIC, kdevname(dev),
				sb->md_magic, rdev->sb_offset);
			goto abort;
		}
		rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL);
		if (!rdev->sb) {
			printk (OUT_OF_MEM);
			goto abort;
		}
		memcpy (rdev->sb, bh->b_data, MD_SB_BYTES);

		rdev->size = sb->size;
	} else
		printk (NO_DEVICE, kdevname(rdev->dev));
	ret = SUCCESS;
abort:
	if (bh)
		brelse (bh);
	return ret;
}
#undef SUCCESS
#undef FAILURE

#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_DEVICE
/*
 * Check a full RAID array for plausibility
 */
#define INCONSISTENT KERN_ERR \
"md: superblock inconsistency -- run ckraid\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: %s: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN KERN_ERR \
"md: %s: raid array is not clean -- run ckraid\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: %s: raid array is not clean -- reconstructing parity\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: %s: unsupported raid level %d\n"
static int analyze_sbs (int minor, int pnum)
{
	struct md_dev *mddev = md_dev + minor;
	int i, N = mddev->nb_dev, out_of_date = 0;
	struct real_dev * disks = mddev->devices;
	md_superblock_t *sb, *freshest = NULL;

	/*
	 * RAID-0 and linear don't use a RAID superblock
	 */
	if (pnum == RAID0 >> PERSONALITY_SHIFT ||
			pnum == LINEAR >> PERSONALITY_SHIFT)
		return legacy_raid_sb (minor, pnum);

	/*
	 * Verify the RAID superblock on each real device
	 */
	for (i = 0; i < N; i++)
		if (analyze_one_sb(disks+i))
			goto abort;

	/*
	 * The superblock constant part has to be the same
	 * for all disks in the array.
	 */
	sb = NULL;
	for (i = 0; i < N; i++) {
		if (!disks[i].sb)
			continue;
		if (!sb) {
			sb = disks[i].sb;
			continue;
		}
		if (memcmp(sb,
			   disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) {
			printk (INCONSISTENT);
			goto abort;
		}
	}

	/*
	 * OK, we have all disks and the array is ready to run. Let's
	 * find the freshest superblock, that one will be the superblock
	 * that represents the whole array.
	 */
	if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL)
		goto abort;
	freshest = NULL;
	for (i = 0; i < N; i++) {
		if (!disks[i].sb)
			continue;
		if (!freshest) {
			freshest = disks[i].sb;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		if (disks[i].sb->utime != freshest->utime) {
			out_of_date = 1;
			if (disks[i].sb->utime > freshest->utime)
				freshest = disks[i].sb;
		}
	}
	if (out_of_date)
		printk(OUT_OF_DATE);
	memcpy (sb, freshest, sizeof(*freshest));

	/*
	 * Check if we can support this RAID array
	 */
	if (sb->major_version != MD_MAJOR_VERSION ||
			sb->minor_version > MD_MINOR_VERSION) {

		printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)),
			sb->major_version, sb->minor_version,
			sb->patch_version);
		goto abort;
	}

	/*
	 * We need to add this as a superblock option.
	 */
#if SUPPORT_RECONSTRUCTION
	if (sb->state != (1 << MD_SB_CLEAN)) {
		if (sb->level == 1) {
			printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
			goto abort;
		} else
			printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor)));
	}
#else
	if (sb->state != (1 << MD_SB_CLEAN)) {
		printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor)));
		goto abort;
	}
#endif /* SUPPORT_RECONSTRUCTION */

	switch (sb->level) {
		case 1:
			md_size[minor] = sb->size;
			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD;
			break;
		case 4:
		case 5:
			md_size[minor] = sb->size * (sb->raid_disks - 1);
			md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1);
			break;
		default:
			printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)),
				sb->level);
			goto abort;
	}
	return 0;
abort:
	free_sb(mddev);
	return 1;
}
#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef NOT_CLEAN
#undef NOT_CLEAN_IGNORE
#undef UNKNOWN_LEVEL
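
/*
 * Write the in-core superblock back to every member disk. The generic
 * part is shared by all members; only the per-device descriptor is
 * personalized (copied from the disks[] table) before each buffer is
 * written out and flushed synchronously.
 */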
int md_update_sb(int minor)
{
	struct md_dev *mddev = md_dev + minor;
	struct buffer_head *bh;
	md_superblock_t *sb = mddev->sb;
	struct real_dev *realdev;
	kdev_t dev;
	int i;
	u32 sb_offset;

	sb->utime = CURRENT_TIME;
	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = mddev->devices + i;
		if (!realdev->sb)
			continue;
		dev = realdev->dev;
		sb_offset = realdev->sb_offset;
		set_blocksize(dev, MD_SB_BYTES);
		printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset);
		bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
		if (bh) {
			sb = (md_superblock_t *) bh->b_data;
			memcpy(sb, mddev->sb, MD_SB_BYTES);
			memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4);
			mark_buffer_uptodate(bh, 1);
			mark_buffer_dirty(bh, 1);
			ll_rw_block(WRITE, 1, &bh);
			wait_on_buffer(bh);
			bforget(bh);
			fsync_dev(dev);
			invalidate_buffers(dev);
		} else
			printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
	}
	return 0;
}
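
/*
 * Start an array: validate the personality, make sure no member is
 * smaller than one chunk, analyze (or synthesize) the superblocks,
 * then hand the device over to the personality's run() method. For
 * the "real" RAID levels the superblock is marked not-clean so that
 * an unclean shutdown can be detected later.
 */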
static int do_md_run (int minor, int repart)
{
	int pnum, i, min, factor, err;

	if (!md_dev[minor].nb_dev)
		return -EINVAL;

	if (md_dev[minor].pers)
		return -EBUSY;

	md_dev[minor].repartition=repart;

	if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT))
			>= MAX_PERSONALITY)
		return -EINVAL;

	/* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
	if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){
		for (i = 0; i < md_dev [minor].nb_dev; i++)
			if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR)
				return -EINVAL;
	}
	if (!pers[pnum])
	{
#ifdef CONFIG_KMOD
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
			return -EINVAL;
	}

	factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor)));

	for (i=0; i<md_dev[minor].nb_dev; i++)
		if (md_dev[minor].devices[i].size<min)
		{
			printk ("Dev %s smaller than %dk, cannot shrink\n",
				partition_name (md_dev[minor].devices[i].dev), min);
			return -EINVAL;
		}

	for (i=0; i<md_dev[minor].nb_dev; i++) {
		fsync_dev(md_dev[minor].devices[i].dev);
		invalidate_buffers(md_dev[minor].devices[i].dev);
	}

	/* Resize devices according to the factor. It is used to align
	   partition sizes on a given chunk size. */
	md_size[minor]=0;

	/*
	 * Analyze the raid superblock
	 */
	if (analyze_sbs(minor, pnum))
		return -EINVAL;

	md_dev[minor].pers=pers[pnum];

	if ((err=md_dev[minor].pers->run (minor, md_dev+minor)))
	{
		md_dev[minor].pers=NULL;
		free_sb(md_dev + minor);
		return (err);
	}

	if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT)
	{
		md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN);
		md_update_sb(minor);
	}

	/* FIXME : We assume here we have blocks
	   that are twice as large as sectors.
	   THIS MAY NOT BE TRUE !!! */
	md_hd_struct[minor].start_sect=0;
	md_hd_struct[minor].nr_sects=md_size[minor]<<1;

	read_ahead[MD_MAJOR] = 128;
	return (0);
}
static int do_md_stop (int minor, struct inode *inode)
{
	int i;

	if (inode->i_count>1 || md_dev[minor].busy>1) {
		/*
		 * ioctl : one open channel
		 */
		printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
			minor, inode->i_count, md_dev[minor].busy);
		return -EBUSY;
	}

	if (md_dev[minor].pers) {
		/*
		 * It is safe to call stop here, it only frees private
		 * data. Also, it tells us if a device is unstoppable
		 * (eg. resyncing is in progress)
		 */
		if (md_dev[minor].pers->stop (minor, md_dev+minor))
			return -EBUSY;
		/*
		 * The device won't exist anymore -> flush it now
		 */
		fsync_dev (inode->i_rdev);
		invalidate_buffers (inode->i_rdev);
		if (md_dev[minor].sb) {
			md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
			md_update_sb(minor);
		}
	}

	/* Remove locks. */
	if (md_dev[minor].sb)
		free_sb(md_dev + minor);
	for (i=0; i<md_dev[minor].nb_dev; i++)
		clear_inode (md_dev[minor].devices[i].inode);

	md_dev[minor].nb_dev=md_size[minor]=0;
	md_hd_struct[minor].nr_sects=0;
	md_dev[minor].pers=NULL;

	read_ahead[MD_MAJOR] = 128;

	return (0);
}
static int do_md_add (int minor, kdev_t dev)
{
	int i;
	int hot_add=0;
	struct real_dev *realdev;

	if (md_dev[minor].nb_dev==MAX_REAL)
		return -EINVAL;

	if (!fs_may_mount (dev))
		return -EBUSY;

	if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) {
		printk("md_add(): zero device size, huh, bailing out.\n");
		return -EINVAL;
	}

	if (md_dev[minor].pers) {
		/*
		 * The array is already running, hot-add the drive, or
		 * bail out:
		 */
		if (!md_dev[minor].pers->hot_add_disk)
			return -EBUSY;
		else
			hot_add=1;
	}

	/*
	 * Careful. We cannot increase nb_dev for a running array.
	 */
	i=md_dev[minor].nb_dev;
	realdev = &md_dev[minor].devices[i];
	realdev->dev=dev;

	/* Lock the device by inserting a dummy inode. This doesn't
	   smell very good, but I need to be consistent with the
	   mount stuff, especially with fs_may_mount. If someone has
	   a better idea, please help! */

	realdev->inode=get_empty_inode ();
	if (!realdev->inode)
		return -ENOMEM;
	realdev->inode->i_dev=dev;	/* don't care about other fields */
	insert_inode_hash (realdev->inode);

	/* Sizes are now rounded at run time */

/*	md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/

	realdev->size=blk_size[MAJOR(dev)][MINOR(dev)];

	if (hot_add) {
		/*
		 * Check the superblock for consistency.
		 * The personality itself has to check whether it's getting
		 * added with the proper flags. The personality has to be
		 * checked too. ;)
		 */
		if (analyze_one_sb (realdev))
			return -EINVAL;
		/*
		 * hot_add has to bump up nb_dev itself
		 */
		if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) {
			/*
			 * FIXME: here we should free up the inode and stuff
			 */
			printk ("FIXME\n");
			return -EINVAL;
		}
	} else
		md_dev[minor].nb_dev++;

	printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor);
	return (0);
}
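
/*
 * Note on minor numbers: an ioctl on a minor with bit 7 set is
 * forwarded to the ioctl handler of the personality encoded in the
 * low 7 bits, which lets a personality module expose private ioctls.
 * Ordinary array minors fall through to the switch below.
 */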
static int md_ioctl (struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	int minor, err;
	struct hd_geometry *loc = (struct hd_geometry *) arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (((minor=MINOR(inode->i_rdev)) & 0x80) &&
	    (minor & 0x7f) < MAX_PERSONALITY &&
	    pers[minor & 0x7f] &&
	    pers[minor & 0x7f]->ioctl)
		return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg));

	if (minor >= MAX_MD_DEV)
		return -EINVAL;

	switch (cmd)
	{
		case REGISTER_DEV:
			return do_md_add (minor, to_kdev_t ((dev_t) arg));

		case START_MD:
			return do_md_run (minor, (int) arg);

		case STOP_MD:
			return do_md_stop (minor, inode);

		case BLKGETSIZE:	/* Return device size */
			if (!arg) return -EINVAL;
			err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg);
			if (err)
				return err;
			break;

		/* We have a problem here : there is no easy way to give a CHS
		   virtual geometry. We currently pretend to have 2 heads and
		   4 sectors per track (so cylinders = nr_sects/8, a BIG
		   number of them). This drives dosfs just mad... ;-) */

		case HDIO_GETGEO:
			if (!loc) return -EINVAL;
			err = put_user (2, (char *) &loc->heads);
			if (err)
				return err;
			err = put_user (4, (char *) &loc->sectors);
			if (err)
				return err;
			err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders);
			if (err)
				return err;
			err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect,
						(long *) &loc->start);
			if (err)
				return err;
			break;

		case BLKROSET:
		case BLKROGET:
		case BLKRAGET:
		case BLKRASET:
		case BLKFLSBUF:
			return blk_ioctl(inode->i_rdev, cmd, arg);

		default:
			return -EINVAL;
	}

	return (0);
}
static int md_open (struct inode *inode, struct file *file)
{
	int minor=MINOR(inode->i_rdev);

	md_dev[minor].busy++;
	return (0);			/* Always succeed */
}

static int md_release (struct inode *inode, struct file *file)
{
	int minor=MINOR(inode->i_rdev);

	sync_dev (inode->i_rdev);
	md_dev[minor].busy--;
	return 0;
}
static ssize_t md_read (struct file *file, char *buf, size_t count,
			loff_t *ppos)
{
	int minor=MINOR(file->f_dentry->d_inode->i_rdev);

	if (!md_dev[minor].pers)	/* Check if device is being run */
		return -ENXIO;

	return block_read(file, buf, count, ppos);
}

static ssize_t md_write (struct file *file, const char *buf,
			 size_t count, loff_t *ppos)
{
	int minor=MINOR(file->f_dentry->d_inode->i_rdev);

	if (!md_dev[minor].pers)	/* Check if device is being run */
		return -ENXIO;

	return block_write(file, buf, count, ppos);
}
static struct file_operations md_fops=
{
	NULL,			/* llseek */
	md_read,		/* read */
	md_write,		/* write */
	NULL,			/* readdir */
	NULL,			/* poll */
	md_ioctl,		/* ioctl */
	NULL,			/* mmap */
	md_open,		/* open */
	NULL,			/* flush */
	md_release,		/* release */
	block_fsync		/* fsync */
};
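
/*
 * md_map() translates a request against the virtual md device into a
 * (real device, sector) pair by delegating to the personality's map()
 * method. Returns -1 if the minor is invalid or the array is not
 * running.
 */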
int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size)
{
	if ((unsigned int) minor >= MAX_MD_DEV)
	{
		printk ("Bad md device %d\n", minor);
		return (-1);
	}

	if (!md_dev[minor].pers)
	{
		printk ("Oops ! md%d not running, giving up !\n", minor);
		return (-1);
	}

	return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size));
}
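
/*
 * Personalities that supply their own make_request() (RAID-1/RAID-5)
 * get the buffer head directly; the buffer-state shortcuts below avoid
 * queueing writes of clean buffers and reads of buffers that are
 * already up to date. Everything else goes through the regular
 * make_request() path against the already-remapped b_rdev.
 */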
int md_make_request (int minor, int rw, struct buffer_head * bh)
{
	if (md_dev [minor].pers->make_request) {
		if (buffer_locked(bh))
			return 0;
		set_bit(BH_Lock, &bh->b_state);
		if (rw == WRITE) {
			if (!buffer_dirty(bh)) {
				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
				return 0;
			}
		}
		if (rw == READ || rw == READA) {
			if (buffer_uptodate(bh)) {
				bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
				return 0;
			}
		}
		return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh));
	} else {
		make_request (MAJOR(bh->b_rdev), rw, bh);
		return 0;
	}
}
static void do_md_request (void)
{
	printk ("Got md request, not good...\n");
	return;
}
void md_wakeup_thread(struct md_thread *thread)
{
	set_bit(THREAD_WAKEUP, &thread->flags);
	wake_up(&thread->wqueue);
}
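
/*
 * Thread creation handshake: md_register_thread() spawns the daemon
 * via kernel_thread() and then sleeps on a locked semaphore; the new
 * thread stores its task pointer and does up() once it is fully set
 * up, so registration never returns a half-initialized thread.
 */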
struct md_thread *md_register_thread (void (*run) (void *), void *data)
{
	struct md_thread *thread = (struct md_thread *)
		kmalloc(sizeof(struct md_thread), GFP_KERNEL);
	int ret;
	DECLARE_MUTEX_LOCKED(sem);

	if (!thread) return NULL;

	memset(thread, 0, sizeof(struct md_thread));
	init_waitqueue_head(&thread->wqueue);

	thread->sem = &sem;
	thread->run = run;
	thread->data = data;
	ret = kernel_thread(md_thread, thread, 0);
	if (ret < 0) {
		kfree(thread);
		return NULL;
	}
	down(&sem);
	return thread;
}
void md_unregister_thread (struct md_thread *thread)
{
	DECLARE_MUTEX_LOCKED(sem);

	thread->sem = &sem;
	thread->run = NULL;
	if (thread->tsk)
		printk("Killing md_thread %d %p %s\n",
			thread->tsk->pid, thread->tsk, thread->tsk->comm);
	else
		printk("Aiee. md_thread has 0 tsk\n");
	send_sig(SIGKILL, thread->tsk, 1);
	printk("downing on %p\n", &sem);
	down(&sem);
}
#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
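
/*
 * Body of an md daemon thread. It detaches from user resources,
 * blocks everything but SHUTDOWN_SIGS, then loops: sleep until
 * THREAD_WAKEUP is set (or a shutdown signal clears thread->run),
 * call the service routine, and kick the disk task queue.
 */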
int md_thread(void * arg)
{
	struct md_thread *thread = arg;

	lock_kernel();
	exit_mm(current);
	exit_files(current);
	exit_fs(current);

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "md_thread");
	siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
	thread->tsk = current;
	up(thread->sem);

	for (;;) {
		cli();
		if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
			do {
				spin_lock(&current->sigmask_lock);
				flush_signals(current);
				spin_unlock(&current->sigmask_lock);
				interruptible_sleep_on(&thread->wqueue);
				cli();
				if (test_bit(THREAD_WAKEUP, &thread->flags))
					break;
				if (!thread->run) {
					sti();
					up(thread->sem);
					return 0;
				}
			} while (signal_pending(current));
		}
		sti();
		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (thread->run) {
			thread->run(thread->data);
			run_task_queue(&tq_disk);
		}
	}
}
EXPORT_SYMBOL(md_size);
EXPORT_SYMBOL(md_maxreadahead);
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_dev);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_update_sb);
EXPORT_SYMBOL(md_map);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_do_sync);
#ifdef CONFIG_PROC_FS
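/*
 * /proc/mdstat: lists the registered personalities, the current
 * read_ahead setting, and one line per md device with its members,
 * total size in 1K blocks, and the personality's own status output.
 * The off/count juggling implements the usual read_proc windowing.
 */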
static int md_status_read_proc(char *page, char **start, off_t off,
			int count, int *eof, void *data)
{
	int sz = 0, i, j, size;
	int begin = 0;

	sz=sprintf( page, "Personalities : ");
	for (i=0; i<MAX_PERSONALITY; i++)
		if (pers[i])
			sz+=sprintf (page+sz, "[%d %s] ", i, pers[i]->name);
	page[sz-1]='\n';

	sz+=sprintf (page+sz, "read_ahead ");
	if (read_ahead[MD_MAJOR]==INT_MAX)
		sz+=sprintf (page+sz, "not set\n");
	else
		sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);

	for (i=0; i<MAX_MD_DEV; i++) {
		if (sz < off) {
			begin += sz;
			off -= sz;
			sz = 0;
		}
		if (sz >= off+count) {
			*eof = 1;
			break;
		}
		sz+=sprintf (page+sz, "md%d : %sactive",
				i, md_dev[i].pers ? "" : "in");

		if (md_dev[i].pers)
			sz+=sprintf (page+sz, " %s", md_dev[i].pers->name);

		for (j=0, size=0; j<md_dev[i].nb_dev; j++) {
			sz+=sprintf (page+sz, " %s",
				partition_name(md_dev[i].devices[j].dev));
			size+=md_dev[i].devices[j].size;
		}

		if (md_dev[i].nb_dev) {
			if (md_dev[i].pers)
				sz+=sprintf (page+sz, " %d blocks", md_size[i]);
			else
				sz+=sprintf (page+sz, " %d blocks", size);
		}

		if (!md_dev[i].pers) {
			sz+=sprintf (page+sz, "\n");
			continue;
		}

		if (md_dev[i].pers->max_invalid_dev)
			sz+=sprintf (page+sz, " maxfault=%ld",
				MAX_FAULT(md_dev+i));

		sz+=md_dev[i].pers->status (page+sz, i, md_dev+i);
		sz+=sprintf (page+sz, "\n");
	}

	sz -= off;
	*start = page + off;
	if (sz>count)
		sz = count;
	if (sz<0)
		sz = 0;
	return sz;
}
#endif
static void md_geninit (struct gendisk *gdisk)
{
	int i;

	for(i=0;i<MAX_MD_DEV;i++)
	{
		md_blocksizes[i] = 1024;
		md_maxreadahead[i] = MD_DEFAULT_DISK_READAHEAD;
		md_gendisk.part[i].start_sect=-1; /* avoid partition check */
		md_gendisk.part[i].nr_sects=0;
		md_dev[i].pers=NULL;
	}

	blksize_size[MD_MAJOR] = md_blocksizes;
	max_readahead[MD_MAJOR] = md_maxreadahead;

#ifdef CONFIG_PROC_FS
	create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
#endif
}
int md_error (kdev_t mddev, kdev_t rdev)
{
	unsigned int minor = MINOR (mddev);
	int rc;

	if (MAJOR(mddev) != MD_MAJOR || minor >= MAX_MD_DEV)
		panic ("md_error gets unknown device\n");
	if (!md_dev [minor].pers)
		panic ("md_error gets an error for an unknown device\n");
	if (md_dev [minor].pers->error_handler) {
		rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev);
#if SUPPORT_RECONSTRUCTION
		md_wakeup_thread(md_sync_thread);
#endif /* SUPPORT_RECONSTRUCTION */
		return rc;
	}
	return 0;
}
int register_md_personality (int p_num, struct md_personality *p)
{
	int i=(p_num >> PERSONALITY_SHIFT);

	if (i >= MAX_PERSONALITY)
		return -EINVAL;

	if (pers[i])
		return -EBUSY;

	pers[i]=p;
	printk ("%s personality registered\n", p->name);
	return 0;
}

int unregister_md_personality (int p_num)
{
	int i=(p_num >> PERSONALITY_SHIFT);

	if (i >= MAX_PERSONALITY)
		return -EINVAL;

	printk ("%s personality unregistered\n", pers[i]->name);
	pers[i]=NULL;
	return 0;
}
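
/*
 * Pick a spare: walk the member disks and return the first descriptor
 * that is neither faulty nor already active, i.e. a disk that can be
 * synced into the array. Returns NULL when no usable spare exists.
 */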
static md_descriptor_t *get_spare(struct md_dev *mddev)
{
	int i;
	md_superblock_t *sb = mddev->sb;
	md_descriptor_t *descriptor;
	struct real_dev *realdev;

	for (i = 0; i < mddev->nb_dev; i++) {
		realdev = &mddev->devices[i];
		if (!realdev->sb)
			continue;
		descriptor = &sb->disks[realdev->sb->descriptor.number];
		if (descriptor->state & (1 << MD_FAULTY_DEVICE))
			continue;
		if (descriptor->state & (1 << MD_ACTIVE_DEVICE))
			continue;
		return descriptor;
	}
	return NULL;
}
/*
 * parallel resyncing thread.
 *
 * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
 *        - fix read error handling
 */
int md_do_sync(struct md_dev *mddev)
{
	struct buffer_head *bh;
	int max_blocks, blocksize, curr_bsize, percent=1, j;
	kdev_t read_disk = MKDEV(MD_MAJOR, mddev - md_dev);
	int major = MAJOR(read_disk), minor = MINOR(read_disk);
	unsigned long starttime;

	blocksize = blksize_size[major][minor];
	max_blocks = blk_size[major][minor] / (blocksize >> 10);

	printk("... resync log\n");
	printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev);
	printk(" .... raid array: %s\n", kdevname(read_disk));
	printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize);
	printk("md: syncing RAID array %s\n", kdevname(read_disk));

	mddev->busy++;

	starttime=jiffies;
	for (j = 0; j < max_blocks; j++) {

		/*
		 * Be careful. When someone mounts a non-'blocksize' filesystem
		 * then we get the blocksize changed right under us. Go deal
		 * with it transparently, recalculate 'blocksize', 'j' and
		 * 'max_blocks':
		 */
		curr_bsize = blksize_size[major][minor];
		if (curr_bsize != blocksize) {
	diff_blocksize:
			if (curr_bsize > blocksize)
				/*
				 * this is safe, rounds downwards.
				 */
				j /= curr_bsize/blocksize;
			else
				j *= blocksize/curr_bsize;

			blocksize = curr_bsize;
			max_blocks = blk_size[major][minor] / (blocksize >> 10);
		}
		if ((bh = breada (read_disk, j, blocksize, j * blocksize,
					max_blocks * blocksize)) != NULL) {
			mark_buffer_dirty(bh, 1);
			brelse(bh);
		} else {
			/*
			 * FIXME: Ugly, but set_blocksize() isn't safe ...
			 */
			curr_bsize = blksize_size[major][minor];
			if (curr_bsize != blocksize)
				goto diff_blocksize;

			/*
			 * It's a real read problem. FIXME, handle this
			 * a better way.
			 */
			printk ( KERN_ALERT
				 "read error, stopping reconstruction.\n");
			mddev->busy--;
			return 1;
		}

		/*
		 * Let's sleep some if we are faster than our speed limit:
		 * blocksize*j bytes done over (jiffies-starttime) ticks,
		 * scaled by HZ/1024, gives the current rate in KB/sec.
		 */
		while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT)
		{
			current->state = TASK_INTERRUPTIBLE;
			schedule_timeout(1);
		}

		/*
		 * FIXME: put this status bar thing into /proc
		 */
		if (!(j%(max_blocks/100))) {
			if (!(percent%10))
				printk (" %03d%% done.\n", percent);
			else
				printk (".");
			percent++;
		}
	}
	fsync_dev(read_disk);
	printk("md: %s: sync done.\n", kdevname(read_disk));
	mddev->busy--;
	return 0;
}
/*
 * This is a kernel thread which syncs a spare disk with the active array
 *
 * the amount of foolproofing might seem to be a tad excessive, but an
 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
 * of my root partition with the first 0.5 gigs of my /home partition ... so
 * i'm a bit nervous ;)
 */
void mdsyncd (void *data)
{
	int i;
	struct md_dev *mddev;
	md_superblock_t *sb;
	md_descriptor_t *spare;
	unsigned long flags;

	for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) {
		if ((sb = mddev->sb) == NULL)
			continue;
		if (sb->active_disks == sb->raid_disks)
			continue;
		if (!sb->spare_disks)
			continue;
		if ((spare = get_spare(mddev)) == NULL)
			continue;
		if (!mddev->pers->mark_spare)
			continue;
		if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE))
			continue;

		if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) {
			mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE);
			continue;
		}
		save_flags(flags);
		cli();
		mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE);
		spare->state |= (1 << MD_SYNC_DEVICE);
		spare->state |= (1 << MD_ACTIVE_DEVICE);
		sb->spare_disks--;
		sb->active_disks++;
		mddev->sb_dirty = 1;
		md_update_sb(mddev - md_dev);
		restore_flags(flags);
	}
}
#ifdef CONFIG_MD_BOOT
struct {
	int set;
	int ints[100];
	char str[100];
} md_setup_args __initdata = {
	0,{0},{0}
};

/* called from init/main.c */
void __init md_setup(char *str,int *ints)
{
	int i;
	for(i=0;i<=ints[0];i++) {
		md_setup_args.ints[i] = ints[i];
		strcpy(md_setup_args.str, str);
		/* printk ("md: ints[%d]=%d.\n", i, ints[i]);*/
	}
	md_setup_args.set=1;
	return;
}
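
/*
 * Parse the saved "md=" boot arguments: ints[1] is the md minor,
 * ints[2] the raid level (-1 linear, 0 striped), ints[3] the chunk
 * size factor and ints[4] the fault level, while str holds a
 * comma-separated list of component device names.
 */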
void __init do_md_setup(char *str,int *ints)
{
	int minor, pers, factor, fault;
	kdev_t dev;
	int i=1;

	if(ints[0] < 4) {
		printk ("md: Too few arguments (%d).\n", ints[0]);
		return;
	}

	minor=ints[i++];

	if (minor >= MAX_MD_DEV) {
		printk ("md: Minor device number too high.\n");
		return;
	}

	pers = 0;

	switch(ints[i++]) {	/* Raid level */
	case -1:
#ifdef CONFIG_MD_LINEAR
		pers = LINEAR;
		printk ("md: Setting up md%d as linear device.\n",minor);
#else
		printk ("md: Linear mode not configured. "
			"Recompile the kernel with linear mode enabled!\n");
#endif
		break;
	case 0:
#ifdef CONFIG_MD_STRIPED
		pers = STRIPED;
		printk ("md: Setting up md%d as a striped device.\n",minor);
#else
		printk ("md: Striped mode not configured. "
			"Recompile the kernel with striped mode enabled!\n");
#endif
		break;
/* not supported yet
	case 1:
		pers = RAID1;
		printk ("md: Setting up md%d as a raid1 device.\n",minor);
		break;
	case 5:
		pers = RAID5;
		printk ("md: Setting up md%d as a raid5 device.\n",minor);
		break;
*/
	default:
		printk ("md: Unknown or unsupported raid level %d.\n", ints[--i]);
		return;
	}

	if(pers) {

		factor=ints[i++];	/* Chunk size */
		fault =ints[i++];	/* Fault level */

		pers=pers | factor | (fault << FAULT_SHIFT);

		while( str && (dev = name_to_kdev_t(str))) {
			do_md_add (minor, dev);
			if((str = strchr (str, ',')) != NULL)
				str++;
		}

		do_md_run (minor, pers);
		printk ("md: Loading md%d.\n",minor);
	}
}

#endif
void linear_init (void);
void raid0_init (void);
void raid1_init (void);
void raid5_init (void);
int __init md_init (void)
{
	printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
		MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION,
		MAX_MD_DEV, MAX_REAL);

	if (register_blkdev (MD_MAJOR, "md", &md_fops))
	{
		printk ("Unable to get major %d for md\n", MD_MAJOR);
		return (-1);
	}

	blk_dev[MD_MAJOR].request_fn=DEVICE_REQUEST;
	blk_dev[MD_MAJOR].current_request=NULL;
	read_ahead[MD_MAJOR]=INT_MAX;
	memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev));
	md_gendisk.next=gendisk_head;

	gendisk_head=&md_gendisk;

#if SUPPORT_RECONSTRUCTION
	if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL)
		printk("md: bug: md_sync_thread == NULL\n");
#endif /* SUPPORT_RECONSTRUCTION */

#ifdef CONFIG_MD_LINEAR
	linear_init ();
#endif
#ifdef CONFIG_MD_STRIPED
	raid0_init ();
#endif
#ifdef CONFIG_MD_MIRRORING
	raid1_init ();
#endif
#ifdef CONFIG_MD_RAID5
	raid5_init ();
#endif
	return (0);
}
#ifdef CONFIG_MD_BOOT
void __init md_setup_drive(void)
{
	if(md_setup_args.set)
		do_md_setup(md_setup_args.str, md_setup_args.ints);
}
#endif