3 md.c : Multiple Devices driver for Linux
4 Copyright (C) 1994-96 Marc ZYNGIER
5 <zyngier@ufr-info-p7.ibp.fr> or
8 A lot of inspiration came from hd.c ...
10 kerneld support by Boris Tobotras <boris@xtalk.msk.su>
11 boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13 RAID-1/RAID-5 extensions by:
14 Ingo Molnar, Miguel de Icaza, Gadi Oxman
19 This program is free software; you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation; either version 2, or (at your option)
24 You should have received a copy of the GNU General Public License
25 (for example /usr/src/linux/COPYING); if not, write to the Free
26 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30 * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
31 * the extra system load does not show up that much. Increase it if your
32 * system can take more.
34 #define SPEED_LIMIT 1024
36 #include <linux/config.h>
37 #include <linux/module.h>
38 #include <linux/version.h>
39 #include <linux/malloc.h>
42 #include <linux/hdreg.h>
43 #include <linux/stat.h>
45 #include <linux/proc_fs.h>
46 #include <linux/blkdev.h>
47 #include <linux/genhd.h>
48 #include <linux/smp_lock.h>
50 #include <linux/kmod.h>
52 #include <linux/errno.h>
53 #include <linux/init.h>
55 #define __KERNEL_SYSCALLS__
56 #include <linux/unistd.h>
58 #define MAJOR_NR MD_MAJOR
61 #include <linux/blk.h>
62 #include <asm/uaccess.h>
63 #include <asm/bitops.h>
64 #include <asm/atomic.h>
67 extern kdev_t
name_to_kdev_t(char *line
) __init
;
70 static struct hd_struct md_hd_struct
[MAX_MD_DEV
];
71 static int md_blocksizes
[MAX_MD_DEV
];
72 int md_maxreadahead
[MAX_MD_DEV
];
73 #if SUPPORT_RECONSTRUCTION
74 static struct md_thread
*md_sync_thread
= NULL
;
75 #endif /* SUPPORT_RECONSTRUCTION */
77 int md_size
[MAX_MD_DEV
]={0, };
79 static void md_geninit (struct gendisk
*);
81 static struct gendisk md_gendisk
=
96 static struct md_personality
*pers
[MAX_PERSONALITY
]={NULL
, };
97 struct md_dev md_dev
[MAX_MD_DEV
];
99 int md_thread(void * arg
);
101 static struct gendisk
*find_gendisk (kdev_t dev
)
103 struct gendisk
*tmp
=gendisk_head
;
107 if (tmp
->major
==MAJOR(dev
))
116 char *partition_name (kdev_t dev
)
118 static char name
[40]; /* This should be long
119 enough for a device name ! */
120 struct gendisk
*hd
= find_gendisk (dev
);
124 sprintf (name
, "[dev %s]", kdevname(dev
));
128 return disk_name (hd
, MINOR(dev
), name
); /* routine in genhd.c */
131 static int legacy_raid_sb (int minor
, int pnum
)
135 factor
= 1 << FACTOR_SHIFT(FACTOR((md_dev
+minor
)));
138 * do size and offset calculations.
140 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++) {
141 md_dev
[minor
].devices
[i
].size
&= ~(factor
- 1);
142 md_size
[minor
] += md_dev
[minor
].devices
[i
].size
;
143 md_dev
[minor
].devices
[i
].offset
=i
? (md_dev
[minor
].devices
[i
-1].offset
+
144 md_dev
[minor
].devices
[i
-1].size
) : 0;
146 if (pnum
== RAID0
>> PERSONALITY_SHIFT
)
147 md_maxreadahead
[minor
] = MD_DEFAULT_DISK_READAHEAD
* md_dev
[minor
].nb_dev
;
151 static void free_sb (struct md_dev
*mddev
)
154 struct real_dev
*realdev
;
157 free_page((unsigned long) mddev
->sb
);
160 for (i
= 0; i
<mddev
->nb_dev
; i
++) {
161 realdev
= mddev
->devices
+ i
;
163 free_page((unsigned long) realdev
->sb
);
170 * Check one RAID superblock for generic plausibility
173 #define BAD_MAGIC KERN_ERR \
174 "md: %s: invalid raid superblock magic (%x) on block %u\n"
176 #define OUT_OF_MEM KERN_ALERT \
177 "md: out of memory.\n"
179 #define NO_DEVICE KERN_ERR \
180 "md: disabled device %s\n"
185 static int analyze_one_sb (struct real_dev
* rdev
)
188 struct buffer_head
*bh
;
189 kdev_t dev
= rdev
->dev
;
193 * Read the superblock, it's at the end of the disk
195 rdev
->sb_offset
= MD_NEW_SIZE_BLOCKS (blk_size
[MAJOR(dev
)][MINOR(dev
)]);
196 set_blocksize (dev
, MD_SB_BYTES
);
197 bh
= bread (dev
, rdev
->sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
200 sb
= (md_superblock_t
*) bh
->b_data
;
201 if (sb
->md_magic
!= MD_SB_MAGIC
) {
202 printk (BAD_MAGIC
, kdevname(dev
),
203 sb
->md_magic
, rdev
->sb_offset
);
206 rdev
->sb
= (md_superblock_t
*) __get_free_page(GFP_KERNEL
);
211 memcpy (rdev
->sb
, bh
->b_data
, MD_SB_BYTES
);
213 rdev
->size
= sb
->size
;
215 printk (NO_DEVICE
,kdevname(rdev
->dev
));
231 * Check a full RAID array for plausibility
234 #define INCONSISTENT KERN_ERR \
235 "md: superblock inconsistency -- run ckraid\n"
/* Superblock update times disagree across member disks; the newest
   copy will be taken as authoritative. ("inconsistency" typo fixed.) */
#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"
240 #define OLD_VERSION KERN_ALERT \
241 "md: %s: unsupported raid array version %d.%d.%d\n"
243 #define NOT_CLEAN KERN_ERR \
244 "md: %s: raid array is not clean -- run ckraid\n"
246 #define NOT_CLEAN_IGNORE KERN_ERR \
247 "md: %s: raid array is not clean -- reconstructing parity\n"
249 #define UNKNOWN_LEVEL KERN_ERR \
250 "md: %s: unsupported raid level %d\n"
252 static int analyze_sbs (int minor
, int pnum
)
254 struct md_dev
*mddev
= md_dev
+ minor
;
255 int i
, N
= mddev
->nb_dev
, out_of_date
= 0;
256 struct real_dev
* disks
= mddev
->devices
;
257 md_superblock_t
*sb
, *freshest
= NULL
;
260 * RAID-0 and linear don't use a RAID superblock
262 if (pnum
== RAID0
>> PERSONALITY_SHIFT
||
263 pnum
== LINEAR
>> PERSONALITY_SHIFT
)
264 return legacy_raid_sb (minor
, pnum
);
267 * Verify the RAID superblock on each real device
269 for (i
= 0; i
< N
; i
++)
270 if (analyze_one_sb(disks
+i
))
274 * The superblock constant part has to be the same
275 * for all disks in the array.
278 for (i
= 0; i
< N
; i
++) {
286 disks
[i
].sb
, MD_SB_GENERIC_CONSTANT_WORDS
* 4)) {
287 printk (INCONSISTENT
);
293 * OK, we have all disks and the array is ready to run. Let's
294 * find the freshest superblock, that one will be the superblock
295 * that represents the whole array.
297 if ((sb
= mddev
->sb
= (md_superblock_t
*) __get_free_page (GFP_KERNEL
)) == NULL
)
300 for (i
= 0; i
< N
; i
++) {
304 freshest
= disks
[i
].sb
;
308 * Find the newest superblock version
310 if (disks
[i
].sb
->utime
!= freshest
->utime
) {
312 if (disks
[i
].sb
->utime
> freshest
->utime
)
313 freshest
= disks
[i
].sb
;
318 memcpy (sb
, freshest
, sizeof(*freshest
));
321 * Check if we can support this RAID array
323 if (sb
->major_version
!= MD_MAJOR_VERSION
||
324 sb
->minor_version
> MD_MINOR_VERSION
) {
326 printk (OLD_VERSION
, kdevname(MKDEV(MD_MAJOR
, minor
)),
327 sb
->major_version
, sb
->minor_version
,
333 * We need to add this as a superblock option.
335 #if SUPPORT_RECONSTRUCTION
336 if (sb
->state
!= (1 << MD_SB_CLEAN
)) {
337 if (sb
->level
== 1) {
338 printk (NOT_CLEAN
, kdevname(MKDEV(MD_MAJOR
, minor
)));
341 printk (NOT_CLEAN_IGNORE
, kdevname(MKDEV(MD_MAJOR
, minor
)));
344 if (sb
->state
!= (1 << MD_SB_CLEAN
)) {
345 printk (NOT_CLEAN
, kdevname(MKDEV(MD_MAJOR
, minor
)));
348 #endif /* SUPPORT_RECONSTRUCTION */
352 md_size
[minor
] = sb
->size
;
353 md_maxreadahead
[minor
] = MD_DEFAULT_DISK_READAHEAD
;
357 md_size
[minor
] = sb
->size
* (sb
->raid_disks
- 1);
358 md_maxreadahead
[minor
] = MD_DEFAULT_DISK_READAHEAD
* (sb
->raid_disks
- 1);
361 printk (UNKNOWN_LEVEL
, kdevname(MKDEV(MD_MAJOR
, minor
)),
377 int md_update_sb(int minor
)
379 struct md_dev
*mddev
= md_dev
+ minor
;
380 struct buffer_head
*bh
;
381 md_superblock_t
*sb
= mddev
->sb
;
382 struct real_dev
*realdev
;
387 sb
->utime
= CURRENT_TIME
;
388 for (i
= 0; i
< mddev
->nb_dev
; i
++) {
389 realdev
= mddev
->devices
+ i
;
393 sb_offset
= realdev
->sb_offset
;
394 set_blocksize(dev
, MD_SB_BYTES
);
395 printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev
), sb_offset
);
396 bh
= getblk(dev
, sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
398 sb
= (md_superblock_t
*) bh
->b_data
;
399 memcpy(sb
, mddev
->sb
, MD_SB_BYTES
);
400 memcpy(&sb
->descriptor
, sb
->disks
+ realdev
->sb
->descriptor
.number
, MD_SB_DESCRIPTOR_WORDS
* 4);
401 mark_buffer_uptodate(bh
, 1);
402 mark_buffer_dirty(bh
, 1);
403 ll_rw_block(WRITE
, 1, &bh
);
407 invalidate_buffers(dev
);
409 printk(KERN_ERR
"md: getblk failed for device %s\n", kdevname(dev
));
414 static int do_md_run (int minor
, int repart
)
416 int pnum
, i
, min
, factor
, err
;
418 if (!md_dev
[minor
].nb_dev
)
421 if (md_dev
[minor
].pers
)
424 md_dev
[minor
].repartition
=repart
;
426 if ((pnum
=PERSONALITY(&md_dev
[minor
]) >> (PERSONALITY_SHIFT
))
430 /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
431 if (pnum
!= (RAID1
>> PERSONALITY_SHIFT
) && pnum
!= (RAID5
>> PERSONALITY_SHIFT
)){
432 for (i
= 0; i
< md_dev
[minor
].nb_dev
; i
++)
433 if (MAJOR (md_dev
[minor
].devices
[i
].dev
) == MD_MAJOR
)
439 char module_name
[80];
440 sprintf (module_name
, "md-personality-%d", pnum
);
441 request_module (module_name
);
447 factor
= min
= 1 << FACTOR_SHIFT(FACTOR((md_dev
+minor
)));
449 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++)
450 if (md_dev
[minor
].devices
[i
].size
<min
)
452 printk ("Dev %s smaller than %dk, cannot shrink\n",
453 partition_name (md_dev
[minor
].devices
[i
].dev
), min
);
457 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++) {
458 fsync_dev(md_dev
[minor
].devices
[i
].dev
);
459 invalidate_buffers(md_dev
[minor
].devices
[i
].dev
);
462 /* Resize devices according to the factor. It is used to align
463 partitions size on a given chunk size. */
467 * Analyze the raid superblock
469 if (analyze_sbs(minor
, pnum
))
472 md_dev
[minor
].pers
=pers
[pnum
];
474 if ((err
=md_dev
[minor
].pers
->run (minor
, md_dev
+minor
)))
476 md_dev
[minor
].pers
=NULL
;
477 free_sb(md_dev
+ minor
);
481 if (pnum
!= RAID0
>> PERSONALITY_SHIFT
&& pnum
!= LINEAR
>> PERSONALITY_SHIFT
)
483 md_dev
[minor
].sb
->state
&= ~(1 << MD_SB_CLEAN
);
487 /* FIXME : We assume here we have blocks
488 that are twice as large as sectors.
489 THIS MAY NOT BE TRUE !!! */
490 md_hd_struct
[minor
].start_sect
=0;
491 md_hd_struct
[minor
].nr_sects
=md_size
[minor
]<<1;
493 read_ahead
[MD_MAJOR
] = 128;
497 static int do_md_stop (int minor
, struct inode
*inode
)
501 if (inode
->i_count
>1 || md_dev
[minor
].busy
>1) {
503 * ioctl : one open channel
505 printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
506 minor
, inode
->i_count
, md_dev
[minor
].busy
);
510 if (md_dev
[minor
].pers
) {
512 * It is safe to call stop here, it only frees private
513 * data. Also, it tells us if a device is unstoppable
514 * (eg. resyncing is in progress)
516 if (md_dev
[minor
].pers
->stop (minor
, md_dev
+minor
))
519 * The device won't exist anymore -> flush it now
521 fsync_dev (inode
->i_rdev
);
522 invalidate_buffers (inode
->i_rdev
);
523 if (md_dev
[minor
].sb
) {
524 md_dev
[minor
].sb
->state
|= 1 << MD_SB_CLEAN
;
530 if (md_dev
[minor
].sb
)
531 free_sb(md_dev
+ minor
);
532 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++)
533 clear_inode (md_dev
[minor
].devices
[i
].inode
);
535 md_dev
[minor
].nb_dev
=md_size
[minor
]=0;
536 md_hd_struct
[minor
].nr_sects
=0;
537 md_dev
[minor
].pers
=NULL
;
539 read_ahead
[MD_MAJOR
] = 128;
544 static int do_md_add (int minor
, kdev_t dev
)
548 struct real_dev
*realdev
;
550 if (md_dev
[minor
].nb_dev
==MAX_REAL
)
553 if (!fs_may_mount (dev
))
556 if (blk_size
[MAJOR(dev
)] == NULL
|| blk_size
[MAJOR(dev
)][MINOR(dev
)] == 0) {
557 printk("md_add(): zero device size, huh, bailing out.\n");
561 if (md_dev
[minor
].pers
) {
563 * The array is already running, hot-add the drive, or
566 if (!md_dev
[minor
].pers
->hot_add_disk
)
573 * Careful. We cannot increase nb_dev for a running array.
575 i
=md_dev
[minor
].nb_dev
;
576 realdev
= &md_dev
[minor
].devices
[i
];
579 /* Lock the device by inserting a dummy inode. This doesn't
580 smell very good, but I need to be consistent with the
581 mount stuff, specially with fs_may_mount. If someone have
582 a better idea, please help ! */
584 realdev
->inode
=get_empty_inode ();
585 realdev
->inode
->i_dev
=dev
; /* don't care about other fields */
586 insert_inode_hash (realdev
->inode
);
588 /* Sizes are now rounded at run time */
590 /* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
592 realdev
->size
=blk_size
[MAJOR(dev
)][MINOR(dev
)];
596 * Check the superblock for consistency.
597 * The personality itself has to check whether it's getting
598 * added with the proper flags. The personality has to be
601 if (analyze_one_sb (realdev
))
604 * hot_add has to bump up nb_dev itself
606 if (md_dev
[minor
].pers
->hot_add_disk (&md_dev
[minor
], dev
)) {
608 * FIXME: here we should free up the inode and stuff
614 md_dev
[minor
].nb_dev
++;
616 printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev
), minor
);
620 static int md_ioctl (struct inode
*inode
, struct file
*file
,
621 unsigned int cmd
, unsigned long arg
)
624 struct hd_geometry
*loc
= (struct hd_geometry
*) arg
;
626 if (!capable(CAP_SYS_ADMIN
))
629 if (((minor
=MINOR(inode
->i_rdev
)) & 0x80) &&
630 (minor
& 0x7f) < MAX_PERSONALITY
&&
631 pers
[minor
& 0x7f] &&
632 pers
[minor
& 0x7f]->ioctl
)
633 return (pers
[minor
& 0x7f]->ioctl (inode
, file
, cmd
, arg
));
635 if (minor
>= MAX_MD_DEV
)
641 return do_md_add (minor
, to_kdev_t ((dev_t
) arg
));
644 return do_md_run (minor
, (int) arg
);
647 return do_md_stop (minor
, inode
);
649 case BLKGETSIZE
: /* Return device size */
650 if (!arg
) return -EINVAL
;
651 err
= put_user (md_hd_struct
[MINOR(inode
->i_rdev
)].nr_sects
, (long *) arg
);
657 fsync_dev (inode
->i_rdev
);
658 invalidate_buffers (inode
->i_rdev
);
664 read_ahead
[MAJOR(inode
->i_rdev
)] = arg
;
668 if (!arg
) return -EINVAL
;
669 err
= put_user (read_ahead
[MAJOR(inode
->i_rdev
)], (long *) arg
);
674 /* We have a problem here : there is no easy way to give a CHS
675 virtual geometry. We currently pretend that we have a 2 heads
676 4 sectors (with a BIG number of cylinders...). This drives dosfs
680 if (!loc
) return -EINVAL
;
681 err
= put_user (2, (char *) &loc
->heads
);
684 err
= put_user (4, (char *) &loc
->sectors
);
687 err
= put_user (md_hd_struct
[minor
].nr_sects
/8, (short *) &loc
->cylinders
);
690 err
= put_user (md_hd_struct
[MINOR(inode
->i_rdev
)].start_sect
,
691 (long *) &loc
->start
);
696 RO_IOCTLS(inode
->i_rdev
,arg
);
699 printk ("Unknown md_ioctl %d\n", cmd
);
706 static int md_open (struct inode
*inode
, struct file
*file
)
708 int minor
=MINOR(inode
->i_rdev
);
710 md_dev
[minor
].busy
++;
711 return (0); /* Always succeed */
715 static int md_release (struct inode
*inode
, struct file
*file
)
717 int minor
=MINOR(inode
->i_rdev
);
719 sync_dev (inode
->i_rdev
);
720 md_dev
[minor
].busy
--;
725 static ssize_t
md_read (struct file
*file
, char *buf
, size_t count
,
728 int minor
=MINOR(file
->f_dentry
->d_inode
->i_rdev
);
730 if (!md_dev
[minor
].pers
) /* Check if device is being run */
733 return block_read(file
, buf
, count
, ppos
);
736 static ssize_t
md_write (struct file
*file
, const char *buf
,
737 size_t count
, loff_t
*ppos
)
739 int minor
=MINOR(file
->f_dentry
->d_inode
->i_rdev
);
741 if (!md_dev
[minor
].pers
) /* Check if device is being run */
744 return block_write(file
, buf
, count
, ppos
);
747 static struct file_operations md_fops
=
762 int md_map (int minor
, kdev_t
*rdev
, unsigned long *rsector
, unsigned long size
)
764 if ((unsigned int) minor
>= MAX_MD_DEV
)
766 printk ("Bad md device %d\n", minor
);
770 if (!md_dev
[minor
].pers
)
772 printk ("Oops ! md%d not running, giving up !\n", minor
);
776 return (md_dev
[minor
].pers
->map(md_dev
+minor
, rdev
, rsector
, size
));
779 int md_make_request (int minor
, int rw
, struct buffer_head
* bh
)
781 if (md_dev
[minor
].pers
->make_request
) {
782 if (buffer_locked(bh
))
784 set_bit(BH_Lock
, &bh
->b_state
);
785 if (rw
== WRITE
|| rw
== WRITEA
) {
786 if (!buffer_dirty(bh
)) {
787 bh
->b_end_io(bh
, test_bit(BH_Uptodate
, &bh
->b_state
));
791 if (rw
== READ
|| rw
== READA
) {
792 if (buffer_uptodate(bh
)) {
793 bh
->b_end_io(bh
, test_bit(BH_Uptodate
, &bh
->b_state
));
797 return (md_dev
[minor
].pers
->make_request(md_dev
+minor
, rw
, bh
));
799 make_request (MAJOR(bh
->b_rdev
), rw
, bh
);
/*
 * Stub request function for the md major.  md never services requests
 * from this queue: the personality remaps I/O in md_map() and
 * md_make_request() instead, so reaching this function indicates a bug.
 */
static void do_md_request (void)
{
	printk ("Got md request, not good...");
}
810 void md_wakeup_thread(struct md_thread
*thread
)
812 set_bit(THREAD_WAKEUP
, &thread
->flags
);
813 wake_up(&thread
->wqueue
);
816 struct md_thread
*md_register_thread (void (*run
) (void *), void *data
)
818 struct md_thread
*thread
= (struct md_thread
*)
819 kmalloc(sizeof(struct md_thread
), GFP_KERNEL
);
821 struct semaphore sem
= MUTEX_LOCKED
;
823 if (!thread
) return NULL
;
825 memset(thread
, 0, sizeof(struct md_thread
));
826 init_waitqueue(&thread
->wqueue
);
831 ret
= kernel_thread(md_thread
, thread
, 0);
840 void md_unregister_thread (struct md_thread
*thread
)
842 struct semaphore sem
= MUTEX_LOCKED
;
847 printk("Killing md_thread %d %p %s\n",
848 thread
->tsk
->pid
, thread
->tsk
, thread
->tsk
->comm
);
850 printk("Aiee. md_thread has 0 tsk\n");
851 send_sig(SIGKILL
, thread
->tsk
, 1);
852 printk("downing on %p\n", &sem
);
856 #define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
858 int md_thread(void * arg
)
860 struct md_thread
*thread
= arg
;
867 current
->session
= 1;
869 sprintf(current
->comm
, "md_thread");
870 siginitsetinv(¤t
->blocked
, SHUTDOWN_SIGS
);
871 thread
->tsk
= current
;
876 if (!test_bit(THREAD_WAKEUP
, &thread
->flags
)) {
878 spin_lock(¤t
->sigmask_lock
);
879 flush_signals(current
);
880 spin_unlock(¤t
->sigmask_lock
);
881 interruptible_sleep_on(&thread
->wqueue
);
883 if (test_bit(THREAD_WAKEUP
, &thread
->flags
))
890 } while (signal_pending(current
));
893 clear_bit(THREAD_WAKEUP
, &thread
->flags
);
895 thread
->run(thread
->data
);
896 run_task_queue(&tq_disk
);
901 EXPORT_SYMBOL(md_size
);
902 EXPORT_SYMBOL(md_maxreadahead
);
903 EXPORT_SYMBOL(register_md_personality
);
904 EXPORT_SYMBOL(unregister_md_personality
);
905 EXPORT_SYMBOL(partition_name
);
906 EXPORT_SYMBOL(md_dev
);
907 EXPORT_SYMBOL(md_error
);
908 EXPORT_SYMBOL(md_register_thread
);
909 EXPORT_SYMBOL(md_unregister_thread
);
910 EXPORT_SYMBOL(md_update_sb
);
911 EXPORT_SYMBOL(md_map
);
912 EXPORT_SYMBOL(md_wakeup_thread
);
913 EXPORT_SYMBOL(md_do_sync
);
915 #ifdef CONFIG_PROC_FS
916 static struct proc_dir_entry proc_md
= {
917 PROC_MD
, 6, "mdstat",
918 S_IFREG
| S_IRUGO
, 1, 0, 0,
919 0, &proc_array_inode_operations
,
923 static void md_geninit (struct gendisk
*gdisk
)
927 for(i
=0;i
<MAX_MD_DEV
;i
++)
929 md_blocksizes
[i
] = 1024;
930 md_maxreadahead
[i
] = MD_DEFAULT_DISK_READAHEAD
;
931 md_gendisk
.part
[i
].start_sect
=-1; /* avoid partition check */
932 md_gendisk
.part
[i
].nr_sects
=0;
936 blksize_size
[MD_MAJOR
] = md_blocksizes
;
937 max_readahead
[MD_MAJOR
] = md_maxreadahead
;
939 #ifdef CONFIG_PROC_FS
940 proc_register(&proc_root
, &proc_md
);
944 int md_error (kdev_t mddev
, kdev_t rdev
)
946 unsigned int minor
= MINOR (mddev
);
949 if (MAJOR(mddev
) != MD_MAJOR
|| minor
> MAX_MD_DEV
)
950 panic ("md_error gets unknown device\n");
951 if (!md_dev
[minor
].pers
)
952 panic ("md_error gets an error for an unknown device\n");
953 if (md_dev
[minor
].pers
->error_handler
) {
954 rc
= md_dev
[minor
].pers
->error_handler (md_dev
+minor
, rdev
);
955 #if SUPPORT_RECONSTRUCTION
956 md_wakeup_thread(md_sync_thread
);
957 #endif /* SUPPORT_RECONSTRUCTION */
963 int get_md_status (char *page
)
965 int sz
=0, i
, j
, size
;
967 sz
+=sprintf( page
+sz
, "Personalities : ");
968 for (i
=0; i
<MAX_PERSONALITY
; i
++)
970 sz
+=sprintf (page
+sz
, "[%d %s] ", i
, pers
[i
]->name
);
974 sz
+=sprintf (page
+sz
, "read_ahead ");
975 if (read_ahead
[MD_MAJOR
]==INT_MAX
)
976 sz
+=sprintf (page
+sz
, "not set\n");
978 sz
+=sprintf (page
+sz
, "%d sectors\n", read_ahead
[MD_MAJOR
]);
980 for (i
=0; i
<MAX_MD_DEV
; i
++)
982 sz
+=sprintf (page
+sz
, "md%d : %sactive", i
, md_dev
[i
].pers
? "" : "in");
985 sz
+=sprintf (page
+sz
, " %s", md_dev
[i
].pers
->name
);
988 for (j
=0; j
<md_dev
[i
].nb_dev
; j
++)
990 sz
+=sprintf (page
+sz
, " %s",
991 partition_name(md_dev
[i
].devices
[j
].dev
));
992 size
+=md_dev
[i
].devices
[j
].size
;
995 if (md_dev
[i
].nb_dev
) {
997 sz
+=sprintf (page
+sz
, " %d blocks", md_size
[i
]);
999 sz
+=sprintf (page
+sz
, " %d blocks", size
);
1002 if (!md_dev
[i
].pers
)
1004 sz
+=sprintf (page
+sz
, "\n");
1008 if (md_dev
[i
].pers
->max_invalid_dev
)
1009 sz
+=sprintf (page
+sz
, " maxfault=%ld", MAX_FAULT(md_dev
+i
));
1011 sz
+=md_dev
[i
].pers
->status (page
+sz
, i
, md_dev
+i
);
1012 sz
+=sprintf (page
+sz
, "\n");
1018 int register_md_personality (int p_num
, struct md_personality
*p
)
1020 int i
=(p_num
>> PERSONALITY_SHIFT
);
1022 if (i
>= MAX_PERSONALITY
)
1029 printk ("%s personality registered\n", p
->name
);
1033 int unregister_md_personality (int p_num
)
1035 int i
=(p_num
>> PERSONALITY_SHIFT
);
1037 if (i
>= MAX_PERSONALITY
)
1040 printk ("%s personality unregistered\n", pers
[i
]->name
);
1045 static md_descriptor_t
*get_spare(struct md_dev
*mddev
)
1048 md_superblock_t
*sb
= mddev
->sb
;
1049 md_descriptor_t
*descriptor
;
1050 struct real_dev
*realdev
;
1052 for (i
= 0; i
< mddev
->nb_dev
; i
++) {
1053 realdev
= &mddev
->devices
[i
];
1056 descriptor
= &sb
->disks
[realdev
->sb
->descriptor
.number
];
1057 if (descriptor
->state
& (1 << MD_FAULTY_DEVICE
))
1059 if (descriptor
->state
& (1 << MD_ACTIVE_DEVICE
))
1067 * parallel resyncing thread.
1069 * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
1070 * - fix read error handing
1073 int md_do_sync(struct md_dev
*mddev
)
1075 struct buffer_head
*bh
;
1076 int max_blocks
, blocksize
, curr_bsize
, percent
=1, j
;
1077 kdev_t read_disk
= MKDEV(MD_MAJOR
, mddev
- md_dev
);
1078 int major
= MAJOR(read_disk
), minor
= MINOR(read_disk
);
1079 unsigned long starttime
;
1081 blocksize
= blksize_size
[major
][minor
];
1082 max_blocks
= blk_size
[major
][minor
] / (blocksize
>> 10);
1084 printk("... resync log\n");
1085 printk(" .... mddev->nb_dev: %d\n", mddev
->nb_dev
);
1086 printk(" .... raid array: %s\n", kdevname(read_disk
));
1087 printk(" .... max_blocks: %d blocksize: %d\n", max_blocks
, blocksize
);
1088 printk("md: syncing RAID array %s\n", kdevname(read_disk
));
1093 for (j
= 0; j
< max_blocks
; j
++) {
1096 * B careful. When some1 mounts a non-'blocksize' filesystem
1097 * then we get the blocksize changed right under us. Go deal
1098 * with it transparently, recalculate 'blocksize', 'j' and
1101 curr_bsize
= blksize_size
[major
][minor
];
1102 if (curr_bsize
!= blocksize
) {
1104 if (curr_bsize
> blocksize
)
1106 * this is safe, rounds downwards.
1108 j
/= curr_bsize
/blocksize
;
1110 j
*= blocksize
/curr_bsize
;
1112 blocksize
= curr_bsize
;
1113 max_blocks
= blk_size
[major
][minor
] / (blocksize
>> 10);
1115 if ((bh
= breada (read_disk
, j
, blocksize
, j
* blocksize
,
1116 max_blocks
* blocksize
)) != NULL
) {
1117 mark_buffer_dirty(bh
, 1);
1121 * FIXME: Ugly, but set_blocksize() isnt safe ...
1123 curr_bsize
= blksize_size
[major
][minor
];
1124 if (curr_bsize
!= blocksize
)
1125 goto diff_blocksize
;
1128 * It's a real read problem. FIXME, handle this
1132 "read error, stopping reconstruction.\n");
1138 * Let's sleep some if we are faster than our speed limit:
1140 while (blocksize
*j
/(jiffies
-starttime
+1)*HZ
/1024 > SPEED_LIMIT
)
1142 current
->state
= TASK_INTERRUPTIBLE
;
1143 current
->timeout
= jiffies
+1;
1148 * FIXME: put this status bar thing into /proc
1150 if (!(j
%(max_blocks
/100))) {
1152 printk (" %03d%% done.\n",percent
);
1158 fsync_dev(read_disk
);
1159 printk("md: %s: sync done.\n", kdevname(read_disk
));
1165 * This is a kernel thread which: syncs a spare disk with the active array
1167 * the amount of foolproofing might seem to be a tad excessive, but an
1168 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
1169 * of my root partition with the first 0.5 gigs of my /home partition ... so
1170 * i'm a bit nervous ;)
1172 void mdsyncd (void *data
)
1175 struct md_dev
*mddev
;
1176 md_superblock_t
*sb
;
1177 md_descriptor_t
*spare
;
1178 unsigned long flags
;
1180 for (i
= 0, mddev
= md_dev
; i
< MAX_MD_DEV
; i
++, mddev
++) {
1181 if ((sb
= mddev
->sb
) == NULL
)
1183 if (sb
->active_disks
== sb
->raid_disks
)
1185 if (!sb
->spare_disks
)
1187 if ((spare
= get_spare(mddev
)) == NULL
)
1189 if (!mddev
->pers
->mark_spare
)
1191 if (mddev
->pers
->mark_spare(mddev
, spare
, SPARE_WRITE
))
1193 if (md_do_sync(mddev
) || (spare
->state
& (1 << MD_FAULTY_DEVICE
))) {
1194 mddev
->pers
->mark_spare(mddev
, spare
, SPARE_INACTIVE
);
1199 mddev
->pers
->mark_spare(mddev
, spare
, SPARE_ACTIVE
);
1200 spare
->state
|= (1 << MD_SYNC_DEVICE
);
1201 spare
->state
|= (1 << MD_ACTIVE_DEVICE
);
1204 mddev
->sb_dirty
= 1;
1205 md_update_sb(mddev
- md_dev
);
1206 restore_flags(flags
);
1211 #ifdef CONFIG_MD_BOOT
1216 } md_setup_args __initdata
= {
1220 /* called from init/main.c */
1221 __initfunc(void md_setup(char *str
,int *ints
))
1224 for(i
=0;i
<=ints
[0];i
++) {
1225 md_setup_args
.ints
[i
] = ints
[i
];
1226 strcpy(md_setup_args
.str
, str
);
1227 /* printk ("md: ints[%d]=%d.\n", i, ints[i]);*/
1229 md_setup_args
.set
=1;
1233 __initfunc(void do_md_setup(char *str
,int *ints
))
1235 int minor
, pers
, factor
, fault
;
1240 printk ("md: Too few Arguments (%d).\n", ints
[0]);
1246 if (minor
>= MAX_MD_DEV
) {
1247 printk ("md: Minor device number too high.\n");
1253 switch(ints
[i
++]) { /* Raidlevel */
1255 #ifdef CONFIG_MD_LINEAR
1257 printk ("md: Setting up md%d as linear device.\n",minor
);
1259 printk ("md: Linear mode not configured."
1260 "Recompile the kernel with linear mode enabled!\n");
1265 #ifdef CONFIG_MD_STRIPED
1266 printk ("md: Setting up md%d as a striped device.\n",minor
);
1268 printk ("md: Striped mode not configured."
1269 "Recompile the kernel with striped mode enabled!\n");
1272 /* not supported yet
1275 printk ("md: Setting up md%d as a raid1 device.\n",minor);
1279 printk ("md: Setting up md%d as a raid5 device.\n",minor);
1283 printk ("md: Unknown or not supported raid level %d.\n", ints
[--i
]);
1289 factor
=ints
[i
++]; /* Chunksize */
1290 fault
=ints
[i
++]; /* Faultlevel */
1292 pers
=pers
| factor
| (fault
<< FAULT_SHIFT
);
1294 while( str
&& (dev
= name_to_kdev_t(str
))) {
1295 do_md_add (minor
, dev
);
1296 if((str
= strchr (str
, ',')) != NULL
)
1300 do_md_run (minor
, pers
);
1301 printk ("md: Loading md%d.\n",minor
);
1307 void linear_init (void);
1308 void raid0_init (void);
1309 void raid1_init (void);
1310 void raid5_init (void);
1312 __initfunc(int md_init (void))
1314 printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
1315 MD_MAJOR_VERSION
, MD_MINOR_VERSION
, MD_PATCHLEVEL_VERSION
,
1316 MAX_MD_DEV
, MAX_REAL
);
1318 if (register_blkdev (MD_MAJOR
, "md", &md_fops
))
1320 printk ("Unable to get major %d for md\n", MD_MAJOR
);
1324 blk_dev
[MD_MAJOR
].request_fn
=DEVICE_REQUEST
;
1325 blk_dev
[MD_MAJOR
].current_request
=NULL
;
1326 read_ahead
[MD_MAJOR
]=INT_MAX
;
1327 memset(md_dev
, 0, MAX_MD_DEV
* sizeof (struct md_dev
));
1328 md_gendisk
.next
=gendisk_head
;
1330 gendisk_head
=&md_gendisk
;
1332 #if SUPPORT_RECONSTRUCTION
1333 if ((md_sync_thread
= md_register_thread(mdsyncd
, NULL
)) == NULL
)
1334 printk("md: bug: md_sync_thread == NULL\n");
1335 #endif /* SUPPORT_RECONSTRUCTION */
1337 #ifdef CONFIG_MD_LINEAR
1340 #ifdef CONFIG_MD_STRIPED
1343 #ifdef CONFIG_MD_MIRRORING
1346 #ifdef CONFIG_MD_RAID5
1352 #ifdef CONFIG_MD_BOOT
1353 __initfunc(void md_setup_drive(void))
1355 if(md_setup_args
.set
)
1356 do_md_setup(md_setup_args
.str
, md_setup_args
.ints
);