3 md.c : Multiple Devices driver for Linux
4 Copyright (C) 1994-96 Marc ZYNGIER
5 <zyngier@ufr-info-p7.ibp.fr> or
8 A lot of inspiration came from hd.c ...
10 kerneld support by Boris Tobotras <boris@xtalk.msk.su>
12 RAID-1/RAID-5 extensions by:
13 Ingo Molnar, Miguel de Icaza, Gadi Oxman
15 This program is free software; you can redistribute it and/or modify
16 it under the terms of the GNU General Public License as published by
17 the Free Software Foundation; either version 2, or (at your option)
20 You should have received a copy of the GNU General Public License
21 (for example /usr/src/linux/COPYING); if not, write to the Free
22 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Reconstruction throttle in KB/sec; md_do_sync() sleeps when it exceeds
 * this rate.  NOTE(review): this chunk is missing source lines throughout
 * (the embedded original line numbers are sparse) -- code left untouched. */
26 * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so
27 * the extra system load does not show up that much. Increase it if your
28 * system can take more.
30 #define SPEED_LIMIT 1024
32 #include <linux/config.h>
33 #include <linux/module.h>
34 #include <linux/version.h>
35 #include <linux/malloc.h>
38 #include <linux/hdreg.h>
39 #include <linux/stat.h>
41 #include <linux/proc_fs.h>
42 #include <linux/blkdev.h>
43 #include <linux/genhd.h>
44 #include <linux/smp_lock.h>
46 #include <linux/kerneld.h>
48 #include <linux/errno.h>
49 #include <linux/init.h>
51 #define __KERNEL_SYSCALLS__
52 #include <linux/unistd.h>
54 #define MAJOR_NR MD_MAJOR
57 #include <linux/blk.h>
58 #include <asm/uaccess.h>
59 #include <asm/bitops.h>
60 #include <asm/atomic.h>
/* Module-wide state: per-minor partition table, block sizes, readahead,
 * the worker-thread pool, and (when reconstruction is compiled in) the
 * single resync thread.  md_size[] holds each array's size in blocks. */
62 static struct hd_struct md_hd_struct
[MAX_MD_DEV
];
63 static int md_blocksizes
[MAX_MD_DEV
];
64 int md_maxreadahead
[MAX_MD_DEV
];
65 static struct md_thread md_threads
[MAX_MD_THREADS
];
66 #if SUPPORT_RECONSTRUCTION
67 static struct md_thread
*md_sync_thread
= NULL
;
68 #endif /* SUPPORT_RECONSTRUCTION */
70 int md_size
[MAX_MD_DEV
]={0, };
/* Generic-disk registration for the md major, the personality dispatch
 * table (indexed by level >> PERSONALITY_SHIFT) and the per-minor device
 * descriptors.  NOTE(review): the md_gendisk initializer body (original
 * lines 75-88) is missing from this chunk. */
72 static void md_geninit (struct gendisk
*);
74 static struct gendisk md_gendisk
=
89 static struct md_personality
*pers
[MAX_PERSONALITY
]={NULL
, };
90 struct md_dev md_dev
[MAX_MD_DEV
];
/* Walk the global gendisk_head list looking for the gendisk whose major
 * matches MAJOR(dev).  NOTE(review): loop body/return lines are missing
 * from this chunk; presumably returns the match or NULL -- confirm. */
92 static struct gendisk
*find_gendisk (kdev_t dev
)
94 struct gendisk
*tmp
=gendisk_head
;
98 if (tmp
->major
==MAJOR(dev
))
/* Human-readable name for a partition.  Uses disk_name() from genhd.c
 * when the device's gendisk is known; otherwise falls back to the
 * "[dev xx:yy]" form via kdevname().  Returns a pointer into a static
 * buffer, so the result is overwritten by the next call (not reentrant). */
107 char *partition_name (kdev_t dev
)
109 static char name
[40]; /* This should be long
110 enough for a device name ! */
111 struct gendisk
*hd
= find_gendisk (dev
);
115 sprintf (name
, "[dev %s]", kdevname(dev
));
119 return disk_name (hd
, MINOR(dev
), name
); /* routine in genhd.c */
/* Size/offset setup for superblock-less personalities (RAID-0, linear).
 * Rounds each component device down to the chunk factor, accumulates the
 * total into md_size[minor], and lays the devices out back-to-back via
 * their .offset fields.  RAID-0 additionally scales the default readahead
 * by the number of component disks. */
122 static int legacy_raid_sb (int minor
, int pnum
)
126 factor
= 1 << FACTOR_SHIFT(FACTOR((md_dev
+minor
)));
129 * do size and offset calculations.
131 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++) {
132 md_dev
[minor
].devices
[i
].size
&= ~(factor
- 1);
133 md_size
[minor
] += md_dev
[minor
].devices
[i
].size
;
134 md_dev
[minor
].devices
[i
].offset
=i
? (md_dev
[minor
].devices
[i
-1].offset
+
135 md_dev
[minor
].devices
[i
-1].size
) : 0;
137 if (pnum
== RAID0
>> PERSONALITY_SHIFT
)
138 md_maxreadahead
[minor
] = MD_DEFAULT_DISK_READAHEAD
* md_dev
[minor
].nb_dev
;
/* Release the superblock pages of an array: the array-wide copy in
 * mddev->sb plus the per-component copies held by each real_dev.
 * Each sb was allocated with __get_free_page(), hence free_page(). */
142 static void free_sb (struct md_dev
*mddev
)
145 struct real_dev
*realdev
;
148 free_page((unsigned long) mddev
->sb
);
151 for (i
= 0; i
<mddev
->nb_dev
; i
++) {
152 realdev
= mddev
->devices
+ i
;
154 free_page((unsigned long) realdev
->sb
);
/* Read and sanity-check the RAID superblock of one component device.
 * The superblock lives at a fixed offset near the end of the disk
 * (MD_NEW_SIZE_BLOCKS of the device's blk_size entry); it is read with
 * bread(), magic-checked, then copied into a freshly allocated page so
 * the buffer cache block can be released.  rdev->size is taken from the
 * superblock.  NOTE(review): error-path/return lines are missing from
 * this chunk. */
161 * Check one RAID superblock for generic plausibility
164 #define BAD_MAGIC KERN_ERR \
165 "md: %s: invalid raid superblock magic (%x) on block %u\n"
167 #define OUT_OF_MEM KERN_ALERT \
168 "md: out of memory.\n"
170 #define NO_DEVICE KERN_ERR \
171 "md: disabled device %s\n"
176 static int analyze_one_sb (struct real_dev
* rdev
)
179 struct buffer_head
*bh
;
180 kdev_t dev
= rdev
->dev
;
184 * Read the superblock, it's at the end of the disk
186 rdev
->sb_offset
= MD_NEW_SIZE_BLOCKS (blk_size
[MAJOR(dev
)][MINOR(dev
)]);
187 set_blocksize (dev
, MD_SB_BYTES
);
188 bh
= bread (dev
, rdev
->sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
191 sb
= (md_superblock_t
*) bh
->b_data
;
192 if (sb
->md_magic
!= MD_SB_MAGIC
) {
193 printk (BAD_MAGIC
, kdevname(dev
),
194 sb
->md_magic
, rdev
->sb_offset
);
197 rdev
->sb
= (md_superblock_t
*) __get_free_page(GFP_KERNEL
);
202 memcpy (rdev
->sb
, bh
->b_data
, MD_SB_BYTES
);
204 rdev
->size
= sb
->size
;
206 printk (NO_DEVICE
,kdevname(rdev
->dev
));
/* Validate the superblocks of a whole array before it is started.
 * RAID-0/linear have no superblock and are delegated to legacy_raid_sb().
 * Otherwise: read each component's sb (analyze_one_sb), require the
 * generic-constant words to match across disks, pick the freshest sb by
 * utime as the array master copy, reject unsupported on-disk versions,
 * warn/refuse on unclean arrays (RAID-1 refuses; others reconstruct when
 * SUPPORT_RECONSTRUCTION is on), and finally derive md_size[] and
 * md_maxreadahead[] from the sb per RAID level.  NOTE(review): several
 * branch/return lines are missing from this chunk. */
222 * Check a full RAID array for plausibility
225 #define INCONSISTENT KERN_ERR \
226 "md: superblock inconsistency -- run ckraid\n"
228 #define OUT_OF_DATE KERN_ERR \
229 "md: superblock update time inconsistenty -- using the most recent one\n"
231 #define OLD_VERSION KERN_ALERT \
232 "md: %s: unsupported raid array version %d.%d.%d\n"
234 #define NOT_CLEAN KERN_ERR \
235 "md: %s: raid array is not clean -- run ckraid\n"
237 #define NOT_CLEAN_IGNORE KERN_ERR \
238 "md: %s: raid array is not clean -- reconstructing parity\n"
240 #define UNKNOWN_LEVEL KERN_ERR \
241 "md: %s: unsupported raid level %d\n"
243 static int analyze_sbs (int minor
, int pnum
)
245 struct md_dev
*mddev
= md_dev
+ minor
;
246 int i
, N
= mddev
->nb_dev
, out_of_date
= 0;
247 struct real_dev
* disks
= mddev
->devices
;
248 md_superblock_t
*sb
, *freshest
= NULL
;
251 * RAID-0 and linear don't use a RAID superblock
253 if (pnum
== RAID0
>> PERSONALITY_SHIFT
||
254 pnum
== LINEAR
>> PERSONALITY_SHIFT
)
255 return legacy_raid_sb (minor
, pnum
);
258 * Verify the RAID superblock on each real device
260 for (i
= 0; i
< N
; i
++)
261 if (analyze_one_sb(disks
+i
))
265 * The superblock constant part has to be the same
266 * for all disks in the array.
269 for (i
= 0; i
< N
; i
++) {
277 disks
[i
].sb
, MD_SB_GENERIC_CONSTANT_WORDS
* 4)) {
278 printk (INCONSISTENT
);
284 * Ok, we have all disks and the array is ready to run. Lets
285 * find the freshest superblock, that one will be the superblock
286 * that represents the whole array.
288 if ((sb
= mddev
->sb
= (md_superblock_t
*) __get_free_page (GFP_KERNEL
)) == NULL
)
291 for (i
= 0; i
< N
; i
++) {
295 freshest
= disks
[i
].sb
;
299 * Find the newest superblock version
301 if (disks
[i
].sb
->utime
!= freshest
->utime
) {
303 if (disks
[i
].sb
->utime
> freshest
->utime
)
304 freshest
= disks
[i
].sb
;
309 memcpy (sb
, freshest
, sizeof(*freshest
));
312 * Check if we can support this RAID array
314 if (sb
->major_version
!= MD_MAJOR_VERSION
||
315 sb
->minor_version
> MD_MINOR_VERSION
) {
317 printk (OLD_VERSION
, kdevname(MKDEV(MD_MAJOR
, minor
)),
318 sb
->major_version
, sb
->minor_version
,
324 * We need to add this as a superblock option.
326 #if SUPPORT_RECONSTRUCTION
327 if (sb
->state
!= (1 << MD_SB_CLEAN
)) {
328 if (sb
->level
== 1) {
329 printk (NOT_CLEAN
, kdevname(MKDEV(MD_MAJOR
, minor
)));
332 printk (NOT_CLEAN_IGNORE
, kdevname(MKDEV(MD_MAJOR
, minor
)));
335 if (sb
->state
!= (1 << MD_SB_CLEAN
)) {
336 printk (NOT_CLEAN
, kdevname(MKDEV(MD_MAJOR
, minor
)));
339 #endif /* SUPPORT_RECONSTRUCTION */
343 md_size
[minor
] = sb
->size
;
344 md_maxreadahead
[minor
] = MD_DEFAULT_DISK_READAHEAD
;
348 md_size
[minor
] = sb
->size
* (sb
->raid_disks
- 1);
349 md_maxreadahead
[minor
] = MD_DEFAULT_DISK_READAHEAD
* (sb
->raid_disks
- 1);
352 printk (UNKNOWN_LEVEL
, kdevname(MKDEV(MD_MAJOR
, minor
)),
/* Write the in-core master superblock back to every component device.
 * Stamps utime, then for each real_dev: getblk() the sb block, copy the
 * master sb in, patch in that disk's own descriptor from the disks[]
 * table, mark the buffer uptodate+dirty and ll_rw_block(WRITE) it, then
 * invalidate_buffers() so stale cached copies are dropped.  Logs an
 * error when getblk() fails for a device. */
368 int md_update_sb(int minor
)
370 struct md_dev
*mddev
= md_dev
+ minor
;
371 struct buffer_head
*bh
;
372 md_superblock_t
*sb
= mddev
->sb
;
373 struct real_dev
*realdev
;
378 sb
->utime
= CURRENT_TIME
;
379 for (i
= 0; i
< mddev
->nb_dev
; i
++) {
380 realdev
= mddev
->devices
+ i
;
384 sb_offset
= realdev
->sb_offset
;
385 set_blocksize(dev
, MD_SB_BYTES
);
386 printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev
), sb_offset
);
387 bh
= getblk(dev
, sb_offset
/ MD_SB_BLOCKS
, MD_SB_BYTES
);
389 sb
= (md_superblock_t
*) bh
->b_data
;
390 memcpy(sb
, mddev
->sb
, MD_SB_BYTES
);
391 memcpy(&sb
->descriptor
, sb
->disks
+ realdev
->sb
->descriptor
.number
, MD_SB_DESCRIPTOR_WORDS
* 4);
392 mark_buffer_uptodate(bh
, 1);
393 mark_buffer_dirty(bh
, 1);
394 ll_rw_block(WRITE
, 1, &bh
);
398 invalidate_buffers(dev
);
400 printk(KERN_ERR
"md: getblk failed for device %s\n", kdevname(dev
));
/* Start an md array: requires at least one component and a not-yet-
 * running array.  Resolves the personality number, forbids nesting md
 * devices except under RAID-1/RAID-5, optionally pulls in the
 * personality module via kerneld, checks every component is at least
 * one chunk large, flushes/invalidates component buffers, analyzes the
 * superblocks, then calls the personality's run() hook (rolling back
 * pers/sb on failure).  Arrays with real superblocks are marked dirty
 * (~MD_SB_CLEAN) while running.  Finally publishes the size in the
 * partition table (blocks -> sectors, <<1) and resets readahead. */
405 static int do_md_run (int minor
, int repart
)
407 int pnum
, i
, min
, factor
, err
;
409 if (!md_dev
[minor
].nb_dev
)
412 if (md_dev
[minor
].pers
)
415 md_dev
[minor
].repartition
=repart
;
417 if ((pnum
=PERSONALITY(&md_dev
[minor
]) >> (PERSONALITY_SHIFT
))
421 /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */
422 if (pnum
!= (RAID1
>> PERSONALITY_SHIFT
) && pnum
!= (RAID5
>> PERSONALITY_SHIFT
)){
423 for (i
= 0; i
< md_dev
[minor
].nb_dev
; i
++)
424 if (MAJOR (md_dev
[minor
].devices
[i
].dev
) == MD_MAJOR
)
429 #ifdef CONFIG_KERNELD
430 char module_name
[80];
431 sprintf (module_name
, "md-personality-%d", pnum
);
432 request_module (module_name
);
438 factor
= min
= 1 << FACTOR_SHIFT(FACTOR((md_dev
+minor
)));
440 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++)
441 if (md_dev
[minor
].devices
[i
].size
<min
)
443 printk ("Dev %s smaller than %dk, cannot shrink\n",
444 partition_name (md_dev
[minor
].devices
[i
].dev
), min
);
448 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++) {
449 fsync_dev(md_dev
[minor
].devices
[i
].dev
);
450 invalidate_buffers(md_dev
[minor
].devices
[i
].dev
);
453 /* Resize devices according to the factor. It is used to align
454 partitions size on a given chunk size. */
458 * Analyze the raid superblock
460 if (analyze_sbs(minor
, pnum
))
463 md_dev
[minor
].pers
=pers
[pnum
];
465 if ((err
=md_dev
[minor
].pers
->run (minor
, md_dev
+minor
)))
467 md_dev
[minor
].pers
=NULL
;
468 free_sb(md_dev
+ minor
);
472 if (pnum
!= RAID0
>> PERSONALITY_SHIFT
&& pnum
!= LINEAR
>> PERSONALITY_SHIFT
)
474 md_dev
[minor
].sb
->state
&= ~(1 << MD_SB_CLEAN
);
478 /* FIXME : We assume here we have blocks
479 that are twice as large as sectors.
480 THIS MAY NOT BE TRUE !!! */
481 md_hd_struct
[minor
].start_sect
=0;
482 md_hd_struct
[minor
].nr_sects
=md_size
[minor
]<<1;
484 read_ahead
[MD_MAJOR
] = 128;
/* Stop an md array.  Refuses while the device has other open channels
 * (i_count/busy > 1) or while the personality's stop() hook objects
 * (e.g. resync in progress).  On success: flush and invalidate the md
 * device, mark the superblock clean, free the sb pages, release the
 * dummy inodes that pinned the component devices, and zero out the
 * per-minor bookkeeping (nb_dev, md_size, partition entry, pers). */
488 static int do_md_stop (int minor
, struct inode
*inode
)
492 if (inode
->i_count
>1 || md_dev
[minor
].busy
>1) {
494 * ioctl : one open channel
496 printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n",
497 minor
, inode
->i_count
, md_dev
[minor
].busy
);
501 if (md_dev
[minor
].pers
) {
503 * It is safe to call stop here, it only frees private
504 * data. Also, it tells us if a device is unstoppable
505 * (eg. resyncing is in progress)
507 if (md_dev
[minor
].pers
->stop (minor
, md_dev
+minor
))
510 * The device won't exist anymore -> flush it now
512 fsync_dev (inode
->i_rdev
);
513 invalidate_buffers (inode
->i_rdev
);
514 if (md_dev
[minor
].sb
) {
515 md_dev
[minor
].sb
->state
|= 1 << MD_SB_CLEAN
;
521 if (md_dev
[minor
].sb
)
522 free_sb(md_dev
+ minor
);
523 for (i
=0; i
<md_dev
[minor
].nb_dev
; i
++)
524 clear_inode (md_dev
[minor
].devices
[i
].inode
);
526 md_dev
[minor
].nb_dev
=md_size
[minor
]=0;
527 md_hd_struct
[minor
].nr_sects
=0;
528 md_dev
[minor
].pers
=NULL
;
530 read_ahead
[MD_MAJOR
] = 128;
/* Add a component device to an md array (REGISTER_DEV ioctl).  Rejects
 * a full array (MAX_REAL), a mounted device (fs_may_mount) or a device
 * with zero/unknown size.  Running arrays take the hot-add path via the
 * personality's hot_add_disk hook (which bumps nb_dev itself); the
 * component is pinned with a dummy hashed inode so the mount code sees
 * it as busy.  NOTE(review): hot-add failure leaks the dummy inode --
 * flagged by the original FIXME below. */
535 static int do_md_add (int minor
, kdev_t dev
)
539 struct real_dev
*realdev
;
541 if (md_dev
[minor
].nb_dev
==MAX_REAL
)
544 if (!fs_may_mount (dev
))
547 if (blk_size
[MAJOR(dev
)] == NULL
|| blk_size
[MAJOR(dev
)][MINOR(dev
)] == 0) {
548 printk("md_add(): zero device size, huh, bailing out.\n");
552 if (md_dev
[minor
].pers
) {
554 * The array is already running, hot-add the drive, or
557 if (!md_dev
[minor
].pers
->hot_add_disk
)
564 * Careful. We cannot increase nb_dev for a running array.
566 i
=md_dev
[minor
].nb_dev
;
567 realdev
= &md_dev
[minor
].devices
[i
];
570 /* Lock the device by inserting a dummy inode. This doesn't
571 smell very good, but I need to be consistent with the
572 mount stuff, specially with fs_may_mount. If someone have
573 a better idea, please help ! */
575 realdev
->inode
=get_empty_inode ();
576 realdev
->inode
->i_dev
=dev
; /* don't care about other fields */
577 insert_inode_hash (realdev
->inode
);
579 /* Sizes are now rounded at run time */
581 /* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/
583 realdev
->size
=blk_size
[MAJOR(dev
)][MINOR(dev
)];
587 * Check the superblock for consistency.
588 * the personality itself has to check wether it's getting
589 * added with the proper flags ... also, personality has to
592 if (analyze_one_sb (realdev
))
595 * hot_add has to bump up nb_dev itself
597 if (md_dev
[minor
].pers
->hot_add_disk (&md_dev
[minor
], dev
)) {
599 * FIXME: here we should free up the inode and stuff
605 md_dev
[minor
].nb_dev
++;
607 printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev
), minor
);
/* Block-device ioctl entry point.  Minors with bit 0x80 set are routed
 * to the owning personality's own ioctl hook.  Otherwise dispatches the
 * md control ioctls (add/run/stop), BLKGETSIZE, flush, readahead
 * get/set, HDIO_GETGEO (fakes a 2-head / 4-sector geometry for DOS
 * tools -- cylinders = nr_sects/8), and the generic RO_IOCTLS. */
611 static int md_ioctl (struct inode
*inode
, struct file
*file
,
612 unsigned int cmd
, unsigned long arg
)
615 struct hd_geometry
*loc
= (struct hd_geometry
*) arg
;
620 if (((minor
=MINOR(inode
->i_rdev
)) & 0x80) &&
621 (minor
& 0x7f) < MAX_PERSONALITY
&&
622 pers
[minor
& 0x7f] &&
623 pers
[minor
& 0x7f]->ioctl
)
624 return (pers
[minor
& 0x7f]->ioctl (inode
, file
, cmd
, arg
));
626 if (minor
>= MAX_MD_DEV
)
632 return do_md_add (minor
, to_kdev_t ((dev_t
) arg
));
635 return do_md_run (minor
, (int) arg
);
638 return do_md_stop (minor
, inode
);
640 case BLKGETSIZE
: /* Return device size */
641 if (!arg
) return -EINVAL
;
642 err
= put_user (md_hd_struct
[MINOR(inode
->i_rdev
)].nr_sects
, (long *) arg
);
648 fsync_dev (inode
->i_rdev
);
649 invalidate_buffers (inode
->i_rdev
);
655 read_ahead
[MAJOR(inode
->i_rdev
)] = arg
;
659 if (!arg
) return -EINVAL
;
660 err
= put_user (read_ahead
[MAJOR(inode
->i_rdev
)], (long *) arg
);
665 /* We have a problem here : there is no easy way to give a CHS
666 virtual geometry. We currently pretend that we have a 2 heads
667 4 sectors (with a BIG number of cylinders...). This drives dosfs
671 if (!loc
) return -EINVAL
;
672 err
= put_user (2, (char *) &loc
->heads
);
675 err
= put_user (4, (char *) &loc
->sectors
);
678 err
= put_user (md_hd_struct
[minor
].nr_sects
/8, (short *) &loc
->cylinders
);
681 err
= put_user (md_hd_struct
[MINOR(inode
->i_rdev
)].start_sect
,
682 (long *) &loc
->start
);
687 RO_IOCTLS(inode
->i_rdev
,arg
);
690 printk ("Unknown md_ioctl %d\n", cmd
);
/* open() hook: bump the per-minor busy count; never fails. */
697 static int md_open (struct inode
*inode
, struct file
*file
)
699 int minor
=MINOR(inode
->i_rdev
);
701 md_dev
[minor
].busy
++;
702 return (0); /* Always succeed */
/* release() hook: sync the device and drop the busy count. */
706 static int md_release (struct inode
*inode
, struct file
*file
)
708 int minor
=MINOR(inode
->i_rdev
);
710 sync_dev (inode
->i_rdev
);
711 md_dev
[minor
].busy
--;
/* read() hook: reject reads on a stopped array (no personality bound),
 * otherwise delegate to the generic buffer-cache block_read(). */
716 static ssize_t
md_read (struct file
*file
, char *buf
, size_t count
,
719 int minor
=MINOR(file
->f_dentry
->d_inode
->i_rdev
);
721 if (!md_dev
[minor
].pers
) /* Check if device is being run */
724 return block_read(file
, buf
, count
, ppos
);
/* write() hook: mirror of md_read() -- requires a running personality,
 * then delegates to block_write(). */
727 static ssize_t
md_write (struct file
*file
, const char *buf
,
728 size_t count
, loff_t
*ppos
)
730 int minor
=MINOR(file
->f_dentry
->d_inode
->i_rdev
);
732 if (!md_dev
[minor
].pers
) /* Check if device is being run */
735 return block_write(file
, buf
, count
, ppos
);
/* File operations registered for the md major.  NOTE(review): the
 * initializer body (original lines 739-751) is missing from this chunk. */
738 static struct file_operations md_fops
=
/* Map an (md minor, sector) request to the underlying real device and
 * sector via the personality's map() hook.  Validates the minor range
 * and that the array is running; logs and bails out otherwise. */
752 int md_map (int minor
, kdev_t
*rdev
, unsigned long *rsector
, unsigned long size
)
754 if ((unsigned int) minor
>= MAX_MD_DEV
)
756 printk ("Bad md device %d\n", minor
);
760 if (!md_dev
[minor
].pers
)
762 printk ("Oops ! md%d not running, giving up !\n", minor
);
766 return (md_dev
[minor
].pers
->map(md_dev
+minor
, rdev
, rsector
, size
));
/* Submit a buffer_head to an md array.  Personalities with their own
 * make_request hook (e.g. RAID-1/5) get the bh directly: the buffer is
 * locked here, and writes of clean buffers / reads of already-uptodate
 * buffers are completed immediately via b_end_io without touching the
 * personality.  Personalities without the hook fall through to the
 * generic make_request() on the (already remapped) b_rdev. */
769 int md_make_request (int minor
, int rw
, struct buffer_head
* bh
)
771 if (md_dev
[minor
].pers
->make_request
) {
772 if (buffer_locked(bh
))
774 set_bit(BH_Lock
, &bh
->b_state
);
775 if (rw
== WRITE
|| rw
== WRITEA
) {
776 if (!buffer_dirty(bh
)) {
777 bh
->b_end_io(bh
, test_bit(BH_Uptodate
, &bh
->b_state
));
781 if (rw
== READ
|| rw
== READA
) {
782 if (buffer_uptodate(bh
)) {
783 bh
->b_end_io(bh
, test_bit(BH_Uptodate
, &bh
->b_state
));
787 return (md_dev
[minor
].pers
->make_request(md_dev
+minor
, rw
, bh
));
789 make_request (MAJOR(bh
->b_rdev
), rw
, bh
);
/* Request-queue handler stub: md requests are remapped before they
 * reach the queue, so landing here indicates a bug. */
794 static void do_md_request (void)
796 printk ("Got md request, not good...");
/* Hand out a slot from the pre-spawned md_threads[] pool: the first
 * slot with run == NULL gets the caller's (run, data) pair.  The pool
 * exists because kernel_thread() cannot be called from interrupt
 * context, so threads are created up-front in md_init(). */
801 * We run MAX_MD_THREADS from md_init() and arbitrate them in run time.
802 * This is not so elegant, but how can we use kernel_thread() from within
805 struct md_thread
*md_register_thread (void (*run
) (void *), void *data
)
808 for (i
= 0; i
< MAX_MD_THREADS
; i
++) {
809 if (md_threads
[i
].run
== NULL
) {
810 md_threads
[i
].run
= run
;
811 md_threads
[i
].data
= data
;
812 return md_threads
+ i
;
/* Return a pool slot.  NOTE(review): the body (original lines 819-823)
 * is missing from this chunk; presumably clears run/data -- confirm. */
818 void md_unregister_thread (struct md_thread
*thread
)
/* Set the THREAD_WAKEUP flag and wake the thread's wait queue; the
 * flag lets md_thread() distinguish a real wakeup from a signal. */
825 void md_wakeup_thread(struct md_thread
*thread
)
827 set_bit(THREAD_WAKEUP
, &thread
->flags
);
828 wake_up(&thread
->wqueue
);
/* Symbols exported for the loadable personality modules
 * (linear/raid0/raid1/raid5). */
832 EXPORT_SYMBOL(md_size
);
833 EXPORT_SYMBOL(md_maxreadahead
);
834 EXPORT_SYMBOL(register_md_personality
);
835 EXPORT_SYMBOL(unregister_md_personality
);
836 EXPORT_SYMBOL(partition_name
);
837 EXPORT_SYMBOL(md_dev
);
838 EXPORT_SYMBOL(md_error
);
839 EXPORT_SYMBOL(md_register_thread
);
840 EXPORT_SYMBOL(md_unregister_thread
);
841 EXPORT_SYMBOL(md_update_sb
);
842 EXPORT_SYMBOL(md_map
);
843 EXPORT_SYMBOL(md_wakeup_thread
);
844 EXPORT_SYMBOL(md_do_sync
);
/* /proc/mdstat entry, read via proc_array_inode_operations; registered
 * under proc_root in md_geninit(). */
846 static struct proc_dir_entry proc_md
= {
847 PROC_MD
, 6, "mdstat",
848 S_IFREG
| S_IRUGO
, 1, 0, 0,
849 0, &proc_array_inode_operations
,
/* gendisk init hook: set per-minor defaults (1K blocks, default
 * readahead, start_sect = -1 to suppress the partition check), publish
 * the blksize/readahead tables for the md major, and register
 * /proc/mdstat. */
852 static void md_geninit (struct gendisk
*gdisk
)
856 for(i
=0;i
<MAX_MD_DEV
;i
++)
858 md_blocksizes
[i
] = 1024;
859 md_maxreadahead
[i
] = MD_DEFAULT_DISK_READAHEAD
;
860 md_gendisk
.part
[i
].start_sect
=-1; /* avoid partition check */
861 md_gendisk
.part
[i
].nr_sects
=0;
865 blksize_size
[MD_MAJOR
] = md_blocksizes
;
866 max_readahead
[MD_MAJOR
] = md_maxreadahead
;
868 proc_register(&proc_root
, &proc_md
);
/* Report an I/O error on component rdev of array mddev.  Panics on an
 * unknown or stopped device (an error there means corrupt state), then
 * forwards to the personality's error_handler and kicks the resync
 * thread when reconstruction support is compiled in. */
871 int md_error (kdev_t mddev
, kdev_t rdev
)
873 unsigned int minor
= MINOR (mddev
);
876 if (MAJOR(mddev
) != MD_MAJOR
|| minor
> MAX_MD_DEV
)
877 panic ("md_error gets unknown device\n");
878 if (!md_dev
[minor
].pers
)
879 panic ("md_error gets an error for an unknown device\n");
880 if (md_dev
[minor
].pers
->error_handler
) {
881 rc
= md_dev
[minor
].pers
->error_handler (md_dev
+minor
, rdev
);
882 #if SUPPORT_RECONSTRUCTION
883 md_wakeup_thread(md_sync_thread
);
884 #endif /* SUPPORT_RECONSTRUCTION */
/* Render /proc/mdstat into `page`: registered personalities, the
 * current readahead setting, then one line per md device with its
 * state, personality, component partitions, total size (from md_size[]
 * for running arrays, else summed component sizes), maxfault when the
 * personality tracks invalid devices, and the personality's own
 * status() output.  Returns the number of bytes written (sz). */
890 int get_md_status (char *page
)
892 int sz
=0, i
, j
, size
;
894 sz
+=sprintf( page
+sz
, "Personalities : ");
895 for (i
=0; i
<MAX_PERSONALITY
; i
++)
897 sz
+=sprintf (page
+sz
, "[%d %s] ", i
, pers
[i
]->name
);
901 sz
+=sprintf (page
+sz
, "read_ahead ");
902 if (read_ahead
[MD_MAJOR
]==INT_MAX
)
903 sz
+=sprintf (page
+sz
, "not set\n");
905 sz
+=sprintf (page
+sz
, "%d sectors\n", read_ahead
[MD_MAJOR
]);
907 for (i
=0; i
<MAX_MD_DEV
; i
++)
909 sz
+=sprintf (page
+sz
, "md%d : %sactive", i
, md_dev
[i
].pers
? "" : "in");
912 sz
+=sprintf (page
+sz
, " %s", md_dev
[i
].pers
->name
);
915 for (j
=0; j
<md_dev
[i
].nb_dev
; j
++)
917 sz
+=sprintf (page
+sz
, " %s",
918 partition_name(md_dev
[i
].devices
[j
].dev
));
919 size
+=md_dev
[i
].devices
[j
].size
;
922 if (md_dev
[i
].nb_dev
) {
924 sz
+=sprintf (page
+sz
, " %d blocks", md_size
[i
]);
926 sz
+=sprintf (page
+sz
, " %d blocks", size
);
931 sz
+=sprintf (page
+sz
, "\n");
935 if (md_dev
[i
].pers
->max_invalid_dev
)
936 sz
+=sprintf (page
+sz
, " maxfault=%ld", MAX_FAULT(md_dev
+i
));
938 sz
+=md_dev
[i
].pers
->status (page
+sz
, i
, md_dev
+i
);
939 sz
+=sprintf (page
+sz
, "\n");
/* Install a personality into pers[] at slot p_num >> PERSONALITY_SHIFT;
 * rejects out-of-range slots. */
945 int register_md_personality (int p_num
, struct md_personality
*p
)
947 int i
=(p_num
>> PERSONALITY_SHIFT
);
949 if (i
>= MAX_PERSONALITY
)
956 printk ("%s personality registered\n", p
->name
);
/* Remove a personality from pers[]; mirrors register_md_personality(). */
960 int unregister_md_personality (int p_num
)
962 int i
=(p_num
>> PERSONALITY_SHIFT
);
964 if (i
>= MAX_PERSONALITY
)
967 printk ("%s personality unregistered\n", pers
[i
]->name
);
/* Body of every pooled md kernel thread.  Detaches into its own
 * session, then loops: clear THREAD_WAKEUP, invoke the registered
 * run(data) callback, flush the disk task queue, and sleep
 * interruptibly until the next md_wakeup_thread() -- flushing signals
 * first so a pending signal cannot spin the loop. */
972 int md_thread(void * arg
)
974 struct md_thread
*thread
= arg
;
976 current
->session
= 1;
978 sprintf(current
->comm
, "md_thread");
983 clear_bit(THREAD_WAKEUP
, &thread
->flags
);
985 thread
->run(thread
->data
);
986 run_task_queue(&tq_disk
);
989 if (!test_bit(THREAD_WAKEUP
, &thread
->flags
)) {
991 spin_lock_irq(&current
->sigmask_lock
);
992 flush_signals(current
);
993 spin_unlock_irq(&current
->sigmask_lock
);
994 interruptible_sleep_on(&thread
->wqueue
);
995 } while (signal_pending(current
));
/* Scan an array's components for a usable spare: skip descriptors
 * marked faulty or already active.  NOTE(review): the return
 * statements are missing from this chunk; presumably returns the first
 * remaining descriptor or NULL -- confirm. */
1000 static md_descriptor_t
*get_spare(struct md_dev
*mddev
)
1003 md_superblock_t
*sb
= mddev
->sb
;
1004 md_descriptor_t
*descriptor
;
1005 struct real_dev
*realdev
;
1007 for (i
= 0; i
< mddev
->nb_dev
; i
++) {
1008 realdev
= &mddev
->devices
[i
];
1011 descriptor
= &sb
->disks
[realdev
->sb
->descriptor
.number
];
1012 if (descriptor
->state
& (1 << MD_FAULTY_DEVICE
))
1014 if (descriptor
->state
& (1 << MD_ACTIVE_DEVICE
))
/* Resync worker: read the whole md device block-by-block with breada()
 * and mark each buffer dirty, letting the personality's write path
 * regenerate the redundant copy/parity.  Handles the device blocksize
 * changing mid-run (a mount can do that) by rescaling j/max_blocks,
 * throttles itself to SPEED_LIMIT KB/sec with 1-jiffy interruptible
 * sleeps, prints a percent-done log line, and fsync_dev()s at the end.
 * Read failures abort the reconstruction (per the FIXME, not yet
 * handled beyond stopping). */
1022 * parallel resyncing thread.
1024 * FIXME: - make it abort with a dirty array on mdstop, now it just blocks
1025 * - fix read error handing
1028 int md_do_sync(struct md_dev
*mddev
)
1030 struct buffer_head
*bh
;
1031 int max_blocks
, blocksize
, curr_bsize
, percent
=1, j
;
1032 kdev_t read_disk
= MKDEV(MD_MAJOR
, mddev
- md_dev
);
1033 int major
= MAJOR(read_disk
), minor
= MINOR(read_disk
);
1034 unsigned long starttime
;
1036 blocksize
= blksize_size
[major
][minor
];
1037 max_blocks
= blk_size
[major
][minor
] / (blocksize
>> 10);
1039 printk("... resync log\n");
1040 printk(" .... mddev->nb_dev: %d\n", mddev
->nb_dev
);
1041 printk(" .... raid array: %s\n", kdevname(read_disk
));
1042 printk(" .... max_blocks: %d blocksize: %d\n", max_blocks
, blocksize
);
1043 printk("md: syncing RAID array %s\n", kdevname(read_disk
));
1048 for (j
= 0; j
< max_blocks
; j
++) {
1051 * B careful. When some1 mounts a non-'blocksize' filesystem
1052 * then we get the blocksize changed right under us. Go deal
1053 * with it transparently, recalculate 'blocksize', 'j' and
1056 curr_bsize
= blksize_size
[major
][minor
];
1057 if (curr_bsize
!= blocksize
) {
1059 if (curr_bsize
> blocksize
)
1061 * this is safe, rounds downwards.
1063 j
/= curr_bsize
/blocksize
;
1065 j
*= blocksize
/curr_bsize
;
1067 blocksize
= curr_bsize
;
1068 max_blocks
= blk_size
[major
][minor
] / (blocksize
>> 10);
1070 if ((bh
= breada (read_disk
, j
, blocksize
, j
* blocksize
,
1071 max_blocks
* blocksize
)) != NULL
) {
1072 mark_buffer_dirty(bh
, 1);
1076 * FIXME: Ugly, but set_blocksize() isnt safe ...
1078 curr_bsize
= blksize_size
[major
][minor
];
1079 if (curr_bsize
!= blocksize
)
1080 goto diff_blocksize
;
1083 * It's a real read problem. FIXME, handle this
1087 "read error, stopping reconstruction.\n");
1093 * Lets sleep some if we are faster than our speed limit:
1095 while (blocksize
*j
/(jiffies
-starttime
+1)*HZ
/1024 > SPEED_LIMIT
)
1097 current
->state
= TASK_INTERRUPTIBLE
;
1098 current
->timeout
= jiffies
+1;
1103 * FIXME: put this status bar thing into /proc
1105 if (!(j
%(max_blocks
/100))) {
1107 printk (" %03d%% done.\n",percent
);
1113 fsync_dev(read_disk
);
1114 printk("md: %s: sync done.\n", kdevname(read_disk
));
1120 * This is a kernel thread which: syncs a spare disk with the active array
1122 * the amount of foolproofing might seem to be a tad excessive, but an
1123 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
1124 * of my root partition with the first 0.5 gigs of my /home partition ... so
1125 * i'm a bit nervous ;)
1127 void mdsyncd (void *data
)
1130 struct md_dev
*mddev
;
1131 md_superblock_t
*sb
;
1132 md_descriptor_t
*spare
;
1133 unsigned long flags
;
1135 for (i
= 0, mddev
= md_dev
; i
< MAX_MD_DEV
; i
++, mddev
++) {
1136 if ((sb
= mddev
->sb
) == NULL
)
1138 if (sb
->active_disks
== sb
->raid_disks
)
1140 if (!sb
->spare_disks
)
1142 if ((spare
= get_spare(mddev
)) == NULL
)
1144 if (!mddev
->pers
->mark_spare
)
1146 if (mddev
->pers
->mark_spare(mddev
, spare
, SPARE_WRITE
))
1148 if (md_do_sync(mddev
) || (spare
->state
& (1 << MD_FAULTY_DEVICE
))) {
1149 mddev
->pers
->mark_spare(mddev
, spare
, SPARE_INACTIVE
);
1154 mddev
->pers
->mark_spare(mddev
, spare
, SPARE_ACTIVE
);
1155 spare
->state
|= (1 << MD_SYNC_DEVICE
);
1156 spare
->state
|= (1 << MD_ACTIVE_DEVICE
);
1159 mddev
->sb_dirty
= 1;
1160 md_update_sb(mddev
- md_dev
);
1161 restore_flags(flags
);
/* Init entry points of the built-in personalities, called from
 * md_init() under their CONFIG_MD_* options. */
1166 void linear_init (void);
1167 void raid0_init (void);
1168 void raid1_init (void);
1169 void raid5_init (void);
/* Driver initialization: register the md block major (fail loudly if
 * taken), spawn the MAX_MD_THREADS worker pool with initialized wait
 * queues, install the request_fn and readahead defaults, zero md_dev[],
 * hook md_gendisk into gendisk_head, start the resync thread when
 * reconstruction is compiled in, and initialize whichever built-in
 * personalities are configured. */
1171 __initfunc(int md_init (void))
1175 printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n",
1176 MD_MAJOR_VERSION
, MD_MINOR_VERSION
, MD_PATCHLEVEL_VERSION
,
1177 MAX_MD_DEV
, MAX_REAL
);
1179 if (register_blkdev (MD_MAJOR
, "md", &md_fops
))
1181 printk ("Unable to get major %d for md\n", MD_MAJOR
);
1185 memset(md_threads
, 0, MAX_MD_THREADS
* sizeof(struct md_thread
));
1186 printk("md: starting %d kernel threads\n", MAX_MD_THREADS
);
1187 for (i
= 0; i
< MAX_MD_THREADS
; i
++) {
1188 md_threads
[i
].run
= NULL
;
1189 init_waitqueue(&md_threads
[i
].wqueue
);
1190 md_threads
[i
].flags
= 0;
1191 kernel_thread (md_thread
, md_threads
+ i
, 0);
1194 blk_dev
[MD_MAJOR
].request_fn
=DEVICE_REQUEST
;
1195 blk_dev
[MD_MAJOR
].current_request
=NULL
;
1196 read_ahead
[MD_MAJOR
]=INT_MAX
;
1197 memset(md_dev
, 0, MAX_MD_DEV
* sizeof (struct md_dev
));
1198 md_gendisk
.next
=gendisk_head
;
1200 gendisk_head
=&md_gendisk
;
1202 #if SUPPORT_RECONSTRUCTION
1203 if ((md_sync_thread
= md_register_thread(mdsyncd
, NULL
)) == NULL
)
1204 printk("md: bug: md_sync_thread == NULL\n");
1205 #endif /* SUPPORT_RECONSTRUCTION */
1207 #ifdef CONFIG_MD_LINEAR
1210 #ifdef CONFIG_MD_STRIPED
1213 #ifdef CONFIG_MD_MIRRORING
1216 #ifdef CONFIG_MD_RAID5