/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/slab.h>

#include "md.h"
#include "raid5.h"
#include "bitmap.h"

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "raid"
/*
 * The following flags are used by dm-raid.c to set up the array state.
 * They must be cleared before md_run is called.
 */
#define FirstUse 10             /* rdev flag */
struct raid_dev {
        /*
         * Two DM devices, one to hold metadata and one to hold the
         * actual data/parity.  The reason for this is to not confuse
         * ti->len and give more flexibility in altering size and
         * characteristics.
         *
         * While it is possible for this device to be associated
         * with a different physical device than the data_dev, it
         * is intended for it to be the same.
         *    |--------- Physical Device ---------|
         *    |- meta_dev -|------ data_dev ------|
         */
        struct dm_dev *meta_dev;
        struct dm_dev *data_dev;
        struct mdk_rdev_s rdev;
};
/*
 * Flags for rs->print_flags field.
 */
#define DMPF_SYNC              0x1
#define DMPF_NOSYNC            0x2
#define DMPF_REBUILD           0x4
#define DMPF_DAEMON_SLEEP      0x8
#define DMPF_MIN_RECOVERY_RATE 0x10
#define DMPF_MAX_RECOVERY_RATE 0x20
#define DMPF_MAX_WRITE_BEHIND  0x40
#define DMPF_STRIPE_CACHE      0x80
#define DMPF_REGION_SIZE       0x100
struct raid_set {
        struct dm_target *ti;

        uint64_t print_flags;

        struct mddev_s md;
        struct raid_type *raid_type;
        struct dm_target_callbacks callbacks;

        struct raid_dev dev[0];
};
/* Supported raid types and properties. */
static struct raid_type {
        const char *name;               /* RAID algorithm. */
        const char *descr;              /* Descriptor text for logging. */
        const unsigned parity_devs;     /* # of parity devices. */
        const unsigned minimal_devs;    /* minimal # of devices in set. */
        const unsigned level;           /* RAID level. */
        const unsigned algorithm;       /* RAID algorithm. */
} raid_types[] = {
        {"raid1",    "RAID1 (mirroring)",              0, 2, 1, 0 /* NONE */},
        {"raid4",    "RAID4 (dedicated parity disk)",  1, 2, 5, ALGORITHM_PARITY_0},
        {"raid5_la", "RAID5 (left asymmetric)",        1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
        {"raid5_ra", "RAID5 (right asymmetric)",       1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
        {"raid5_ls", "RAID5 (left symmetric)",         1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
        {"raid5_rs", "RAID5 (right symmetric)",        1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
        {"raid6_zr", "RAID6 (zero restart)",           2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
        {"raid6_nr", "RAID6 (N restart)",              2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
        {"raid6_nc", "RAID6 (N continue)",             2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};
static struct raid_type *get_raid_type(char *name)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(raid_types); i++)
                if (!strcmp(raid_types[i].name, name))
                        return &raid_types[i];

        return NULL;
}
static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type,
                                      unsigned raid_devs)
{
        unsigned i;
        struct raid_set *rs;
        sector_t sectors_per_dev;

        if (raid_devs <= raid_type->parity_devs) {
                ti->error = "Insufficient number of devices";
                return ERR_PTR(-EINVAL);
        }

        sectors_per_dev = ti->len;
        if ((raid_type->level > 1) &&
            sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
                ti->error = "Target length not divisible by number of data devices";
                return ERR_PTR(-EINVAL);
        }

        rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
        if (!rs) {
                ti->error = "Cannot allocate raid context";
                return ERR_PTR(-ENOMEM);
        }

        mddev_init(&rs->md);

        rs->ti = ti;
        rs->raid_type = raid_type;
        rs->md.raid_disks = raid_devs;
        rs->md.level = raid_type->level;
        rs->md.new_level = rs->md.level;
        rs->md.dev_sectors = sectors_per_dev;
        rs->md.layout = raid_type->algorithm;
        rs->md.new_layout = rs->md.layout;
        rs->md.delta_disks = 0;
        rs->md.recovery_cp = 0;

        for (i = 0; i < raid_devs; i++)
                md_rdev_init(&rs->dev[i].rdev);

        /*
         * Remaining items to be initialized by further RAID params:
         *  rs->md.chunk_sectors
         *  rs->md.new_chunk_sectors
         */

        return rs;
}
static void context_free(struct raid_set *rs)
{
        int i;

        for (i = 0; i < rs->md.raid_disks; i++) {
                if (rs->dev[i].meta_dev)
                        dm_put_device(rs->ti, rs->dev[i].meta_dev);
                if (rs->dev[i].rdev.sb_page)
                        put_page(rs->dev[i].rdev.sb_page);
                rs->dev[i].rdev.sb_page = NULL;
                rs->dev[i].rdev.sb_loaded = 0;
                if (rs->dev[i].data_dev)
                        dm_put_device(rs->ti, rs->dev[i].data_dev);
        }

        kfree(rs);
}
/*
 * For every device we have two words
 *  <meta_dev>: meta device name or '-' if missing
 *  <data_dev>: data device name or '-' if missing
 *
 * The following are permitted:
 *    - -
 *    - <data_dev>
 *    <meta_dev> <data_dev>
 *
 * The following is not allowed:
 *    <meta_dev> -
 *
 * This code parses those words.  If there is a failure,
 * the caller must use context_free to unwind the operations.
 */
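/*
 * Illustrative only (not from the original source): for a two-drive set
 * the device portion of the table line could read
 *
 *      - /dev/sdb1  /dev/sdc1 /dev/sdc2
 *
 * i.e. drive 0 has no metadata device and uses /dev/sdb1 for data, while
 * drive 1 keeps its superblock/bitmap on /dev/sdc1 and its data on
 * /dev/sdc2.  Device names here are hypothetical.
 */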
static int dev_parms(struct raid_set *rs, char **argv)
{
        int i;
        int rebuild = 0;
        int metadata_available = 0;
        int ret = 0;

        for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
                rs->dev[i].rdev.raid_disk = i;

                rs->dev[i].meta_dev = NULL;
                rs->dev[i].data_dev = NULL;

                /*
                 * There are no offsets, since there is a separate device
                 * for data and metadata.
                 */
                rs->dev[i].rdev.data_offset = 0;
                rs->dev[i].rdev.mddev = &rs->md;

                if (strcmp(argv[0], "-")) {
                        ret = dm_get_device(rs->ti, argv[0],
                                            dm_table_get_mode(rs->ti->table),
                                            &rs->dev[i].meta_dev);
                        rs->ti->error = "RAID metadata device lookup failure";
                        if (ret)
                                return ret;

                        rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
                        if (!rs->dev[i].rdev.sb_page)
                                return -ENOMEM;
                }

                if (!strcmp(argv[1], "-")) {
                        if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
                            (!rs->dev[i].rdev.recovery_offset)) {
                                rs->ti->error = "Drive designated for rebuild not specified";
                                return -EINVAL;
                        }

                        rs->ti->error = "No data device supplied with metadata device";
                        if (rs->dev[i].meta_dev)
                                return -EINVAL;

                        continue;
                }

                ret = dm_get_device(rs->ti, argv[1],
                                    dm_table_get_mode(rs->ti->table),
                                    &rs->dev[i].data_dev);
                if (ret) {
                        rs->ti->error = "RAID device lookup failure";
                        return ret;
                }

                if (rs->dev[i].meta_dev) {
                        metadata_available = 1;
                        rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
                }
                rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
                list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
                if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
                        rebuild++;
        }

        if (metadata_available) {
                rs->md.external = 0;
                rs->md.persistent = 1;
                rs->md.major_version = 2;
        } else if (rebuild && !rs->md.recovery_cp) {
                /*
                 * Without metadata, we will not be able to tell if the array
                 * is in-sync or not - we must assume it is not.  Therefore,
                 * it is impossible to rebuild a drive.
                 *
                 * Even if there is metadata, the on-disk information may
                 * indicate that the array is not in-sync and it will then
                 * fail at that time.
                 *
                 * User could specify 'nosync' option if desperate.
                 */
                DMERR("Unable to rebuild drive while array is not in-sync");
                rs->ti->error = "RAID device lookup failure";
                return -EINVAL;
        }

        return 0;
}
/*
 * validate_region_size
 * @rs
 * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
 *
 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
        unsigned long min_region_size = rs->ti->len / (1 << 21);

        if (!region_size) {
                /*
                 * Choose a reasonable default.  All figures in sectors.
                 */
                if (min_region_size > (1 << 13)) {
                        DMINFO("Choosing default region size of %lu sectors",
                               min_region_size);
                        region_size = min_region_size;
                } else {
                        DMINFO("Choosing default region size of 4MiB");
                        region_size = 1 << 13; /* sectors */
                }
        } else {
                /*
                 * Validate user-supplied value.
                 */
                if (region_size > rs->ti->len) {
                        rs->ti->error = "Supplied region size is too large";
                        return -EINVAL;
                }

                if (region_size < min_region_size) {
                        DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
                              region_size, min_region_size);
                        rs->ti->error = "Supplied region size is too small";
                        return -EINVAL;
                }

                if (!is_power_of_2(region_size)) {
                        rs->ti->error = "Region size is not a power of 2";
                        return -EINVAL;
                }

                if (region_size < rs->md.chunk_sectors) {
                        rs->ti->error = "Region size is smaller than the chunk size";
                        return -EINVAL;
                }
        }

        /*
         * Convert sectors to bytes.
         */
        rs->md.bitmap_info.chunksize = (region_size << 9);

        return 0;
}
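/*
 * Worked example (illustrative only, not from the original source):
 * for a 2TiB target, ti->len = 2^32 sectors, so
 * min_region_size = 2^32 / 2^21 = 2048 sectors (1MiB).  That is below the
 * 2^13-sector threshold, so the 4MiB (8192-sector) default is chosen.
 * A target larger than 8TiB (2^34 sectors) would instead get
 * min_region_size = ti->len >> 21 as its default, keeping the number of
 * bitmap regions under the 2^21 limit required by the MD bitmap code.
 */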
/*
 * Possible arguments are...
 *      <chunk_size> [optional_args]
 *
 * Argument definitions
 *    <chunk_size>                      The number of sectors per disk that
 *                                      will form the "stripe"
 *    [[no]sync]                        Force or prevent recovery of the
 *                                      entire array
 *    [rebuild <idx>]                   Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]               Time between bitmap daemon work to
 *                                      clear bits
 *    [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
 *    [write_mostly <idx>]              Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]      See '-write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]          Stripe cache size for higher RAIDs
 *    [region_size <sectors>]           Defines granularity of bitmap
 */
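/*
 * Illustrative only (not from the original source): for a raid5 set, the
 * raid_params portion of the table line might be
 *
 *      128 rebuild 2 region_size 1024
 *
 * i.e. <#raid_params> would be 5: a 128-sector (64KiB) chunk size, a
 * rebuild of the device at index 2, and a 1024-sector bitmap region size.
 */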
static int parse_raid_params(struct raid_set *rs, char **argv,
                             unsigned num_raid_params)
{
        unsigned i, rebuild_cnt = 0;
        unsigned long value, region_size = 0;
        char *key;

        /*
         * First, parse the in-order required arguments
         * "chunk_size" is the only argument of this type.
         */
        if ((strict_strtoul(argv[0], 10, &value) < 0)) {
                rs->ti->error = "Bad chunk size";
                return -EINVAL;
        } else if (rs->raid_type->level == 1) {
                if (value)
                        DMERR("Ignoring chunk size parameter for RAID 1");
                value = 0;
        } else if (!is_power_of_2(value)) {
                rs->ti->error = "Chunk size must be a power of 2";
                return -EINVAL;
        } else if (value < 8) {
                rs->ti->error = "Chunk size value is too small";
                return -EINVAL;
        }

        rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
        argv++;
        num_raid_params--;

        /*
         * We set each individual device as In_sync with a completed
         * 'recovery_offset'.  If there has been a device failure or
         * replacement then one of the following cases applies:
         *
         *   1) User specifies 'rebuild'.
         *      - Device is reset when param is read.
         *   2) A new device is supplied.
         *      - No matching superblock found, resets device.
         *   3) Device failure was transient and returns on reload.
         *      - Failure noticed, resets device for bitmap replay.
         *   4) Device hadn't completed recovery after previous failure.
         *      - Superblock is read and overrides recovery_offset.
         *
         * What is found in the superblocks of the devices is always
         * authoritative, unless 'rebuild' or '[no]sync' was specified.
         */
        for (i = 0; i < rs->md.raid_disks; i++) {
                set_bit(In_sync, &rs->dev[i].rdev.flags);
                rs->dev[i].rdev.recovery_offset = MaxSector;
        }

        /*
         * Second, parse the unordered optional arguments
         */
        for (i = 0; i < num_raid_params; i++) {
                if (!strcasecmp(argv[i], "nosync")) {
                        rs->md.recovery_cp = MaxSector;
                        rs->print_flags |= DMPF_NOSYNC;
                        continue;
                }
                if (!strcasecmp(argv[i], "sync")) {
                        rs->md.recovery_cp = 0;
                        rs->print_flags |= DMPF_SYNC;
                        continue;
                }

                /* The rest of the optional arguments come in key/value pairs */
                if ((i + 1) >= num_raid_params) {
                        rs->ti->error = "Wrong number of raid parameters given";
                        return -EINVAL;
                }

                key = argv[i++];
                if (strict_strtoul(argv[i], 10, &value) < 0) {
                        rs->ti->error = "Bad numerical argument given in raid params";
                        return -EINVAL;
                }

                if (!strcasecmp(key, "rebuild")) {
                        rebuild_cnt++;
                        if (((rs->raid_type->level != 1) &&
                             (rebuild_cnt > rs->raid_type->parity_devs)) ||
                            ((rs->raid_type->level == 1) &&
                             (rebuild_cnt > (rs->md.raid_disks - 1)))) {
                                rs->ti->error = "Too many rebuild devices specified for given RAID type";
                                return -EINVAL;
                        }
                        if (value > rs->md.raid_disks) {
                                rs->ti->error = "Invalid rebuild index given";
                                return -EINVAL;
                        }
                        clear_bit(In_sync, &rs->dev[value].rdev.flags);
                        rs->dev[value].rdev.recovery_offset = 0;
                        rs->print_flags |= DMPF_REBUILD;
                } else if (!strcasecmp(key, "write_mostly")) {
                        if (rs->raid_type->level != 1) {
                                rs->ti->error = "write_mostly option is only valid for RAID1";
                                return -EINVAL;
                        }
                        if (value >= rs->md.raid_disks) {
                                rs->ti->error = "Invalid write_mostly drive index given";
                                return -EINVAL;
                        }
                        set_bit(WriteMostly, &rs->dev[value].rdev.flags);
                } else if (!strcasecmp(key, "max_write_behind")) {
                        if (rs->raid_type->level != 1) {
                                rs->ti->error = "max_write_behind option is only valid for RAID1";
                                return -EINVAL;
                        }
                        rs->print_flags |= DMPF_MAX_WRITE_BEHIND;

                        /*
                         * In device-mapper, we specify things in sectors, but
                         * MD records this value in kB
                         */
                        value /= 2;
                        if (value > COUNTER_MAX) {
                                rs->ti->error = "Max write-behind limit out of range";
                                return -EINVAL;
                        }
                        rs->md.bitmap_info.max_write_behind = value;
                } else if (!strcasecmp(key, "daemon_sleep")) {
                        rs->print_flags |= DMPF_DAEMON_SLEEP;
                        if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
                                rs->ti->error = "daemon sleep period out of range";
                                return -EINVAL;
                        }
                        rs->md.bitmap_info.daemon_sleep = value;
                } else if (!strcasecmp(key, "stripe_cache")) {
                        rs->print_flags |= DMPF_STRIPE_CACHE;

                        /*
                         * In device-mapper, we specify things in sectors, but
                         * MD records this value in kB
                         */
                        value /= 2;

                        if (rs->raid_type->level < 5) {
                                rs->ti->error = "Inappropriate argument: stripe_cache";
                                return -EINVAL;
                        }
                        if (raid5_set_cache_size(&rs->md, (int)value)) {
                                rs->ti->error = "Bad stripe_cache size";
                                return -EINVAL;
                        }
                } else if (!strcasecmp(key, "min_recovery_rate")) {
                        rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
                        if (value > INT_MAX) {
                                rs->ti->error = "min_recovery_rate out of range";
                                return -EINVAL;
                        }
                        rs->md.sync_speed_min = (int)value;
                } else if (!strcasecmp(key, "max_recovery_rate")) {
                        rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
                        if (value > INT_MAX) {
                                rs->ti->error = "max_recovery_rate out of range";
                                return -EINVAL;
                        }
                        rs->md.sync_speed_max = (int)value;
                } else if (!strcasecmp(key, "region_size")) {
                        rs->print_flags |= DMPF_REGION_SIZE;
                        region_size = value;
                } else {
                        DMERR("Unable to parse RAID parameter: %s", key);
                        rs->ti->error = "Unable to parse RAID parameters";
                        return -EINVAL;
                }
        }

        if (validate_region_size(rs, region_size))
                return -EINVAL;

        if (rs->md.chunk_sectors)
                rs->ti->split_io = rs->md.chunk_sectors;
        else
                rs->ti->split_io = region_size;

        /* Assume there are no metadata devices until the drives are parsed */
        rs->md.persistent = 0;
        rs->md.external = 1;

        return 0;
}
static void do_table_event(struct work_struct *ws)
{
        struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);

        dm_table_event(rs->ti->table);
}
static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
        struct raid_set *rs = container_of(cb, struct raid_set, callbacks);

        if (rs->raid_type->level == 1)
                return md_raid1_congested(&rs->md, bits);

        return md_raid5_congested(&rs->md, bits);
}
/*
 * This structure is never routinely used by userspace, unlike md superblocks.
 * Devices with this superblock should only ever be accessed via device-mapper.
 */
#define DM_RAID_MAGIC 0x64526D44
struct dm_raid_superblock {
        __le32 magic;           /* "DmRd" */
        __le32 features;        /* Used to indicate possible future changes */

        __le32 num_devices;     /* Number of devices in this array. (Max 64) */
        __le32 array_position;  /* The position of this drive in the array */

        __le64 events;          /* Incremented by md when superblock updated */
        __le64 failed_devices;  /* Bit field of devices to indicate failures */

        /*
         * This offset tracks the progress of the repair or replacement of
         * an individual drive.
         */
        __le64 disk_recovery_offset;

        /*
         * This offset tracks the progress of the initial array
         * synchronisation/parity calculation.
         */
        __le64 array_resync_offset;

        /*
         * RAID characteristics
         */
        __le32 level;
        __le32 layout;
        __le32 stripe_sectors;

        __u8 pad[452];          /* Round struct to 512 bytes. */
                                /* Always set to 0 when writing. */
} __packed;
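/*
 * Quick size check (illustrative, assuming the level/layout fields shown
 * above): the fixed fields total 60 bytes (4 * __le32 = 16, 4 * __le64 = 32,
 * 3 * __le32 = 12), so the 452-byte pad is exactly what rounds the on-disk
 * superblock up to a single 512-byte sector: 60 + 452 = 512.
 */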
static int read_disk_sb(mdk_rdev_t *rdev, int size)
{
        BUG_ON(!rdev->sb_page);

        if (rdev->sb_loaded)
                return 0;

        if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
                DMERR("Failed to read device superblock");
                return -EINVAL;
        }

        rdev->sb_loaded = 1;

        return 0;
}
static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdk_rdev_t *r, *t;
        uint64_t failed_devices;
        struct dm_raid_superblock *sb;

        sb = page_address(rdev->sb_page);
        failed_devices = le64_to_cpu(sb->failed_devices);

        rdev_for_each(r, t, mddev)
                if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
                        failed_devices |= (1ULL << r->raid_disk);

        memset(sb, 0, sizeof(*sb));

        sb->magic = cpu_to_le32(DM_RAID_MAGIC);
        sb->features = cpu_to_le32(0);  /* No features yet */

        sb->num_devices = cpu_to_le32(mddev->raid_disks);
        sb->array_position = cpu_to_le32(rdev->raid_disk);

        sb->events = cpu_to_le64(mddev->events);
        sb->failed_devices = cpu_to_le64(failed_devices);

        sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
        sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);

        sb->level = cpu_to_le32(mddev->level);
        sb->layout = cpu_to_le32(mddev->layout);
        sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
}
/*
 * This function creates a superblock if one is not found on the device
 * and will decide which superblock to use if there's a choice.
 *
 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
 */
static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
{
        int ret;
        struct dm_raid_superblock *sb;
        struct dm_raid_superblock *refsb;
        uint64_t events_sb, events_refsb;

        rdev->sb_start = 0;
        rdev->sb_size = sizeof(*sb);

        ret = read_disk_sb(rdev, rdev->sb_size);
        if (ret)
                return ret;

        sb = page_address(rdev->sb_page);
        if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
                super_sync(rdev->mddev, rdev);

                set_bit(FirstUse, &rdev->flags);

                /* Force writing of superblocks to disk */
                set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);

                /* Any superblock is better than none, choose that if given */
                return refdev ? 0 : 1;
        }

        if (!refdev)
                return 1;

        events_sb = le64_to_cpu(sb->events);

        refsb = page_address(refdev->sb_page);
        events_refsb = le64_to_cpu(refsb->events);

        return (events_sb > events_refsb) ? 1 : 0;
}
static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
{
        int role;
        struct raid_set *rs = container_of(mddev, struct raid_set, md);
        uint64_t events_sb;
        uint64_t failed_devices;
        struct dm_raid_superblock *sb;
        uint32_t new_devs = 0;
        uint32_t rebuilds = 0;
        mdk_rdev_t *r, *t;
        struct dm_raid_superblock *sb2;

        sb = page_address(rdev->sb_page);
        events_sb = le64_to_cpu(sb->events);
        failed_devices = le64_to_cpu(sb->failed_devices);

        /*
         * Initialise to 1 if this is a new superblock.
         */
        mddev->events = events_sb ? : 1;

        /*
         * Reshaping is not currently allowed
         */
        if ((le32_to_cpu(sb->level) != mddev->level) ||
            (le32_to_cpu(sb->layout) != mddev->layout) ||
            (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
                DMERR("Reshaping arrays not yet supported.");
                return -EINVAL;
        }

        /* We can only change the number of devices in RAID1 right now */
        if ((rs->raid_type->level != 1) &&
            (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
                DMERR("Reshaping arrays not yet supported.");
                return -EINVAL;
        }

        if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
                mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);

        /*
         * During load, we set FirstUse if a new superblock was written.
         * There are two reasons we might not have a superblock:
         * 1) The array is brand new - in which case, all of the
         *    devices must have their In_sync bit set.  Also,
         *    recovery_cp must be 0, unless forced.
         * 2) This is a new device being added to an old array
         *    and the new device needs to be rebuilt - in which
         *    case the In_sync bit will /not/ be set and
         *    recovery_cp must be MaxSector.
         */
        rdev_for_each(r, t, mddev) {
                if (!test_bit(In_sync, &r->flags)) {
                        if (!test_bit(FirstUse, &r->flags))
                                DMERR("Superblock area of "
                                      "rebuild device %d should have been "
                                      "cleared.", r->raid_disk);
                        set_bit(FirstUse, &r->flags);
                        rebuilds++;
                } else if (test_bit(FirstUse, &r->flags))
                        new_devs++;
        }

        if (!rebuilds) {
                if (new_devs == mddev->raid_disks) {
                        DMINFO("Superblocks created for new array");
                        set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
                } else if (new_devs) {
                        DMERR("New device injected "
                              "into existing array without 'rebuild' "
                              "parameter specified");
                        return -EINVAL;
                }
        } else if (new_devs) {
                DMERR("'rebuild' devices cannot be "
                      "injected into an array with other first-time devices");
                return -EINVAL;
        } else if (mddev->recovery_cp != MaxSector) {
                DMERR("'rebuild' specified while array is not in-sync");
                return -EINVAL;
        }

        /*
         * Now we set the Faulty bit for those devices that are
         * recorded in the superblock as failed.
         */
        rdev_for_each(r, t, mddev) {
                if (!r->sb_page)
                        continue;
                sb2 = page_address(r->sb_page);
                sb2->failed_devices = 0;

                /*
                 * Check for any device re-ordering.
                 */
                if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
                        role = le32_to_cpu(sb2->array_position);
                        if (role != r->raid_disk) {
                                if (rs->raid_type->level != 1) {
                                        rs->ti->error = "Cannot change device "
                                                        "positions in RAID array";
                                        return -EINVAL;
                                }
                                DMINFO("RAID1 device #%d now at position #%d",
                                       role, r->raid_disk);
                        }

                        /*
                         * Partial recovery is performed on
                         * returning failed devices.
                         */
                        if (failed_devices & (1 << role))
                                set_bit(Faulty, &r->flags);
                }
        }

        return 0;
}
static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct dm_raid_superblock *sb = page_address(rdev->sb_page);

        /*
         * If mddev->events is not set, we know we have not yet initialized
         * the array.
         */
        if (!mddev->events && super_init_validation(mddev, rdev))
                return -EINVAL;

        mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
        rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
        if (!test_bit(FirstUse, &rdev->flags)) {
                rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
                if (rdev->recovery_offset != MaxSector)
                        clear_bit(In_sync, &rdev->flags);
        }

        /*
         * If a device comes back, set it as not In_sync and no longer faulty.
         */
        if (test_bit(Faulty, &rdev->flags)) {
                clear_bit(Faulty, &rdev->flags);
                clear_bit(In_sync, &rdev->flags);
                rdev->saved_raid_disk = rdev->raid_disk;
                rdev->recovery_offset = 0;
        }

        clear_bit(FirstUse, &rdev->flags);

        return 0;
}
/*
 * Analyse superblocks and select the freshest.
 */
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
        int ret;
        mdk_rdev_t *rdev, *freshest, *tmp;
        mddev_t *mddev = &rs->md;

        freshest = NULL;
        rdev_for_each(rdev, tmp, mddev) {
                if (!rdev->meta_bdev)
                        continue;

                ret = super_load(rdev, freshest);

                switch (ret) {
                case 1:
                        freshest = rdev;
                        break;
                case 0:
                        break;
                default:
                        ti->error = "Failed to load superblock";
                        return ret;
                }
        }

        if (!freshest)
                return 0;

        /*
         * Validation of the freshest device provides the source of
         * validation for the remaining devices.
         */
        ti->error = "Unable to assemble array: Invalid superblocks";
        if (super_validate(mddev, freshest))
                return -EINVAL;

        rdev_for_each(rdev, tmp, mddev)
                if ((rdev != freshest) && super_validate(mddev, rdev))
                        return -EINVAL;

        return 0;
}
/*
 * Construct a RAID4/5/6 mapping:
 *      <raid_type> <#raid_params> <raid_params> \
 *      <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
 *
 * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
 * details on possible <raid_params>.
 */
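/*
 * Illustrative only (not from the original source): a hypothetical table
 * line for a 3-drive raid5 set with no metadata devices could be
 *
 *      0 976562500 raid raid5_ls 3 128 region_size 1024 3 - 8:17 - 8:33 - 8:49
 *
 * i.e. <raid_type> raid5_ls, 3 raid params ("128 region_size 1024"),
 * then 3 raid devices, each given as "<meta_dev> <data_dev>" with '-'
 * standing in for the missing metadata devices.
 */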
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        int ret;
        struct raid_type *rt;
        unsigned long num_raid_params, num_raid_devs;
        struct raid_set *rs = NULL;

        /* Must have at least <raid_type> <#raid_params> */
        if (argc < 2) {
                ti->error = "Too few arguments";
                return -EINVAL;
        }

        /* raid type */
        rt = get_raid_type(argv[0]);
        if (!rt) {
                ti->error = "Unrecognised raid_type";
                return -EINVAL;
        }
        argc--;
        argv++;

        /* number of RAID parameters */
        if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
                ti->error = "Cannot understand number of RAID parameters";
                return -EINVAL;
        }
        argc--;
        argv++;

        /* Skip over RAID params for now and find out # of devices */
        if (num_raid_params + 1 > argc) {
                ti->error = "Arguments do not agree with counts given";
                return -EINVAL;
        }

        if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
            (num_raid_devs >= INT_MAX)) {
                ti->error = "Cannot understand number of raid devices";
                return -EINVAL;
        }

        rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
        if (IS_ERR(rs))
                return PTR_ERR(rs);

        ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
        if (ret)
                goto bad;

        ret = -EINVAL;

        argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
        argv += num_raid_params + 1;

        if (argc != (num_raid_devs * 2)) {
                ti->error = "Supplied RAID devices does not match the count given";
                goto bad;
        }

        ret = dev_parms(rs, argv);
        if (ret)
                goto bad;

        rs->md.sync_super = super_sync;
        ret = analyse_superblocks(ti, rs);
        if (ret)
                goto bad;

        INIT_WORK(&rs->md.event_work, do_table_event);
        ti->private = rs;

        mutex_lock(&rs->md.reconfig_mutex);
        ret = md_run(&rs->md);
        rs->md.in_sync = 0; /* Assume already marked dirty */
        mutex_unlock(&rs->md.reconfig_mutex);

        if (ret) {
                ti->error = "Fail to run raid array";
                goto bad;
        }

        rs->callbacks.congested_fn = raid_is_congested;
        dm_table_add_target_callbacks(ti->table, &rs->callbacks);

        mddev_suspend(&rs->md);
        return 0;

bad:
        context_free(rs);

        return ret;
}
static void raid_dtr(struct dm_target *ti)
{
        struct raid_set *rs = ti->private;

        list_del_init(&rs->callbacks.list);
        md_stop(&rs->md);
        context_free(rs);
}
static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
{
        struct raid_set *rs = ti->private;
        mddev_t *mddev = &rs->md;

        mddev->pers->make_request(mddev, bio);

        return DM_MAPIO_SUBMITTED;
}
static int raid_status(struct dm_target *ti, status_type_t type,
                       char *result, unsigned maxlen)
{
        struct raid_set *rs = ti->private;
        unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
        unsigned sz = 0;
        int i;
        sector_t sync;

        switch (type) {
        case STATUSTYPE_INFO:
                DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);

                for (i = 0; i < rs->md.raid_disks; i++) {
                        if (test_bit(Faulty, &rs->dev[i].rdev.flags))
                                DMEMIT("D");
                        else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
                                DMEMIT("A");
                        else
                                DMEMIT("a");
                }

                if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
                        sync = rs->md.curr_resync_completed;
                else
                        sync = rs->md.recovery_cp;

                if (sync > rs->md.resync_max_sectors)
                        sync = rs->md.resync_max_sectors;

                DMEMIT(" %llu/%llu",
                       (unsigned long long) sync,
                       (unsigned long long) rs->md.resync_max_sectors);

                break;
        case STATUSTYPE_TABLE:
                /* The string you would use to construct this array */
                for (i = 0; i < rs->md.raid_disks; i++) {
                        if ((rs->print_flags & DMPF_REBUILD) &&
                            rs->dev[i].data_dev &&
                            !test_bit(In_sync, &rs->dev[i].rdev.flags))
                                raid_param_cnt += 2; /* for rebuilds */
                        if (rs->dev[i].data_dev &&
                            test_bit(WriteMostly, &rs->dev[i].rdev.flags))
                                raid_param_cnt += 2;
                }

                raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
                if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
                        raid_param_cnt--;

                DMEMIT("%s %u %u", rs->raid_type->name,
                       raid_param_cnt, rs->md.chunk_sectors);

                if ((rs->print_flags & DMPF_SYNC) &&
                    (rs->md.recovery_cp == MaxSector))
                        DMEMIT(" sync");
                if (rs->print_flags & DMPF_NOSYNC)
                        DMEMIT(" nosync");

                for (i = 0; i < rs->md.raid_disks; i++)
                        if ((rs->print_flags & DMPF_REBUILD) &&
                            rs->dev[i].data_dev &&
                            !test_bit(In_sync, &rs->dev[i].rdev.flags))
                                DMEMIT(" rebuild %u", i);

                if (rs->print_flags & DMPF_DAEMON_SLEEP)
                        DMEMIT(" daemon_sleep %lu",
                               rs->md.bitmap_info.daemon_sleep);

                if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
                        DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);

                if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
                        DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);

                for (i = 0; i < rs->md.raid_disks; i++)
                        if (rs->dev[i].data_dev &&
                            test_bit(WriteMostly, &rs->dev[i].rdev.flags))
                                DMEMIT(" write_mostly %u", i);

                if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
                        DMEMIT(" max_write_behind %lu",
                               rs->md.bitmap_info.max_write_behind);

                if (rs->print_flags & DMPF_STRIPE_CACHE) {
                        raid5_conf_t *conf = rs->md.private;

                        /* convert from kiB to sectors */
                        DMEMIT(" stripe_cache %d",
                               conf ? conf->max_nr_stripes * 2 : 0);
                }

                if (rs->print_flags & DMPF_REGION_SIZE)
                        DMEMIT(" region_size %lu",
                               rs->md.bitmap_info.chunksize >> 9);

                DMEMIT(" %d", rs->md.raid_disks);
                for (i = 0; i < rs->md.raid_disks; i++) {
                        if (rs->dev[i].meta_dev)
                                DMEMIT(" %s", rs->dev[i].meta_dev->name);
                        else
                                DMEMIT(" -");

                        if (rs->dev[i].data_dev)
                                DMEMIT(" %s", rs->dev[i].data_dev->name);
                        else
                                DMEMIT(" -");
                }
        }

        return 0;
}
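/*
 * Illustrative only (not from the original source): for a healthy,
 * fully-synced 3-drive raid5 set the STATUSTYPE_INFO output above would
 * look something like
 *
 *      raid5_ls 3 AAA 976562500/976562500
 *
 * one character per device ('A' in-sync, 'a' alive but not in-sync,
 * 'D' dead/failed), followed by <resync'd sectors>/<total sectors>.
 */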
static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
{
        struct raid_set *rs = ti->private;
        unsigned i;
        int ret = 0;

        for (i = 0; !ret && i < rs->md.raid_disks; i++)
                if (rs->dev[i].data_dev)
                        ret = fn(ti,
                                 rs->dev[i].data_dev,
                                 0, /* No offset on data devs */
                                 rs->md.dev_sectors,
                                 data);

        return ret;
}
static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
        struct raid_set *rs = ti->private;
        unsigned chunk_size = rs->md.chunk_sectors << 9;
        raid5_conf_t *conf = rs->md.private;

        blk_limits_io_min(limits, chunk_size);
        blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
}
static void raid_presuspend(struct dm_target *ti)
{
        struct raid_set *rs = ti->private;

        md_stop_writes(&rs->md);
}
static void raid_postsuspend(struct dm_target *ti)
{
        struct raid_set *rs = ti->private;

        mddev_suspend(&rs->md);
}
static void raid_resume(struct dm_target *ti)
{
        struct raid_set *rs = ti->private;

        bitmap_load(&rs->md);
        mddev_resume(&rs->md);
}
static struct target_type raid_target = {
        .name = "raid",
        .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = raid_ctr,
        .dtr = raid_dtr,
        .map = raid_map,
        .status = raid_status,
        .iterate_devices = raid_iterate_devices,
        .io_hints = raid_io_hints,
        .presuspend = raid_presuspend,
        .postsuspend = raid_postsuspend,
        .resume = raid_resume,
};
static int __init dm_raid_init(void)
{
        return dm_register_target(&raid_target);
}

static void __exit dm_raid_exit(void)
{
        dm_unregister_target(&raid_target);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);
MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");
MODULE_ALIAS("dm-raid6");
MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");