2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumvar.h,v 1.24 2000/03/01 02:34:57 grog Exp grog $
41 * $FreeBSD: src/sys/dev/vinum/vinumvar.h,v 1.32.2.4 2001/05/28 05:56:27 grog Exp $
42 * $DragonFly: src/sys/dev/raid/vinum/vinumvar.h,v 1.10 2007/05/15 17:50:56 dillon Exp $
46 #include "vinumstate.h"
49 * Some configuration maxima. They're an enum because
50 * we can't define global constants. Sorry about that.
52 * These aren't as bad as they look: most of them are soft limits.
57 VINUM_HEADER
= 512, /* size of header on disk */
58 MAXCONFIGLINE
= 1024, /* maximum size of a single config line */
59 MINVINUMSLICE
= 1048576, /* minimum size of a slice */
61 VINUM_CDEV_MAJOR
= 91, /* major number for character device */
63 ROUND_ROBIN_READPOL
= -1, /* round robin read policy */
65 /* type field in minor number */
66 VINUM_VOLUME_TYPE
= 0,
70 VINUM_SUPERDEV_TYPE
= 4, /* super device. */
71 VINUM_RAWPLEX_TYPE
= 5, /* anonymous plex */
72 VINUM_RAWSD_TYPE
= 6, /* anonymous subdisk */
74 /* Shifts for the individual fields in the device */
75 VINUM_TYPE_SHIFT
= 28,
77 VINUM_PLEX_SHIFT
= 16,
84 * Shifts for the second half of raw plex and
87 VINUM_RAWPLEX_SHIFT
= 8, /* shift the second half this much */
88 VINUM_RAWPLEX_WIDTH
= 12, /* width of second half */
92 MAXPLEX
= 8, /* maximum number of plexes in a volume */
93 MAXSD
= 256, /* maximum number of subdisks in a plex */
94 MAXDRIVENAME
= 32, /* maximum length of a device name */
95 MAXSDNAME
= 64, /* maximum length of a subdisk name */
96 MAXPLEXNAME
= 64, /* maximum length of a plex name */
97 MAXVOLNAME
= 64, /* maximum length of a volume name */
98 MAXNAME
= 64, /* maximum length of any name */
/*
 * Define a minor device number from a volume number (v), a plex
 * number (p), a subdisk number (s) and an object type (t).  Each
 * value is shifted into its field and the fields are ORed
 * together.  No masking is done, so each argument must already
 * fit into its field.
 *
 * This is not used directly; instead, it's called by the other
 * macros.  The arguments are parenthesized so that expressions
 * containing low-precedence operators (|, &, ?:) can be passed
 * without changing the result.
 */
#define VINUMMINOR(v,p,s,t)	( ((v) << VINUM_VOL_SHIFT)	\
				| ((p) << VINUM_PLEX_SHIFT)	\
				| ((s) << VINUM_SD_SHIFT)	\
				| ((t) << VINUM_TYPE_SHIFT) )
111 /* Create device minor numbers */
115 #define VINUMDEV(v,p,s,t) \
116 make_adhoc_dev (&vinum_ops, VINUMMINOR (v, p, s, t))
118 #define VINUM_PLEX(p) \
119 make_adhoc_dev (&vinum_ops, \
120 (VINUM_RAWPLEX_TYPE << VINUM_TYPE_SHIFT) \
122 | ((p & ~0xff) << 8) )
124 #define VINUM_SD(s) \
125 make_adhoc_dev (&vinum_ops, \
126 (VINUM_RAWSD_TYPE << VINUM_TYPE_SHIFT) \
128 | ((s & ~0xff) << 8) )
132 #define VINUMDEV(v,p,s,t) \
133 makedev(VINUM_CDEV_MAJOR, VINUMMINOR (v, p, s, t))
135 #define VINUM_PLEX(p) \
136 makedev(VINUM_CDEV_MAJOR, \
137 (VINUM_RAWPLEX_TYPE << VINUM_TYPE_SHIFT) \
139 | ((p & ~0xff) << 8) )
141 #define VINUM_SD(s) \
142 makedev(VINUM_CDEV_MAJOR, \
143 (VINUM_RAWSD_TYPE << VINUM_TYPE_SHIFT) \
145 | ((s & ~0xff) << 8) )
/*
 * Create a bit mask for x bits.  The shift is done on a plain int,
 * so x must be less than the number of value bits in an int.
 */
#define MASK(x)	((1 << (x)) - 1)

/*
 * Create a raw device minor number for object number d of type t.
 *
 * The low VINUM_VOL_WIDTH bits of d go into the volume field.  The
 * remaining high-order bits are moved up by VINUM_RAWPLEX_SHIFT,
 * which is the same encoding that VINUM_PLEX and VINUM_SD use for
 * the second half of a raw plex or subdisk number.  Shifting them
 * by (VINUM_PLEX_SHIFT + VINUM_VOL_WIDTH) instead would push them
 * beyond the top of a 32 bit minor number.
 *
 * The arguments are parenthesized so that expressions can be
 * passed safely.  Note that d is evaluated twice.
 */
#define VINUMRMINOR(d,t)	( (((d) & MASK (VINUM_VOL_WIDTH)) << VINUM_VOL_SHIFT)	\
				| (((d) & ~MASK (VINUM_VOL_WIDTH))			\
				   << VINUM_RAWPLEX_SHIFT)				\
				| ((t) << VINUM_TYPE_SHIFT) )
158 /* Extract the device type of x: the 3 bit field starting at VINUM_TYPE_SHIFT in its minor number */
159 #define DEVTYPE(x) ((minor (x) >> VINUM_TYPE_SHIFT) & 7)
162 * This mess is used to catch people who compile
163 * a debug vinum(8) and non-debug kernel module,
164 * or the other way round.
168 #define VINUM_SUPERDEV VINUMMINOR (1, 0, 0, VINUM_SUPERDEV_TYPE) /* superdevice number */
169 #define VINUM_WRONGSUPERDEV VINUMMINOR (2, 0, 0, VINUM_SUPERDEV_TYPE) /* non-debug superdevice number */
171 #define VINUM_SUPERDEV VINUMMINOR (2, 0, 0, VINUM_SUPERDEV_TYPE) /* superdevice number */
172 #define VINUM_WRONGSUPERDEV VINUMMINOR (1, 0, 0, VINUM_SUPERDEV_TYPE) /* debug superdevice number */
175 #define VINUM_DAEMON_DEV VINUMMINOR (0, 0, 0, VINUM_SUPERDEV_TYPE) /* daemon superdevice number */
178 * the number of object entries to cater for initially, and also the
179 * value by which they are incremented. It doesn't take long
180 * to extend them, so theoretically we could start with 1 of each, but
181 * it's untidy to allocate such small areas. These values are
182 * probably too small.
188 INITIAL_SUBDISKS
= 16,
189 INITIAL_SUBDISKS_IN_PLEX
= 4, /* number of subdisks to allocate to a plex */
190 INITIAL_SUBDISKS_IN_DRIVE
= 4, /* number of subdisks to allocate to a drive */
191 INITIAL_DRIVE_FREELIST
= 16, /* number of entries in drive freelist */
192 PLEX_REGION_TABLE_SIZE
= 8, /* number of entries in plex region tables */
193 PLEX_LOCKS
= 256, /* number of locks to allocate to a plex */
194 MAX_REVIVE_BLOCKSIZE
= MAXPHYS
, /* maximum revive block size */
195 DEFAULT_REVIVE_BLOCKSIZE
= 65536, /* default revive block size */
196 VINUMHOSTNAMELEN
= 32, /* host name field in label */
202 * 31 30 28 27 20 19 18 16 15 8 7 0
203 * |-----------------------------------------------------------------------------------------------|
204 * |X | Type | Subdisk number | X| Plex | Major number | volume number |
205 * |-----------------------------------------------------------------------------------------------|
209 * The fields in the minor number are interpreted as follows:
211 * Volume: Only type and volume number are relevant
212 * Plex in volume: type, plex number in volume and volume number are relevant
213 * raw plex: type, plex number is made of bits 27-16 and 7-0
214 * raw subdisk: type, subdisk number is made of bits 27-16 and 7-0
217 /* This doesn't get used. Consider removing it. */
220 * CARE. These fields assume a big-endian word. On a
221 * little-endian system, they're the wrong way around
223 unsigned volume
:8; /* up to 256 volumes */
224 unsigned major
:8; /* this is where the major number fits */
225 unsigned plex
:3; /* up to 8 plexes per volume */
226 unsigned unused
:1; /* up for grabs */
227 unsigned sd
:8; /* up to 256 subdisks per plex */
228 unsigned type
:3; /* type of object */
238 unsigned signbit
:1; /* to make 32 bits */
241 #define VINUM_DIR "/dev/vinum"
244 * These definitions help catch
245 * userland/kernel mismatches.
248 #define VINUM_WRONGSUPERDEV_NAME VINUM_DIR"/control" /* normal super device */
249 #define VINUM_SUPERDEV_NAME VINUM_DIR"/Control" /* debug super device */
251 #define VINUM_WRONGSUPERDEV_NAME VINUM_DIR"/Control" /* debug super device */
252 #define VINUM_SUPERDEV_NAME VINUM_DIR"/control" /* normal super device */
254 #define VINUM_DAEMON_DEV_NAME VINUM_DIR"/controld" /* super device for daemon only */
257 * Flags for all objects. Most of them only apply to
258 * specific objects, but we have space for all in any
262 VF_LOCKED
= 1, /* somebody has locked access to this object */
263 VF_LOCKING
= 2, /* we want access to this object */
264 VF_OPEN
= 4, /* object has openers */
265 VF_WRITETHROUGH
= 8, /* volume: write through */
266 VF_INITED
= 0x10, /* unit has been initialized */
267 VF_WLABEL
= 0x20, /* label area is writable */
268 VF_LABELLING
= 0x40, /* unit is currently being labelled */
269 VF_WANTED
= 0x80, /* someone is waiting to obtain a lock */
270 VF_RAW
= 0x100, /* raw volume (no file system) */
271 VF_LOADED
= 0x200, /* module is loaded */
272 VF_CONFIGURING
= 0x400, /* somebody is changing the config */
273 VF_WILL_CONFIGURE
= 0x800, /* somebody wants to change the config */
274 VF_CONFIG_INCOMPLETE
= 0x1000, /* haven't finished changing the config */
275 VF_CONFIG_SETUPSTATE
= 0x2000, /* set a volume up if all plexes are empty */
276 VF_READING_CONFIG
= 0x4000, /* we're reading config database from disk */
277 VF_FORCECONFIG
= 0x8000, /* configure drives even with different names */
278 VF_NEWBORN
= 0x10000, /* for objects: we've just created it */
279 VF_CONFIGURED
= 0x20000, /* for drives: we read the config */
280 VF_STOPPING
= 0x40000, /* for vinum_conf: stop on last close */
281 VF_DAEMONOPEN
= 0x80000, /* the daemon has us open (only superdev) */
282 VF_CREATED
= 0x100000, /* for volumes: freshly created, more than new */
283 VF_HOTSPARE
= 0x200000, /* for drives: use as hot spare */
284 VF_RETRYERRORS
= 0x400000, /* don't down subdisks on I/O errors */
287 /* Global configuration information for the vinum subsystem */
289 /* Pointers to vinum structures */
293 struct volume
*volume
;
295 /* the number allocated */
296 int drives_allocated
;
297 int subdisks_allocated
;
298 int plexes_allocated
;
299 int volumes_allocated
;
301 /* and the number currently in use */
309 #define VINUM_MAXACTIVE 30000 /* maximum number of active requests */
310 int active
; /* current number of requests outstanding */
311 int maxactive
; /* maximum number of requests ever outstanding */
313 struct request
*lastrq
;
319 /* Use these defines to simplify code: each one is shorthand for a member of vinum_conf */
320 #define DRIVE vinum_conf.drive
321 #define SD vinum_conf.sd
322 #define PLEX vinum_conf.plex
323 #define VOL vinum_conf.volume
324 #define VFLAGS vinum_conf.flags
329 * Vinum drives start with this structure:
332 * |--------------------------------------|
333 * | PDP-11 memorial boot block | 0
334 * |--------------------------------------|
335 * | Disk label, maybe | 1
336 * |--------------------------------------|
337 * | Slice definition (vinum_hdr) | 8
338 * |--------------------------------------|
340 * | Configuration info, first copy | 9
342 * |--------------------------------------|
344 * | Configuration info, second copy | 9 + size of config
346 * |--------------------------------------|
349 /* Sizes and offsets of our information */
351 VINUM_LABEL_OFFSET
= 4096, /* offset of vinum label */
352 VINUMHEADERLEN
= 512, /* size of vinum label */
353 VINUM_CONFIG_OFFSET
= 4608, /* offset of first config copy */
354 MAXCONFIG
= 65536, /* and size of config copy */
355 DATASTART
= (MAXCONFIG
* 2 + VINUM_CONFIG_OFFSET
) / DEV_BSIZE
/* this is where the data starts */
359 * hostname is 256 bytes long, but we don't need to shlep
360 * multiple copies in vinum. We use the host name just
361 * to identify this system, and 32 bytes should be ample
366 char sysname
[VINUMHOSTNAMELEN
]; /* system name at time of creation */
367 char name
[MAXDRIVENAME
]; /* our name of the drive */
368 struct timeval date_of_birth
; /* the time it was created */
369 struct timeval last_update
; /* and the time of last update */
371 * total size in bytes of the drive. This value
372 * includes the headers.
378 uint64_t magic
; /* we're long on magic numbers */
379 #define VINUM_MAGIC 22322600044678729LL /* should be this */
380 #define VINUM_NOMAGIC 22322600044678990LL /* becomes this after obliteration */
382 * Size in bytes of each copy of the
383 * configuration info. This must be a multiple
384 * of the sector size.
387 struct vinum_label label
; /* unique label */
390 /* Information returned from read_drive_label */
391 enum drive_label_info
{
392 DL_CANT_OPEN
, /* invalid partition */
393 DL_NOT_OURS
, /* valid partition, but no vinum label */
394 DL_DELETED_LABEL
, /* valid partition, deleted label found */
395 DL_WRONG_DRIVE
, /* drive name doesn't match */
396 DL_OURS
/* valid partition and label found */
399 /*** Drive definitions ***/
401 * A drive corresponds to a disk slice. We use a different term to show
402 * the difference in usage: it doesn't have to be a slice, and could
403 * theoretically be a complete, unpartitioned disk
407 char devicename
[MAXDRIVENAME
]; /* name of the slice it's on */
408 enum drivestate state
; /* current state */
409 int flags
; /* flags */
410 int subdisks_allocated
; /* number of entries in sd */
411 int subdisks_used
; /* and the number used */
412 int blocksize
; /* size of fs blocks */
413 int pid
; /* of locker */
414 u_int64_t sectors_available
; /* number of sectors still available */
416 int lasterror
; /* last error on drive */
417 int driveno
; /* index of drive in vinum_conf */
418 int opencount
; /* number of up subdisks */
419 u_int64_t reads
; /* number of reads on this drive */
420 u_int64_t writes
; /* number of writes on this drive */
421 u_int64_t bytes_read
; /* number of bytes read */
422 u_int64_t bytes_written
; /* number of bytes written */
423 struct vinum_label label
; /* and the label information */
424 #define DRIVE_MAXACTIVE 30000 /* maximum number of active requests */
425 int active
; /* current number of requests outstanding */
426 int maxactive
; /* maximum number of requests ever outstanding */
427 int freelist_size
; /* number of entries alloced in free list */
428 int freelist_entries
; /* number of entries used in free list */
429 struct drive_freelist
{ /* sorted list of free space on drive */
430 u_int64_t offset
; /* offset of entry */
431 u_int64_t sectors
; /* and length in sectors */
433 struct partinfo partinfo
; /* partition information */
434 /* XXX kludge until we get this struct cleaned up */
436 cdev_t dev
; /* device information */
438 char dev
[sizeof (int *)];
441 char lockfilename
[16]; /* name of file from which we were locked */
442 int lockline
; /* and the line number */
446 /*** Subdisk definitions ***/
449 char name
[MAXSDNAME
]; /* name of subdisk */
450 enum sdstate state
; /* state */
452 int lasterror
; /* last error occurred */
453 /* offsets in blocks */
454 int64_t driveoffset
; /* offset on drive */
456 * plexoffset is the offset from the beginning
457 * of the plex to the very first part of the
458 * subdisk, in sectors. For striped, RAID-4 and
459 * RAID-5 plexes, only the first stripe is
460 * located at this offset
462 int64_t plexoffset
; /* offset in plex */
463 u_int64_t sectors
; /* and length in sectors */
464 int plexno
; /* index of plex, if it belongs */
465 int driveno
; /* index of the drive on which it is located */
466 int sdno
; /* our index in vinum_conf */
467 int plexsdno
; /* and our number in our plex */
468 /* (undefined if no plex) */
469 u_int64_t reads
; /* number of reads on this subdisk */
470 u_int64_t writes
; /* number of writes on this subdisk */
471 u_int64_t bytes_read
; /* number of bytes read */
472 u_int64_t bytes_written
; /* number of bytes written */
473 /* revive parameters */
474 u_int64_t revived
; /* block number of current revive request */
475 int revive_blocksize
; /* revive block size (bytes) */
476 int revive_interval
; /* and time to wait between transfers */
477 pid_t reviver
; /* PID of reviving process */
478 /* init parameters */
479 u_int64_t initialized
; /* block number of current init request */
480 int init_blocksize
; /* init block size (bytes) */
481 int init_interval
; /* and time to wait between transfers */
482 struct request
*waitlist
; /* list of requests waiting on revive op */
485 /*** Plex definitions ***/
487 /* kinds of plex organization */
489 plex_disorg
, /* disorganized */
490 plex_concat
, /* concatenated plex */
491 plex_striped
, /* striped plex */
492 plex_raid4
, /* RAID4 plex */
493 plex_raid5
/* RAID5 plex */
/*
 * Recognize plex organizations.  Both tests rely on the order of
 * the plex organization enumerators: everything from plex_striped
 * upwards is striped, and everything from plex_raid4 upwards also
 * carries parity.
 *
 * p is a pointer to an object with an organization member.  It is
 * parenthesized so that expressions such as &plex or plexp + 1 can
 * be passed.
 */
#define isstriped(p) ((p)->organization >= plex_striped)	/* striped (RAID-0), RAID-4 or RAID-5 */
#define isparity(p) ((p)->organization >= plex_raid4)		/* RAID-4 or RAID-5 */
501 char name
[MAXPLEXNAME
]; /* name of plex */
502 enum plexorg organization
; /* Plex organization */
503 enum plexstate state
; /* and current state */
504 u_int64_t length
; /* total length of plex (sectors) */
506 int stripesize
; /* size of stripe or raid band, in sectors */
507 int subdisks
; /* number of associated subdisks */
508 int subdisks_allocated
; /* number of subdisks allocated space for */
509 int *sdnos
; /* list of component subdisks */
510 int plexno
; /* index of plex in vinum_conf */
511 int volno
; /* index of volume */
512 int volplexno
; /* number of plex in volume */
514 u_int64_t reads
; /* number of reads on this plex */
515 u_int64_t writes
; /* number of writes on this plex */
516 u_int64_t bytes_read
; /* number of bytes read */
517 u_int64_t bytes_written
; /* number of bytes written */
518 u_int64_t recovered_reads
; /* number of recovered read operations */
519 u_int64_t degraded_writes
; /* number of degraded writes */
520 u_int64_t parityless_writes
; /* number of parityless writes */
521 u_int64_t multiblock
; /* requests that needed more than one block */
522 u_int64_t multistripe
; /* requests that needed more than one stripe */
523 int sddowncount
; /* number of subdisks down */
524 /* Lock information */
525 int usedlocks
; /* number currently in use */
526 int lockwaits
; /* and number of waits for locks */
527 off_t checkblock
; /* block number for parity op */
528 struct rangelock
*lock
; /* ranges of locked addresses */
531 /*** Volume definitions ***/
533 /* Address range definitions, for locking volumes */
535 daddr_t stripe
; /* address + 1 of the range being locked */
536 struct buf
*bp
; /* user's buffer pointer */
540 char name
[MAXVOLNAME
]; /* name of volume */
541 enum volumestate state
; /* current state */
542 int plexes
; /* number of plexes */
543 int preferred_plex
; /* plex to read from, -1 for round-robin */
545 * index of plex used for last read, for
549 int volno
; /* volume number */
550 int flags
; /* status and configuration flags */
551 int openflags
; /* flags supplied to last open(2) */
552 u_int64_t size
; /* size of volume */
553 int blocksize
; /* logical block size */
554 int active
; /* number of outstanding requests active */
555 int subops
; /* and the number of suboperations */
557 u_int64_t bytes_read
; /* number of bytes read */
558 u_int64_t bytes_written
; /* number of bytes written */
559 u_int64_t reads
; /* number of reads on this volume */
560 u_int64_t writes
; /* number of writes on this volume */
561 u_int64_t recovered_reads
; /* reads recovered from another plex */
563 * Unlike subdisks in the plex, space for the
564 * plex pointers is static.
566 int plex
[MAXPLEX
]; /* index of plexes */
570 * Table expansion. Expand table, which contains oldcount
571 * entries of type element, by increment entries, and change
572 * oldcount accordingly
574 #define EXPAND(table, element, oldcount, increment) \
576 expand_table ((void **) &table, \
577 oldcount * sizeof (element), \
578 (oldcount + increment) * sizeof (element) ); \
579 oldcount += increment; \
582 /* Information on vinum's memory usage */
584 int mallocs
; /* number of malloced blocks */
585 int total_malloced
; /* total amount malloced */
586 int highwater
; /* maximum number of mallocs */
587 struct mc
*malloced
; /* pointer to kernel table */
590 #define MCFILENAMELEN 16
597 char file
[MCFILENAMELEN
];
601 * These enums are used by the state transition
602 * routines. They're in bit map format:
604 * Bit 0: Other plexes in the volume are down
605 * Bit 1: Other plexes in the volume are up
606 * Bit 2: The current plex is up
607 * Maybe they should be local to
611 volplex_onlyusdown
= 0, /* 0: we're the only plex, and we're down */
612 volplex_alldown
, /* 1: another plex is down, and so are we */
613 volplex_otherup
, /* 2: another plex is up */
614 volplex_otherupdown
, /* 3: other plexes are up and down */
615 volplex_onlyus
, /* 4: we're up and alone */
616 volplex_onlyusup
, /* 5: only we are up, others are down */
617 volplex_allup
, /* 6: all plexes are up */
618 volplex_someup
/* 7: some plexes are up, including us */
621 /* state map for plex */
624 sd_downstate
= 2, /* SD is down */
625 sd_crashedstate
= 4, /* SD is crashed */
626 sd_obsoletestate
= 8, /* SD is obsolete */
627 sd_stalestate
= 16, /* SD is stale */
628 sd_rebornstate
= 32, /* SD is reborn */
629 sd_upstate
= 64, /* SD is up */
630 sd_initstate
= 128, /* SD is initializing */
631 sd_initializedstate
= 256, /* SD is initialized */
632 sd_otherstate
= 512, /* SD is in some other state */
636 * This is really just a parameter to pass to
637 * set_<foo>_state, but since it needs to be known
638 * in the external definitions, we need to define
642 setstate_none
= 0, /* no flags */
643 setstate_force
= 1, /* force the state change */
644 setstate_configuring
= 2, /* we're currently configuring, don't save */
647 /* Operations for parityops to perform. */
651 rebuildandcheckparity
, /* rebuildparity with the -v option */
655 /* Debugging stuff */
657 DEBUG_ADDRESSES
= 1, /* show buffer information during requests */
658 DEBUG_NUMOUTPUT
= 2, /* show the value of vp->v_numoutput */
659 DEBUG_RESID
= 4, /* go into debugger in complete_rqe */
660 DEBUG_LASTREQS
= 8, /* keep a circular buffer of last requests */
661 DEBUG_REVIVECONFLICT
= 16, /* print info about revive conflicts */
662 DEBUG_EOFINFO
= 32, /* print info about EOF detection */
663 DEBUG_MEMFREE
= 64, /* keep info about Frees */
664 DEBUG_BIGDRIVE
= 128, /* pretend our drives are 100 times the size */
665 DEBUG_REMOTEGDB
= 256, /* go into remote gdb */
666 DEBUG_WARNINGS
= 512, /* log various relatively harmless warnings */
671 #define longjmp LongJmp /* test our longjmps */
675 /* Local Variables: */
676 /* fill-column: 50 */