1948 zpool list should show more detailed pool information
[unleashed.git] / usr / src / uts / common / sys / lvm / md_mddb.h
blob0668d8c023c64d73b4f113d06ee70734d2e29930
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #ifndef _SYS_MD_MDDB_H
27 #define _SYS_MD_MDDB_H
29 #pragma ident "%Z%%M% %I% %E% SMI"
31 #include <sys/types.h>
32 #include <sys/buf.h>
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
38 #if 0 /* DRP FOR DEBUGGING */
39 #define MDDB_FAKE
40 #endif
42 /* Private flags */
43 #define MD_PRV_GOTIT 0x0001 /* Been snarfed */
44 #define MD_PRV_DELETE 0x0002 /* Record pending to be deleted */
45 #define MD_PRV_COMMIT 0x0004 /* Record pending to be committed */
46 #define MD_PRV_CLEANUP 0x0008 /* Record pending to be cleaned up */
47 #define MD_PRV_CONVD 0x0010 /* Record has been converted (32->64) */
48 #define MD_PRV_PENDDEL (MD_PRV_GOTIT | MD_PRV_DELETE) /* GOTIT and DELETE */
49 #define MD_PRV_PENDCOM (MD_PRV_GOTIT | MD_PRV_COMMIT) /* GOTIT and COMMIT */
50 #define MD_PRV_PENDCLEAN (MD_PRV_GOTIT | MD_PRV_CLEANUP) /* GOTIT and CLEANUP */
53 #define MDDB_E_INVALID (-1) /* an invalid argument was passed */
54 #define MDDB_E_EXISTS (-2) /* doing an operation a 2nd time which can */
55 /* only be done once */
56 #define MDDB_E_MASTER (-3) /* problem occurred accessing master block */
57 /* returned from NEW_DEV */
58 #define MDDB_E_TOOSMALL (-4) /* device is not large enough */
59 #define MDDB_E_NORECORD (-5) /* record does not exist */
61 * returned from: mddb_getnextrec
62 * mddb_getrecsize
63 * mddb_commitrec
64 * mddb_commitrecs
65 * mddb_deleterec
67 #define MDDB_E_NOSPACE (-6) /* no space to create record */
68 #define MDDB_E_NOTNOW (-7) /* do not presently have enough resources */
69 /* to perform requested operation */
70 #define MDDB_E_NODB (-8) /* no database exists */
71 #define MDDB_E_NOTOWNER (-9) /* have not been told to grab this set */
72 #define MDDB_E_STALE (-10) /* database is stale */
73 #define MDDB_E_TOOFEW (-11) /* not enough replicas available */
74 #define MDDB_E_TAGDATA (-12) /* tagged data detected */
75 #define MDDB_E_ACCOK (-13) /* 50/50 mode */
76 #define MDDB_E_NTAGDATA (-14) /* tagop try, no tag data */
77 #define MDDB_E_ACCNOTOK (-15) /* accop try, no accept possible */
78 #define MDDB_E_NOLOCBLK (-16) /* No valid locators found */
79 #define MDDB_E_NOLOCNMS (-17) /* No valid locator name information */
80 #define MDDB_E_NODIRBLK (-18) /* No directory blocks found */
81 #define MDDB_E_NOTAGREC (-19) /* No tag record blocks found */
82 #define MDDB_E_NOTAG (-20) /* No matching tag record found */
83 #define MDDB_E_NODEVID (-21) /* No device id found */
85 #define MDDB_MINBLKS 16 /* enough for a few metadevices */
86 #define MDDB_MAXBLKS 8192 /* size of free bit map (must be / 8) */
87 #define MDDB_MN_MINBLKS 32768 /* Multinode metadb minimum size */
88 /* 16MB */
89 #define MDDB_MN_MAXBLKS 524288 /* size of free bit map (must be / 8) */
90 /* 256MB */
92 #define MDDB_C_STALE 0x0001
93 #define MDDB_C_TOOFEW 0x0002
94 #define MDDB_C_NOTOWNER 0x0004
95 #define MDDB_C_SET_MN_STALE 0x0008 /* Set MN set to stale */
96 #define MDDB_C_IMPORT 0x0010
99 * Defines used to set/reset new master flag in set structure.
100 * Used during reconfig cycle to determine quickly if there is
101 * new master for the set.
103 #define MDDB_NM_SET 0x0001
104 #define MDDB_NM_RESET 0x0002
105 #define MDDB_NM_GET 0x0004
107 /* Definitions of flag in Locator Block Device ID data area - mddb_did_info */
108 #define MDDB_DID_EXISTS 0x0001 /* Device ID exists */
109 #define MDDB_DID_VALID 0x0002 /* Device ID valid on current system */
110 #define MDDB_DID_UPDATED 0x0004 /* locator/sidelocator info updated */
112 /* Definitions of flag in Locator Block - mddb_lb */
113 #define MDDB_DEVID_STYLE 0x0001 /* Locator Block in Device ID format */
114 #define MDDB_MNSET 0x0002 /* MDDB is for a multi-node set */
117 #define MDDB_MAX_PATCH 25 /* number of locations that */
118 /* can be patched in etc/system */
121 * Set struct used by all parts of the driver, to store anchor pointers.
123 * Locks associated with the fields in this structure:
125 * Some of the fields are accessible by both the single threaded ioctl thread
126 * and internal threads such as resync, hotsparing, etc. In this case
127 * additional protection is needed. For example, s_db is additionally
128 * protected by s_dbmx, and s_un and s_ui are protected by md_unit_array_rw.lock.
129 * s_nm, s_nmid, s_did_nm, s_did_nmid and s_dtp are protected by nm_lock.
130 * The rest of the fields are protected by md_mx. Two fields s_un_next and
131 * s_un_avail are introduced by the friendly name project and are ONLY
132 * accessible via a single threaded ioctl thread which already is protected
133 * by the ioctl lock and there is no need to add extra protection to them.
134 * However, in the future if they become accessible by other internal threads
135 * then an additional protection such as md_mx lock is highly recommended.
138 typedef struct md_set {
139 uint_t s_status; /* set status */
140 void **s_ui; /* set unit incore anchor */
141 void **s_un; /* set unit anchor */
142 void *s_hsp; /* set Hot Spare Pool anchor */
143 void *s_hs; /* set Hot Spare anchor */
144 void *s_db; /* set MDDB anchor */
145 kmutex_t s_dbmx; /* set MDDB mutex */
146 void *s_nm; /* set namespace anchor */
147 mddb_recid_t s_nmid; /* set namespace anchor record */
148 void *s_did_nm; /* set device id namespace anchor */
149 mddb_recid_t s_did_nmid; /* set device id namespace anchor rec */
150 void *s_dtp; /* set data tag rec */
151 int s_am_i_master; /* incore master flag for this node */
152 md_mn_nodeid_t s_nodeid; /* nodeid of this node - for MN sets */
153 uint_t s_rcnt; /* incore resync count for set */
154 unit_t s_un_next; /* s_un scan starts here */
155 unit_t s_un_avail; /* number of avail slots */
156 } md_set_t;
159 #define MDDB_MAGIC_MB 0x6d646d62 /* magic number for master blocks */
160 #define MDDB_MAGIC_DB 0x6d646462 /* magic number for directory blocks */
161 #define MDDB_MAGIC_RB 0x6d647262 /* magic number for record blocks */
162 #define MDDB_MAGIC_LB 0x6d646c62 /* magic number for locator blocks */
163 #define MDDB_MAGIC_LN 0x6d646c6e /* magic number for locator names */
164 #define MDDB_MAGIC_DT 0x6d646474 /* magic number for data tag */
165 #define MDDB_MAGIC_DI 0x6d646469 /* magic number for device ID block */
166 #define MDDB_MAGIC_DU 0x6d646475 /* magic num for dummy mb */
167 #define MDDB_MAGIC_DE 0x6d646465 /* magic num for mb devid */
169 #define MDDB_GLOBAL_XOR 1234567890
171 #define MDDB_REV_MAJOR (uint_t)0xff00
172 #define MDDB_REV_MINOR (uint_t)0x00ff
175 * MDDB_REV_MNMB:
176 * If a MN diskset, master block revision is set to MDDB_REV_MNMB.
177 * Even though the master block structure is no different
178 * for a MN set, setting the revision field to a different
179 * number keeps any pre-MN_diskset code from accessing
180 * this diskset. It also allows for an early determination
181 * of a MN diskset when reading in from disk so that the
182 * proper size locator block and locator names structure
183 * can be read in thus saving time on diskset startup.
184 * Since no change in master block structure, the MDDB_REV_MINOR
185 * portion of the revision was incremented.
187 * MDDB_REV_MNLB:
188 * If a MN diskset, the locator block structure is a different size in
189 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
190 * with any nodeid (sideno) allowed.
191 * The revision is set to MDDB_REV_MNLB which is a change of the
192 * MDDB_REV_MAJOR portion of the revision.
194 * MDDB_REV_MNLN:
195 * If a MN diskset, the locator names is a different size in
196 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
197 * with any nodeid (sideno) allowed.
198 * The revision is set to MDDB_REV_MNLN which is a change of the
199 * MDDB_REV_MAJOR portion of the revision.
201 * The record blocks have two binary properties. A record block can
202 * represent either a 32 or 64 bit unit. A record block can also represent
203 * a traditionally named unit or a friendly named unit. Thus, there are
204 * minor revisions of record block.
206 * Traditional Friendly
207 * Name Name
208 * ----------- --------
209 * 32 bit MDDB_REV_RB MDDB_REV_RBFN
210 * 64 bit MDDB_REV_RB64 MDDB_REV_RB64FN
213 #define MDDB_REV_MB (uint_t)0x0201
214 #define MDDB_REV_MNMB (uint_t)0x0202
215 #define MDDB_REV_DB (uint_t)0x0201
216 #define MDDB_REV_LB (uint_t)0x0500
217 #define MDDB_REV_MNLB (uint_t)0x0600
218 #define MDDB_REV_LN (uint_t)0x0100
219 #define MDDB_REV_MNLN (uint_t)0x0300
220 #define MDDB_REV_RB (uint_t)0x0200
221 #define MDDB_REV_RB64 (uint_t)0x0201
222 #define MDDB_REV_RBFN (uint_t)0x0202
223 #define MDDB_REV_RB64FN (uint_t)0x0203
224 #define MDDB_REV_DT (uint_t)0x0100
225 #define MDDB_REV_DI (uint_t)0x0100
228 * Transfer record block friendly name status to unit/hs structure.
/*
 * Transfer the record block's friendly-name status into a unit/hot spare
 * revision word.
 *
 *	rbv	record block revision, compared against the MDDB_REV_RB*
 *		values; evaluated exactly once.
 *	unv	modifiable lvalue.  MD_FN_META_DEV is cleared for
 *		MDDB_REV_RB and MDDB_REV_RB64, and set for MDDB_REV_RBFN
 *		and MDDB_REV_RB64FN.  Any other revision leaves unv
 *		untouched.
 *
 * The switch is wrapped in do/while (0) so the macro behaves as one
 * statement (safe in an unbraced if/else); invoke it with a trailing
 * semicolon.
 */
#define	MDDB_NOTE_FN(rbv, unv)	do {			\
	switch (rbv) {					\
	case MDDB_REV_RB:				\
	case MDDB_REV_RB64:				\
		(unv) &= ~MD_FN_META_DEV;		\
		break;					\
	case MDDB_REV_RBFN:				\
	case MDDB_REV_RB64FN:				\
		(unv) |= MD_FN_META_DEV;		\
		break;					\
	default:					\
		break;					\
	}						\
} while (0)
241 #define MDDB_BSIZE (uint_t)DEV_BSIZE
242 #define MDDB_PREFIXCNT 10
243 #define MDDB_DRVNMCNT 10
245 typedef int mddb_block_t;
247 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
248 #pragma pack(4)
249 #endif
250 typedef struct md_mnname_suffix {
251 md_name_suffix mn_ln_suffix;
252 uint_t mn_ln_sideno;
253 } md_mnname_suffix_t;
255 typedef struct mddb_ln {
256 int ln_magic;
257 uint_t ln_revision;
258 uint_t ln_checksum;
259 struct timeval32 ln_timestamp;
260 md_name_prefix ln_prefixes[MDDB_PREFIXCNT];
261 /* Don't change array sizes without changing RNDUP_BLKCNT */
262 md_name_suffix ln_suffixes[MD_MAXSIDES][MDDB_NLB];
263 } mddb_ln_t;
266 * Locator name structure for MN diskset. Same as for traditional
267 * and local diskset except that more sides are supported and the
268 * side number can be any number since the side number is stored
269 * in the ln_mnsuffixes structure instead of being used as an index
270 * into that array. This means that the whole array may need to be
271 * searched in order to find the correct information given a side number.
273 typedef struct mddb_mnln {
274 int ln_magic;
275 uint_t ln_revision;
276 uint_t ln_checksum;
277 struct timeval32 ln_timestamp;
278 md_name_prefix ln_prefixes[MDDB_PREFIXCNT];
279 /* Don't change array sizes without changing MDDB_MNLNCNT */
280 md_mnname_suffix_t ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB];
281 } mddb_mnln_t;
283 #define RNDUP_BLKCNT(sz, delta) (((sz) - \
284 ((delta) * \
285 ((MD_MAXSIDES - 1) * MDDB_NLB)) + \
286 MDDB_BSIZE - 1) / MDDB_BSIZE)
287 #define MDDB_LNCNT RNDUP_BLKCNT(sizeof (mddb_ln_t), 0)
288 #define MDDB_LOCAL_LNCNT RNDUP_BLKCNT(sizeof (mddb_ln_t), \
289 sizeof (md_name_suffix))
291 #define MDDB_MNLNCNT ((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \
292 / MDDB_BSIZE)
294 typedef struct mddb_dt {
295 uint_t dt_mag;
296 uint_t dt_rev;
297 uint_t dt_cks;
298 mddb_dtag_t dt_dtag;
299 } mddb_dt_t;
301 #define MDDB_DT_BYTES (roundup(sizeof (mddb_dt_t), MDDB_BSIZE))
302 #define MDDB_DT_BLOCKS (btodb(MDDB_DT_BYTES))
304 typedef union identifier {
305 char serial[MDDB_SN_LEN];
306 struct timeval32 createtime;
307 } identifier_t;
309 typedef struct mddb_locator {
310 dev32_t l_dev;
311 daddr32_t l_blkno;
312 int l_flags;
313 } mddb_locator_t;
315 typedef struct mddb_sidelocator {
316 uchar_t l_drvnm_index;
317 minor_t l_mnum;
318 } mddb_sidelocator_t;
320 typedef struct mddb_mnsidelocator {
321 uchar_t mnl_drvnm_index;
322 minor_t mnl_mnum;
323 uint_t mnl_sideno;
324 } mddb_mnsidelocator_t;
326 typedef struct mddb_drvnm {
327 uchar_t dn_len;
328 char dn_data[MD_MAXDRVNM];
329 } mddb_drvnm_t;
332 * Locator Block Device ID Information
333 * Several device id's may share one disk block in an effort to
334 * conserve used replica space.
336 typedef struct mddb_did_info {
337 uint_t info_flags; /* MDDB Device ID flags */
338 uint_t info_firstblk; /* Device ID Start Block */
339 uint_t info_blkcnt; /* Device ID Block Count */
340 uint_t info_offset; /* Device ID offset w/i Block */
341 uint_t info_length; /* Device ID Length */
342 uint_t info_checksum; /* Device ID Checksum */
343 char info_minor_name[32]; /* Minor name of lb dev */
344 } mddb_did_info_t;
346 typedef struct mddb_did_blk {
347 int blk_magic; /* used for verification */
348 uint_t blk_revision; /* used for verification */
349 int blk_checksum; /* used for verification */
350 uint_t blk_commitcnt; /* matches LB's commitcnt */
351 mddb_did_info_t blk_info[MDDB_NLB];
352 } mddb_did_blk_t;
353 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
354 #pragma pack()
355 #endif
357 #define MDDB_DID_BYTES (roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE))
358 #define MDDB_DID_BLOCKS (btodb(MDDB_DID_BYTES))
361 * Device ID Disk Blocks.
362 * Incore linked list of disk blocks containing device IDs.
363 * The list is built when reading in the mddb_did_blk structure and
364 * when reading in the actual disk blocks containing device ids.
365 * This list is used to easily write out all disk blocks containing
366 * device ids.
368 typedef struct mddb_did_db {
369 uint_t db_firstblk; /* Disk Block's logical addr */
370 uint_t db_blkcnt; /* Contig Disk Block Count */
371 caddr_t db_ptr; /* Ptr to incore Block(s) */
372 struct mddb_did_db *db_next; /* Ptr to next in list */
373 } mddb_did_db_t;
376 * Device ID Free List.
377 * Incore linked list of free space in disk blocks containing device IDs.
378 * Used to manage placement of device IDs in disk blocks.
379 * All disk blocks on free list are also in linked list of disk block
380 * containing device IDs (mddb_did_db_t).
382 typedef struct mddb_did_free {
383 uint_t free_blk; /* Disk Block's logical addr */
384 uint_t free_offset; /* offset of free space */
385 uint_t free_length; /* length of free space */
386 struct mddb_did_free *free_next; /* Ptr to next in list */
387 } mddb_did_free_t;
390 * Device ID Incore Area
391 * Contains pointer to Device ID Disk Block list and
392 * Device ID Free List.
393 * Also contains incore array of pointers to device IDs. Pointers
394 * point into the device ID Disk Block list and are used as a
395 * shortcut to find incore device IDs.
397 typedef struct mddb_did_ic {
398 mddb_did_blk_t *did_ic_blkp;
399 mddb_did_db_t *did_ic_dbp;
400 mddb_did_free_t *did_ic_freep;
401 ddi_devid_t did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */
402 } mddb_did_ic_t;
405 * Locator Block (LB):
406 * - Are fixed size, but the size is different
407 * for local/shared set db replicas.
408 * - All LB's start at logical block 0.
409 * - After a replica quorum is found, there is
410 * only one incore copy of the LB.
411 * - LB's are only written when replicas are added, deleted, or errored.
412 * - LB's provide information about other replica's and their state.
414 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
415 #pragma pack(4)
416 #endif
417 typedef struct mddb_lb {
418 int lb_magic; /* used for verification */
419 uint_t lb_revision; /* used for verification */
420 int lb_checksum; /* used for verification */
421 uint_t lb_commitcnt; /* IMPORTANT */
422 struct timeval32 lb_timestamp; /* informative only */
423 int lb_loccnt; /* used for verification */
424 identifier_t lb_ident; /* used for verification */
425 uint_t lb_flags; /* flags describing LB */
426 uint_t lb_spare[8]; /* Spare/Pad */
427 mddb_block_t lb_didfirstblk; /* Devid Array Start Block */
428 mddb_block_t lb_didblkcnt; /* Devid Array Number Blocks */
429 mddb_block_t lb_dtfirstblk; /* Data Tag Start Block */
430 mddb_block_t lb_dtblkcnt; /* Data Tag Number Block(s) */
431 struct timeval32 lb_inittime; /* creation of database */
432 set_t lb_setno; /* used for verification */
433 mddb_block_t lb_blkcnt; /* used for verification */
434 mddb_block_t lb_lnfirstblk;
435 mddb_block_t lb_lnblkcnt;
436 mddb_block_t lb_dbfirstblk;
437 mddb_drvnm_t lb_drvnm[MDDB_DRVNMCNT];
438 mddb_locator_t lb_locators[MDDB_NLB];
439 /* Don't change array sizes without changing RNDUP_BLKCNT */
440 mddb_sidelocator_t lb_sidelocators[MD_MAXSIDES][MDDB_NLB];
441 } mddb_lb_t;
442 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
443 #pragma pack()
444 #endif
447 * Locator block structure for MN diskset. Same as for traditional
448 * and local diskset except that more sides are supported and the
449 * side number can be any number since the side number is stored
450 * in the lb_mnsidelocators structure instead of being used as an index
451 * into that array. This means that the whole array may need to be
452 * searched in order to find the correct information given a side number.
454 typedef struct mddb_mnlb {
455 int lb_magic; /* used for verification */
456 uint_t lb_revision; /* used for verification */
457 int lb_checksum; /* used for verification */
458 uint_t lb_commitcnt; /* IMPORTANT */
459 struct timeval32 lb_timestamp; /* informative only */
460 int lb_loccnt; /* used for verification */
461 identifier_t lb_ident; /* used for verification */
462 uint_t lb_flags; /* flags describing LB */
463 uint_t lb_spare[8]; /* Spare/Pad */
464 mddb_block_t lb_didfirstblk; /* Devid Array Start Block */
465 mddb_block_t lb_didblkcnt; /* Devid Array Number Blocks */
466 mddb_block_t lb_dtfirstblk; /* Data Tag Start Block */
467 mddb_block_t lb_dtblkcnt; /* Data Tag Number Block(s) */
468 struct timeval32 lb_inittime; /* creation of database */
469 set_t lb_setno; /* used for verification */
470 mddb_block_t lb_blkcnt; /* used for verification */
471 mddb_block_t lb_lnfirstblk;
472 mddb_block_t lb_lnblkcnt;
473 mddb_block_t lb_dbfirstblk;
474 mddb_drvnm_t lb_drvnm[MDDB_DRVNMCNT];
475 mddb_locator_t lb_locators[MDDB_NLB];
476 /* Don't change array sizes without changing MDDB_MNLBCNT */
477 mddb_mnsidelocator_t lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB];
478 } mddb_mnlb_t;
481 #define MDDB_LBCNT RNDUP_BLKCNT(sizeof (mddb_lb_t), 0)
482 #define MDDB_LOCAL_LBCNT RNDUP_BLKCNT(sizeof (mddb_lb_t), \
483 sizeof (mddb_sidelocator_t))
485 #define MDDB_MNLBCNT ((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \
486 / MDDB_BSIZE)
488 typedef struct mddb_map {
489 daddr32_t m_consecutive;
490 daddr32_t m_firstblk;
491 } mddb_map_t;
494 * Master block(s) (MB)
495 * - Are written by userland; Never by the driver!
496 * - Each replica has its own master blocks,
497 * the master block(s) are not shared.
498 * - MB's are not in the logical block address space of the database.
499 * - MB's are a fixed size record (MDDB_BSIZE)
500 * - MB's provide the logical to physical block translation,
501 * for their replica.
503 typedef struct mddb_mb {
504 int mb_magic; /* used for verification */
505 uint_t mb_revision; /* used for verification */
506 uint_t mb_checksum; /* used for verification */
507 #ifdef _LP64
508 uint32_t mb_next; /* incore to next mb */
509 #else
510 struct mddb_mb *mb_next; /* incore to next mb */
511 #endif /* _LP64 */
512 daddr32_t mb_nextblk; /* block # for next mb */
513 md_timeval32_t mb_timestamp; /* timestamp */
514 daddr32_t mb_blkcnt; /* size of blkmap */
515 daddr32_t mb_blkno; /* physical loc. for this MB */
516 set_t mb_setno; /* used for verification */
517 struct timeval32 mb_setcreatetime; /* set creation timestamp */
518 int spares[7];
519 mddb_map_t mb_blkmap; /* logical->physical blk map */
520 int mb_devid_magic; /* verify devid in mb */
521 short mb_devid_len; /* len of following devid */
522 char mb_devid[1]; /* devid byte array */
523 } mddb_mb_t;
526 * In-core version of mddb_mb. It is known that the mddb_mb is 512 bytes on
527 * disk, really, and so this structure is 512 + sizeof(struct mddb_mb_ic *)
529 #define MDDB_IC_BSIZE (MDDB_BSIZE + sizeof (struct mddb_mb_ic *))
530 typedef struct mddb_mb_ic {
531 struct mddb_mb_ic *mbi_next;
532 struct mddb_mb mbi_mddb_mb;
533 } mddb_mb_ic_t;
537 * there can be no address in record block. The checksum must
538 * stay the same where ever the record is in memory. Many
539 * things depend on this. Also the timestamp is the time the
540 * record was committed not the time it was written to a particular
541 * device.
543 * Old definition of mddb_rb, for 32-bit apps and libraries
545 typedef struct mddb_rb {
546 uint_t rb_magic;
547 uint_t rb_revision;
548 uint_t rb_checksum;
549 uint_t rb_checksum_fiddle;
550 uint_t rb_private;
551 void *rb_userdata;
552 uint_t rb_commitcnt;
553 uint_t rb_spare[1];
554 struct timeval32 rb_timestamp;
555 int rb_data[1];
556 } mddb_rb_t;
558 /* This is, and always will be, the on-disk version of mddb_rb */
559 typedef struct mddb_rb32 {
560 uint_t rb_magic;
561 uint_t rb_revision;
562 uint_t rb_checksum;
563 uint_t rb_checksum_fiddle;
564 uint_t rb_private;
565 uint32_t rb_userdata;
566 uint_t rb_commitcnt;
567 uint_t rb_spare[1];
568 struct timeval32 rb_timestamp;
569 int rb_data[1];
570 } mddb_rb32_t;
573 * directory entries
575 typedef struct mddb_optinfo {
576 int o_li;
577 int o_flags;
578 } mddb_optinfo_t;
580 /* Old definition of mddb_de, for 32-bit apps and libraries */
581 typedef struct mddb_de {
582 struct mddb_de *de_next;
583 mddb_rb_t *de_rb;
584 mddb_recid_t de_recid;
585 mddb_type_t de_type1;
586 uint_t de_type2;
587 uint_t de_reqsize;
588 uint_t de_recsize;
589 mddb_block_t de_blkcount;
590 uint_t de_flags;
591 mddb_optinfo_t de_optinfo[2];
592 mddb_block_t de_blks[1];
593 } mddb_de_t;
596 * In core version of mddb_de, includes pointer for mddb_rb32_t user data
597 * mddb_rb32_t is used incore
599 typedef struct mddb_de_ic {
600 void *de_rb_userdata;
601 void *de_rb_userdata_ic;
602 uint_t de_owner_nodeid;
603 struct mddb_de_ic *de_next;
604 mddb_rb32_t *de_rb;
605 mddb_recid_t de_recid;
606 mddb_type_t de_type1;
607 uint_t de_type2;
608 size_t de_reqsize;
609 size_t de_icreqsize;
610 size_t de_recsize;
611 uint_t de_blkcount;
612 uint_t de_flags;
613 mddb_optinfo_t de_optinfo[2];
614 mddb_block_t de_blks[1];
615 } mddb_de_ic_t;
617 typedef struct mddb_db {
618 uint_t db_magic;
619 uint_t db_revision;
620 uint_t db_checksum;
621 mddb_block_t db_blknum;
622 struct mddb_db *db_next;
623 mddb_block_t db_nextblk;
624 struct timeval32 db_timestamp;
625 uint_t db_recsum;
626 #ifdef _KERNEL
627 mddb_de_ic_t *db_firstentry;
628 #else
629 mddb_de_t *db_firstentry;
630 #endif
631 } mddb_db_t;
634 * This is, and always will be, the on-disk version of mddb_de
635 * When mddb_de32 is read in it is converted into mddb_de_ic
637 typedef struct mddb_de32 {
638 uint32_t de32_next;
639 uint32_t de32_rb;
640 mddb_recid_t de32_recid;
641 mddb_type_t de32_type1;
642 uint_t de32_type2;
643 uint_t de32_reqsize;
644 uint_t de32_recsize;
645 mddb_block_t de32_blkcount;
646 uint_t de32_flags;
647 mddb_optinfo_t de32_optinfo[2];
648 mddb_block_t de32_blks[1];
649 } mddb_de32_t;
652 * This is, and always will be, the on-disk version of mddb_db
653 * When mddb_db32 is read in it is converted into mddb_db
654 * To minimize impact on mddb format mddb_db fields remain intact
656 typedef struct mddb_db32 {
657 uint_t db32_magic;
658 uint_t db32_revision;
659 uint_t db32_checksum;
660 mddb_block_t db32_blknum;
661 uint32_t db32_next;
662 mddb_block_t db32_nextblk;
663 struct timeval32 db32_timestamp;
664 uint_t db32_recsum;
665 uint32_t db32_firstentry;
666 } mddb_db32_t;
/*
 * Convert an on-disk directory entry (from, a mddb_de32_t pointer) into
 * its in-core form (to, a mddb_de_ic_t pointer).
 *
 * de_rb_userdata is cleared and de_owner_nodeid is set to
 * MD_MN_INVALID_NID; de_rb_userdata_ic and de_icreqsize are not touched.
 * from->de32_blkcount entries of de32_blks[] are copied, so "to" must have
 * room for that many de_blks[] entries.
 *
 * Both arguments are evaluated several times; pass side-effect-free
 * pointer expressions.  The macro expands to a single statement and must
 * be followed by a semicolon.
 */
#define	de32tode(from, to) \
	do { \
		int de32tode_i; /* private name: cannot capture a caller's i */ \
		(to)->de_rb_userdata = NULL; \
		(to)->de_owner_nodeid = MD_MN_INVALID_NID; \
		(to)->de_next = \
		    (struct mddb_de_ic *)(uintptr_t)(from)->de32_next; \
		(to)->de_rb = (mddb_rb32_t *)(uintptr_t)(from)->de32_rb; \
		(to)->de_recid = (from)->de32_recid; \
		(to)->de_type1 = (from)->de32_type1; \
		(to)->de_type2 = (from)->de32_type2; \
		(to)->de_reqsize = (from)->de32_reqsize; \
		(to)->de_recsize = (from)->de32_recsize; \
		(to)->de_blkcount = (from)->de32_blkcount; \
		(to)->de_flags = (from)->de32_flags; \
		(to)->de_optinfo[0] = (from)->de32_optinfo[0]; \
		(to)->de_optinfo[1] = (from)->de32_optinfo[1]; \
		for (de32tode_i = 0; \
		    de32tode_i < (from)->de32_blkcount; de32tode_i++) \
			(to)->de_blks[de32tode_i] = \
			    (from)->de32_blks[de32tode_i]; \
	} while (0)
/*
 * Convert an in-core directory entry (from) into the on-disk layout
 * (to, a mddb_de32_t pointer).
 *
 * Pointer members de_next and de_rb are narrowed to 32 bits through
 * uintptr_t.  from->de_blkcount entries of de_blks[] are copied, so "to"
 * must have room for that many de32_blks[] entries.
 *
 * Both arguments are evaluated several times; pass side-effect-free
 * pointer expressions.  The macro expands to a single statement and must
 * be followed by a semicolon.
 */
#define	detode32(from, to) \
	do { \
		int detode32_i; /* private name: cannot capture a caller's i */ \
		(to)->de32_next = (uint32_t)(uintptr_t)(from)->de_next; \
		(to)->de32_rb = (uint32_t)(uintptr_t)(from)->de_rb; \
		(to)->de32_recid = (from)->de_recid; \
		(to)->de32_type1 = (from)->de_type1; \
		(to)->de32_type2 = (from)->de_type2; \
		(to)->de32_reqsize = (from)->de_reqsize; \
		(to)->de32_recsize = (from)->de_recsize; \
		(to)->de32_blkcount = (from)->de_blkcount; \
		(to)->de32_flags = (from)->de_flags; \
		(to)->de32_optinfo[0] = (from)->de_optinfo[0]; \
		(to)->de32_optinfo[1] = (from)->de_optinfo[1]; \
		for (detode32_i = 0; \
		    detode32_i < (from)->de_blkcount; detode32_i++) \
			(to)->de32_blks[detode32_i] = \
			    (from)->de_blks[detode32_i]; \
	} while (0)
/*
 * Convert an on-disk directory block header (from, a mddb_db32_t pointer)
 * into the in-core mddb_db_t pointed to by "to".
 *
 * The 32-bit db32_next and db32_firstentry values are widened to pointers
 * through uintptr_t.  The db_firstentry cast uses mddb_de_ic_t, which
 * matches the _KERNEL declaration of that member.
 *
 * The assignments are wrapped in do/while (0) so that the whole conversion
 * is one statement: with the earlier unbraced form, "if (c) db32todb(a, b);"
 * guarded only the first assignment.  Both arguments are evaluated several
 * times; invoke with a trailing semicolon.
 */
#define	db32todb(from, to) \
	do { \
		(to)->db_magic = (from)->db32_magic; \
		(to)->db_revision = (from)->db32_revision; \
		(to)->db_checksum = (from)->db32_checksum; \
		(to)->db_blknum = (from)->db32_blknum; \
		(to)->db_next = \
		    (struct mddb_db *)(uintptr_t)(from)->db32_next; \
		(to)->db_nextblk = (from)->db32_nextblk; \
		(to)->db_timestamp = (from)->db32_timestamp; \
		(to)->db_recsum = (from)->db32_recsum; \
		(to)->db_firstentry = \
		    (mddb_de_ic_t *)(uintptr_t)(from)->db32_firstentry; \
	} while (0)
/*
 * Convert an in-core directory block header (from, a mddb_db_t pointer)
 * into the on-disk mddb_db32_t pointed to by "to".
 *
 * The db_next and db_firstentry pointers are narrowed to 32 bits through
 * uintptr_t.
 *
 * The assignments are wrapped in do/while (0) so that the whole conversion
 * is one statement: with the earlier unbraced form, "if (c) dbtodb32(a, b);"
 * guarded only the first assignment.  Both arguments are evaluated several
 * times; invoke with a trailing semicolon.
 */
#define	dbtodb32(from, to) \
	do { \
		(to)->db32_magic = (from)->db_magic; \
		(to)->db32_revision = (from)->db_revision; \
		(to)->db32_checksum = (from)->db_checksum; \
		(to)->db32_blknum = (from)->db_blknum; \
		(to)->db32_next = (uint32_t)(uintptr_t)(from)->db_next; \
		(to)->db32_nextblk = (from)->db_nextblk; \
		(to)->db32_timestamp = (from)->db_timestamp; \
		(to)->db32_recsum = (from)->db_recsum; \
		(to)->db32_firstentry = \
		    (uint32_t)(uintptr_t)(from)->db_firstentry; \
	} while (0)
729 * information about a replica of the data base
731 typedef struct mddb_ri {
732 struct mddb_ri *ri_next;
733 uint_t ri_flags;
734 uint_t ri_commitcnt;
735 int ri_transplant;
736 md_dev64_t ri_dev;
737 daddr32_t ri_blkno;
738 char ri_driver[16];
739 mddb_mb_ic_t *ri_mbip;
740 mddb_lb_t *ri_lbp;
741 mddb_dt_t *ri_dtp;
742 mddb_did_ic_t *ri_did_icp;
743 ddi_devid_t ri_devid;
744 ddi_devid_t ri_old_devid;
745 char ri_minor_name[MDDB_MINOR_NAME_MAX];
746 char ri_devname[MAXPATHLEN];
747 } mddb_ri_t;
749 typedef struct mddb_bf {
750 struct mddb_bf *bf_next;
751 mddb_locator_t *bf_locator;
752 buf_t bf_buf;
753 } mddb_bf_t;
756 * Information for sets of databases (which include replicas)
758 #define MDDB_BITSRECID 31
759 #define MDDB_SETSHIFT (MDDB_BITSRECID - MD_BITSSET)
760 #define MDDB_SETMASK (MD_SETMASK << MDDB_SETSHIFT)
761 #define MDDB_RECIDMASK ((1 << MDDB_SETSHIFT) - 1)
763 #define DBSET(id) (((id) & MDDB_SETMASK) >> MDDB_SETSHIFT)
764 #define DBID(id) ((id) & MDDB_RECIDMASK)
765 #define MAKERECID(s, i) ((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \
766 ((i) & MDDB_RECIDMASK))
768 #define MDDB_PARSE_LOCBLK 0x00000001
769 #define MDDB_PARSE_LOCNM 0x00000002
770 #define MDDB_PARSE_OPTRECS 0x00000004
771 #define MDDB_PARSE_MASK 0x0000000F
774 #define MDDB_BLOCK_PARSE 0x00000001 /* Block sending parse msgs */
775 #define MDDB_UNBLOCK_PARSE 0x00000002 /* Unblock sending parse msgs */
778 * We need to keep s_ident and s_inittime 32 bit. They are used in mddb_lb
780 typedef struct mddb_set {
781 uint_t s_setno; /* set number */
782 uint_t s_sideno; /* side number */
783 identifier_t s_ident; /* set identifier */
784 char *s_setname; /* set name */
785 mddb_mb_ic_t **s_mbiarray; /* master blocks array */
786 mddb_db_t *s_dbp; /* directory block */
787 mddb_lb_t *s_lbp; /* locator block */
788 /* May be cast to mddb_mnlb_t */
789 /* if accessing sidenames in */
790 /* MN diskset */
791 mddb_ln_t *s_lnp; /* locator names block */
792 /* May be cast to mddb_mnln_t */
793 /* if accessing sidenames in */
794 /* MN diskset */
795 mddb_dtag_lst_t *s_dtlp; /* List of data tags found */
796 mddb_did_ic_t *s_did_icp; /* Device ID incore area */
797 mddb_ri_t *s_rip; /* replicas incore list */
798 int s_freeblkcnt; /* visible for test code */
799 int s_totalblkcnt; /* visible for test code */
800 int s_mn_parseflags; /* mddb parse flags for MNset */
801 int s_mn_parseflags_sending; /* parse flgs sent to slaves */
802 uchar_t *s_freebitmap; /* free blocks bitmap */
803 uint_t s_freebitmapsize; /* size of bitmap */
804 struct timeval32 s_inittime; /* timestamp set created */
805 mddb_recid_t s_zombie; /* zombie record - createrec */
806 int s_staledeletes; /* number of stale deleterec */
807 int s_optcmtcnt; /* Following are opt. record */
808 int s_opthavelck; /* bookkeeping records ... */
809 int s_optwantlck;
810 kcondvar_t s_optwantlck_cv;
811 int s_optwaiterr;
812 int s_opthungerr;
813 kcondvar_t s_opthungerr_cv;
814 int s_opthavequeuinglck;
815 int s_optwantqueuinglck;
816 kcondvar_t s_optqueuing_cv;
817 ulong_t s_bufmisses;
818 mddb_bf_t *s_freebufhead;
819 int s_bufwakeup;
820 kcondvar_t s_buf_cv;
821 size_t s_databuffer_size;
822 void *s_databuffer;
823 int s_singlelockgotten;
824 int s_singlelockwanted;
825 kcondvar_t s_single_thread_cv;
826 md_hi_arr_t s_med;
827 } mddb_set_t;
829 #ifndef MDDB_FAKE
830 #ifdef _KERNEL
831 /* md_mddb.c */
832 extern uint_t mddb_lb_did_convert(mddb_set_t *,
833 uint_t, uint_t *);
834 extern void mddb_locatorblock2splitname(mddb_ln_t *,
835 int, side_t, md_splitname *);
836 extern int mddb_configure(mddb_cfgcmd_t,
837 struct mddb_config *);
838 extern mddb_recid_t mddb_getnextrec(mddb_recid_t,
839 mddb_type_t, uint_t);
840 extern int mddb_getoptloc(mddb_optloc_t *);
841 extern void *mddb_getrecaddr(mddb_recid_t);
842 extern void *mddb_getrecaddr_resize(mddb_recid_t, size_t,
843 off_t);
844 extern int mddb_getrecprivate(mddb_recid_t);
845 extern void mddb_setrecprivate(mddb_recid_t, uint_t);
846 extern mddb_de_ic_t *mddb_getrecdep(mddb_recid_t);
847 extern mddb_type_t mddb_getrectype1(mddb_recid_t);
848 extern int mddb_getrectype2(mddb_recid_t);
849 extern int mddb_getrecsize(mddb_recid_t);
850 extern int mddb_commitrec(mddb_recid_t);
851 extern int mddb_commitrecs(mddb_recid_t *);
852 extern int mddb_deleterec(mddb_recid_t);
853 extern mddb_recstatus_t mddb_getrecstatus(mddb_recid_t);
854 extern mddb_recid_t mddb_createrec(size_t usersize,
855 mddb_type_t type, uint_t type2,
856 md_create_rec_option_t option, set_t setno);
857 extern void mddb_init(void);
858 extern void mddb_unload(void);
859 extern void mddb_unload_set(set_t setno);
860 extern mddb_recid_t mddb_makerecid(set_t setno, mddb_recid_t id);
861 extern set_t mddb_getsetnum(mddb_recid_t id);
862 extern char *mddb_getsetname(set_t setno);
863 extern side_t mddb_getsidenum(set_t setno);
864 extern int mddb_ownset(set_t setno);
865 extern int getmed_ioctl(mddb_med_parm_t *medpp, int mode);
866 extern int setmed_ioctl(mddb_med_parm_t *medpp, int mode);
867 extern int updmed_ioctl(mddb_med_upd_parm_t *medpp,
868 int mode);
869 extern int take_set(mddb_config_t *cp, int mode);
870 extern int release_set(mddb_config_t *cp, int mode);
871 extern int gettag_ioctl(mddb_dtag_get_parm_t *dtgpp,
872 int mode);
873 extern int usetag_ioctl(mddb_dtag_use_parm_t *dtupp,
874 int mode);
875 extern int accept_ioctl(mddb_accept_parm_t *medpp,
876 int mode);
877 extern int md_update_locator_namespace(set_t setno,
878 side_t side, char *dname, char *pname,
879 md_dev64_t devt);
880 extern int mddb_validate_lb(set_t setno, int *rmaxsz);
881 extern int mddb_getinvlb_devid(set_t setno, int count,
882 int size, char **ctdptr);
883 extern int md_update_minor(set_t, side_t, mdkey_t);
884 extern int md_update_nm_rr_did_ioctl(mddb_config_t *cp);
885 extern int md_update_top_device_minor(set_t, side_t,
886 md_dev64_t);
887 #ifdef DEBUG
888 extern void mddb_check(void);
889 #endif /* DEBUG */
890 #endif /* _KERNEL */
892 #else
894 caddr_t mddb_fakeit;
/*
 * MDDB_FAKE stubs: when MDDB_FAKE is defined these macros stand in for the
 * md_mddb.c entry points and evaluate to constant results.
 *
 * The real conversion routine is declared as mddb_lb_did_convert(), so a
 * stub with that name is provided; the earlier md_lb_did_convert spelling
 * is kept for any code that already uses it.
 *
 * NOTE(review): the mddb_createrec() stub takes three arguments while the
 * real prototype takes five, and it truncates the allocated address to 16
 * bits; confirm before relying on the MDDB_FAKE build.
 */
#define	mddb_lb_did_convert(a, b, c)	(0)
#define	md_lb_did_convert(a, b, c)	(0)
#define	mddb_configure(a, b)	(0)
#define	mddb_getnextrec(a, b, c)	((mddb_recid_t)0)
#define	mddb_getrecaddr(a)	(mddb_fakeit)
#define	mddb_getrecprivate(a)	(0)
#define	mddb_setrecprivate(a, b)	(0)
#define	mddb_getrectype1(a)	(0)
#define	mddb_getrectype2(a)	(0)
#define	mddb_getrecsize(a)	(0)
#define	mddb_commitrec(a)	(0)
#define	mddb_commitrecs(a)	(0)
#define	mddb_deleterec(a)	(0)
#define	mddb_getrecstatus(a)	(MDDB_OK)
#define	mddb_createrec(s, a, b)	(0xffff & (int)(mddb_fakeit = \
	(caddr_t)kmem_zalloc(s, KM_SLEEP)))
#define	mddb_unload()	(0)
913 #endif
915 #define MDDB_NOSLEEP 1
916 #define MDDB_SLEEPOK 0
918 #define MDDB_NOOLDOK 0x1
919 #define MDDB_MUSTEXIST 0x2
920 #define MDDB_NOINIT 0x4
921 #define MDDB_MULTINODE 0x8
922 #define MDDB_MN_STALE 0x10 /* MN set is stale */
924 /* Flags passed to selectreplicas - not a bit mask */
925 #define MDDB_SCANALL 1
926 #define MDDB_RETRYSCAN 0
927 #define MDDB_SCANALLSYNC 2 /* During reconfig, sync up incore */
928 /* and ondisk mddb by writing incore */
929 /* values to disk. Don't write */
930 /* change log records. */
932 /* Flags passed to writestart and writecopy */
933 #define MDDB_WRITECOPY_ALL 1 /* Write all incore mddb to disk */
934 #define MDDB_WRITECOPY_SYNC 2 /* Write incore mddb to disk except */
935 /* - change log records */
936 /* - optimized resync records */
939 #define MDDB_PROBE 1
940 #define MDDB_NOPROBE 0
944 * MN diskset definitions used to determine if a slave can write
945 * directly to the mddb. ONLY_MASTER only allows the master node
946 * to write to the mddb. ANY_NODE allows any node to write
947 * to the mddb.
949 #define MDDB_WR_ONLY_MASTER 0
950 #define MDDB_WR_ANY_NODE 1
952 #define MDDB_L_LOCKED 0x0001 /* this record is locked */
953 #define MDDB_L_WANTED 0x0002
955 #ifdef __cplusplus
957 #endif
959 #endif /* _SYS_MD_MDDB_H */