Update.
[glibc.git] / db2 / include / db_page.h
blobe1846cbbbdcee6314621116208ef09b70eb15581
1 /*-
2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
7 * @(#)db_page.h 10.15 (Sleepycat) 5/1/98
8 */
10 #ifndef _DB_PAGE_H_
11 #define _DB_PAGE_H_
/*
 * DB page formats.
 *
 * This implementation requires that values within the following structures
 * NOT be padded -- note, ANSI C permits random padding within structures.
 * If your compiler pads randomly you can just forget ever making DB run on
 * your system.  In addition, no data type can require larger alignment than
 * its own size, e.g., a 4-byte data element may not require 8-byte alignment.
 *
 * Note that key/data lengths are often stored in db_indx_t's -- this is
 * not accidental, nor does it limit the key/data size.  If the key/data
 * item fits on a page, it's guaranteed to be small enough to fit into a
 * db_indx_t, and storing it in one saves space.
 */
/*
 * Well-known page numbers.  PGNO_INVALID deliberately shares the value of
 * the metadata page: page 0 is never a legal reference from another page.
 */
#define PGNO_METADATA 0		/* Metadata page number. */
#define PGNO_INVALID 0		/* Metadata page number, therefore illegal. */
#define PGNO_ROOT 1		/* Root is page #1. */
/*
 * When we create pages in mpool, we ask mpool to clear some number of bytes
 * in the header.  This number must be at least as big as the regular page
 * headers and cover enough of the btree and hash meta-data pages to obliterate
 * the magic and version numbers.
 */
#define DB_PAGE_CLEAR_LEN 32
40 /************************************************************************
41 BTREE METADATA PAGE LAYOUT
42 ************************************************************************/
45 * Btree metadata page layout:
47 * +-----------------------------------+
48 * | lsn | pgno | magic |
49 * +-----------------------------------+
50 * | version | pagesize | free |
51 * +-----------------------------------+
52 * | flags | unused ... |
53 * +-----------------------------------+
55 typedef struct _btmeta {
56 DB_LSN lsn; /* 00-07: LSN. */
57 db_pgno_t pgno; /* 08-11: Current page number. */
58 u_int32_t magic; /* 12-15: Magic number. */
59 u_int32_t version; /* 16-19: Version. */
60 u_int32_t pagesize; /* 20-23: Pagesize. */
61 u_int32_t maxkey; /* 24-27: Btree: Maxkey. */
62 u_int32_t minkey; /* 28-31: Btree: Minkey. */
63 u_int32_t free; /* 32-35: Free list page number. */
64 #define BTM_DUP 0x001 /* Duplicates. */
65 #define BTM_RECNO 0x002 /* Recno tree. */
66 #define BTM_RECNUM 0x004 /* Btree: maintain record count. */
67 #define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */
68 #define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */
69 #define BTM_MASK 0x01f
70 u_int32_t flags; /* 36-39: Flags. */
71 u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */
72 u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */
73 /* 48-67: Unique file ID. */
74 u_int8_t uid[DB_FILE_ID_LEN];
76 u_int32_t spare[13]; /* 68-123: Save some room for growth. */
78 DB_BTREE_LSTAT stat; /* 124-163: Statistics. */
79 } BTMETA;
81 /************************************************************************
82 HASH METADATA PAGE LAYOUT
83 ************************************************************************/
86 * Hash metadata page layout:
88 * +-----------------------------------+
89 * | lsn | magic | version |
90 * +-----------------------------------+
91 * | pagesize | ovfl_point| last_freed|
92 * +-----------------------------------+
93 * | max_bucket| high_mask | low_mask |
94 * +-----------------------------------+
95 * | ffactor | nelem | charkey |
96 * +-----------------------------------+
97 * | spares[32]| flags | unused |
98 * +-----------------------------------+
100 /* Hash Table Information */
101 typedef struct hashhdr { /* Disk resident portion */
102 DB_LSN lsn; /* 00-07: LSN of the header page */
103 db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */
104 u_int32_t magic; /* 12-15: Magic NO for hash tables */
105 u_int32_t version; /* 16-19: Version ID */
106 u_int32_t pagesize; /* 20-23: Bucket/Page Size */
107 u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */
108 u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */
109 u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */
110 u_int32_t high_mask; /* 36-39: Modulo mask into table */
111 u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */
112 u_int32_t ffactor; /* 44-47: Fill factor */
113 u_int32_t nelem; /* 48-51: Number of keys in hash table */
114 u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */
115 #define DB_HASH_DUP 0x01
116 u_int32_t flags; /* 56-59: Allow duplicates. */
117 #define NCACHED 32 /* number of spare points */
118 /* 60-187: Spare pages for overflow */
119 u_int32_t spares[NCACHED];
120 /* 188-207: Unique file ID. */
121 u_int8_t uid[DB_FILE_ID_LEN];
124 * Minimum page size is 256.
126 } HASHHDR;
128 /************************************************************************
129 MAIN PAGE LAYOUT
130 ************************************************************************/
133 * +-----------------------------------+
134 * | lsn | pgno | prev pgno |
135 * +-----------------------------------+
136 * | next pgno | entries | hf offset |
137 * +-----------------------------------+
138 * | level | type | index |
139 * +-----------------------------------+
140 * | index | free --> |
141 * +-----------+-----------------------+
142 * | F R E E A R E A |
143 * +-----------------------------------+
144 * | <-- free | item |
145 * +-----------------------------------+
146 * | item | item | item |
147 * +-----------------------------------+
149 * sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be
150 * two-byte aligned.
152 * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
153 * key for inp[1]'s data. All other types of pages only contain single items.
155 typedef struct _db_page {
156 DB_LSN lsn; /* 00-07: Log sequence number. */
157 db_pgno_t pgno; /* 08-11: Current page number. */
158 db_pgno_t prev_pgno; /* 12-15: Previous page number. */
159 db_pgno_t next_pgno; /* 16-19: Next page number. */
160 db_indx_t entries; /* 20-21: Number of item pairs on the page. */
161 db_indx_t hf_offset; /* 22-23: High free byte page offset. */
164 * The btree levels are numbered from the leaf to the root, starting
165 * with 1, so the leaf is level 1, its parent is level 2, and so on.
166 * We maintain this level on all btree pages, but the only place that
167 * we actually need it is on the root page. It would not be difficult
168 * to hide the byte on the root page once it becomes an internal page,
169 * so we could get this byte back if we needed it for something else.
171 #define LEAFLEVEL 1
172 #define MAXBTREELEVEL 255
173 u_int8_t level; /* 24: Btree tree level. */
175 #define P_INVALID 0 /* Invalid page type. */
176 #define P_DUPLICATE 1 /* Duplicate. */
177 #define P_HASH 2 /* Hash. */
178 #define P_IBTREE 3 /* Btree internal. */
179 #define P_IRECNO 4 /* Recno internal. */
180 #define P_LBTREE 5 /* Btree leaf. */
181 #define P_LRECNO 6 /* Recno leaf. */
182 #define P_OVERFLOW 7 /* Overflow. */
183 u_int8_t type; /* 25: Page type. */
184 db_indx_t inp[1]; /* Variable length index of items. */
185 } PAGE;
/*
 * Element macros.
 *
 * Each takes a pointer to the start of a page (of any pointer type) and
 * yields the named PAGE header field as an lvalue.  The argument is
 * parenthesized before the cast so that an expression such as
 * PGNO(buf + off) casts the whole sum rather than only `buf'.
 */
#define LSN(p) (((PAGE *)(p))->lsn)
#define PGNO(p) (((PAGE *)(p))->pgno)
#define PREV_PGNO(p) (((PAGE *)(p))->prev_pgno)
#define NEXT_PGNO(p) (((PAGE *)(p))->next_pgno)
#define NUM_ENT(p) (((PAGE *)(p))->entries)
#define HOFFSET(p) (((PAGE *)(p))->hf_offset)
#define LEVEL(p) (((PAGE *)(p))->level)
#define TYPE(p) (((PAGE *)(p))->type)
/*
 * !!!
 * The next_pgno and prev_pgno fields are not maintained for btree and recno
 * internal pages.  It's a minor performance improvement, and more, it's
 * hard to do when deleting internal pages, and it decreases the chance of
 * deadlock during deletes and splits.
 *
 * !!!
 * The btree/recno access method needs db_recno_t bytes of space on the root
 * page to specify how many records are stored in the tree.  (The alternative
 * is to store the number of records in the meta-data page, which will create
 * a second hot spot in trees being actively modified, or recalculate it from
 * the BINTERNAL fields on each access.)  Overload the prev_pgno field.
 *
 * RE_NREC(p)		Record count for page p: half the entry count on a
 *			btree leaf, the entry count on a recno leaf, and the
 *			overloaded prev_pgno field on any other page type.
 * RE_NREC_ADJ(p, adj)	Add adj to the count stored in prev_pgno.
 * RE_NREC_SET(p, num)	Store num as the count in prev_pgno.
 *
 * RE_NREC_ADJ and RE_NREC_SET expand to a complete statement INCLUDING the
 * trailing semicolon; that is kept so existing call sites compile unchanged.
 * Consequently they must not be used as the unbraced body of an if that has
 * an else.  All three macros evaluate p more than once or via the element
 * macros, so p should be free of side effects.
 */
#define RE_NREC(p)							\
	(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 :				\
	    TYPE(p) == P_LRECNO ? NUM_ENT(p) : PREV_PGNO(p))
#define RE_NREC_ADJ(p, adj)						\
	PREV_PGNO(p) += (adj);
#define RE_NREC_SET(p, num)						\
	PREV_PGNO(p) = (num);
/*
 * Initialize a page.
 *
 *	pg	Pointer to the start of the page (any pointer type).
 *	pg_size	Page size; stored as the high free byte offset, i.e. the
 *		whole page starts out free.
 *	n	This page's number.
 *	pg_prev	Previous page number.
 *	pg_next	Next page number.
 *	btl	Btree level.
 *	pg_type	Page type (one of the P_* values).
 *
 * The entry count is reset to 0.  The pg argument is evaluated exactly once
 * and may be an arbitrary pointer expression.
 *
 * !!!
 * Don't modify the page's LSN, code depends on it being unchanged after a
 * P_INIT call.
 */
#define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do {	\
	PAGE *p_init_pg_ = (PAGE *)(pg);				\
	PGNO(p_init_pg_) = (n);						\
	PREV_PGNO(p_init_pg_) = (pg_prev);				\
	NEXT_PGNO(p_init_pg_) = (pg_next);				\
	NUM_ENT(p_init_pg_) = 0;					\
	HOFFSET(p_init_pg_) = (pg_size);				\
	LEVEL(p_init_pg_) = (btl);					\
	TYPE(p_init_pg_) = (pg_type);					\
} while (0)
/* Page header length (offset to first index). */
#define P_OVERHEAD (SSZA(PAGE, inp))

/*
 * First free byte: the header plus one db_indx_t per entry currently on
 * the page.
 */
#define LOFFSET(pg) (P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t))

/*
 * Free space on the page: the gap between the end of the index array and
 * the high free byte offset.  The expression involves sizeof, so the
 * subtraction is carried out in an unsigned type.
 */
#define P_FREESPACE(pg) (HOFFSET(pg) - LOFFSET(pg))

/*
 * Get a pointer to the bytes at a specific index: the page address plus the
 * byte offset stored in inp[indx].  pg is parenthesized before each cast so
 * that pointer expressions such as P_ENTRY(buf + off, i) are handled
 * correctly; it is evaluated twice.
 */
#define P_ENTRY(pg, indx) ((u_int8_t *)(pg) + ((PAGE *)(pg))->inp[indx])
/************************************************************************
 OVERFLOW PAGE LAYOUT
 ************************************************************************/

/*
 * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
 * store a page number (the first page of the overflow item) and a length
 * (the total length of the overflow item).  The overflow item consists of
 * some number of overflow pages, linked by the next_pgno field of the page.
 * A next_pgno field of PGNO_INVALID flags the end of the overflow item.
 *
 * Overflow page overloads:
 *	The amount of overflow data stored on each page is stored in the
 *	hf_offset field.
 *
 *	The implementation reference counts overflow items as it's possible
 *	for them to be promoted onto btree internal pages.  The reference
 *	count is stored in the entries field.
 *
 * Both macros yield an lvalue; the page pointer may be any pointer
 * expression.
 */
#define OV_LEN(p) (((PAGE *)(p))->hf_offset)
#define OV_REF(p) (((PAGE *)(p))->entries)

/* Maximum number of bytes that you can put on an overflow page. */
#define P_MAXSPACE(psize) ((psize) - P_OVERHEAD)
/************************************************************************
 HASH PAGE LAYOUT
 ************************************************************************/

/* Each index references a group of bytes on the page. */
#define H_KEYDATA 1		/* Key/data item. */
#define H_DUPLICATE 2		/* Duplicate key/data item. */
#define H_OFFPAGE 3		/* Overflow key/data item. */
#define H_OFFDUP 4		/* Overflow page of duplicates. */

/*
 * !!!
 * Items on hash pages are (potentially) unaligned, so we can never cast the
 * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
 * we do with B+tree on-page structures.  Because we frequently want the type
 * field, it requires no alignment, and it's in the same location in all three
 * structures, there's a pair of macros.
 *
 * HPAGE_PTYPE(p)		Type byte of the item that p points at.
 * HPAGE_TYPE(pg, indx)		Type byte of item indx on page pg.
 */
#define HPAGE_PTYPE(p) (*(u_int8_t *)(p))
#define HPAGE_TYPE(pg, indx) (*P_ENTRY(pg, indx))
/*
 * The first and second types are H_KEYDATA and H_DUPLICATE, represented
 * by the HKEYDATA structure:
 *
 * +-----------------------------------+
 * |    type   |    key/data ...       |
 * +-----------------------------------+
 *
 * For duplicates, the data field encodes duplicate elements in the data
 * field:
 *
 * +---------------------------------------------------------------+
 * |    type   |  len1 | element1 | len1 | len2 | element2 | len2  |
 * +---------------------------------------------------------------+
 *
 * Thus, by keeping track of the offset in the element, we can do both
 * backward and forward traversal.
 */
typedef struct _hkeydata {
	u_int8_t type;		/* 00: Page type. */
	u_int8_t data[1];	/* Variable length key/data item. */
} HKEYDATA;

/*
 * Address of the data bytes of the item that p points at.  Computed with
 * byte arithmetic from the start of the item, so p needs no particular
 * alignment and may be any pointer expression.
 */
#define HKEYDATA_DATA(p) (((u_int8_t *)(p)) + SSZA(HKEYDATA, data))
/*
 * The length of any HKEYDATA item.  Note that indx is an element index,
 * not a PAIR index.
 *
 * An item ends where the previous item starts (inp[indx - 1]), or at the
 * end of the page (pgsize) for item 0, and starts at inp[indx].
 * LEN_HITEM is the whole item including its type byte; LEN_HKEYDATA is the
 * payload only.  pg must be a PAGE pointer (it is not cast); pg and indx
 * are evaluated more than once.
 */
#define LEN_HITEM(pg, pgsize, indx)					\
	(((indx) == 0 ? (pgsize) : (pg)->inp[(indx) - 1]) -		\
	    (pg)->inp[(indx)])

#define LEN_HKEYDATA(pg, psize, indx)					\
	(((indx) == 0 ? (psize) : (pg)->inp[(indx) - 1]) -		\
	    (pg)->inp[(indx)] - HKEYDATA_SIZE(0))

/*
 * Page space required to add a new HKEYDATA item to the page, with and
 * without the index value.
 */
#define HKEYDATA_SIZE(len)						\
	((len) + SSZA(HKEYDATA, data))
#define HKEYDATA_PSIZE(len)						\
	(HKEYDATA_SIZE(len) + sizeof(db_indx_t))
/*
 * Put a HKEYDATA item at the location referenced by a page entry.
 *
 *	pe	Destination address of the item on the page.
 *	kd	Source key/data bytes.
 *	len	Number of bytes to copy from kd.
 *	etype	Item type stored in the leading type byte.
 *
 * The last parameter must NOT be spelled `type': the preprocessor would
 * then also substitute the argument for the `type' member name, and the
 * macro would compile only when the caller's argument is literally the
 * identifier `type'.
 *
 * The expansion stays a brace-enclosed block (not do/while) so existing
 * call sites compile unchanged; pe is evaluated twice.
 */
#define PUT_HKEYDATA(pe, kd, len, etype) {				\
	((HKEYDATA *)(pe))->type = (etype);				\
	memcpy((u_int8_t *)(pe) + sizeof(u_int8_t), (kd), (len));	\
}
/*
 * Macros that describe the page layout in terms of key-data pairs.
 * The use of "pindex" indicates that the argument is the index
 * expressed in pairs instead of individual elements.
 *
 * Pair n occupies element 2n (the key) and element 2n + 1 (the data).
 */
#define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2)
#define H_KEYINDEX(pindx) (2 * (pindx))
#define H_DATAINDEX(pindx) ((2 * (pindx)) + 1)

/* Addresses of the key and data items of a pair. */
#define H_PAIRKEY(pg, pindx) P_ENTRY(pg, H_KEYINDEX(pindx))
#define H_PAIRDATA(pg, pindx) P_ENTRY(pg, H_DATAINDEX(pindx))

/* Total on-page bytes taken by both items of a pair. */
#define H_PAIRSIZE(pg, psize, pindx)					\
	(LEN_HITEM(pg, psize, H_KEYINDEX(pindx)) +			\
	    LEN_HITEM(pg, psize, H_DATAINDEX(pindx)))

/* Payload lengths of the data and key items of a pair. */
#define LEN_HDATA(p, psize, pindx) LEN_HKEYDATA(p, psize, H_DATAINDEX(pindx))
#define LEN_HKEY(p, psize, pindx) LEN_HKEYDATA(p, psize, H_KEYINDEX(pindx))
361 * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
363 * +-----------------------------------+
364 * | type | pgno_t | total len |
365 * +-----------------------------------+
367 typedef struct _hoffpage {
368 u_int8_t type; /* 00: Page type and delete flag. */
369 u_int8_t unused[3]; /* 01-03: Padding, unused. */
370 db_pgno_t pgno; /* 04-07: Offpage page number. */
371 u_int32_t tlen; /* 08-11: Total length of item. */
372 } HOFFPAGE;
374 #define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
375 #define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
378 * Page space required to add a new HOFFPAGE item to the page, with and
379 * without the index value.
381 #define HOFFPAGE_SIZE (sizeof(HOFFPAGE))
382 #define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t))
385 * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
387 * +-----------------------+
388 * | type | pgno_t |
389 * +-----------------------+
391 typedef struct _hoffdup {
392 u_int8_t type; /* 00: Page type and delete flag. */
393 u_int8_t unused[3]; /* 01-03: Padding, unused. */
394 db_pgno_t pgno; /* 04-07: Offpage page number. */
395 } HOFFDUP;
396 #define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno))
399 * Page space required to add a new HOFFDUP item to the page, with and
400 * without the index value.
402 #define HOFFDUP_SIZE (sizeof(HOFFDUP))
403 #define HOFFDUP_PSIZE (HOFFDUP_SIZE + sizeof(db_indx_t))
/************************************************************************
 BTREE PAGE LAYOUT
 ************************************************************************/

/* Each index references a group of bytes on the page. */
#define B_KEYDATA 1		/* Key/data item. */
#define B_DUPLICATE 2		/* Duplicate key/data item. */
#define B_OVERFLOW 3		/* Overflow key/data item. */

/*
 * We have to store a deleted entry flag in the page.  The reason is complex,
 * but the simple version is that we can't delete on-page items referenced by
 * a cursor -- the return order of subsequent insertions might be wrong.  The
 * delete flag is an overload of the top bit of the type byte.
 *
 * B_DCLR(t)	Clear the delete flag in type byte t.
 * B_DSET(t)	Set the delete flag in type byte t.
 * B_DISSET(t)	Non-zero if the delete flag is set in t.
 * B_TYPE(t)	The item type with the delete flag masked off.
 * B_TSET(t, type, deleted)
 *		Store type in t, then set the delete flag if deleted is
 *		non-zero.
 *
 * B_TSET expands to a brace-enclosed block (kept for compatibility with
 * existing call sites), so it cannot be the unbraced body of an if that has
 * an else.  t is evaluated more than once.
 */
#define B_DELETE (0x80)
#define B_DCLR(t) ((t) &= ~B_DELETE)
#define B_DSET(t) ((t) |= B_DELETE)
#define B_DISSET(t) ((t) & B_DELETE)

#define B_TYPE(t) ((t) & ~B_DELETE)
#define B_TSET(t, type, deleted) {					\
	(t) = (type);							\
	if (deleted)							\
		B_DSET(t);						\
}
433 * The first type is B_KEYDATA, represented by the BKEYDATA structure:
435 * +-----------------------------------+
436 * | length | type | key/data |
437 * +-----------------------------------+
439 typedef struct _bkeydata {
440 db_indx_t len; /* 00-01: Key/data item length. */
441 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
442 u_int8_t data[1]; /* Variable length key/data item. */
443 } BKEYDATA;
445 /* Get a BKEYDATA item for a specific index. */
446 #define GET_BKEYDATA(pg, indx) \
447 ((BKEYDATA *)P_ENTRY(pg, indx))
450 * Page space required to add a new BKEYDATA item to the page, with and
451 * without the index value.
453 #define BKEYDATA_SIZE(len) \
454 ALIGN((len) + SSZA(BKEYDATA, data), 4)
455 #define BKEYDATA_PSIZE(len) \
456 (BKEYDATA_SIZE(len) + sizeof(db_indx_t))
459 * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
460 * by the BOVERFLOW structure:
462 * +-----------------------------------+
463 * | total len | type | unused |
464 * +-----------------------------------+
465 * | nxt: page | nxt: off | nxt: len |
466 * +-----------------------------------+
468 typedef struct _boverflow {
469 db_indx_t unused1; /* 00-01: Padding, unused. */
470 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
471 u_int8_t unused2; /* 03: Padding, unused. */
472 db_pgno_t pgno; /* 04-07: Next page number. */
473 u_int32_t tlen; /* 08-11: Total length of item. */
474 } BOVERFLOW;
476 /* Get a BOVERFLOW item for a specific index. */
477 #define GET_BOVERFLOW(pg, indx) \
478 ((BOVERFLOW *)P_ENTRY(pg, indx))
481 * Page space required to add a new BOVERFLOW item to the page, with and
482 * without the index value.
484 #define BOVERFLOW_SIZE \
485 ALIGN(sizeof(BOVERFLOW), 4)
486 #define BOVERFLOW_PSIZE \
487 (BOVERFLOW_SIZE + sizeof(db_indx_t))
/*
 * Btree leaf and hash page layouts group indices in sets of two, one
 * for the key and one for the data.  Everything else does it in sets
 * of one to save space.  I use the following macros so that it's real
 * obvious what's going on...
 */
#define O_INDX 1
#define P_INDX 2
498 /************************************************************************
499 BTREE INTERNAL PAGE LAYOUT
500 ************************************************************************/
503 * Btree internal entry.
505 * +-----------------------------------+
506 * | leaf pgno | type | data ... |
507 * +-----------------------------------+
509 typedef struct _binternal {
510 db_indx_t len; /* 00-01: Key/data item length. */
511 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
512 u_int8_t unused; /* 03: Padding, unused. */
513 db_pgno_t pgno; /* 04-07: Page number of referenced page. */
514 db_recno_t nrecs; /* 08-11: Subtree record count. */
515 u_int8_t data[1]; /* Variable length key item. */
516 } BINTERNAL;
518 /* Get a BINTERNAL item for a specific index. */
519 #define GET_BINTERNAL(pg, indx) \
520 ((BINTERNAL *)P_ENTRY(pg, indx))
523 * Page space required to add a new BINTERNAL item to the page, with and
524 * without the index value.
526 #define BINTERNAL_SIZE(len) \
527 ALIGN((len) + SSZA(BINTERNAL, data), 4)
528 #define BINTERNAL_PSIZE(len) \
529 (BINTERNAL_SIZE(len) + sizeof(db_indx_t))
531 /************************************************************************
532 RECNO INTERNAL PAGE LAYOUT
533 ************************************************************************/
536 * The recno internal entry.
538 * +-----------------------+
539 * | leaf pgno | # of recs |
540 * +-----------------------+
542 * XXX
543 * Why not fold this into the db_indx_t structure, it's fixed length.
545 typedef struct _rinternal {
546 db_pgno_t pgno; /* 00-03: Page number of referenced page. */
547 db_recno_t nrecs; /* 04-07: Subtree record count. */
548 } RINTERNAL;
550 /* Get a RINTERNAL item for a specific index. */
551 #define GET_RINTERNAL(pg, indx) \
552 ((RINTERNAL *)P_ENTRY(pg, indx))
555 * Page space required to add a new RINTERNAL item to the page, with and
556 * without the index value.
558 #define RINTERNAL_SIZE \
559 ALIGN(sizeof(RINTERNAL), 4)
560 #define RINTERNAL_PSIZE \
561 (RINTERNAL_SIZE + sizeof(db_indx_t))
562 #endif /* _DB_PAGE_H_ */