Update.
[glibc.git] / db2 / include / db_page.h
blob30f6072fc34c4ba4783c1e70c73f886f2f561950
1 /*-
2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
7 * @(#)db_page.h 10.13 (Sleepycat) 9/24/97
8 */
10 #ifndef _DB_PAGE_H_
11 #define _DB_PAGE_H_
14 * DB page formats.
16 * This implementation requires that values within the following structures
17 * NOT be padded -- note, ANSI C permits random padding within structures.
18 * If your compiler pads randomly you can just forget ever making DB run on
19 * your system. In addition, no data type can require larger alignment than
20 * its own size, e.g., a 4-byte data element may not require 8-byte alignment.
22 * Note that key/data lengths are often stored in db_indx_t's -- this is
23 * not accidental, nor does it limit the key/data size. If the key/data
24 * item fits on a page, it's guaranteed to be small enough to fit into a
25 * db_indx_t, and storing it in one saves space.
28 #define PGNO_METADATA 0 /* Metadata page number. */
29 #define PGNO_INVALID 0 /* Metadata page number, therefore illegal. */
30 #define PGNO_ROOT 1 /* Root is page #1. */
32 /************************************************************************
33 BTREE METADATA PAGE LAYOUT
34 ************************************************************************/
37 * Btree metadata page layout:
39 * +-----------------------------------+
40 * | lsn | pgno | magic |
41 * +-----------------------------------+
42 * | version | pagesize | free |
43 * +-----------------------------------+
44 * | flags | unused ... |
45 * +-----------------------------------+
47 typedef struct _btmeta {
48 DB_LSN lsn; /* 00-07: LSN. */
49 db_pgno_t pgno; /* 08-11: Current page number. */
50 u_int32_t magic; /* 12-15: Magic number. */
51 u_int32_t version; /* 16-19: Version. */
52 u_int32_t pagesize; /* 20-23: Pagesize. */
53 u_int32_t maxkey; /* 24-27: Btree: Maxkey. */
54 u_int32_t minkey; /* 28-31: Btree: Minkey. */
55 u_int32_t free; /* 32-35: Free list page number. */
56 #define BTM_DUP 0x001 /* Duplicates. */
57 #define BTM_RECNO 0x002 /* Recno tree. */
58 #define BTM_RECNUM 0x004 /* Btree: maintain record count. */
59 #define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */
60 #define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */
61 #define BTM_MASK 0x01f
62 u_int32_t flags; /* 36-39: Flags. */
63 u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */
64 u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */
65 /* 48-67: Unique file ID. */
66 u_int8_t uid[DB_FILE_ID_LEN];
68 u_int32_t spare[13]; /* 68-123: Save some room for growth. */
70 DB_BTREE_LSTAT stat; /* 124-163: Statistics. */
71 } BTMETA;
73 /************************************************************************
74 HASH METADATA PAGE LAYOUT
75 ************************************************************************/
78 * Hash metadata page layout:
80 * +-----------------------------------+
81 * | lsn | magic | version |
82 * +-----------------------------------+
83 * | pagesize | ovfl_point| last_freed|
84 * +-----------------------------------+
85 * | max_bucket| high_mask | low_mask |
86 * +-----------------------------------+
87 * | ffactor | nelem | charkey |
88 * +-----------------------------------+
89 * | spares[32]| flags | unused |
90 * +-----------------------------------+
92 /* Hash Table Information */
93 typedef struct hashhdr { /* Disk resident portion */
94 DB_LSN lsn; /* 00-07: LSN of the header page */
95 db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */
96 u_int32_t magic; /* 12-15: Magic NO for hash tables */
97 u_int32_t version; /* 16-19: Version ID */
98 u_int32_t pagesize; /* 20-23: Bucket/Page Size */
99 u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */
100 u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */
101 u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */
102 u_int32_t high_mask; /* 36-39: Modulo mask into table */
103 u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */
104 u_int32_t ffactor; /* 44-47: Fill factor */
105 u_int32_t nelem; /* 48-51: Number of keys in hash table */
106 u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */
107 #define DB_HASH_DUP 0x01
108 u_int32_t flags; /* 56-59: Allow duplicates. */
109 #define NCACHED 32 /* number of spare points */
110 /* 60-187: Spare pages for overflow */
111 u_int32_t spares[NCACHED];
112 /* 188-207: Unique file ID. */
113 u_int8_t uid[DB_FILE_ID_LEN];
116 * Minimum page size is 256.
118 } HASHHDR;
120 /************************************************************************
121 MAIN PAGE LAYOUT
122 ************************************************************************/
125 * +-----------------------------------+
126 * | lsn | pgno | prev pgno |
127 * +-----------------------------------+
128 * | next pgno | entries | hf offset |
129 * +-----------------------------------+
130 * | level | type | index |
131 * +-----------------------------------+
132 * | index | free --> |
133 * +-----------+-----------------------+
134 * | F R E E A R E A |
135 * +-----------------------------------+
136 * | <-- free | item |
137 * +-----------------------------------+
138 * | item | item | item |
139 * +-----------------------------------+
141 * sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be
142 * two-byte aligned.
144 * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
145 * key for inp[1]'s data. All other types of pages only contain single items.
147 typedef struct _db_page {
148 DB_LSN lsn; /* 00-07: Log sequence number. */
149 db_pgno_t pgno; /* 08-11: Current page number. */
150 db_pgno_t prev_pgno; /* 12-15: Previous page number. */
151 db_pgno_t next_pgno; /* 16-19: Next page number. */
152 db_indx_t entries; /* 20-21: Number of item pairs on the page. */
153 db_indx_t hf_offset; /* 22-23: High free byte page offset. */
156 * The btree levels are numbered from the leaf to the root, starting
157 * with 1, so the leaf is level 1, its parent is level 2, and so on.
158 * We maintain this level on all btree pages, but the only place that
159 * we actually need it is on the root page. It would not be difficult
160 * to hide the byte on the root page once it becomes an internal page,
161 * so we could get this byte back if we needed it for something else.
163 #define LEAFLEVEL 1
164 #define MAXBTREELEVEL 255
165 u_int8_t level; /* 24: Btree tree level. */
167 #define P_INVALID 0 /* Invalid page type. */
168 #define P_DUPLICATE 1 /* Duplicate. */
169 #define P_HASH 2 /* Hash. */
170 #define P_IBTREE 3 /* Btree internal. */
171 #define P_IRECNO 4 /* Recno internal. */
172 #define P_LBTREE 5 /* Btree leaf. */
173 #define P_LRECNO 6 /* Recno leaf. */
174 #define P_OVERFLOW 7 /* Overflow. */
175 u_int8_t type; /* 25: Page type. */
176 db_indx_t inp[1]; /* Variable length index of items. */
177 } PAGE;
179 /* Element macros. */
180 #define LSN(p) (((PAGE *)p)->lsn)
181 #define PGNO(p) (((PAGE *)p)->pgno)
182 #define PREV_PGNO(p) (((PAGE *)p)->prev_pgno)
183 #define NEXT_PGNO(p) (((PAGE *)p)->next_pgno)
184 #define NUM_ENT(p) (((PAGE *)p)->entries)
185 #define HOFFSET(p) (((PAGE *)p)->hf_offset)
186 #define LEVEL(p) (((PAGE *)p)->level)
187 #define TYPE(p) (((PAGE *)p)->type)
190 * !!!
191 * The next_pgno and prev_pgno fields are not maintained for btree and recno
192 * internal pages. It's a minor performance improvement, and more, it's
193 * hard to do when deleting internal pages, and it decreases the chance of
194 * deadlock during deletes and splits.
196 * !!!
197 * The btree/recno access method needs db_recno_t bytes of space on the root
198 * page to specify how many records are stored in the tree. (The alternative
199 * is to store the number of records in the meta-data page, which will create
200 * a second hot spot in trees being actively modified, or recalculate it from
201 * the BINTERNAL fields on each access.) Overload the prev_pgno field.
203 #define RE_NREC(p) \
204 (TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : \
205 TYPE(p) == P_LRECNO ? NUM_ENT(p) : PREV_PGNO(p))
206 #define RE_NREC_ADJ(p, adj) \
207 PREV_PGNO(p) += adj;
208 #define RE_NREC_SET(p, num) \
209 PREV_PGNO(p) = num;
212 * Initialize a page.
214 * !!!
215 * Don't modify the page's LSN, code depends on it being unchanged after a
216 * P_INIT call.
218 #define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \
219 PGNO(pg) = n; \
220 PREV_PGNO(pg) = pg_prev; \
221 NEXT_PGNO(pg) = pg_next; \
222 NUM_ENT(pg) = 0; \
223 HOFFSET(pg) = pg_size; \
224 LEVEL(pg) = btl; \
225 TYPE(pg) = pg_type; \
226 } while (0)
228 /* Page header length (offset to first index). */
229 #define P_OVERHEAD (SSZA(PAGE, inp))
231 /* First free byte. */
232 #define LOFFSET(pg) (P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t))
234 /* Free space on the page. */
235 #define P_FREESPACE(pg) (HOFFSET(pg) - LOFFSET(pg))
237 /* Get a pointer to the bytes at a specific index. */
238 #define P_ENTRY(pg, indx) ((u_int8_t *)pg + ((PAGE *)pg)->inp[indx])
240 /************************************************************************
241 OVERFLOW PAGE LAYOUT
242 ************************************************************************/
245 * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
246 * store a page number (the first page of the overflow item) and a length
247 * (the total length of the overflow item). The overflow item consists of
248 * some number of overflow pages, linked by the next_pgno field of the page.
249 * A next_pgno field of PGNO_INVALID flags the end of the overflow item.
251 * Overflow page overloads:
252 * The amount of overflow data stored on each page is stored in the
253 * hf_offset field.
255 * The implementation reference counts overflow items as it's possible
256 * for them to be promoted onto btree internal pages. The reference
257 * count is stored in the entries field.
259 #define OV_LEN(p) (((PAGE *)p)->hf_offset)
260 #define OV_REF(p) (((PAGE *)p)->entries)
262 /* Maximum number of bytes that you can put on an overflow page. */
263 #define P_MAXSPACE(psize) ((psize) - P_OVERHEAD)
265 /************************************************************************
266 HASH PAGE LAYOUT
267 ************************************************************************/
269 /* Each index references a group of bytes on the page. */
270 #define H_KEYDATA 1 /* Key/data item. */
271 #define H_DUPLICATE 2 /* Duplicate key/data item. */
272 #define H_OFFPAGE 3 /* Overflow key/data item. */
273 #define H_OFFDUP 4 /* Overflow page of duplicates. */
276 * !!!
277 * Items on hash pages are (potentially) unaligned, so we can never cast the
278 * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
279 * we do with B+tree on-page structures. Because we frequently want the type
280 * field, it requires no alignment, and it's in the same location in all three
281 * structures, there's a pair of macros.
283 #define HPAGE_PTYPE(p) (*(u_int8_t *)p)
284 #define HPAGE_TYPE(pg, indx) (*P_ENTRY(pg, indx))
287 * The first and second types are H_KEYDATA and H_DUPLICATE, represented
288 * by the HKEYDATA structure:
290 * +-----------------------------------+
291 * | type | key/data ... |
292 * +-----------------------------------+
294 * For duplicates, the data field encodes duplicate elements in the data
295 * field:
297 * +---------------------------------------------------------------+
298 * | type | len1 | element1 | len1 | len2 | element2 | len2 |
299 * +---------------------------------------------------------------+
301 * Thus, by keeping track of the offset in the element, we can do both
302 * backward and forward traversal.
304 typedef struct _hkeydata {
305 u_int8_t type; /* 00: Page type. */
306 u_int8_t data[1]; /* Variable length key/data item. */
307 } HKEYDATA;
308 #define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data))
311 * The length of any HKEYDATA item. Note that indx is an element index,
312 * not a PAIR index.
314 #define LEN_HITEM(pg, pgsize, indx) \
315 (((indx) == 0 ? pgsize : pg->inp[indx - 1]) - pg->inp[indx])
317 #define LEN_HKEYDATA(pg, psize, indx) \
318 (((indx) == 0 ? psize : pg->inp[indx - 1]) - \
319 pg->inp[indx] - HKEYDATA_SIZE(0))
322 * Page space required to add a new HKEYDATA item to the page, with and
323 * without the index value.
325 #define HKEYDATA_SIZE(len) \
326 ((len) + SSZA(HKEYDATA, data))
327 #define HKEYDATA_PSIZE(len) \
328 (HKEYDATA_SIZE(len) + sizeof(db_indx_t))
330 /* Put a HKEYDATA item at the location referenced by a page entry. */
331 #define PUT_HKEYDATA(pe, kd, len, type) { \
332 ((HKEYDATA *)pe)->type = type; \
333 memcpy((u_int8_t *)pe + sizeof(u_int8_t), kd, len); \
337 * Macros the describe the page layout in terms of key-data pairs.
338 * The use of "pindex" indicates that the argument is the index
339 * expressed in pairs instead of individual elements.
341 #define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2)
342 #define H_KEYINDEX(pindx) (2 * (pindx))
343 #define H_DATAINDEX(pindx) ((2 * (pindx)) + 1)
344 #define H_PAIRKEY(pg, pindx) P_ENTRY(pg, H_KEYINDEX(pindx))
345 #define H_PAIRDATA(pg, pindx) P_ENTRY(pg, H_DATAINDEX(pindx))
346 #define H_PAIRSIZE(pg, psize, pindx) \
347 (LEN_HITEM(pg, psize, H_KEYINDEX(pindx)) + \
348 LEN_HITEM(pg, psize, H_DATAINDEX(pindx)))
349 #define LEN_HDATA(p, psize, pindx) LEN_HKEYDATA(p, psize, H_DATAINDEX(pindx))
350 #define LEN_HKEY(p, psize, pindx) LEN_HKEYDATA(p, psize, H_KEYINDEX(pindx))
353 * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
355 * +-----------------------------------+
356 * | type | pgno_t | total len |
357 * +-----------------------------------+
359 typedef struct _hoffpage {
360 u_int8_t type; /* 00: Page type and delete flag. */
361 u_int8_t unused[3]; /* 01-03: Padding, unused. */
362 db_pgno_t pgno; /* 04-07: Offpage page number. */
363 u_int32_t tlen; /* 08-11: Total length of item. */
364 } HOFFPAGE;
366 #define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
367 #define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
370 * Page space required to add a new HOFFPAGE item to the page, with and
371 * without the index value.
373 #define HOFFPAGE_SIZE (sizeof(HOFFPAGE))
374 #define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t))
377 * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
379 * +-----------------------+
380 * | type | pgno_t |
381 * +-----------------------+
383 typedef struct _hoffdup {
384 u_int8_t type; /* 00: Page type and delete flag. */
385 u_int8_t unused[3]; /* 01-03: Padding, unused. */
386 db_pgno_t pgno; /* 04-07: Offpage page number. */
387 } HOFFDUP;
388 #define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno))
391 * Page space required to add a new HOFFDUP item to the page, with and
392 * without the index value.
394 #define HOFFDUP_SIZE (sizeof(HOFFDUP))
395 #define HOFFDUP_PSIZE (HOFFDUP_SIZE + sizeof(db_indx_t))
397 /************************************************************************
398 BTREE PAGE LAYOUT
399 ************************************************************************/
401 /* Each index references a group of bytes on the page. */
402 #define B_KEYDATA 1 /* Key/data item. */
403 #define B_DUPLICATE 2 /* Duplicate key/data item. */
404 #define B_OVERFLOW 3 /* Overflow key/data item. */
407 * We have to store a deleted entry flag in the page. The reason is complex,
408 * but the simple version is that we can't delete on-page items referenced by
409 * a cursor -- the return order of subsequent insertions might be wrong. The
410 * delete flag is an overload of the top bit of the type byte.
412 #define B_DELETE (0x80)
413 #define B_DCLR(t) (t) &= ~B_DELETE
414 #define B_DSET(t) (t) |= B_DELETE
415 #define B_DISSET(t) ((t) & B_DELETE)
417 #define B_TYPE(t) ((t) & ~B_DELETE)
418 #define B_TSET(t, type, deleted) { \
419 (t) = (type); \
420 if (deleted) \
421 B_DSET(t); \
425 * The first type is B_KEYDATA, represented by the BKEYDATA structure:
427 * +-----------------------------------+
428 * | length | type | key/data |
429 * +-----------------------------------+
431 typedef struct _bkeydata {
432 db_indx_t len; /* 00-01: Key/data item length. */
433 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
434 u_int8_t data[1]; /* Variable length key/data item. */
435 } BKEYDATA;
437 /* Get a BKEYDATA item for a specific index. */
438 #define GET_BKEYDATA(pg, indx) \
439 ((BKEYDATA *)P_ENTRY(pg, indx))
442 * Page space required to add a new BKEYDATA item to the page, with and
443 * without the index value.
445 #define BKEYDATA_SIZE(len) \
446 ALIGN((len) + SSZA(BKEYDATA, data), 4)
447 #define BKEYDATA_PSIZE(len) \
448 (BKEYDATA_SIZE(len) + sizeof(db_indx_t))
451 * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
452 * by the BOVERFLOW structure:
454 * +-----------------------------------+
455 * | total len | type | unused |
456 * +-----------------------------------+
457 * | nxt: page | nxt: off | nxt: len |
458 * +-----------------------------------+
460 typedef struct _boverflow {
461 db_indx_t unused1; /* 00-01: Padding, unused. */
462 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
463 u_int8_t unused2; /* 03: Padding, unused. */
464 db_pgno_t pgno; /* 04-07: Next page number. */
465 u_int32_t tlen; /* 08-11: Total length of item. */
466 } BOVERFLOW;
468 /* Get a BOVERFLOW item for a specific index. */
469 #define GET_BOVERFLOW(pg, indx) \
470 ((BOVERFLOW *)P_ENTRY(pg, indx))
473 * Page space required to add a new BOVERFLOW item to the page, with and
474 * without the index value.
476 #define BOVERFLOW_SIZE \
477 ALIGN(sizeof(BOVERFLOW), 4)
478 #define BOVERFLOW_PSIZE \
479 (BOVERFLOW_SIZE + sizeof(db_indx_t))
482 * Btree leaf and hash page layouts group indices in sets of two, one
483 * for the key and one for the data. Everything else does it in sets
484 * of one to save space. I use the following macros so that it's real
485 * obvious what's going on...
487 #define O_INDX 1
488 #define P_INDX 2
490 /************************************************************************
491 BTREE INTERNAL PAGE LAYOUT
492 ************************************************************************/
495 * Btree internal entry.
497 * +-----------------------------------+
498 * | leaf pgno | type | data ... |
499 * +-----------------------------------+
501 typedef struct _binternal {
502 db_indx_t len; /* 00-01: Key/data item length. */
503 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
504 u_int8_t unused; /* 03: Padding, unused. */
505 db_pgno_t pgno; /* 04-07: Page number of referenced page. */
506 db_recno_t nrecs; /* 08-11: Subtree record count. */
507 u_int8_t data[1]; /* Variable length key item. */
508 } BINTERNAL;
510 /* Get a BINTERNAL item for a specific index. */
511 #define GET_BINTERNAL(pg, indx) \
512 ((BINTERNAL *)P_ENTRY(pg, indx))
515 * Page space required to add a new BINTERNAL item to the page, with and
516 * without the index value.
518 #define BINTERNAL_SIZE(len) \
519 ALIGN((len) + SSZA(BINTERNAL, data), 4)
520 #define BINTERNAL_PSIZE(len) \
521 (BINTERNAL_SIZE(len) + sizeof(db_indx_t))
523 /************************************************************************
524 RECNO INTERNAL PAGE LAYOUT
525 ************************************************************************/
528 * The recno internal entry.
530 * +-----------------------+
531 * | leaf pgno | # of recs |
532 * +-----------------------+
534 * XXX
535 * Why not fold this into the db_indx_t structure, it's fixed length.
537 typedef struct _rinternal {
538 db_pgno_t pgno; /* 00-03: Page number of referenced page. */
539 db_recno_t nrecs; /* 04-07: Subtree record count. */
540 } RINTERNAL;
542 /* Get a RINTERNAL item for a specific index. */
543 #define GET_RINTERNAL(pg, indx) \
544 ((RINTERNAL *)P_ENTRY(pg, indx))
547 * Page space required to add a new RINTERNAL item to the page, with and
548 * without the index value.
550 #define RINTERNAL_SIZE \
551 ALIGN(sizeof(RINTERNAL), 4)
552 #define RINTERNAL_PSIZE \
553 (RINTERNAL_SIZE + sizeof(db_indx_t))
554 #endif /* _DB_PAGE_H_ */