/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 *
 *	@(#)mp.h	10.33 (Sleepycat) 5/4/98
 */
struct __bh;		typedef struct __bh BH;
struct __db_mpreg;	typedef struct __db_mpreg DB_MPREG;
struct __mpool;		typedef struct __mpool MPOOL;
struct __mpoolfile;	typedef struct __mpoolfile MPOOLFILE;
					/* Default mpool name. */
#define	DB_DEFAULT_MPOOL_FILE	"__db_mpool.share"
/*
 * We default to 128K (16 8K pages) if the user doesn't specify, and
 * require a minimum of 20K.
 */
#ifndef	DB_CACHESIZE_DEF
#define	DB_CACHESIZE_DEF	(128 * 1024)
#endif
#define	DB_CACHESIZE_MIN	( 20 * 1024)
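/*
 * Illustrative sketch (not part of the original header): how a caller
 * might apply these limits when sizing the pool; "cachesize" is an
 * assumed variable.
 *
 *	if (cachesize == 0)			// Nothing specified --
 *		cachesize = DB_CACHESIZE_DEF;	// use the 128K default.
 *	if (cachesize < DB_CACHESIZE_MIN)	// Enforce the 20K minimum.
 *		cachesize = DB_CACHESIZE_MIN;
 */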
#define	INVALID		0	/* Invalid shared memory offset. */
/*
 * There are three ways we do locking in the mpool code:
 *
 * Locking a handle mutex to provide concurrency for DB_THREAD operations.
 * Locking the region mutex to provide mutual exclusion while reading and
 *    writing structures in the shared region.
 * Locking buffer header mutexes during I/O.
 *
 * The first will not be further described here.  We use the shared mpool
 * region lock to provide mutual exclusion while reading/modifying all of
 * the data structures, including the buffer headers.  We use a per-buffer
 * header lock to wait on buffer I/O.  The order of locking is as follows:
 *
 * Searching for a buffer:
 *	Acquire the region lock.
 *	Find the buffer header.
 *	Increment the reference count (guarantee the buffer stays).
 *	While the BH_LOCKED flag is set (I/O is going on) {
 *	    Release the region lock.
 *		Explicitly yield the processor if it's not the first pass
 *		through this loop; otherwise we can simply spin, because
 *		we would just be switching between the two locks.
 *	    Request the buffer lock.
 *	    The I/O will complete...
 *	    Acquire the buffer lock.
 *	    Release the buffer lock.
 *	    Acquire the region lock.
 *	}
 *	Return the buffer.
 *
 * Reading/writing a buffer:
 *	Acquire the region lock.
 *	Find/create the buffer header.
 *	If reading, increment the reference count (guarantee the buffer stays).
 *	Set the BH_LOCKED flag.
 *	Acquire the buffer lock (guaranteed not to block).
 *	Release the region lock.
 *	Do the I/O and/or initialize the buffer contents.
 *	Release the buffer lock.
 *	    At this point, the buffer lock is available, but the logical
 *	    operation (flagged by BH_LOCKED) is not yet completed.  For
 *	    this reason, among others, threads checking the BH_LOCKED flag
 *	    must loop around their test.
 *	Acquire the region lock.
 *	Clear the BH_LOCKED flag.
 *	Release the region lock.
 *	Return/discard the buffer.
 *
 * A sketch of the buffer-search loop, written in terms of the lock macros
 * defined below, follows this comment.
 *
 * Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are not
 * reacquired when a region lock is reacquired because they couldn't have been
 * closed/discarded and because they never move in memory.
 */
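/*
 * Illustrative sketch (not part of the original header) of the
 * buffer-search protocol above, using the lock macros defined below.
 * The lookup helper __memp_bh_search and the processor-yield call
 * __db_yield are stand-in names for this example only; dbmp, mfp, pgno,
 * bhp and first are assumed locals of the caller.
 *
 *	LOCKREGION(dbmp);
 *	bhp = __memp_bh_search(dbmp, mfp, pgno);    // Find the header.
 *	++bhp->ref;                                 // Pin the buffer.
 *	for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
 *		UNLOCKREGION(dbmp);
 *		if (!first)                         // Yield after the
 *			__db_yield();               // first pass.
 *		LOCKBUFFER(dbmp, bhp);              // Wait for the I/O...
 *		UNLOCKBUFFER(dbmp, bhp);            // ...then give it back.
 *		LOCKREGION(dbmp);                   // Re-check the flag.
 *	}
 *	// The region lock is still held; the caller releases it once it
 *	// is done with the shared structures.
 *	return (bhp);
 */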
#define	LOCKINIT(dbmp, mutexp)						\
	if (F_ISSET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION))		\
		(void)__db_mutex_init(mutexp,				\
		    MUTEX_LOCK_OFFSET((dbmp)->reginfo.addr, mutexp))
#define	LOCKHANDLE(dbmp, mutexp)					\
	if (F_ISSET(dbmp, MP_LOCKHANDLE))				\
		(void)__db_mutex_lock(mutexp, (dbmp)->reginfo.fd)
#define	UNLOCKHANDLE(dbmp, mutexp)					\
	if (F_ISSET(dbmp, MP_LOCKHANDLE))				\
		(void)__db_mutex_unlock(mutexp, (dbmp)->reginfo.fd)
#define	LOCKREGION(dbmp)						\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_lock(&((RLAYOUT *)(dbmp)->mp)->lock,	\
		    (dbmp)->reginfo.fd)
#define	UNLOCKREGION(dbmp)						\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_unlock(&((RLAYOUT *)(dbmp)->mp)->lock,\
		    (dbmp)->reginfo.fd)
#define	LOCKBUFFER(dbmp, bhp)						\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_lock(&(bhp)->mutex, (dbmp)->reginfo.fd)
#define	UNLOCKBUFFER(dbmp, bhp)						\
	if (F_ISSET(dbmp, MP_LOCKREGION))				\
		(void)__db_mutex_unlock(&(bhp)->mutex, (dbmp)->reginfo.fd)
/*
 * DB_MPOOL --
 *	Per-process memory pool structure.
 */
struct __db_mpool {
/* These fields need to be protected for multi-threaded support. */
	db_mutex_t	*mutexp;	/* Structure lock. */

					/* List of pgin/pgout routines. */
	LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;

					/* List of DB_MPOOLFILE's. */
	TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;

/* These fields are not protected. */
	DB_ENV	   *dbenv;		/* Reference to error information. */
	REGINFO	    reginfo;		/* Region information. */

	MPOOL	   *mp;			/* Address of the shared MPOOL. */

	void	   *addr;		/* Address of shalloc() region. */

	DB_HASHTAB *htab;		/* Hash table of bucket headers. */

#define	MP_LOCKHANDLE	0x01		/* Threaded, lock handles and region. */
#define	MP_LOCKREGION	0x02		/* Concurrent access, lock region. */
	u_int32_t  flags;
};
/*
 * DB_MPREG --
 *	DB_MPOOL registry of pgin/pgout functions.
 */
struct __db_mpreg {
	LIST_ENTRY(__db_mpreg) q;	/* Linked list. */

	int ftype;			/* File type. */
					/* Pgin, pgout routines (an example
					 * follows this structure). */
	int (DB_CALLBACK *pgin) __P((db_pgno_t, void *, DBT *));
	int (DB_CALLBACK *pgout) __P((db_pgno_t, void *, DBT *));
};
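/*
 * Illustrative sketch (not part of the original header): the shape of a
 * pgin routine matching the prototype above.  The conversion helper named
 * here is hypothetical; a real callback converts the page at pgaddr from
 * its on-disk form, using the opaque per-file cookie, and returns 0 on
 * success or an errno value on failure.
 *
 *	static int
 *	example_pgin(pgno, pgaddr, pgcookie)
 *		db_pgno_t pgno;
 *		void *pgaddr;
 *		DBT *pgcookie;
 *	{
 *		return (example_convert_page(pgno, pgaddr, pgcookie->data));
 *	}
 */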
/*
 * DB_MPOOLFILE --
 *	Per-process DB_MPOOLFILE information.
 */
struct __db_mpoolfile {
/* These fields need to be protected for multi-threaded support. */
	db_mutex_t	*mutexp;	/* Structure lock. */

	int	   fd;			/* Underlying file descriptor. */

	u_int32_t  pinref;		/* Pinned block reference count. */

/* These fields are not protected. */
	TAILQ_ENTRY(__db_mpoolfile) q;	/* Linked list of DB_MPOOLFILE's. */

	DB_MPOOL  *dbmp;		/* Overlying DB_MPOOL. */
	MPOOLFILE *mfp;			/* Underlying MPOOLFILE. */

	void	  *addr;		/* Address of mmap'd region. */
	size_t	   len;			/* Length of mmap'd region. */

/* These fields need to be protected for multi-threaded support. */
#define	MP_READONLY	0x01		/* File is readonly. */
#define	MP_UPGRADE	0x02		/* File descriptor is readwrite. */
#define	MP_UPGRADE_FAIL	0x04		/* Upgrade wasn't possible. */
	u_int32_t  flags;
};
/*
 * MPOOL --
 *	Shared memory pool region.  One of these is allocated in shared
 *	memory, and describes the pool.
 */
struct __mpool {
	RLAYOUT	    rlayout;		/* General region information. */

	SH_TAILQ_HEAD(__bhq) bhq;	/* LRU list of buckets. */
	SH_TAILQ_HEAD(__bhfq) bhfq;	/* Free buckets. */
	SH_TAILQ_HEAD(__mpfq) mpfq;	/* List of MPOOLFILEs. */

	/*
	 * We make the assumption that the early pages of the file are far
	 * more likely to be retrieved than the later pages, which means
	 * that the top bits are more interesting for hashing, since they're
	 * less likely to collide.  On the other hand, since 512 4K pages
	 * represent a 2MB file, only the bottom 9 bits of the page number
	 * are likely to be set.  We XOR in the offset in the MPOOL of the
	 * MPOOLFILE that backs this particular page, since that should also
	 * be unique for the page.  (A worked example follows the macro.)
	 */
#define	BUCKET(mp, mf_offset, pgno)					\
	(((pgno) ^ ((mf_offset) << 9)) % (mp)->htab_buckets)
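	/*
	 * Illustrative arithmetic (not part of the original header),
	 * assuming a pool with 37 hash buckets, an MPOOLFILE stored at
	 * offset 0x50 in the region, and page number 6:
	 *
	 *	BUCKET(mp, 0x50, 6) == (6 ^ (0x50 << 9)) % 37
	 *			    == (6 ^ 0xa000) % 37
	 *			    == 40966 % 37
	 *			    == 7
	 *
	 * so the buffer header for that page hashes to bucket 7.
	 */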
	size_t	    htab;		/* Hash table offset. */
	size_t	    htab_buckets;	/* Number of hash table entries. */

	DB_LSN	    lsn;		/* Maximum checkpoint LSN. */
	u_int32_t   lsn_cnt;		/* Checkpoint buffers left to write. */

	DB_MPOOL_STAT stat;		/* Global mpool statistics. */

#define	MP_LSN_RETRY	0x01		/* Retry all BH_WRITE buffers. */
	u_int32_t  flags;
};
/*
 * MPOOLFILE --
 *	Shared DB_MPOOLFILE information.
 */
struct __mpoolfile {
	SH_TAILQ_ENTRY q;		/* List of MPOOLFILEs. */

	u_int32_t ref;			/* Reference count. */

	int	  ftype;		/* File type. */

	int32_t	  lsn_off;		/* Page's LSN offset. */
	u_int32_t clear_len;		/* Bytes to clear on page create. */

	size_t	  path_off;		/* File name location. */
	size_t	  fileid_off;		/* File identification location. */

	size_t	  pgcookie_len;		/* Pgin/pgout cookie length. */
	size_t	  pgcookie_off;		/* Pgin/pgout cookie location. */

	u_int32_t lsn_cnt;		/* Checkpoint buffers left to write. */

	db_pgno_t last_pgno;		/* Last page in the file. */
	db_pgno_t orig_last_pgno;	/* Original last page in the file. */

#define	MP_CAN_MMAP	0x01		/* If the file can be mmap'd. */
#define	MP_TEMP		0x02		/* Backing file is a temporary. */
	u_int32_t  flags;

	DB_MPOOL_FSTAT stat;		/* Per-file mpool statistics. */
};
/*
 * BH --
 *	Buffer header.
 */
struct __bh {
	db_mutex_t	mutex;		/* Structure lock. */

	u_int16_t	ref;		/* Reference count. */

#define	BH_CALLPGIN	0x001		/* Page needs to be reworked... */
#define	BH_DIRTY	0x002		/* Page was modified. */
#define	BH_DISCARD	0x004		/* Page is useless. */
#define	BH_LOCKED	0x008		/* Page is locked (I/O in progress). */
#define	BH_TRASH	0x010		/* Page is garbage. */
#define	BH_WRITE	0x020		/* Page scheduled for writing. */
	u_int16_t  flags;

	SH_TAILQ_ENTRY	q;		/* LRU queue. */
	SH_TAILQ_ENTRY	hq;		/* MPOOL hash bucket queue. */

	db_pgno_t pgno;			/* Underlying MPOOLFILE page number. */
	size_t	  mf_offset;		/* Associated MPOOLFILE offset. */

	/*
	 * This array must be size_t aligned -- the DB access methods put
	 * PAGE and other structures into it, and expect to be able to
	 * access them directly.  (We guarantee size_t alignment in the
	 * db_mpool(3) manual page.)  An allocation sketch follows this
	 * structure.
	 */
	u_int8_t   buf[1];		/* Variable length data. */
};
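/*
 * Illustrative sketch (not part of the original header): buf[1] is the
 * usual C idiom for a variable-length trailing array, so a buffer header
 * is carved out of the shared region with room for the header plus one
 * page.  The allocator name below is only a placeholder for whatever
 * shared-region allocator is in use; pagesize is an assumed local.
 *
 *	BH *bhp;
 *
 *	bhp = example_shalloc(offsetof(BH, buf) + pagesize);
 *	memset(bhp->buf, 0, pagesize);	// bhp->buf holds the page itself.
 */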