/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 */
static const char sccsid[] = "@(#)mp_fget.c	10.53 (Sleepycat) 11/16/98";
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "common_ext.h"
/*
 * memp_fget --
 *	Get a page from the file.
 */
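/*
 * A typical call, as a sketch only (the DB_MPOOLFILE handle would come from
 * memp_fopen(); the flag choice here is illustrative, not from this file):
 *
 *	db_pgno_t pgno;
 *	void *addr;
 *	int ret;
 *
 *	pgno = 0;
 *	if ((ret = memp_fget(dbmfp, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
 *		return (ret);
 *	...operate on the page...
 *	(void)memp_fput(dbmfp, addr, DB_MPOOL_DIRTY);
 */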
int
memp_fget(dbmfp, pgnoaddr, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	u_int32_t flags;
	void *addrp;
{
	BH *bhp;
	DB_MPOOL *dbmp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	size_t bucket, mf_offset;
	u_int32_t st_hsearch;
	int b_incr, first, ret;

	dbmp = dbmfp->dbmp;
	mp = dbmp->mp;
	mfp = dbmfp->mfp;
	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS	(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret = __db_fchk(dbmp->dbenv,
		    "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_CREATE:
		case DB_MPOOL_LAST:
		case DB_MPOOL_NEW:
			break;
		default:
			return (__db_ferr(dbmp->dbenv, "memp_fget", 1));
		}
	}
	/*
	 * XXX
	 * We want to switch threads as often as possible.  Yield every time
	 * we get a new page to ensure contention.
	 */
	if (DB_GLOBAL(db_pageyield))
		__os_yield(1);
	/* Initialize remaining local variables. */
	mf_offset = R_OFFSET(dbmp, mfp);
	bhp = NULL;
	st_hsearch = 0;
	b_incr = ret = 0;
	/* Determine the hash bucket where this page will live. */
	bucket = BUCKET(mp, mf_offset, *pgnoaddr);

	LOCKREGION(dbmp);
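	/*
	 * (BUCKET() is presumably a simple hash of the file's region offset
	 * and the page number, modulo the hash table size: the (mf_offset,
	 * pgno) pair is what uniquely names a page in a pool shared by
	 * multiple files.)
	 */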
	/*
	 * Check for the last or last + 1 page requests.
	 *
	 * Examine and update the file's last_pgno value.  We don't care if
	 * the last_pgno value immediately changes due to another thread --
	 * at this instant in time, the value is correct.  We do increment the
	 * current last_pgno value if the thread is asking for a new page,
	 * however, to ensure that two threads creating pages don't get the
	 * same page.
	 */
	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
		if (LF_ISSET(DB_MPOOL_NEW))
			++mfp->last_pgno;
		*pgnoaddr = mfp->last_pgno;
		bucket = BUCKET(mp, mf_offset, mfp->last_pgno);

		if (LF_ISSET(DB_MPOOL_NEW))
			goto alloc;
	}
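	/*
	 * A DB_MPOOL_NEW page cannot already be present in the pool or the
	 * mmap'd region, which is why it jumps straight to the allocation
	 * code and skips the lookups below.
	 */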
	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * XXX
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
		if (*pgnoaddr > mfp->orig_last_pgno) {
			/*
			 * !!!
			 * See the comment above about non-existent pages and
			 * the hash access method.
			 */
			if (!LF_ISSET(DB_MPOOL_CREATE)) {
				__db_err(dbmp->dbenv,
				    "%s: page %lu doesn't exist",
				    __memp_fn(dbmfp), (u_long)*pgnoaddr);
				ret = EINVAL;
				goto err;
			}
		} else {
			*(void **)addrp = R_ADDR(dbmfp,
			    *pgnoaddr * mfp->stat.st_pagesize);
			++mp->stat.st_map;
			++mfp->stat.st_map;
			goto done;
		}
	}
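	/*
	 * Note the fast path above hands back a pointer directly into the
	 * mmap'd region: no buffer header is allocated, nothing is copied,
	 * and the page never appears on the hash chains.
	 */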
	/* Search the hash chain for the page. */
	for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;
		/* Check for a reference count overflow. */
		if (bhp->ref == UINT16_T_MAX) {
			__db_err(dbmp->dbenv,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = EINVAL;
			goto err;
		}
		/*
		 * Increment the reference count.  We may discard the region
		 * lock as we evaluate and/or read the buffer, so we need to
		 * ensure that it doesn't move and that its contents remain
		 * unchanged.
		 */
		++bhp->ref;
		b_incr = 1;

		/*
		 * Any buffer we find might be trouble.
		 *
		 * BH_LOCKED --
		 * I/O is in progress.  Because we've incremented the buffer
		 * reference count, we know the buffer can't move.  Unlock
		 * the region lock, wait for the I/O to complete, and reacquire
		 * the region lock.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
			UNLOCKREGION(dbmp);
			/*
			 * Explicitly yield the processor if it's not the first
			 * pass through this loop -- if we don't, we might end
			 * up running to the end of our CPU quantum as we will
			 * simply be swapping between the two locks.
			 */
			if (!first)
				__os_yield(1);
			LOCKBUFFER(dbmp, bhp);
			/* Wait for I/O to finish... */
			UNLOCKBUFFER(dbmp, bhp);

			LOCKREGION(dbmp);
		}
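		/*
		 * The lock/unlock pair above is the entire wait: acquiring
		 * the buffer mutex blocks until the thread doing the I/O
		 * drops it, and we release it immediately because waiting
		 * was the only point.  (This assumes LOCKBUFFER and
		 * UNLOCKBUFFER wrap the per-buffer mutex initialized below.)
		 */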
		/*
		 * BH_TRASH --
		 * The contents of the buffer are garbage.  Shouldn't happen,
		 * and this read is likely to fail, but might as well try.
		 */
		if (F_ISSET(bhp, BH_TRASH))
			goto reread;
		/*
		 * BH_CALLPGIN --
		 * The buffer was converted so it could be written, and the
		 * contents need to be converted again.
		 */
		if (F_ISSET(bhp, BH_CALLPGIN)) {
			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
				goto err;
			F_CLR(bhp, BH_CALLPGIN);
		}
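		/*
		 * (Presumably __memp_pg() with a final argument of 1 runs the
		 * application's pgin callback -- e.g. byte-order conversion --
		 * restoring the in-memory format before the page is returned.)
		 */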
		++mp->stat.st_cache_hit;
		++mfp->stat.st_cache_hit;
		*(void **)addrp = bhp->buf;
		goto done;
	}
alloc:	/* Allocate new buffer header and data space. */
	if ((ret = __memp_alloc(dbmp,
	    sizeof(BH) - sizeof(u_int8_t) + mfp->stat.st_pagesize,
	    NULL, &bhp)) != 0)
		goto err;
#ifdef DIAGNOSTIC
	if ((ALIGNTYPE)bhp->buf & (sizeof(size_t) - 1)) {
		__db_err(dbmp->dbenv,
		    "Internal error: BH data NOT size_t aligned.");
		ret = EINVAL;
		goto err;
	}
#endif
	/* Initialize the BH fields. */
	memset(bhp, 0, sizeof(BH));
	LOCKINIT(dbmp, &bhp->mutex);
	bhp->ref = 1;
	bhp->pgno = *pgnoaddr;
	bhp->mf_offset = mf_offset;
	/*
	 * Prepend the bucket header to the head of the appropriate MPOOL
	 * bucket hash list.  Append the bucket header to the tail of the
	 * MPOOL LRU chain.
	 */
	SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, hq, __bh);
	SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
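	/*
	 * Inserting at the head of the hash chain means just-faulted pages
	 * are found first by later searches; appending to mp->bhq presumably
	 * keeps that queue in LRU order for buffer reclamation.
	 */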
	/*
	 * If we created the page, zero it out and continue.
	 *
	 * !!!
	 * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
	 * If DB_MPOOL_CREATE is used, then the application's pgin function
	 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
	 * it can detect all of its page creates, and not bother.
	 *
	 * Otherwise, read the page into memory, optionally creating it if
	 * DB_MPOOL_CREATE is set.
	 */
	if (LF_ISSET(DB_MPOOL_NEW)) {
		if (mfp->clear_len == 0)
			memset(bhp->buf, 0, mfp->stat.st_pagesize);
		else {
			memset(bhp->buf, 0, mfp->clear_len);
#ifdef DIAGNOSTIC
			memset(bhp->buf + mfp->clear_len, 0xdb,
			    mfp->stat.st_pagesize - mfp->clear_len);
#endif
		}
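		/*
		 * (The 0xdb fill is presumably a debugging aid: only
		 * clear_len bytes are guaranteed zero, so in diagnostic
		 * builds the remainder of the page is filled with a
		 * recognizable pattern to expose reads of bytes the
		 * page's creator never initialized.)
		 */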
		++mp->stat.st_page_create;
		++mfp->stat.st_page_create;
	} else {
		/*
		 * It's possible for the read function to fail, which means
		 * that we fail as well.  Note, the __memp_pgread() function
		 * discards the region lock, so the buffer must be pinned
		 * down so that it cannot move and its contents are unchanged.
		 */
reread:		if ((ret = __memp_pgread(dbmfp,
		    bhp, LF_ISSET(DB_MPOOL_CREATE))) != 0) {
			/*
			 * !!!
			 * Discard the buffer unless another thread is waiting
			 * on our I/O to complete.  Regardless, the header has
			 * the BH_TRASH flag set.
			 */
			if (bhp->ref == 1)
				__memp_bhfree(dbmp, mfp, bhp, 1);
			goto err;
		}
		++mp->stat.st_cache_miss;
		++mfp->stat.st_cache_miss;
	}
	/*
	 * If we're returning a page after our current notion of the last-page,
	 * update our information.  Note, there's no way to un-instantiate this
	 * page, it's going to exist whether it's returned to us dirty or not.
	 */
	if (bhp->pgno > mfp->last_pgno)
		mfp->last_pgno = bhp->pgno;
	++mp->stat.st_page_clean;
	*(void **)addrp = bhp->buf;
done:	/* Update the chain search statistics. */
	if (st_hsearch != 0) {
		++mp->stat.st_hash_searches;
		if (st_hsearch > mp->stat.st_hash_longest)
			mp->stat.st_hash_longest = st_hsearch;
		mp->stat.st_hash_examined += st_hsearch;
	}
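	/*
	 * st_hsearch counted the buffer headers examined during the
	 * hash-chain walk, so the three counters above give the number of
	 * searches, the longest chain walked, and the total headers
	 * examined, from which an average chain length can be derived.
	 */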
	UNLOCKREGION(dbmp);
	return (0);

err:	/* Discard our reference. */
	if (b_incr)
		--bhp->ref;

	UNLOCKREGION(dbmp);
	*(void **)addrp = NULL;
	return (ret);
}