Update.
[glibc.git] / db2 / hash / hash_page.c
blob5b3463947beb6dbba8f1c5628b493fc2d39224cf
1 /*-
2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
6 */
7 /*
8 * Copyright (c) 1990, 1993, 1994
9 * Margo Seltzer. All rights reserved.
12 * Copyright (c) 1990, 1993, 1994
13 * The Regents of the University of California. All rights reserved.
15 * This code is derived from software contributed to Berkeley by
16 * Margo Seltzer.
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 * 3. All advertising materials mentioning features or use of this software
27 * must display the following acknowledgement:
28 * This product includes software developed by the University of
29 * California, Berkeley and its contributors.
30 * 4. Neither the name of the University nor the names of its contributors
31 * may be used to endorse or promote products derived from this software
32 * without specific prior written permission.
34 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44 * SUCH DAMAGE.
47 #include "config.h"
49 #ifndef lint
50 static const char sccsid[] = "@(#)hash_page.c 10.40 (Sleepycat) 6/2/98";
51 #endif /* not lint */
54 * PACKAGE: hashing
56 * DESCRIPTION:
57 * Page manipulation for hashing package.
59 * ROUTINES:
61 * External
62 * __get_page
63 * __add_ovflpage
64 * __overflow_page
65 * Internal
66 * open_temp
69 #ifndef NO_SYSTEM_INCLUDES
70 #include <sys/types.h>
72 #include <errno.h>
73 #include <string.h>
74 #endif
76 #include "db_int.h"
77 #include "db_page.h"
78 #include "hash.h"
80 static int __ham_lock_bucket __P((DB *, HASH_CURSOR *, db_lockmode_t));
82 #ifdef DEBUG_SLOW
83 static void __account_page(HTAB *, db_pgno_t, int);
84 #endif
87 * PUBLIC: int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
89 int
90 __ham_item(hashp, cursorp, mode)
91 HTAB *hashp;
92 HASH_CURSOR *cursorp;
93 db_lockmode_t mode;
95 db_pgno_t next_pgno;
96 int ret;
98 if (F_ISSET(cursorp, H_DELETED))
99 return (EINVAL);
100 F_CLR(cursorp, H_OK | H_NOMORE);
102 /* Check if we need to get a page for this cursor. */
103 if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
104 return (ret);
106 /* Check if we are looking for space in which to insert an item. */
107 if (cursorp->seek_size && cursorp->seek_found_page == PGNO_INVALID
108 && cursorp->seek_size < P_FREESPACE(cursorp->pagep))
109 cursorp->seek_found_page = cursorp->pgno;
111 /* Check if we need to go on to the next page. */
112 if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno == PGNO_INVALID)
114 * ISDUP is set, and offset is at the beginning of the datum.
115 * We need to grab the length of the datum, then set the datum
116 * pointer to be the beginning of the datum.
118 memcpy(&cursorp->dup_len,
119 HKEYDATA_DATA(H_PAIRDATA(cursorp->pagep, cursorp->bndx)) +
120 cursorp->dup_off, sizeof(db_indx_t));
121 else if (F_ISSET(cursorp, H_ISDUP)) {
122 /* Make sure we're not about to run off the page. */
123 if (cursorp->dpagep == NULL && (ret = __ham_get_page(hashp->dbp,
124 cursorp->dpgno, &cursorp->dpagep)) != 0)
125 return (ret);
127 if (cursorp->dndx >= NUM_ENT(cursorp->dpagep)) {
128 if (NEXT_PGNO(cursorp->dpagep) == PGNO_INVALID) {
129 if ((ret = __ham_put_page(hashp->dbp,
130 cursorp->dpagep, 0)) != 0)
131 return (ret);
132 F_CLR(cursorp, H_ISDUP);
133 cursorp->dpagep = NULL;
134 cursorp->dpgno = PGNO_INVALID;
135 cursorp->dndx = NDX_INVALID;
136 cursorp->bndx++;
137 } else if ((ret = __ham_next_cpage(hashp, cursorp,
138 NEXT_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0)
139 return (ret);
143 if (cursorp->bndx >= (db_indx_t)H_NUMPAIRS(cursorp->pagep)) {
144 /* Fetch next page. */
145 if (NEXT_PGNO(cursorp->pagep) == PGNO_INVALID) {
146 F_SET(cursorp, H_NOMORE);
147 if (cursorp->dpagep != NULL &&
148 (ret = __ham_put_page(hashp->dbp,
149 cursorp->dpagep, 0)) != 0)
150 return (ret);
151 cursorp->dpgno = PGNO_INVALID;
152 return (DB_NOTFOUND);
154 next_pgno = NEXT_PGNO(cursorp->pagep);
155 cursorp->bndx = 0;
156 if ((ret = __ham_next_cpage(hashp,
157 cursorp, next_pgno, 0, 0)) != 0)
158 return (ret);
161 F_SET(cursorp, H_OK);
162 return (0);
166 * PUBLIC: int __ham_item_reset __P((HTAB *, HASH_CURSOR *));
169 __ham_item_reset(hashp, cursorp)
170 HTAB *hashp;
171 HASH_CURSOR *cursorp;
173 int ret;
175 if (cursorp->pagep)
176 ret = __ham_put_page(hashp->dbp, cursorp->pagep, 0);
177 else
178 ret = 0;
180 __ham_item_init(cursorp);
181 return (ret);
185 * PUBLIC: void __ham_item_init __P((HASH_CURSOR *));
187 void
188 __ham_item_init(cursorp)
189 HASH_CURSOR *cursorp;
191 cursorp->pagep = NULL;
192 cursorp->bucket = BUCKET_INVALID;
193 cursorp->lock = 0;
194 cursorp->bndx = NDX_INVALID;
195 cursorp->pgno = PGNO_INVALID;
196 cursorp->dpgno = PGNO_INVALID;
197 cursorp->dndx = NDX_INVALID;
198 cursorp->dpagep = NULL;
199 cursorp->flags = 0;
200 cursorp->seek_size = 0;
201 cursorp->seek_found_page = PGNO_INVALID;
205 * PUBLIC: int __ham_item_done __P((HTAB *, HASH_CURSOR *, int));
208 __ham_item_done(hashp, cursorp, dirty)
209 HTAB *hashp;
210 HASH_CURSOR *cursorp;
211 int dirty;
213 int ret, t_ret;
215 t_ret = ret = 0;
217 if (cursorp->pagep)
218 ret = __ham_put_page(hashp->dbp, cursorp->pagep,
219 dirty && cursorp->dpagep == NULL);
220 cursorp->pagep = NULL;
222 if (cursorp->dpagep)
223 t_ret = __ham_put_page(hashp->dbp, cursorp->dpagep, dirty);
224 cursorp->dpagep = NULL;
226 if (ret == 0 && t_ret != 0)
227 ret = t_ret;
230 * If we are running with transactions, then we must
231 * not relinquish locks explicitly.
233 if (cursorp->lock && hashp->dbp->txn == NULL)
234 t_ret = lock_put(hashp->dbp->dbenv->lk_info, cursorp->lock);
235 cursorp->lock = 0;
239 * We don't throw out the page number since we might want to
240 * continue getting on this page.
242 return (ret != 0 ? ret : t_ret);
246 * Returns the last item in a bucket.
248 * PUBLIC: int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
251 __ham_item_last(hashp, cursorp, mode)
252 HTAB *hashp;
253 HASH_CURSOR *cursorp;
254 db_lockmode_t mode;
256 int ret;
258 if ((ret = __ham_item_reset(hashp, cursorp)) != 0)
259 return (ret);
261 cursorp->bucket = hashp->hdr->max_bucket;
262 F_SET(cursorp, H_OK);
263 return (__ham_item_prev(hashp, cursorp, mode));
267 * PUBLIC: int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
270 __ham_item_first(hashp, cursorp, mode)
271 HTAB *hashp;
272 HASH_CURSOR *cursorp;
273 db_lockmode_t mode;
275 int ret;
277 if ((ret = __ham_item_reset(hashp, cursorp)) != 0)
278 return (ret);
279 F_SET(cursorp, H_OK);
280 cursorp->bucket = 0;
281 return (__ham_item_next(hashp, cursorp, mode));
285 * __ham_item_prev --
286 * Returns a pointer to key/data pair on a page. In the case of
287 * bigkeys, just returns the page number and index of the bigkey
288 * pointer pair.
290 * PUBLIC: int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
293 __ham_item_prev(hashp, cursorp, mode)
294 HTAB *hashp;
295 HASH_CURSOR *cursorp;
296 db_lockmode_t mode;
298 db_pgno_t next_pgno;
299 int ret;
302 * There are N cases for backing up in a hash file.
303 * Case 1: In the middle of a page, no duplicates, just dec the index.
304 * Case 2: In the middle of a duplicate set, back up one.
305 * Case 3: At the beginning of a duplicate set, get out of set and
306 * back up to next key.
307 * Case 4: At the beginning of a page; go to previous page.
308 * Case 5: At the beginning of a bucket; go to prev bucket.
310 F_CLR(cursorp, H_OK | H_NOMORE | H_DELETED);
313 * First handle the duplicates. Either you'll get the key here
314 * or you'll exit the duplicate set and drop into the code below
315 * to handle backing up through keys.
317 if (F_ISSET(cursorp, H_ISDUP)) {
318 if (cursorp->dpgno == PGNO_INVALID) {
319 /* Duplicates are on-page. */
320 if (cursorp->dup_off != 0) {
321 if ((ret = __ham_get_cpage(hashp,
322 cursorp, mode)) != 0)
323 return (ret);
324 else {
325 HASH_CURSOR *h;
326 h = cursorp;
327 memcpy(&h->dup_len, HKEYDATA_DATA(
328 H_PAIRDATA(h->pagep, h->bndx))
329 + h->dup_off - sizeof(db_indx_t),
330 sizeof(db_indx_t));
331 cursorp->dup_off -=
332 DUP_SIZE(cursorp->dup_len);
333 cursorp->dndx--;
334 return (__ham_item(hashp,
335 cursorp, mode));
338 } else if (cursorp->dndx > 0) { /* Duplicates are off-page. */
339 cursorp->dndx--;
340 return (__ham_item(hashp, cursorp, mode));
341 } else if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
342 return (ret);
343 else if (PREV_PGNO(cursorp->dpagep) == PGNO_INVALID) {
344 F_CLR(cursorp, H_ISDUP); /* End of dups */
345 cursorp->dpgno = PGNO_INVALID;
346 if (cursorp->dpagep != NULL)
347 (void)__ham_put_page(hashp->dbp,
348 cursorp->dpagep, 0);
349 cursorp->dpagep = NULL;
350 } else if ((ret = __ham_next_cpage(hashp, cursorp,
351 PREV_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0)
352 return (ret);
353 else {
354 cursorp->dndx = NUM_ENT(cursorp->pagep) - 1;
355 return (__ham_item(hashp, cursorp, mode));
360 * If we get here, we are not in a duplicate set, and just need
361 * to back up the cursor. There are still three cases:
362 * midpage, beginning of page, beginning of bucket.
365 if (cursorp->bndx == 0) { /* Beginning of page. */
366 if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0)
367 return (ret);
368 cursorp->pgno = PREV_PGNO(cursorp->pagep);
369 if (cursorp->pgno == PGNO_INVALID) {
370 /* Beginning of bucket. */
371 F_SET(cursorp, H_NOMORE);
372 return (DB_NOTFOUND);
373 } else if ((ret = __ham_next_cpage(hashp,
374 cursorp, cursorp->pgno, 0, 0)) != 0)
375 return (ret);
376 else
377 cursorp->bndx = H_NUMPAIRS(cursorp->pagep);
381 * Either we've got the cursor set up to be decremented, or we
382 * have to find the end of a bucket.
384 if (cursorp->bndx == NDX_INVALID) {
385 if (cursorp->pagep == NULL)
386 next_pgno = BUCKET_TO_PAGE(hashp, cursorp->bucket);
387 else
388 goto got_page;
390 do {
391 if ((ret = __ham_next_cpage(hashp,
392 cursorp, next_pgno, 0, 0)) != 0)
393 return (ret);
394 got_page: next_pgno = NEXT_PGNO(cursorp->pagep);
395 cursorp->bndx = H_NUMPAIRS(cursorp->pagep);
396 } while (next_pgno != PGNO_INVALID);
398 if (cursorp->bndx == 0) {
399 /* Bucket was empty. */
400 F_SET(cursorp, H_NOMORE);
401 return (DB_NOTFOUND);
405 cursorp->bndx--;
407 return (__ham_item(hashp, cursorp, mode));
411 * Sets the cursor to the next key/data pair on a page.
413 * PUBLIC: int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
416 __ham_item_next(hashp, cursorp, mode)
417 HTAB *hashp;
418 HASH_CURSOR *cursorp;
419 db_lockmode_t mode;
422 * Deleted on-page duplicates are a weird case. If we delete the last
423 * one, then our cursor is at the very end of a duplicate set and
424 * we actually need to go on to the next key.
426 if (F_ISSET(cursorp, H_DELETED)) {
427 if (cursorp->bndx != NDX_INVALID &&
428 F_ISSET(cursorp, H_ISDUP) &&
429 cursorp->dpgno == PGNO_INVALID &&
430 cursorp->dup_tlen == cursorp->dup_off) {
431 F_CLR(cursorp, H_ISDUP);
432 cursorp->dpgno = PGNO_INVALID;
433 cursorp->bndx++;
435 F_CLR(cursorp, H_DELETED);
436 } else if (cursorp->bndx == NDX_INVALID) {
437 cursorp->bndx = 0;
438 cursorp->dpgno = PGNO_INVALID;
439 F_CLR(cursorp, H_ISDUP);
440 } else if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno != PGNO_INVALID)
441 cursorp->dndx++;
442 else if (F_ISSET(cursorp, H_ISDUP)) {
443 cursorp->dndx++;
444 cursorp->dup_off += DUP_SIZE(cursorp->dup_len);
445 if (cursorp->dup_off >= cursorp->dup_tlen) {
446 F_CLR(cursorp, H_ISDUP);
447 cursorp->dpgno = PGNO_INVALID;
448 cursorp->bndx++;
450 } else
451 cursorp->bndx++;
453 return (__ham_item(hashp, cursorp, mode));
457 * PUBLIC: void __ham_putitem __P((PAGE *p, const DBT *, int));
459 * This is a little bit sleazy in that we're overloading the meaning
460 * of the H_OFFPAGE type here. When we recover deletes, we have the
461 * entire entry instead of having only the DBT, so we'll pass type
462 * H_OFFPAGE to mean, "copy the whole entry" as opposed to constructing
463 * an H_KEYDATA around it.
465 void
466 __ham_putitem(p, dbt, type)
467 PAGE *p;
468 const DBT *dbt;
469 int type;
471 u_int16_t n, off;
473 n = NUM_ENT(p);
475 /* Put the item element on the page. */
476 if (type == H_OFFPAGE) {
477 off = HOFFSET(p) - dbt->size;
478 HOFFSET(p) = p->inp[n] = off;
479 memcpy(P_ENTRY(p, n), dbt->data, dbt->size);
480 } else {
481 off = HOFFSET(p) - HKEYDATA_SIZE(dbt->size);
482 HOFFSET(p) = p->inp[n] = off;
483 PUT_HKEYDATA(P_ENTRY(p, n), dbt->data, dbt->size, type);
486 /* Adjust page info. */
487 NUM_ENT(p) += 1;
491 * PUBLIC: void __ham_reputpair
492 * PUBLIC: __P((PAGE *p, u_int32_t, u_int32_t, const DBT *, const DBT *));
494 * This is a special case to restore a key/data pair to its original
495 * location during recovery. We are guaranteed that the pair fits
496 * on the page and is not the last pair on the page (because if it's
497 * the last pair, the normal insert works).
499 void
500 __ham_reputpair(p, psize, ndx, key, data)
501 PAGE *p;
502 u_int32_t psize, ndx;
503 const DBT *key, *data;
505 db_indx_t i, movebytes, newbytes;
506 u_int8_t *from;
508 /* First shuffle the existing items up on the page. */
509 movebytes =
510 (ndx == 0 ? psize : p->inp[H_DATAINDEX(ndx - 1)]) - HOFFSET(p);
511 newbytes = key->size + data->size;
512 from = (u_int8_t *)p + HOFFSET(p);
513 memmove(from - newbytes, from, movebytes);
516 * Adjust the indices and move them up 2 spaces. Note that we
517 * have to check the exit condition inside the loop just in case
518 * we are dealing with index 0 (db_indx_t's are unsigned).
520 for (i = NUM_ENT(p) - 1; ; i-- ) {
521 p->inp[i + 2] = p->inp[i] - newbytes;
522 if (i == H_KEYINDEX(ndx))
523 break;
526 /* Put the key and data on the page. */
527 p->inp[H_KEYINDEX(ndx)] =
528 (ndx == 0 ? psize : p->inp[H_DATAINDEX(ndx - 1)]) - key->size;
529 p->inp[H_DATAINDEX(ndx)] = p->inp[H_KEYINDEX(ndx)] - data->size;
530 memcpy(P_ENTRY(p, H_KEYINDEX(ndx)), key->data, key->size);
531 memcpy(P_ENTRY(p, H_DATAINDEX(ndx)), data->data, data->size);
533 /* Adjust page info. */
534 HOFFSET(p) -= newbytes;
535 NUM_ENT(p) += 2;
540 * PUBLIC: int __ham_del_pair __P((HTAB *, HASH_CURSOR *, int));
542 * XXX
543 * TODO: if the item is an offdup, delete the other pages and then remove
544 * the pair. If the offpage page is 0, then you can just remove the pair.
547 __ham_del_pair(hashp, cursorp, reclaim_page)
548 HTAB *hashp;
549 HASH_CURSOR *cursorp;
550 int reclaim_page;
552 DBT data_dbt, key_dbt;
553 DB_ENV *dbenv;
554 DB_LSN new_lsn, *n_lsn, tmp_lsn;
555 PAGE *p;
556 db_indx_t ndx;
557 db_pgno_t chg_pgno, pgno;
558 int ret, tret;
560 dbenv = hashp->dbp->dbenv;
561 ndx = cursorp->bndx;
562 if (cursorp->pagep == NULL && (ret =
563 __ham_get_page(hashp->dbp, cursorp->pgno, &cursorp->pagep)) != 0)
564 return (ret);
566 p = cursorp->pagep;
569 * We optimize for the normal case which is when neither the key nor
570 * the data are large. In this case, we write a single log record
571 * and do the delete. If either is large, we'll call __big_delete
572 * to remove the big item and then update the page to remove the
573 * entry referring to the big item.
575 ret = 0;
576 if (HPAGE_PTYPE(H_PAIRKEY(p, ndx)) == H_OFFPAGE) {
577 memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(p, H_KEYINDEX(ndx))),
578 sizeof(db_pgno_t));
579 ret = __db_doff(hashp->dbp, pgno, __ham_del_page);
582 if (ret == 0)
583 switch (HPAGE_PTYPE(H_PAIRDATA(p, ndx))) {
584 case H_OFFPAGE:
585 memcpy(&pgno,
586 HOFFPAGE_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
587 sizeof(db_pgno_t));
588 ret = __db_doff(hashp->dbp, pgno, __ham_del_page);
589 break;
590 case H_OFFDUP:
591 memcpy(&pgno,
592 HOFFDUP_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
593 sizeof(db_pgno_t));
594 ret = __db_ddup(hashp->dbp, pgno, __ham_del_page);
595 F_CLR(cursorp, H_ISDUP);
596 break;
597 case H_DUPLICATE:
599 * If we delete a pair that is/was a duplicate, then
600 * we had better clear the flag so that we update the
601 * cursor appropriately.
603 F_CLR(cursorp, H_ISDUP);
604 break;
607 if (ret)
608 return (ret);
610 /* Now log the delete off this page. */
611 if (DB_LOGGING(hashp->dbp)) {
612 key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx));
613 key_dbt.size =
614 LEN_HITEM(p, hashp->hdr->pagesize, H_KEYINDEX(ndx));
615 data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx));
616 data_dbt.size =
617 LEN_HITEM(p, hashp->hdr->pagesize, H_DATAINDEX(ndx));
619 if ((ret = __ham_insdel_log(dbenv->lg_info,
620 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPAIR,
621 hashp->dbp->log_fileid, PGNO(p), (u_int32_t)ndx,
622 &LSN(p), &key_dbt, &data_dbt)) != 0)
623 return (ret);
625 /* Move lsn onto page. */
626 LSN(p) = new_lsn;
629 __ham_dpair(hashp->dbp, p, ndx);
632 * If we are locking, we will not maintain this.
633 * XXXX perhaps we can retain incremental numbers and apply them
634 * later.
636 if (!F_ISSET(hashp->dbp, DB_AM_LOCKING))
637 --hashp->hdr->nelem;
640 * If we need to reclaim the page, then check if the page is empty.
641 * There are two cases. If it's empty and it's not the first page
642 * in the bucket (i.e., the bucket page) then we can simply remove
643 * it. If it is the first chain in the bucket, then we need to copy
644 * the second page into it and remove the second page.
646 if (reclaim_page && NUM_ENT(p) == 0 && PREV_PGNO(p) == PGNO_INVALID &&
647 NEXT_PGNO(p) != PGNO_INVALID) {
648 PAGE *n_pagep, *nn_pagep;
649 db_pgno_t tmp_pgno;
652 * First page in chain is empty and we know that there
653 * are more pages in the chain.
655 if ((ret =
656 __ham_get_page(hashp->dbp, NEXT_PGNO(p), &n_pagep)) != 0)
657 return (ret);
659 if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
660 if ((ret =
661 __ham_get_page(hashp->dbp, NEXT_PGNO(n_pagep),
662 &nn_pagep)) != 0) {
663 (void) __ham_put_page(hashp->dbp, n_pagep, 0);
664 return (ret);
668 if (DB_LOGGING(hashp->dbp)) {
669 key_dbt.data = n_pagep;
670 key_dbt.size = hashp->hdr->pagesize;
671 if ((ret = __ham_copypage_log(dbenv->lg_info,
672 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
673 hashp->dbp->log_fileid, PGNO(p), &LSN(p),
674 PGNO(n_pagep), &LSN(n_pagep), NEXT_PGNO(n_pagep),
675 NEXT_PGNO(n_pagep) == PGNO_INVALID ? NULL :
676 &LSN(nn_pagep), &key_dbt)) != 0)
677 return (ret);
679 /* Move lsn onto page. */
680 LSN(p) = new_lsn; /* Structure assignment. */
681 LSN(n_pagep) = new_lsn;
682 if (NEXT_PGNO(n_pagep) != PGNO_INVALID)
683 LSN(nn_pagep) = new_lsn;
685 if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
686 PREV_PGNO(nn_pagep) = PGNO(p);
687 (void)__ham_put_page(hashp->dbp, nn_pagep, 1);
690 tmp_pgno = PGNO(p);
691 tmp_lsn = LSN(p);
692 memcpy(p, n_pagep, hashp->hdr->pagesize);
693 PGNO(p) = tmp_pgno;
694 LSN(p) = tmp_lsn;
695 PREV_PGNO(p) = PGNO_INVALID;
698 * Cursor is advanced to the beginning of the next page.
700 cursorp->bndx = 0;
701 cursorp->pgno = PGNO(p);
702 F_SET(cursorp, H_DELETED);
703 chg_pgno = PGNO(p);
704 if ((ret = __ham_dirty_page(hashp, p)) != 0 ||
705 (ret = __ham_del_page(hashp->dbp, n_pagep)) != 0)
706 return (ret);
707 } else if (reclaim_page &&
708 NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) {
709 PAGE *n_pagep, *p_pagep;
711 if ((ret =
712 __ham_get_page(hashp->dbp, PREV_PGNO(p), &p_pagep)) != 0)
713 return (ret);
715 if (NEXT_PGNO(p) != PGNO_INVALID) {
716 if ((ret = __ham_get_page(hashp->dbp,
717 NEXT_PGNO(p), &n_pagep)) != 0) {
718 (void)__ham_put_page(hashp->dbp, p_pagep, 0);
719 return (ret);
721 n_lsn = &LSN(n_pagep);
722 } else {
723 n_pagep = NULL;
724 n_lsn = NULL;
727 NEXT_PGNO(p_pagep) = NEXT_PGNO(p);
728 if (n_pagep != NULL)
729 PREV_PGNO(n_pagep) = PGNO(p_pagep);
731 if (DB_LOGGING(hashp->dbp)) {
732 if ((ret = __ham_newpage_log(dbenv->lg_info,
733 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELOVFL,
734 hashp->dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep),
735 PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0)
736 return (ret);
738 /* Move lsn onto page. */
739 LSN(p_pagep) = new_lsn; /* Structure assignment. */
740 if (n_pagep)
741 LSN(n_pagep) = new_lsn;
742 LSN(p) = new_lsn;
744 cursorp->pgno = NEXT_PGNO(p);
745 cursorp->bndx = 0;
747 * Since we are about to delete the cursor page and we have
748 * just moved the cursor, we need to make sure that the
749 * old page pointer isn't left hanging around in the cursor.
751 cursorp->pagep = NULL;
752 chg_pgno = PGNO(p);
753 ret = __ham_del_page(hashp->dbp, p);
754 if ((tret = __ham_put_page(hashp->dbp, p_pagep, 1)) != 0 &&
755 ret == 0)
756 ret = tret;
757 if (n_pagep != NULL &&
758 (tret = __ham_put_page(hashp->dbp, n_pagep, 1)) != 0 &&
759 ret == 0)
760 ret = tret;
761 if (ret != 0)
762 return (ret);
763 } else {
765 * Mark item deleted so that we don't try to return it, and
766 * so that we update the cursor correctly on the next call
767 * to next.
769 F_SET(cursorp, H_DELETED);
770 chg_pgno = cursorp->pgno;
771 ret = __ham_dirty_page(hashp, p);
773 __ham_c_update(cursorp, chg_pgno, 0, 0, 0);
776 * Since we just deleted a pair from the master page, anything
777 * in cursorp->dpgno should be cleared.
779 cursorp->dpgno = PGNO_INVALID;
781 F_CLR(cursorp, H_OK);
782 return (ret);
786 * __ham_replpair --
787 * Given the key data indicated by the cursor, replace part/all of it
788 * according to the fields in the dbt.
790 * PUBLIC: int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t));
793 __ham_replpair(hashp, hcp, dbt, make_dup)
794 HTAB *hashp;
795 HASH_CURSOR *hcp;
796 DBT *dbt;
797 u_int32_t make_dup;
799 DBT old_dbt, tdata, tmp;
800 DB_LSN new_lsn;
801 int32_t change; /* XXX: Possible overflow. */
802 u_int32_t len;
803 int is_big, ret, type;
804 u_int8_t *beg, *dest, *end, *hk, *src;
807 * Big item replacements are handled in generic code.
808 * Items that fit on the current page fall into 4 classes.
809 * 1. On-page element, same size
810 * 2. On-page element, new is bigger (fits)
811 * 3. On-page element, new is bigger (does not fit)
812 * 4. On-page element, old is bigger
813 * Numbers 1, 2, and 4 are essentially the same (and should
814 * be the common case). We handle case 3 as a delete and
815 * add.
819 * We need to compute the number of bytes that we are adding or
820 * removing from the entry. Normally, we can simply substract
821 * the number of bytes we are replacing (dbt->dlen) from the
822 * number of bytes we are inserting (dbt->size). However, if
823 * we are doing a partial put off the end of a record, then this
824 * formula doesn't work, because we are essentially adding
825 * new bytes.
827 change = dbt->size - dbt->dlen;
829 hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
830 is_big = HPAGE_PTYPE(hk) == H_OFFPAGE;
832 if (is_big)
833 memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
834 else
835 len = LEN_HKEYDATA(hcp->pagep,
836 hashp->dbp->pgsize, H_DATAINDEX(hcp->bndx));
838 if (dbt->doff + dbt->dlen > len)
839 change += dbt->doff + dbt->dlen - len;
842 if (change > (int32_t)P_FREESPACE(hcp->pagep) || is_big) {
844 * Case 3 -- two subcases.
845 * A. This is not really a partial operation, but an overwrite.
846 * Simple del and add works.
847 * B. This is a partial and we need to construct the data that
848 * we are really inserting (yuck).
849 * In both cases, we need to grab the key off the page (in
850 * some cases we could do this outside of this routine; for
851 * cleanliness we do it here. If you happen to be on a big
852 * key, this could be a performance hit).
854 tmp.flags = 0;
855 F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL);
856 if ((ret =
857 __db_ret(hashp->dbp, hcp->pagep, H_KEYINDEX(hcp->bndx),
858 &tmp, &hcp->big_key, &hcp->big_keylen)) != 0)
859 return (ret);
861 if (dbt->doff == 0 && dbt->dlen == len) {
862 ret = __ham_del_pair(hashp, hcp, 0);
863 if (ret == 0)
864 ret = __ham_add_el(hashp,
865 hcp, &tmp, dbt, H_KEYDATA);
866 } else { /* Case B */
867 type = HPAGE_PTYPE(hk) != H_OFFPAGE ?
868 HPAGE_PTYPE(hk) : H_KEYDATA;
869 tdata.flags = 0;
870 F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL);
872 if ((ret = __db_ret(hashp->dbp, hcp->pagep,
873 H_DATAINDEX(hcp->bndx), &tdata, &hcp->big_data,
874 &hcp->big_datalen)) != 0)
875 goto err;
877 /* Now we can delete the item. */
878 if ((ret = __ham_del_pair(hashp, hcp, 0)) != 0) {
879 __db_free(tdata.data);
880 goto err;
883 /* Now shift old data around to make room for new. */
884 if (change > 0) {
885 tdata.data = (void *)__db_realloc(tdata.data,
886 tdata.size + change);
887 memset((u_int8_t *)tdata.data + tdata.size,
888 0, change);
890 if (tdata.data == NULL)
891 return (ENOMEM);
892 end = (u_int8_t *)tdata.data + tdata.size;
894 src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen;
895 if (src < end && tdata.size > dbt->doff + dbt->dlen) {
896 len = tdata.size - dbt->doff - dbt->dlen;
897 dest = src + change;
898 memmove(dest, src, len);
900 memcpy((u_int8_t *)tdata.data + dbt->doff,
901 dbt->data, dbt->size);
902 tdata.size += change;
904 /* Now add the pair. */
905 ret = __ham_add_el(hashp, hcp, &tmp, &tdata, type);
906 __db_free(tdata.data);
908 err: __db_free(tmp.data);
909 return (ret);
913 * Set up pointer into existing data. Do it before the log
914 * message so we can use it inside of the log setup.
916 beg = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
917 beg += dbt->doff;
920 * If we are going to have to move bytes at all, figure out
921 * all the parameters here. Then log the call before moving
922 * anything around.
924 if (DB_LOGGING(hashp->dbp)) {
925 old_dbt.data = beg;
926 old_dbt.size = dbt->dlen;
927 if ((ret = __ham_replace_log(hashp->dbp->dbenv->lg_info,
928 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
929 hashp->dbp->log_fileid, PGNO(hcp->pagep),
930 (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep),
931 (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0)
932 return (ret);
934 LSN(hcp->pagep) = new_lsn; /* Structure assignment. */
937 __ham_onpage_replace(hcp->pagep, hashp->dbp->pgsize,
938 (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt);
940 return (0);
944 * Replace data on a page with new data, possibly growing or shrinking what's
945 * there. This is called on two different occasions. On one (from replpair)
946 * we are interested in changing only the data. On the other (from recovery)
947 * we are replacing the entire data (header and all) with a new element. In
948 * the latter case, the off argument is negative.
949 * pagep: the page that we're changing
950 * ndx: page index of the element that is growing/shrinking.
951 * off: Offset at which we are beginning the replacement.
952 * change: the number of bytes (+ or -) that the element is growing/shrinking.
953 * dbt: the new data that gets written at beg.
954 * PUBLIC: void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t,
955 * PUBLIC: int32_t, DBT *));
957 void
958 __ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt)
959 PAGE *pagep;
960 size_t pgsize;
961 u_int32_t ndx;
962 int32_t off;
963 int32_t change;
964 DBT *dbt;
966 db_indx_t i;
967 int32_t len;
968 u_int8_t *src, *dest;
969 int zero_me;
971 if (change != 0) {
972 zero_me = 0;
973 src = (u_int8_t *)(pagep) + HOFFSET(pagep);
974 if (off < 0)
975 len = pagep->inp[ndx] - HOFFSET(pagep);
976 else if ((u_int32_t)off >= LEN_HKEYDATA(pagep, pgsize, ndx)) {
977 len = HKEYDATA_DATA(P_ENTRY(pagep, ndx)) +
978 LEN_HKEYDATA(pagep, pgsize, ndx) - src;
979 zero_me = 1;
980 } else
981 len = (HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off) - src;
982 dest = src - change;
983 memmove(dest, src, len);
984 if (zero_me)
985 memset(dest + len, 0, change);
987 /* Now update the indices. */
988 for (i = ndx; i < NUM_ENT(pagep); i++)
989 pagep->inp[i] -= change;
990 HOFFSET(pagep) -= change;
992 if (off >= 0)
993 memcpy(HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off,
994 dbt->data, dbt->size);
995 else
996 memcpy(P_ENTRY(pagep, ndx), dbt->data, dbt->size);
1000 * PUBLIC: int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t));
1003 __ham_split_page(hashp, obucket, nbucket)
1004 HTAB *hashp;
1005 u_int32_t obucket, nbucket;
1007 DBT key, page_dbt;
1008 DB_ENV *dbenv;
1009 DB_LSN new_lsn;
1010 PAGE **pp, *old_pagep, *temp_pagep, *new_pagep;
1011 db_indx_t n;
1012 db_pgno_t bucket_pgno, next_pgno;
1013 u_int32_t big_len, len;
1014 int ret, tret;
1015 void *big_buf;
1017 dbenv = hashp->dbp->dbenv;
1018 temp_pagep = old_pagep = new_pagep = NULL;
1020 bucket_pgno = BUCKET_TO_PAGE(hashp, obucket);
1021 if ((ret = __ham_get_page(hashp->dbp, bucket_pgno, &old_pagep)) != 0)
1022 return (ret);
1023 if ((ret = __ham_new_page(hashp, BUCKET_TO_PAGE(hashp, nbucket), P_HASH,
1024 &new_pagep)) != 0)
1025 goto err;
1027 temp_pagep = hashp->split_buf;
1028 memcpy(temp_pagep, old_pagep, hashp->hdr->pagesize);
1030 if (DB_LOGGING(hashp->dbp)) {
1031 page_dbt.size = hashp->hdr->pagesize;
1032 page_dbt.data = old_pagep;
1033 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1034 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
1035 hashp->dbp->log_fileid, SPLITOLD, PGNO(old_pagep),
1036 &page_dbt, &LSN(old_pagep))) != 0)
1037 goto err;
1040 P_INIT(old_pagep, hashp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID,
1041 PGNO_INVALID, 0, P_HASH);
1043 if (DB_LOGGING(hashp->dbp))
1044 LSN(old_pagep) = new_lsn; /* Structure assignment. */
1046 big_len = 0;
1047 big_buf = NULL;
1048 key.flags = 0;
1049 while (temp_pagep != NULL) {
1050 for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) {
1051 if ((ret =
1052 __db_ret(hashp->dbp, temp_pagep, H_KEYINDEX(n),
1053 &key, &big_buf, &big_len)) != 0)
1054 goto err;
1056 if (__ham_call_hash(hashp, key.data, key.size)
1057 == obucket)
1058 pp = &old_pagep;
1059 else
1060 pp = &new_pagep;
1063 * Figure out how many bytes we need on the new
1064 * page to store the key/data pair.
1067 len = LEN_HITEM(temp_pagep, hashp->hdr->pagesize,
1068 H_DATAINDEX(n)) +
1069 LEN_HITEM(temp_pagep, hashp->hdr->pagesize,
1070 H_KEYINDEX(n)) +
1071 2 * sizeof(db_indx_t);
1073 if (P_FREESPACE(*pp) < len) {
1074 if (DB_LOGGING(hashp->dbp)) {
1075 page_dbt.size = hashp->hdr->pagesize;
1076 page_dbt.data = *pp;
1077 if ((ret = __ham_splitdata_log(
1078 dbenv->lg_info,
1079 (DB_TXN *)hashp->dbp->txn,
1080 &new_lsn, 0,
1081 hashp->dbp->log_fileid, SPLITNEW,
1082 PGNO(*pp), &page_dbt,
1083 &LSN(*pp))) != 0)
1084 goto err;
1085 LSN(*pp) = new_lsn;
1087 if ((ret = __ham_add_ovflpage(hashp,
1088 *pp, 1, pp)) != 0)
1089 goto err;
1091 __ham_copy_item(hashp, temp_pagep, H_KEYINDEX(n), *pp);
1092 __ham_copy_item(hashp, temp_pagep, H_DATAINDEX(n), *pp);
1094 next_pgno = NEXT_PGNO(temp_pagep);
1096 /* Clear temp_page; if it's a link overflow page, free it. */
1097 if (PGNO(temp_pagep) != bucket_pgno && (ret =
1098 __ham_del_page(hashp->dbp, temp_pagep)) != 0)
1099 goto err;
1101 if (next_pgno == PGNO_INVALID)
1102 temp_pagep = NULL;
1103 else if ((ret =
1104 __ham_get_page(hashp->dbp, next_pgno, &temp_pagep)) != 0)
1105 goto err;
1107 if (temp_pagep != NULL && DB_LOGGING(hashp->dbp)) {
1108 page_dbt.size = hashp->hdr->pagesize;
1109 page_dbt.data = temp_pagep;
1110 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1111 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
1112 hashp->dbp->log_fileid, SPLITOLD, PGNO(temp_pagep),
1113 &page_dbt, &LSN(temp_pagep))) != 0)
1114 goto err;
1115 LSN(temp_pagep) = new_lsn;
1118 if (big_buf != NULL)
1119 __db_free(big_buf);
1122 * If the original bucket spanned multiple pages, then we've got
1123 * a pointer to a page that used to be on the bucket chain. It
1124 * should be deleted.
1126 if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno &&
1127 (ret = __ham_del_page(hashp->dbp, temp_pagep)) != 0)
1128 goto err;
1131 * Write new buckets out.
1133 if (DB_LOGGING(hashp->dbp)) {
1134 page_dbt.size = hashp->hdr->pagesize;
1135 page_dbt.data = old_pagep;
1136 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1137 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
1138 hashp->dbp->log_fileid, SPLITNEW, PGNO(old_pagep),
1139 &page_dbt, &LSN(old_pagep))) != 0)
1140 goto err;
1141 LSN(old_pagep) = new_lsn;
1143 page_dbt.data = new_pagep;
1144 if ((ret = __ham_splitdata_log(dbenv->lg_info,
1145 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0,
1146 hashp->dbp->log_fileid, SPLITNEW, PGNO(new_pagep),
1147 &page_dbt, &LSN(new_pagep))) != 0)
1148 goto err;
1149 LSN(new_pagep) = new_lsn;
1151 ret = __ham_put_page(hashp->dbp, old_pagep, 1);
1152 if ((tret = __ham_put_page(hashp->dbp, new_pagep, 1)) != 0 &&
1153 ret == 0)
1154 ret = tret;
1156 if (0) {
1157 err: if (old_pagep != NULL)
1158 (void)__ham_put_page(hashp->dbp, old_pagep, 1);
1159 if (new_pagep != NULL)
1160 (void)__ham_put_page(hashp->dbp, new_pagep, 1);
1161 if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno)
1162 (void)__ham_put_page(hashp->dbp, temp_pagep, 1);
1164 return (ret);
1168 * Add the given pair to the page. The page in question may already be
1169 * held (i.e. it was already gotten). If it is, then the page is passed
1170 * in via the pagep parameter. On return, pagep will contain the page
1171 * to which we just added something. This allows us to link overflow
1172 * pages and return the new page having correctly put the last page.
1174 * PUBLIC: int __ham_add_el
1175 * PUBLIC: __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *, int));
1178 __ham_add_el(hashp, hcp, key, val, type)
1179 HTAB *hashp;
1180 HASH_CURSOR *hcp;
1181 const DBT *key, *val;
1182 int type;
1184 const DBT *pkey, *pdata;
1185 DBT key_dbt, data_dbt;
1186 DB_LSN new_lsn;
1187 HOFFPAGE doff, koff;
1188 db_pgno_t next_pgno;
1189 u_int32_t data_size, key_size, pairsize, rectype;
1190 int do_expand, is_keybig, is_databig, ret;
1191 int key_type, data_type;
1193 do_expand = 0;
1195 if (hcp->pagep == NULL && (ret = __ham_get_page(hashp->dbp,
1196 hcp->seek_found_page != PGNO_INVALID ? hcp->seek_found_page :
1197 hcp->pgno, &hcp->pagep)) != 0)
1198 return (ret);
1200 key_size = HKEYDATA_PSIZE(key->size);
1201 data_size = HKEYDATA_PSIZE(val->size);
1202 is_keybig = ISBIG(hashp, key->size);
1203 is_databig = ISBIG(hashp, val->size);
1204 if (is_keybig)
1205 key_size = HOFFPAGE_PSIZE;
1206 if (is_databig)
1207 data_size = HOFFPAGE_PSIZE;
1209 pairsize = key_size + data_size;
1211 /* Advance to first page in chain with room for item. */
1212 while (H_NUMPAIRS(hcp->pagep) && NEXT_PGNO(hcp->pagep) !=
1213 PGNO_INVALID) {
1215 * This may not be the end of the chain, but the pair may fit
1216 * anyway. Check if it's a bigpair that fits or a regular
1217 * pair that fits.
1219 if (P_FREESPACE(hcp->pagep) >= pairsize)
1220 break;
1221 next_pgno = NEXT_PGNO(hcp->pagep);
1222 if ((ret =
1223 __ham_next_cpage(hashp, hcp, next_pgno, 0, 0)) != 0)
1224 return (ret);
1228 * Check if we need to allocate a new page.
1230 if (P_FREESPACE(hcp->pagep) < pairsize) {
1231 do_expand = 1;
1232 if ((ret = __ham_add_ovflpage(hashp,
1233 hcp->pagep, 1, &hcp->pagep)) != 0)
1234 return (ret);
1235 hcp->pgno = PGNO(hcp->pagep);
1239 * Update cursor.
1241 hcp->bndx = H_NUMPAIRS(hcp->pagep);
1242 F_CLR(hcp, H_DELETED);
1243 if (is_keybig) {
1244 if ((ret = __db_poff(hashp->dbp,
1245 key, &koff.pgno, __ham_overflow_page)) != 0)
1246 return (ret);
1247 koff.type = H_OFFPAGE;
1248 koff.tlen = key->size;
1249 key_dbt.data = &koff;
1250 key_dbt.size = sizeof(koff);
1251 pkey = &key_dbt;
1252 key_type = H_OFFPAGE;
1253 } else {
1254 pkey = key;
1255 key_type = H_KEYDATA;
1258 if (is_databig) {
1259 if ((ret = __db_poff(hashp->dbp,
1260 val, &doff.pgno, __ham_overflow_page)) != 0)
1261 return (ret);
1262 doff.type = H_OFFPAGE;
1263 doff.tlen = val->size;
1264 data_dbt.data = &doff;
1265 data_dbt.size = sizeof(doff);
1266 pdata = &data_dbt;
1267 data_type = H_OFFPAGE;
1268 } else {
1269 pdata = val;
1270 data_type = type;
1273 if (DB_LOGGING(hashp->dbp)) {
1274 rectype = PUTPAIR;
1275 if (is_databig)
1276 rectype |= PAIR_DATAMASK;
1277 if (is_keybig)
1278 rectype |= PAIR_KEYMASK;
1280 if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info,
1281 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype,
1282 hashp->dbp->log_fileid, PGNO(hcp->pagep),
1283 (u_int32_t)H_NUMPAIRS(hcp->pagep),
1284 &LSN(hcp->pagep), pkey, pdata)) != 0)
1285 return (ret);
1287 /* Move lsn onto page. */
1288 LSN(hcp->pagep) = new_lsn; /* Structure assignment. */
1291 __ham_putitem(hcp->pagep, pkey, key_type);
1292 __ham_putitem(hcp->pagep, pdata, data_type);
1295 * For splits, we are going to update item_info's page number
1296 * field, so that we can easily return to the same page the
1297 * next time we come in here. For other operations, this shouldn't
1298 * matter, since odds are this is the last thing that happens before
1299 * we return to the user program.
1301 hcp->pgno = PGNO(hcp->pagep);
1304 * XXX Maybe keep incremental numbers here
1306 if (!F_ISSET(hashp->dbp, DB_AM_LOCKING))
1307 hashp->hdr->nelem++;
1309 if (do_expand || (hashp->hdr->ffactor != 0 &&
1310 (u_int32_t)H_NUMPAIRS(hcp->pagep) > hashp->hdr->ffactor))
1311 F_SET(hcp, H_EXPAND);
1312 return (0);
1317 * Special __putitem call used in splitting -- copies one entry to
1318 * another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
1319 * H_DUPLICATE, H_OFFDUP). Since we log splits at a high level, we
1320 * do not need to do any logging here.
1322 * PUBLIC: void __ham_copy_item __P((HTAB *, PAGE *, u_int32_t, PAGE *));
1324 void
1325 __ham_copy_item(hashp, src_page, src_ndx, dest_page)
1326 HTAB *hashp;
1327 PAGE *src_page;
1328 u_int32_t src_ndx;
1329 PAGE *dest_page;
1331 u_int32_t len;
1332 void *src, *dest;
1335 * Copy the key and data entries onto this new page.
1337 src = P_ENTRY(src_page, src_ndx);
1339 /* Set up space on dest. */
1340 len = LEN_HITEM(src_page, hashp->hdr->pagesize, src_ndx);
1341 HOFFSET(dest_page) -= len;
1342 dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page);
1343 dest = P_ENTRY(dest_page, NUM_ENT(dest_page));
1344 NUM_ENT(dest_page)++;
1346 memcpy(dest, src, len);
1351 * Returns:
1352 * pointer on success
1353 * NULL on error
1355 * PUBLIC: int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **));
1358 __ham_add_ovflpage(hashp, pagep, release, pp)
1359 HTAB *hashp;
1360 PAGE *pagep;
1361 int release;
1362 PAGE **pp;
1364 DB_ENV *dbenv;
1365 DB_LSN new_lsn;
1366 PAGE *new_pagep;
1367 int ret;
1369 dbenv = hashp->dbp->dbenv;
1371 if ((ret = __ham_overflow_page(hashp->dbp, P_HASH, &new_pagep)) != 0)
1372 return (ret);
1374 if (DB_LOGGING(hashp->dbp)) {
1375 if ((ret = __ham_newpage_log(dbenv->lg_info,
1376 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, PUTOVFL,
1377 hashp->dbp->log_fileid, PGNO(pagep), &LSN(pagep),
1378 PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0)
1379 return (ret);
1381 /* Move lsn onto page. */
1382 LSN(pagep) = LSN(new_pagep) = new_lsn;
1384 NEXT_PGNO(pagep) = PGNO(new_pagep);
1385 PREV_PGNO(new_pagep) = PGNO(pagep);
1387 if (release)
1388 ret = __ham_put_page(hashp->dbp, pagep, 1);
1390 hashp->hash_overflows++;
1391 *pp = new_pagep;
1392 return (ret);
1397 * PUBLIC: int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **));
1400 __ham_new_page(hashp, addr, type, pp)
1401 HTAB *hashp;
1402 u_int32_t addr, type;
1403 PAGE **pp;
1405 PAGE *pagep;
1406 int ret;
1408 if ((ret = memp_fget(hashp->dbp->mpf,
1409 &addr, DB_MPOOL_CREATE, &pagep)) != 0)
1410 return (ret);
1412 #ifdef DEBUG_SLOW
1413 __account_page(hashp, addr, 1);
1414 #endif
1415 /* This should not be necessary because page-in should do it. */
1416 P_INIT(pagep,
1417 hashp->hdr->pagesize, addr, PGNO_INVALID, PGNO_INVALID, 0, type);
1419 *pp = pagep;
1420 return (0);
1424 * PUBLIC: int __ham_del_page __P((DB *, PAGE *));
1427 __ham_del_page(dbp, pagep)
1428 DB *dbp;
1429 PAGE *pagep;
1431 DB_LSN new_lsn;
1432 HTAB *hashp;
1433 int ret;
1435 hashp = (HTAB *)dbp->internal;
1436 ret = 0;
1437 DIRTY_META(hashp, ret);
1438 if (ret != 0) {
1439 if (ret != EAGAIN)
1440 __db_err(hashp->dbp->dbenv,
1441 "free_ovflpage: unable to lock meta data page %s\n",
1442 strerror(ret));
1444 * If we are going to return an error, then we should free
1445 * the page, so it doesn't stay pinned forever.
1447 (void)__ham_put_page(hashp->dbp, pagep, 0);
1448 return (ret);
1451 if (DB_LOGGING(hashp->dbp)) {
1452 if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
1453 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPGNO,
1454 hashp->dbp->log_fileid, PGNO(pagep), hashp->hdr->last_freed,
1455 (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID,
1456 &LSN(pagep), &hashp->hdr->lsn)) != 0)
1457 return (ret);
1459 hashp->hdr->lsn = new_lsn;
1460 LSN(pagep) = new_lsn;
1463 #ifdef DIAGNOSTIC
1465 db_pgno_t __pgno;
1466 DB_LSN __lsn;
1467 __pgno = pagep->pgno;
1468 __lsn = pagep->lsn;
1469 memset(pagep, 0xff, dbp->pgsize);
1470 pagep->pgno = __pgno;
1471 pagep->lsn = __lsn;
1473 #endif
1474 TYPE(pagep) = P_INVALID;
1475 NEXT_PGNO(pagep) = hashp->hdr->last_freed;
1476 hashp->hdr->last_freed = PGNO(pagep);
1478 return (__ham_put_page(hashp->dbp, pagep, 1));
1483 * PUBLIC: int __ham_put_page __P((DB *, PAGE *, int32_t));
1486 __ham_put_page(dbp, pagep, is_dirty)
1487 DB *dbp;
1488 PAGE *pagep;
1489 int32_t is_dirty;
1491 #ifdef DEBUG_SLOW
1492 __account_page((HTAB *)dbp->cookie,
1493 ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1);
1494 #endif
1495 return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0)));
1499 * __ham_dirty_page --
1500 * Mark a page dirty.
1502 * PUBLIC: int __ham_dirty_page __P((HTAB *, PAGE *));
1505 __ham_dirty_page(hashp, pagep)
1506 HTAB *hashp;
1507 PAGE *pagep;
1509 return (memp_fset(hashp->dbp->mpf, pagep, DB_MPOOL_DIRTY));
1513 * PUBLIC: int __ham_get_page __P((DB *, db_pgno_t, PAGE **));
1516 __ham_get_page(dbp, addr, pagep)
1517 DB *dbp;
1518 db_pgno_t addr;
1519 PAGE **pagep;
1521 int ret;
1523 ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep);
1524 #ifdef DEBUG_SLOW
1525 if (*pagep != NULL)
1526 __account_page((HTAB *)dbp->internal, addr, 1);
1527 #endif
1528 return (ret);
1532 * PUBLIC: int __ham_overflow_page __P((DB *, u_int32_t, PAGE **));
1535 __ham_overflow_page(dbp, type, pp)
1536 DB *dbp;
1537 u_int32_t type;
1538 PAGE **pp;
1540 DB_LSN *lsnp, new_lsn;
1541 HTAB *hashp;
1542 PAGE *p;
1543 db_pgno_t new_addr, next_free, newalloc_flag;
1544 u_int32_t offset, splitnum;
1545 int ret;
1547 hashp = (HTAB *)dbp->internal;
1549 ret = 0;
1550 DIRTY_META(hashp, ret);
1551 if (ret != 0)
1552 return (ret);
1555 * This routine is split up into two parts. First we have
1556 * to figure out the address of the new page that we are
1557 * allocating. Then we have to log the allocation. Only
1558 * after the log do we get to complete allocation of the
1559 * new page.
1561 new_addr = hashp->hdr->last_freed;
1562 if (new_addr != PGNO_INVALID) {
1563 if ((ret = __ham_get_page(hashp->dbp, new_addr, &p)) != 0)
1564 return (ret);
1565 next_free = NEXT_PGNO(p);
1566 lsnp = &LSN(p);
1567 newalloc_flag = 0;
1568 } else {
1569 splitnum = hashp->hdr->ovfl_point;
1570 hashp->hdr->spares[splitnum]++;
1571 offset = hashp->hdr->spares[splitnum] -
1572 (splitnum ? hashp->hdr->spares[splitnum - 1] : 0);
1573 new_addr = PGNO_OF(hashp, hashp->hdr->ovfl_point, offset);
1574 if (new_addr > MAX_PAGES(hashp)) {
1575 __db_err(hashp->dbp->dbenv, "hash: out of file pages");
1576 hashp->hdr->spares[splitnum]--;
1577 return (ENOMEM);
1579 next_free = PGNO_INVALID;
1580 p = NULL;
1581 lsnp = NULL;
1582 newalloc_flag = 1;
1585 if (DB_LOGGING(hashp->dbp)) {
1586 if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info,
1587 (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, ALLOCPGNO,
1588 hashp->dbp->log_fileid, new_addr, next_free,
1589 0, newalloc_flag, type, lsnp, &hashp->hdr->lsn)) != 0)
1590 return (ret);
1592 hashp->hdr->lsn = new_lsn;
1593 if (lsnp != NULL)
1594 *lsnp = new_lsn;
1597 if (p != NULL) {
1598 /* We just took something off the free list, initialize it. */
1599 hashp->hdr->last_freed = next_free;
1600 P_INIT(p, hashp->hdr->pagesize, PGNO(p), PGNO_INVALID,
1601 PGNO_INVALID, 0, (u_int8_t)type);
1602 } else {
1603 /* Get the new page. */
1604 if ((ret = __ham_new_page(hashp, new_addr, type, &p)) != 0)
1605 return (ret);
1607 if (DB_LOGGING(hashp->dbp))
1608 LSN(p) = new_lsn;
1610 *pp = p;
1611 return (0);
1614 #ifdef DEBUG
1616 * PUBLIC: #ifdef DEBUG
1617 * PUBLIC: db_pgno_t __bucket_to_page __P((HTAB *, db_pgno_t));
1618 * PUBLIC: #endif
1620 db_pgno_t
1621 __bucket_to_page(hashp, n)
1622 HTAB *hashp;
1623 db_pgno_t n;
1625 int ret_val;
1627 ret_val = n + 1;
1628 if (n != 0)
1629 ret_val += hashp->hdr->spares[__db_log2(n + 1) - 1];
1630 return (ret_val);
1632 #endif
1635 * Create a bunch of overflow pages at the current split point.
1636 * PUBLIC: void __ham_init_ovflpages __P((HTAB *));
1638 void
1639 __ham_init_ovflpages(hp)
1640 HTAB *hp;
1642 DB_LSN new_lsn;
1643 PAGE *p;
1644 db_pgno_t last_pgno, new_pgno;
1645 u_int32_t i, curpages, numpages;
1647 curpages = hp->hdr->spares[hp->hdr->ovfl_point] -
1648 hp->hdr->spares[hp->hdr->ovfl_point - 1];
1649 numpages = hp->hdr->ovfl_point + 1 - curpages;
1651 last_pgno = hp->hdr->last_freed;
1652 new_pgno = PGNO_OF(hp, hp->hdr->ovfl_point, curpages + 1);
1653 if (DB_LOGGING(hp->dbp)) {
1654 (void)__ham_ovfl_log(hp->dbp->dbenv->lg_info,
1655 (DB_TXN *)hp->dbp->txn, &new_lsn, 0,
1656 hp->dbp->log_fileid, new_pgno,
1657 numpages, last_pgno, hp->hdr->ovfl_point, &hp->hdr->lsn);
1658 hp->hdr->lsn = new_lsn;
1659 } else
1660 ZERO_LSN(new_lsn);
1662 hp->hdr->spares[hp->hdr->ovfl_point] += numpages;
1663 for (i = numpages; i > 0; i--) {
1664 if (__ham_new_page(hp,
1665 PGNO_OF(hp, hp->hdr->ovfl_point, curpages + i),
1666 P_INVALID, &p) != 0)
1667 break;
1668 LSN(p) = new_lsn;
1669 NEXT_PGNO(p) = last_pgno;
1670 last_pgno = PGNO(p);
1671 (void)__ham_put_page(hp->dbp, p, 1);
1673 hp->hdr->last_freed = last_pgno;
1677 * PUBLIC: int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t));
1680 __ham_get_cpage(hashp, hcp, mode)
1681 HTAB *hashp;
1682 HASH_CURSOR *hcp;
1683 db_lockmode_t mode;
1685 int ret;
1687 if (hcp->lock == 0 && F_ISSET(hashp->dbp, DB_AM_LOCKING) &&
1688 (ret = __ham_lock_bucket(hashp->dbp, hcp, mode)) != 0)
1689 return (ret);
1691 if (hcp->pagep == NULL) {
1692 if (hcp->pgno == PGNO_INVALID) {
1693 hcp->pgno = BUCKET_TO_PAGE(hashp, hcp->bucket);
1694 hcp->bndx = 0;
1697 if ((ret =
1698 __ham_get_page(hashp->dbp, hcp->pgno, &hcp->pagep)) != 0)
1699 return (ret);
1702 if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL)
1703 if ((ret =
1704 __ham_get_page(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
1705 return (ret);
1706 return (0);
1710 * Get a new page at the cursor, putting the last page if necessary.
1711 * If the flag is set to H_ISDUP, then we are talking about the
1712 * duplicate page, not the main page.
1714 * PUBLIC: int __ham_next_cpage
1715 * PUBLIC: __P((HTAB *, HASH_CURSOR *, db_pgno_t, int, u_int32_t));
1718 __ham_next_cpage(hashp, hcp, pgno, dirty, flags)
1719 HTAB *hashp;
1720 HASH_CURSOR *hcp;
1721 db_pgno_t pgno;
1722 int dirty;
1723 u_int32_t flags;
1725 PAGE *p;
1726 int ret;
1728 if (LF_ISSET(H_ISDUP) && hcp->dpagep != NULL &&
1729 (ret = __ham_put_page(hashp->dbp, hcp->dpagep, dirty)) != 0)
1730 return (ret);
1731 else if (!LF_ISSET(H_ISDUP) && hcp->pagep != NULL &&
1732 (ret = __ham_put_page(hashp->dbp, hcp->pagep, dirty)) != 0)
1733 return (ret);
1735 if ((ret = __ham_get_page(hashp->dbp, pgno, &p)) != 0)
1736 return (ret);
1738 if (LF_ISSET(H_ISDUP)) {
1739 hcp->dpagep = p;
1740 hcp->dpgno = pgno;
1741 hcp->dndx = 0;
1742 } else {
1743 hcp->pagep = p;
1744 hcp->pgno = pgno;
1745 hcp->bndx = 0;
1748 return (0);
1752 * __ham_lock_bucket --
1753 * Get the lock on a particular bucket.
1755 static int
1756 __ham_lock_bucket(dbp, hcp, mode)
1757 DB *dbp;
1758 HASH_CURSOR *hcp;
1759 db_lockmode_t mode;
1761 int ret;
1764 * What a way to trounce on the memory system. It might be
1765 * worth copying the lk_info into the hashp.
1767 ret = 0;
1768 dbp->lock.pgno = (db_pgno_t)(hcp->bucket);
1769 ret = lock_get(dbp->dbenv->lk_info,
1770 dbp->txn == NULL ? dbp->locker : dbp->txn->txnid, 0,
1771 &dbp->lock_dbt, mode, &hcp->lock);
1773 return (ret < 0 ? EAGAIN : ret);
1777 * __ham_dpair --
1778 * Delete a pair on a page, paying no attention to what the pair
1779 * represents. The caller is responsible for freeing up duplicates
1780 * or offpage entries that might be referenced by this pair.
1782 * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t));
1784 void
1785 __ham_dpair(dbp, p, pndx)
1786 DB *dbp;
1787 PAGE *p;
1788 u_int32_t pndx;
1790 db_indx_t delta, n;
1791 u_int8_t *dest, *src;
1794 * Compute "delta", the amount we have to shift all of the
1795 * offsets. To find the delta, we just need to calculate
1796 * the size of the pair of elements we are removing.
1798 delta = H_PAIRSIZE(p, dbp->pgsize, pndx);
1801 * The hard case: we want to remove something other than
1802 * the last item on the page. We need to shift data and
1803 * offsets down.
1805 if ((db_indx_t)pndx != H_NUMPAIRS(p) - 1) {
1807 * Move the data: src is the first occupied byte on
1808 * the page. (Length is delta.)
1810 src = (u_int8_t *)p + HOFFSET(p);
1813 * Destination is delta bytes beyond src. This might
1814 * be an overlapping copy, so we have to use memmove.
1816 dest = src + delta;
1817 memmove(dest, src, p->inp[H_DATAINDEX(pndx)] - HOFFSET(p));
1820 /* Adjust the offsets. */
1821 for (n = (db_indx_t)pndx; n < (db_indx_t)(H_NUMPAIRS(p) - 1); n++) {
1822 p->inp[H_KEYINDEX(n)] = p->inp[H_KEYINDEX(n+1)] + delta;
1823 p->inp[H_DATAINDEX(n)] = p->inp[H_DATAINDEX(n+1)] + delta;
1826 /* Adjust page metadata. */
1827 HOFFSET(p) = HOFFSET(p) + delta;
1828 NUM_ENT(p) = NUM_ENT(p) - 2;
1831 #ifdef DEBUG_SLOW
1832 static void
1833 __account_page(hashp, pgno, inout)
1834 HTAB *hashp;
1835 db_pgno_t pgno;
1836 int inout;
1838 static struct {
1839 db_pgno_t pgno;
1840 int times;
1841 } list[100];
1842 static int last;
1843 int i, j;
1845 if (inout == -1) /* XXX: Kluge */
1846 inout = 0;
1848 /* Find page in list. */
1849 for (i = 0; i < last; i++)
1850 if (list[i].pgno == pgno)
1851 break;
1852 /* Not found. */
1853 if (i == last) {
1854 list[last].times = inout;
1855 list[last].pgno = pgno;
1856 last++;
1858 list[i].times = inout;
1859 if (list[i].times == 0) {
1860 for (j = i; j < last; j++)
1861 list[j] = list[j + 1];
1862 last--;
1864 for (i = 0; i < last; i++, list[i].times++)
1865 if (list[i].times > 20 &&
1866 !__is_bitmap_pgno(hashp, list[i].pgno))
1867 (void)fprintf(stderr,
1868 "Warning: pg %lu has been out for %d times\n",
1869 (u_long)list[i].pgno, list[i].times);
1871 #endif /* DEBUG_SLOW */