2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
11 static const char sccsid
[] = "@(#)db_dup.c 10.11 (Sleepycat) 1/8/98";
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
32 #include "common_ext.h"
34 static int __db_addpage
__P((DB
*,
35 PAGE
**, db_indx_t
*, int (*)(DB
*, u_int32_t
, PAGE
**)));
36 static int __db_dsplit
__P((DB
*,
37 PAGE
**, db_indx_t
*, u_int32_t
, int (*)(DB
*, u_int32_t
, PAGE
**)));
41 * Put a duplicate item onto a duplicate page at the given index.
43 * PUBLIC: int __db_dput __P((DB *,
44 * PUBLIC: DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **)));
47 __db_dput(dbp
, dbt
, pp
, indxp
, newfunc
)
52 int (*newfunc
) __P((DB
*, u_int32_t
, PAGE
**));
55 DBT
*data_dbtp
, hdr_dbt
, *hdr_dbtp
;
57 db_indx_t size
, isize
;
62 * We need some access method independent threshold for when we put
63 * a duplicate item onto an overflow page.
65 if (dbt
->size
> 0.25 * dbp
->pgsize
) {
66 if ((ret
= __db_poff(dbp
, dbt
, &pgno
, newfunc
)) != 0)
68 B_TSET(bo
.type
, B_OVERFLOW
, 0);
72 hdr_dbt
.size
= isize
= BOVERFLOW_SIZE
;
74 size
= BOVERFLOW_PSIZE
;
77 size
= BKEYDATA_PSIZE(dbt
->size
);
78 isize
= BKEYDATA_SIZE(dbt
->size
);
84 if (size
> P_FREESPACE(pagep
)) {
85 if (*indxp
== NUM_ENT(*pp
) && NEXT_PGNO(*pp
) == PGNO_INVALID
)
86 ret
= __db_addpage(dbp
, pp
, indxp
, newfunc
);
88 ret
= __db_dsplit(dbp
, pp
, indxp
, isize
, newfunc
);
90 /* XXX: Pages not returned to free list. */
96 * Now, pagep references the page on which to insert and indx is the
97 * the location to insert.
99 if ((ret
= __db_pitem(dbp
,
100 pagep
, (u_int32_t
)*indxp
, isize
, hdr_dbtp
, data_dbtp
)) != 0)
103 (void)memp_fset(dbp
->mpf
, pagep
, DB_MPOOL_DIRTY
);
109 * Remove a duplicate at the given index on the given page.
111 * PUBLIC: int __db_drem __P((DB *,
112 * PUBLIC: PAGE **, u_int32_t, int (*)(DB *, PAGE *)));
115 __db_drem(dbp
, pp
, indx
, freefunc
)
119 int (*freefunc
) __P((DB
*, PAGE
*));
126 /* Check if we are freeing a big item. */
127 if (B_TYPE(GET_BKEYDATA(pagep
, indx
)->type
) == B_OVERFLOW
) {
128 if ((ret
= __db_doff(dbp
,
129 GET_BOVERFLOW(pagep
, indx
)->pgno
, freefunc
)) != 0)
131 ret
= __db_ditem(dbp
, pagep
, indx
, BOVERFLOW_SIZE
);
133 ret
= __db_ditem(dbp
, pagep
, indx
,
134 BKEYDATA_SIZE(GET_BKEYDATA(pagep
, indx
)->len
));
138 if (NUM_ENT(pagep
) == 0) {
140 * If the page is emptied, then the page is freed and the pp
141 * parameter is set to reference the next, locked page in the
142 * duplicate chain, if one exists. If there was no such page,
143 * then it is set to NULL.
146 * __db_relink will set the dirty bit for us.
148 if ((ret
= __db_relink(dbp
, pagep
, pp
, 0)) != 0)
150 if ((ret
= freefunc(dbp
, pagep
)) != 0)
153 (void)memp_fset(dbp
->mpf
, pagep
, DB_MPOOL_DIRTY
);
160 * Find the last page in a set of offpage duplicates.
162 * PUBLIC: int __db_dend __P((DB *, db_pgno_t, PAGE **));
165 __db_dend(dbp
, pgno
, pagep
)
174 * This implements DB_KEYLAST. The last page is returned in pp; pgno
175 * should be the page number of the first page of the duplicate chain.
178 if ((ret
= memp_fget(dbp
->mpf
, &pgno
, 0, &h
)) != 0) {
179 (void)__db_pgerr(dbp
, pgno
);
182 if ((pgno
= NEXT_PGNO(h
)) == PGNO_INVALID
)
184 (void)memp_fput(dbp
->mpf
, h
, 0);
193 * Split a page of duplicates, calculating the split point based
194 * on an element of size "size" being added at "*indxp".
195 * On entry hp contains a pointer to the page-pointer of the original
196 * page. On exit, it returns a pointer to the page containing "*indxp"
197 * and "indxp" has been modified to reflect the index on the new page
198 * where the element should be added. The function returns with
199 * the page on which the insert should happen, not yet put.
202 __db_dsplit(dbp
, hp
, indxp
, size
, newfunc
)
207 int (*newfunc
) __P((DB
*, u_int32_t
, PAGE
**));
212 db_indx_t indx
, nindex
, oindex
, sum
;
213 db_indx_t halfbytes
, i
, lastsum
;
214 int did_indx
, ret
, s
;
219 /* Create a temporary page to do compaction onto. */
220 if ((tp
= (PAGE
*)__db_malloc(dbp
->pgsize
)) == NULL
)
223 memset(tp
, 0xff, dbp
->pgsize
);
225 /* Create new page for the split. */
226 if ((ret
= newfunc(dbp
, P_DUPLICATE
, &np
)) != 0) {
227 FREE(tp
, dbp
->pgsize
);
231 P_INIT(np
, dbp
->pgsize
, PGNO(np
), PGNO(h
), NEXT_PGNO(h
), 0,
233 P_INIT(tp
, dbp
->pgsize
, PGNO(h
), PREV_PGNO(h
), PGNO(np
), 0,
236 /* Figure out the split point */
237 halfbytes
= (dbp
->pgsize
- HOFFSET(h
)) / 2;
239 for (sum
= 0, lastsum
= 0, i
= 0; i
< NUM_ENT(h
); i
++) {
242 if (lastsum
< halfbytes
&& sum
>= halfbytes
) {
243 /* We've crossed the halfway point. */
244 if ((db_indx_t
)(halfbytes
- lastsum
) <
245 (db_indx_t
)(sum
- halfbytes
)) {
257 if (B_TYPE(GET_BKEYDATA(h
, i
)->type
) == B_KEYDATA
)
258 sum
+= BKEYDATA_SIZE(GET_BKEYDATA(h
, i
)->len
);
260 sum
+= BOVERFLOW_SIZE
;
262 if (lastsum
< halfbytes
&& sum
>= halfbytes
) {
263 /* We've crossed the halfway point. */
264 if ((db_indx_t
)(halfbytes
- lastsum
) <
265 (db_indx_t
)(sum
- halfbytes
))
272 * Check if we have set the return values of the index pointer and
277 *indxp
= indx
- i
- 1;
280 if (DB_LOGGING(dbp
)) {
281 page_dbt
.size
= dbp
->pgsize
;
283 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
,
284 dbp
->txn
, &LSN(h
), 0, DB_SPLITOLD
, dbp
->log_fileid
,
285 PGNO(h
), &page_dbt
, &LSN(h
))) != 0) {
286 FREE(tp
, dbp
->pgsize
);
293 * If it's a btree, adjust the cursors.
295 * i is the index of the last element to stay on the page.
297 if (dbp
->type
== DB_BTREE
|| dbp
->type
== DB_RECNO
)
298 __bam_ca_split(dbp
, PGNO(h
), PGNO(h
), PGNO(np
), i
+ 1, 0);
300 for (nindex
= 0, oindex
= i
+ 1; oindex
< NUM_ENT(h
); oindex
++) {
301 bk
= GET_BKEYDATA(h
, oindex
);
302 if (B_TYPE(bk
->type
) == B_KEYDATA
)
303 s
= BKEYDATA_SIZE(bk
->len
);
307 np
->inp
[nindex
++] = HOFFSET(np
) -= s
;
308 memcpy((u_int8_t
*)np
+ HOFFSET(np
), bk
, s
);
313 * Now do data compaction by copying the remaining stuff onto the
314 * temporary page and then copying it back to the real page.
316 for (nindex
= 0, oindex
= 0; oindex
<= i
; oindex
++) {
317 bk
= GET_BKEYDATA(h
, oindex
);
318 if (B_TYPE(bk
->type
) == B_KEYDATA
)
319 s
= BKEYDATA_SIZE(bk
->len
);
323 tp
->inp
[nindex
++] = HOFFSET(tp
) -= s
;
324 memcpy((u_int8_t
*)tp
+ HOFFSET(tp
), bk
, s
);
329 * This page (the temporary) should be only half full, so we do two
330 * memcpy's, one for the top of the page and one for the bottom of
331 * the page. This way we avoid copying the middle which should be
334 memcpy(h
, tp
, LOFFSET(tp
));
335 memcpy((u_int8_t
*)h
+ HOFFSET(tp
),
336 (u_int8_t
*)tp
+ HOFFSET(tp
), dbp
->pgsize
- HOFFSET(tp
));
337 FREE(tp
, dbp
->pgsize
);
339 if (DB_LOGGING(dbp
)) {
340 page_dbt
.size
= dbp
->pgsize
;
342 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
,
343 dbp
->txn
, &LSN(h
), 0, DB_SPLITNEW
, dbp
->log_fileid
,
344 PGNO(h
), &page_dbt
, &LSN(h
))) != 0)
347 page_dbt
.size
= dbp
->pgsize
;
349 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
,
350 dbp
->txn
, &LSN(np
), 0, DB_SPLITNEW
, dbp
->log_fileid
,
351 PGNO(np
), &page_dbt
, &LSN(np
))) != 0)
356 * Figure out if the location we're interested in is on the new
357 * page, and if so, reset the callers' pointer. Push the other
358 * page back to the store.
361 ret
= memp_fput(dbp
->mpf
, np
, DB_MPOOL_DIRTY
);
363 ret
= memp_fput(dbp
->mpf
, h
, DB_MPOOL_DIRTY
);
370 * Remove an item from a page.
372 * PUBLIC: int __db_ditem __P((DB *, PAGE *, u_int32_t, u_int32_t));
375 __db_ditem(dbp
, pagep
, indx
, nbytes
)
378 u_int32_t indx
, nbytes
;
381 db_indx_t cnt
, offset
;
385 if (DB_LOGGING(dbp
)) {
386 ldbt
.data
= P_ENTRY(pagep
, indx
);
388 if ((ret
= __db_addrem_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
389 &LSN(pagep
), 0, DB_REM_DUP
, dbp
->log_fileid
, PGNO(pagep
),
390 (u_int32_t
)indx
, nbytes
, &ldbt
, NULL
, &LSN(pagep
))) != 0)
395 * If there's only a single item on the page, we don't have to
398 if (NUM_ENT(pagep
) == 1) {
400 HOFFSET(pagep
) = dbp
->pgsize
;
405 * Pack the remaining key/data items at the end of the page. Use
406 * memmove(3), the regions may overlap.
408 from
= (u_int8_t
*)pagep
+ HOFFSET(pagep
);
409 memmove(from
+ nbytes
, from
, pagep
->inp
[indx
] - HOFFSET(pagep
));
410 HOFFSET(pagep
) += nbytes
;
412 /* Adjust the indices' offsets. */
413 offset
= pagep
->inp
[indx
];
414 for (cnt
= 0; cnt
< NUM_ENT(pagep
); ++cnt
)
415 if (pagep
->inp
[cnt
] < offset
)
416 pagep
->inp
[cnt
] += nbytes
;
418 /* Shift the indices down. */
420 if (indx
!= NUM_ENT(pagep
))
421 memmove(&pagep
->inp
[indx
], &pagep
->inp
[indx
+ 1],
422 sizeof(db_indx_t
) * (NUM_ENT(pagep
) - indx
));
424 /* If it's a btree, adjust the cursors. */
425 if (dbp
->type
== DB_BTREE
|| dbp
->type
== DB_RECNO
)
426 __bam_ca_di(dbp
, PGNO(pagep
), indx
, -1);
433 * Put an item on a page.
435 * PUBLIC: int __db_pitem
436 * PUBLIC: __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
439 __db_pitem(dbp
, pagep
, indx
, nbytes
, hdr
, data
)
452 * Put a single item onto a page. The logic figuring out where to
453 * insert and whether it fits is handled in the caller. All we do
454 * here is manage the page shuffling. We cheat a little bit in that
455 * we don't want to copy the dbt on a normal put twice. If hdr is
456 * NULL, we create a BKEYDATA structure on the page, otherwise, just
457 * copy the caller's information onto the page.
459 * This routine is also used to put entries onto the page where the
460 * entry is pre-built, e.g., during recovery. In this case, the hdr
461 * will point to the entry, and the data argument will be NULL.
464 * There's a tremendous potential for off-by-one errors here, since
465 * the passed in header sizes must be adjusted for the structure's
466 * placeholder for the trailing variable-length data field.
469 if ((ret
= __db_addrem_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
470 &LSN(pagep
), 0, DB_ADD_DUP
, dbp
->log_fileid
, PGNO(pagep
),
471 (u_int32_t
)indx
, nbytes
, hdr
, data
, &LSN(pagep
))) != 0)
475 B_TSET(bk
.type
, B_KEYDATA
, 0);
476 bk
.len
= data
== NULL
? 0 : data
->size
;
479 thdr
.size
= SSZA(BKEYDATA
, data
);
483 /* Adjust the index table, then put the item on the page. */
484 if (indx
!= NUM_ENT(pagep
))
485 memmove(&pagep
->inp
[indx
+ 1], &pagep
->inp
[indx
],
486 sizeof(db_indx_t
) * (NUM_ENT(pagep
) - indx
));
487 HOFFSET(pagep
) -= nbytes
;
488 pagep
->inp
[indx
] = HOFFSET(pagep
);
491 p
= P_ENTRY(pagep
, indx
);
492 memcpy(p
, hdr
->data
, hdr
->size
);
494 memcpy(p
+ hdr
->size
, data
->data
, data
->size
);
496 /* If it's a btree, adjust the cursors. */
497 if (dbp
->type
== DB_BTREE
|| dbp
->type
== DB_RECNO
)
498 __bam_ca_di(dbp
, PGNO(pagep
), indx
, 1);
505 * Relink around a deleted page.
507 * PUBLIC: int __db_relink __P((DB *, PAGE *, PAGE **, int));
510 __db_relink(dbp
, pagep
, new_next
, needlock
)
512 PAGE
*pagep
, **new_next
;
517 DB_LSN
*nlsnp
, *plsnp
;
522 npl
= ppl
= LOCK_INVALID
;
523 nlsnp
= plsnp
= NULL
;
525 /* Retrieve and lock the two pages. */
526 if (pagep
->next_pgno
!= PGNO_INVALID
) {
527 if (needlock
&& (ret
= __bam_lget(dbp
,
528 0, pagep
->next_pgno
, DB_LOCK_WRITE
, &npl
)) != 0)
530 if ((ret
= memp_fget(dbp
->mpf
,
531 &pagep
->next_pgno
, 0, &np
)) != 0) {
532 (void)__db_pgerr(dbp
, pagep
->next_pgno
);
537 if (pagep
->prev_pgno
!= PGNO_INVALID
) {
538 if (needlock
&& (ret
= __bam_lget(dbp
,
539 0, pagep
->prev_pgno
, DB_LOCK_WRITE
, &ppl
)) != 0)
541 if ((ret
= memp_fget(dbp
->mpf
,
542 &pagep
->prev_pgno
, 0, &pp
)) != 0) {
543 (void)__db_pgerr(dbp
, pagep
->next_pgno
);
549 /* Log the change. */
550 if (DB_LOGGING(dbp
)) {
551 if ((ret
= __db_relink_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
552 &pagep
->lsn
, 0, dbp
->log_fileid
, pagep
->pgno
, &pagep
->lsn
,
553 pagep
->prev_pgno
, plsnp
, pagep
->next_pgno
, nlsnp
)) != 0)
556 np
->lsn
= pagep
->lsn
;
558 pp
->lsn
= pagep
->lsn
;
562 * Modify and release the two pages.
565 * The parameter new_next gets set to the page following the page we
566 * are removing. If there is no following page, then new_next gets
570 np
->prev_pgno
= pagep
->prev_pgno
;
571 if (new_next
== NULL
)
572 ret
= memp_fput(dbp
->mpf
, np
, DB_MPOOL_DIRTY
);
575 ret
= memp_fset(dbp
->mpf
, np
, DB_MPOOL_DIRTY
);
580 (void)__bam_lput(dbp
, npl
);
581 } else if (new_next
!= NULL
)
585 pp
->next_pgno
= pagep
->next_pgno
;
586 if ((ret
= memp_fput(dbp
->mpf
, pp
, DB_MPOOL_DIRTY
)) != 0)
589 (void)__bam_lput(dbp
, ppl
);
594 (void)memp_fput(dbp
->mpf
, np
, 0);
595 if (needlock
&& npl
!= LOCK_INVALID
)
596 (void)__bam_lput(dbp
, npl
);
598 (void)memp_fput(dbp
->mpf
, pp
, 0);
599 if (needlock
&& ppl
!= LOCK_INVALID
)
600 (void)__bam_lput(dbp
, ppl
);
606 * Delete an offpage chain of duplicates.
608 * PUBLIC: int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
611 __db_ddup(dbp
, pgno
, freefunc
)
614 int (*freefunc
) __P((DB
*, PAGE
*));
621 if ((ret
= memp_fget(dbp
->mpf
, &pgno
, 0, &pagep
)) != 0) {
622 (void)__db_pgerr(dbp
, pgno
);
626 if (DB_LOGGING(dbp
)) {
627 tmp_dbt
.data
= pagep
;
628 tmp_dbt
.size
= dbp
->pgsize
;
629 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
630 &LSN(pagep
), 0, DB_SPLITOLD
, dbp
->log_fileid
,
631 PGNO(pagep
), &tmp_dbt
, &LSN(pagep
))) != 0)
634 pgno
= pagep
->next_pgno
;
635 if ((ret
= freefunc(dbp
, pagep
)) != 0)
637 } while (pgno
!= PGNO_INVALID
);
644 * Create a new page and link it onto the next_pgno field of the
648 __db_addpage(dbp
, hp
, indxp
, newfunc
)
652 int (*newfunc
) __P((DB
*, u_int32_t
, PAGE
**));
657 if ((ret
= newfunc(dbp
, P_DUPLICATE
, &newpage
)) != 0)
660 if (DB_LOGGING(dbp
)) {
661 if ((ret
= __db_addpage_log(dbp
->dbenv
->lg_info
,
662 dbp
->txn
, &LSN(*hp
), 0, dbp
->log_fileid
,
663 PGNO(*hp
), &LSN(*hp
), PGNO(newpage
), &LSN(newpage
))) != 0) {
666 LSN(newpage
) = LSN(*hp
);
669 PREV_PGNO(newpage
) = PGNO(*hp
);
670 NEXT_PGNO(*hp
) = PGNO(newpage
);
672 if ((ret
= memp_fput(dbp
->mpf
, *hp
, DB_MPOOL_DIRTY
)) != 0)