2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
11 static const char sccsid
[] = "@(#)db_dup.c 10.10 (Sleepycat) 10/25/97";
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
32 #include "common_ext.h"
34 static int __db_addpage
__P((DB
*,
35 PAGE
**, db_indx_t
*, int (*)(DB
*, u_int32_t
, PAGE
**)));
36 static int __db_dsplit
__P((DB
*,
37 PAGE
**, db_indx_t
*, u_int32_t
, int (*)(DB
*, u_int32_t
, PAGE
**)));
41 * Put a duplicate item onto a duplicate page at the given index.
43 * PUBLIC: int __db_dput __P((DB *,
44 * PUBLIC: DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **)));
47 __db_dput(dbp
, dbt
, pp
, indxp
, newfunc
)
52 int (*newfunc
) __P((DB
*, u_int32_t
, PAGE
**));
55 DBT
*data_dbtp
, hdr_dbt
, *hdr_dbtp
;
57 db_indx_t size
, isize
;
62 * We need some access method independent threshold for when we put
63 * a duplicate item onto an overflow page.
65 if (dbt
->size
> 0.25 * dbp
->pgsize
) {
66 if ((ret
= __db_poff(dbp
, dbt
, &pgno
, newfunc
)) != 0)
68 B_TSET(bo
.type
, B_OVERFLOW
, 0);
72 hdr_dbt
.size
= isize
= BOVERFLOW_SIZE
;
74 size
= BOVERFLOW_PSIZE
;
77 size
= BKEYDATA_PSIZE(dbt
->size
);
78 isize
= BKEYDATA_SIZE(dbt
->size
);
84 if (size
> P_FREESPACE(pagep
)) {
85 if (*indxp
== NUM_ENT(*pp
) && NEXT_PGNO(*pp
) == PGNO_INVALID
)
86 ret
= __db_addpage(dbp
, pp
, indxp
, newfunc
);
88 ret
= __db_dsplit(dbp
, pp
, indxp
, isize
, newfunc
);
90 /* XXX: Pages not returned to free list. */
96 * Now, pagep references the page on which to insert and indx is the
97 * the location to insert.
99 if ((ret
= __db_pitem(dbp
,
100 pagep
, (u_int32_t
)*indxp
, isize
, hdr_dbtp
, data_dbtp
)) != 0)
103 (void)memp_fset(dbp
->mpf
, pagep
, DB_MPOOL_DIRTY
);
109 * Remove a duplicate at the given index on the given page.
111 * PUBLIC: int __db_drem __P((DB *,
112 * PUBLIC: PAGE **, u_int32_t, int (*)(DB *, PAGE *)));
115 __db_drem(dbp
, pp
, indx
, freefunc
)
119 int (*freefunc
) __P((DB
*, PAGE
*));
126 /* Check if we are freeing a big item. */
127 if (B_TYPE(GET_BKEYDATA(pagep
, indx
)->type
) == B_OVERFLOW
) {
128 if ((ret
= __db_doff(dbp
,
129 GET_BOVERFLOW(pagep
, indx
)->pgno
, freefunc
)) != 0)
131 ret
= __db_ditem(dbp
, pagep
, indx
, BOVERFLOW_SIZE
);
133 ret
= __db_ditem(dbp
, pagep
, indx
,
134 BKEYDATA_SIZE(GET_BKEYDATA(pagep
, indx
)->len
));
138 if (NUM_ENT(pagep
) == 0) {
140 * If the page is emptied, then the page is freed and the pp
141 * parameter is set to reference the next, locked page in the
142 * duplicate chain, if one exists. If there was no such page,
143 * then it is set to NULL.
146 * __db_relink will set the dirty bit for us.
148 if ((ret
= __db_relink(dbp
, pagep
, pp
, 0)) != 0)
150 if ((ret
= freefunc(dbp
, pagep
)) != 0)
153 (void)memp_fset(dbp
->mpf
, pagep
, DB_MPOOL_DIRTY
);
160 * Find the last page in a set of offpage duplicates.
162 * PUBLIC: int __db_dend __P((DB *, db_pgno_t, PAGE **));
165 __db_dend(dbp
, pgno
, pagep
)
174 * This implements DB_KEYLAST. The last page is returned in pp; pgno
175 * should be the page number of the first page of the duplicate chain.
178 if ((ret
= memp_fget(dbp
->mpf
, &pgno
, 0, &h
)) != 0) {
179 (void)__db_pgerr(dbp
, pgno
);
182 if ((pgno
= NEXT_PGNO(h
)) == PGNO_INVALID
)
184 (void)memp_fput(dbp
->mpf
, h
, 0);
193 * Split a page of duplicates, calculating the split point based
194 * on an element of size "size" being added at "*indxp".
195 * On entry hp contains a pointer to the page-pointer of the original
196 * page. On exit, it returns a pointer to the page containing "*indxp"
197 * and "indxp" has been modified to reflect the index on the new page
198 * where the element should be added. The function returns with
199 * the page on which the insert should happen, not yet put.
202 __db_dsplit(dbp
, hp
, indxp
, size
, newfunc
)
207 int (*newfunc
) __P((DB
*, u_int32_t
, PAGE
**));
212 db_indx_t indx
, nindex
, oindex
, sum
;
213 db_indx_t halfbytes
, i
, lastsum
;
214 int did_indx
, ret
, s
;
219 /* Create a temporary page to do compaction onto. */
220 if ((tp
= (PAGE
*)__db_malloc(dbp
->pgsize
)) == NULL
)
223 memset(tp
, 0xff, dbp
->pgsize
);
225 /* Create new page for the split. */
226 if ((ret
= newfunc(dbp
, P_DUPLICATE
, &np
)) != 0) {
227 FREE(tp
, dbp
->pgsize
);
231 P_INIT(np
, dbp
->pgsize
, PGNO(np
), PGNO(h
), NEXT_PGNO(h
), 0,
233 P_INIT(tp
, dbp
->pgsize
, PGNO(h
), PREV_PGNO(h
), PGNO(np
), 0,
236 /* Figure out the split point */
237 halfbytes
= (dbp
->pgsize
- HOFFSET(h
)) / 2;
239 for (sum
= 0, lastsum
= 0, i
= 0; i
< NUM_ENT(h
); i
++) {
242 if (lastsum
< halfbytes
&& sum
>= halfbytes
) {
243 /* We've crossed the halfway point. */
244 if ((db_indx_t
)(halfbytes
- lastsum
) <
245 (db_indx_t
)(sum
- halfbytes
)) {
257 if (B_TYPE(GET_BKEYDATA(h
, i
)->type
) == B_KEYDATA
)
258 sum
+= BKEYDATA_SIZE(GET_BKEYDATA(h
, i
)->len
);
260 sum
+= BOVERFLOW_SIZE
;
262 if (lastsum
< halfbytes
&& sum
>= halfbytes
) {
263 /* We've crossed the halfway point. */
264 if ((db_indx_t
)(halfbytes
- lastsum
) <
265 (db_indx_t
)(sum
- halfbytes
))
272 * Check if we have set the return values of the index pointer and
277 *indxp
= indx
- i
- 1;
280 if (DB_LOGGING(dbp
)) {
281 page_dbt
.size
= dbp
->pgsize
;
283 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
,
284 dbp
->txn
, &LSN(h
), 0, DB_SPLITOLD
, dbp
->log_fileid
,
285 PGNO(h
), &page_dbt
, &LSN(h
))) != 0) {
286 FREE(tp
, dbp
->pgsize
);
293 * If it's a btree, adjust the cursors.
295 * i is the index of the last element to stay on the page.
297 if (dbp
->type
== DB_BTREE
|| dbp
->type
== DB_RECNO
)
298 __bam_ca_split(dbp
, PGNO(h
), PGNO(h
), PGNO(np
), i
+ 1, 0);
300 for (nindex
= 0, oindex
= i
+ 1; oindex
< NUM_ENT(h
); oindex
++) {
301 bk
= GET_BKEYDATA(h
, oindex
);
302 if (B_TYPE(bk
->type
) == B_KEYDATA
)
303 s
= BKEYDATA_SIZE(bk
->len
);
307 np
->inp
[nindex
++] = HOFFSET(np
) -= s
;
308 memcpy((u_int8_t
*)np
+ HOFFSET(np
), bk
, s
);
313 * Now do data compaction by copying the remaining stuff onto the
314 * temporary page and then copying it back to the real page.
316 for (nindex
= 0, oindex
= 0; oindex
<= i
; oindex
++) {
317 bk
= GET_BKEYDATA(h
, oindex
);
318 if (B_TYPE(bk
->type
) == B_KEYDATA
)
319 s
= BKEYDATA_SIZE(bk
->len
);
323 tp
->inp
[nindex
++] = HOFFSET(tp
) -= s
;
324 memcpy((u_int8_t
*)tp
+ HOFFSET(tp
), bk
, s
);
329 * This page (the temporary) should be only half full, so we do two
330 * memcpy's, one for the top of the page and one for the bottom of
331 * the page. This way we avoid copying the middle which should be
334 memcpy(h
, tp
, LOFFSET(tp
));
335 memcpy((u_int8_t
*)h
+ HOFFSET(tp
),
336 (u_int8_t
*)tp
+ HOFFSET(tp
), dbp
->pgsize
- HOFFSET(tp
));
337 FREE(tp
, dbp
->pgsize
);
339 if (DB_LOGGING(dbp
)) {
340 page_dbt
.size
= dbp
->pgsize
;
342 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
,
343 dbp
->txn
, &LSN(h
), 0, DB_SPLITNEW
, dbp
->log_fileid
,
344 PGNO(h
), &page_dbt
, &LSN(h
))) != 0)
347 page_dbt
.size
= dbp
->pgsize
;
349 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
,
350 dbp
->txn
, &LSN(np
), 0, DB_SPLITNEW
, dbp
->log_fileid
,
351 PGNO(np
), &page_dbt
, &LSN(np
))) != 0)
356 * Figure out if the location we're interested in is on the new
357 * page, and if so, reset the callers' pointer. Push the other
358 * page back to the store.
361 ret
= memp_fput(dbp
->mpf
, np
, DB_MPOOL_DIRTY
);
363 ret
= memp_fput(dbp
->mpf
, h
, DB_MPOOL_DIRTY
);
370 * Remove an item from a page.
372 * PUBLIC: int __db_ditem __P((DB *, PAGE *, int, u_int32_t));
375 __db_ditem(dbp
, pagep
, indx
, nbytes
)
382 db_indx_t cnt
, offset
;
386 if (DB_LOGGING(dbp
)) {
387 ldbt
.data
= P_ENTRY(pagep
, indx
);
389 if ((ret
= __db_addrem_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
390 &LSN(pagep
), 0, DB_REM_DUP
, dbp
->log_fileid
, PGNO(pagep
),
391 (u_int32_t
)indx
, nbytes
, &ldbt
, NULL
, &LSN(pagep
))) != 0)
396 * If there's only a single item on the page, we don't have to
399 if (NUM_ENT(pagep
) == 1) {
401 HOFFSET(pagep
) = dbp
->pgsize
;
406 * Pack the remaining key/data items at the end of the page. Use
407 * memmove(3), the regions may overlap.
409 from
= (u_int8_t
*)pagep
+ HOFFSET(pagep
);
410 memmove(from
+ nbytes
, from
, pagep
->inp
[indx
] - HOFFSET(pagep
));
411 HOFFSET(pagep
) += nbytes
;
413 /* Adjust the indices' offsets. */
414 offset
= pagep
->inp
[indx
];
415 for (cnt
= 0; cnt
< NUM_ENT(pagep
); ++cnt
)
416 if (pagep
->inp
[cnt
] < offset
)
417 pagep
->inp
[cnt
] += nbytes
;
419 /* Shift the indices down. */
421 if (indx
!= NUM_ENT(pagep
))
422 memmove(&pagep
->inp
[indx
], &pagep
->inp
[indx
+ 1],
423 sizeof(db_indx_t
) * (NUM_ENT(pagep
) - indx
));
425 /* If it's a btree, adjust the cursors. */
426 if (dbp
->type
== DB_BTREE
|| dbp
->type
== DB_RECNO
)
427 __bam_ca_di(dbp
, PGNO(pagep
), indx
, -1);
434 * Put an item on a page.
436 * PUBLIC: int __db_pitem
437 * PUBLIC: __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
440 __db_pitem(dbp
, pagep
, indx
, nbytes
, hdr
, data
)
453 * Put a single item onto a page. The logic figuring out where to
454 * insert and whether it fits is handled in the caller. All we do
455 * here is manage the page shuffling. We cheat a little bit in that
456 * we don't want to copy the dbt on a normal put twice. If hdr is
457 * NULL, we create a BKEYDATA structure on the page, otherwise, just
458 * copy the caller's information onto the page.
460 * This routine is also used to put entries onto the page where the
461 * entry is pre-built, e.g., during recovery. In this case, the hdr
462 * will point to the entry, and the data argument will be NULL.
465 * There's a tremendous potential for off-by-one errors here, since
466 * the passed in header sizes must be adjusted for the structure's
467 * placeholder for the trailing variable-length data field.
470 if ((ret
= __db_addrem_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
471 &LSN(pagep
), 0, DB_ADD_DUP
, dbp
->log_fileid
, PGNO(pagep
),
472 (u_int32_t
)indx
, nbytes
, hdr
, data
, &LSN(pagep
))) != 0)
476 B_TSET(bk
.type
, B_KEYDATA
, 0);
477 bk
.len
= data
== NULL
? 0 : data
->size
;
480 thdr
.size
= SSZA(BKEYDATA
, data
);
484 /* Adjust the index table, then put the item on the page. */
485 if (indx
!= NUM_ENT(pagep
))
486 memmove(&pagep
->inp
[indx
+ 1], &pagep
->inp
[indx
],
487 sizeof(db_indx_t
) * (NUM_ENT(pagep
) - indx
));
488 HOFFSET(pagep
) -= nbytes
;
489 pagep
->inp
[indx
] = HOFFSET(pagep
);
492 p
= P_ENTRY(pagep
, indx
);
493 memcpy(p
, hdr
->data
, hdr
->size
);
495 memcpy(p
+ hdr
->size
, data
->data
, data
->size
);
497 /* If it's a btree, adjust the cursors. */
498 if (dbp
->type
== DB_BTREE
|| dbp
->type
== DB_RECNO
)
499 __bam_ca_di(dbp
, PGNO(pagep
), indx
, 1);
506 * Relink around a deleted page.
508 * PUBLIC: int __db_relink __P((DB *, PAGE *, PAGE **, int));
511 __db_relink(dbp
, pagep
, new_next
, needlock
)
513 PAGE
*pagep
, **new_next
;
518 DB_LSN
*nlsnp
, *plsnp
;
523 npl
= ppl
= LOCK_INVALID
;
524 nlsnp
= plsnp
= NULL
;
526 /* Retrieve and lock the two pages. */
527 if (pagep
->next_pgno
!= PGNO_INVALID
) {
528 if (needlock
&& (ret
= __bam_lget(dbp
,
529 0, pagep
->next_pgno
, DB_LOCK_WRITE
, &npl
)) != 0)
531 if ((ret
= memp_fget(dbp
->mpf
,
532 &pagep
->next_pgno
, 0, &np
)) != 0) {
533 (void)__db_pgerr(dbp
, pagep
->next_pgno
);
538 if (pagep
->prev_pgno
!= PGNO_INVALID
) {
539 if (needlock
&& (ret
= __bam_lget(dbp
,
540 0, pagep
->prev_pgno
, DB_LOCK_WRITE
, &ppl
)) != 0)
542 if ((ret
= memp_fget(dbp
->mpf
,
543 &pagep
->prev_pgno
, 0, &pp
)) != 0) {
544 (void)__db_pgerr(dbp
, pagep
->next_pgno
);
550 /* Log the change. */
551 if (DB_LOGGING(dbp
)) {
552 if ((ret
= __db_relink_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
553 &pagep
->lsn
, 0, dbp
->log_fileid
, pagep
->pgno
, &pagep
->lsn
,
554 pagep
->prev_pgno
, plsnp
, pagep
->next_pgno
, nlsnp
)) != 0)
557 np
->lsn
= pagep
->lsn
;
559 pp
->lsn
= pagep
->lsn
;
563 * Modify and release the two pages.
566 * The parameter new_next gets set to the page following the page we
567 * are removing. If there is no following page, then new_next gets
571 np
->prev_pgno
= pagep
->prev_pgno
;
572 if (new_next
== NULL
)
573 ret
= memp_fput(dbp
->mpf
, np
, DB_MPOOL_DIRTY
);
576 ret
= memp_fset(dbp
->mpf
, np
, DB_MPOOL_DIRTY
);
581 (void)__bam_lput(dbp
, npl
);
582 } else if (new_next
!= NULL
)
586 pp
->next_pgno
= pagep
->next_pgno
;
587 if ((ret
= memp_fput(dbp
->mpf
, pp
, DB_MPOOL_DIRTY
)) != 0)
590 (void)__bam_lput(dbp
, ppl
);
595 (void)memp_fput(dbp
->mpf
, np
, 0);
596 if (needlock
&& npl
!= LOCK_INVALID
)
597 (void)__bam_lput(dbp
, npl
);
599 (void)memp_fput(dbp
->mpf
, pp
, 0);
600 if (needlock
&& ppl
!= LOCK_INVALID
)
601 (void)__bam_lput(dbp
, ppl
);
607 * Delete an offpage chain of duplicates.
609 * PUBLIC: int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *)));
612 __db_ddup(dbp
, pgno
, freefunc
)
615 int (*freefunc
) __P((DB
*, PAGE
*));
622 if ((ret
= memp_fget(dbp
->mpf
, &pgno
, 0, &pagep
)) != 0) {
623 (void)__db_pgerr(dbp
, pgno
);
627 if (DB_LOGGING(dbp
)) {
628 tmp_dbt
.data
= pagep
;
629 tmp_dbt
.size
= dbp
->pgsize
;
630 if ((ret
= __db_split_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
631 &LSN(pagep
), 0, DB_SPLITOLD
, dbp
->log_fileid
,
632 PGNO(pagep
), &tmp_dbt
, &LSN(pagep
))) != 0)
635 pgno
= pagep
->next_pgno
;
636 if ((ret
= freefunc(dbp
, pagep
)) != 0)
638 } while (pgno
!= PGNO_INVALID
);
645 * Create a new page and link it onto the next_pgno field of the
649 __db_addpage(dbp
, hp
, indxp
, newfunc
)
653 int (*newfunc
) __P((DB
*, u_int32_t
, PAGE
**));
658 if ((ret
= newfunc(dbp
, P_DUPLICATE
, &newpage
)) != 0)
661 if (DB_LOGGING(dbp
)) {
662 if ((ret
= __db_addpage_log(dbp
->dbenv
->lg_info
,
663 dbp
->txn
, &LSN(*hp
), 0, dbp
->log_fileid
,
664 PGNO(*hp
), &LSN(*hp
), PGNO(newpage
), &LSN(newpage
))) != 0) {
667 LSN(newpage
) = LSN(*hp
);
670 PREV_PGNO(newpage
) = PGNO(*hp
);
671 NEXT_PGNO(*hp
) = PGNO(newpage
);
673 if ((ret
= memp_fput(dbp
->mpf
, *hp
, DB_MPOOL_DIRTY
)) != 0)