2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
8 * Copyright (c) 1990, 1993, 1994, 1995, 1996
9 * Keith Bostic. All rights reserved.
12 * Copyright (c) 1990, 1993, 1994, 1995
13 * The Regents of the University of California. All rights reserved.
15 * This code is derived from software contributed to Berkeley by
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 * 3. All advertising materials mentioning features or use of this software
27 * must display the following acknowledgement:
28 * This product includes software developed by the University of
29 * California, Berkeley and its contributors.
30 * 4. Neither the name of the University nor the names of its contributors
31 * may be used to endorse or promote products derived from this software
32 * without specific prior written permission.
34 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 static const char sccsid
[] = "@(#)bt_put.c 10.45 (Sleepycat) 5/25/98";
53 #ifndef NO_SYSTEM_INCLUDES
54 #include <sys/types.h>
64 static int __bam_fixed
__P((BTREE
*, DBT
*));
65 static int __bam_isdeleted
__P((DB
*, PAGE
*, u_int32_t
, int *));
66 static int __bam_lookup
__P((DB
*, DBT
*, int *));
67 static int __bam_ndup
__P((DB
*, PAGE
*, u_int32_t
));
68 static int __bam_ovput
__P((DB
*, PAGE
*, u_int32_t
, DBT
*));
69 static int __bam_partial
__P((DB
*, DBT
*, PAGE
*, u_int32_t
, u_int32_t
));
70 static u_int32_t __bam_partsize
__P((DBT
*, PAGE
*, u_int32_t
));
74 * Add a new key/data pair or replace an existing pair (btree).
76 * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
79 __bam_put(argdbp
, txn
, key
, data
, flags
)
90 u_int32_t iitem_flags
, insert_flags
;
91 int exact
, isdeleted
, newkey
, ret
, stack
;
93 DEBUG_LWRITE(argdbp
, txn
, "bam_put", key
, data
, flags
);
96 if ((ret
= __db_putchk(argdbp
, key
, data
, flags
,
97 F_ISSET(argdbp
, DB_AM_RDONLY
), F_ISSET(argdbp
, DB_AM_DUP
))) != 0)
100 GETHANDLE(argdbp
, txn
, &dbp
, ret
);
104 * Find the location at which to insert. The call to __bam_lookup
105 * leaves the returned page pinned.
107 if ((ret
= __bam_lookup(dbp
, key
, &exact
)) != 0) {
112 indx
= t
->bt_csp
->indx
;
116 * If DB_NOOVERWRITE is set and there's an identical key in the tree,
117 * return an error unless the data item has already been marked for
118 * deletion, or, all the remaining data items have already been marked
119 * for deletion in the case of duplicates. If all the data items have
120 * been marked for deletion, we do a replace, otherwise, it has to be
121 * a set of duplicates, and we simply append a new one to the set.
125 if ((ret
= __bam_isdeleted(dbp
, h
, indx
, &isdeleted
)) != 0)
128 __bam_ca_replace(dbp
, h
->pgno
, indx
, REPLACE_SETUP
);
130 if (flags
== DB_NOOVERWRITE
) {
137 * If we're inserting into the first or last page of the tree,
138 * remember where we did it so we can do fast lookup next time.
141 * Does reverse order still work (did it ever!?!?)
144 h
->next_pgno
== PGNO_INVALID
|| h
->prev_pgno
== PGNO_INVALID
?
145 h
->pgno
: PGNO_INVALID
;
148 * Select the arguments for __bam_iitem() and do the insert. If the
149 * key is an exact match, we're either adding a new duplicate at the
150 * end of the duplicate set, or we're replacing the data item with a
151 * new data item. If the key isn't an exact match, we're inserting
152 * a new key/data pair, before the search location.
154 newkey
= dbp
->type
== DB_BTREE
&& !exact
;
156 if (!isdeleted
&& F_ISSET(dbp
, DB_AM_DUP
)) {
158 * Make sure that we're not looking at a page of
159 * duplicates -- if so, move to the last entry on
165 c
.dpgno
= PGNO_INVALID
;
168 __bam_ovfl_chk(dbp
, &c
, indx
+ O_INDX
, 1)) != 0)
170 if (c
.dpgno
!= PGNO_INVALID
) {
173 * The __bam_ovfl_chk() routine memp_fput() the
174 * current page and acquired a new one, but did
175 * not do anything about the lock we're holding.
177 t
->bt_csp
->page
= h
= c
.page
;
180 insert_flags
= DB_AFTER
;
182 insert_flags
= DB_CURRENT
;
184 insert_flags
= DB_BEFORE
;
187 * The pages we're using may be modified by __bam_iitem(), so make
188 * sure we reset the stack.
192 iitem_flags
|= BI_NEWKEY
;
194 iitem_flags
|= BI_DOINCR
;
195 ret
= __bam_iitem(dbp
, &h
, &indx
, key
, data
, insert_flags
, iitem_flags
);
197 t
->bt_csp
->indx
= indx
;
201 /* Done. Clean up the cursor. */
203 __bam_ca_replace(dbp
, h
->pgno
, indx
, REPLACE_SUCCESS
);
207 * We have to split the page. Back out the cursor setup,
208 * discard the stack of pages, and do the split.
211 __bam_ca_replace(dbp
, h
->pgno
, indx
, REPLACE_FAILED
);
213 (void)__bam_stkrel(dbp
);
216 if ((ret
= __bam_split(dbp
, key
)) != 0)
223 __bam_ca_replace(dbp
, h
->pgno
, indx
, REPLACE_FAILED
);
228 (void)__bam_stkrel(dbp
);
236 * Return if the only remaining data item for the element has been
240 __bam_isdeleted(dbp
, h
, indx
, isdeletedp
)
252 bk
= GET_BKEYDATA(h
, indx
+ O_INDX
);
253 switch (B_TYPE(bk
->type
)) {
256 if (!B_DISSET(bk
->type
)) {
263 * If the data item referencing the off-page duplicates
264 * is flagged as deleted, we're done. Else, we have to
265 * walk the chain of duplicate pages.
267 if (B_DISSET(bk
->type
))
271 return (__db_pgfmt(dbp
, h
->pgno
));
275 * If there are no more on-page duplicate items, then every
276 * data item for this key must have been deleted.
278 if (indx
+ P_INDX
>= (u_int32_t
)NUM_ENT(h
))
280 if (h
->inp
[indx
] != h
->inp
[indx
+ P_INDX
])
283 /* Check the next item. */
288 dupchk
: /* Check a chain of duplicate pages. */
289 pgno
= ((BOVERFLOW
*)bk
)->pgno
;
291 /* Acquire the next page in the duplicate chain. */
292 if ((ret
= memp_fget(dbp
->mpf
, &pgno
, 0, &h
)) != 0)
295 /* Check each item for a delete flag. */
296 for (indx
= 0; indx
< NUM_ENT(h
); ++indx
)
297 if (!B_DISSET(GET_BKEYDATA(h
, indx
)->type
)) {
302 * If we reach the end of the duplicate pages, then every
303 * item we reviewed must have been deleted.
305 if ((pgno
= NEXT_PGNO(h
)) == PGNO_INVALID
)
308 (void)memp_fput(dbp
->mpf
, h
, 0);
312 done
: (void)memp_fput(dbp
->mpf
, h
, 0);
318 * Find the right location in the tree for the key.
321 __bam_lookup(dbp
, key
, exactp
)
337 * Record numbers can't be fast-tracked, we have to lock the entire
340 if (F_ISSET(dbp
, DB_BT_RECNUM
))
343 /* Check to see if we've been seeing sorted input. */
344 if (t
->bt_lpgno
== PGNO_INVALID
)
348 * Retrieve the page on which we did the last insert. It's okay if
349 * it doesn't exist, or if it's not the page type we expect, it just
350 * means that the world changed.
352 if (__bam_lget(dbp
, 0, t
->bt_lpgno
, DB_LOCK_WRITE
, &lock
))
354 if (__bam_pget(dbp
, &h
, &t
->bt_lpgno
, 0)) {
355 (void)__BT_LPUT(dbp
, lock
);
358 if (TYPE(h
) != P_LBTREE
)
364 * We have to be at the end or beginning of the tree to know that
365 * we're inserting in a sort order. If that's the case and we're
366 * in the right order in comparison to the first/last key/data pair,
367 * we have the right position.
369 if (h
->next_pgno
== PGNO_INVALID
) {
371 e
.indx
= NUM_ENT(h
) - P_INDX
;
372 if ((cmp
= __bam_cmp(dbp
, key
, &e
)) >= 0) {
378 if (h
->prev_pgno
== PGNO_INVALID
) {
381 if ((cmp
= __bam_cmp(dbp
, key
, &e
)) <= 0) {
383 * We're doing a put, so we want to insert as the last
384 * of any set of duplicates.
388 indx
< (db_indx_t
)(NUM_ENT(h
) - P_INDX
) &&
389 h
->inp
[indx
] == h
->inp
[indx
+ P_INDX
];
399 /* Set the exact match flag in case we've already inserted this key. */
400 fast
: *exactp
= cmp
== 0;
402 /* Enter the entry in the stack. */
404 BT_STK_ENTER(t
, e
.page
, e
.indx
, lock
, ret
);
408 ++t
->lstat
.bt_cache_hit
;
411 miss
: ++t
->lstat
.bt_cache_miss
;
413 (void)memp_fput(dbp
->mpf
, h
, 0);
414 (void)__BT_LPUT(dbp
, lock
);
417 slow
: return (__bam_search(dbp
, key
, S_INSERT
, 1, NULL
, exactp
));
422 * Insert an item into the tree.
424 * PUBLIC: int __bam_iitem __P((DB *,
425 * PUBLIC: PAGE **, db_indx_t *, DBT *, DBT *, u_int32_t, u_int32_t));
428 __bam_iitem(dbp
, hp
, indxp
, key
, data
, op
, flags
)
439 db_indx_t indx
, nbytes
;
440 u_int32_t data_size
, have_bytes
, need_bytes
, needed
;
441 int bigkey
, bigdata
, dupadjust
, replace
, ret
;
448 dupadjust
= replace
= 0;
451 * If it's a page of duplicates, call the common code to do the work.
454 * Here's where the hp and indxp are important. The duplicate code
455 * may decide to rework/rearrange the pages and indices we're using,
456 * so the caller must understand that the page stack may change.
458 if (TYPE(h
) == P_DUPLICATE
) {
459 /* Adjust the index for the new item if it's a DB_AFTER op. */
463 /* Remove the current item if it's a DB_CURRENT op. */
464 if (op
== DB_CURRENT
) {
465 bk
= GET_BKEYDATA(*hp
, *indxp
);
466 switch (B_TYPE(bk
->type
)) {
468 nbytes
= BKEYDATA_SIZE(bk
->len
);
471 nbytes
= BOVERFLOW_SIZE
;
474 return (__db_pgfmt(dbp
, h
->pgno
));
476 if ((ret
= __db_ditem(dbp
, *hp
, *indxp
, nbytes
)) != 0)
480 /* Put the new/replacement item onto the page. */
481 if ((ret
= __db_dput(dbp
, data
, hp
, indxp
, __bam_new
)) != 0)
487 /* Handle fixed-length records: build the real record. */
488 if (F_ISSET(dbp
, DB_RE_FIXEDLEN
) && data
->size
!= t
->bt_recno
->re_len
) {
490 if ((ret
= __bam_fixed(t
, &tdbt
)) != 0)
496 * Figure out how much space the data will take, including if it's a
497 * partial record. If either of the key or data items won't fit on
498 * a page, we'll have to store them on overflow pages.
500 bigkey
= LF_ISSET(BI_NEWKEY
) && key
->size
> t
->bt_ovflsize
;
501 data_size
= F_ISSET(data
, DB_DBT_PARTIAL
) ?
502 __bam_partsize(data
, h
, indx
) : data
->size
;
503 bigdata
= data_size
> t
->bt_ovflsize
;
506 if (LF_ISSET(BI_NEWKEY
)) {
507 /* If BI_NEWKEY is set we're adding a new key and data pair. */
509 needed
+= BOVERFLOW_PSIZE
;
511 needed
+= BKEYDATA_PSIZE(key
->size
);
513 needed
+= BOVERFLOW_PSIZE
;
515 needed
+= BKEYDATA_PSIZE(data_size
);
518 * We're either overwriting the data item of a key/data pair
519 * or we're adding the data item only, i.e. a new duplicate.
521 if (op
== DB_CURRENT
) {
523 indx
+ (TYPE(h
) == P_LBTREE
? O_INDX
: 0));
524 if (B_TYPE(bk
->type
) == B_KEYDATA
)
525 have_bytes
= BKEYDATA_PSIZE(bk
->len
);
527 have_bytes
= BOVERFLOW_PSIZE
;
531 need_bytes
= sizeof(db_indx_t
);
534 need_bytes
+= BOVERFLOW_PSIZE
;
536 need_bytes
+= BKEYDATA_PSIZE(data_size
);
538 if (have_bytes
< need_bytes
)
539 needed
+= need_bytes
- have_bytes
;
543 * If there's not enough room, or the user has put a ceiling on the
544 * number of keys permitted in the page, split the page.
547 * The t->bt_maxkey test here may be insufficient -- do we have to
548 * check in the btree split code, so we don't undo it there!?!?
550 if (P_FREESPACE(h
) < needed
||
551 (t
->bt_maxkey
!= 0 && NUM_ENT(h
) > t
->bt_maxkey
))
552 return (DB_NEEDSPLIT
);
554 /* Handle partial puts: build the real record. */
555 if (F_ISSET(data
, DB_DBT_PARTIAL
)) {
557 if ((ret
= __bam_partial(dbp
, &tdbt
, h
, indx
, data_size
)) != 0)
563 * The code breaks it up into six cases:
565 * 1. Append a new key/data pair.
566 * 2. Insert a new key/data pair.
567 * 3. Append a new data item (a new duplicate).
568 * 4. Insert a new data item (a new duplicate).
569 * 5. Overflow item: delete and re-add the data item.
570 * 6. Replace the data item.
572 if (LF_ISSET(BI_NEWKEY
)) {
574 case DB_AFTER
: /* 1. Append a new key/data pair. */
578 case DB_BEFORE
: /* 2. Insert a new key/data pair. */
586 if ((ret
= __bam_ovput(dbp
, h
, indx
, key
)) != 0)
589 if ((ret
= __db_pitem(dbp
, h
, indx
,
590 BKEYDATA_SIZE(key
->size
), NULL
, key
)) != 0)
595 case DB_AFTER
: /* 3. Append a new data item. */
596 if (TYPE(h
) == P_LBTREE
) {
598 * Adjust the cursor and copy in the key for
601 if ((ret
= __bam_adjindx(dbp
,
602 h
, indx
+ P_INDX
, indx
, 1)) != 0)
611 __bam_ca_di(dbp
, h
->pgno
, indx
, 1);
616 case DB_BEFORE
: /* 4. Insert a new data item. */
617 if (TYPE(h
) == P_LBTREE
) {
619 * Adjust the cursor and copy in the key for
623 __bam_adjindx(dbp
, h
, indx
, indx
, 1)) != 0)
629 __bam_ca_di(dbp
, h
->pgno
, indx
, 1);
632 if (TYPE(h
) == P_LBTREE
)
636 * 5. Delete/re-add the data item.
638 * If we're dealing with offpage items, we have to
639 * delete and then re-add the item.
641 if (bigdata
|| B_TYPE(bk
->type
) != B_KEYDATA
) {
642 if ((ret
= __bam_ditem(dbp
, h
, indx
)) != 0)
647 /* 6. Replace the data item. */
657 if ((ret
= __bam_ovput(dbp
, h
, indx
, data
)) != 0)
663 if (LF_ISSET(BI_DELETED
)) {
664 B_TSET(__bk
.type
, B_KEYDATA
, 1);
665 __bk
.len
= data
->size
;
667 __hdr
.size
= SSZA(BKEYDATA
, data
);
668 ret
= __db_pitem(dbp
, h
, indx
,
669 BKEYDATA_SIZE(data
->size
), &__hdr
, data
);
671 ret
= __bam_ritem(dbp
, h
, indx
, data
);
673 ret
= __db_pitem(dbp
, h
, indx
,
674 BKEYDATA_SIZE(data
->size
), NULL
, data
);
679 if ((ret
= memp_fset(dbp
->mpf
, h
, DB_MPOOL_DIRTY
)) != 0)
683 * If the page is at least 50% full, and we added a duplicate, see if
684 * that set of duplicates takes up at least 25% of the space. If it
685 * does, move it off onto its own page.
687 if (dupadjust
&& P_FREESPACE(h
) <= dbp
->pgsize
/ 2) {
689 if ((ret
= __bam_ndup(dbp
, h
, indx
)) != 0)
694 * If we've changed the record count, update the tree. Record counts
695 * need to be updated in recno databases and in btree databases where
696 * we are supporting records. In both cases, adjust the count if the
697 * operation wasn't performed on the current record or when the caller
698 * overrides and wants the adjustment made regardless.
700 done
: if (LF_ISSET(BI_DOINCR
) ||
702 (F_ISSET(dbp
, DB_BT_RECNUM
) || dbp
->type
== DB_RECNO
)))
703 if ((ret
= __bam_adjust(dbp
, t
, 1)) != 0)
706 /* If we've modified a recno file, set the flag */
707 if (t
->bt_recno
!= NULL
)
708 F_SET(t
->bt_recno
, RECNO_MODIFIED
);
717 * Figure out how much space a partial data item is in total.
720 __bam_partsize(data
, h
, indx
)
729 * Figure out how much total space we'll need. If the record doesn't
730 * already exist, it's simply the data we're provided.
732 if (indx
>= NUM_ENT(h
))
733 return (data
->doff
+ data
->size
);
736 * Otherwise, it's the data provided plus any already existing data
737 * that we're not replacing.
739 bk
= GET_BKEYDATA(h
, indx
+ (TYPE(h
) == P_LBTREE
? O_INDX
: 0));
741 B_TYPE(bk
->type
) == B_OVERFLOW
? ((BOVERFLOW
*)bk
)->tlen
: bk
->len
;
744 * There are really two cases here:
746 * Case 1: We are replacing some bytes that do not exist (i.e., they
747 * are past the end of the record). In this case the number of bytes
748 * we are replacing is irrelevant and all we care about is how many
749 * bytes we are going to add from offset. So, the new record length
750 * is going to be the size of the new bytes (size) plus wherever those
751 * new bytes begin (doff).
753 * Case 2: All the bytes we are replacing exist. Therefore, the new
754 * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
755 * plus the bytes we are adding (size).
757 if (nbytes
< data
->doff
+ data
->dlen
) /* Case 1 */
758 return (data
->doff
+ data
->size
);
760 return (nbytes
+ data
->size
- data
->dlen
); /* Case 2 */
765 * Copy an overflow item onto a page.
768 #define OVPUT(h, indx, bo) do { \
770 memset(&__hdr, 0, sizeof(__hdr)); \
772 __hdr.size = BOVERFLOW_SIZE; \
773 if ((ret = __db_pitem(dbp, \
774 h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \
780 * Build an overflow item and put it on the page.
783 __bam_ovput(dbp
, h
, indx
, item
)
792 B_TSET(bo
.type
, B_OVERFLOW
, 0);
793 bo
.tlen
= item
->size
;
794 if ((ret
= __db_poff(dbp
, item
, &bo
.pgno
, __bam_new
)) != 0)
804 * Replace an item on a page.
806 * PUBLIC: int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *));
809 __bam_ritem(dbp
, h
, indx
, data
)
817 db_indx_t cnt
, lo
, ln
, min
, off
, prefix
, suffix
;
823 * Replace a single item onto a page. The logic figuring out where
824 * to insert and whether it fits is handled in the caller. All we do
825 * here is manage the page shuffling.
827 bk
= GET_BKEYDATA(h
, indx
);
829 /* Log the change. */
830 if (DB_LOGGING(dbp
)) {
832 * We might as well check to see if the two data items share
833 * a common prefix and suffix -- it can save us a lot of log
834 * message if they're large.
836 min
= data
->size
< bk
->len
? data
->size
: bk
->len
;
838 p
= bk
->data
, t
= data
->data
;
839 prefix
< min
&& *p
== *t
; ++prefix
, ++p
, ++t
)
844 p
= (u_int8_t
*)bk
->data
+ bk
->len
- 1,
845 t
= (u_int8_t
*)data
->data
+ data
->size
- 1;
846 suffix
< min
&& *p
== *t
; ++suffix
, --p
, --t
)
849 /* We only log the parts of the keys that have changed. */
850 orig
.data
= (u_int8_t
*)bk
->data
+ prefix
;
851 orig
.size
= bk
->len
- (prefix
+ suffix
);
852 repl
.data
= (u_int8_t
*)data
->data
+ prefix
;
853 repl
.size
= data
->size
- (prefix
+ suffix
);
854 if ((ret
= __bam_repl_log(dbp
->dbenv
->lg_info
, dbp
->txn
,
855 &LSN(h
), 0, dbp
->log_fileid
, PGNO(h
), &LSN(h
),
856 (u_int32_t
)indx
, (u_int32_t
)B_DISSET(bk
->type
),
857 &orig
, &repl
, (u_int32_t
)prefix
, (u_int32_t
)suffix
)) != 0)
862 * Set references to the first in-use byte on the page and the
863 * first byte of the item being replaced.
865 p
= (u_int8_t
*)h
+ HOFFSET(h
);
869 * If the entry is growing in size, shift the beginning of the data
870 * part of the page down. If the entry is shrinking in size, shift
871 * the beginning of the data part of the page up. Use memmove(3),
872 * the regions overlap.
874 lo
= BKEYDATA_SIZE(bk
->len
);
875 ln
= BKEYDATA_SIZE(data
->size
);
877 nbytes
= lo
- ln
; /* Signed difference. */
878 if (p
== t
) /* First index is fast. */
879 h
->inp
[indx
] += nbytes
;
880 else { /* Else, shift the page. */
881 memmove(p
+ nbytes
, p
, t
- p
);
883 /* Adjust the indices' offsets. */
885 for (cnt
= 0; cnt
< NUM_ENT(h
); ++cnt
)
886 if (h
->inp
[cnt
] <= off
)
887 h
->inp
[cnt
] += nbytes
;
890 /* Clean up the page and adjust the item's reference. */
891 HOFFSET(h
) += nbytes
;
895 /* Copy the new item onto the page. */
897 B_TSET(bk
->type
, B_KEYDATA
, 0);
898 bk
->len
= data
->size
;
899 memcpy(bk
->data
, data
->data
, data
->size
);
906 * Check to see if the duplicate set at indx should have its own page.
907 * If it should, create it.
910 __bam_ndup(dbp
, h
, indx
)
919 db_indx_t cnt
, cpindx
, first
, sz
;
922 while (indx
> 0 && h
->inp
[indx
] == h
->inp
[indx
- P_INDX
])
924 for (cnt
= 0, sz
= 0, first
= indx
;; ++cnt
, indx
+= P_INDX
) {
925 if (indx
>= NUM_ENT(h
) || h
->inp
[first
] != h
->inp
[indx
])
927 bk
= GET_BKEYDATA(h
, indx
);
928 sz
+= B_TYPE(bk
->type
) == B_KEYDATA
?
929 BKEYDATA_PSIZE(bk
->len
) : BOVERFLOW_PSIZE
;
930 bk
= GET_BKEYDATA(h
, indx
+ O_INDX
);
931 sz
+= B_TYPE(bk
->type
) == B_KEYDATA
?
932 BKEYDATA_PSIZE(bk
->len
) : BOVERFLOW_PSIZE
;
936 * If this set of duplicates is using more than 25% of the page, move
937 * them off. The choice of 25% is a WAG, but it has to be small enough
938 * that we can always split regardless of the presence of duplicates.
940 if (sz
< dbp
->pgsize
/ 4)
943 /* Get a new page. */
944 if ((ret
= __bam_new(dbp
, P_DUPLICATE
, &cp
)) != 0)
948 * Move this set of duplicates off the page. First points to the first
949 * key of the first duplicate key/data pair, cnt is the number of pairs
950 * we're dealing with.
952 memset(&hdr
, 0, sizeof(hdr
));
953 for (indx
= first
+ O_INDX
, cpindx
= 0;; ++cpindx
) {
954 /* Copy the entry to the new page. */
955 bk
= GET_BKEYDATA(h
, indx
);
957 hdr
.size
= B_TYPE(bk
->type
) == B_KEYDATA
?
958 BKEYDATA_SIZE(bk
->len
) : BOVERFLOW_SIZE
;
960 __db_pitem(dbp
, cp
, cpindx
, hdr
.size
, &hdr
, NULL
)) != 0)
964 * Move cursors referencing the old entry to the new entry.
965 * Done after the page put because __db_pitem() adjusts
966 * cursors on the new page, and before the delete because
967 * __db_ditem adjusts cursors on the old page.
970 PGNO(h
), first
, indx
- O_INDX
, PGNO(cp
), cpindx
);
972 /* Delete the data item. */
973 if ((ret
= __db_ditem(dbp
, h
, indx
, hdr
.size
)) != 0)
976 /* Delete all but the first reference to the key. */
979 if ((ret
= __bam_adjindx(dbp
, h
, indx
, first
, 0)) != 0)
983 /* Put in a new data item that points to the duplicates page. */
984 B_TSET(bo
.type
, B_DUPLICATE
, 0);
990 return (memp_fput(dbp
->mpf
, cp
, DB_MPOOL_DIRTY
));
992 err
: (void)__bam_free(dbp
, cp
);
998 * Build the real record for a fixed length put.
1010 * If database contains fixed-length records, and the record is long,
1013 if (dbt
->size
> rp
->re_len
)
1017 * The caller checked to see if it was just right, so we know it's
1018 * short. Pad it out. We use the record data return memory, it's
1019 * only a short-term use.
1021 if (t
->bt_rdata
.ulen
< rp
->re_len
) {
1022 t
->bt_rdata
.data
= t
->bt_rdata
.data
== NULL
?
1023 (void *)__db_malloc(rp
->re_len
) :
1024 (void *)__db_realloc(t
->bt_rdata
.data
, rp
->re_len
);
1025 if (t
->bt_rdata
.data
== NULL
) {
1026 t
->bt_rdata
.ulen
= 0;
1029 t
->bt_rdata
.ulen
= rp
->re_len
;
1031 memcpy(t
->bt_rdata
.data
, dbt
->data
, dbt
->size
);
1032 memset((u_int8_t
*)t
->bt_rdata
.data
+ dbt
->size
,
1033 rp
->re_pad
, rp
->re_len
- dbt
->size
);
1036 * Clean up our flags and other information just in case, and
1037 * change the caller's DBT to reference our created record.
1039 t
->bt_rdata
.size
= rp
->re_len
;
1040 t
->bt_rdata
.dlen
= 0;
1041 t
->bt_rdata
.doff
= 0;
1042 t
->bt_rdata
.flags
= 0;
1050 * Build the real record for a partial put.
1053 __bam_partial(dbp
, dbt
, h
, indx
, nbytes
)
1057 u_int32_t indx
, nbytes
;
1063 u_int32_t len
, tlen
;
1067 COMPQUIET(bo
, NULL
);
1071 /* We use the record data return memory, it's only a short-term use. */
1072 if (t
->bt_rdata
.ulen
< nbytes
) {
1073 t
->bt_rdata
.data
= t
->bt_rdata
.data
== NULL
?
1074 (void *)__db_malloc(nbytes
) :
1075 (void *)__db_realloc(t
->bt_rdata
.data
, nbytes
);
1076 if (t
->bt_rdata
.data
== NULL
) {
1077 t
->bt_rdata
.ulen
= 0;
1080 t
->bt_rdata
.ulen
= nbytes
;
1083 /* Find the current record. */
1084 if (indx
< NUM_ENT(h
)) {
1085 bk
= GET_BKEYDATA(h
, indx
+ (TYPE(h
) == P_LBTREE
? O_INDX
: 0));
1086 bo
= (BOVERFLOW
*)bk
;
1089 B_TSET(bk
->type
, B_KEYDATA
, 0);
1094 * We use nul bytes for any part of the record that isn't specified,
1097 memset(t
->bt_rdata
.data
, 0, nbytes
);
1099 if (B_TYPE(bk
->type
) == B_OVERFLOW
) {
1101 * In the case of an overflow record, we shift things around
1102 * in the current record rather than allocate a separate copy.
1104 memset(©
, 0, sizeof(copy
));
1105 if ((ret
= __db_goff(dbp
, ©
, bo
->tlen
,
1106 bo
->pgno
, &t
->bt_rdata
.data
, &t
->bt_rdata
.ulen
)) != 0)
1109 /* Skip any leading data from the original record. */
1111 p
= (u_int8_t
*)t
->bt_rdata
.data
+ dbt
->doff
;
1114 * Copy in any trailing data from the original record.
1116 * If the original record was larger than the original offset
1117 * plus the bytes being deleted, there is trailing data in the
1118 * original record we need to preserve. If we aren't deleting
1119 * the same number of bytes as we're inserting, copy it up or
1122 * Use memmove(), the regions may overlap.
1124 if (bo
->tlen
> dbt
->doff
+ dbt
->dlen
) {
1125 len
= bo
->tlen
- (dbt
->doff
+ dbt
->dlen
);
1126 if (dbt
->dlen
!= dbt
->size
)
1127 memmove(p
+ dbt
->size
, p
+ dbt
->dlen
, len
);
1131 /* Copy in the application provided data. */
1132 memcpy(p
, dbt
->data
, dbt
->size
);
1135 /* Copy in any leading data from the original record. */
1136 memcpy(t
->bt_rdata
.data
,
1137 bk
->data
, dbt
->doff
> bk
->len
? bk
->len
: dbt
->doff
);
1139 p
= (u_int8_t
*)t
->bt_rdata
.data
+ dbt
->doff
;
1141 /* Copy in the application provided data. */
1142 memcpy(p
, dbt
->data
, dbt
->size
);
1145 /* Copy in any trailing data from the original record. */
1146 len
= dbt
->doff
+ dbt
->dlen
;
1147 if (bk
->len
> len
) {
1148 memcpy(p
+ dbt
->size
, bk
->data
+ len
, bk
->len
- len
);
1149 tlen
+= bk
->len
- len
;
1153 /* Set the DBT to reference our new record. */
1154 t
->bt_rdata
.size
= tlen
;
1155 t
->bt_rdata
.dlen
= 0;
1156 t
->bt_rdata
.doff
= 0;
1157 t
->bt_rdata
.flags
= 0;