Update.
[glibc.git] / db2 / btree / bt_put.c
bloba93faac98c2822d4c5c6e54e9e9d4b0d6b046024
1 /*-
2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
6 */
7 /*
8 * Copyright (c) 1990, 1993, 1994, 1995, 1996
9 * Keith Bostic. All rights reserved.
12 * Copyright (c) 1990, 1993, 1994, 1995
13 * The Regents of the University of California. All rights reserved.
15 * This code is derived from software contributed to Berkeley by
16 * Mike Olson.
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 * 3. All advertising materials mentioning features or use of this software
27 * must display the following acknowledgement:
28 * This product includes software developed by the University of
29 * California, Berkeley and its contributors.
30 * 4. Neither the name of the University nor the names of its contributors
31 * may be used to endorse or promote products derived from this software
32 * without specific prior written permission.
34 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44 * SUCH DAMAGE.
47 #include "config.h"
49 #ifndef lint
50 static const char sccsid[] = "@(#)bt_put.c 10.45 (Sleepycat) 5/25/98";
51 #endif /* not lint */
53 #ifndef NO_SYSTEM_INCLUDES
54 #include <sys/types.h>
56 #include <errno.h>
57 #include <string.h>
58 #endif
60 #include "db_int.h"
61 #include "db_page.h"
62 #include "btree.h"
64 static int __bam_fixed __P((BTREE *, DBT *));
65 static int __bam_isdeleted __P((DB *, PAGE *, u_int32_t, int *));
66 static int __bam_lookup __P((DB *, DBT *, int *));
67 static int __bam_ndup __P((DB *, PAGE *, u_int32_t));
68 static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *));
69 static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t, u_int32_t));
70 static u_int32_t __bam_partsize __P((DBT *, PAGE *, u_int32_t));
73 * __bam_put --
74 * Add a new key/data pair or replace an existing pair (btree).
76 * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
78 int
79 __bam_put(argdbp, txn, key, data, flags)
80 DB *argdbp;
81 DB_TXN *txn;
82 DBT *key, *data;
83 u_int32_t flags;
85 BTREE *t;
86 CURSOR c;
87 DB *dbp;
88 PAGE *h;
89 db_indx_t indx;
90 u_int32_t iitem_flags, insert_flags;
91 int exact, isdeleted, newkey, ret, stack;
93 DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags);
95 /* Check flags. */
96 if ((ret = __db_putchk(argdbp, key, data, flags,
97 F_ISSET(argdbp, DB_AM_RDONLY), F_ISSET(argdbp, DB_AM_DUP))) != 0)
98 return (ret);
100 GETHANDLE(argdbp, txn, &dbp, ret);
101 t = dbp->internal;
103 retry: /*
104 * Find the location at which to insert. The call to __bam_lookup
105 * leaves the returned page pinned.
107 if ((ret = __bam_lookup(dbp, key, &exact)) != 0) {
108 PUTHANDLE(dbp);
109 return (ret);
111 h = t->bt_csp->page;
112 indx = t->bt_csp->indx;
113 stack = 1;
116 * If DB_NOOVERWRITE is set and there's an identical key in the tree,
117 * return an error unless the data item has already been marked for
118 * deletion, or, all the remaining data items have already been marked
119 * for deletion in the case of duplicates. If all the data items have
120 * been marked for deletion, we do a replace, otherwise, it has to be
121 * a set of duplicates, and we simply append a new one to the set.
123 isdeleted = 0;
124 if (exact) {
125 if ((ret = __bam_isdeleted(dbp, h, indx, &isdeleted)) != 0)
126 goto err;
127 if (isdeleted)
128 __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP);
129 else
130 if (flags == DB_NOOVERWRITE) {
131 ret = DB_KEYEXIST;
132 goto err;
137 * If we're inserting into the first or last page of the tree,
138 * remember where we did it so we can do fast lookup next time.
140 * XXX
141 * Does reverse order still work (did it ever!?!?)
143 t->bt_lpgno =
144 h->next_pgno == PGNO_INVALID || h->prev_pgno == PGNO_INVALID ?
145 h->pgno : PGNO_INVALID;
148 * Select the arguments for __bam_iitem() and do the insert. If the
149 * key is an exact match, we're either adding a new duplicate at the
150 * end of the duplicate set, or we're replacing the data item with a
151 * new data item. If the key isn't an exact match, we're inserting
152 * a new key/data pair, before the search location.
154 newkey = dbp->type == DB_BTREE && !exact;
155 if (exact) {
156 if (!isdeleted && F_ISSET(dbp, DB_AM_DUP)) {
158 * Make sure that we're not looking at a page of
159 * duplicates -- if so, move to the last entry on
160 * that page.
162 c.page = h;
163 c.pgno = h->pgno;
164 c.indx = indx;
165 c.dpgno = PGNO_INVALID;
166 c.dindx = 0;
167 if ((ret =
168 __bam_ovfl_chk(dbp, &c, indx + O_INDX, 1)) != 0)
169 goto err;
170 if (c.dpgno != PGNO_INVALID) {
172 * XXX
173 * The __bam_ovfl_chk() routine memp_fput() the
174 * current page and acquired a new one, but did
175 * not do anything about the lock we're holding.
177 t->bt_csp->page = h = c.page;
178 indx = c.dindx;
180 insert_flags = DB_AFTER;
181 } else
182 insert_flags = DB_CURRENT;
183 } else
184 insert_flags = DB_BEFORE;
187 * The pages we're using may be modified by __bam_iitem(), so make
188 * sure we reset the stack.
190 iitem_flags = 0;
191 if (newkey)
192 iitem_flags |= BI_NEWKEY;
193 if (isdeleted)
194 iitem_flags |= BI_DOINCR;
195 ret = __bam_iitem(dbp, &h, &indx, key, data, insert_flags, iitem_flags);
196 t->bt_csp->page = h;
197 t->bt_csp->indx = indx;
199 switch (ret) {
200 case 0:
201 /* Done. Clean up the cursor. */
202 if (isdeleted)
203 __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS);
204 break;
205 case DB_NEEDSPLIT:
207 * We have to split the page. Back out the cursor setup,
208 * discard the stack of pages, and do the split.
210 if (isdeleted)
211 __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
213 (void)__bam_stkrel(dbp);
214 stack = 0;
216 if ((ret = __bam_split(dbp, key)) != 0)
217 break;
219 goto retry;
220 /* NOTREACHED */
221 default:
222 if (isdeleted)
223 __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED);
224 break;
227 err: if (stack)
228 (void)__bam_stkrel(dbp);
230 PUTHANDLE(dbp);
231 return (ret);
235 * __bam_isdeleted --
236 * Return if the only remaining data item for the element has been
237 * deleted.
239 static int
240 __bam_isdeleted(dbp, h, indx, isdeletedp)
241 DB *dbp;
242 PAGE *h;
243 u_int32_t indx;
244 int *isdeletedp;
246 BKEYDATA *bk;
247 db_pgno_t pgno;
248 int ret;
250 *isdeletedp = 1;
251 for (;;) {
252 bk = GET_BKEYDATA(h, indx + O_INDX);
253 switch (B_TYPE(bk->type)) {
254 case B_KEYDATA:
255 case B_OVERFLOW:
256 if (!B_DISSET(bk->type)) {
257 *isdeletedp = 0;
258 return (0);
260 break;
261 case B_DUPLICATE:
263 * If the data item referencing the off-page duplicates
264 * is flagged as deleted, we're done. Else, we have to
265 * walk the chain of duplicate pages.
267 if (B_DISSET(bk->type))
268 return (0);
269 goto dupchk;
270 default:
271 return (__db_pgfmt(dbp, h->pgno));
275 * If there are no more on-page duplicate items, then every
276 * data item for this key must have been deleted.
278 if (indx + P_INDX >= (u_int32_t)NUM_ENT(h))
279 return (0);
280 if (h->inp[indx] != h->inp[indx + P_INDX])
281 return (0);
283 /* Check the next item. */
284 indx += P_INDX;
286 /* NOTREACHED */
288 dupchk: /* Check a chain of duplicate pages. */
289 pgno = ((BOVERFLOW *)bk)->pgno;
290 for (;;) {
291 /* Acquire the next page in the duplicate chain. */
292 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
293 return (ret);
295 /* Check each item for a delete flag. */
296 for (indx = 0; indx < NUM_ENT(h); ++indx)
297 if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) {
298 *isdeletedp = 0;
299 goto done;
302 * If we reach the end of the duplicate pages, then every
303 * item we reviewed must have been deleted.
305 if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID)
306 goto done;
308 (void)memp_fput(dbp->mpf, h, 0);
310 /* NOTREACHED */
312 done: (void)memp_fput(dbp->mpf, h, 0);
313 return (0);
317 * __bam_lookup --
318 * Find the right location in the tree for the key.
320 static int
321 __bam_lookup(dbp, key, exactp)
322 DB *dbp;
323 DBT *key;
324 int *exactp;
326 BTREE *t;
327 DB_LOCK lock;
328 EPG e;
329 PAGE *h;
330 db_indx_t indx;
331 int cmp, ret;
333 t = dbp->internal;
334 h = NULL;
337 * Record numbers can't be fast-tracked, we have to lock the entire
338 * tree.
340 if (F_ISSET(dbp, DB_BT_RECNUM))
341 goto slow;
343 /* Check to see if we've been seeing sorted input. */
344 if (t->bt_lpgno == PGNO_INVALID)
345 goto slow;
348 * Retrieve the page on which we did the last insert. It's okay if
349 * it doesn't exist, or if it's not the page type we expect, it just
350 * means that the world changed.
352 if (__bam_lget(dbp, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
353 goto miss;
354 if (__bam_pget(dbp, &h, &t->bt_lpgno, 0)) {
355 (void)__BT_LPUT(dbp, lock);
356 goto miss;
358 if (TYPE(h) != P_LBTREE)
359 goto miss;
360 if (NUM_ENT(h) == 0)
361 goto miss;
364 * We have to be at the end or beginning of the tree to know that
365 * we're inserting in a sort order. If that's the case and we're
366 * in the right order in comparison to the first/last key/data pair,
367 * we have the right position.
369 if (h->next_pgno == PGNO_INVALID) {
370 e.page = h;
371 e.indx = NUM_ENT(h) - P_INDX;
372 if ((cmp = __bam_cmp(dbp, key, &e)) >= 0) {
373 if (cmp > 0)
374 e.indx += P_INDX;
375 goto fast;
378 if (h->prev_pgno == PGNO_INVALID) {
379 e.page = h;
380 e.indx = 0;
381 if ((cmp = __bam_cmp(dbp, key, &e)) <= 0) {
383 * We're doing a put, so we want to insert as the last
384 * of any set of duplicates.
386 if (cmp == 0) {
387 for (indx = 0;
388 indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
389 h->inp[indx] == h->inp[indx + P_INDX];
390 indx += P_INDX)
392 e.indx = indx;
394 goto fast;
397 goto miss;
399 /* Set the exact match flag in case we've already inserted this key. */
400 fast: *exactp = cmp == 0;
402 /* Enter the entry in the stack. */
403 BT_STK_CLR(t);
404 BT_STK_ENTER(t, e.page, e.indx, lock, ret);
405 if (ret != 0)
406 return (ret);
408 ++t->lstat.bt_cache_hit;
409 return (0);
411 miss: ++t->lstat.bt_cache_miss;
412 if (h != NULL) {
413 (void)memp_fput(dbp->mpf, h, 0);
414 (void)__BT_LPUT(dbp, lock);
417 slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp));
421 * __bam_iitem --
422 * Insert an item into the tree.
424 * PUBLIC: int __bam_iitem __P((DB *,
425 * PUBLIC: PAGE **, db_indx_t *, DBT *, DBT *, u_int32_t, u_int32_t));
428 __bam_iitem(dbp, hp, indxp, key, data, op, flags)
429 DB *dbp;
430 PAGE **hp;
431 db_indx_t *indxp;
432 DBT *key, *data;
433 u_int32_t op, flags;
435 BTREE *t;
436 BKEYDATA *bk;
437 DBT tdbt;
438 PAGE *h;
439 db_indx_t indx, nbytes;
440 u_int32_t data_size, have_bytes, need_bytes, needed;
441 int bigkey, bigdata, dupadjust, replace, ret;
443 COMPQUIET(bk, NULL);
445 t = dbp->internal;
446 h = *hp;
447 indx = *indxp;
448 dupadjust = replace = 0;
451 * If it's a page of duplicates, call the common code to do the work.
453 * !!!
454 * Here's where the hp and indxp are important. The duplicate code
455 * may decide to rework/rearrange the pages and indices we're using,
456 * so the caller must understand that the page stack may change.
458 if (TYPE(h) == P_DUPLICATE) {
459 /* Adjust the index for the new item if it's a DB_AFTER op. */
460 if (op == DB_AFTER)
461 ++*indxp;
463 /* Remove the current item if it's a DB_CURRENT op. */
464 if (op == DB_CURRENT) {
465 bk = GET_BKEYDATA(*hp, *indxp);
466 switch (B_TYPE(bk->type)) {
467 case B_KEYDATA:
468 nbytes = BKEYDATA_SIZE(bk->len);
469 break;
470 case B_OVERFLOW:
471 nbytes = BOVERFLOW_SIZE;
472 break;
473 default:
474 return (__db_pgfmt(dbp, h->pgno));
476 if ((ret = __db_ditem(dbp, *hp, *indxp, nbytes)) != 0)
477 return (ret);
480 /* Put the new/replacement item onto the page. */
481 if ((ret = __db_dput(dbp, data, hp, indxp, __bam_new)) != 0)
482 return (ret);
484 goto done;
487 /* Handle fixed-length records: build the real record. */
488 if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) {
489 tdbt = *data;
490 if ((ret = __bam_fixed(t, &tdbt)) != 0)
491 return (ret);
492 data = &tdbt;
496 * Figure out how much space the data will take, including if it's a
497 * partial record. If either of the key or data items won't fit on
498 * a page, we'll have to store them on overflow pages.
500 bigkey = LF_ISSET(BI_NEWKEY) && key->size > t->bt_ovflsize;
501 data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
502 __bam_partsize(data, h, indx) : data->size;
503 bigdata = data_size > t->bt_ovflsize;
505 needed = 0;
506 if (LF_ISSET(BI_NEWKEY)) {
507 /* If BI_NEWKEY is set we're adding a new key and data pair. */
508 if (bigkey)
509 needed += BOVERFLOW_PSIZE;
510 else
511 needed += BKEYDATA_PSIZE(key->size);
512 if (bigdata)
513 needed += BOVERFLOW_PSIZE;
514 else
515 needed += BKEYDATA_PSIZE(data_size);
516 } else {
518 * We're either overwriting the data item of a key/data pair
519 * or we're adding the data item only, i.e. a new duplicate.
521 if (op == DB_CURRENT) {
522 bk = GET_BKEYDATA(h,
523 indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
524 if (B_TYPE(bk->type) == B_KEYDATA)
525 have_bytes = BKEYDATA_PSIZE(bk->len);
526 else
527 have_bytes = BOVERFLOW_PSIZE;
528 need_bytes = 0;
529 } else {
530 have_bytes = 0;
531 need_bytes = sizeof(db_indx_t);
533 if (bigdata)
534 need_bytes += BOVERFLOW_PSIZE;
535 else
536 need_bytes += BKEYDATA_PSIZE(data_size);
538 if (have_bytes < need_bytes)
539 needed += need_bytes - have_bytes;
543 * If there's not enough room, or the user has put a ceiling on the
544 * number of keys permitted in the page, split the page.
546 * XXX
547 * The t->bt_maxkey test here may be insufficient -- do we have to
548 * check in the btree split code, so we don't undo it there!?!?
550 if (P_FREESPACE(h) < needed ||
551 (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey))
552 return (DB_NEEDSPLIT);
554 /* Handle partial puts: build the real record. */
555 if (F_ISSET(data, DB_DBT_PARTIAL)) {
556 tdbt = *data;
557 if ((ret = __bam_partial(dbp, &tdbt, h, indx, data_size)) != 0)
558 return (ret);
559 data = &tdbt;
563 * The code breaks it up into six cases:
565 * 1. Append a new key/data pair.
566 * 2. Insert a new key/data pair.
567 * 3. Append a new data item (a new duplicate).
568 * 4. Insert a new data item (a new duplicate).
569 * 5. Overflow item: delete and re-add the data item.
570 * 6. Replace the data item.
572 if (LF_ISSET(BI_NEWKEY)) {
573 switch (op) {
574 case DB_AFTER: /* 1. Append a new key/data pair. */
575 indx += 2;
576 *indxp += 2;
577 break;
578 case DB_BEFORE: /* 2. Insert a new key/data pair. */
579 break;
580 default:
581 return (EINVAL);
584 /* Add the key. */
585 if (bigkey) {
586 if ((ret = __bam_ovput(dbp, h, indx, key)) != 0)
587 return (ret);
588 } else
589 if ((ret = __db_pitem(dbp, h, indx,
590 BKEYDATA_SIZE(key->size), NULL, key)) != 0)
591 return (ret);
592 ++indx;
593 } else {
594 switch (op) {
595 case DB_AFTER: /* 3. Append a new data item. */
596 if (TYPE(h) == P_LBTREE) {
598 * Adjust the cursor and copy in the key for
599 * the duplicate.
601 if ((ret = __bam_adjindx(dbp,
602 h, indx + P_INDX, indx, 1)) != 0)
603 return (ret);
605 indx += 3;
606 dupadjust = 1;
608 *indxp += 2;
609 } else {
610 ++indx;
611 __bam_ca_di(dbp, h->pgno, indx, 1);
613 *indxp += 1;
615 break;
616 case DB_BEFORE: /* 4. Insert a new data item. */
617 if (TYPE(h) == P_LBTREE) {
619 * Adjust the cursor and copy in the key for
620 * the duplicate.
622 if ((ret =
623 __bam_adjindx(dbp, h, indx, indx, 1)) != 0)
624 return (ret);
626 ++indx;
627 dupadjust = 1;
628 } else
629 __bam_ca_di(dbp, h->pgno, indx, 1);
630 break;
631 case DB_CURRENT:
632 if (TYPE(h) == P_LBTREE)
633 ++indx;
636 * 5. Delete/re-add the data item.
638 * If we're dealing with offpage items, we have to
639 * delete and then re-add the item.
641 if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
642 if ((ret = __bam_ditem(dbp, h, indx)) != 0)
643 return (ret);
644 break;
647 /* 6. Replace the data item. */
648 replace = 1;
649 break;
650 default:
651 return (EINVAL);
655 /* Add the data. */
656 if (bigdata) {
657 if ((ret = __bam_ovput(dbp, h, indx, data)) != 0)
658 return (ret);
659 } else {
660 BKEYDATA __bk;
661 DBT __hdr;
663 if (LF_ISSET(BI_DELETED)) {
664 B_TSET(__bk.type, B_KEYDATA, 1);
665 __bk.len = data->size;
666 __hdr.data = &__bk;
667 __hdr.size = SSZA(BKEYDATA, data);
668 ret = __db_pitem(dbp, h, indx,
669 BKEYDATA_SIZE(data->size), &__hdr, data);
670 } else if (replace)
671 ret = __bam_ritem(dbp, h, indx, data);
672 else
673 ret = __db_pitem(dbp, h, indx,
674 BKEYDATA_SIZE(data->size), NULL, data);
675 if (ret != 0)
676 return (ret);
679 if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
680 return (ret);
683 * If the page is at least 50% full, and we added a duplicate, see if
684 * that set of duplicates takes up at least 25% of the space. If it
685 * does, move it off onto its own page.
687 if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) {
688 --indx;
689 if ((ret = __bam_ndup(dbp, h, indx)) != 0)
690 return (ret);
694 * If we've changed the record count, update the tree. Record counts
695 * need to be updated in recno databases and in btree databases where
696 * we are supporting records. In both cases, adjust the count if the
697 * operation wasn't performed on the current record or when the caller
698 * overrides and wants the adjustment made regardless.
700 done: if (LF_ISSET(BI_DOINCR) ||
701 (op != DB_CURRENT &&
702 (F_ISSET(dbp, DB_BT_RECNUM) || dbp->type == DB_RECNO)))
703 if ((ret = __bam_adjust(dbp, t, 1)) != 0)
704 return (ret);
706 /* If we've modified a recno file, set the flag */
707 if (t->bt_recno != NULL)
708 F_SET(t->bt_recno, RECNO_MODIFIED);
710 ++t->lstat.bt_added;
712 return (ret);
716 * __bam_partsize --
717 * Figure out how much space a partial data item is in total.
719 static u_int32_t
720 __bam_partsize(data, h, indx)
721 DBT *data;
722 PAGE *h;
723 u_int32_t indx;
725 BKEYDATA *bk;
726 u_int32_t nbytes;
729 * Figure out how much total space we'll need. If the record doesn't
730 * already exist, it's simply the data we're provided.
732 if (indx >= NUM_ENT(h))
733 return (data->doff + data->size);
736 * Otherwise, it's the data provided plus any already existing data
737 * that we're not replacing.
739 bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
740 nbytes =
741 B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
744 * There are really two cases here:
746 * Case 1: We are replacing some bytes that do not exist (i.e., they
747 * are past the end of the record). In this case the number of bytes
748 * we are replacing is irrelevant and all we care about is how many
749 * bytes we are going to add from offset. So, the new record length
750 * is going to be the size of the new bytes (size) plus wherever those
751 * new bytes begin (doff).
753 * Case 2: All the bytes we are replacing exist. Therefore, the new
754 * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
755 * plus the bytes we are adding (size).
757 if (nbytes < data->doff + data->dlen) /* Case 1 */
758 return (data->doff + data->size);
760 return (nbytes + data->size - data->dlen); /* Case 2 */
764 * OVPUT --
765 * Copy an overflow item onto a page.
767 #undef OVPUT
768 #define OVPUT(h, indx, bo) do { \
769 DBT __hdr; \
770 memset(&__hdr, 0, sizeof(__hdr)); \
771 __hdr.data = &bo; \
772 __hdr.size = BOVERFLOW_SIZE; \
773 if ((ret = __db_pitem(dbp, \
774 h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \
775 return (ret); \
776 } while (0)
779 * __bam_ovput --
780 * Build an overflow item and put it on the page.
782 static int
783 __bam_ovput(dbp, h, indx, item)
784 DB *dbp;
785 PAGE *h;
786 u_int32_t indx;
787 DBT *item;
789 BOVERFLOW bo;
790 int ret;
792 B_TSET(bo.type, B_OVERFLOW, 0);
793 bo.tlen = item->size;
794 if ((ret = __db_poff(dbp, item, &bo.pgno, __bam_new)) != 0)
795 return (ret);
797 OVPUT(h, indx, bo);
799 return (0);
803 * __bam_ritem --
804 * Replace an item on a page.
806 * PUBLIC: int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *));
809 __bam_ritem(dbp, h, indx, data)
810 DB *dbp;
811 PAGE *h;
812 u_int32_t indx;
813 DBT *data;
815 BKEYDATA *bk;
816 DBT orig, repl;
817 db_indx_t cnt, lo, ln, min, off, prefix, suffix;
818 int32_t nbytes;
819 int ret;
820 u_int8_t *p, *t;
823 * Replace a single item onto a page. The logic figuring out where
824 * to insert and whether it fits is handled in the caller. All we do
825 * here is manage the page shuffling.
827 bk = GET_BKEYDATA(h, indx);
829 /* Log the change. */
830 if (DB_LOGGING(dbp)) {
832 * We might as well check to see if the two data items share
833 * a common prefix and suffix -- it can save us a lot of log
834 * message if they're large.
836 min = data->size < bk->len ? data->size : bk->len;
837 for (prefix = 0,
838 p = bk->data, t = data->data;
839 prefix < min && *p == *t; ++prefix, ++p, ++t)
842 min -= prefix;
843 for (suffix = 0,
844 p = (u_int8_t *)bk->data + bk->len - 1,
845 t = (u_int8_t *)data->data + data->size - 1;
846 suffix < min && *p == *t; ++suffix, --p, --t)
849 /* We only log the parts of the keys that have changed. */
850 orig.data = (u_int8_t *)bk->data + prefix;
851 orig.size = bk->len - (prefix + suffix);
852 repl.data = (u_int8_t *)data->data + prefix;
853 repl.size = data->size - (prefix + suffix);
854 if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbp->txn,
855 &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h),
856 (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type),
857 &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
858 return (ret);
862 * Set references to the first in-use byte on the page and the
863 * first byte of the item being replaced.
865 p = (u_int8_t *)h + HOFFSET(h);
866 t = (u_int8_t *)bk;
869 * If the entry is growing in size, shift the beginning of the data
870 * part of the page down. If the entry is shrinking in size, shift
871 * the beginning of the data part of the page up. Use memmove(3),
872 * the regions overlap.
874 lo = BKEYDATA_SIZE(bk->len);
875 ln = BKEYDATA_SIZE(data->size);
876 if (lo != ln) {
877 nbytes = lo - ln; /* Signed difference. */
878 if (p == t) /* First index is fast. */
879 h->inp[indx] += nbytes;
880 else { /* Else, shift the page. */
881 memmove(p + nbytes, p, t - p);
883 /* Adjust the indices' offsets. */
884 off = h->inp[indx];
885 for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
886 if (h->inp[cnt] <= off)
887 h->inp[cnt] += nbytes;
890 /* Clean up the page and adjust the item's reference. */
891 HOFFSET(h) += nbytes;
892 t += nbytes;
895 /* Copy the new item onto the page. */
896 bk = (BKEYDATA *)t;
897 B_TSET(bk->type, B_KEYDATA, 0);
898 bk->len = data->size;
899 memcpy(bk->data, data->data, data->size);
901 return (0);
905 * __bam_ndup --
906 * Check to see if the duplicate set at indx should have its own page.
907 * If it should, create it.
909 static int
910 __bam_ndup(dbp, h, indx)
911 DB *dbp;
912 PAGE *h;
913 u_int32_t indx;
915 BKEYDATA *bk;
916 BOVERFLOW bo;
917 DBT hdr;
918 PAGE *cp;
919 db_indx_t cnt, cpindx, first, sz;
920 int ret;
922 while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
923 indx -= P_INDX;
924 for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) {
925 if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx])
926 break;
927 bk = GET_BKEYDATA(h, indx);
928 sz += B_TYPE(bk->type) == B_KEYDATA ?
929 BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
930 bk = GET_BKEYDATA(h, indx + O_INDX);
931 sz += B_TYPE(bk->type) == B_KEYDATA ?
932 BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
936 * If this set of duplicates is using more than 25% of the page, move
937 * them off. The choice of 25% is a WAG, but it has to be small enough
938 * that we can always split regardless of the presence of duplicates.
940 if (sz < dbp->pgsize / 4)
941 return (0);
943 /* Get a new page. */
944 if ((ret = __bam_new(dbp, P_DUPLICATE, &cp)) != 0)
945 return (ret);
948 * Move this set of duplicates off the page. First points to the first
949 * key of the first duplicate key/data pair, cnt is the number of pairs
950 * we're dealing with.
952 memset(&hdr, 0, sizeof(hdr));
953 for (indx = first + O_INDX, cpindx = 0;; ++cpindx) {
954 /* Copy the entry to the new page. */
955 bk = GET_BKEYDATA(h, indx);
956 hdr.data = bk;
957 hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
958 BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
959 if ((ret =
960 __db_pitem(dbp, cp, cpindx, hdr.size, &hdr, NULL)) != 0)
961 goto err;
964 * Move cursors referencing the old entry to the new entry.
965 * Done after the page put because __db_pitem() adjusts
966 * cursors on the new page, and before the delete because
967 * __db_ditem adjusts cursors on the old page.
969 __bam_ca_dup(dbp,
970 PGNO(h), first, indx - O_INDX, PGNO(cp), cpindx);
972 /* Delete the data item. */
973 if ((ret = __db_ditem(dbp, h, indx, hdr.size)) != 0)
974 goto err;
976 /* Delete all but the first reference to the key. */
977 if (--cnt == 0)
978 break;
979 if ((ret = __bam_adjindx(dbp, h, indx, first, 0)) != 0)
980 goto err;
983 /* Put in a new data item that points to the duplicates page. */
984 B_TSET(bo.type, B_DUPLICATE, 0);
985 bo.pgno = cp->pgno;
986 bo.tlen = 0;
988 OVPUT(h, indx, bo);
990 return (memp_fput(dbp->mpf, cp, DB_MPOOL_DIRTY));
992 err: (void)__bam_free(dbp, cp);
993 return (ret);
997 * __bam_fixed --
998 * Build the real record for a fixed length put.
1000 static int
1001 __bam_fixed(t, dbt)
1002 BTREE *t;
1003 DBT *dbt;
1005 RECNO *rp;
1007 rp = t->bt_recno;
1010 * If database contains fixed-length records, and the record is long,
1011 * return EINVAL.
1013 if (dbt->size > rp->re_len)
1014 return (EINVAL);
1017 * The caller checked to see if it was just right, so we know it's
1018 * short. Pad it out. We use the record data return memory, it's
1019 * only a short-term use.
1021 if (t->bt_rdata.ulen < rp->re_len) {
1022 t->bt_rdata.data = t->bt_rdata.data == NULL ?
1023 (void *)__db_malloc(rp->re_len) :
1024 (void *)__db_realloc(t->bt_rdata.data, rp->re_len);
1025 if (t->bt_rdata.data == NULL) {
1026 t->bt_rdata.ulen = 0;
1027 return (ENOMEM);
1029 t->bt_rdata.ulen = rp->re_len;
1031 memcpy(t->bt_rdata.data, dbt->data, dbt->size);
1032 memset((u_int8_t *)t->bt_rdata.data + dbt->size,
1033 rp->re_pad, rp->re_len - dbt->size);
1036 * Clean up our flags and other information just in case, and
1037 * change the caller's DBT to reference our created record.
1039 t->bt_rdata.size = rp->re_len;
1040 t->bt_rdata.dlen = 0;
1041 t->bt_rdata.doff = 0;
1042 t->bt_rdata.flags = 0;
1043 *dbt = t->bt_rdata;
1045 return (0);
1049 * __bam_partial --
1050 * Build the real record for a partial put.
1052 static int
1053 __bam_partial(dbp, dbt, h, indx, nbytes)
1054 DB *dbp;
1055 DBT *dbt;
1056 PAGE *h;
1057 u_int32_t indx, nbytes;
1059 BTREE *t;
1060 BKEYDATA *bk, tbk;
1061 BOVERFLOW *bo;
1062 DBT copy;
1063 u_int32_t len, tlen;
1064 u_int8_t *p;
1065 int ret;
1067 COMPQUIET(bo, NULL);
1069 t = dbp->internal;
1071 /* We use the record data return memory, it's only a short-term use. */
1072 if (t->bt_rdata.ulen < nbytes) {
1073 t->bt_rdata.data = t->bt_rdata.data == NULL ?
1074 (void *)__db_malloc(nbytes) :
1075 (void *)__db_realloc(t->bt_rdata.data, nbytes);
1076 if (t->bt_rdata.data == NULL) {
1077 t->bt_rdata.ulen = 0;
1078 return (ENOMEM);
1080 t->bt_rdata.ulen = nbytes;
1083 /* Find the current record. */
1084 if (indx < NUM_ENT(h)) {
1085 bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
1086 bo = (BOVERFLOW *)bk;
1087 } else {
1088 bk = &tbk;
1089 B_TSET(bk->type, B_KEYDATA, 0);
1090 bk->len = 0;
1094 * We use nul bytes for any part of the record that isn't specified,
1095 * get it over with.
1097 memset(t->bt_rdata.data, 0, nbytes);
1099 if (B_TYPE(bk->type) == B_OVERFLOW) {
1101 * In the case of an overflow record, we shift things around
1102 * in the current record rather than allocate a separate copy.
1104 memset(&copy, 0, sizeof(copy));
1105 if ((ret = __db_goff(dbp, &copy, bo->tlen,
1106 bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0)
1107 return (ret);
1109 /* Skip any leading data from the original record. */
1110 tlen = dbt->doff;
1111 p = (u_int8_t *)t->bt_rdata.data + dbt->doff;
1114 * Copy in any trailing data from the original record.
1116 * If the original record was larger than the original offset
1117 * plus the bytes being deleted, there is trailing data in the
1118 * original record we need to preserve. If we aren't deleting
1119 * the same number of bytes as we're inserting, copy it up or
1120 * down, into place.
1122 * Use memmove(), the regions may overlap.
1124 if (bo->tlen > dbt->doff + dbt->dlen) {
1125 len = bo->tlen - (dbt->doff + dbt->dlen);
1126 if (dbt->dlen != dbt->size)
1127 memmove(p + dbt->size, p + dbt->dlen, len);
1128 tlen += len;
1131 /* Copy in the application provided data. */
1132 memcpy(p, dbt->data, dbt->size);
1133 tlen += dbt->size;
1134 } else {
1135 /* Copy in any leading data from the original record. */
1136 memcpy(t->bt_rdata.data,
1137 bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
1138 tlen = dbt->doff;
1139 p = (u_int8_t *)t->bt_rdata.data + dbt->doff;
1141 /* Copy in the application provided data. */
1142 memcpy(p, dbt->data, dbt->size);
1143 tlen += dbt->size;
1145 /* Copy in any trailing data from the original record. */
1146 len = dbt->doff + dbt->dlen;
1147 if (bk->len > len) {
1148 memcpy(p + dbt->size, bk->data + len, bk->len - len);
1149 tlen += bk->len - len;
1153 /* Set the DBT to reference our new record. */
1154 t->bt_rdata.size = tlen;
1155 t->bt_rdata.dlen = 0;
1156 t->bt_rdata.doff = 0;
1157 t->bt_rdata.flags = 0;
1158 *dbt = t->bt_rdata;
1159 return (0);