2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
11 static const char sccsid
[] = "@(#)bt_cursor.c 10.53 (Sleepycat) 5/25/98";
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
25 static int __bam_c_close
__P((DBC
*));
26 static int __bam_c_del
__P((DBC
*, u_int32_t
));
27 static int __bam_c_first
__P((DB
*, CURSOR
*));
28 static int __bam_c_get
__P((DBC
*, DBT
*, DBT
*, u_int32_t
));
29 static int __bam_c_getstack
__P((DB
*, CURSOR
*));
30 static int __bam_c_last
__P((DB
*, CURSOR
*));
31 static int __bam_c_next
__P((DB
*, CURSOR
*, int));
32 static int __bam_c_physdel
__P((DB
*, CURSOR
*, PAGE
*));
33 static int __bam_c_prev
__P((DB
*, CURSOR
*));
34 static int __bam_c_put
__P((DBC
*, DBT
*, DBT
*, u_int32_t
));
35 static int __bam_c_rget
__P((DB
*, CURSOR
*, DBT
*, u_int32_t
));
36 static int __bam_c_search
37 __P((DB
*, CURSOR
*, const DBT
*, u_int32_t
, int, int *));
39 /* Discard the current page/lock held by a cursor. */
41 #define DISCARD(dbp, cp) { \
42 if ((cp)->page != NULL) { \
43 (void)memp_fput(dbp->mpf, (cp)->page, 0); \
46 if ((cp)->lock != LOCK_INVALID) { \
47 (void)__BT_TLPUT((dbp), (cp)->lock); \
48 (cp)->lock = LOCK_INVALID; \
54 * Interface to the cursor functions.
56 * PUBLIC: int __bam_cursor __P((DB *, DB_TXN *, DBC **));
59 __bam_cursor(dbp
, txn
, dbcp
)
67 DEBUG_LWRITE(dbp
, txn
, "bam_cursor", NULL
, NULL
, 0);
69 if ((dbc
= (DBC
*)__db_calloc(1, sizeof(DBC
))) == NULL
)
71 if ((cp
= (CURSOR
*)__db_calloc(1, sizeof(CURSOR
))) == NULL
) {
77 cp
->pgno
= cp
->dpgno
= PGNO_INVALID
;
78 cp
->lock
= LOCK_INVALID
;
83 dbc
->c_close
= __bam_c_close
;
84 dbc
->c_del
= __bam_c_del
;
85 dbc
->c_get
= __bam_c_get
;
86 dbc
->c_put
= __bam_c_put
;
89 * All cursors are queued from the master DB structure. Add the
90 * cursor to that queue.
93 TAILQ_INSERT_HEAD(&dbp
->curs_queue
, dbc
, links
);
102 * Close a single cursor.
111 DEBUG_LWRITE(dbc
->dbp
, dbc
->txn
, "bam_c_close", NULL
, NULL
, 0);
113 GETHANDLE(dbc
->dbp
, dbc
->txn
, &dbp
, ret
);
115 ret
= __bam_c_iclose(dbp
, dbc
);
123 * Close a single cursor -- internal version.
125 * PUBLIC: int __bam_c_iclose __P((DB *, DBC *));
128 __bam_c_iclose(dbp
, dbc
)
135 /* If a cursor key was deleted, perform the actual deletion. */
137 ret
= F_ISSET(cp
, C_DELETED
) ? __bam_c_physdel(dbp
, cp
, NULL
) : 0;
139 /* Discard any lock if we're not inside a transaction. */
140 if (cp
->lock
!= LOCK_INVALID
)
141 (void)__BT_TLPUT(dbp
, cp
->lock
);
143 /* Remove the cursor from the queue. */
145 TAILQ_REMOVE(&dbp
->curs_queue
, dbc
, links
);
146 CURSOR_TEARDOWN(dbp
);
148 /* Discard the structures. */
149 FREE(dbc
->internal
, sizeof(CURSOR
));
150 FREE(dbc
, sizeof(DBC
));
157 * Delete using a cursor.
160 __bam_c_del(dbc
, flags
)
173 DEBUG_LWRITE(dbc
->dbp
, dbc
->txn
, "bam_c_del", NULL
, NULL
, flags
);
178 /* Check for invalid flags. */
179 if ((ret
= __db_cdelchk(dbc
->dbp
, flags
,
180 F_ISSET(dbc
->dbp
, DB_AM_RDONLY
), cp
->pgno
!= PGNO_INVALID
)) != 0)
183 /* If already deleted, return failure. */
184 if (F_ISSET(cp
, C_DELETED
| C_REPLACE
))
185 return (DB_KEYEMPTY
);
187 GETHANDLE(dbc
->dbp
, dbc
->txn
, &dbp
, ret
);
191 * We don't physically delete the record until the cursor moves,
192 * so we have to have a long-lived write lock on the page instead
193 * of a long-lived read lock. Note, we have to have a read lock
194 * to even get here, so we simply discard it.
196 if (F_ISSET(dbp
, DB_AM_LOCKING
) && cp
->mode
!= DB_LOCK_WRITE
) {
197 if ((ret
= __bam_lget(dbp
,
198 0, cp
->pgno
, DB_LOCK_WRITE
, &lock
)) != 0)
200 (void)__BT_TLPUT(dbp
, cp
->lock
);
202 cp
->mode
= DB_LOCK_WRITE
;
206 * Acquire the underlying page (which may be different from the above
207 * page because it may be a duplicate page), and set the on-page and
208 * in-cursor delete flags. We don't need to lock it as we've already
209 * write-locked the page leading to it.
211 if (cp
->dpgno
== PGNO_INVALID
) {
219 if ((ret
= __bam_pget(dbp
, &h
, &pgno
, 0)) != 0)
222 /* Log the change. */
223 if (DB_LOGGING(dbp
) &&
224 (ret
= __bam_cdel_log(dbp
->dbenv
->lg_info
, dbp
->txn
, &LSN(h
),
225 0, dbp
->log_fileid
, PGNO(h
), &LSN(h
), indx
)) != 0) {
226 (void)memp_fput(dbp
->mpf
, h
, 0);
230 /* Set the intent-to-delete flag on the page and in all cursors. */
231 if (cp
->dpgno
== PGNO_INVALID
)
232 B_DSET(GET_BKEYDATA(h
, indx
+ O_INDX
)->type
);
234 B_DSET(GET_BKEYDATA(h
, indx
)->type
);
235 (void)__bam_ca_delete(dbp
, pgno
, indx
, NULL
, 0);
237 ret
= memp_fput(dbp
->mpf
, h
, DB_MPOOL_DIRTY
);
241 * If it's a btree with record numbers, we have to adjust the
244 if (F_ISSET(dbp
, DB_BT_RECNUM
) &&
245 (ret
= __bam_c_getstack(dbp
, cp
)) == 0) {
246 ret
= __bam_adjust(dbp
, t
, -1);
247 (void)__bam_stkrel(dbp
);
251 (void)memp_fput(dbp
->mpf
, h
, 0);
258 * Retrieve a key/data pair from the tree.
260 * PUBLIC: int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
263 __bam_get(argdbp
, txn
, key
, data
, flags
)
273 DEBUG_LREAD(argdbp
, txn
, "bam_get", key
, NULL
, flags
);
275 /* Check for invalid flags. */
276 if ((ret
= __db_getchk(argdbp
, key
, data
, flags
)) != 0)
279 /* Build an internal cursor. */
280 memset(&cp
, 0, sizeof(cp
));
282 cp
.pgno
= cp
.dpgno
= PGNO_INVALID
;
283 cp
.lock
= LOCK_INVALID
;
284 cp
.flags
= C_INTERNAL
;
286 /* Build an external cursor. */
287 memset(&dbc
, 0, sizeof(dbc
));
293 return(__bam_c_get(&dbc
,
294 key
, data
, LF_ISSET(DB_SET_RECNO
) ? DB_SET_RECNO
: DB_SET
));
299 * Get using a cursor (btree).
302 __bam_c_get(dbc
, key
, data
, flags
)
313 DEBUG_LREAD(dbc
->dbp
, dbc
->txn
, "bam_c_get",
314 flags
== DB_SET
|| flags
== DB_SET_RANGE
? key
: NULL
, NULL
, flags
);
318 /* Check for invalid flags. */
319 if ((ret
= __db_cgetchk(dbc
->dbp
,
320 key
, data
, flags
, cp
->pgno
!= PGNO_INVALID
)) != 0)
323 GETHANDLE(dbc
->dbp
, dbc
->txn
, &dbp
, ret
);
327 * Break out the code to return a cursor's record number. It
328 * has nothing to do with the cursor get code except that it's
329 * been rammed into the interface.
331 if (LF_ISSET(DB_GET_RECNO
)) {
332 ret
= __bam_c_rget(dbp
, cp
, data
, flags
);
337 /* Initialize the cursor for a new retrieval. */
340 cp
->lock
= LOCK_INVALID
;
344 /* It's not possible to return a deleted record. */
345 if (F_ISSET(cp
, C_DELETED
| C_REPLACE
)) {
347 return (DB_KEYEMPTY
);
350 /* Get the page with the current item on it. */
351 if ((ret
= __bam_pget(dbp
, &cp
->page
, &cp
->pgno
, 0)) != 0)
355 if (cp
->pgno
!= PGNO_INVALID
) {
356 if ((ret
= __bam_c_next(dbp
, cp
, 1)) != 0)
362 if ((ret
= __bam_c_first(dbp
, cp
)) != 0)
366 if (cp
->pgno
!= PGNO_INVALID
) {
367 if ((ret
= __bam_c_prev(dbp
, cp
)) != 0)
373 if ((ret
= __bam_c_last(dbp
, cp
)) != 0)
379 __bam_c_search(dbp
, cp
, key
, S_FIND
, 1, &exact
)) != 0)
385 __bam_c_search(dbp
, cp
, key
, S_FIND
, 0, &exact
)) != 0)
391 __bam_c_search(dbp
, cp
, key
, S_FIND
, 0, &exact
)) != 0)
397 * Return the key if the user didn't give us one. If we've moved to
398 * a duplicate page, we may no longer have a pointer to the main page,
399 * so we have to go get it. We know that it's already read-locked,
400 * however, so we don't have to acquire a new lock.
402 if (flags
!= DB_SET
) {
403 if (cp
->dpgno
!= PGNO_INVALID
) {
404 if ((ret
= __bam_pget(dbp
, &h
, &cp
->pgno
, 0)) != 0)
409 h
, cp
->indx
, key
, &t
->bt_rkey
.data
, &t
->bt_rkey
.ulen
);
410 if (cp
->dpgno
!= PGNO_INVALID
)
411 (void)memp_fput(dbp
->mpf
, h
, 0);
416 /* Return the data. */
417 if ((ret
= __db_ret(dbp
, cp
->page
,
418 cp
->dpgno
== PGNO_INVALID
? cp
->indx
+ O_INDX
: cp
->dindx
,
419 data
, &t
->bt_rdata
.data
, &t
->bt_rdata
.ulen
)) != 0)
423 * If the previous cursor record has been deleted, delete it. The
424 * returned key isn't a deleted key, so clear the flag.
426 if (F_ISSET(©
, C_DELETED
) && __bam_c_physdel(dbp
, ©
, cp
->page
))
428 F_CLR(cp
, C_DELETED
| C_REPLACE
);
430 /* Release the previous lock, if any. */
431 if (copy
.lock
!= LOCK_INVALID
)
432 (void)__BT_TLPUT(dbp
, copy
.lock
);
434 /* Release the pinned page. */
435 ret
= memp_fput(dbp
->mpf
, cp
->page
, 0);
437 /* Internal cursors don't hold locks. */
438 if (F_ISSET(cp
, C_INTERNAL
) && cp
->lock
!= LOCK_INVALID
)
439 (void)__BT_TLPUT(dbp
, cp
->lock
);
444 err
: if (cp
->page
!= NULL
)
445 (void)memp_fput(dbp
->mpf
, cp
->page
, 0);
446 if (cp
->lock
!= LOCK_INVALID
)
447 (void)__BT_TLPUT(dbp
, cp
->lock
);
457 * Return the record number for a cursor.
460 __bam_c_rget(dbp
, cp
, data
, flags
)
473 /* Get the page with the current item on it. */
474 if ((ret
= __bam_pget(dbp
, &cp
->page
, &cp
->pgno
, 0)) != 0)
477 /* Get a copy of the key. */
478 memset(&dbt
, 0, sizeof(DBT
));
479 dbt
.flags
= DB_DBT_MALLOC
| DB_DBT_INTERNAL
;
480 if ((ret
= __db_ret(dbp
, cp
->page
, cp
->indx
, &dbt
, NULL
, NULL
)) != 0)
484 if ((ret
= __bam_search(dbp
, &dbt
, S_FIND
, 1, &recno
, &exact
)) != 0)
488 ret
= __db_retcopy(data
, &recno
, sizeof(recno
),
489 &t
->bt_rdata
.data
, &t
->bt_rdata
.ulen
, dbp
->db_malloc
);
491 /* Release the stack. */
494 err
: (void)memp_fput(dbp
->mpf
, cp
->page
, 0);
501 * Put using a cursor.
504 __bam_c_put(dbc
, key
, data
, flags
)
516 int exact
, needkey
, ret
, stack
;
519 DEBUG_LWRITE(dbc
->dbp
, dbc
->txn
, "bam_c_put",
520 flags
== DB_KEYFIRST
|| flags
== DB_KEYLAST
? key
: NULL
,
525 if ((ret
= __db_cputchk(dbc
->dbp
, key
, data
, flags
,
526 F_ISSET(dbc
->dbp
, DB_AM_RDONLY
), cp
->pgno
!= PGNO_INVALID
)) != 0)
529 GETHANDLE(dbc
->dbp
, dbc
->txn
, &dbp
, ret
);
532 /* Initialize the cursor for a new retrieval. */
535 cp
->lock
= LOCK_INVALID
;
538 * To split, we need a valid key for the page. Since it's a cursor,
539 * we have to build one.
543 split
: /* Acquire a copy of a key from the page. */
545 memset(&dbt
, 0, sizeof(DBT
));
546 if ((ret
= __db_ret(dbp
, cp
->page
, indx
,
547 &dbt
, &t
->bt_rkey
.data
, &t
->bt_rkey
.ulen
)) != 0)
553 /* Discard any pinned pages. */
555 (void)__bam_stkrel(dbp
);
560 if ((ret
= __bam_split(dbp
, arg
)) != 0)
570 if (cp
->dpgno
== PGNO_INVALID
) {
579 * This test is right -- we don't currently support duplicates
580 * in the presence of record numbers, so we don't worry about
581 * them if DB_BT_RECNUM is set.
583 if (F_ISSET(dbp
, DB_BT_RECNUM
) &&
584 (flags
!= DB_CURRENT
|| F_ISSET(cp
, C_DELETED
))) {
585 /* Acquire a complete stack. */
586 if ((ret
= __bam_c_getstack(dbp
, cp
)) != 0)
588 cp
->page
= t
->bt_csp
->page
;
593 /* Acquire the current page. */
594 if ((ret
= __bam_lget(dbp
,
595 0, cp
->pgno
, DB_LOCK_WRITE
, &cp
->lock
)) == 0)
596 ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0);
602 if ((ret
= __bam_iitem(dbp
, &cp
->page
,
603 &indx
, key
, data
, flags
, iiflags
)) == DB_NEEDSPLIT
)
609 __bam_c_search(dbp
, cp
, key
, S_KEYFIRST
, 0, &exact
)) != 0)
613 indx
= cp
->dpgno
== PGNO_INVALID
? cp
->indx
: cp
->dindx
;
614 if ((ret
= __bam_iitem(dbp
, &cp
->page
, &indx
, key
,
615 data
, DB_BEFORE
, exact
? 0 : BI_NEWKEY
)) == DB_NEEDSPLIT
)
621 __bam_c_search(dbp
, cp
, key
, S_KEYLAST
, 0, &exact
)) != 0)
625 indx
= cp
->dpgno
== PGNO_INVALID
? cp
->indx
: cp
->dindx
;
626 if ((ret
= __bam_iitem(dbp
, &cp
->page
, &indx
, key
,
627 data
, DB_AFTER
, exact
? 0 : BI_NEWKEY
)) == DB_NEEDSPLIT
)
635 * Update the cursor to point to the new entry. The new entry was
636 * stored on the current page, because we split pages until it was
639 if (cp
->dpgno
== PGNO_INVALID
)
645 * If the previous cursor record has been deleted, delete it. The
646 * returned key isn't a deleted key, so clear the flag.
648 if (F_ISSET(©
, C_DELETED
) &&
649 (ret
= __bam_c_physdel(dbp
, ©
, cp
->page
)) != 0)
651 F_CLR(cp
, C_DELETED
| C_REPLACE
);
653 /* Release the previous lock, if any. */
654 if (copy
.lock
!= LOCK_INVALID
)
655 (void)__BT_TLPUT(dbp
, copy
.lock
);
658 * Discard any pages pinned in the tree and their locks, except for
659 * the leaf page, for which we only discard the pin, not the lock.
661 * Note, the leaf page participated in the stack we acquired, and so
662 * we have to adjust the stack as necessary. If there was only a
663 * single page on the stack, we don't have to free further stack pages.
666 if (stack
&& BT_STK_POP(t
) != NULL
)
667 (void)__bam_stkrel(dbp
);
669 if ((ret
= memp_fput(dbp
->mpf
, cp
->page
, 0)) != 0)
673 err
: /* Discard any pinned pages. */
675 (void)__bam_stkrel(dbp
);
687 * Return the first record.
690 __bam_c_first(dbp
, cp
)
697 /* Walk down the left-hand side of the tree. */
698 for (pgno
= PGNO_ROOT
;;) {
700 __bam_lget(dbp
, 0, pgno
, DB_LOCK_READ
, &cp
->lock
)) != 0)
702 if ((ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0)) != 0)
705 /* If we find a leaf page, we're done. */
706 if (ISLEAF(cp
->page
))
709 pgno
= GET_BINTERNAL(cp
->page
, 0)->pgno
;
713 cp
->pgno
= cp
->page
->pgno
;
715 cp
->dpgno
= PGNO_INVALID
;
717 /* If it's an empty page or a deleted record, go to the next one. */
718 if (NUM_ENT(cp
->page
) == 0 ||
719 B_DISSET(GET_BKEYDATA(cp
->page
, cp
->indx
+ O_INDX
)->type
))
720 if ((ret
= __bam_c_next(dbp
, cp
, 0)) != 0)
723 /* If it's a duplicate reference, go to the first entry. */
724 if ((ret
= __bam_ovfl_chk(dbp
, cp
, O_INDX
, 0)) != 0)
727 /* If it's a deleted record, go to the next one. */
728 if (cp
->dpgno
!= PGNO_INVALID
&&
729 B_DISSET(GET_BKEYDATA(cp
->page
, cp
->dindx
)->type
))
730 if ((ret
= __bam_c_next(dbp
, cp
, 0)) != 0)
737 * Return the last record.
740 __bam_c_last(dbp
, cp
)
747 /* Walk down the right-hand side of the tree. */
748 for (pgno
= PGNO_ROOT
;;) {
750 __bam_lget(dbp
, 0, pgno
, DB_LOCK_READ
, &cp
->lock
)) != 0)
752 if ((ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0)) != 0)
755 /* If we find a leaf page, we're done. */
756 if (ISLEAF(cp
->page
))
760 GET_BINTERNAL(cp
->page
, NUM_ENT(cp
->page
) - O_INDX
)->pgno
;
764 cp
->pgno
= cp
->page
->pgno
;
765 cp
->indx
= NUM_ENT(cp
->page
) == 0 ? 0 : NUM_ENT(cp
->page
) - P_INDX
;
766 cp
->dpgno
= PGNO_INVALID
;
768 /* If it's an empty page or a deleted record, go to the previous one. */
769 if (NUM_ENT(cp
->page
) == 0 ||
770 B_DISSET(GET_BKEYDATA(cp
->page
, cp
->indx
+ O_INDX
)->type
))
771 if ((ret
= __bam_c_prev(dbp
, cp
)) != 0)
774 /* If it's a duplicate reference, go to the last entry. */
775 if ((ret
= __bam_ovfl_chk(dbp
, cp
, cp
->indx
+ O_INDX
, 1)) != 0)
778 /* If it's a deleted record, go to the previous one. */
779 if (cp
->dpgno
!= PGNO_INVALID
&&
780 B_DISSET(GET_BKEYDATA(cp
->page
, cp
->dindx
)->type
))
781 if ((ret
= __bam_c_prev(dbp
, cp
)) != 0)
788 * Move to the next record.
791 __bam_c_next(dbp
, cp
, initial_move
)
796 db_indx_t adjust
, indx
;
801 * We're either moving through a page of duplicates or a btree leaf
804 if (cp
->dpgno
== PGNO_INVALID
) {
805 adjust
= dbp
->type
== DB_BTREE
? P_INDX
: O_INDX
;
813 if (cp
->page
== NULL
) {
815 __bam_lget(dbp
, 0, pgno
, DB_LOCK_READ
, &cp
->lock
)) != 0)
817 if ((ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0)) != 0)
822 * If at the end of the page, move to a subsequent page.
825 * Check for >= NUM_ENT. If we're here as the result of a search that
826 * landed us on NUM_ENT, we'll increment indx before we test.
829 * This code handles empty pages and pages with only deleted entries.
834 if (indx
>= NUM_ENT(cp
->page
)) {
835 pgno
= cp
->page
->next_pgno
;
839 * If we're in a btree leaf page, we've reached the end
840 * of the tree. If we've reached the end of a page of
841 * duplicates, continue from the btree leaf page where
842 * we found this page of duplicates.
844 if (pgno
== PGNO_INVALID
) {
845 /* If in a btree leaf page, it's EOF. */
846 if (cp
->dpgno
== PGNO_INVALID
)
847 return (DB_NOTFOUND
);
849 /* Continue from the last btree leaf page. */
850 cp
->dpgno
= PGNO_INVALID
;
854 indx
= cp
->indx
+ P_INDX
;
858 if ((ret
= __bam_lget(dbp
,
859 0, pgno
, DB_LOCK_READ
, &cp
->lock
)) != 0)
861 if ((ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0)) != 0)
866 /* Ignore deleted records. */
867 if (dbp
->type
== DB_BTREE
&&
868 ((cp
->dpgno
== PGNO_INVALID
&&
869 B_DISSET(GET_BKEYDATA(cp
->page
, indx
+ O_INDX
)->type
)) ||
870 (cp
->dpgno
!= PGNO_INVALID
&&
871 B_DISSET(GET_BKEYDATA(cp
->page
, indx
)->type
)))) {
877 * If we're not in a duplicates page, check to see if we've
878 * found a page of duplicates, in which case we move to the
881 if (cp
->dpgno
== PGNO_INVALID
) {
882 cp
->pgno
= cp
->page
->pgno
;
886 __bam_ovfl_chk(dbp
, cp
, indx
+ O_INDX
, 0)) != 0)
888 if (cp
->dpgno
!= PGNO_INVALID
) {
894 cp
->dpgno
= cp
->page
->pgno
;
904 * Move to the previous record.
907 __bam_c_prev(dbp
, cp
)
911 db_indx_t indx
, adjust
;
916 * We're either moving through a page of duplicates or a btree leaf
919 if (cp
->dpgno
== PGNO_INVALID
) {
920 adjust
= dbp
->type
== DB_BTREE
? P_INDX
: O_INDX
;
928 if (cp
->page
== NULL
) {
930 __bam_lget(dbp
, 0, pgno
, DB_LOCK_READ
, &cp
->lock
)) != 0)
932 if ((ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0)) != 0)
937 * If at the beginning of the page, move to any previous one.
940 * This code handles empty pages and pages with only deleted entries.
944 pgno
= cp
->page
->prev_pgno
;
948 * If we're in a btree leaf page, we've reached the
949 * beginning of the tree. If we've reached the first
950 * of a page of duplicates, continue from the btree
951 * leaf page where we found this page of duplicates.
953 if (pgno
== PGNO_INVALID
) {
954 /* If in a btree leaf page, it's SOF. */
955 if (cp
->dpgno
== PGNO_INVALID
)
956 return (DB_NOTFOUND
);
958 /* Continue from the last btree leaf page. */
959 cp
->dpgno
= PGNO_INVALID
;
968 if ((ret
= __bam_lget(dbp
,
969 0, pgno
, DB_LOCK_READ
, &cp
->lock
)) != 0)
971 if ((ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0)) != 0)
975 indx
= NUM_ENT(cp
->page
);
980 /* Ignore deleted records. */
982 if (dbp
->type
== DB_BTREE
&&
983 ((cp
->dpgno
== PGNO_INVALID
&&
984 B_DISSET(GET_BKEYDATA(cp
->page
, indx
+ O_INDX
)->type
)) ||
985 (cp
->dpgno
!= PGNO_INVALID
&&
986 B_DISSET(GET_BKEYDATA(cp
->page
, indx
)->type
))))
990 * If we're not in a duplicates page, check to see if we've
991 * found a page of duplicates, in which case we move to the
994 if (cp
->dpgno
== PGNO_INVALID
) {
995 cp
->pgno
= cp
->page
->pgno
;
999 __bam_ovfl_chk(dbp
, cp
, indx
+ O_INDX
, 1)) != 0)
1001 if (cp
->dpgno
!= PGNO_INVALID
) {
1002 indx
= cp
->dindx
+ O_INDX
;
1007 cp
->dpgno
= cp
->page
->pgno
;
1017 * Move to a specified record.
1020 __bam_c_search(dbp
, cp
, key
, flags
, isrecno
, exactp
)
1025 int isrecno
, *exactp
;
1032 needexact
= *exactp
;
1035 * Find any matching record; the search function pins the page. Make
1036 * sure it's a valid key (__bam_search may return an index just past
1037 * the end of a page) and return it.
1040 if ((ret
= __ram_getno(dbp
, key
, &recno
, 0)) != 0)
1042 ret
= __bam_rsearch(dbp
, &recno
, flags
, 1, exactp
);
1044 ret
= __bam_search(dbp
, key
, flags
, 1, NULL
, exactp
);
1048 cp
->page
= t
->bt_csp
->page
;
1049 cp
->pgno
= cp
->page
->pgno
;
1050 cp
->indx
= t
->bt_csp
->indx
;
1051 cp
->lock
= t
->bt_csp
->lock
;
1052 cp
->dpgno
= PGNO_INVALID
;
1055 * If we have an exact match, make sure that we're not looking at a
1056 * chain of duplicates -- if so, move to an entry in that chain.
1059 if ((ret
= __bam_ovfl_chk(dbp
,
1060 cp
, cp
->indx
+ O_INDX
, LF_ISSET(S_DUPLAST
))) != 0)
1064 return (DB_NOTFOUND
);
1066 /* If past the end of a page, find the next entry. */
1067 if (cp
->indx
== NUM_ENT(cp
->page
) &&
1068 (ret
= __bam_c_next(dbp
, cp
, 0)) != 0)
1071 /* If it's a deleted record, go to the next or previous one. */
1072 if (cp
->dpgno
!= PGNO_INVALID
&&
1073 B_DISSET(GET_BKEYDATA(cp
->page
, cp
->dindx
)->type
)) {
1074 if (flags
== S_KEYLAST
) {
1075 if ((ret
= __bam_c_prev(dbp
, cp
)) != 0)
1078 if ((ret
= __bam_c_next(dbp
, cp
, 0)) != 0)
1082 * If we don't specify an exact match (the DB_KEYFIRST/DB_KEYLAST or
1083 * DB_SET_RANGE flags were set) __bam_search() may return a deleted
1084 * item. For DB_KEYFIRST/DB_KEYLAST, we don't care since we're only
1085 * using it for a tree position. For DB_SET_RANGE, we're returning
1086 * the key, so we have to adjust it.
1088 if (LF_ISSET(S_DELNO
) && cp
->dpgno
== PGNO_INVALID
&&
1089 B_DISSET(GET_BKEYDATA(cp
->page
, cp
->indx
+ O_INDX
)->type
))
1090 if ((ret
= __bam_c_next(dbp
, cp
, 0)) != 0)
1098 * Check for an overflow record, and if found, move to the correct
1101 * PUBLIC: int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int));
1104 __bam_ovfl_chk(dbp
, cp
, indx
, to_end
)
1114 /* Check for an overflow entry. */
1115 bo
= GET_BOVERFLOW(cp
->page
, indx
);
1116 if (B_TYPE(bo
->type
) != B_DUPLICATE
)
1120 * If we find one, go to the duplicates page, and optionally move
1121 * to the last record on that page.
1124 * We don't lock duplicates pages, we've already got the correct
1125 * lock on the main page.
1128 if ((ret
= memp_fput(dbp
->mpf
, cp
->page
, 0)) != 0)
1132 if ((ret
= __db_dend(dbp
, pgno
, &cp
->page
)) != 0)
1134 indx
= NUM_ENT(cp
->page
) - O_INDX
;
1136 if ((ret
= __bam_pget(dbp
, &cp
->page
, &pgno
, 0)) != 0)
1141 /* Update the duplicate entry in the cursor. */
1142 cp
->dpgno
= cp
->page
->pgno
;
1151 * Display the current btree cursor list.
1153 * PUBLIC: int __bam_cprint __P((DB *));
1163 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1164 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1165 cp
= (CURSOR
*)dbc
->internal
;
1167 "%#0x: page: %lu index: %lu dpage %lu dindex: %lu",
1168 (u_int
)cp
, (u_long
)cp
->pgno
, (u_long
)cp
->indx
,
1169 (u_long
)cp
->dpgno
, (u_long
)cp
->dindx
);
1170 if (F_ISSET(cp
, C_DELETED
))
1171 fprintf(stderr
, "(deleted)");
1172 fprintf(stderr
, "\n");
1174 CURSOR_TEARDOWN(dbp
);
1181 * __bam_ca_delete --
1182 * Check if any of the cursors refer to the item we are about to delete,
1183 * returning the number of cursors that refer to the item in question.
1185 * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int));
1188 __bam_ca_delete(dbp
, pgno
, indx
, curs
, key_delete
)
1197 int count
; /* !!!: Has to contain max number of cursors. */
1200 * Adjust the cursors. We don't have to review the cursors for any
1201 * process other than the current one, because we have the page write
1202 * locked at this point, and any other process had better be using a
1203 * different locker ID, meaning that only cursors in our process can
1206 * It's possible for multiple cursors within the thread to have write
1207 * locks on the same page, but, cursors within a thread must be single
1208 * threaded, so all we're locking here is the cursor linked list.
1211 for (count
= 0, dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1212 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1213 cp
= (CURSOR
*)dbc
->internal
;
1216 * Optionally, a cursor passed in is the one initiating the
1217 * delete, so we don't want to count it or set its deleted
1218 * flag. Otherwise, if a cursor refers to the item, then we
1219 * set its deleted flag.
1225 * If we're deleting the key itself and not just one of its
1226 * duplicates, repoint the cursor to the main-page key/data
1227 * pair, everything else is about to be discarded.
1229 if (key_delete
|| cp
->dpgno
== PGNO_INVALID
) {
1230 if (cp
->pgno
== pgno
&& cp
->indx
== indx
) {
1231 cp
->dpgno
= PGNO_INVALID
;
1233 F_SET(cp
, C_DELETED
);
1236 if (cp
->dpgno
== pgno
&& cp
->dindx
== indx
) {
1238 F_SET(cp
, C_DELETED
);
1241 CURSOR_TEARDOWN(dbp
);
1248 * Adjust the cursors during a delete or insert.
1250 * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int));
1253 __bam_ca_di(dbp
, pgno
, indx
, adjust
)
1262 /* Recno is responsible for its own adjustments. */
1263 if (dbp
->type
== DB_RECNO
)
1267 * Adjust the cursors. See the comment in __bam_ca_delete().
1270 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1271 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1272 cp
= (CURSOR
*)dbc
->internal
;
1273 if (cp
->pgno
== pgno
&& cp
->indx
>= indx
)
1275 if (cp
->dpgno
== pgno
&& cp
->dindx
>= indx
)
1276 cp
->dindx
+= adjust
;
1278 CURSOR_TEARDOWN(dbp
);
1283 * Adjust the cursors when moving data items to a duplicates page.
1285 * PUBLIC: void __bam_ca_dup __P((DB *,
1286 * PUBLIC: db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t));
1289 __bam_ca_dup(dbp
, fpgno
, first
, fi
, tpgno
, ti
)
1291 db_pgno_t fpgno
, tpgno
;
1292 u_int32_t first
, fi
, ti
;
1298 * Adjust the cursors. See the comment in __bam_ca_delete().
1300 * No need to test duplicates, this only gets called when moving
1301 * leaf page data items onto a duplicates page.
1304 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1305 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1306 cp
= (CURSOR
*)dbc
->internal
;
1308 * Ignore matching entries that have already been moved,
1309 * we move from the same location on the leaf page more
1312 if (cp
->dpgno
== PGNO_INVALID
&&
1313 cp
->pgno
== fpgno
&& cp
->indx
== fi
) {
1319 CURSOR_TEARDOWN(dbp
);
1324 * Adjust the cursors when moving data items to another page.
1326 * PUBLIC: void __bam_ca_move __P((DB *, db_pgno_t, db_pgno_t));
1329 __bam_ca_move(dbp
, fpgno
, tpgno
)
1331 db_pgno_t fpgno
, tpgno
;
1336 /* Recno is responsible for its own adjustments. */
1337 if (dbp
->type
== DB_RECNO
)
1341 * Adjust the cursors. See the comment in __bam_ca_delete().
1343 * No need to test duplicates, this only gets called when copying
1344 * over the root page with a leaf or internal page.
1347 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1348 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1349 cp
= (CURSOR
*)dbc
->internal
;
1350 if (cp
->pgno
== fpgno
)
1353 CURSOR_TEARDOWN(dbp
);
1357 * __bam_ca_replace --
1358 * Check if any of the cursors refer to the item we are about to replace.
1359 * If so, their flags should be changed from deleted to replaced.
1361 * PUBLIC: void __bam_ca_replace
1362 * PUBLIC: __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg));
1365 __bam_ca_replace(dbp
, pgno
, indx
, pass
)
1369 ca_replace_arg pass
;
1375 * Adjust the cursors. See the comment in __bam_ca_delete().
1377 * Find any cursors that have logically deleted a record we're about
1380 * Pass == REPLACE_SETUP:
1381 * Set C_REPLACE_SETUP so we can find the cursors again.
1383 * Pass == REPLACE_SUCCESS:
1384 * Clear C_DELETED and C_REPLACE_SETUP, set C_REPLACE, the
1385 * overwrite was successful.
1387 * Pass == REPLACE_FAILED:
1388 * Clear C_REPLACE_SETUP, the overwrite failed.
1390 * For REPLACE_SUCCESS and REPLACE_FAILED, we reset the indx value
1391 * for the cursor as it may have been changed by other cursor update
1392 * routines as the item was deleted/inserted.
1396 case REPLACE_SETUP
: /* Setup. */
1397 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1398 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1399 cp
= (CURSOR
*)dbc
->internal
;
1400 if ((cp
->pgno
== pgno
&& cp
->indx
== indx
) ||
1401 (cp
->dpgno
== pgno
&& cp
->dindx
== indx
))
1402 F_SET(cp
, C_REPLACE_SETUP
);
1405 case REPLACE_SUCCESS
: /* Overwrite succeeded. */
1406 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1407 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1408 cp
= (CURSOR
*)dbc
->internal
;
1409 if (F_ISSET(cp
, C_REPLACE_SETUP
)) {
1410 if (cp
->dpgno
== pgno
)
1412 if (cp
->pgno
== pgno
)
1414 F_SET(cp
, C_REPLACE
);
1415 F_CLR(cp
, C_DELETED
| C_REPLACE_SETUP
);
1419 case REPLACE_FAILED
: /* Overwrite failed. */
1420 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1421 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1422 cp
= (CURSOR
*)dbc
->internal
;
1423 if (F_ISSET(cp
, C_REPLACE_SETUP
)) {
1424 if (cp
->dpgno
== pgno
)
1426 if (cp
->pgno
== pgno
)
1428 F_CLR(cp
, C_REPLACE_SETUP
);
1433 CURSOR_TEARDOWN(dbp
);
1438 * Adjust the cursors when splitting a page.
1440 * PUBLIC: void __bam_ca_split __P((DB *,
1441 * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
1444 __bam_ca_split(dbp
, ppgno
, lpgno
, rpgno
, split_indx
, cleft
)
1446 db_pgno_t ppgno
, lpgno
, rpgno
;
1447 u_int32_t split_indx
;
1453 /* Recno is responsible for its own adjustments. */
1454 if (dbp
->type
== DB_RECNO
)
1458 * Adjust the cursors. See the comment in __bam_ca_delete().
1460 * If splitting the page that a cursor was on, the cursor has to be
1461 * adjusted to point to the same record as before the split. Most
1462 * of the time we don't adjust pointers to the left page, because
1463 * we're going to copy its contents back over the original page. If
1464 * the cursor is on the right page, it is decremented by the number of
1465 * records split to the left page.
1468 for (dbc
= TAILQ_FIRST(&dbp
->curs_queue
);
1469 dbc
!= NULL
; dbc
= TAILQ_NEXT(dbc
, links
)) {
1470 cp
= (CURSOR
*)dbc
->internal
;
1471 if (cp
->pgno
== ppgno
) {
1472 if (cp
->indx
< split_indx
) {
1477 cp
->indx
-= split_indx
;
1480 if (cp
->dpgno
== ppgno
) {
1481 if (cp
->dindx
< split_indx
) {
1486 cp
->dindx
-= split_indx
;
1490 CURSOR_TEARDOWN(dbp
);
1494 * __bam_c_physdel --
1495 * Actually do the cursor deletion.
1498 __bam_c_physdel(dbp
, cp
, h
)
1503 enum { DELETE_ITEM
, DELETE_PAGE
, NOTHING_FURTHER
} cmd
;
1509 db_pgno_t pgno
, next_pgno
, prev_pgno
;
1510 int delete_page
, local_page
, ret
;
1513 delete_page
= ret
= 0;
1515 /* Figure out what we're deleting. */
1516 if (cp
->dpgno
== PGNO_INVALID
) {
1525 * If the item is referenced by another cursor, leave it up to that
1526 * cursor to do the delete.
1528 if (__bam_ca_delete(dbp
, pgno
, indx
, cp
, 0) != 0)
1532 * If we don't already have the page locked, get it and delete the
1535 if ((h
== NULL
|| h
->pgno
!= pgno
)) {
1536 if ((ret
= __bam_lget(dbp
, 0, pgno
, DB_LOCK_WRITE
, &lock
)) != 0)
1538 if ((ret
= __bam_pget(dbp
, &h
, &pgno
, 0)) != 0)
1545 * If we're deleting a duplicate entry and there are other duplicate
1546 * entries remaining, call the common code to do the work and fix up
1547 * the parent page as necessary. Otherwise, do a normal btree delete.
1549 * There are 5 possible cases:
1551 * 1. It's not a duplicate item: do a normal btree delete.
1552 * 2. It's a duplicate item:
1553 * 2a: We delete an item from a page of duplicates, but there are
1554 * more items on the page.
1555 * 2b: We delete the last item from a page of duplicates, deleting
1556 * the last duplicate.
1557 * 2c: We delete the last item from a page of duplicates, but there
1558 * is a previous page of duplicates.
1559 * 2d: We delete the last item from a page of duplicates, but there
1560 * is a following page of duplicates.
1564 * 1: There's nothing further to do.
1565 * 2a: There's nothing further to do.
1566 * 2b: Do the normal btree delete instead of a duplicate delete, as
1567 * that deletes both the duplicate chain and the parent page's
1569 * 2c: There's nothing further to do.
1570 * 2d: Delete the duplicate, and update the parent page's entry.
1572 if (TYPE(h
) == P_DUPLICATE
) {
1574 prev_pgno
= PREV_PGNO(h
);
1575 next_pgno
= NEXT_PGNO(h
);
1577 if (NUM_ENT(h
) == 1 &&
1578 prev_pgno
== PGNO_INVALID
&& next_pgno
== PGNO_INVALID
)
1583 /* Delete the duplicate. */
1584 if ((ret
= __db_drem(dbp
, &h
, indx
, __bam_free
)) != 0)
1588 * 2a: h != NULL, h->pgno == pgno
1589 * 2b: We don't reach this clause, as the above test
1591 * 2c: h == NULL, prev_pgno != PGNO_INVALID
1592 * 2d: h != NULL, next_pgno != PGNO_INVALID
1594 * Test for 2a and 2c: if we didn't empty the current
1595 * page or there was a previous page of duplicates, we
1596 * don't need to touch the parent page.
1598 if ((h
!= NULL
&& pgno
== h
->pgno
) ||
1599 prev_pgno
!= PGNO_INVALID
)
1600 cmd
= NOTHING_FURTHER
;
1604 * Release any page we're holding and its lock.
1607 * If there is no subsequent page in the duplicate chain, then
1608 * __db_drem will have put page "h" and set it to NULL.
1612 (void)memp_fput(dbp
->mpf
, h
, 0);
1613 (void)__BT_TLPUT(dbp
, lock
);
1617 if (cmd
== NOTHING_FURTHER
)
1620 /* Acquire the parent page and switch the index to its entry. */
1622 __bam_lget(dbp
, 0, cp
->pgno
, DB_LOCK_WRITE
, &lock
)) != 0)
1624 if ((ret
= __bam_pget(dbp
, &h
, &cp
->pgno
, 0)) != 0) {
1625 (void)__BT_TLPUT(dbp
, lock
);
1631 if (cmd
== DELETE_PAGE
)
1635 * Copy, delete, update, add-back the parent page's data entry.
1638 * This may be a performance/logging problem. We should add a
1639 * log message which simply logs/updates a random set of bytes
1640 * on a page, and use it instead of doing a delete/add pair.
1643 bo
= *GET_BOVERFLOW(h
, indx
);
1644 (void)__db_ditem(dbp
, h
, indx
, BOVERFLOW_SIZE
);
1645 bo
.pgno
= next_pgno
;
1646 memset(&dbt
, 0, sizeof(dbt
));
1648 dbt
.size
= BOVERFLOW_SIZE
;
1649 (void)__db_pitem(dbp
, h
, indx
, BOVERFLOW_SIZE
, &dbt
, NULL
);
1650 (void)memp_fset(dbp
->mpf
, h
, DB_MPOOL_DIRTY
);
1655 * If the page is going to be emptied, delete it. To delete a leaf
1656 * page we need a copy of a key from the page. We use the 0th page
1657 * index since it's the last key that the page held.
1659 * We malloc the page information instead of using the return key/data
1660 * memory because we've already set them -- the reason we've already
1661 * set them is because we're (potentially) about to do a reverse split,
1662 * which would make our saved page information useless.
1665 * The following operations to delete a page might deadlock. I think
1666 * that's OK. The problem is if we're deleting an item because we're
1667 * closing cursors because we've already deadlocked and want to call
1668 * txn_abort(). If we fail due to deadlock, we leave a locked empty
1669 * page in the tree, which won't be empty long because we're going to
1672 if (NUM_ENT(h
) == 2 && h
->pgno
!= PGNO_ROOT
) {
1673 memset(&dbt
, 0, sizeof(DBT
));
1674 dbt
.flags
= DB_DBT_MALLOC
| DB_DBT_INTERNAL
;
1675 if ((ret
= __db_ret(dbp
, h
, 0, &dbt
, NULL
, NULL
)) != 0)
1681 * Do a normal btree delete.
1684 * Delete the key item first, otherwise the duplicate checks in
1685 * __bam_ditem() won't work!
1687 if ((ret
= __bam_ditem(dbp
, h
, indx
)) != 0)
1689 if ((ret
= __bam_ditem(dbp
, h
, indx
)) != 0)
1692 /* Discard any remaining locks/pages. */
1694 (void)memp_fput(dbp
->mpf
, h
, 0);
1695 (void)__BT_TLPUT(dbp
, lock
);
1699 /* Delete the page if it was emptied. */
1701 ret
= __bam_dpage(dbp
, &dbt
);
1704 done
: if (delete_page
)
1705 __db_free(dbt
.data
);
1708 (void)memp_fput(dbp
->mpf
, h
, 0);
1709 (void)__BT_TLPUT(dbp
, lock
);
1713 ++t
->lstat
.bt_deleted
;
1718 * __bam_c_getstack --
1719 * Acquire a full stack for a cursor.
1722 __bam_c_getstack(dbp
, cp
)
1733 memset(&dbt
, 0, sizeof(DBT
));
1735 /* Get the page with the current item on it. */
1737 if ((ret
= __bam_pget(dbp
, &h
, &pgno
, 0)) != 0)
1740 /* Get a copy of a key from the page. */
1741 dbt
.flags
= DB_DBT_MALLOC
| DB_DBT_INTERNAL
;
1742 if ((ret
= __db_ret(dbp
, h
, 0, &dbt
, NULL
, NULL
)) != 0)
1745 /* Get a write-locked stack for that page. */
1747 ret
= __bam_search(dbp
, &dbt
, S_KEYFIRST
, 1, NULL
, &exact
);
1749 /* We no longer need the key or the page. */
1751 (void)memp_fput(dbp
->mpf
, h
, 0);
1752 if (dbt
.data
!= NULL
)
1753 __db_free(dbt
.data
);