Update.
[glibc.git] / db2 / btree / bt_search.c
blob09ce46d90ac601296e1bc8f80bf653995d9fcbd7
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software. All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995, 1996
 *	Keith Bostic. All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Mike Olson.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
47 #include "config.h"
49 #ifndef lint
50 static const char sccsid[] = "@(#)bt_search.c 10.15 (Sleepycat) 5/6/98";
51 #endif /* not lint */
53 #ifndef NO_SYSTEM_INCLUDES
54 #include <sys/types.h>
56 #include <errno.h>
57 #include <string.h>
58 #endif
60 #include "db_int.h"
61 #include "db_page.h"
62 #include "btree.h"
65 * __bam_search --
66 * Search a btree for a key.
68 * PUBLIC: int __bam_search __P((DB *,
69 * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *));
71 int
72 __bam_search(dbp, key, flags, stop, recnop, exactp)
73 DB *dbp;
74 const DBT *key;
75 u_int32_t flags;
76 int stop, *exactp;
77 db_recno_t *recnop;
79 BTREE *t;
80 DB_LOCK lock;
81 EPG cur;
82 PAGE *h;
83 db_indx_t base, i, indx, lim;
84 db_pgno_t pg;
85 db_recno_t recno;
86 int cmp, jump, ret, stack;
88 t = dbp->internal;
89 recno = 0;
91 BT_STK_CLR(t);
94 * There are several ways we search a btree tree. The flags argument
95 * specifies if we're acquiring read or write locks, if we position
96 * to the first or last item in a set of duplicates, if we return
97 * deleted items, and if we are locking pairs of pages. See btree.h
98 * for more details. In addition, if we're doing record numbers, we
99 * have to lock the entire tree regardless.
101 * If write-locking pages, we need to know whether or not to acquire a
102 * write lock on a page before getting it. This depends on how deep it
103 * is in tree, which we don't know until we acquire the root page. So,
104 * if we need to lock the root page we may have to upgrade it later,
105 * because we won't get the correct lock initially.
107 * Retrieve the root page.
109 pg = PGNO_ROOT;
110 stack = F_ISSET(dbp, DB_BT_RECNUM) && LF_ISSET(S_STACK);
111 if ((ret = __bam_lget(dbp,
112 0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
113 return (ret);
114 if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
115 (void)__BT_LPUT(dbp, lock);
116 return (ret);
120 * Decide if we need to save this page; if we do, write lock it.
121 * We deliberately don't lock-couple on this call. If the tree
122 * is tiny, i.e., one page, and two threads are busily updating
123 * the root page, we're almost guaranteed deadlocks galore, as
124 * each one gets a read lock and then blocks the other's attempt
125 * for a write lock.
127 if (!stack &&
128 ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
129 (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
130 (void)memp_fput(dbp->mpf, h, 0);
131 (void)__BT_LPUT(dbp, lock);
132 if ((ret = __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
133 return (ret);
134 if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) {
135 (void)__BT_LPUT(dbp, lock);
136 return (ret);
139 stack = 1;
142 for (;;) {
144 * Do a binary search on the current page. If we're searching
145 * a leaf page, we have to manipulate the indices in groups of
146 * two. If we're searching an internal page, they're an index
147 * per page item. If we find an exact match on a leaf page,
148 * we're done.
150 cur.page = h;
151 jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
152 for (base = 0,
153 lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) {
154 cur.indx = indx = base + ((lim >> 1) * jump);
155 if ((cmp = __bam_cmp(dbp, key, &cur)) == 0) {
156 if (TYPE(h) == P_LBTREE)
157 goto match;
158 goto next;
160 if (cmp > 0) {
161 base = indx + jump;
162 --lim;
167 * No match found. Base is the smallest index greater than
168 * key and may be zero or a last + O_INDX index.
170 * If it's a leaf page, return base as the "found" value.
171 * Delete only deletes exact matches.
173 if (TYPE(h) == P_LBTREE) {
174 *exactp = 0;
176 if (LF_ISSET(S_EXACT))
177 goto notfound;
180 * !!!
181 * Possibly returning a deleted record -- DB_SET_RANGE,
182 * DB_KEYFIRST and DB_KEYLAST don't require an exact
183 * match, and we don't want to walk multiple pages here
184 * to find an undeleted record. This is handled in the
185 * __bam_c_search() routine.
187 BT_STK_ENTER(t, h, base, lock, ret);
188 return (ret);
192 * If it's not a leaf page, record the internal page (which is
193 * a parent page for the key). Decrement the base by 1 if it's
194 * non-zero so that if a split later occurs, the inserted page
195 * will be to the right of the saved page.
197 indx = base > 0 ? base - O_INDX : base;
200 * If we're trying to calculate the record number, sum up
201 * all the record numbers on this page up to the indx point.
203 if (recnop != NULL)
204 for (i = 0; i < indx; ++i)
205 recno += GET_BINTERNAL(h, i)->nrecs;
207 next: pg = GET_BINTERNAL(h, indx)->pgno;
208 if (stack) {
209 /* Return if this is the lowest page wanted. */
210 if (LF_ISSET(S_PARENT) && stop == h->level) {
211 BT_STK_ENTER(t, h, indx, lock, ret);
212 return (ret);
214 BT_STK_PUSH(t, h, indx, lock, ret);
215 if (ret != 0)
216 goto err;
218 if ((ret =
219 __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0)
220 goto err;
221 } else {
222 (void)memp_fput(dbp->mpf, h, 0);
225 * Decide if we want to return a pointer to the next
226 * page in the stack. If we do, write lock it and
227 * never unlock it.
229 if ((LF_ISSET(S_PARENT) &&
230 (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
231 (h->level - 1) == LEAFLEVEL)
232 stack = 1;
234 if ((ret =
235 __bam_lget(dbp, 1, pg, stack && LF_ISSET(S_WRITE) ?
236 DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0)
237 goto err;
239 if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0)
240 goto err;
243 /* NOTREACHED */
244 match: *exactp = 1;
247 * If we're trying to calculate the record number, add in the
248 * offset on this page and correct for the fact that records
249 * in the tree are 0-based.
251 if (recnop != NULL)
252 *recnop = recno + (indx / P_INDX) + 1;
255 * If we got here, we know that we have a btree leaf page.
257 * If there are duplicates, go to the first/last one. This is
258 * safe because we know that we're not going to leave the page,
259 * all duplicate sets that are not on overflow pages exist on a
260 * single leaf page.
262 if (LF_ISSET(S_DUPLAST))
263 while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
264 h->inp[indx] == h->inp[indx + P_INDX])
265 indx += P_INDX;
266 else
267 while (indx > 0 &&
268 h->inp[indx] == h->inp[indx - P_INDX])
269 indx -= P_INDX;
272 * Now check if we are allowed to return deleted items; if not
273 * find the next (or previous) non-deleted item.
275 if (LF_ISSET(S_DELNO)) {
276 if (LF_ISSET(S_DUPLAST))
277 while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) &&
278 indx > 0 &&
279 h->inp[indx] == h->inp[indx - P_INDX])
280 indx -= P_INDX;
281 else
282 while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) &&
283 indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
284 h->inp[indx] == h->inp[indx + P_INDX])
285 indx += P_INDX;
287 if (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type))
288 goto notfound;
291 BT_STK_ENTER(t, h, indx, lock, ret);
292 return (ret);
294 notfound:
295 (void)memp_fput(dbp->mpf, h, 0);
296 (void)__BT_LPUT(dbp, lock);
297 ret = DB_NOTFOUND;
299 err: if (t->bt_csp > t->bt_sp) {
300 BT_STK_POP(t);
301 __bam_stkrel(dbp);
303 return (ret);
307 * __bam_stkrel --
308 * Release all pages currently held in the stack.
310 * PUBLIC: int __bam_stkrel __P((DB *));
313 __bam_stkrel(dbp)
314 DB *dbp;
316 BTREE *t;
317 EPG *epg;
319 t = dbp->internal;
320 for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) {
321 (void)memp_fput(dbp->mpf, epg->page, 0);
322 (void)__BT_TLPUT(dbp, epg->lock);
324 return (0);
328 * __bam_stkgrow --
329 * Grow the stack.
331 * PUBLIC: int __bam_stkgrow __P((BTREE *));
334 __bam_stkgrow(t)
335 BTREE *t;
337 EPG *p;
338 size_t entries;
340 entries = t->bt_esp - t->bt_sp;
342 if ((p = (EPG *)__db_calloc(entries * 2, sizeof(EPG))) == NULL)
343 return (ENOMEM);
344 memcpy(p, t->bt_sp, entries * sizeof(EPG));
345 if (t->bt_sp != t->bt_stack)
346 FREE(t->bt_sp, entries * sizeof(EPG));
347 t->bt_sp = p;
348 t->bt_csp = p + entries;
349 t->bt_esp = p + entries * 2;
350 return (0);