wintest: add option to select the dns backend
[Samba/gebeck_regimport.git] / lib / ntdb / hash.c
blobad1196ecdea5284a6409889e57de684b1d05a2df
1 /*
2 Trivial Database 2: hash handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 #include "private.h"
19 #include <ccan/hash/hash.h>
21 /* Default hash function. */
22 uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
23 void *unused)
25 return hash_stable((const unsigned char *)key, length, seed);
28 uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
30 return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
33 static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
34 const struct ntdb_used_record *rec,
35 ntdb_off_t off,
36 const NTDB_DATA *key,
37 const char **rptr)
39 ntdb_bool_err ret = false;
40 const char *rkey;
42 if (rec_key_length(rec) != key->dsize) {
43 ntdb->stats.compare_wrong_keylen++;
44 return ret;
47 rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
48 key->dsize + rec_data_length(rec), false);
49 if (NTDB_PTR_IS_ERR(rkey)) {
50 return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
52 if (memcmp(rkey, key->dptr, key->dsize) == 0) {
53 if (rptr) {
54 *rptr = rkey;
55 } else {
56 ntdb_access_release(ntdb, rkey);
58 return true;
60 ntdb->stats.compare_wrong_keycmp++;
61 ntdb_access_release(ntdb, rkey);
62 return ret;
65 /* Does entry match? */
66 static ntdb_bool_err match(struct ntdb_context *ntdb,
67 uint32_t hash,
68 const NTDB_DATA *key,
69 ntdb_off_t val,
70 struct ntdb_used_record *rec,
71 const char **rptr,
72 const ntdb_off_t **mapped)
74 ntdb_off_t off;
75 enum NTDB_ERROR ecode;
77 ntdb->stats.compares++;
79 /* Top bits of offset == next bits of hash. */
80 if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
81 != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
82 ntdb->stats.compare_wrong_offsetbits++;
83 return false;
86 /* Unmap before we try to read actual record, which may cause expand */
87 if (mapped) {
88 ntdb_access_release(ntdb, *mapped);
89 *mapped = NULL;
92 off = val & NTDB_OFF_MASK;
93 ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
94 if (ecode != NTDB_SUCCESS) {
95 return (ntdb_bool_err)ecode;
98 return key_matches(ntdb, rec, off, key, rptr);
101 static bool is_chain(ntdb_off_t val)
103 return val & (1ULL << NTDB_OFF_CHAIN_BIT);
106 static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
108 return base + sizeof(struct ntdb_used_record)
109 + idx * sizeof(ntdb_off_t);
112 /* This is the core routine which searches the hashtable for an entry.
113 * On error, no locks are held and -ve is returned.
114 * Otherwise, hinfo is filled in.
115 * If not found, the return value is 0.
116 * If found, the return value is the offset, and *rec is the record. */
117 ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
118 NTDB_DATA key,
119 int ltype,
120 struct hash_info *h,
121 struct ntdb_used_record *rec,
122 const char **rptr)
124 ntdb_off_t off, val;
125 const ntdb_off_t *arr = NULL;
126 ntdb_len_t i;
127 bool found_empty;
128 enum NTDB_ERROR ecode;
129 struct ntdb_used_record chdr;
130 ntdb_bool_err berr;
132 h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
134 h->table = NTDB_HASH_OFFSET;
135 h->table_size = 1 << ntdb->hash_bits;
136 h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
137 h->old_val = 0;
139 ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
140 if (ecode != NTDB_SUCCESS) {
141 return NTDB_ERR_TO_OFF(ecode);
144 off = hbucket_off(h->table, h->bucket);
145 val = ntdb_read_off(ntdb, off);
146 if (NTDB_OFF_IS_ERR(val)) {
147 ecode = NTDB_OFF_TO_ERR(val);
148 goto fail;
151 /* Directly in hash table? */
152 if (!likely(is_chain(val))) {
153 if (val) {
154 berr = match(ntdb, h->h, &key, val, rec, rptr, NULL);
155 if (berr < 0) {
156 ecode = NTDB_OFF_TO_ERR(berr);
157 goto fail;
159 if (berr) {
160 return val & NTDB_OFF_MASK;
162 /* If you want to insert here, make a chain. */
163 h->old_val = val;
165 return 0;
168 /* Nope? Iterate through chain. */
169 h->table = val & NTDB_OFF_MASK;
171 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
172 if (ecode != NTDB_SUCCESS) {
173 goto fail;
176 if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
177 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
178 NTDB_LOG_ERROR,
179 "find_and_lock:"
180 " corrupt record %#x at %llu",
181 rec_magic(&chdr), (long long)off);
182 goto fail;
185 h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
187 found_empty = false;
188 for (i = 0; i < h->table_size; i++) {
189 /* Careful! match has to unmap this if we access a
190 * record (may cause mmap of database to move. */
191 if (!arr) {
192 arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
193 rec_data_length(&chdr), true);
194 if (NTDB_PTR_IS_ERR(arr)) {
195 ecode = NTDB_PTR_ERR(arr);
196 goto fail;
200 val = arr[i];
201 if (val == 0) {
202 if (!found_empty) {
203 h->bucket = i;
204 found_empty = true;
206 } else {
207 berr = match(ntdb, h->h, &key, val, rec, rptr, &arr);
208 if (berr < 0) {
209 ecode = NTDB_OFF_TO_ERR(berr);
210 if (arr) {
211 ntdb_access_release(ntdb, arr);
213 goto fail;
215 if (berr) {
216 /* We found it! */
217 h->bucket = i;
218 off = val & NTDB_OFF_MASK;
219 if (arr) {
220 ntdb_access_release(ntdb, arr);
222 return off;
226 if (!found_empty) {
227 /* Set to any non-zero value */
228 h->old_val = 1;
229 h->bucket = i;
232 if (arr) {
233 ntdb_access_release(ntdb, arr);
235 return 0;
237 fail:
238 ntdb_unlock_hash(ntdb, h->bucket, ltype);
239 return NTDB_ERR_TO_OFF(ecode);
242 static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
243 ntdb_off_t new_off, uint32_t hash)
245 ntdb_off_t extra;
247 assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
248 assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
249 /* We pack extra hash bits into the upper bits of the offset. */
250 extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
251 extra <<= (64 - NTDB_OFF_UPPER_STEAL);
253 return new_off | extra;
256 /* Simply overwrite the hash entry we found before. */
257 enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
258 const struct hash_info *h,
259 ntdb_off_t new_off)
261 return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
262 encode_offset(ntdb, new_off, h->h));
265 enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
266 const struct hash_info *h)
268 return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
272 enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
273 const struct hash_info *h,
274 ntdb_off_t new_off)
276 enum NTDB_ERROR ecode;
277 ntdb_off_t chain;
278 struct ntdb_used_record chdr;
279 const ntdb_off_t *old;
280 ntdb_off_t *new;
282 /* We hit an empty bucket during search? That's where it goes. */
283 if (!h->old_val) {
284 return replace_in_hash(ntdb, h, new_off);
287 /* Full at top-level? Create a 2-element chain. */
288 if (h->table == NTDB_HASH_OFFSET) {
289 ntdb_off_t pair[2];
291 /* One element is old value, the other is the new value. */
292 pair[0] = h->old_val;
293 pair[1] = encode_offset(ntdb, new_off, h->h);
295 chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
296 if (NTDB_OFF_IS_ERR(chain)) {
297 return NTDB_OFF_TO_ERR(chain);
299 ecode = ntdb_write_convert(ntdb,
300 chain
301 + sizeof(struct ntdb_used_record),
302 pair, sizeof(pair));
303 if (ecode == NTDB_SUCCESS) {
304 ecode = ntdb_write_off(ntdb,
305 hbucket_off(h->table, h->bucket),
306 chain
307 | (1ULL << NTDB_OFF_CHAIN_BIT));
309 return ecode;
312 /* Full bucket. Expand. */
313 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
314 if (ecode != NTDB_SUCCESS) {
315 return ecode;
318 if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
319 /* Expand in place. */
320 uint64_t dlen = rec_data_length(&chdr);
322 ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
323 dlen + sizeof(new_off),
324 dlen + rec_extra_padding(&chdr));
326 if (ecode != NTDB_SUCCESS) {
327 return ecode;
329 /* find_and_lock set up h to point to last bucket. */
330 ecode = replace_in_hash(ntdb, h, new_off);
331 if (ecode != NTDB_SUCCESS) {
332 return ecode;
334 ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
335 if (ecode != NTDB_SUCCESS) {
336 return ecode;
338 /* For futureproofing, we always make the first byte of padding
339 * a zero. */
340 if (rec_extra_padding(&chdr)) {
341 ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
342 + dlen + sizeof(new_off),
343 "", 1);
345 return ecode;
348 /* We need to reallocate the chain. */
349 chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
350 NTDB_CHAIN_MAGIC, true);
351 if (NTDB_OFF_IS_ERR(chain)) {
352 return NTDB_OFF_TO_ERR(chain);
355 /* Map both and copy across old buckets. */
356 old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
357 h->table_size*sizeof(ntdb_off_t), true);
358 if (NTDB_PTR_IS_ERR(old)) {
359 return NTDB_PTR_ERR(old);
361 new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
362 (h->table_size + 1)*sizeof(ntdb_off_t), true);
363 if (NTDB_PTR_IS_ERR(new)) {
364 ntdb_access_release(ntdb, old);
365 return NTDB_PTR_ERR(new);
368 memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
369 new[h->bucket] = encode_offset(ntdb, new_off, h->h);
370 ntdb_access_release(ntdb, old);
372 ecode = ntdb_access_commit(ntdb, new);
373 if (ecode != NTDB_SUCCESS) {
374 return ecode;
377 /* Free the old chain. */
378 ecode = add_free_record(ntdb, h->table,
379 sizeof(struct ntdb_used_record)
380 + rec_data_length(&chdr)
381 + rec_extra_padding(&chdr),
382 NTDB_LOCK_WAIT, true);
384 /* Replace top-level to point to new chain */
385 return ntdb_write_off(ntdb,
386 hbucket_off(NTDB_HASH_OFFSET,
387 bits_from(h->h, 0, ntdb->hash_bits)),
388 chain | (1ULL << NTDB_OFF_CHAIN_BIT));
391 /* Traverse support: returns offset of record, or 0 or -ve error. */
392 static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
393 ntdb_off_t val,
394 struct hash_info *h)
396 ntdb_off_t i;
397 enum NTDB_ERROR ecode;
398 struct ntdb_used_record chdr;
400 /* First load up chain header. */
401 h->table = val & NTDB_OFF_MASK;
402 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
403 if (ecode != NTDB_SUCCESS) {
404 return ecode;
407 if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
408 return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
409 NTDB_LOG_ERROR,
410 "get_table:"
411 " corrupt record %#x at %llu",
412 rec_magic(&chdr),
413 (long long)h->table);
416 /* Chain length is implied by data length. */
417 h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
419 i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
420 h->table_size);
421 if (NTDB_OFF_IS_ERR(i)) {
422 return i;
425 if (i != h->table_size) {
426 /* Return to next bucket. */
427 h->bucket = i + 1;
428 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
429 if (NTDB_OFF_IS_ERR(val)) {
430 return val;
432 return val & NTDB_OFF_MASK;
435 /* Go back up to hash table. */
436 h->table = NTDB_HASH_OFFSET;
437 h->table_size = 1 << ntdb->hash_bits;
438 h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
439 return 0;
442 /* Keeps hash locked unless returns 0 or error. */
443 static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
444 struct hash_info *h)
446 ntdb_off_t val, i;
447 enum NTDB_ERROR ecode;
449 if (h->table != NTDB_HASH_OFFSET) {
450 /* We're in a chain. */
451 i = bits_from(h->h, 0, ntdb->hash_bits);
452 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
453 if (ecode != NTDB_SUCCESS) {
454 return NTDB_ERR_TO_OFF(ecode);
457 /* We dropped lock, bucket might have moved! */
458 val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
459 if (NTDB_OFF_IS_ERR(val)) {
460 goto unlock;
463 /* We don't remove chains: there should still be one there! */
464 if (!val || !is_chain(val)) {
465 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
466 NTDB_LOG_ERROR,
467 "iterate_hash:"
468 " vanished hchain %llu at %llu",
469 (long long)val,
470 (long long)i);
471 val = NTDB_ERR_TO_OFF(ecode);
472 goto unlock;
475 /* Find next bucket in the chain. */
476 val = iterate_chain(ntdb, val, h);
477 if (NTDB_OFF_IS_ERR(val)) {
478 goto unlock;
480 if (val != 0) {
481 return val;
483 ntdb_unlock_hash(ntdb, i, F_RDLCK);
485 /* OK, we've reset h back to top level. */
488 /* We do this unlocked, then re-check. */
489 for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
490 h->bucket, h->table_size);
491 i != h->table_size;
492 i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
493 i+1, h->table_size)) {
494 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
495 if (ecode != NTDB_SUCCESS) {
496 return NTDB_ERR_TO_OFF(ecode);
499 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
500 if (NTDB_OFF_IS_ERR(val)) {
501 goto unlock;
504 /* Lost race, and it's empty? */
505 if (!val) {
506 ntdb->stats.traverse_val_vanished++;
507 ntdb_unlock_hash(ntdb, i, F_RDLCK);
508 continue;
511 if (!is_chain(val)) {
512 /* So caller knows what lock to free. */
513 h->h = i;
514 /* Return to next bucket. */
515 h->bucket = i + 1;
516 val &= NTDB_OFF_MASK;
517 return val;
520 /* Start at beginning of chain */
521 h->bucket = 0;
522 h->h = i;
524 val = iterate_chain(ntdb, val, h);
525 if (NTDB_OFF_IS_ERR(val)) {
526 goto unlock;
528 if (val != 0) {
529 return val;
532 /* Otherwise, bucket has been set to i+1 */
533 ntdb_unlock_hash(ntdb, i, F_RDLCK);
535 return 0;
537 unlock:
538 ntdb_unlock_hash(ntdb, i, F_RDLCK);
539 return val;
542 /* Return success if we find something, NTDB_ERR_NOEXIST if none. */
543 enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
544 struct hash_info *h,
545 NTDB_DATA *kbuf, size_t *dlen)
547 ntdb_off_t off;
548 struct ntdb_used_record rec;
549 enum NTDB_ERROR ecode;
551 off = lock_and_iterate_hash(ntdb, h);
553 if (NTDB_OFF_IS_ERR(off)) {
554 return NTDB_OFF_TO_ERR(off);
555 } else if (off == 0) {
556 return NTDB_ERR_NOEXIST;
559 /* The hash for this key is still locked. */
560 ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
561 if (ecode != NTDB_SUCCESS) {
562 goto unlock;
564 if (rec_magic(&rec) != NTDB_USED_MAGIC) {
565 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
566 NTDB_LOG_ERROR,
567 "next_in_hash:"
568 " corrupt record at %llu",
569 (long long)off);
570 goto unlock;
573 kbuf->dsize = rec_key_length(&rec);
575 /* They want data as well? */
576 if (dlen) {
577 *dlen = rec_data_length(&rec);
578 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
579 kbuf->dsize + *dlen);
580 } else {
581 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
582 kbuf->dsize);
584 if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
585 ecode = NTDB_PTR_ERR(kbuf->dptr);
586 goto unlock;
588 ecode = NTDB_SUCCESS;
590 unlock:
591 ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
592 return ecode;
596 enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
597 struct hash_info *h,
598 NTDB_DATA *kbuf, size_t *dlen)
600 h->table = NTDB_HASH_OFFSET;
601 h->table_size = 1 << ntdb->hash_bits;
602 h->bucket = 0;
604 return next_in_hash(ntdb, h, kbuf, dlen);
607 /* Even if the entry isn't in this hash bucket, you'd have to lock this
608 * bucket to find it. */
609 static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
610 const NTDB_DATA *key, int ltype)
612 uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
614 return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
617 /* lock/unlock one hash chain. This is meant to be used to reduce
618 contention - it cannot guarantee how many records will be locked */
619 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
621 return chainlock(ntdb, &key, F_WRLCK);
624 _PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
626 uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
628 ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
631 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
632 NTDB_DATA key)
634 return chainlock(ntdb, &key, F_RDLCK);
637 _PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
639 uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
641 ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);