s3:net_idmap_delete do not lock two records at the same time
[Samba/gebeck_regimport.git] / lib / ntdb / hash.c
blobb223668dbb33e8879b175585f195b5be1c97cb38
1 /*
2 Trivial Database 2: hash handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 #include "private.h"
19 #include <ccan/hash/hash.h>
21 /* Default hash function. */
22 uint32_t ntdb_jenkins_hash(const void *key, size_t length, uint32_t seed,
23 void *unused)
25 return hash_stable((const unsigned char *)key, length, seed);
28 uint32_t ntdb_hash(struct ntdb_context *ntdb, const void *ptr, size_t len)
30 return ntdb->hash_fn(ptr, len, ntdb->hash_seed, ntdb->hash_data);
33 static ntdb_bool_err key_matches(struct ntdb_context *ntdb,
34 const struct ntdb_used_record *rec,
35 ntdb_off_t off,
36 const NTDB_DATA *key,
37 const char **rptr)
39 ntdb_bool_err ret = false;
40 const char *rkey;
42 if (rec_key_length(rec) != key->dsize) {
43 ntdb->stats.compare_wrong_keylen++;
44 return ret;
47 rkey = ntdb_access_read(ntdb, off + sizeof(*rec),
48 key->dsize + rec_data_length(rec), false);
49 if (NTDB_PTR_IS_ERR(rkey)) {
50 return (ntdb_bool_err)NTDB_PTR_ERR(rkey);
52 if (memcmp(rkey, key->dptr, key->dsize) == 0) {
53 if (rptr) {
54 *rptr = rkey;
55 } else {
56 ntdb_access_release(ntdb, rkey);
58 return true;
60 ntdb->stats.compare_wrong_keycmp++;
61 ntdb_access_release(ntdb, rkey);
62 return ret;
65 /* Does entry match? */
66 static ntdb_bool_err match(struct ntdb_context *ntdb,
67 uint32_t hash,
68 const NTDB_DATA *key,
69 ntdb_off_t val,
70 struct ntdb_used_record *rec,
71 const char **rptr)
73 ntdb_off_t off;
74 enum NTDB_ERROR ecode;
76 ntdb->stats.compares++;
78 /* Top bits of offset == next bits of hash. */
79 if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
80 != bits_from(val, 64-NTDB_OFF_UPPER_STEAL, NTDB_OFF_UPPER_STEAL)) {
81 ntdb->stats.compare_wrong_offsetbits++;
82 return false;
85 off = val & NTDB_OFF_MASK;
86 ecode = ntdb_read_convert(ntdb, off, rec, sizeof(*rec));
87 if (ecode != NTDB_SUCCESS) {
88 return (ntdb_bool_err)ecode;
91 return key_matches(ntdb, rec, off, key, rptr);
94 static bool is_chain(ntdb_off_t val)
96 return val & (1ULL << NTDB_OFF_CHAIN_BIT);
99 static ntdb_off_t hbucket_off(ntdb_off_t base, ntdb_len_t idx)
101 return base + sizeof(struct ntdb_used_record)
102 + idx * sizeof(ntdb_off_t);
105 /* This is the core routine which searches the hashtable for an entry.
106 * On error, no locks are held and -ve is returned.
107 * Otherwise, hinfo is filled in.
108 * If not found, the return value is 0.
109 * If found, the return value is the offset, and *rec is the record. */
110 ntdb_off_t find_and_lock(struct ntdb_context *ntdb,
111 NTDB_DATA key,
112 int ltype,
113 struct hash_info *h,
114 struct ntdb_used_record *rec,
115 const char **rptr)
117 ntdb_off_t off, val;
118 const ntdb_off_t *arr = NULL;
119 ntdb_len_t i;
120 bool found_empty;
121 enum NTDB_ERROR ecode;
122 struct ntdb_used_record chdr;
123 ntdb_bool_err berr;
125 h->h = ntdb_hash(ntdb, key.dptr, key.dsize);
127 h->table = NTDB_HASH_OFFSET;
128 h->table_size = 1 << ntdb->hash_bits;
129 h->bucket = bits_from(h->h, 0, ntdb->hash_bits);
130 h->old_val = 0;
132 ecode = ntdb_lock_hash(ntdb, h->bucket, ltype);
133 if (ecode != NTDB_SUCCESS) {
134 return NTDB_ERR_TO_OFF(ecode);
137 off = hbucket_off(h->table, h->bucket);
138 val = ntdb_read_off(ntdb, off);
139 if (NTDB_OFF_IS_ERR(val)) {
140 ecode = NTDB_OFF_TO_ERR(val);
141 goto fail;
144 /* Directly in hash table? */
145 if (!likely(is_chain(val))) {
146 if (val) {
147 berr = match(ntdb, h->h, &key, val, rec, rptr);
148 if (berr < 0) {
149 ecode = NTDB_OFF_TO_ERR(berr);
150 goto fail;
152 if (berr) {
153 return val & NTDB_OFF_MASK;
155 /* If you want to insert here, make a chain. */
156 h->old_val = val;
158 return 0;
161 /* Nope? Iterate through chain. */
162 h->table = val & NTDB_OFF_MASK;
164 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
165 if (ecode != NTDB_SUCCESS) {
166 goto fail;
169 if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
170 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
171 NTDB_LOG_ERROR,
172 "find_and_lock:"
173 " corrupt record %#x at %llu",
174 rec_magic(&chdr), (long long)off);
175 goto fail;
178 h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
180 arr = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
181 rec_data_length(&chdr), true);
182 if (NTDB_PTR_IS_ERR(arr)) {
183 ecode = NTDB_PTR_ERR(arr);
184 goto fail;
187 found_empty = false;
188 for (i = 0; i < h->table_size; i++) {
189 if (arr[i] == 0) {
190 if (!found_empty) {
191 h->bucket = i;
192 found_empty = true;
194 } else {
195 berr = match(ntdb, h->h, &key, arr[i], rec, rptr);
196 if (berr < 0) {
197 ecode = NTDB_OFF_TO_ERR(berr);
198 ntdb_access_release(ntdb, arr);
199 goto fail;
201 if (berr) {
202 /* We found it! */
203 h->bucket = i;
204 off = arr[i] & NTDB_OFF_MASK;
205 ntdb_access_release(ntdb, arr);
206 return off;
210 if (!found_empty) {
211 /* Set to any non-zero value */
212 h->old_val = 1;
213 h->bucket = i;
216 ntdb_access_release(ntdb, arr);
217 return 0;
219 fail:
220 ntdb_unlock_hash(ntdb, h->bucket, ltype);
221 return NTDB_ERR_TO_OFF(ecode);
224 static ntdb_off_t encode_offset(const struct ntdb_context *ntdb,
225 ntdb_off_t new_off, uint32_t hash)
227 ntdb_off_t extra;
229 assert((new_off & (1ULL << NTDB_OFF_CHAIN_BIT)) == 0);
230 assert((new_off >> (64 - NTDB_OFF_UPPER_STEAL)) == 0);
231 /* We pack extra hash bits into the upper bits of the offset. */
232 extra = bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL);
233 extra <<= (64 - NTDB_OFF_UPPER_STEAL);
235 return new_off | extra;
238 /* Simply overwrite the hash entry we found before. */
239 enum NTDB_ERROR replace_in_hash(struct ntdb_context *ntdb,
240 const struct hash_info *h,
241 ntdb_off_t new_off)
243 return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket),
244 encode_offset(ntdb, new_off, h->h));
247 enum NTDB_ERROR delete_from_hash(struct ntdb_context *ntdb,
248 const struct hash_info *h)
250 return ntdb_write_off(ntdb, hbucket_off(h->table, h->bucket), 0);
254 enum NTDB_ERROR add_to_hash(struct ntdb_context *ntdb,
255 const struct hash_info *h,
256 ntdb_off_t new_off)
258 enum NTDB_ERROR ecode;
259 ntdb_off_t chain;
260 struct ntdb_used_record chdr;
261 const ntdb_off_t *old;
262 ntdb_off_t *new;
264 /* We hit an empty bucket during search? That's where it goes. */
265 if (!h->old_val) {
266 return replace_in_hash(ntdb, h, new_off);
269 /* Full at top-level? Create a 2-element chain. */
270 if (h->table == NTDB_HASH_OFFSET) {
271 ntdb_off_t pair[2];
273 /* One element is old value, the other is the new value. */
274 pair[0] = h->old_val;
275 pair[1] = encode_offset(ntdb, new_off, h->h);
277 chain = alloc(ntdb, 0, sizeof(pair), NTDB_CHAIN_MAGIC, true);
278 if (NTDB_OFF_IS_ERR(chain)) {
279 return NTDB_OFF_TO_ERR(chain);
281 ecode = ntdb_write_convert(ntdb,
282 chain
283 + sizeof(struct ntdb_used_record),
284 pair, sizeof(pair));
285 if (ecode == NTDB_SUCCESS) {
286 ecode = ntdb_write_off(ntdb,
287 hbucket_off(h->table, h->bucket),
288 chain
289 | (1ULL << NTDB_OFF_CHAIN_BIT));
291 return ecode;
294 /* Full bucket. Expand. */
295 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
296 if (ecode != NTDB_SUCCESS) {
297 return ecode;
300 if (rec_extra_padding(&chdr) >= sizeof(new_off)) {
301 /* Expand in place. */
302 uint64_t dlen = rec_data_length(&chdr);
304 ecode = set_header(ntdb, &chdr, NTDB_CHAIN_MAGIC, 0,
305 dlen + sizeof(new_off),
306 dlen + rec_extra_padding(&chdr));
308 if (ecode != NTDB_SUCCESS) {
309 return ecode;
311 /* find_and_lock set up h to point to last bucket. */
312 ecode = replace_in_hash(ntdb, h, new_off);
313 if (ecode != NTDB_SUCCESS) {
314 return ecode;
316 ecode = ntdb_write_convert(ntdb, h->table, &chdr, sizeof(chdr));
317 if (ecode != NTDB_SUCCESS) {
318 return ecode;
320 /* For futureproofing, we always make the first byte of padding
321 * a zero. */
322 if (rec_extra_padding(&chdr)) {
323 ecode = ntdb->io->twrite(ntdb, h->table + sizeof(chdr)
324 + dlen + sizeof(new_off),
325 "", 1);
327 return ecode;
330 /* We need to reallocate the chain. */
331 chain = alloc(ntdb, 0, (h->table_size + 1) * sizeof(ntdb_off_t),
332 NTDB_CHAIN_MAGIC, true);
333 if (NTDB_OFF_IS_ERR(chain)) {
334 return NTDB_OFF_TO_ERR(chain);
337 /* Map both and copy across old buckets. */
338 old = ntdb_access_read(ntdb, hbucket_off(h->table, 0),
339 h->table_size*sizeof(ntdb_off_t), true);
340 if (NTDB_PTR_IS_ERR(old)) {
341 return NTDB_PTR_ERR(old);
343 new = ntdb_access_write(ntdb, hbucket_off(chain, 0),
344 (h->table_size + 1)*sizeof(ntdb_off_t), true);
345 if (NTDB_PTR_IS_ERR(new)) {
346 ntdb_access_release(ntdb, old);
347 return NTDB_PTR_ERR(new);
350 memcpy(new, old, h->bucket * sizeof(ntdb_off_t));
351 new[h->bucket] = encode_offset(ntdb, new_off, h->h);
352 ntdb_access_release(ntdb, old);
354 ecode = ntdb_access_commit(ntdb, new);
355 if (ecode != NTDB_SUCCESS) {
356 return ecode;
359 /* Free the old chain. */
360 ecode = add_free_record(ntdb, h->table,
361 sizeof(struct ntdb_used_record)
362 + rec_data_length(&chdr)
363 + rec_extra_padding(&chdr),
364 NTDB_LOCK_WAIT, true);
366 /* Replace top-level to point to new chain */
367 return ntdb_write_off(ntdb,
368 hbucket_off(NTDB_HASH_OFFSET,
369 bits_from(h->h, 0, ntdb->hash_bits)),
370 chain | (1ULL << NTDB_OFF_CHAIN_BIT));
373 /* Traverse support: returns offset of record, or 0 or -ve error. */
374 static ntdb_off_t iterate_chain(struct ntdb_context *ntdb,
375 ntdb_off_t val,
376 struct hash_info *h)
378 ntdb_off_t i;
379 enum NTDB_ERROR ecode;
380 struct ntdb_used_record chdr;
382 /* First load up chain header. */
383 h->table = val & NTDB_OFF_MASK;
384 ecode = ntdb_read_convert(ntdb, h->table, &chdr, sizeof(chdr));
385 if (ecode != NTDB_SUCCESS) {
386 return ecode;
389 if (rec_magic(&chdr) != NTDB_CHAIN_MAGIC) {
390 return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
391 NTDB_LOG_ERROR,
392 "get_table:"
393 " corrupt record %#x at %llu",
394 rec_magic(&chdr),
395 (long long)h->table);
398 /* Chain length is implied by data length. */
399 h->table_size = rec_data_length(&chdr) / sizeof(ntdb_off_t);
401 i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0), h->bucket,
402 h->table_size);
403 if (NTDB_OFF_IS_ERR(i)) {
404 return i;
407 if (i != h->table_size) {
408 /* Return to next bucket. */
409 h->bucket = i + 1;
410 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
411 if (NTDB_OFF_IS_ERR(val)) {
412 return val;
414 return val & NTDB_OFF_MASK;
417 /* Go back up to hash table. */
418 h->table = NTDB_HASH_OFFSET;
419 h->table_size = 1 << ntdb->hash_bits;
420 h->bucket = bits_from(h->h, 0, ntdb->hash_bits) + 1;
421 return 0;
424 /* Keeps hash locked unless returns 0 or error. */
425 static ntdb_off_t lock_and_iterate_hash(struct ntdb_context *ntdb,
426 struct hash_info *h)
428 ntdb_off_t val, i;
429 enum NTDB_ERROR ecode;
431 if (h->table != NTDB_HASH_OFFSET) {
432 /* We're in a chain. */
433 i = bits_from(h->h, 0, ntdb->hash_bits);
434 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
435 if (ecode != NTDB_SUCCESS) {
436 return NTDB_ERR_TO_OFF(ecode);
439 /* We dropped lock, bucket might have moved! */
440 val = ntdb_read_off(ntdb, hbucket_off(NTDB_HASH_OFFSET, i));
441 if (NTDB_OFF_IS_ERR(val)) {
442 goto unlock;
445 /* We don't remove chains: there should still be one there! */
446 if (!val || !is_chain(val)) {
447 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
448 NTDB_LOG_ERROR,
449 "iterate_hash:"
450 " vanished hchain %llu at %llu",
451 (long long)val,
452 (long long)i);
453 val = NTDB_ERR_TO_OFF(ecode);
454 goto unlock;
457 /* Find next bucket in the chain. */
458 val = iterate_chain(ntdb, val, h);
459 if (NTDB_OFF_IS_ERR(val)) {
460 goto unlock;
462 if (val != 0) {
463 return val;
465 ntdb_unlock_hash(ntdb, i, F_RDLCK);
467 /* OK, we've reset h back to top level. */
470 /* We do this unlocked, then re-check. */
471 for (i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
472 h->bucket, h->table_size);
473 i != h->table_size;
474 i = ntdb_find_nonzero_off(ntdb, hbucket_off(h->table, 0),
475 i+1, h->table_size)) {
476 ecode = ntdb_lock_hash(ntdb, i, F_RDLCK);
477 if (ecode != NTDB_SUCCESS) {
478 return NTDB_ERR_TO_OFF(ecode);
481 val = ntdb_read_off(ntdb, hbucket_off(h->table, i));
482 if (NTDB_OFF_IS_ERR(val)) {
483 goto unlock;
486 /* Lost race, and it's empty? */
487 if (!val) {
488 ntdb->stats.traverse_val_vanished++;
489 ntdb_unlock_hash(ntdb, i, F_RDLCK);
490 continue;
493 if (!is_chain(val)) {
494 /* So caller knows what lock to free. */
495 h->h = i;
496 /* Return to next bucket. */
497 h->bucket = i + 1;
498 val &= NTDB_OFF_MASK;
499 return val;
502 /* Start at beginning of chain */
503 h->bucket = 0;
504 h->h = i;
506 val = iterate_chain(ntdb, val, h);
507 if (NTDB_OFF_IS_ERR(val)) {
508 goto unlock;
510 if (val != 0) {
511 return val;
514 /* Otherwise, bucket has been set to i+1 */
515 ntdb_unlock_hash(ntdb, i, F_RDLCK);
517 return 0;
519 unlock:
520 ntdb_unlock_hash(ntdb, i, F_RDLCK);
521 return val;
524 /* Return success if we find something, NTDB_ERR_NOEXIST if none. */
525 enum NTDB_ERROR next_in_hash(struct ntdb_context *ntdb,
526 struct hash_info *h,
527 NTDB_DATA *kbuf, size_t *dlen)
529 ntdb_off_t off;
530 struct ntdb_used_record rec;
531 enum NTDB_ERROR ecode;
533 off = lock_and_iterate_hash(ntdb, h);
535 if (NTDB_OFF_IS_ERR(off)) {
536 return NTDB_OFF_TO_ERR(off);
537 } else if (off == 0) {
538 return NTDB_ERR_NOEXIST;
541 /* The hash for this key is still locked. */
542 ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
543 if (ecode != NTDB_SUCCESS) {
544 goto unlock;
546 if (rec_magic(&rec) != NTDB_USED_MAGIC) {
547 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
548 NTDB_LOG_ERROR,
549 "next_in_hash:"
550 " corrupt record at %llu",
551 (long long)off);
552 goto unlock;
555 kbuf->dsize = rec_key_length(&rec);
557 /* They want data as well? */
558 if (dlen) {
559 *dlen = rec_data_length(&rec);
560 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
561 kbuf->dsize + *dlen);
562 } else {
563 kbuf->dptr = ntdb_alloc_read(ntdb, off + sizeof(rec),
564 kbuf->dsize);
566 if (NTDB_PTR_IS_ERR(kbuf->dptr)) {
567 ecode = NTDB_PTR_ERR(kbuf->dptr);
568 goto unlock;
570 ecode = NTDB_SUCCESS;
572 unlock:
573 ntdb_unlock_hash(ntdb, bits_from(h->h, 0, ntdb->hash_bits), F_RDLCK);
574 return ecode;
578 enum NTDB_ERROR first_in_hash(struct ntdb_context *ntdb,
579 struct hash_info *h,
580 NTDB_DATA *kbuf, size_t *dlen)
582 h->table = NTDB_HASH_OFFSET;
583 h->table_size = 1 << ntdb->hash_bits;
584 h->bucket = 0;
586 return next_in_hash(ntdb, h, kbuf, dlen);
589 /* Even if the entry isn't in this hash bucket, you'd have to lock this
590 * bucket to find it. */
591 static enum NTDB_ERROR chainlock(struct ntdb_context *ntdb,
592 const NTDB_DATA *key, int ltype)
594 uint32_t h = ntdb_hash(ntdb, key->dptr, key->dsize);
596 return ntdb_lock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), ltype);
599 /* lock/unlock one hash chain. This is meant to be used to reduce
600 contention - it cannot guarantee how many records will be locked */
601 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock(struct ntdb_context *ntdb, NTDB_DATA key)
603 return chainlock(ntdb, &key, F_WRLCK);
606 _PUBLIC_ void ntdb_chainunlock(struct ntdb_context *ntdb, NTDB_DATA key)
608 uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
610 ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_WRLCK);
613 _PUBLIC_ enum NTDB_ERROR ntdb_chainlock_read(struct ntdb_context *ntdb,
614 NTDB_DATA key)
616 return chainlock(ntdb, &key, F_RDLCK);
619 _PUBLIC_ void ntdb_chainunlock_read(struct ntdb_context *ntdb, NTDB_DATA key)
621 uint32_t h = ntdb_hash(ntdb, key.dptr, key.dsize);
623 ntdb_unlock_hash(ntdb, bits_from(h, 0, ntdb->hash_bits), F_RDLCK);