2 Trivial Database 2: hash handling
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/hash/hash.h>
21 /* Default hash function. */
22 uint32_t ntdb_jenkins_hash(const void *key
, size_t length
, uint32_t seed
,
25 return hash_stable((const unsigned char *)key
, length
, seed
);
28 uint32_t ntdb_hash(struct ntdb_context
*ntdb
, const void *ptr
, size_t len
)
30 return ntdb
->hash_fn(ptr
, len
, ntdb
->hash_seed
, ntdb
->hash_data
);
33 static ntdb_bool_err
key_matches(struct ntdb_context
*ntdb
,
34 const struct ntdb_used_record
*rec
,
39 ntdb_bool_err ret
= false;
42 if (rec_key_length(rec
) != key
->dsize
) {
43 ntdb
->stats
.compare_wrong_keylen
++;
47 rkey
= ntdb_access_read(ntdb
, off
+ sizeof(*rec
),
48 key
->dsize
+ rec_data_length(rec
), false);
49 if (NTDB_PTR_IS_ERR(rkey
)) {
50 return (ntdb_bool_err
)NTDB_PTR_ERR(rkey
);
52 if (memcmp(rkey
, key
->dptr
, key
->dsize
) == 0) {
56 ntdb_access_release(ntdb
, rkey
);
60 ntdb
->stats
.compare_wrong_keycmp
++;
61 ntdb_access_release(ntdb
, rkey
);
65 /* Does entry match? */
66 static ntdb_bool_err
match(struct ntdb_context
*ntdb
,
70 struct ntdb_used_record
*rec
,
72 const ntdb_off_t
**mapped
)
75 enum NTDB_ERROR ecode
;
77 ntdb
->stats
.compares
++;
79 /* Top bits of offset == next bits of hash. */
80 if (bits_from(hash
, ntdb
->hash_bits
, NTDB_OFF_UPPER_STEAL
)
81 != bits_from(val
, 64-NTDB_OFF_UPPER_STEAL
, NTDB_OFF_UPPER_STEAL
)) {
82 ntdb
->stats
.compare_wrong_offsetbits
++;
86 /* Unmap before we try to read actual record, which may cause expand */
88 ntdb_access_release(ntdb
, *mapped
);
92 off
= val
& NTDB_OFF_MASK
;
93 ecode
= ntdb_read_convert(ntdb
, off
, rec
, sizeof(*rec
));
94 if (ecode
!= NTDB_SUCCESS
) {
95 return (ntdb_bool_err
)ecode
;
98 return key_matches(ntdb
, rec
, off
, key
, rptr
);
101 static bool is_chain(ntdb_off_t val
)
103 return val
& (1ULL << NTDB_OFF_CHAIN_BIT
);
106 static ntdb_off_t
hbucket_off(ntdb_off_t base
, ntdb_len_t idx
)
108 return base
+ sizeof(struct ntdb_used_record
)
109 + idx
* sizeof(ntdb_off_t
);
112 /* This is the core routine which searches the hashtable for an entry.
113 * On error, no locks are held and -ve is returned.
114 * Otherwise, hinfo is filled in.
115 * If not found, the return value is 0.
116 * If found, the return value is the offset, and *rec is the record. */
117 ntdb_off_t
find_and_lock(struct ntdb_context
*ntdb
,
121 struct ntdb_used_record
*rec
,
125 const ntdb_off_t
*arr
= NULL
;
128 enum NTDB_ERROR ecode
;
129 struct ntdb_used_record chdr
;
132 h
->h
= ntdb_hash(ntdb
, key
.dptr
, key
.dsize
);
134 h
->table
= NTDB_HASH_OFFSET
;
135 h
->table_size
= 1 << ntdb
->hash_bits
;
136 h
->bucket
= bits_from(h
->h
, 0, ntdb
->hash_bits
);
139 ecode
= ntdb_lock_hash(ntdb
, h
->bucket
, ltype
);
140 if (ecode
!= NTDB_SUCCESS
) {
141 return NTDB_ERR_TO_OFF(ecode
);
144 off
= hbucket_off(h
->table
, h
->bucket
);
145 val
= ntdb_read_off(ntdb
, off
);
146 if (NTDB_OFF_IS_ERR(val
)) {
147 ecode
= NTDB_OFF_TO_ERR(val
);
151 /* Directly in hash table? */
152 if (!likely(is_chain(val
))) {
154 berr
= match(ntdb
, h
->h
, &key
, val
, rec
, rptr
, NULL
);
156 ecode
= NTDB_OFF_TO_ERR(berr
);
160 return val
& NTDB_OFF_MASK
;
162 /* If you want to insert here, make a chain. */
168 /* Nope? Iterate through chain. */
169 h
->table
= val
& NTDB_OFF_MASK
;
171 ecode
= ntdb_read_convert(ntdb
, h
->table
, &chdr
, sizeof(chdr
));
172 if (ecode
!= NTDB_SUCCESS
) {
176 if (rec_magic(&chdr
) != NTDB_CHAIN_MAGIC
) {
177 ecode
= ntdb_logerr(ntdb
, NTDB_ERR_CORRUPT
,
180 " corrupt record %#x at %llu",
181 rec_magic(&chdr
), (long long)off
);
185 h
->table_size
= rec_data_length(&chdr
) / sizeof(ntdb_off_t
);
188 for (i
= 0; i
< h
->table_size
; i
++) {
189 /* Careful! match has to unmap this if we access a
190 * record (may cause mmap of database to move. */
192 arr
= ntdb_access_read(ntdb
, hbucket_off(h
->table
, 0),
193 rec_data_length(&chdr
), true);
194 if (NTDB_PTR_IS_ERR(arr
)) {
195 ecode
= NTDB_PTR_ERR(arr
);
207 berr
= match(ntdb
, h
->h
, &key
, val
, rec
, rptr
, &arr
);
209 ecode
= NTDB_OFF_TO_ERR(berr
);
211 ntdb_access_release(ntdb
, arr
);
218 off
= val
& NTDB_OFF_MASK
;
220 ntdb_access_release(ntdb
, arr
);
227 /* Set to any non-zero value */
233 ntdb_access_release(ntdb
, arr
);
238 ntdb_unlock_hash(ntdb
, h
->bucket
, ltype
);
239 return NTDB_ERR_TO_OFF(ecode
);
242 static ntdb_off_t
encode_offset(const struct ntdb_context
*ntdb
,
243 ntdb_off_t new_off
, uint32_t hash
)
247 assert((new_off
& (1ULL << NTDB_OFF_CHAIN_BIT
)) == 0);
248 assert((new_off
>> (64 - NTDB_OFF_UPPER_STEAL
)) == 0);
249 /* We pack extra hash bits into the upper bits of the offset. */
250 extra
= bits_from(hash
, ntdb
->hash_bits
, NTDB_OFF_UPPER_STEAL
);
251 extra
<<= (64 - NTDB_OFF_UPPER_STEAL
);
253 return new_off
| extra
;
256 /* Simply overwrite the hash entry we found before. */
257 enum NTDB_ERROR
replace_in_hash(struct ntdb_context
*ntdb
,
258 const struct hash_info
*h
,
261 return ntdb_write_off(ntdb
, hbucket_off(h
->table
, h
->bucket
),
262 encode_offset(ntdb
, new_off
, h
->h
));
265 enum NTDB_ERROR
delete_from_hash(struct ntdb_context
*ntdb
,
266 const struct hash_info
*h
)
268 return ntdb_write_off(ntdb
, hbucket_off(h
->table
, h
->bucket
), 0);
272 enum NTDB_ERROR
add_to_hash(struct ntdb_context
*ntdb
,
273 const struct hash_info
*h
,
276 enum NTDB_ERROR ecode
;
278 struct ntdb_used_record chdr
;
279 const ntdb_off_t
*old
;
282 /* We hit an empty bucket during search? That's where it goes. */
284 return replace_in_hash(ntdb
, h
, new_off
);
287 /* Full at top-level? Create a 2-element chain. */
288 if (h
->table
== NTDB_HASH_OFFSET
) {
291 /* One element is old value, the other is the new value. */
292 pair
[0] = h
->old_val
;
293 pair
[1] = encode_offset(ntdb
, new_off
, h
->h
);
295 chain
= alloc(ntdb
, 0, sizeof(pair
), NTDB_CHAIN_MAGIC
, true);
296 if (NTDB_OFF_IS_ERR(chain
)) {
297 return NTDB_OFF_TO_ERR(chain
);
299 ecode
= ntdb_write_convert(ntdb
,
301 + sizeof(struct ntdb_used_record
),
303 if (ecode
== NTDB_SUCCESS
) {
304 ecode
= ntdb_write_off(ntdb
,
305 hbucket_off(h
->table
, h
->bucket
),
307 | (1ULL << NTDB_OFF_CHAIN_BIT
));
312 /* Full bucket. Expand. */
313 ecode
= ntdb_read_convert(ntdb
, h
->table
, &chdr
, sizeof(chdr
));
314 if (ecode
!= NTDB_SUCCESS
) {
318 if (rec_extra_padding(&chdr
) >= sizeof(new_off
)) {
319 /* Expand in place. */
320 uint64_t dlen
= rec_data_length(&chdr
);
322 ecode
= set_header(ntdb
, &chdr
, NTDB_CHAIN_MAGIC
, 0,
323 dlen
+ sizeof(new_off
),
324 dlen
+ rec_extra_padding(&chdr
));
326 if (ecode
!= NTDB_SUCCESS
) {
329 /* find_and_lock set up h to point to last bucket. */
330 ecode
= replace_in_hash(ntdb
, h
, new_off
);
331 if (ecode
!= NTDB_SUCCESS
) {
334 ecode
= ntdb_write_convert(ntdb
, h
->table
, &chdr
, sizeof(chdr
));
335 if (ecode
!= NTDB_SUCCESS
) {
338 /* For futureproofing, we always make the first byte of padding
340 if (rec_extra_padding(&chdr
)) {
341 ecode
= ntdb
->io
->twrite(ntdb
, h
->table
+ sizeof(chdr
)
342 + dlen
+ sizeof(new_off
),
348 /* We need to reallocate the chain. */
349 chain
= alloc(ntdb
, 0, (h
->table_size
+ 1) * sizeof(ntdb_off_t
),
350 NTDB_CHAIN_MAGIC
, true);
351 if (NTDB_OFF_IS_ERR(chain
)) {
352 return NTDB_OFF_TO_ERR(chain
);
355 /* Map both and copy across old buckets. */
356 old
= ntdb_access_read(ntdb
, hbucket_off(h
->table
, 0),
357 h
->table_size
*sizeof(ntdb_off_t
), true);
358 if (NTDB_PTR_IS_ERR(old
)) {
359 return NTDB_PTR_ERR(old
);
361 new = ntdb_access_write(ntdb
, hbucket_off(chain
, 0),
362 (h
->table_size
+ 1)*sizeof(ntdb_off_t
), true);
363 if (NTDB_PTR_IS_ERR(new)) {
364 ntdb_access_release(ntdb
, old
);
365 return NTDB_PTR_ERR(new);
368 memcpy(new, old
, h
->bucket
* sizeof(ntdb_off_t
));
369 new[h
->bucket
] = encode_offset(ntdb
, new_off
, h
->h
);
370 ntdb_access_release(ntdb
, old
);
372 ecode
= ntdb_access_commit(ntdb
, new);
373 if (ecode
!= NTDB_SUCCESS
) {
377 /* Free the old chain. */
378 ecode
= add_free_record(ntdb
, h
->table
,
379 sizeof(struct ntdb_used_record
)
380 + rec_data_length(&chdr
)
381 + rec_extra_padding(&chdr
),
382 NTDB_LOCK_WAIT
, true);
384 /* Replace top-level to point to new chain */
385 return ntdb_write_off(ntdb
,
386 hbucket_off(NTDB_HASH_OFFSET
,
387 bits_from(h
->h
, 0, ntdb
->hash_bits
)),
388 chain
| (1ULL << NTDB_OFF_CHAIN_BIT
));
391 /* Traverse support: returns offset of record, or 0 or -ve error. */
392 static ntdb_off_t
iterate_chain(struct ntdb_context
*ntdb
,
397 enum NTDB_ERROR ecode
;
398 struct ntdb_used_record chdr
;
400 /* First load up chain header. */
401 h
->table
= val
& NTDB_OFF_MASK
;
402 ecode
= ntdb_read_convert(ntdb
, h
->table
, &chdr
, sizeof(chdr
));
403 if (ecode
!= NTDB_SUCCESS
) {
407 if (rec_magic(&chdr
) != NTDB_CHAIN_MAGIC
) {
408 return ntdb_logerr(ntdb
, NTDB_ERR_CORRUPT
,
411 " corrupt record %#x at %llu",
413 (long long)h
->table
);
416 /* Chain length is implied by data length. */
417 h
->table_size
= rec_data_length(&chdr
) / sizeof(ntdb_off_t
);
419 i
= ntdb_find_nonzero_off(ntdb
, hbucket_off(h
->table
, 0), h
->bucket
,
421 if (NTDB_OFF_IS_ERR(i
)) {
425 if (i
!= h
->table_size
) {
426 /* Return to next bucket. */
428 val
= ntdb_read_off(ntdb
, hbucket_off(h
->table
, i
));
429 if (NTDB_OFF_IS_ERR(val
)) {
432 return val
& NTDB_OFF_MASK
;
435 /* Go back up to hash table. */
436 h
->table
= NTDB_HASH_OFFSET
;
437 h
->table_size
= 1 << ntdb
->hash_bits
;
438 h
->bucket
= bits_from(h
->h
, 0, ntdb
->hash_bits
) + 1;
442 /* Keeps hash locked unless returns 0 or error. */
443 static ntdb_off_t
lock_and_iterate_hash(struct ntdb_context
*ntdb
,
447 enum NTDB_ERROR ecode
;
449 if (h
->table
!= NTDB_HASH_OFFSET
) {
450 /* We're in a chain. */
451 i
= bits_from(h
->h
, 0, ntdb
->hash_bits
);
452 ecode
= ntdb_lock_hash(ntdb
, i
, F_RDLCK
);
453 if (ecode
!= NTDB_SUCCESS
) {
454 return NTDB_ERR_TO_OFF(ecode
);
457 /* We dropped lock, bucket might have moved! */
458 val
= ntdb_read_off(ntdb
, hbucket_off(NTDB_HASH_OFFSET
, i
));
459 if (NTDB_OFF_IS_ERR(val
)) {
463 /* We don't remove chains: there should still be one there! */
464 if (!val
|| !is_chain(val
)) {
465 ecode
= ntdb_logerr(ntdb
, NTDB_ERR_CORRUPT
,
468 " vanished hchain %llu at %llu",
471 val
= NTDB_ERR_TO_OFF(ecode
);
475 /* Find next bucket in the chain. */
476 val
= iterate_chain(ntdb
, val
, h
);
477 if (NTDB_OFF_IS_ERR(val
)) {
483 ntdb_unlock_hash(ntdb
, i
, F_RDLCK
);
485 /* OK, we've reset h back to top level. */
488 /* We do this unlocked, then re-check. */
489 for (i
= ntdb_find_nonzero_off(ntdb
, hbucket_off(h
->table
, 0),
490 h
->bucket
, h
->table_size
);
492 i
= ntdb_find_nonzero_off(ntdb
, hbucket_off(h
->table
, 0),
493 i
+1, h
->table_size
)) {
494 ecode
= ntdb_lock_hash(ntdb
, i
, F_RDLCK
);
495 if (ecode
!= NTDB_SUCCESS
) {
496 return NTDB_ERR_TO_OFF(ecode
);
499 val
= ntdb_read_off(ntdb
, hbucket_off(h
->table
, i
));
500 if (NTDB_OFF_IS_ERR(val
)) {
504 /* Lost race, and it's empty? */
506 ntdb
->stats
.traverse_val_vanished
++;
507 ntdb_unlock_hash(ntdb
, i
, F_RDLCK
);
511 if (!is_chain(val
)) {
512 /* So caller knows what lock to free. */
514 /* Return to next bucket. */
516 val
&= NTDB_OFF_MASK
;
520 /* Start at beginning of chain */
524 val
= iterate_chain(ntdb
, val
, h
);
525 if (NTDB_OFF_IS_ERR(val
)) {
532 /* Otherwise, bucket has been set to i+1 */
533 ntdb_unlock_hash(ntdb
, i
, F_RDLCK
);
538 ntdb_unlock_hash(ntdb
, i
, F_RDLCK
);
542 /* Return success if we find something, NTDB_ERR_NOEXIST if none. */
543 enum NTDB_ERROR
next_in_hash(struct ntdb_context
*ntdb
,
545 NTDB_DATA
*kbuf
, size_t *dlen
)
548 struct ntdb_used_record rec
;
549 enum NTDB_ERROR ecode
;
551 off
= lock_and_iterate_hash(ntdb
, h
);
553 if (NTDB_OFF_IS_ERR(off
)) {
554 return NTDB_OFF_TO_ERR(off
);
555 } else if (off
== 0) {
556 return NTDB_ERR_NOEXIST
;
559 /* The hash for this key is still locked. */
560 ecode
= ntdb_read_convert(ntdb
, off
, &rec
, sizeof(rec
));
561 if (ecode
!= NTDB_SUCCESS
) {
564 if (rec_magic(&rec
) != NTDB_USED_MAGIC
) {
565 ecode
= ntdb_logerr(ntdb
, NTDB_ERR_CORRUPT
,
568 " corrupt record at %llu",
573 kbuf
->dsize
= rec_key_length(&rec
);
575 /* They want data as well? */
577 *dlen
= rec_data_length(&rec
);
578 kbuf
->dptr
= ntdb_alloc_read(ntdb
, off
+ sizeof(rec
),
579 kbuf
->dsize
+ *dlen
);
581 kbuf
->dptr
= ntdb_alloc_read(ntdb
, off
+ sizeof(rec
),
584 if (NTDB_PTR_IS_ERR(kbuf
->dptr
)) {
585 ecode
= NTDB_PTR_ERR(kbuf
->dptr
);
588 ecode
= NTDB_SUCCESS
;
591 ntdb_unlock_hash(ntdb
, bits_from(h
->h
, 0, ntdb
->hash_bits
), F_RDLCK
);
596 enum NTDB_ERROR
first_in_hash(struct ntdb_context
*ntdb
,
598 NTDB_DATA
*kbuf
, size_t *dlen
)
600 h
->table
= NTDB_HASH_OFFSET
;
601 h
->table_size
= 1 << ntdb
->hash_bits
;
604 return next_in_hash(ntdb
, h
, kbuf
, dlen
);
607 /* Even if the entry isn't in this hash bucket, you'd have to lock this
608 * bucket to find it. */
609 static enum NTDB_ERROR
chainlock(struct ntdb_context
*ntdb
,
610 const NTDB_DATA
*key
, int ltype
)
612 uint32_t h
= ntdb_hash(ntdb
, key
->dptr
, key
->dsize
);
614 return ntdb_lock_hash(ntdb
, bits_from(h
, 0, ntdb
->hash_bits
), ltype
);
617 /* lock/unlock one hash chain. This is meant to be used to reduce
618 contention - it cannot guarantee how many records will be locked */
619 _PUBLIC_
enum NTDB_ERROR
ntdb_chainlock(struct ntdb_context
*ntdb
, NTDB_DATA key
)
621 return chainlock(ntdb
, &key
, F_WRLCK
);
624 _PUBLIC_
void ntdb_chainunlock(struct ntdb_context
*ntdb
, NTDB_DATA key
)
626 uint32_t h
= ntdb_hash(ntdb
, key
.dptr
, key
.dsize
);
628 ntdb_unlock_hash(ntdb
, bits_from(h
, 0, ntdb
->hash_bits
), F_WRLCK
);
631 _PUBLIC_
enum NTDB_ERROR
ntdb_chainlock_read(struct ntdb_context
*ntdb
,
634 return chainlock(ntdb
, &key
, F_RDLCK
);
637 _PUBLIC_
void ntdb_chainunlock_read(struct ntdb_context
*ntdb
, NTDB_DATA key
)
639 uint32_t h
= ntdb_hash(ntdb
, key
.dptr
, key
.dsize
);
641 ntdb_unlock_hash(ntdb
, bits_from(h
, 0, ntdb
->hash_bits
), F_RDLCK
);