/*
   Trivial Database 2: hash handling
   Copyright (C) Rusty Russell 2010

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "private.h"
#include <assert.h>

uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
{
	return tdb->hash_fn(ptr, len, tdb->hash_seed, tdb->hash_data);
}

uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
{
	const struct tdb_used_record *r;
	const void *key;
	uint64_t klen, hash;

	r = tdb_access_read(tdb, off, sizeof(*r), true);
	if (TDB_PTR_IS_ERR(r)) {
		/* FIXME: we can't report the error from here. */
		return 0;
	}

	klen = rec_key_length(r);
	tdb_access_release(tdb, r);

	key = tdb_access_read(tdb, off + sizeof(*r), klen, false);
	if (TDB_PTR_IS_ERR(key)) {
		/* FIXME: we can't report the error from here. */
		return 0;
	}

	hash = tdb_hash(tdb, key, klen);
	tdb_access_release(tdb, key);
	return hash;
}

/* Get bits from a value. */
static uint32_t bits_from(uint64_t val, unsigned start, unsigned num)
{
	assert(num <= 32);
	return (val >> start) & ((1U << num) - 1);
}

/* We take bits from the top: that way we can lock whole sections of the hash
 * by using lock ranges. */
static uint32_t use_bits(struct hash_info *h, unsigned num)
{
	h->hash_used += num;
	return bits_from(h->h, 64 - h->hash_used, num);
}

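#if 0
/* Illustrative sketch (not compiled): use_bits() consumes the hash from
 * the most-significant end, so successive calls peel off adjacent bit
 * fields.  The hash value below is made up purely for the example. */
static void example_use_bits(void)
{
	struct hash_info h;

	h.h = 0xABCD000000000000ULL;
	h.hash_used = 0;

	assert(use_bits(&h, 4) == 0xA);	/* top nibble */
	assert(use_bits(&h, 4) == 0xB);	/* next nibble down */
	/* The second call is equivalent to bits_from(h.h, 64 - 8, 4). */
}
#endif
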
static tdb_bool_err key_matches(struct tdb_context *tdb,
				const struct tdb_used_record *rec,
				tdb_off_t off,
				const struct tdb_data *key)
{
	tdb_bool_err ret = false;
	const char *rkey;

	if (rec_key_length(rec) != key->dsize) {
		tdb->stats.compare_wrong_keylen++;
		return ret;
	}

	rkey = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
	if (TDB_PTR_IS_ERR(rkey)) {
		return TDB_PTR_ERR(rkey);
	}
	if (memcmp(rkey, key->dptr, key->dsize) == 0)
		ret = true;
	else
		tdb->stats.compare_wrong_keycmp++;
	tdb_access_release(tdb, rkey);
	return ret;
}

/* Does entry match? */
static tdb_bool_err match(struct tdb_context *tdb,
			  struct hash_info *h,
			  const struct tdb_data *key,
			  tdb_off_t val,
			  struct tdb_used_record *rec)
{
	tdb_off_t off;
	enum TDB_ERROR ecode;

	tdb->stats.compares++;
	/* Desired bucket must match. */
	if (h->home_bucket != (val & TDB_OFF_HASH_GROUP_MASK)) {
		tdb->stats.compare_wrong_bucket++;
		return false;
	}

	/* Top bits of offset == next bits of hash. */
	if (bits_from(val, TDB_OFF_HASH_EXTRA_BIT, TDB_OFF_UPPER_STEAL_EXTRA)
	    != bits_from(h->h, 64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
			 TDB_OFF_UPPER_STEAL_EXTRA)) {
		tdb->stats.compare_wrong_offsetbits++;
		return false;
	}

	off = val & TDB_OFF_MASK;
	ecode = tdb_read_convert(tdb, off, rec, sizeof(*rec));
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
		tdb->stats.compare_wrong_rechash++;
		return false;
	}

	return key_matches(tdb, rec, off, key);
}

static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
{
	return group_start
		+ (bucket % (1 << TDB_HASH_GROUP_BITS)) * sizeof(tdb_off_t);
}

bool is_subhash(tdb_off_t val)
{
	return (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
}

/* FIXME: Guess the depth, don't over-lock! */
static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
{
	*size = 1ULL << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
	return group << (64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS));
}

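#if 0
/* Illustrative sketch (not compiled): because the toplevel group comes
 * from the top bits of the hash, every key in group G falls in one
 * contiguous slice of the 64-bit hash space, which is what makes range
 * locking possible.  Assume TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS
 * is 10 purely for this example. */
static void example_hlock_range(void)
{
	tdb_off_t size, start;

	start = hlock_range(3, &size);
	assert(start == 3ULL << 54);	/* slice begins at group << 54 */
	assert(size == 1ULL << 54);	/* slices tile the space exactly */
}
#endif
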
static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
				    struct tdb_data key,
				    tdb_off_t chain,
				    struct hash_info *h,
				    struct tdb_used_record *rec,
				    struct traverse_info *tinfo)
{
	tdb_off_t off, next;
	enum TDB_ERROR ecode;

	/* In case nothing is free, we set these to zero. */
	h->home_bucket = h->found_bucket = 0;

	for (off = chain; off; off = next) {
		unsigned int i;

		h->group_start = off;
		ecode = tdb_read_convert(tdb, off, h->group, sizeof(h->group));
		if (ecode != TDB_SUCCESS) {
			return ecode;
		}

		for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
			tdb_off_t recoff;
			if (!h->group[i]) {
				/* Remember this empty bucket. */
				h->home_bucket = h->found_bucket = i;
				continue;
			}

			/* We can insert extra bits via add_to_hash
			 * empty bucket logic. */
			recoff = h->group[i] & TDB_OFF_MASK;
			ecode = tdb_read_convert(tdb, recoff, rec,
						 sizeof(*rec));
			if (ecode != TDB_SUCCESS) {
				return ecode;
			}

			ecode = key_matches(tdb, rec, recoff, &key);
			if (ecode < 0) {
				return ecode;
			}
			if (ecode == 1) {
				h->home_bucket = h->found_bucket = i;

				if (tinfo) {
					tinfo->levels[tinfo->num_levels]
						.hashtable = off;
					tinfo->levels[tinfo->num_levels]
						.total_buckets
						= 1 << TDB_HASH_GROUP_BITS;
					tinfo->levels[tinfo->num_levels].entry
						= i;
					tinfo->num_levels++;
				}
				return recoff;
			}
		}
		next = tdb_read_off(tdb, off
				    + offsetof(struct tdb_chain, next));
		if (TDB_OFF_IS_ERR(next)) {
			return next;
		}
		if (next)
			next += sizeof(struct tdb_used_record);
	}
	return 0;
}

/* This is the core routine which searches the hashtable for an entry.
 * On error, no locks are held and -ve is returned.
 * Otherwise, hinfo is filled in (and the optional tinfo).
 * If not found, the return value is 0.
 * If found, the return value is the offset, and *rec is the record. */
tdb_off_t find_and_lock(struct tdb_context *tdb,
			struct tdb_data key,
			int ltype,
			struct hash_info *h,
			struct tdb_used_record *rec,
			struct traverse_info *tinfo)
{
	uint32_t i, group;
	tdb_off_t hashtable;
	enum TDB_ERROR ecode;

	h->h = tdb_hash(tdb, key.dptr, key.dsize);
	h->hash_used = 0;
	group = use_bits(h, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
	h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);

	h->hlock_start = hlock_range(group, &h->hlock_range);
	ecode = tdb_lock_hashes(tdb, h->hlock_start, h->hlock_range, ltype,
				TDB_LOCK_WAIT);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	hashtable = offsetof(struct tdb_header, hashtable);
	if (tinfo) {
		tinfo->toplevel_group = group;
		tinfo->num_levels = 1;
		tinfo->levels[0].entry = 0;
		tinfo->levels[0].hashtable = hashtable
			+ (group << TDB_HASH_GROUP_BITS) * sizeof(tdb_off_t);
		tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
	}

	while (h->hash_used <= 64) {
		/* Read in the hash group. */
		h->group_start = hashtable
			+ group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);

		ecode = tdb_read_convert(tdb, h->group_start, &h->group,
					 sizeof(h->group));
		if (ecode != TDB_SUCCESS) {
			goto fail;
		}

		/* Pointer to another hash table?  Go down... */
		if (is_subhash(h->group[h->home_bucket])) {
			hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
				+ sizeof(struct tdb_used_record);
			if (tinfo) {
				/* When we come back, use *next* bucket */
				tinfo->levels[tinfo->num_levels-1].entry
					+= h->home_bucket + 1;
			}
			group = use_bits(h, TDB_SUBLEVEL_HASH_BITS
					 - TDB_HASH_GROUP_BITS);
			h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
			if (tinfo) {
				tinfo->levels[tinfo->num_levels].hashtable
					= hashtable;
				tinfo->levels[tinfo->num_levels].total_buckets
					= 1 << TDB_SUBLEVEL_HASH_BITS;
				tinfo->levels[tinfo->num_levels].entry
					= group << TDB_HASH_GROUP_BITS;
				tinfo->num_levels++;
			}
			continue;
		}

		/* It's in this group: search (until 0 or all searched) */
		for (i = 0, h->found_bucket = h->home_bucket;
		     i < (1 << TDB_HASH_GROUP_BITS);
		     i++, h->found_bucket = ((h->found_bucket+1)
					     % (1 << TDB_HASH_GROUP_BITS))) {
			tdb_bool_err berr;
			if (is_subhash(h->group[h->found_bucket]))
				continue;

			if (!h->group[h->found_bucket])
				break;

			berr = match(tdb, h, &key, h->group[h->found_bucket],
				     rec);
			if (berr < 0) {
				ecode = berr;
				goto fail;
			}
			if (berr) {
				if (tinfo) {
					tinfo->levels[tinfo->num_levels-1].entry
						+= h->found_bucket;
				}
				return h->group[h->found_bucket] & TDB_OFF_MASK;
			}
		}
		/* Didn't find it: h indicates where it would go. */
		return 0;
	}

	return find_in_chain(tdb, key, hashtable, h, rec, tinfo);

fail:
	tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
	return ecode;
}

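#if 0
/* Illustrative sketch (not compiled): the shape of a typical lookup
 * driving find_and_lock(), with real error handling elided.  On a
 * negative return no locks are held; otherwise the caller owns the
 * hash-range lock and must drop it. */
static void example_lookup(struct tdb_context *tdb, struct tdb_data key)
{
	struct hash_info h;
	struct tdb_used_record rec;
	tdb_off_t off;

	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
	if (TDB_OFF_IS_ERR(off))
		return;	/* Error: no locks held. */
	if (off != 0) {
		/* Found: rec holds the header; the key starts at
		 * off + sizeof(rec), the data right after it. */
	}
	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
}
#endif
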
/* I wrote a simple test, expanding a hash to 2GB, for the following
 * cases:
 * 1) Expanding all the buckets at once,
 * 2) Expanding the bucket we wanted to place the new entry into.
 * 3) Expanding the most-populated bucket,
 *
 * I measured the worst/average/best density during this process.
 *
 * So we figure out the busiest bucket for the moment.
 */
static unsigned fullest_bucket(struct tdb_context *tdb,
			       const tdb_off_t *group,
			       unsigned new_bucket)
{
	unsigned counts[1 << TDB_HASH_GROUP_BITS] = { 0 };
	unsigned int i, best_bucket;

	/* Count the new entry. */
	counts[new_bucket]++;
	best_bucket = new_bucket;

	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
		unsigned this_bucket;

		if (is_subhash(group[i]))
			continue;
		this_bucket = group[i] & TDB_OFF_HASH_GROUP_MASK;
		if (++counts[this_bucket] > counts[best_bucket])
			best_bucket = this_bucket;
	}

	return best_bucket;
}

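#if 0
/* Illustrative sketch (not compiled): assume TDB_HASH_GROUP_BITS == 2
 * (four buckets, TDB_OFF_HASH_GROUP_MASK == 3) purely for this example.
 * Entries whose home-bucket fields are 0, 0, 1 and 3, plus a new entry
 * bound for bucket 1, give counts {2, 2, 0, 1}; bucket 0 reaches two
 * residents first, so it is the one chosen for expansion. */
static void example_fullest_bucket(struct tdb_context *tdb)
{
	tdb_off_t group[4] = { 0x1000, 0x2000, 0x3001, 0x4003 };

	assert(fullest_bucket(tdb, group, 1) == 0);
}
#endif
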
static bool put_into_group(tdb_off_t *group,
			   unsigned bucket, tdb_off_t encoded)
{
	unsigned int i;

	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
		unsigned b = (bucket + i) % (1 << TDB_HASH_GROUP_BITS);

		if (!group[b]) {
			group[b] = encoded;
			return true;
		}
	}
	return false;
}

static void force_into_group(tdb_off_t *group,
			     unsigned bucket, tdb_off_t encoded)
{
	if (!put_into_group(group, bucket, encoded))
		abort();
}

static tdb_off_t encode_offset(tdb_off_t new_off, struct hash_info *h)
{
	return h->home_bucket
		| new_off
		| ((uint64_t)bits_from(h->h,
				       64 - h->hash_used
				       - TDB_OFF_UPPER_STEAL_EXTRA,
				       TDB_OFF_UPPER_STEAL_EXTRA)
		   << TDB_OFF_HASH_EXTRA_BIT);
}

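#if 0
/* Illustrative sketch (not compiled): what a hash-group entry packs into
 * one tdb_off_t, mirroring encode_offset() and the checks in match().
 * The record offset lives under TDB_OFF_MASK; the low bits carry the
 * entry's home bucket; and TDB_OFF_UPPER_STEAL_EXTRA further hash bits
 * sit at TDB_OFF_HASH_EXTRA_BIT so match() can often reject an entry
 * without reading the record at all. */
static void example_decode_entry(tdb_off_t val)
{
	unsigned home = val & TDB_OFF_HASH_GROUP_MASK;
	tdb_off_t off = val & TDB_OFF_MASK;
	uint32_t extra = bits_from(val, TDB_OFF_HASH_EXTRA_BIT,
				   TDB_OFF_UPPER_STEAL_EXTRA);
	/* home, off and extra correspond to the three fields OR'ed
	 * together by encode_offset(). */
}
#endif
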
/* Simply overwrite the hash entry we found before. */
enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
			       struct hash_info *h,
			       tdb_off_t new_off)
{
	return tdb_write_off(tdb, hbucket_off(h->group_start, h->found_bucket),
			     encode_offset(new_off, h));
}

/* We slot in anywhere that's empty in the chain. */
static enum TDB_ERROR COLD add_to_chain(struct tdb_context *tdb,
					tdb_off_t subhash,
					tdb_off_t new_off)
{
	tdb_off_t entry;
	enum TDB_ERROR ecode;

	entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
	if (TDB_OFF_IS_ERR(entry)) {
		return entry;
	}

	if (entry == 1 << TDB_HASH_GROUP_BITS) {
		tdb_off_t next;

		next = tdb_read_off(tdb, subhash
				    + offsetof(struct tdb_chain, next));
		if (TDB_OFF_IS_ERR(next)) {
			return next;
		}

		if (!next) {
			next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
				     TDB_CHAIN_MAGIC, false);
			if (TDB_OFF_IS_ERR(next))
				return next;
			ecode = zero_out(tdb,
					 next+sizeof(struct tdb_used_record),
					 sizeof(struct tdb_chain));
			if (ecode != TDB_SUCCESS) {
				return ecode;
			}
			ecode = tdb_write_off(tdb, subhash
					      + offsetof(struct tdb_chain,
							 next),
					      next);
			if (ecode != TDB_SUCCESS) {
				return ecode;
			}
		}
		return add_to_chain(tdb, next, new_off);
	}

	return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
			     new_off);
}

/* Add into a newly created subhash. */
static enum TDB_ERROR add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
				     unsigned hash_used, tdb_off_t val)
{
	tdb_off_t off = (val & TDB_OFF_MASK), *group;
	struct hash_info h;
	unsigned int gnum;

	h.hash_used = hash_used;

	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
		return add_to_chain(tdb, subhash, off);

	h.h = hash_record(tdb, off);
	gnum = use_bits(&h, TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
	h.group_start = subhash
		+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
	h.home_bucket = use_bits(&h, TDB_HASH_GROUP_BITS);

	group = tdb_access_write(tdb, h.group_start,
				 sizeof(*group) << TDB_HASH_GROUP_BITS, true);
	if (TDB_PTR_IS_ERR(group)) {
		return TDB_PTR_ERR(group);
	}
	force_into_group(group, h.home_bucket, encode_offset(off, &h));
	return tdb_access_commit(tdb, group);
}

static enum TDB_ERROR expand_group(struct tdb_context *tdb, struct hash_info *h)
{
	unsigned bucket, num_vals, i, magic;
	size_t subsize;
	tdb_off_t subhash;
	tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
	enum TDB_ERROR ecode;

	/* Attach new empty subhash under fullest bucket. */
	bucket = fullest_bucket(tdb, h->group, h->home_bucket);

	if (h->hash_used == 64) {
		tdb->stats.alloc_chain++;
		subsize = sizeof(struct tdb_chain);
		magic = TDB_CHAIN_MAGIC;
	} else {
		tdb->stats.alloc_subhash++;
		subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
		magic = TDB_HTABLE_MAGIC;
	}

	subhash = alloc(tdb, 0, subsize, 0, magic, false);
	if (TDB_OFF_IS_ERR(subhash)) {
		return subhash;
	}

	ecode = zero_out(tdb, subhash + sizeof(struct tdb_used_record),
			 subsize);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	/* Remove any which are destined for bucket or are in wrong place. */
	num_vals = 0;
	for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
		unsigned home_bucket = h->group[i] & TDB_OFF_HASH_GROUP_MASK;
		if (!h->group[i] || is_subhash(h->group[i]))
			continue;
		if (home_bucket == bucket || home_bucket != i) {
			vals[num_vals++] = h->group[i];
			h->group[i] = 0;
		}
	}
	/* FIXME: This assert is valid, but we do this during unit test :( */
	/* assert(num_vals); */

	/* Overwrite expanded bucket with subhash pointer. */
	h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);

	/* Point to actual contents of record. */
	subhash += sizeof(struct tdb_used_record);

	/* Put values back. */
	for (i = 0; i < num_vals; i++) {
		unsigned this_bucket = vals[i] & TDB_OFF_HASH_GROUP_MASK;

		if (this_bucket == bucket) {
			ecode = add_to_subhash(tdb, subhash, h->hash_used,
					       vals[i]);
			if (ecode != TDB_SUCCESS)
				return ecode;
		} else {
			/* There should be room to put this back. */
			force_into_group(h->group, this_bucket, vals[i]);
		}
	}
	return TDB_SUCCESS;
}

enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
{
	unsigned int i, num_movers = 0;
	tdb_off_t movers[1 << TDB_HASH_GROUP_BITS];

	h->group[h->found_bucket] = 0;
	for (i = 1; i < (1 << TDB_HASH_GROUP_BITS); i++) {
		unsigned this_bucket;

		this_bucket = (h->found_bucket+i) % (1 << TDB_HASH_GROUP_BITS);
		/* Empty bucket?  We're done. */
		if (!h->group[this_bucket])
			break;

		/* Ignore subhashes. */
		if (is_subhash(h->group[this_bucket]))
			continue;

		/* If this one is not happy where it is, we'll move it. */
		if ((h->group[this_bucket] & TDB_OFF_HASH_GROUP_MASK)
		    != this_bucket) {
			movers[num_movers++] = h->group[this_bucket];
			h->group[this_bucket] = 0;
		}
	}

	/* Put back the ones we erased. */
	for (i = 0; i < num_movers; i++) {
		force_into_group(h->group, movers[i] & TDB_OFF_HASH_GROUP_MASK,
				 movers[i]);
	}

	/* Now we write back the hash group */
	return tdb_write_convert(tdb, h->group_start,
				 h->group, sizeof(h->group));
}

enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
			   tdb_off_t new_off)
{
	enum TDB_ERROR ecode;

	/* We hit an empty bucket during search?  That's where it goes. */
	if (!h->group[h->found_bucket]) {
		h->group[h->found_bucket] = encode_offset(new_off, h);
		/* Write back the modified group. */
		return tdb_write_convert(tdb, h->group_start,
					 h->group, sizeof(h->group));
	}

	if (h->hash_used > 64)
		return add_to_chain(tdb, h->group_start, new_off);

	/* We're full.  Expand. */
	ecode = expand_group(tdb, h);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	if (is_subhash(h->group[h->home_bucket])) {
		/* We were expanded! */
		tdb_off_t hashtable;
		unsigned int gnum;

		/* Write back the modified group. */
		ecode = tdb_write_convert(tdb, h->group_start, h->group,
					  sizeof(h->group));
		if (ecode != TDB_SUCCESS) {
			return ecode;
		}

		/* Move hashinfo down a level. */
		hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
			+ sizeof(struct tdb_used_record);
		gnum = use_bits(h, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
		h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
		h->group_start = hashtable
			+ gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
		ecode = tdb_read_convert(tdb, h->group_start, &h->group,
					 sizeof(h->group));
		if (ecode != TDB_SUCCESS) {
			return ecode;
		}
	}

	/* Expanding the group must have made room if it didn't choose this
	 * bucket. */
	if (put_into_group(h->group, h->home_bucket, encode_offset(new_off, h))) {
		return tdb_write_convert(tdb, h->group_start,
					 h->group, sizeof(h->group));
	}

	/* This can happen if all hashes in group (and us) dropped into same
	 * group in subhash. */
	return add_to_hash(tdb, h, new_off);
}

/* Traverse support: returns offset of record, or 0 or -ve error. */
static tdb_off_t iterate_hash(struct tdb_context *tdb,
			      struct traverse_info *tinfo)
{
	tdb_off_t off, val, i;
	struct traverse_level *tlevel;

	tlevel = &tinfo->levels[tinfo->num_levels-1];

again:
	for (i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
				      tlevel->entry, tlevel->total_buckets);
	     i != tlevel->total_buckets;
	     i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
				      i+1, tlevel->total_buckets)) {
		if (TDB_OFF_IS_ERR(i)) {
			return i;
		}

		val = tdb_read_off(tdb, tlevel->hashtable+sizeof(tdb_off_t)*i);
		if (TDB_OFF_IS_ERR(val)) {
			return val;
		}

		off = val & TDB_OFF_MASK;

		/* This makes the delete-all-in-traverse case work
		 * (and simplifies our logic a little). */
		if (off == tinfo->prev)
			continue;

		tlevel->entry = i;

		if (!is_subhash(val)) {
			/* Found one. */
			tinfo->prev = off;
			return off;
		}

		/* When we come back, we want the next one */
		tlevel->entry++;
		tinfo->num_levels++;
		tlevel++;
		tlevel->hashtable = off + sizeof(struct tdb_used_record);
		tlevel->entry = 0;
		/* Next level is a chain? */
		if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
			tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
		else
			tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
		goto again;
	}

	/* Nothing there? */
	if (tinfo->num_levels == 1)
		return 0;

	/* Handle chained entries. */
	if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
		tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
						 + offsetof(struct tdb_chain,
							    next));
		if (TDB_OFF_IS_ERR(tlevel->hashtable)) {
			return tlevel->hashtable;
		}
		if (tlevel->hashtable) {
			tlevel->hashtable += sizeof(struct tdb_used_record);
			tlevel->entry = 0;
			goto again;
		}
	}

	/* Go back up and keep searching. */
	tinfo->num_levels--;
	tlevel--;
	goto again;
}

/* Return success if we find something, TDB_ERR_NOEXIST if none. */
enum TDB_ERROR next_in_hash(struct tdb_context *tdb,
			    struct traverse_info *tinfo,
			    TDB_DATA *kbuf, size_t *dlen)
{
	const unsigned group_bits = TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS;
	tdb_off_t hl_start, hl_range, off;
	enum TDB_ERROR ecode;

	while (tinfo->toplevel_group < (1 << group_bits)) {
		hl_start = (tdb_off_t)tinfo->toplevel_group
			<< (64 - group_bits);
		hl_range = 1ULL << group_bits;
		ecode = tdb_lock_hashes(tdb, hl_start, hl_range, F_RDLCK,
					TDB_LOCK_WAIT);
		if (ecode != TDB_SUCCESS) {
			return ecode;
		}

		off = iterate_hash(tdb, tinfo);
		if (off) {
			struct tdb_used_record rec;

			if (TDB_OFF_IS_ERR(off)) {
				ecode = off;
				goto fail;
			}

			ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
			if (ecode != TDB_SUCCESS) {
				goto fail;
			}
			if (rec_magic(&rec) != TDB_USED_MAGIC) {
				ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
						   TDB_LOG_ERROR,
						   "next_in_hash:"
						   " corrupt record at %llu",
						   (long long)off);
				goto fail;
			}

			kbuf->dsize = rec_key_length(&rec);

			/* They want data as well? */
			if (dlen) {
				*dlen = rec_data_length(&rec);
				kbuf->dptr = tdb_alloc_read(tdb,
							    off + sizeof(rec),
							    kbuf->dsize
							    + *dlen);
			} else {
				kbuf->dptr = tdb_alloc_read(tdb,
							    off + sizeof(rec),
							    kbuf->dsize);
			}
			tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
			if (TDB_PTR_IS_ERR(kbuf->dptr)) {
				return TDB_PTR_ERR(kbuf->dptr);
			}
			return TDB_SUCCESS;
		}

		tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);

		tinfo->toplevel_group++;
		tinfo->levels[0].hashtable
			+= (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
		tinfo->levels[0].entry = 0;
	}
	return TDB_ERR_NOEXIST;

fail:
	tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
	return ecode;
}

enum TDB_ERROR first_in_hash(struct tdb_context *tdb,
			     struct traverse_info *tinfo,
			     TDB_DATA *kbuf, size_t *dlen)
{
	tinfo->prev = 0;
	tinfo->toplevel_group = 0;
	tinfo->num_levels = 1;
	tinfo->levels[0].hashtable = offsetof(struct tdb_header, hashtable);
	tinfo->levels[0].entry = 0;
	tinfo->levels[0].total_buckets = (1 << TDB_HASH_GROUP_BITS);

	return next_in_hash(tdb, tinfo, kbuf, dlen);
}

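#if 0
/* Illustrative sketch (not compiled): walking every key with the
 * first_in_hash()/next_in_hash() pair, roughly as the public traverse
 * code drives them.  Error returns other than TDB_ERR_NOEXIST are
 * ignored here; kbuf.dptr is assumed to be malloc-ed by
 * tdb_alloc_read(), matching the free() convention of the public API. */
static void example_traverse(struct tdb_context *tdb)
{
	struct traverse_info tinfo;
	TDB_DATA kbuf;
	enum TDB_ERROR ecode;

	for (ecode = first_in_hash(tdb, &tinfo, &kbuf, NULL);
	     ecode == TDB_SUCCESS;
	     ecode = next_in_hash(tdb, &tinfo, &kbuf, NULL)) {
		/* kbuf now holds a copy of the next key. */
		free(kbuf.dptr);
	}
	/* ecode is TDB_ERR_NOEXIST once the table is exhausted. */
}
#endif
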
/* Even if the entry isn't in this hash bucket, you'd have to lock this
 * bucket to find it. */
static enum TDB_ERROR chainlock(struct tdb_context *tdb, const TDB_DATA *key,
				int ltype, enum tdb_lock_flags waitflag,
				const char *func)
{
	enum TDB_ERROR ecode;
	uint64_t h = tdb_hash(tdb, key->dptr, key->dsize);
	tdb_off_t lockstart, locksize;
	unsigned int group, gbits;

	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
	group = bits_from(h, 64 - gbits, gbits);

	lockstart = hlock_range(group, &locksize);

	ecode = tdb_lock_hashes(tdb, lockstart, locksize, ltype, waitflag);
	tdb_trace_1rec(tdb, func, *key);
	return ecode;
}

/* lock/unlock one hash chain. This is meant to be used to reduce
   contention - it cannot guarantee how many records will be locked */
enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
{
	return tdb->last_error = chainlock(tdb, &key, F_WRLCK, TDB_LOCK_WAIT,
					   "tdb_chainlock");
}

*tdb
, TDB_DATA key
)
849 uint64_t h
= tdb_hash(tdb
, key
.dptr
, key
.dsize
);
850 tdb_off_t lockstart
, locksize
;
851 unsigned int group
, gbits
;
853 gbits
= TDB_TOPLEVEL_HASH_BITS
- TDB_HASH_GROUP_BITS
;
854 group
= bits_from(h
, 64 - gbits
, gbits
);
856 lockstart
= hlock_range(group
, &locksize
);
858 tdb_trace_1rec(tdb
, "tdb_chainunlock", key
);
859 tdb_unlock_hashes(tdb
, lockstart
, locksize
, F_WRLCK
);
enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
{
	return tdb->last_error = chainlock(tdb, &key, F_RDLCK, TDB_LOCK_WAIT,
					   "tdb_chainlock_read");
}

void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
{
	uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
	tdb_off_t lockstart, locksize;
	unsigned int group, gbits;

	gbits = TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
	group = bits_from(h, 64 - gbits, gbits);

	lockstart = hlock_range(group, &locksize);

	tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
	tdb_unlock_hashes(tdb, lockstart, locksize, F_RDLCK);
}

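#if 0
/* Illustrative sketch (not compiled): the intended calling pattern for
 * the chainlock API, bracketing a read-modify-write of one key.  Note
 * that the lock covers a whole toplevel hash range, so unrelated keys
 * may be serialized behind it too. */
static void example_chainlock(struct tdb_context *tdb, TDB_DATA key)
{
	if (tdb_chainlock(tdb, key) == TDB_SUCCESS) {
		/* ... fetch key, compute a new value, store it ... */
		tdb_chainunlock(tdb, key);
	}
}
#endif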