/*
   Trivial Database 2: free list/block handling
   Copyright (C) Rusty Russell 2010

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "private.h"
#include <ccan/likely/likely.h>
#include <ccan/ilog/ilog.h>
static unsigned fls64(uint64_t val)
{
	return ilog64(val);
}
/* In which bucket would we find a particular record size? (ignoring header) */
unsigned int size_to_bucket(tdb_len_t data_len)
{
	unsigned int bucket;

	/* We can't have records smaller than this. */
	assert(data_len >= TDB_MIN_DATA_LEN);

	/* Ignoring the header... */
	if (data_len - TDB_MIN_DATA_LEN <= 64) {
		/* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
		bucket = (data_len - TDB_MIN_DATA_LEN) / 8;
	} else {
		/* After that we go power of 2. */
		bucket = fls64(data_len - TDB_MIN_DATA_LEN) + 2;
	}

	if (unlikely(bucket >= TDB_FREE_BUCKETS))
		bucket = TDB_FREE_BUCKETS - 1;
	return bucket;
}
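/*
 * Worked example of the bucket math (arithmetic only, nothing new):
 * data_len == TDB_MIN_DATA_LEN + 16 lands in bucket 2 (16 / 8), and
 * data_len == TDB_MIN_DATA_LEN + 64 in bucket 8.  Above that,
 * data_len == TDB_MIN_DATA_LEN + 100 gives fls64(100) + 2 == 7 + 2 == 9:
 * deltas of 65..127 share bucket 9, 128..255 share bucket 10, doubling
 * until everything oversized is clamped into bucket TDB_FREE_BUCKETS - 1.
 */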
tdb_off_t first_ftable(struct tdb_context *tdb)
{
	return tdb_read_off(tdb, offsetof(struct tdb_header, free_table));
}
tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
{
	return tdb_read_off(tdb, ftable + offsetof(struct tdb_freetable, next));
}
enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb)
{
	/* Use reservoir sampling algorithm to select a free list at random. */
	unsigned int rnd, max = 0, count = 0;
	tdb_off_t off;

	tdb->tdb2.ftable_off = off = first_ftable(tdb);
	tdb->tdb2.ftable = 0;

	while (off) {
		if (TDB_OFF_IS_ERR(off)) {
			return TDB_OFF_TO_ERR(off);
		}

		rnd = random();
		if (rnd >= max) {
			tdb->tdb2.ftable_off = off;
			tdb->tdb2.ftable = count;
			max = rnd;
		}

		off = next_ftable(tdb, off);
		count++;
	}
	return TDB_SUCCESS;
}
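/*
 * Sketch of why the sampling is fair (assuming the random() draws are
 * i.i.d.): each table's draw is equally likely to be the running maximum,
 * so with N tables each is selected with probability about 1/N, without
 * knowing N in advance; later tables win ties.
 */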
/* Offset of a given bucket. */
tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
{
	return ftable_off + offsetof(struct tdb_freetable, buckets)
		+ bucket * sizeof(tdb_off_t);
}
/* Returns free_buckets + 1, or list number to search, or -ve error. */
static tdb_off_t find_free_head(struct tdb_context *tdb,
				tdb_off_t ftable_off,
				tdb_off_t bucket)
{
	/* Speculatively search for a non-zero bucket. */
	return tdb_find_nonzero_off(tdb, bucket_off(ftable_off, 0),
				    bucket, TDB_FREE_BUCKETS);
}
static void check_list(struct tdb_context *tdb, tdb_off_t b_off)
{
#ifdef CCAN_TDB2_DEBUG
	tdb_off_t off, prev = 0, first;
	struct tdb_free_record r;

	first = off = (tdb_read_off(tdb, b_off) & TDB_OFF_MASK);
	while (off != 0) {
		tdb_read_convert(tdb, off, &r, sizeof(r));
		if (frec_magic(&r) != TDB_FREE_MAGIC)
			abort();
		if (prev && frec_prev(&r) != prev)
			abort();
		prev = off;
		off = r.next;
	}

	if (first) {
		tdb_read_convert(tdb, first, &r, sizeof(r));
		if (frec_prev(&r) != prev)
			abort();
	}
#endif
}
/* Remove from free bucket. */
static enum TDB_ERROR remove_from_list(struct tdb_context *tdb,
				       tdb_off_t b_off, tdb_off_t r_off,
				       const struct tdb_free_record *r)
{
	tdb_off_t off, prev_next, head;
	enum TDB_ERROR ecode;

	/* Is this the only element in the list?  Zero out the bucket, and
	 * we're done. */
	if (frec_prev(r) == r_off)
		return tdb_write_off(tdb, b_off, 0);

	/* off = &r->prev->next */
	off = frec_prev(r) + offsetof(struct tdb_free_record, next);

	/* Get prev->next. */
	prev_next = tdb_read_off(tdb, off);
	if (TDB_OFF_IS_ERR(prev_next))
		return TDB_OFF_TO_ERR(prev_next);

	/* If prev->next == 0, we were head: update bucket to point to next. */
	if (prev_next == 0) {
		/* We must preserve upper bits. */
		head = tdb_read_off(tdb, b_off);
		if (TDB_OFF_IS_ERR(head))
			return TDB_OFF_TO_ERR(head);

		if ((head & TDB_OFF_MASK) != r_off) {
			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					  "remove_from_list:"
					  " %llu head %llu on list %llu",
					  (long long)r_off, (long long)head,
					  (long long)b_off);
		}
		head = ((head & ~TDB_OFF_MASK) | r->next);
		ecode = tdb_write_off(tdb, b_off, head);
		if (ecode != TDB_SUCCESS)
			return ecode;
	} else {
		/* r->prev->next = r->next */
		ecode = tdb_write_off(tdb, off, r->next);
		if (ecode != TDB_SUCCESS)
			return ecode;
	}

	/* If we were the tail, off = &head->prev. */
	if (r->next == 0) {
		head = tdb_read_off(tdb, b_off);
		if (TDB_OFF_IS_ERR(head))
			return TDB_OFF_TO_ERR(head);
		head &= TDB_OFF_MASK;
		off = head + offsetof(struct tdb_free_record, magic_and_prev);
	} else {
		/* off = &r->next->prev */
		off = r->next + offsetof(struct tdb_free_record,
					 magic_and_prev);
	}

#ifdef CCAN_TDB2_DEBUG
	/* *off should point back at r. */
	if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) {
		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
				  "remove_from_list:"
				  " %llu bad prev in list %llu",
				  (long long)r_off, (long long)b_off);
	}
#endif
	/* r->next->prev = r->prev */
	return tdb_write_off(tdb, off, r->magic_and_prev);
}
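/*
 * The list shape remove_from_list() and enqueue_in_free() maintain,
 * sketched for a three-element bucket (A is head, C is tail):
 *
 *	bucket -> A -> B -> C -> 0             (next chain, 0-terminated)
 *	A.prev == C, B.prev == A, C.prev == B  (prev chain, circular)
 *
 * A sole element's prev is its own offset, which is why the
 * frec_prev(r) == r_off test above detects a single-entry list, and
 * prev_next == 0 means r's prev was the tail, i.e. r was the head.
 */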
/* Enqueue in this free bucket: sets coalesce if we've added 128
 * entries to it. */
static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb,
				      tdb_off_t b_off,
				      tdb_off_t off,
				      tdb_len_t len,
				      bool *coalesce)
{
	struct tdb_free_record new;
	enum TDB_ERROR ecode;
	tdb_off_t prev, head;
	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));

	head = tdb_read_off(tdb, b_off);
	if (TDB_OFF_IS_ERR(head))
		return TDB_OFF_TO_ERR(head);

	/* Store the free table index in the top bits of the length. */
	new.ftable_and_len = ((uint64_t)tdb->tdb2.ftable
			      << (64 - TDB_OFF_UPPER_STEAL))
		| len;

	/* new->next = head. */
	new.next = (head & TDB_OFF_MASK);

	/* First element?  Prev points to ourselves. */
	if (!new.next) {
		new.magic_and_prev = (magic | off);
	} else {
		/* new->prev = next->prev */
		prev = tdb_read_off(tdb,
				    new.next + offsetof(struct tdb_free_record,
							magic_and_prev));
		new.magic_and_prev = prev;
		if (frec_magic(&new) != TDB_FREE_MAGIC) {
			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					  "enqueue_in_free: %llu bad head"
					  " prev %llu",
					  (long long)new.next,
					  (long long)prev);
		}
		/* next->prev = new. */
		ecode = tdb_write_off(tdb, new.next
				      + offsetof(struct tdb_free_record,
						 magic_and_prev),
				      off | magic);
		if (ecode != TDB_SUCCESS) {
			return ecode;
		}

#ifdef CCAN_TDB2_DEBUG
		prev = tdb_read_off(tdb, frec_prev(&new)
				    + offsetof(struct tdb_free_record, next));
		if (prev != 0) {
			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					  "enqueue_in_free:"
					  " %llu bad tail next ptr %llu",
					  (long long)frec_prev(&new)
					  + offsetof(struct tdb_free_record,
						     next),
					  (long long)prev);
		}
#endif
	}

	/* Update enqueue count, but don't set high bit: see TDB_OFF_IS_ERR */
	if (*coalesce)
		head += (1ULL << (64 - TDB_OFF_UPPER_STEAL));
	head &= ~(TDB_OFF_MASK | (1ULL << 63));
	head |= off;

	ecode = tdb_write_off(tdb, b_off, head);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	/* It's time to coalesce if the counter wrapped. */
	if (*coalesce)
		*coalesce = ((head & ~TDB_OFF_MASK) == 0);

	return tdb_write_convert(tdb, off, &new, sizeof(new));
}
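/*
 * The bucket head word does double duty: the low TDB_OFF_MASK bits hold
 * the offset of the first free record, and the stolen top bits (excluding
 * bit 63, which is kept clear so a valid head never looks like an error
 * offset to TDB_OFF_IS_ERR) count enqueues.  Assuming TDB_OFF_UPPER_STEAL
 * is 8, as the "128 entries" comment implies, that leaves a 7-bit counter
 * which wraps to zero after 128 enqueues; the wrap is what sets *coalesce
 * above.
 */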
static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
{
	tdb_off_t off;
	unsigned int i;

	if (likely(tdb->tdb2.ftable == ftable))
		return tdb->tdb2.ftable_off;

	off = first_ftable(tdb);
	for (i = 0; i < ftable; i++) {
		if (TDB_OFF_IS_ERR(off)) {
			break;
		}
		off = next_ftable(tdb, off);
	}
	return off;
}
/* Note: we unlock the current bucket if we fail (-ve), or if we coalesce
 * (+ve) and need to blatt the *protect record (which is set to an error). */
static tdb_len_t coalesce(struct tdb_context *tdb,
			  tdb_off_t off, tdb_off_t b_off,
			  tdb_len_t data_len,
			  tdb_off_t *protect)
{
	tdb_off_t end;
	struct tdb_free_record rec;
	enum TDB_ERROR ecode;

	tdb->stats.alloc_coalesce_tried++;
	end = off + sizeof(struct tdb_used_record) + data_len;

	while (end < tdb->file->map_size) {
		const struct tdb_free_record *r;
		tdb_off_t nb_off;
		unsigned ftable, bucket;

		r = tdb_access_read(tdb, end, sizeof(*r), true);
		if (TDB_PTR_IS_ERR(r)) {
			ecode = TDB_PTR_ERR(r);
			goto err;
		}

		if (frec_magic(r) != TDB_FREE_MAGIC
		    || frec_ftable(r) == TDB_FTABLE_NONE) {
			tdb_access_release(tdb, r);
			break;
		}

		ftable = frec_ftable(r);
		bucket = size_to_bucket(frec_len(r));
		nb_off = ftable_offset(tdb, ftable);
		if (TDB_OFF_IS_ERR(nb_off)) {
			tdb_access_release(tdb, r);
			ecode = TDB_OFF_TO_ERR(nb_off);
			goto err;
		}
		nb_off = bucket_off(nb_off, bucket);
		tdb_access_release(tdb, r);

		/* We may be violating lock order here, so best effort. */
		if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT)
		    != TDB_SUCCESS) {
			tdb->stats.alloc_coalesce_lockfail++;
			break;
		}

		/* Now we have the lock, re-check. */
		ecode = tdb_read_convert(tdb, end, &rec, sizeof(rec));
		if (ecode != TDB_SUCCESS) {
			tdb_unlock_free_bucket(tdb, nb_off);
			goto err;
		}
		if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
			tdb->stats.alloc_coalesce_race++;
			tdb_unlock_free_bucket(tdb, nb_off);
			break;
		}

		if (unlikely(frec_ftable(&rec) != ftable)
		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
			tdb->stats.alloc_coalesce_race++;
			tdb_unlock_free_bucket(tdb, nb_off);
			break;
		}

		/* Did we just mess up a record you were hoping to use? */
		if (end == *protect) {
			tdb->stats.alloc_coalesce_iterate_clash++;
			*protect = TDB_ERR_TO_OFF(TDB_ERR_NOEXIST);
		}

		ecode = remove_from_list(tdb, nb_off, end, &rec);
		check_list(tdb, nb_off);
		if (ecode != TDB_SUCCESS) {
			tdb_unlock_free_bucket(tdb, nb_off);
			goto err;
		}

		end += sizeof(struct tdb_used_record) + frec_len(&rec);
		tdb_unlock_free_bucket(tdb, nb_off);
		tdb->stats.alloc_coalesce_num_merged++;
	}

	/* Didn't find any adjacent free records? */
	if (end == off + sizeof(struct tdb_used_record) + data_len)
		return 0;

	/* Before we expand the initial record, check it isn't one you wanted
	 * protected. */
	if (off == *protect) {
		*protect = TDB_ERR_TO_OFF(TDB_ERR_EXISTS);
		tdb->stats.alloc_coalesce_iterate_clash++;
	}

	/* OK, expand the initial record. */
	ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
	if (ecode != TDB_SUCCESS) {
		goto err;
	}

	if (frec_len(&rec) != data_len) {
		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
				   "coalesce: expected data len %zu not %zu",
				   (size_t)data_len, (size_t)frec_len(&rec));
		goto err;
	}

	ecode = remove_from_list(tdb, b_off, off, &rec);
	check_list(tdb, b_off);
	if (ecode != TDB_SUCCESS) {
		goto err;
	}

	/* Try a locking violation first.  We don't allow coalesce recursion! */
	ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false);
	if (ecode != TDB_SUCCESS) {
		/* Need to drop the lock.  Can't rely on anything stable. */
		tdb->stats.alloc_coalesce_lockfail++;
		*protect = TDB_ERR_TO_OFF(TDB_ERR_CORRUPT);

		/* We have to drop this to avoid deadlocks, so make sure the
		 * record doesn't get coalesced by someone else! */
		rec.ftable_and_len = (TDB_FTABLE_NONE
				      << (64 - TDB_OFF_UPPER_STEAL))
			| (end - off - sizeof(struct tdb_used_record));
		ecode = tdb_write_off(tdb,
				      off + offsetof(struct tdb_free_record,
						     ftable_and_len),
				      rec.ftable_and_len);
		if (ecode != TDB_SUCCESS) {
			goto err;
		}

		tdb_unlock_free_bucket(tdb, b_off);

		ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT,
					false);
		if (ecode != TDB_SUCCESS) {
			return TDB_ERR_TO_OFF(ecode);
		}
	} else if (TDB_OFF_IS_ERR(*protect)) {
		/* For simplicity, we always drop the lock if they can't
		 * continue. */
		tdb_unlock_free_bucket(tdb, b_off);
	}

	tdb->stats.alloc_coalesce_succeeded++;

	/* Return the usable length. */
	return end - off - sizeof(struct tdb_used_record);

err:
	/* To unify error paths, we *always* unlock the bucket on error. */
	tdb_unlock_free_bucket(tdb, b_off);
	return TDB_ERR_TO_OFF(ecode);
}
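/*
 * Illustrative layout of a successful pass (sketch only): coalesce() is
 * handed the free record at "off", whose bucket is already locked, and
 * walks rightwards over adjacent free neighbours:
 *
 *	 off                                end
 *	  v                                  v
 *	[hdr|free][hdr|free][hdr|free][hdr|used]...
 *	 \_ kept _/\_____ eaten _____/ stops here
 *
 * Each neighbour is unlinked under its own bucket lock, then the single
 * enlarged region is re-added with add_free_record().
 */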
/* List is locked: we unlock it. */
static enum TDB_ERROR coalesce_list(struct tdb_context *tdb,
				    tdb_off_t ftable_off,
				    tdb_off_t b_off,
				    unsigned int limit)
{
	enum TDB_ERROR ecode;
	tdb_off_t off;

	off = tdb_read_off(tdb, b_off);
	if (TDB_OFF_IS_ERR(off)) {
		ecode = TDB_OFF_TO_ERR(off);
		goto unlock_err;
	}
	/* A little bit of paranoia: the counter should be 0. */
	off &= TDB_OFF_MASK;

	while (off && limit--) {
		struct tdb_free_record rec;
		tdb_len_t coal;
		tdb_off_t next;

		ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		next = rec.next;
		coal = coalesce(tdb, off, b_off, frec_len(&rec), &next);
		if (TDB_OFF_IS_ERR(coal)) {
			/* This has already unlocked on error. */
			return TDB_OFF_TO_ERR(coal);
		}
		if (TDB_OFF_IS_ERR(next)) {
			/* Coalescing had to unlock, so stop. */
			return TDB_SUCCESS;
		}
		/* Keep going if we're doing well... */
		limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN);
		off = next;
	}

	/* Now, move those elements to the tail of the list so we get something
	 * else to coalesce next time. */
	if (off) {
		struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec;
		tdb_off_t oldhoff, oldtoff, newtoff;

		/* The record we were up to is the new head. */
		ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		/* Get the new tail. */
		newtoff = frec_prev(&newhrec);
		ecode = tdb_read_convert(tdb, newtoff, &newtrec,
					 sizeof(newtrec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		/* Get the old head. */
		oldhoff = tdb_read_off(tdb, b_off);
		if (TDB_OFF_IS_ERR(oldhoff)) {
			ecode = TDB_OFF_TO_ERR(oldhoff);
			goto unlock_err;
		}

		/* This could happen if they all coalesced away. */
		if (oldhoff == off)
			goto out;

		ecode = tdb_read_convert(tdb, oldhoff, &oldhrec,
					 sizeof(oldhrec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		/* Get the old tail. */
		oldtoff = frec_prev(&oldhrec);
		ecode = tdb_read_convert(tdb, oldtoff, &oldtrec,
					 sizeof(oldtrec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		/* Old tail's next points to old head. */
		oldtrec.next = oldhoff;

		/* Old head's prev points to old tail. */
		oldhrec.magic_and_prev
			= (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL))
			| oldtoff;

		/* New tail's next is 0. */
		newtrec.next = 0;

		/* Write out the modified versions. */
		ecode = tdb_write_convert(tdb, oldtoff, &oldtrec,
					  sizeof(oldtrec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		ecode = tdb_write_convert(tdb, oldhoff, &oldhrec,
					  sizeof(oldhrec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		ecode = tdb_write_convert(tdb, newtoff, &newtrec,
					  sizeof(newtrec));
		if (ecode != TDB_SUCCESS)
			goto unlock_err;

		/* And finally link in the new head. */
		ecode = tdb_write_off(tdb, b_off, off);
		if (ecode != TDB_SUCCESS)
			goto unlock_err;
	}
out:
	tdb_unlock_free_bucket(tdb, b_off);
	return TDB_SUCCESS;

unlock_err:
	tdb_unlock_free_bucket(tdb, b_off);
	return ecode;
}
/* List must not be locked if coalesce_ok is set. */
enum TDB_ERROR add_free_record(struct tdb_context *tdb,
			       tdb_off_t off, tdb_len_t len_with_header,
			       enum tdb_lock_flags waitflag,
			       bool coalesce_ok)
{
	tdb_off_t b_off;
	tdb_len_t len;
	enum TDB_ERROR ecode;

	assert(len_with_header >= sizeof(struct tdb_free_record));

	len = len_with_header - sizeof(struct tdb_used_record);

	b_off = bucket_off(tdb->tdb2.ftable_off, size_to_bucket(len));
	ecode = tdb_lock_free_bucket(tdb, b_off, waitflag);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	ecode = enqueue_in_free(tdb, b_off, off, len, &coalesce_ok);
	check_list(tdb, b_off);

	/* Coalescing unlocks the free list. */
	if (!ecode && coalesce_ok)
		ecode = coalesce_list(tdb, tdb->tdb2.ftable_off, b_off, 2);
	else
		tdb_unlock_free_bucket(tdb, b_off);
	return ecode;
}
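/*
 * Hypothetical caller sketch (illustrative; the real delete path lives
 * elsewhere): releasing a whole used record means handing back its full
 * extent, header included, e.g.
 *
 *	ecode = add_free_record(tdb, off,
 *				sizeof(struct tdb_used_record)
 *				+ rec_key_length(&rec)
 *				+ rec_data_length(&rec)
 *				+ rec_extra_padding(&rec),
 *				TDB_LOCK_WAIT, true);
 */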
static size_t adjust_size(size_t keylen, size_t datalen)
{
	size_t size = keylen + datalen;

	if (size < TDB_MIN_DATA_LEN)
		size = TDB_MIN_DATA_LEN;

	/* Round to next uint64_t boundary. */
	return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
}
/* If we have enough left over to be useful, split that off. */
static size_t record_leftover(size_t keylen, size_t datalen,
			      bool want_extra, size_t total_len)
{
	ssize_t leftover;

	if (want_extra)
		datalen += datalen / 2;
	leftover = total_len - adjust_size(keylen, datalen);

	if (leftover < (ssize_t)sizeof(struct tdb_free_record))
		return 0;

	return leftover;
}
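/*
 * For example (sizes illustrative): keylen 8, datalen 16, want_extra true
 * and a 100-byte chunk: datalen is padded to 24, and assuming
 * adjust_size(8, 24) == 32 clears TDB_MIN_DATA_LEN, 68 bytes remain.
 * That is returned only because it can hold a struct tdb_free_record of
 * its own; a smaller sliver stays attached as padding.
 */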
/* We need size bytes to put our key and data in. */
static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
				tdb_off_t ftable_off,
				tdb_off_t bucket,
				size_t keylen, size_t datalen,
				bool want_extra,
				unsigned magic,
				unsigned hashlow)
{
	tdb_off_t off, b_off, best_off;
	struct tdb_free_record best = { 0 };
	double multiplier;
	size_t size = adjust_size(keylen, datalen);
	enum TDB_ERROR ecode;

	tdb->stats.allocs++;
	b_off = bucket_off(ftable_off, bucket);

	/* FIXME: Try non-blocking wait first, to measure contention. */
	/* Lock this bucket. */
	ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT);
	if (ecode != TDB_SUCCESS) {
		return TDB_ERR_TO_OFF(ecode);
	}

	best.ftable_and_len = -1ULL;
	best_off = 0;

	/* Get slack if we're after extra. */
	if (want_extra)
		multiplier = 1.5;
	else
		multiplier = 1.0;

	/* Walk the list to see if any are large enough, getting less fussy
	 * as we go. */
	off = tdb_read_off(tdb, b_off);
	if (TDB_OFF_IS_ERR(off)) {
		ecode = TDB_OFF_TO_ERR(off);
		goto unlock_err;
	}
	off &= TDB_OFF_MASK;

	while (off) {
		const struct tdb_free_record *r;
		tdb_off_t next;

		r = tdb_access_read(tdb, off, sizeof(*r), true);
		if (TDB_PTR_IS_ERR(r)) {
			ecode = TDB_PTR_ERR(r);
			goto unlock_err;
		}

		if (frec_magic(r) != TDB_FREE_MAGIC) {
			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					   "lock_and_alloc:"
					   " %llu non-free 0x%llx",
					   (long long)off,
					   (long long)r->magic_and_prev);
			tdb_access_release(tdb, r);
			goto unlock_err;
		}

		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
			best_off = off;
			best = *r;
		}

		if (frec_len(&best) <= size * multiplier && best_off) {
			tdb_access_release(tdb, r);
			break;
		}

		multiplier *= 1.01;

		next = r->next;
		tdb_access_release(tdb, r);
		off = next;
	}

	/* If we found anything at all, use it. */
	if (best_off) {
		struct tdb_used_record rec;
		size_t leftover;

		/* We're happy with this size: take it. */
		ecode = remove_from_list(tdb, b_off, best_off, &best);
		check_list(tdb, b_off);
		if (ecode != TDB_SUCCESS) {
			goto unlock_err;
		}

		leftover = record_leftover(keylen, datalen, want_extra,
					   frec_len(&best));

		assert(keylen + datalen + leftover <= frec_len(&best));
		/* We need to mark this non-free before we drop the lock,
		 * otherwise coalesce() could try to merge it! */
		ecode = set_header(tdb, &rec, magic, keylen, datalen,
				   frec_len(&best) - leftover, hashlow);
		if (ecode != TDB_SUCCESS) {
			goto unlock_err;
		}

		ecode = tdb_write_convert(tdb, best_off, &rec, sizeof(rec));
		if (ecode != TDB_SUCCESS) {
			goto unlock_err;
		}

		/* For futureproofing, we put a 0 in any unused space. */
		if (rec_extra_padding(&rec)) {
			ecode = tdb->tdb2.io->twrite(tdb, best_off + sizeof(rec)
						     + keylen + datalen, "", 1);
			if (ecode != TDB_SUCCESS) {
				goto unlock_err;
			}
		}

		/* Bucket of leftover will be <= current bucket, so nested
		 * locking is allowed. */
		if (leftover) {
			tdb->stats.alloc_leftover++;
			ecode = add_free_record(tdb,
						best_off + sizeof(rec)
						+ frec_len(&best) - leftover,
						leftover, TDB_LOCK_WAIT, false);
			if (ecode != TDB_SUCCESS) {
				best_off = TDB_ERR_TO_OFF(ecode);
			}
		}
		tdb_unlock_free_bucket(tdb, b_off);

		return best_off;
	}

	tdb_unlock_free_bucket(tdb, b_off);
	return 0;

unlock_err:
	tdb_unlock_free_bucket(tdb, b_off);
	return TDB_ERR_TO_OFF(ecode);
}
/* Get a free block from current free list, or 0 if none, -ve on error. */
static tdb_off_t get_free(struct tdb_context *tdb,
			  size_t keylen, size_t datalen, bool want_extra,
			  unsigned magic, unsigned hashlow)
{
	tdb_off_t off, ftable_off;
	tdb_off_t start_b, b, ftable;
	bool wrapped = false;

	/* If they are growing, add 50% to get to a higher bucket. */
	if (want_extra)
		start_b = size_to_bucket(adjust_size(keylen,
						     datalen + datalen / 2));
	else
		start_b = size_to_bucket(adjust_size(keylen, datalen));

	ftable_off = tdb->tdb2.ftable_off;
	ftable = tdb->tdb2.ftable;
	while (!wrapped || ftable_off != tdb->tdb2.ftable_off) {
		/* Start at the exact size bucket, and search up... */
		for (b = find_free_head(tdb, ftable_off, start_b);
		     b < TDB_FREE_BUCKETS;
		     b = find_free_head(tdb, ftable_off, b + 1)) {
			/* Try getting one from the list. */
			off = lock_and_alloc(tdb, ftable_off,
					     b, keylen, datalen, want_extra,
					     magic, hashlow);
			if (TDB_OFF_IS_ERR(off))
				return off;
			if (off != 0) {
				if (b == start_b)
					tdb->stats.alloc_bucket_exact++;
				if (b == TDB_FREE_BUCKETS - 1)
					tdb->stats.alloc_bucket_max++;
				/* Worked?  Stay using this list. */
				tdb->tdb2.ftable_off = ftable_off;
				tdb->tdb2.ftable = ftable;
				return off;
			}
			/* Didn't work.  Try the next bucket. */
		}

		if (TDB_OFF_IS_ERR(b)) {
			return b;
		}

		/* Hmm, try the next table. */
		ftable_off = next_ftable(tdb, ftable_off);
		if (TDB_OFF_IS_ERR(ftable_off)) {
			return ftable_off;
		}
		ftable++;

		if (ftable_off == 0) {
			wrapped = true;
			ftable_off = first_ftable(tdb);
			if (TDB_OFF_IS_ERR(ftable_off)) {
				return ftable_off;
			}
			ftable = 0;
		}
	}

	return 0;
}
set_header(struct tdb_context
*tdb
,
854 struct tdb_used_record
*rec
,
855 unsigned magic
, uint64_t keylen
, uint64_t datalen
,
856 uint64_t actuallen
, unsigned hashlow
)
858 uint64_t keybits
= (fls64(keylen
) + 1) / 2;
860 /* Use bottom bits of hash, so it's independent of hash table size. */
861 rec
->magic_and_meta
= (hashlow
& ((1 << 11)-1))
862 | ((actuallen
- (keylen
+ datalen
)) << 11)
864 | ((uint64_t)magic
<< 48);
865 rec
->key_and_data_len
= (keylen
| (datalen
<< (keybits
*2)));
867 /* Encoding can fail on big values. */
868 if (rec_key_length(rec
) != keylen
869 || rec_data_length(rec
) != datalen
870 || rec_extra_padding(rec
) != actuallen
- (keylen
+ datalen
)) {
871 return tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
872 "Could not encode k=%llu,d=%llu,a=%llu",
873 (long long)keylen
, (long long)datalen
,
874 (long long)actuallen
);
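/*
 * Resulting magic_and_meta packing, from bit 0 up: 11 bits of hash,
 * 32 bits of extra-padding length, 5 bits of keybits, then the magic
 * from bit 48.  Worked example: keylen 10 needs fls64(10) == 4 bits,
 * so keybits == (4 + 1) / 2 == 2 and datalen is stored shifted left by
 * keybits * 2 == 4 bits; the decode checks above reject any combination
 * too big to round-trip.
 */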
/* You need 'size'; this tells you how much you should expand by. */
tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size)
{
	tdb_off_t new_size, top_size;

	/* Limit the overshoot, to avoid using up huge amounts of memory for
	 * in-memory tdbs if an oddball huge record creeps in. */
	if (size > 100 * 1024) {
		top_size = map_size + size * 2;
	} else {
		top_size = map_size + size * 100;
	}

	/* Always make room for at least top_size more records, and at
	   least 25% more space if the DB is smaller than 100MiB;
	   otherwise grow it by 10% only. */
	if (map_size > 100 * 1024 * 1024) {
		new_size = map_size * 1.10;
	} else {
		new_size = map_size * 1.25;
	}

	/* Never grow by less than top_size. */
	if (new_size < top_size)
		new_size = top_size;
	return new_size - map_size;
}
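/*
 * Worked example: map_size 1MiB, size 1KiB.  1KiB is under the 100KiB
 * threshold, so top_size = 1MiB + 100KiB; the map is under 100MiB, so
 * new_size = 1.25MiB, which already exceeds top_size, and we grow by
 * 256KiB.  A 200MiB map with the same request would instead grow by
 * 10% (20MiB).
 */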
/* Expand the database. */
static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
{
	uint64_t old_size;
	tdb_len_t wanted;
	enum TDB_ERROR ecode;

	/* Need to hold a hash lock to expand DB: transactions rely on it. */
	if (!(tdb->flags & TDB_NOLOCK)
	    && !tdb->file->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
				  "tdb_expand: must hold lock during expand");
	}

	/* Only one person can expand the file at a time. */
	ecode = tdb_lock_expand(tdb, F_WRLCK);
	if (ecode != TDB_SUCCESS) {
		return ecode;
	}

	/* Someone else may have expanded the file, so retry. */
	old_size = tdb->file->map_size;
	tdb->tdb2.io->oob(tdb, tdb->file->map_size, 1, true);
	if (tdb->file->map_size != old_size) {
		tdb_unlock_expand(tdb, F_WRLCK);
		return TDB_SUCCESS;
	}

	/* Overallocate. */
	wanted = tdb_expand_adjust(old_size, size);
	/* We need room for the record header too. */
	wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted);

	ecode = tdb->tdb2.io->expand_file(tdb, wanted);
	if (ecode != TDB_SUCCESS) {
		tdb_unlock_expand(tdb, F_WRLCK);
		return ecode;
	}

	/* We need to drop this lock before adding the free record. */
	tdb_unlock_expand(tdb, F_WRLCK);

	tdb->stats.expands++;
	return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT, true);
}
/* This won't fail for lack of space: it expands the database if it has to. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
		uint64_t hash, unsigned magic, bool growing)
{
	tdb_off_t off;

	/* We can't hold pointers during this: we could unmap! */
	assert(!tdb->tdb2.direct_access);

	for (;;) {
		enum TDB_ERROR ecode;
		off = get_free(tdb, keylen, datalen, growing, magic, hash);
		if (likely(off != 0))
			break;

		ecode = tdb_expand(tdb, adjust_size(keylen, datalen));
		if (ecode != TDB_SUCCESS) {
			return TDB_ERR_TO_OFF(ecode);
		}
	}

	return off;
}