/*
   Trivial Database 2: free list/block handling
   Copyright (C) Rusty Russell 2010

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "private.h"
#include <ccan/likely/likely.h>
#include <ccan/ilog/ilog.h>

static unsigned fls64(uint64_t val)
{
	return ilog64(val);
}

/* In which bucket would we find a particular record size? (ignoring header) */
unsigned int size_to_bucket(ntdb_len_t data_len)
{
	unsigned int bucket;

	/* We can't have records smaller than this. */
	assert(data_len >= NTDB_MIN_DATA_LEN);

	/* Ignoring the header... */
	if (data_len - NTDB_MIN_DATA_LEN <= 64) {
		/* 0 in bucket 0, 8 in bucket 1... 64 in bucket 8. */
		bucket = (data_len - NTDB_MIN_DATA_LEN) / 8;
	} else {
		/* After that we go power of 2. */
		bucket = fls64(data_len - NTDB_MIN_DATA_LEN) + 2;
	}

	if (unlikely(bucket >= NTDB_FREE_BUCKETS))
		bucket = NTDB_FREE_BUCKETS - 1;
	return bucket;
}
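
/* Illustrative example (not part of the original source): a record of
 * NTDB_MIN_DATA_LEN + 8 bytes lands in bucket 1, while one of
 * NTDB_MIN_DATA_LEN + 100 bytes takes the power-of-two path:
 * fls64(100) + 2 == 7 + 2 == 9.  Any size that maps past the last
 * bucket is clamped into bucket NTDB_FREE_BUCKETS - 1. */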

ntdb_off_t first_ftable(struct ntdb_context *ntdb)
{
	return ntdb_read_off(ntdb, offsetof(struct ntdb_header, free_table));
}

ntdb_off_t next_ftable(struct ntdb_context *ntdb, ntdb_off_t ftable)
{
	return ntdb_read_off(ntdb, ftable + offsetof(struct ntdb_freetable, next));
}

enum NTDB_ERROR ntdb_ftable_init(struct ntdb_context *ntdb)
{
	/* Use reservoir sampling algorithm to select a free list at random. */
	unsigned int rnd, max = 0, count = 0;
	ntdb_off_t off;

	ntdb->ftable_off = off = first_ftable(ntdb);
	ntdb->ftable = 0;

	while (off) {
		if (NTDB_OFF_IS_ERR(off)) {
			return NTDB_OFF_TO_ERR(off);
		}

		rnd = random();
		if (rnd >= max) {
			ntdb->ftable_off = off;
			ntdb->ftable = count;
			max = rnd;
		}

		off = next_ftable(ntdb, off);
		count++;
	}
	return NTDB_SUCCESS;
}
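
/* Illustrative note (not part of the original source): every free table
 * gets an independent uniform random draw and the largest draw seen so
 * far wins, so each table is selected with roughly equal probability.
 * Different openers therefore start on different free tables, which
 * spreads free-list lock contention between writers. */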

/* Offset of a given bucket. */
ntdb_off_t bucket_off(ntdb_off_t ftable_off, unsigned bucket)
{
	return ftable_off + offsetof(struct ntdb_freetable, buckets)
		+ bucket * sizeof(ntdb_off_t);
}

/* Returns free_buckets + 1, or list number to search, or -ve error. */
static ntdb_off_t find_free_head(struct ntdb_context *ntdb,
				 ntdb_off_t ftable_off,
				 ntdb_off_t bucket)
{
	/* Speculatively search for a non-zero bucket. */
	return ntdb_find_nonzero_off(ntdb, bucket_off(ftable_off, 0),
				     bucket, NTDB_FREE_BUCKETS);
}

static void check_list(struct ntdb_context *ntdb, ntdb_off_t b_off)
{
#ifdef CCAN_NTDB_DEBUG
	ntdb_off_t off, prev = 0, first;
	struct ntdb_free_record r;

	first = off = (ntdb_read_off(ntdb, b_off) & NTDB_OFF_MASK);
	while (off != 0) {
		ntdb_read_convert(ntdb, off, &r, sizeof(r));
		if (frec_magic(&r) != NTDB_FREE_MAGIC)
			abort();
		if (prev && frec_prev(&r) != prev)
			abort();
		prev = off;
		off = r.next;
	}

	if (first) {
		ntdb_read_convert(ntdb, first, &r, sizeof(r));
		if (frec_prev(&r) != prev)
			abort();
	}
#endif
}

/* Remove from free bucket. */
static enum NTDB_ERROR remove_from_list(struct ntdb_context *ntdb,
					ntdb_off_t b_off, ntdb_off_t r_off,
					const struct ntdb_free_record *r)
{
	ntdb_off_t off, prev_next, head;
	enum NTDB_ERROR ecode;

	/* Is this only element in list?  Zero out bucket, and we're done. */
	if (frec_prev(r) == r_off)
		return ntdb_write_off(ntdb, b_off, 0);

	/* off = &r->prev->next */
	off = frec_prev(r) + offsetof(struct ntdb_free_record, next);

	/* Get prev->next */
	prev_next = ntdb_read_off(ntdb, off);
	if (NTDB_OFF_IS_ERR(prev_next))
		return NTDB_OFF_TO_ERR(prev_next);

	/* If prev->next == 0, we were head: update bucket to point to next. */
	if (prev_next == 0) {
		/* We must preserve upper bits. */
		head = ntdb_read_off(ntdb, b_off);
		if (NTDB_OFF_IS_ERR(head))
			return NTDB_OFF_TO_ERR(head);

		if ((head & NTDB_OFF_MASK) != r_off) {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					   "remove_from_list:"
					   " %llu head %llu on list %llu",
					   (long long)r_off, (long long)head,
					   (long long)b_off);
		}
		head = ((head & ~NTDB_OFF_MASK) | r->next);
		ecode = ntdb_write_off(ntdb, b_off, head);
		if (ecode != NTDB_SUCCESS)
			return ecode;
	} else {
		/* r->prev->next = r->next */
		ecode = ntdb_write_off(ntdb, off, r->next);
		if (ecode != NTDB_SUCCESS)
			return ecode;
	}

	/* If we were the tail, off = &head->prev. */
	if (r->next == 0) {
		head = ntdb_read_off(ntdb, b_off);
		if (NTDB_OFF_IS_ERR(head))
			return NTDB_OFF_TO_ERR(head);
		head &= NTDB_OFF_MASK;
		off = head + offsetof(struct ntdb_free_record, magic_and_prev);
	} else {
		/* off = &r->next->prev */
		off = r->next + offsetof(struct ntdb_free_record,
					 magic_and_prev);
	}

#ifdef CCAN_NTDB_DEBUG
	/* *off == r */
	if ((ntdb_read_off(ntdb, off) & NTDB_OFF_MASK) != r_off) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "remove_from_list:"
				   " %llu bad prev in list %llu",
				   (long long)r_off, (long long)b_off);
	}
#endif
	/* r->next->prev = r->prev */
	return ntdb_write_off(ntdb, off, r->magic_and_prev);
}
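
/* Illustrative note (not part of the original source): free-list links
 * are asymmetric.  A record's next is 0 at the tail, but the head
 * record's prev points at the tail, so the tail is always reachable in
 * O(1).  That is why a single-element list satisfies
 * frec_prev(r) == r_off, and why reading the head's prev->next gives 0. */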

/* Enqueue in this free bucket: sets coalesce if we've added 128
 * entries to it. */
static enum NTDB_ERROR enqueue_in_free(struct ntdb_context *ntdb,
				       ntdb_off_t b_off,
				       ntdb_off_t off,
				       ntdb_len_t len,
				       bool *coalesce)
{
	struct ntdb_free_record new;
	enum NTDB_ERROR ecode;
	ntdb_off_t prev, head;
	uint64_t magic = (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL));

	head = ntdb_read_off(ntdb, b_off);
	if (NTDB_OFF_IS_ERR(head))
		return NTDB_OFF_TO_ERR(head);

	/* We only need to set ftable_and_len; rest is set in enqueue_in_free */
	new.ftable_and_len = ((uint64_t)ntdb->ftable
			      << (64 - NTDB_OFF_UPPER_STEAL))
		| len;

	/* new->next = head. */
	new.next = (head & NTDB_OFF_MASK);

	/* First element?  Prev points to ourselves. */
	if (!new.next) {
		new.magic_and_prev = (magic | off);
	} else {
		/* new->prev = next->prev */
		prev = ntdb_read_off(ntdb,
				     new.next
				     + offsetof(struct ntdb_free_record,
						magic_and_prev));
		new.magic_and_prev = prev;
		if (frec_magic(&new) != NTDB_FREE_MAGIC) {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					   "enqueue_in_free: %llu bad head"
					   " prev %llu",
					   (long long)new.next,
					   (long long)prev);
		}
		/* next->prev = new. */
		ecode = ntdb_write_off(ntdb, new.next
				       + offsetof(struct ntdb_free_record,
						  magic_and_prev),
				       off | magic);
		if (ecode != NTDB_SUCCESS) {
			return ecode;
		}

#ifdef CCAN_NTDB_DEBUG
		prev = ntdb_read_off(ntdb, frec_prev(&new)
				     + offsetof(struct ntdb_free_record, next));
		if (prev != 0) {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					   "enqueue_in_free:"
					   " %llu bad tail next ptr %llu",
					   (long long)frec_prev(&new)
					   + offsetof(struct ntdb_free_record,
						      next),
					   (long long)prev);
		}
#endif
	}

	/* Update enqueue count, but don't set high bit: see NTDB_OFF_IS_ERR */
	if (*coalesce)
		head += (1ULL << (64 - NTDB_OFF_UPPER_STEAL));
	head &= ~(NTDB_OFF_MASK | (1ULL << 63));
	head |= off;

	ecode = ntdb_write_off(ntdb, b_off, head);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	/* It's time to coalesce if counter wrapped. */
	if (*coalesce)
		*coalesce = ((head & ~NTDB_OFF_MASK) == 0);

	return ntdb_write_convert(ntdb, off, &new, sizeof(new));
}
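
/* Illustrative note (not part of the original source): the bucket head
 * word does double duty.  Its low NTDB_OFF_MASK bits hold the offset of
 * the first free record; the bits above them (bar the topmost, which is
 * kept clear so a valid head never looks like an NTDB_OFF_IS_ERR value)
 * count enqueues.  Once 128 enqueues wrap that counter back to zero,
 * *coalesce is set and add_free_record() runs coalesce_list(). */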

static ntdb_off_t ftable_offset(struct ntdb_context *ntdb, unsigned int ftable)
{
	ntdb_off_t off;
	unsigned int i;

	if (likely(ntdb->ftable == ftable))
		return ntdb->ftable_off;

	off = first_ftable(ntdb);
	for (i = 0; i < ftable; i++) {
		if (NTDB_OFF_IS_ERR(off)) {
			break;
		}
		off = next_ftable(ntdb, off);
	}
	return off;
}

/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
 * need to blatt the *protect record (which is set to an error). */
static ntdb_len_t coalesce(struct ntdb_context *ntdb,
			   ntdb_off_t off, ntdb_off_t b_off,
			   ntdb_len_t data_len,
			   ntdb_off_t *protect)
{
	ntdb_off_t end;
	struct ntdb_free_record rec;
	enum NTDB_ERROR ecode;

	ntdb->stats.alloc_coalesce_tried++;
	end = off + sizeof(struct ntdb_used_record) + data_len;

	while (end < ntdb->file->map_size) {
		const struct ntdb_free_record *r;
		ntdb_off_t nb_off;
		unsigned ftable, bucket;

		r = ntdb_access_read(ntdb, end, sizeof(*r), true);
		if (NTDB_PTR_IS_ERR(r)) {
			ecode = NTDB_PTR_ERR(r);
			goto err;
		}

		if (frec_magic(r) != NTDB_FREE_MAGIC
		    || frec_ftable(r) == NTDB_FTABLE_NONE) {
			ntdb_access_release(ntdb, r);
			break;
		}

		ftable = frec_ftable(r);
		bucket = size_to_bucket(frec_len(r));
		nb_off = ftable_offset(ntdb, ftable);
		if (NTDB_OFF_IS_ERR(nb_off)) {
			ntdb_access_release(ntdb, r);
			ecode = NTDB_OFF_TO_ERR(nb_off);
			goto err;
		}
		nb_off = bucket_off(nb_off, bucket);
		ntdb_access_release(ntdb, r);

		/* We may be violating lock order here, so best effort. */
		if (ntdb_lock_free_bucket(ntdb, nb_off, NTDB_LOCK_NOWAIT)
		    != NTDB_SUCCESS) {
			ntdb->stats.alloc_coalesce_lockfail++;
			break;
		}

		/* Now we have lock, re-check. */
		ecode = ntdb_read_convert(ntdb, end, &rec, sizeof(rec));
		if (ecode != NTDB_SUCCESS) {
			ntdb_unlock_free_bucket(ntdb, nb_off);
			goto err;
		}

		if (unlikely(frec_magic(&rec) != NTDB_FREE_MAGIC)) {
			ntdb->stats.alloc_coalesce_race++;
			ntdb_unlock_free_bucket(ntdb, nb_off);
			break;
		}

		if (unlikely(frec_ftable(&rec) != ftable)
		    || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
			ntdb->stats.alloc_coalesce_race++;
			ntdb_unlock_free_bucket(ntdb, nb_off);
			break;
		}

		/* Did we just mess up a record you were hoping to use? */
		if (end == *protect) {
			ntdb->stats.alloc_coalesce_iterate_clash++;
			*protect = NTDB_ERR_TO_OFF(NTDB_ERR_NOEXIST);
		}

		ecode = remove_from_list(ntdb, nb_off, end, &rec);
		check_list(ntdb, nb_off);
		if (ecode != NTDB_SUCCESS) {
			ntdb_unlock_free_bucket(ntdb, nb_off);
			goto err;
		}

		end += sizeof(struct ntdb_used_record) + frec_len(&rec);
		ntdb_unlock_free_bucket(ntdb, nb_off);
		ntdb->stats.alloc_coalesce_num_merged++;
	}

	/* Didn't find any adjacent free? */
	if (end == off + sizeof(struct ntdb_used_record) + data_len)
		return 0;

	/* Before we expand, check this isn't one you wanted protected? */
	if (off == *protect) {
		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_EXISTS);
		ntdb->stats.alloc_coalesce_iterate_clash++;
	}

	/* OK, expand initial record */
	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
	if (ecode != NTDB_SUCCESS) {
		goto err;
	}

	if (frec_len(&rec) != data_len) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				    "coalesce: expected data len %zu not %zu",
				    (size_t)data_len, (size_t)frec_len(&rec));
		goto err;
	}

	ecode = remove_from_list(ntdb, b_off, off, &rec);
	check_list(ntdb, b_off);
	if (ecode != NTDB_SUCCESS) {
		goto err;
	}

	/* Try locking violation first.  We don't allow coalesce recursion! */
	ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_NOWAIT, false);
	if (ecode != NTDB_SUCCESS) {
		/* Need to drop lock.  Can't rely on anything stable. */
		ntdb->stats.alloc_coalesce_lockfail++;
		*protect = NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT);

		/* We have to drop this to avoid deadlocks, so make sure record
		 * doesn't get coalesced by someone else! */
		rec.ftable_and_len = (NTDB_FTABLE_NONE
				      << (64 - NTDB_OFF_UPPER_STEAL))
			| (end - off - sizeof(struct ntdb_used_record));
		ecode = ntdb_write_off(ntdb,
				       off + offsetof(struct ntdb_free_record,
						      ftable_and_len),
				       rec.ftable_and_len);
		if (ecode != NTDB_SUCCESS) {
			goto err;
		}

		ntdb_unlock_free_bucket(ntdb, b_off);

		ecode = add_free_record(ntdb, off, end - off, NTDB_LOCK_WAIT,
					false);
		if (ecode != NTDB_SUCCESS) {
			return NTDB_ERR_TO_OFF(ecode);
		}
	} else if (NTDB_OFF_IS_ERR(*protect)) {
		/* For simplicity, we always drop lock if they can't continue */
		ntdb_unlock_free_bucket(ntdb, b_off);
	}
	ntdb->stats.alloc_coalesce_succeeded++;

	/* Return usable length. */
	return end - off - sizeof(struct ntdb_used_record);

err:
	/* To unify error paths, we *always* unlock bucket on error. */
	ntdb_unlock_free_bucket(ntdb, b_off);
	return NTDB_ERR_TO_OFF(ecode);
}
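
/* Illustrative note (not part of the original source): free-bucket locks
 * are ordered, but coalesce() must reach into whichever bucket the
 * neighbouring record happens to be filed under.  Taking that lock with
 * NTDB_LOCK_NOWAIT keeps a lock-order violation from deadlocking:
 * failure just bumps alloc_coalesce_lockfail and abandons the merge. */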

/* List is locked: we unlock it. */
static enum NTDB_ERROR coalesce_list(struct ntdb_context *ntdb,
				     ntdb_off_t ftable_off,
				     ntdb_off_t b_off,
				     unsigned int limit)
{
	enum NTDB_ERROR ecode;
	ntdb_off_t off;

	off = ntdb_read_off(ntdb, b_off);
	if (NTDB_OFF_IS_ERR(off)) {
		ecode = NTDB_OFF_TO_ERR(off);
		goto unlock_err;
	}
	/* A little bit of paranoia: counter should be 0. */
	off &= NTDB_OFF_MASK;

	while (off && limit--) {
		struct ntdb_free_record rec;
		ntdb_off_t next;
		ntdb_len_t coal;

		ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		next = rec.next;
		coal = coalesce(ntdb, off, b_off, frec_len(&rec), &next);
		if (NTDB_OFF_IS_ERR(coal)) {
			/* This has already unlocked on error. */
			return NTDB_OFF_TO_ERR(coal);
		}
		if (NTDB_OFF_IS_ERR(next)) {
			/* Coalescing had to unlock, so stop. */
			return NTDB_SUCCESS;
		}
		/* Keep going if we're doing well... */
		limit += size_to_bucket(coal / 16 + NTDB_MIN_DATA_LEN);
		off = next;
	}

	/* Now, move those elements to the tail of the list so we get something
	 * else next time. */
	if (off) {
		struct ntdb_free_record oldhrec, newhrec, oldtrec, newtrec;
		ntdb_off_t oldhoff, oldtoff, newtoff;

		/* The record we were up to is the new head. */
		ecode = ntdb_read_convert(ntdb, off, &newhrec, sizeof(newhrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Get the new tail. */
		newtoff = frec_prev(&newhrec);
		ecode = ntdb_read_convert(ntdb, newtoff, &newtrec,
					  sizeof(newtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Get the old head. */
		oldhoff = ntdb_read_off(ntdb, b_off);
		if (NTDB_OFF_IS_ERR(oldhoff)) {
			ecode = NTDB_OFF_TO_ERR(oldhoff);
			goto unlock_err;
		}

		/* This could happen if they all coalesced away. */
		if (oldhoff == off)
			goto out;

		ecode = ntdb_read_convert(ntdb, oldhoff, &oldhrec,
					  sizeof(oldhrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Get the old tail. */
		oldtoff = frec_prev(&oldhrec);
		ecode = ntdb_read_convert(ntdb, oldtoff, &oldtrec,
					  sizeof(oldtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* Old tail's next points to old head. */
		oldtrec.next = oldhoff;

		/* Old head's prev points to old tail. */
		oldhrec.magic_and_prev
			= (NTDB_FREE_MAGIC << (64 - NTDB_OFF_UPPER_STEAL))
			| oldtoff;

		/* New tail's next is 0. */
		newtrec.next = 0;

		/* Write out the modified versions. */
		ecode = ntdb_write_convert(ntdb, oldtoff, &oldtrec,
					   sizeof(oldtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		ecode = ntdb_write_convert(ntdb, oldhoff, &oldhrec,
					   sizeof(oldhrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		ecode = ntdb_write_convert(ntdb, newtoff, &newtrec,
					   sizeof(newtrec));
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;

		/* And finally link in new head. */
		ecode = ntdb_write_off(ntdb, b_off, off);
		if (ecode != NTDB_SUCCESS)
			goto unlock_err;
	}
out:
	ntdb_unlock_free_bucket(ntdb, b_off);
	return NTDB_SUCCESS;

unlock_err:
	ntdb_unlock_free_bucket(ntdb, b_off);
	return ecode;
}
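
/* Illustrative note (not part of the original source): the rotation above
 * makes the record where the scan stopped the new head, pushing the
 * just-examined records to the tail.  The next coalesce pass over this
 * bucket therefore resumes where this one left off instead of rescanning
 * the same records. */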

/* List must not be locked if coalesce_ok is set. */
enum NTDB_ERROR add_free_record(struct ntdb_context *ntdb,
				ntdb_off_t off, ntdb_len_t len_with_header,
				enum ntdb_lock_flags waitflag,
				bool coalesce_ok)
{
	ntdb_off_t b_off;
	ntdb_len_t len;
	enum NTDB_ERROR ecode;

	assert(len_with_header >= sizeof(struct ntdb_free_record));

	len = len_with_header - sizeof(struct ntdb_used_record);

	b_off = bucket_off(ntdb->ftable_off, size_to_bucket(len));
	ecode = ntdb_lock_free_bucket(ntdb, b_off, waitflag);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	ecode = enqueue_in_free(ntdb, b_off, off, len, &coalesce_ok);
	check_list(ntdb, b_off);

	/* Coalescing unlocks free list. */
	if (!ecode && coalesce_ok)
		ecode = coalesce_list(ntdb, ntdb->ftable_off, b_off, 2);
	else
		ntdb_unlock_free_bucket(ntdb, b_off);
	return ecode;
}
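
/* Illustrative note (not part of the original source): len_with_header
 * includes the struct ntdb_used_record header, yet only that header's
 * size is subtracted to get the bucketed length: the larger free-record
 * header simply overlays the start of the old key/data area, which is
 * why the assert demands at least sizeof(struct ntdb_free_record). */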

static size_t adjust_size(size_t keylen, size_t datalen)
{
	size_t size = keylen + datalen;

	if (size < NTDB_MIN_DATA_LEN)
		size = NTDB_MIN_DATA_LEN;

	/* Round to next uint64_t boundary. */
	return (size + (sizeof(uint64_t) - 1ULL)) & ~(sizeof(uint64_t) - 1ULL);
}
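
/* Illustrative example (not part of the original source):
 * adjust_size(3, 10) yields 16, since 13 is rounded up to the next
 * 8-byte boundary; the mask expression is the usual power-of-two
 * round-up, equivalent to (size + 7) & ~7. */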

/* If we have enough left over to be useful, split that off. */
static size_t record_leftover(size_t keylen, size_t datalen,
			      bool want_extra, size_t total_len)
{
	ssize_t leftover;

	if (want_extra)
		datalen += datalen / 2;
	leftover = total_len - adjust_size(keylen, datalen);

	if (leftover < (ssize_t)sizeof(struct ntdb_free_record))
		return 0;

	return leftover;
}
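
/* Illustrative note (not part of the original source): the split is
 * suppressed whenever the tail would be smaller than struct
 * ntdb_free_record, because such a fragment could not carry a
 * free-record header and so could never be linked into a bucket. */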

/* We need size bytes to put our key and data in. */
static ntdb_off_t lock_and_alloc(struct ntdb_context *ntdb,
				 ntdb_off_t ftable_off,
				 ntdb_off_t bucket,
				 size_t keylen, size_t datalen,
				 bool want_extra,
				 unsigned magic)
{
	ntdb_off_t off, b_off, best_off;
	struct ntdb_free_record best = { 0 };
	double multiplier;
	size_t size = adjust_size(keylen, datalen);
	enum NTDB_ERROR ecode;

	ntdb->stats.allocs++;
	b_off = bucket_off(ftable_off, bucket);

	/* FIXME: Try non-blocking wait first, to measure contention. */
	/* Lock this bucket. */
	ecode = ntdb_lock_free_bucket(ntdb, b_off, NTDB_LOCK_WAIT);
	if (ecode != NTDB_SUCCESS) {
		return NTDB_ERR_TO_OFF(ecode);
	}

	best.ftable_and_len = -1ULL;
	best_off = 0;

	/* Get slack if we're after extra. */
	if (want_extra)
		multiplier = 1.5;
	else
		multiplier = 1.0;

	/* Walk the list to see if any are large enough, getting less fussy
	 * as we go. */
	off = ntdb_read_off(ntdb, b_off);
	if (NTDB_OFF_IS_ERR(off)) {
		ecode = NTDB_OFF_TO_ERR(off);
		goto unlock_err;
	}
	off &= NTDB_OFF_MASK;

	while (off) {
		const struct ntdb_free_record *r;
		ntdb_off_t next;

		r = ntdb_access_read(ntdb, off, sizeof(*r), true);
		if (NTDB_PTR_IS_ERR(r)) {
			ecode = NTDB_PTR_ERR(r);
			goto unlock_err;
		}

		if (frec_magic(r) != NTDB_FREE_MAGIC) {
			ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
					    "lock_and_alloc:"
					    " %llu non-free 0x%llx",
					    (long long)off,
					    (long long)r->magic_and_prev);
			ntdb_access_release(ntdb, r);
			goto unlock_err;
		}

		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
			best_off = off;
			best = *r;
		}

		if (frec_len(&best) <= size * multiplier && best_off) {
			ntdb_access_release(ntdb, r);
			break;
		}

		multiplier *= 1.01;

		next = r->next;
		ntdb_access_release(ntdb, r);
		off = next;
	}

	/* If we found anything at all, use it. */
	if (best_off) {
		struct ntdb_used_record rec;
		size_t leftover;

		/* We're happy with this size: take it. */
		ecode = remove_from_list(ntdb, b_off, best_off, &best);
		check_list(ntdb, b_off);
		if (ecode != NTDB_SUCCESS) {
			goto unlock_err;
		}

		leftover = record_leftover(keylen, datalen, want_extra,
					   frec_len(&best));

		assert(keylen + datalen + leftover <= frec_len(&best));
		/* We need to mark non-free before we drop lock, otherwise
		 * coalesce() could try to merge it! */
		ecode = set_header(ntdb, &rec, magic, keylen, datalen,
				   frec_len(&best) - leftover);
		if (ecode != NTDB_SUCCESS) {
			goto unlock_err;
		}

		ecode = ntdb_write_convert(ntdb, best_off, &rec, sizeof(rec));
		if (ecode != NTDB_SUCCESS) {
			goto unlock_err;
		}

		/* For futureproofing, we put a 0 in any unused space. */
		if (rec_extra_padding(&rec)) {
			ecode = ntdb->io->twrite(ntdb, best_off + sizeof(rec)
						 + keylen + datalen, "", 1);
			if (ecode != NTDB_SUCCESS) {
				goto unlock_err;
			}
		}

		/* Bucket of leftover will be <= current bucket, so nested
		 * locking is allowed. */
		if (leftover) {
			ntdb->stats.alloc_leftover++;
			ecode = add_free_record(ntdb,
						best_off + sizeof(rec)
						+ frec_len(&best) - leftover,
						leftover, NTDB_LOCK_WAIT, false);
			if (ecode != NTDB_SUCCESS) {
				best_off = NTDB_ERR_TO_OFF(ecode);
			}
		}
		ntdb_unlock_free_bucket(ntdb, b_off);

		return best_off;
	}

	ntdb_unlock_free_bucket(ntdb, b_off);
	return 0;

unlock_err:
	ntdb_unlock_free_bucket(ntdb, b_off);
	return NTDB_ERR_TO_OFF(ecode);
}
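
/* Illustrative note (not part of the original source): the walk above is
 * a bounded best-fit search.  multiplier starts at 1.0 (1.5 when extra
 * space is wanted) and grows by 1% per record examined, so the longer
 * the list, the worse a fit the search is prepared to accept rather
 * than scan forever. */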

/* Get a free block from current free list, or 0 if none, -ve on error. */
static ntdb_off_t get_free(struct ntdb_context *ntdb,
			   size_t keylen, size_t datalen, bool want_extra,
			   unsigned magic)
{
	ntdb_off_t off, ftable_off;
	ntdb_off_t start_b, b, ftable;
	bool wrapped = false;

	/* If they are growing, add 50% to get to higher bucket. */
	if (want_extra)
		start_b = size_to_bucket(adjust_size(keylen,
						     datalen + datalen / 2));
	else
		start_b = size_to_bucket(adjust_size(keylen, datalen));

	ftable_off = ntdb->ftable_off;
	ftable = ntdb->ftable;
	while (!wrapped || ftable_off != ntdb->ftable_off) {
		/* Start at exact size bucket, and search up... */
		for (b = find_free_head(ntdb, ftable_off, start_b);
		     b < NTDB_FREE_BUCKETS;
		     b = find_free_head(ntdb, ftable_off, b + 1)) {
			/* Try getting one from list. */
			off = lock_and_alloc(ntdb, ftable_off,
					     b, keylen, datalen, want_extra,
					     magic);
			if (NTDB_OFF_IS_ERR(off))
				return off;
			if (off != 0) {
				if (b == start_b)
					ntdb->stats.alloc_bucket_exact++;
				if (b == NTDB_FREE_BUCKETS - 1)
					ntdb->stats.alloc_bucket_max++;
				/* Worked?  Stay using this list. */
				ntdb->ftable_off = ftable_off;
				ntdb->ftable = ftable;
				return off;
			}
			/* Didn't work.  Try next bucket. */
		}

		if (NTDB_OFF_IS_ERR(b)) {
			return b;
		}

		/* Hmm, try next table. */
		ftable_off = next_ftable(ntdb, ftable_off);
		if (NTDB_OFF_IS_ERR(ftable_off)) {
			return ftable_off;
		}
		ftable++;

		if (ftable_off == 0) {
			wrapped = true;
			ftable_off = first_ftable(ntdb);
			if (NTDB_OFF_IS_ERR(ftable_off)) {
				return ftable_off;
			}
			ftable = 0;
		}
	}

	return 0;
}
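
/* Illustrative note (not part of the original source): the search order
 * is buckets-then-tables.  Within one free table it tries the exact-fit
 * bucket and then every larger one, then moves to the next free table,
 * wrapping around once to the starting table before returning 0 so the
 * caller knows it must expand the file. */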

enum NTDB_ERROR set_header(struct ntdb_context *ntdb,
			   struct ntdb_used_record *rec,
			   unsigned magic, uint64_t keylen, uint64_t datalen,
			   uint64_t actuallen)
{
	uint64_t keybits = (fls64(keylen) + 1) / 2;

	rec->magic_and_meta = ((actuallen - (keylen + datalen)) << 11)
		| (keybits << 43)
		| ((uint64_t)magic << 48);
	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));

	/* Encoding can fail on big values. */
	if (rec_key_length(rec) != keylen
	    || rec_data_length(rec) != datalen
	    || rec_extra_padding(rec) != actuallen - (keylen + datalen)) {
		return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
				   "Could not encode k=%llu,d=%llu,a=%llu",
				   (long long)keylen, (long long)datalen,
				   (long long)actuallen);
	}
	return NTDB_SUCCESS;
}
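
/* Illustrative example (not part of the original source): keylen and
 * datalen share one 64-bit word.  For a 5-byte key, fls64(5) == 3, so
 * keybits == 2 and the data length is shifted left by keybits * 2 == 4
 * bits, leaving the low 4 bits for the key length.  The read-back
 * comparison catches combinations too large for this packing. */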

/* You need 'size', this tells you how much you should expand by. */
ntdb_off_t ntdb_expand_adjust(ntdb_off_t map_size, ntdb_off_t size)
{
	ntdb_off_t new_size, top_size;

	/* limit size in order to avoid using up huge amounts of memory for
	 * in memory tdbs if an oddball huge record creeps in */
	if (size > 100 * 1024) {
		top_size = map_size + size * 2;
	} else {
		top_size = map_size + size * 100;
	}

	/* always make room for at least top_size more records, and at
	   least 25% more space if the DB is smaller than 100MiB;
	   otherwise grow it by 10% only. */
	if (map_size > 100 * 1024 * 1024) {
		new_size = map_size * 1.10;
	} else {
		new_size = map_size * 1.25;
	}

	if (new_size < top_size)
		new_size = top_size;

	/* We always make the file a multiple of transaction page
	 * size.  This guarantees that the transaction recovery area
	 * is always aligned, otherwise the transaction code can overwrite
	 * itself. */
	new_size = (new_size + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
	return new_size - map_size;
}
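
/* Illustrative example (not part of the original source): for a 10MiB
 * map and a 1KiB request, top_size = 10MiB + 100KiB while the 25% rule
 * gives new_size = 12.5MiB; the larger wins, so the file grows by
 * 2.5MiB, rounded up to a multiple of NTDB_PGSIZE. */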

/* Expand the database. */
static enum NTDB_ERROR ntdb_expand(struct ntdb_context *ntdb, ntdb_len_t size)
{
	uint64_t old_size;
	ntdb_len_t wanted;
	enum NTDB_ERROR ecode;

	/* Need to hold a hash lock to expand DB: transactions rely on it. */
	if (!(ntdb->flags & NTDB_NOLOCK)
	    && !ntdb->file->allrecord_lock.count && !ntdb_has_hash_locks(ntdb)) {
		return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
				   "ntdb_expand: must hold lock during expand");
	}

	/* Only one person can expand file at a time. */
	ecode = ntdb_lock_expand(ntdb, F_WRLCK);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	/* Someone else may have expanded the file, so retry. */
	old_size = ntdb->file->map_size;
	ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
	if (ntdb->file->map_size != old_size) {
		ntdb_unlock_expand(ntdb, F_WRLCK);
		return NTDB_SUCCESS;
	}

	/* We need room for the record header too. */
	size = adjust_size(0, sizeof(struct ntdb_used_record) + size);

	wanted = ntdb_expand_adjust(old_size, size);

	ecode = ntdb->io->expand_file(ntdb, wanted);
	if (ecode != NTDB_SUCCESS) {
		ntdb_unlock_expand(ntdb, F_WRLCK);
		return ecode;
	}

	/* We need to drop this lock before adding free record. */
	ntdb_unlock_expand(ntdb, F_WRLCK);

	ntdb->stats.expands++;
	return add_free_record(ntdb, old_size, wanted, NTDB_LOCK_WAIT, true);
}
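
/* Illustrative note (not part of the original source): everything past
 * old_size is fresh space, so the entire extension is handed to
 * add_free_record() as one big free record, with coalesce_ok set so the
 * enqueue may immediately trigger a coalesce pass over that bucket. */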

/* This won't fail: it will expand the database if it has to. */
ntdb_off_t alloc(struct ntdb_context *ntdb, size_t keylen, size_t datalen,
		 unsigned magic, bool growing)
{
	ntdb_off_t off;

	for (;;) {
		enum NTDB_ERROR ecode;
		off = get_free(ntdb, keylen, datalen, growing, magic);
		if (likely(off != 0))
			break;

		ecode = ntdb_expand(ntdb, adjust_size(keylen, datalen));
		if (ecode != NTDB_SUCCESS) {
			return NTDB_ERR_TO_OFF(ecode);
		}
	}

	return off;
}