2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
30 /* read a freelist record and check for simple errors */
31 int tdb1_rec_free_read(struct tdb_context
*tdb
, tdb1_off_t off
, struct tdb1_record
*rec
)
33 if (tdb
->tdb1
.io
->tdb1_read(tdb
, off
, rec
, sizeof(*rec
),TDB1_DOCONV()) == -1)
36 if (rec
->magic
== TDB1_MAGIC
) {
37 /* this happens when a app is showdown while deleting a record - we should
38 not completely fail when this happens */
39 tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_WARNING
,
40 "tdb1_rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
42 rec
->magic
= TDB1_FREE_MAGIC
;
43 if (tdb
->tdb1
.io
->tdb1_write(tdb
, off
, rec
, sizeof(*rec
)) == -1)
47 if (rec
->magic
!= TDB1_FREE_MAGIC
) {
48 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
49 "tdb1_rec_free_read bad magic 0x%x at offset=%d\n",
53 if (tdb
->tdb1
.io
->tdb1_oob(tdb
, rec
->next
+sizeof(*rec
), 0) != 0)
59 /* update a record tailer (must hold allocation lock) */
60 static int update_tailer(struct tdb_context
*tdb
, tdb1_off_t offset
,
61 const struct tdb1_record
*rec
)
65 /* Offset of tailer from record header */
66 totalsize
= sizeof(*rec
) + rec
->rec_len
;
67 return tdb1_ofs_write(tdb
, offset
+ totalsize
- sizeof(tdb1_off_t
),
71 /* Add an element into the freelist. Merge adjacent records if
73 int tdb1_free(struct tdb_context
*tdb
, tdb1_off_t offset
, struct tdb1_record
*rec
)
75 /* Allocation and tailer lock */
76 if (tdb1_lock(tdb
, -1, F_WRLCK
) != 0)
79 /* set an initial tailer, so if we fail we don't leave a bogus record */
80 if (update_tailer(tdb
, offset
, rec
) != 0) {
81 tdb_logerr(tdb
, tdb
->last_error
, TDB_LOG_ERROR
,
82 "tdb_free: update_tailer failed!\n");
86 tdb
->stats
.alloc_coalesce_tried
++;
88 if (offset
- sizeof(tdb1_off_t
) > TDB1_DATA_START(tdb
->tdb1
.header
.hash_size
)) {
89 tdb1_off_t left
= offset
- sizeof(tdb1_off_t
);
93 /* Read in tailer and jump back to header */
94 if (tdb1_ofs_read(tdb
, left
, &leftsize
) == -1) {
95 tdb_logerr(tdb
, tdb
->last_error
, TDB_LOG_ERROR
,
96 "tdb1_free: left offset read failed at %u", left
);
100 /* it could be uninitialised data */
101 if (leftsize
== 0 || leftsize
== TDB1_PAD_U32
) {
105 left
= offset
- leftsize
;
107 if (leftsize
> offset
||
108 left
< TDB1_DATA_START(tdb
->tdb1
.header
.hash_size
)) {
112 /* Now read in the left record */
113 if (tdb
->tdb1
.io
->tdb1_read(tdb
, left
, &l
, sizeof(l
), TDB1_DOCONV()) == -1) {
114 tdb_logerr(tdb
, tdb
->last_error
, TDB_LOG_ERROR
,
115 "tdb1_free: left read failed at %u (%u)", left
, leftsize
);
119 /* If it's free, expand to include it. */
120 if (l
.magic
== TDB1_FREE_MAGIC
) {
121 /* we now merge the new record into the left record, rather than the other
122 way around. This makes the operation O(1) instead of O(n). This change
123 prevents traverse from being O(n^2) after a lot of deletes */
124 l
.rec_len
+= sizeof(*rec
) + rec
->rec_len
;
125 if (tdb1_rec_write(tdb
, left
, &l
) == -1) {
126 tdb_logerr(tdb
, tdb
->last_error
, TDB_LOG_ERROR
,
127 "tdb1_free: update_left failed at %u", left
);
130 if (update_tailer(tdb
, left
, &l
) == -1) {
131 tdb_logerr(tdb
, tdb
->last_error
, TDB_LOG_ERROR
,
132 "tdb1_free: update_tailer failed at %u", offset
);
135 tdb
->stats
.alloc_coalesce_succeeded
++;
136 tdb
->stats
.alloc_coalesce_num_merged
++;
138 tdb1_unlock(tdb
, -1, F_WRLCK
);
145 /* Now, prepend to free list */
146 rec
->magic
= TDB1_FREE_MAGIC
;
148 if (tdb1_ofs_read(tdb
, TDB1_FREELIST_TOP
, &rec
->next
) == -1 ||
149 tdb1_rec_write(tdb
, offset
, rec
) == -1 ||
150 tdb1_ofs_write(tdb
, TDB1_FREELIST_TOP
, &offset
) == -1) {
151 tdb_logerr(tdb
, tdb
->last_error
, TDB_LOG_ERROR
,
152 "tdb1_free record write failed at offset=%d",
157 /* And we're done. */
159 tdb1_unlock(tdb
, -1, F_WRLCK
);
163 tdb1_unlock(tdb
, -1, F_WRLCK
);
170 the core of tdb1_allocate - called when we have decided which
171 free list entry to use
173 Note that we try to allocate by grabbing data from the end of an existing record,
174 not the beginning. This is so the left merge in a free is more likely to be
175 able to free up the record without fragmentation
177 static tdb1_off_t
tdb1_allocate_ofs(struct tdb_context
*tdb
,
178 tdb1_len_t length
, tdb1_off_t rec_ptr
,
179 struct tdb1_record
*rec
, tdb1_off_t last_ptr
)
181 #define MIN_REC_SIZE (sizeof(struct tdb1_record) + sizeof(tdb1_off_t) + 8)
183 if (rec
->rec_len
< length
+ MIN_REC_SIZE
) {
184 /* we have to grab the whole record */
186 /* unlink it from the previous record */
187 if (tdb1_ofs_write(tdb
, last_ptr
, &rec
->next
) == -1) {
191 /* mark it not free */
192 rec
->magic
= TDB1_MAGIC
;
193 if (tdb1_rec_write(tdb
, rec_ptr
, rec
) == -1) {
200 /* we're going to just shorten the existing record */
201 rec
->rec_len
-= (length
+ sizeof(*rec
));
202 if (tdb1_rec_write(tdb
, rec_ptr
, rec
) == -1) {
205 if (update_tailer(tdb
, rec_ptr
, rec
) == -1) {
209 /* and setup the new record */
210 rec_ptr
+= sizeof(*rec
) + rec
->rec_len
;
212 memset(rec
, '\0', sizeof(*rec
));
213 rec
->rec_len
= length
;
214 rec
->magic
= TDB1_MAGIC
;
216 if (tdb1_rec_write(tdb
, rec_ptr
, rec
) == -1) {
220 if (update_tailer(tdb
, rec_ptr
, rec
) == -1) {
225 tdb
->stats
.alloc_leftover
++;
229 /* allocate some space from the free list. The offset returned points
230 to a unconnected tdb1_record within the database with room for at
231 least length bytes of total data
233 0 is returned if the space could not be allocated
235 tdb1_off_t
tdb1_allocate(struct tdb_context
*tdb
, tdb1_len_t length
, struct tdb1_record
*rec
)
237 tdb1_off_t rec_ptr
, last_ptr
, newrec_ptr
;
239 tdb1_off_t rec_ptr
, last_ptr
;
242 float multiplier
= 1.0;
244 if (tdb1_lock(tdb
, -1, F_WRLCK
) == -1)
247 /* over-allocate to reduce fragmentation */
250 /* Extra bytes required for tailer */
251 length
+= sizeof(tdb1_off_t
);
252 length
= TDB1_ALIGN(length
, TDB1_ALIGNMENT
);
255 last_ptr
= TDB1_FREELIST_TOP
;
257 /* read in the freelist top */
258 if (tdb1_ofs_read(tdb
, TDB1_FREELIST_TOP
, &rec_ptr
) == -1)
262 bestfit
.last_ptr
= 0;
266 this is a best fit allocation strategy. Originally we used
267 a first fit strategy, but it suffered from massive fragmentation
268 issues when faced with a slowly increasing record size.
271 if (tdb1_rec_free_read(tdb
, rec_ptr
, rec
) == -1) {
275 if (rec
->rec_len
>= length
) {
276 if (bestfit
.rec_ptr
== 0 ||
277 rec
->rec_len
< bestfit
.rec_len
) {
278 bestfit
.rec_len
= rec
->rec_len
;
279 bestfit
.rec_ptr
= rec_ptr
;
280 bestfit
.last_ptr
= last_ptr
;
284 /* move to the next record */
288 /* if we've found a record that is big enough, then
289 stop searching if its also not too big. The
290 definition of 'too big' changes as we scan
292 if (bestfit
.rec_len
> 0 &&
293 bestfit
.rec_len
< length
* multiplier
) {
297 /* this multiplier means we only extremely rarely
298 search more than 50 or so records. At 50 records we
299 accept records up to 11 times larger than what we
304 if (bestfit
.rec_ptr
!= 0) {
305 if (tdb1_rec_free_read(tdb
, bestfit
.rec_ptr
, rec
) == -1) {
309 newrec_ptr
= tdb1_allocate_ofs(tdb
, length
, bestfit
.rec_ptr
,
310 rec
, bestfit
.last_ptr
);
311 tdb1_unlock(tdb
, -1, F_WRLCK
);
315 /* we didn't find enough space. See if we can expand the
316 database and if we can then try again */
317 if (tdb1_expand(tdb
, length
+ sizeof(*rec
)) == 0)
320 tdb1_unlock(tdb
, -1, F_WRLCK
);