s3: don't replace the error message if already defined
[Samba/id10ts.git] / lib / tdb2 / tdb1_freelist.c
blobaf0129372173d3b12f048f4960c66cdfb14d844b
1 /*
2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
12 ** under the LGPL
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
28 #include "tdb1_private.h"
30 /* read a freelist record and check for simple errors */
31 int tdb1_rec_free_read(struct tdb_context *tdb, tdb1_off_t off, struct tdb1_record *rec)
33 if (tdb->tdb1.io->tdb1_read(tdb, off, rec, sizeof(*rec),TDB1_DOCONV()) == -1)
34 return -1;
36 if (rec->magic == TDB1_MAGIC) {
37 /* this happens when a app is showdown while deleting a record - we should
38 not completely fail when this happens */
39 tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_WARNING,
40 "tdb1_rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
41 rec->magic, off);
42 rec->magic = TDB1_FREE_MAGIC;
43 if (tdb->tdb1.io->tdb1_write(tdb, off, rec, sizeof(*rec)) == -1)
44 return -1;
47 if (rec->magic != TDB1_FREE_MAGIC) {
48 tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
49 "tdb1_rec_free_read bad magic 0x%x at offset=%d\n",
50 rec->magic, off);
51 return -1;
53 if (tdb->tdb1.io->tdb1_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
54 return -1;
55 return 0;
59 /* update a record tailer (must hold allocation lock) */
60 static int update_tailer(struct tdb_context *tdb, tdb1_off_t offset,
61 const struct tdb1_record *rec)
63 tdb1_off_t totalsize;
65 /* Offset of tailer from record header */
66 totalsize = sizeof(*rec) + rec->rec_len;
67 return tdb1_ofs_write(tdb, offset + totalsize - sizeof(tdb1_off_t),
68 &totalsize);
71 /* Add an element into the freelist. Merge adjacent records if
72 necessary. */
73 int tdb1_free(struct tdb_context *tdb, tdb1_off_t offset, struct tdb1_record *rec)
75 /* Allocation and tailer lock */
76 if (tdb1_lock(tdb, -1, F_WRLCK) != 0)
77 return -1;
79 /* set an initial tailer, so if we fail we don't leave a bogus record */
80 if (update_tailer(tdb, offset, rec) != 0) {
81 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
82 "tdb_free: update_tailer failed!\n");
83 goto fail;
86 tdb->stats.alloc_coalesce_tried++;
87 /* Look left */
88 if (offset - sizeof(tdb1_off_t) > TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
89 tdb1_off_t left = offset - sizeof(tdb1_off_t);
90 struct tdb1_record l;
91 tdb1_off_t leftsize;
93 /* Read in tailer and jump back to header */
94 if (tdb1_ofs_read(tdb, left, &leftsize) == -1) {
95 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
96 "tdb1_free: left offset read failed at %u", left);
97 goto update;
100 /* it could be uninitialised data */
101 if (leftsize == 0 || leftsize == TDB1_PAD_U32) {
102 goto update;
105 left = offset - leftsize;
107 if (leftsize > offset ||
108 left < TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
109 goto update;
112 /* Now read in the left record */
113 if (tdb->tdb1.io->tdb1_read(tdb, left, &l, sizeof(l), TDB1_DOCONV()) == -1) {
114 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
115 "tdb1_free: left read failed at %u (%u)", left, leftsize);
116 goto update;
119 /* If it's free, expand to include it. */
120 if (l.magic == TDB1_FREE_MAGIC) {
121 /* we now merge the new record into the left record, rather than the other
122 way around. This makes the operation O(1) instead of O(n). This change
123 prevents traverse from being O(n^2) after a lot of deletes */
124 l.rec_len += sizeof(*rec) + rec->rec_len;
125 if (tdb1_rec_write(tdb, left, &l) == -1) {
126 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
127 "tdb1_free: update_left failed at %u", left);
128 goto fail;
130 if (update_tailer(tdb, left, &l) == -1) {
131 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
132 "tdb1_free: update_tailer failed at %u", offset);
133 goto fail;
135 tdb->stats.alloc_coalesce_succeeded++;
136 tdb->stats.alloc_coalesce_num_merged++;
137 tdb->stats.frees++;
138 tdb1_unlock(tdb, -1, F_WRLCK);
139 return 0;
143 update:
145 /* Now, prepend to free list */
146 rec->magic = TDB1_FREE_MAGIC;
148 if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec->next) == -1 ||
149 tdb1_rec_write(tdb, offset, rec) == -1 ||
150 tdb1_ofs_write(tdb, TDB1_FREELIST_TOP, &offset) == -1) {
151 tdb_logerr(tdb, tdb->last_error, TDB_LOG_ERROR,
152 "tdb1_free record write failed at offset=%d",
153 offset);
154 goto fail;
157 /* And we're done. */
158 tdb->stats.frees++;
159 tdb1_unlock(tdb, -1, F_WRLCK);
160 return 0;
162 fail:
163 tdb1_unlock(tdb, -1, F_WRLCK);
164 return -1;
170 the core of tdb1_allocate - called when we have decided which
171 free list entry to use
173 Note that we try to allocate by grabbing data from the end of an existing record,
174 not the beginning. This is so the left merge in a free is more likely to be
175 able to free up the record without fragmentation
177 static tdb1_off_t tdb1_allocate_ofs(struct tdb_context *tdb,
178 tdb1_len_t length, tdb1_off_t rec_ptr,
179 struct tdb1_record *rec, tdb1_off_t last_ptr)
181 #define MIN_REC_SIZE (sizeof(struct tdb1_record) + sizeof(tdb1_off_t) + 8)
183 if (rec->rec_len < length + MIN_REC_SIZE) {
184 /* we have to grab the whole record */
186 /* unlink it from the previous record */
187 if (tdb1_ofs_write(tdb, last_ptr, &rec->next) == -1) {
188 return 0;
191 /* mark it not free */
192 rec->magic = TDB1_MAGIC;
193 if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
194 return 0;
196 tdb->stats.allocs++;
197 return rec_ptr;
200 /* we're going to just shorten the existing record */
201 rec->rec_len -= (length + sizeof(*rec));
202 if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
203 return 0;
205 if (update_tailer(tdb, rec_ptr, rec) == -1) {
206 return 0;
209 /* and setup the new record */
210 rec_ptr += sizeof(*rec) + rec->rec_len;
212 memset(rec, '\0', sizeof(*rec));
213 rec->rec_len = length;
214 rec->magic = TDB1_MAGIC;
216 if (tdb1_rec_write(tdb, rec_ptr, rec) == -1) {
217 return 0;
220 if (update_tailer(tdb, rec_ptr, rec) == -1) {
221 return 0;
224 tdb->stats.allocs++;
225 tdb->stats.alloc_leftover++;
226 return rec_ptr;
229 /* allocate some space from the free list. The offset returned points
230 to a unconnected tdb1_record within the database with room for at
231 least length bytes of total data
233 0 is returned if the space could not be allocated
235 tdb1_off_t tdb1_allocate(struct tdb_context *tdb, tdb1_len_t length, struct tdb1_record *rec)
237 tdb1_off_t rec_ptr, last_ptr, newrec_ptr;
238 struct {
239 tdb1_off_t rec_ptr, last_ptr;
240 tdb1_len_t rec_len;
241 } bestfit;
242 float multiplier = 1.0;
244 if (tdb1_lock(tdb, -1, F_WRLCK) == -1)
245 return 0;
247 /* over-allocate to reduce fragmentation */
248 length *= 1.25;
250 /* Extra bytes required for tailer */
251 length += sizeof(tdb1_off_t);
252 length = TDB1_ALIGN(length, TDB1_ALIGNMENT);
254 again:
255 last_ptr = TDB1_FREELIST_TOP;
257 /* read in the freelist top */
258 if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP, &rec_ptr) == -1)
259 goto fail;
261 bestfit.rec_ptr = 0;
262 bestfit.last_ptr = 0;
263 bestfit.rec_len = 0;
266 this is a best fit allocation strategy. Originally we used
267 a first fit strategy, but it suffered from massive fragmentation
268 issues when faced with a slowly increasing record size.
270 while (rec_ptr) {
271 if (tdb1_rec_free_read(tdb, rec_ptr, rec) == -1) {
272 goto fail;
275 if (rec->rec_len >= length) {
276 if (bestfit.rec_ptr == 0 ||
277 rec->rec_len < bestfit.rec_len) {
278 bestfit.rec_len = rec->rec_len;
279 bestfit.rec_ptr = rec_ptr;
280 bestfit.last_ptr = last_ptr;
284 /* move to the next record */
285 last_ptr = rec_ptr;
286 rec_ptr = rec->next;
288 /* if we've found a record that is big enough, then
289 stop searching if its also not too big. The
290 definition of 'too big' changes as we scan
291 through */
292 if (bestfit.rec_len > 0 &&
293 bestfit.rec_len < length * multiplier) {
294 break;
297 /* this multiplier means we only extremely rarely
298 search more than 50 or so records. At 50 records we
299 accept records up to 11 times larger than what we
300 want */
301 multiplier *= 1.05;
304 if (bestfit.rec_ptr != 0) {
305 if (tdb1_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
306 goto fail;
309 newrec_ptr = tdb1_allocate_ofs(tdb, length, bestfit.rec_ptr,
310 rec, bestfit.last_ptr);
311 tdb1_unlock(tdb, -1, F_WRLCK);
312 return newrec_ptr;
315 /* we didn't find enough space. See if we can expand the
316 database and if we can then try again */
317 if (tdb1_expand(tdb, length + sizeof(*rec)) == 0)
318 goto again;
319 fail:
320 tdb1_unlock(tdb, -1, F_WRLCK);
321 return 0;