2 Unix SMB/Netbios implementation.
4 Samba database functions
5 Copyright (C) Andrew Tridgell 1999-2000
6 Copyright (C) Luke Kenneth Casson Leighton 2000
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
45 #define TDB_MAGIC_FOOD "TDB file\n"
46 #define TDB_VERSION (0x26011967 + 6)
47 #define TDB_MAGIC (0x26011999U)
48 #define TDB_FREE_MAGIC (~TDB_MAGIC)
49 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
50 #define TDB_ALIGNMENT 4
51 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
52 #define DEFAULT_HASH_SIZE 131
53 #define TDB_PAGE_SIZE 0x2000
54 #define FREELIST_TOP (sizeof(struct tdb_header))
55 #define TDB_ALIGN(x,a) (((x) + (a)) & ~((a)-1))
56 #define TDB_BYTEREV(x) (((x<<24)|(x&0xFF00)<<8)|((x>>8)&0xFF00)|(x>>24))
57 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
58 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
59 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
69 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
72 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
73 static TDB_CONTEXT
*tdbs
= NULL
;
75 static void *tdb_munmap(void *ptr
, tdb_len size
)
83 static void *tdb_mmap(tdb_len size
, int readonly
, int fd
)
87 ret
= mmap(NULL
, size
, PROT_READ
| (readonly
? 0 : PROT_WRITE
), MAP_SHARED
|MAP_FILE
, fd
, 0);
92 /* Endian conversion: we only ever deal with 4 byte quantities */
93 static void *convert(void *buf
, u32 size
)
96 for (i
= 0; i
< size
/ 4; i
++) p
[i
] = TDB_BYTEREV(p
[i
]);
99 #define DOCONV() (tdb->flags & TDB_CONVERT)
100 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
102 /* the body of the database is made of one list_struct for the free space
103 plus a separate data list for each hash value */
105 tdb_off next
; /* offset of the next record in the list */
106 tdb_len rec_len
; /* total byte length of record */
107 tdb_len key_len
; /* byte length of key */
108 tdb_len data_len
; /* byte length of data */
109 u32 full_hash
; /* the full 32 bit hash of the key */
110 u32 magic
; /* try to catch errors */
111 /* the following union is implied:
113 char record[rec_len];
118 u32 totalsize; (tailer)
122 /* a byte range locking function - return 0 on success
123 this functions locks/unlocks 1 byte at the specified offset */
124 static int tdb_brlock(TDB_CONTEXT
*tdb
, tdb_off offset
,
125 int rw_type
, int lck_type
)
129 if (tdb
->flags
& TDB_NOLOCK
) return 0;
130 if (tdb
->read_only
) return -1;
133 fl
.l_whence
= SEEK_SET
;
138 if (fcntl(tdb
->fd
,lck_type
,&fl
)) return TDB_ERRCODE(TDB_ERR_LOCK
, -1);
142 /* lock a list in the database. list -1 is the alloc list */
143 static int tdb_lock(TDB_CONTEXT
*tdb
, int list
, int ltype
)
145 if (list
< -1 || list
>= (int)tdb
->header
.hash_size
) return -1;
146 if (tdb
->flags
& TDB_NOLOCK
) return 0;
148 /* Since fcntl locks don't nest, we do a lock for the first one,
149 and simply bump the count for future ones */
150 if (tdb
->locked
[list
+1].count
== 0) {
151 if (tdb
->header
.rwlocks
) {
152 if (tdb_spinlock(tdb
, list
, ltype
)) return -1;
153 } else if (tdb_brlock(tdb
,FREELIST_TOP
+4*list
,ltype
,F_SETLKW
))
155 tdb
->locked
[list
+1].ltype
= ltype
;
157 tdb
->locked
[list
+1].count
++;
161 /* unlock the database: returns void because it's too late for errors. */
162 static void tdb_unlock(TDB_CONTEXT
*tdb
, int list
, int ltype
)
164 if (tdb
->flags
& TDB_NOLOCK
) return;
167 if (list
< -1 || list
>= (int)tdb
->header
.hash_size
) return;
168 if (tdb
->locked
[list
+1].count
==0) return;
170 if (tdb
->locked
[list
+1].count
== 1) {
171 /* Down to last nested lock: unlock underneath */
172 if (tdb
->header
.rwlocks
) tdb_spinunlock(tdb
, list
, ltype
);
173 else tdb_brlock(tdb
, FREELIST_TOP
+4*list
, F_UNLCK
, F_SETLKW
);
175 tdb
->locked
[list
+1].count
--;
178 /* This is based on the hash agorithm from gdbm */
179 static u32
tdb_hash(TDB_DATA
*key
)
181 u32 value
; /* Used to compute the hash value. */
182 u32 i
; /* Used to cycle through random values. */
184 /* Set the initial value from the key size. */
185 for (value
= 0x238F13AF * key
->dsize
, i
=0; i
< key
->dsize
; i
++)
186 value
= (value
+ (key
->dptr
[i
] << (i
*5 % 24)));
188 return (1103515243 * value
+ 12345);
191 /* check for an out of bounds access - if it is out of bounds then
192 see if the database has been expanded by someone else and expand
194 static int tdb_oob(TDB_CONTEXT
*tdb
, tdb_off offset
)
197 if (offset
<= tdb
->map_size
) return 0;
198 if (tdb
->flags
& TDB_INTERNAL
) return 0;
201 if (st
.st_size
<= (size_t)offset
) return TDB_ERRCODE(TDB_ERR_IO
, -1);
203 /* Unmap, update size, remap */
204 if (tdb
->map_ptr
) tdb
->map_ptr
=tdb_munmap(tdb
->map_ptr
, tdb
->map_size
);
205 tdb
->map_size
= st
.st_size
;
206 if (!(tdb
->flags
& TDB_NOMMAP
))
207 tdb
->map_ptr
= tdb_mmap(tdb
->map_size
, tdb
->read_only
,tdb
->fd
);
211 /* write a lump of data at a specified offset */
212 static int tdb_write(TDB_CONTEXT
*tdb
, tdb_off off
, void *buf
, tdb_len len
)
214 if (tdb_oob(tdb
, off
+ len
) != 0) return -1;
216 if (tdb
->map_ptr
) memcpy(off
+ (char *)tdb
->map_ptr
, buf
, len
);
217 else if (lseek(tdb
->fd
, off
, SEEK_SET
) != off
218 || write(tdb
->fd
, buf
, len
) != (ssize_t
)len
)
219 return TDB_ERRCODE(TDB_ERR_IO
, -1);
223 /* read a lump of data at a specified offset, maybe convert */
224 static int tdb_read(TDB_CONTEXT
*tdb
,tdb_off off
,void *buf
,tdb_len len
,int cv
)
226 if (tdb_oob(tdb
, off
+ len
) != 0) return -1;
228 if (tdb
->map_ptr
) memcpy(buf
, off
+ (char *)tdb
->map_ptr
, len
);
229 else if (lseek(tdb
->fd
, off
, SEEK_SET
) != off
230 || read(tdb
->fd
, buf
, len
) != (ssize_t
)len
)
231 return TDB_ERRCODE(TDB_ERR_IO
, -1);
232 if (cv
) convert(buf
, len
);
236 /* read a lump of data, allocating the space for it */
237 static char *tdb_alloc_read(TDB_CONTEXT
*tdb
, tdb_off offset
, tdb_len len
)
241 if (!(buf
= malloc(len
))) return TDB_ERRCODE(TDB_ERR_OOM
, buf
);
242 if (tdb_read(tdb
, offset
, buf
, len
, 0) == -1) {
249 /* read/write a tdb_off */
250 static int ofs_read(TDB_CONTEXT
*tdb
, tdb_off offset
, tdb_off
*d
)
252 return tdb_read(tdb
, offset
, (char*)d
, sizeof(*d
), DOCONV());
254 static int ofs_write(TDB_CONTEXT
*tdb
, tdb_off offset
, tdb_off
*d
)
257 return tdb_write(tdb
, offset
, CONVERT(off
), sizeof(*d
));
260 /* read/write a record */
261 static int rec_read(TDB_CONTEXT
*tdb
, tdb_off offset
, struct list_struct
*rec
)
263 if (tdb_read(tdb
, offset
, rec
, sizeof(*rec
),DOCONV()) == -1) return -1;
264 if (TDB_BAD_MAGIC(rec
)) return TDB_ERRCODE(TDB_ERR_CORRUPT
, -1);
265 return tdb_oob(tdb
, rec
->next
);
267 static int rec_write(TDB_CONTEXT
*tdb
, tdb_off offset
, struct list_struct
*rec
)
269 struct list_struct r
= *rec
;
270 return tdb_write(tdb
, offset
, CONVERT(r
), sizeof(r
));
273 /* read a freelist record and check for simple errors */
274 static int rec_free_read(TDB_CONTEXT
*tdb
, tdb_off off
, struct list_struct
*rec
)
276 if (tdb_read(tdb
, off
, rec
, sizeof(*rec
),DOCONV()) == -1) return -1;
277 if (rec
->magic
!= TDB_FREE_MAGIC
) {
279 printf("bad magic 0x%08x at offset %d\n",
282 return TDB_ERRCODE(TDB_ERR_CORRUPT
, -1);
284 if (tdb_oob(tdb
, rec
->next
) != 0) return -1;
288 /* update a record tailer (must hold allocation lock) */
289 static int update_tailer(TDB_CONTEXT
*tdb
, tdb_off offset
,
290 const struct list_struct
*rec
)
294 /* Offset of tailer from record header */
295 totalsize
= sizeof(*rec
) + rec
->rec_len
;
296 return ofs_write(tdb
, offset
+ totalsize
- sizeof(tdb_off
),
301 void tdb_printfreelist(TDB_CONTEXT
*tdb
)
303 tdb_off offset
, rec_ptr
, last_ptr
;
304 struct list_struct rec
, lastrec
, newrec
;
306 tdb_lock(tdb
, -1, F_WRLCK
);
309 offset
= FREELIST_TOP
;
311 /* read in the freelist top */
312 if (ofs_read(tdb
, offset
, &rec_ptr
) == -1) {
316 printf("freelist top=[0x%08x]\n", rec_ptr
);
318 if (tdb_read(tdb
, rec_ptr
, (char *)&rec
, sizeof(rec
), DOCONV()) == -1) {
322 if (rec
.magic
!= TDB_FREE_MAGIC
) {
323 printf("bad magic 0x%08x in free list\n", rec
.magic
);
327 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x]\n", rec
.next
, rec
.rec_len
);
329 /* move to the next record */
333 tdb_unlock(tdb
, -1, F_WRLCK
);
337 /* Remove an element from the freelist. Must have alloc lock. */
338 static int remove_from_freelist(TDB_CONTEXT
*tdb
, tdb_off off
, tdb_off next
)
342 /* read in the freelist top */
343 last_ptr
= FREELIST_TOP
;
344 while (ofs_read(tdb
, last_ptr
, &i
) != -1 && i
!= 0) {
346 /* We've found it! */
347 return ofs_write(tdb
, last_ptr
, &next
);
349 /* Follow chain (next offset is at start of record) */
352 return TDB_ERRCODE(TDB_ERR_CORRUPT
, -1);
355 /* Add an element into the freelist. Merge adjacent records if
357 static int tdb_free(TDB_CONTEXT
*tdb
, tdb_off offset
, struct list_struct
*rec
)
361 /* Allocation and tailer lock */
362 if (tdb_lock(tdb
, -1, F_WRLCK
) != 0) return -1;
364 /* Look right first (I'm an Australian, dammit) */
365 right
= offset
+ sizeof(*rec
) + rec
->rec_len
;
366 if (tdb_oob(tdb
, right
+ sizeof(*rec
)) == 0) {
367 struct list_struct r
;
369 if (tdb_read(tdb
, right
, &r
, sizeof(r
), DOCONV()) == -1)
372 /* If it's free, expand to include it. */
373 if (r
.magic
== TDB_FREE_MAGIC
) {
374 if (remove_from_freelist(tdb
, right
, r
.next
) == -1)
376 rec
->rec_len
+= sizeof(r
) + r
.rec_len
;
382 if (left
> TDB_HASH_TOP(tdb
->header
.hash_size
-1)) {
383 struct list_struct l
;
386 /* Read in tailer and jump back to header */
387 if (ofs_read(tdb
, left
, &leftsize
) == -1) goto fail
;
388 left
= offset
- leftsize
;
390 /* Now read in record */
391 if (tdb_read(tdb
, left
, &l
, sizeof(l
), DOCONV()) == -1)
394 /* If it's free, expand to include it. */
395 if (l
.magic
== TDB_FREE_MAGIC
) {
396 if (remove_from_freelist(tdb
, left
, l
.next
) == -1)
399 rec
->rec_len
+= leftsize
;
402 if (update_tailer(tdb
, offset
, rec
) == -1) goto fail
;
404 /* Now, prepend to free list */
405 rec
->magic
= TDB_FREE_MAGIC
;
407 if (ofs_read(tdb
, FREELIST_TOP
, &rec
->next
) == -1) goto fail
;
408 if (rec_write(tdb
, offset
, rec
) == -1) goto fail
;
409 if (ofs_write(tdb
, FREELIST_TOP
, &offset
) == -1) goto fail
;
411 /* And we're done. */
412 tdb_unlock(tdb
, -1, F_WRLCK
);
416 tdb_unlock(tdb
, -1, F_WRLCK
);
420 /* expand the database at least size bytes by expanding the underlying
421 file and doing the mmap again if necessary */
422 static int tdb_expand(TDB_CONTEXT
*tdb
, tdb_off size
)
424 struct list_struct rec
;
428 if (tdb_lock(tdb
, -1, F_WRLCK
) == -1) return 0;
430 /* must know about any previous expansions by another process */
431 tdb_oob(tdb
, tdb
->map_size
+ 1);
433 /* always make room for at least 10 more records, and round
434 the database up to a multiple of TDB_PAGE_SIZE */
435 size
= TDB_ALIGN(tdb
->map_size
+ size
*10, TDB_PAGE_SIZE
) - tdb
->map_size
;
437 /* expand the file itself */
438 if (!(tdb
->flags
& TDB_INTERNAL
)) {
439 lseek(tdb
->fd
, tdb
->map_size
+ size
- 1, SEEK_SET
);
440 if (write(tdb
->fd
, &b
, 1) != 1) goto fail
;
443 if (!(tdb
->flags
& TDB_INTERNAL
) && tdb
->map_ptr
)
444 tdb
->map_ptr
= tdb_munmap(tdb
->map_ptr
, tdb
->map_size
);
446 tdb
->map_size
+= size
;
448 if (tdb
->flags
& TDB_INTERNAL
)
449 tdb
->map_ptr
= realloc(tdb
->map_ptr
, tdb
->map_size
);
451 /* form a new freelist record */
452 memset(&rec
,'\0',sizeof(rec
));
453 rec
.rec_len
= size
- sizeof(rec
);
455 /* link it into the free list */
456 offset
= tdb
->map_size
- size
;
457 if (tdb_free(tdb
, offset
, &rec
) == -1) goto fail
;
459 if (!(tdb
->flags
& TDB_NOMMAP
))
460 tdb
->map_ptr
= tdb_mmap(tdb
->map_size
, 0, tdb
->fd
);
462 tdb_unlock(tdb
, -1, F_WRLCK
);
465 tdb_unlock(tdb
, -1, F_WRLCK
);
469 /* allocate some space from the free list. The offset returned points
470 to a unconnected list_struct within the database with room for at
471 least length bytes of total data
473 0 is returned if the space could not be allocated
475 static tdb_off
tdb_allocate(TDB_CONTEXT
*tdb
, tdb_len length
)
477 tdb_off rec_ptr
, last_ptr
;
478 struct list_struct rec
, newrec
;
480 if (tdb_lock(tdb
, -1, F_WRLCK
) == -1) return 0;
482 /* Extra bytes required for tailer */
483 length
+= sizeof(tdb_off
);
486 last_ptr
= FREELIST_TOP
;
488 /* read in the freelist top */
489 if (ofs_read(tdb
, FREELIST_TOP
, &rec_ptr
) == -1) goto fail
;
491 /* keep looking until we find a freelist record big enough */
493 if (rec_free_read(tdb
, rec_ptr
, &rec
) == -1) goto fail
;
495 if (rec
.rec_len
>= length
) {
496 /* found it - now possibly split it up */
497 if (rec
.rec_len
> length
+ MIN_REC_SIZE
) {
498 length
= TDB_ALIGN(length
, TDB_ALIGNMENT
);
500 newrec
.rec_len
= rec
.rec_len
501 - (sizeof(rec
) + length
);
502 newrec
.next
= rec
.next
;
503 newrec
.magic
= TDB_FREE_MAGIC
;
505 rec
.rec_len
= length
;
506 rec
.next
= rec_ptr
+ sizeof(rec
) + rec
.rec_len
;
508 /* Update split-off record */
509 if (rec_write(tdb
, rec
.next
, &newrec
) == -1
510 || update_tailer(tdb
,rec
.next
,&newrec
)==-1)
512 /* Update record we're about to allocate */
513 if (rec_write(tdb
, rec_ptr
, &rec
) == -1
514 || update_tailer(tdb
, rec_ptr
, &rec
)==-1)
518 /* remove it from the list */
519 if (ofs_write(tdb
, last_ptr
, &rec
.next
) == -1)
522 /* all done - return the new record offset */
523 tdb_unlock(tdb
, -1, F_WRLCK
);
526 /* move to the next record */
530 /* we didn't find enough space. See if we can expand the
531 database and if we can then try again */
532 if (tdb_expand(tdb
, length
+ sizeof(rec
)) == 0) goto again
;
534 tdb_unlock(tdb
, -1, F_WRLCK
);
538 /* initialise a new database with a specified hash size */
539 static int tdb_new_database(TDB_CONTEXT
*tdb
, int hash_size
)
541 struct tdb_header
*newdb
;
544 /* We make it up in memory, then write it out if not internal */
545 size
= sizeof(struct tdb_header
) + (hash_size
+1)*sizeof(tdb_off
);
546 if (!(newdb
= calloc(size
, 1))) return TDB_ERRCODE(TDB_ERR_OOM
, -1);
548 /* Fill in the header */
549 newdb
->version
= TDB_VERSION
;
550 newdb
->hash_size
= hash_size
;
552 newdb
->rwlocks
= size
;
554 if (tdb
->flags
& TDB_INTERNAL
) {
555 tdb
->map_size
= size
;
556 tdb
->map_ptr
= (char *)newdb
;
557 memcpy(&tdb
->header
, newdb
, sizeof(tdb
->header
));
558 /* Convert the `ondisk' version if asked. */
562 lseek(tdb
->fd
, 0, SEEK_SET
);
563 ftruncate(tdb
->fd
, 0);
564 /* This creates an endian-converted header, as if read from disk */
566 memcpy(&tdb
->header
, newdb
, sizeof(tdb
->header
));
567 /* Don't endian-convert the magic food! */
568 memcpy(newdb
->magic_food
, TDB_MAGIC_FOOD
, strlen(TDB_MAGIC_FOOD
)+1);
569 if (write(tdb
->fd
, newdb
, size
) != size
) ret
= -1;
570 else ret
= tdb_create_rwlocks(tdb
->fd
, hash_size
);
576 /* Returns 0 on fail. On success, return offset of record, and fills
578 static tdb_off
tdb_find(TDB_CONTEXT
*tdb
, TDB_DATA key
, u32 hash
,
579 struct list_struct
*r
)
583 /* read in the hash top */
584 if (ofs_read(tdb
, TDB_HASH_TOP(hash
), &rec_ptr
) == -1) return 0;
586 /* keep looking until we find the right record */
588 if (rec_read(tdb
, rec_ptr
, r
) == -1) return 0;
590 if (!TDB_DEAD(r
) && hash
==r
->full_hash
&& key
.dsize
==r
->key_len
) {
592 /* a very likely hit - read the key */
593 k
= tdb_alloc_read(tdb
, rec_ptr
+ sizeof(*r
),
597 if (memcmp(key
.dptr
, k
, key
.dsize
) == 0) {
605 return TDB_ERRCODE(TDB_ERR_NOEXIST
, 0);
608 /* If they do lockkeys, check that this hash is one they locked */
609 static int tdb_keylocked(TDB_CONTEXT
*tdb
, u32 hash
)
612 if (!tdb
->lockedkeys
) return 1;
613 for (i
= 0; i
< tdb
->lockedkeys
[0]; i
++)
614 if (tdb
->lockedkeys
[i
+1] == hash
) return 1;
615 return TDB_ERRCODE(TDB_ERR_NOLOCK
, 0);
618 /* As tdb_find, but if you succeed, keep the lock */
619 static tdb_off
tdb_find_lock(TDB_CONTEXT
*tdb
, TDB_DATA key
, int locktype
,
620 struct list_struct
*rec
)
624 hash
= tdb_hash(&key
);
625 if (!tdb_keylocked(tdb
, hash
)) return 0;
626 if (tdb_lock(tdb
, BUCKET(hash
), locktype
) == -1) return 0;
627 if (!(rec_ptr
= tdb_find(tdb
, key
, hash
, rec
)))
628 tdb_unlock(tdb
, BUCKET(hash
), locktype
);
632 enum TDB_ERROR
tdb_error(TDB_CONTEXT
*tdb
)
637 static struct tdb_errname
{
638 enum TDB_ERROR ecode
; const char *estring
;
639 } emap
[] = { {TDB_SUCCESS
, "Success"},
640 {TDB_ERR_CORRUPT
, "Corrupt database"},
641 {TDB_ERR_IO
, "IO Error"},
642 {TDB_ERR_LOCK
, "Locking error"},
643 {TDB_ERR_OOM
, "Out of memory"},
644 {TDB_ERR_EXISTS
, "Record exists"},
645 {TDB_ERR_NOLOCK
, "Lock exists on other keys"},
646 {TDB_ERR_NOEXIST
, "Record does not exist"} };
648 /* Error string for the last tdb error */
649 const char *tdb_errorstr(TDB_CONTEXT
*tdb
)
652 for (i
= 0; i
< sizeof(emap
) / sizeof(struct tdb_errname
); i
++)
653 if (tdb
->ecode
== emap
[i
].ecode
) return emap
[i
].estring
;
654 return "Invalid error code";
657 /* update an entry in place - this only works if the new data size
658 is <= the old data size and the key exists.
661 static int tdb_update(TDB_CONTEXT
*tdb
, TDB_DATA key
, TDB_DATA dbuf
)
663 struct list_struct rec
;
668 if (!(rec_ptr
= tdb_find_lock(tdb
, key
, F_WRLCK
, &rec
))) return -1;
670 /* must be long enough key, data and tailer */
671 if (rec
.rec_len
< key
.dsize
+ dbuf
.dsize
+ sizeof(tdb_off
)) goto out
;
673 if (tdb_write(tdb
, rec_ptr
+ sizeof(rec
) + rec
.key_len
,
674 dbuf
.dptr
, dbuf
.dsize
) == -1)
677 if (dbuf
.dsize
!= rec
.data_len
) {
679 rec
.data_len
= dbuf
.dsize
;
680 ret
= rec_write(tdb
, rec_ptr
, &rec
);
684 tdb_unlock(tdb
, BUCKET(rec
.full_hash
), F_WRLCK
);
688 /* find an entry in the database given a key */
689 TDB_DATA
tdb_fetch(TDB_CONTEXT
*tdb
, TDB_DATA key
)
692 struct list_struct rec
;
695 /* find which hash bucket it is in */
696 if (!(rec_ptr
= tdb_find_lock(tdb
,key
,F_RDLCK
,&rec
))) return tdb_null
;
698 ret
.dptr
= tdb_alloc_read(tdb
, rec_ptr
+ sizeof(rec
) + rec
.key_len
,
700 ret
.dsize
= rec
.data_len
;
701 tdb_unlock(tdb
, BUCKET(rec
.full_hash
), F_RDLCK
);
705 /* check if an entry in the database exists
707 note that 1 is returned if the key is found and 0 is returned if not found
708 this doesn't match the conventions in the rest of this module, but is
711 int tdb_exists(TDB_CONTEXT
*tdb
, TDB_DATA key
)
713 struct list_struct rec
;
715 if (tdb_find_lock(tdb
, key
, F_RDLCK
, &rec
) == 0) return 0;
716 tdb_unlock(tdb
, BUCKET(rec
.full_hash
), F_RDLCK
);
720 /* record lock stops delete underneath */
721 static int lock_record(TDB_CONTEXT
*tdb
, tdb_off off
)
723 return off
? tdb_brlock(tdb
, off
, F_RDLCK
, F_SETLKW
) : 0;
725 /* write locks override our own fcntl readlocks, so check it here */
726 static int write_lock_record(TDB_CONTEXT
*tdb
, tdb_off off
)
728 struct tdb_traverse_lock
*i
;
729 for (i
= &tdb
->travlocks
; i
; i
= i
->next
) if (i
->off
== off
) return -1;
730 return tdb_brlock(tdb
, off
, F_WRLCK
, F_SETLK
);
732 static int write_unlock_record(TDB_CONTEXT
*tdb
, tdb_off off
)
734 return tdb_brlock(tdb
, off
, F_UNLCK
, F_SETLK
);
736 /* fcntl locks don't stack: avoid unlocking someone else's */
737 static int unlock_record(TDB_CONTEXT
*tdb
, tdb_off off
)
739 struct tdb_traverse_lock
*i
;
742 if (off
== 0) return 0;
743 for (i
= &tdb
->travlocks
; i
; i
= i
->next
) if (i
->off
== off
) count
++;
744 return (count
== 1 ? tdb_brlock(tdb
, off
, F_UNLCK
, F_SETLKW
) : 0);
747 /* actually delete an entry in the database given the offset */
748 static int do_delete(TDB_CONTEXT
*tdb
, tdb_off rec_ptr
, struct list_struct
*rec
)
751 struct list_struct lastrec
;
753 if (write_lock_record(tdb
, rec_ptr
) == -1) {
754 /* Someone traversing here: mark it as dead */
755 rec
->magic
= TDB_DEAD_MAGIC
;
756 return rec_write(tdb
, rec_ptr
, rec
);
758 write_unlock_record(tdb
, rec_ptr
);
760 /* find previous record in hash chain */
761 if (ofs_read(tdb
, TDB_HASH_TOP(rec
->full_hash
), &i
) == -1) return -1;
762 for (last_ptr
= 0; i
!= rec_ptr
; last_ptr
= i
, i
= lastrec
.next
)
763 if (rec_read(tdb
, i
, &lastrec
) == -1) return -1;
765 /* unlink it: next ptr is at start of record. */
766 if (last_ptr
== 0) last_ptr
= TDB_HASH_TOP(rec
->full_hash
);
767 if (ofs_write(tdb
, last_ptr
, &rec
->next
) == -1) return -1;
769 /* recover the space */
770 if (tdb_free(tdb
, rec_ptr
, rec
) == -1) return -1;
774 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
775 static int tdb_next_lock(TDB_CONTEXT
*tdb
, struct tdb_traverse_lock
*tlock
,
776 struct list_struct
*rec
)
779 int first
= (tlock
->off
== 0);
781 /* No traversal allows if you've called tdb_lockkeys() */
782 if (tdb
->lockedkeys
) return TDB_ERRCODE(TDB_ERR_NOLOCK
, -1);
784 /* Lock each chain from the start one. */
785 for (; tlock
->hash
< tdb
->header
.hash_size
; tlock
->hash
++) {
786 if (tdb_lock(tdb
, tlock
->hash
, F_WRLCK
) == -1) return -1;
788 /* No previous record? Start at top of chain. */
790 if (ofs_read(tdb
, TDB_HASH_TOP(tlock
->hash
),
794 /* Othereisre unlock the previous record. */
795 unlock_record(tdb
, tlock
->off
);
799 /* Grab next record */
800 if (rec_read(tdb
, tlock
->off
, rec
) == -1) goto fail
;
801 tlock
->off
= rec
->next
;
805 /* Iterate through chain */
806 for (; tlock
->off
; tlock
->off
= next
) {
807 if (rec_read(tdb
, tlock
->off
, rec
) == -1) goto fail
;
808 if (!TDB_DEAD(rec
)) {
809 /* Woohoo: we found one! */
810 lock_record(tdb
, tlock
->off
);
813 /* Try to clean dead ones from old traverses */
815 do_delete(tdb
, tlock
->off
, rec
);
817 tdb_unlock(tdb
, tlock
->hash
, F_WRLCK
);
819 /* We finished iteration without finding anything */
820 return TDB_ERRCODE(TDB_SUCCESS
, 0);
824 tdb_unlock(tdb
, tlock
->hash
, F_WRLCK
);
828 /* traverse the entire database - calling fn(tdb, key, data) on each element.
829 return -1 on error or the record count traversed
830 if fn is NULL then it is not called
831 a non-zero return value from fn() indicates that the traversal should stop
833 int tdb_traverse(TDB_CONTEXT
*tdb
, tdb_traverse_func fn
, void *state
)
836 struct list_struct rec
;
837 struct tdb_traverse_lock tl
= { tdb
->travlocks
.next
, 0, 0 };
840 /* fcntl locks don't stack: beware traverse inside traverse */
841 tdb
->travlocks
.next
= &tl
;
843 /* tdb_next_lock places locks on the record returned, and its chain */
844 while ((ret
= tdb_next_lock(tdb
, &tl
, &rec
)) > 0) {
846 /* now read the full record */
847 key
.dptr
= tdb_alloc_read(tdb
, tl
.off
+ sizeof(rec
),
848 rec
.key_len
+ rec
.data_len
);
850 tdb_unlock(tdb
, tl
.hash
, F_WRLCK
);
851 unlock_record(tdb
, tl
.off
);
852 tdb
->travlocks
.next
= tl
.next
;
855 key
.dsize
= rec
.key_len
;
856 dbuf
.dptr
= key
.dptr
+ rec
.key_len
;
857 dbuf
.dsize
= rec
.data_len
;
859 /* Drop chain lock, call out */
860 tdb_unlock(tdb
, tl
.hash
, F_WRLCK
);
861 if (fn
&& fn(tdb
, key
, dbuf
, state
)) {
862 /* They want us to terminate traversal */
863 unlock_record(tdb
, tl
.off
);
864 tdb
->travlocks
.next
= tl
.next
;
870 tdb
->travlocks
.next
= tl
.next
;
871 if (ret
< 0) return -1;
875 /* find the first entry in the database and return its key */
876 TDB_DATA
tdb_firstkey(TDB_CONTEXT
*tdb
)
879 struct list_struct rec
;
881 /* release any old lock */
882 unlock_record(tdb
, tdb
->travlocks
.off
);
883 tdb
->travlocks
.off
= tdb
->travlocks
.hash
= 0;
885 if (tdb_next_lock(tdb
, &tdb
->travlocks
, &rec
) <= 0) return tdb_null
;
886 /* now read the key */
887 key
.dsize
= rec
.key_len
;
888 key
.dptr
=tdb_alloc_read(tdb
,tdb
->travlocks
.off
+sizeof(rec
),key
.dsize
);
889 tdb_unlock(tdb
, BUCKET(tdb
->travlocks
.hash
), F_WRLCK
);
893 /* find the next entry in the database, returning its key */
894 TDB_DATA
tdb_nextkey(TDB_CONTEXT
*tdb
, TDB_DATA oldkey
)
897 TDB_DATA key
= tdb_null
;
898 struct list_struct rec
;
901 /* Is locked key the old key? If so, traverse will be reliable. */
902 if (tdb
->travlocks
.off
) {
903 if (tdb_lock(tdb
,tdb
->travlocks
.hash
,F_WRLCK
)) return tdb_null
;
904 if (rec_read(tdb
, tdb
->travlocks
.off
, &rec
) == -1
905 || !(k
= tdb_alloc_read(tdb
,tdb
->travlocks
.off
+sizeof(rec
),
907 || memcmp(k
, oldkey
.dptr
, oldkey
.dsize
) != 0) {
908 /* No, it wasn't: unlock it and start from scratch */
910 unlock_record(tdb
, tdb
->travlocks
.off
);
911 tdb_unlock(tdb
, tdb
->travlocks
.hash
, F_WRLCK
);
912 tdb
->travlocks
.off
= 0;
916 if (!tdb
->travlocks
.off
) {
917 /* No previous element: do normal find, and lock record */
918 tdb
->travlocks
.off
= tdb_find_lock(tdb
, oldkey
, F_WRLCK
, &rec
);
919 if (!tdb
->travlocks
.off
) return tdb_null
;
920 tdb
->travlocks
.hash
= BUCKET(rec
.full_hash
);
921 lock_record(tdb
, tdb
->travlocks
.off
);
923 oldhash
= tdb
->travlocks
.hash
;
925 /* Grab next record: locks chain and returned record,
926 unlocks old record */
927 if (tdb_next_lock(tdb
, &tdb
->travlocks
, &rec
) > 0) {
928 key
.dsize
= rec
.key_len
;
929 key
.dptr
= tdb_alloc_read(tdb
, tdb
->travlocks
.off
+sizeof(rec
),
931 /* Unlock the chain of this new record */
932 tdb_unlock(tdb
, tdb
->travlocks
.hash
, F_WRLCK
);
934 /* Unlock the chain of old record */
935 tdb_unlock(tdb
, BUCKET(oldhash
), F_WRLCK
);
939 /* delete an entry in the database given a key */
940 int tdb_delete(TDB_CONTEXT
*tdb
, TDB_DATA key
)
943 struct list_struct rec
;
946 if (!(rec_ptr
= tdb_find_lock(tdb
, key
, F_WRLCK
, &rec
))) return -1;
947 ret
= do_delete(tdb
, rec_ptr
, &rec
);
948 tdb_unlock(tdb
, BUCKET(rec
.full_hash
), F_WRLCK
);
952 /* store an element in the database, replacing any existing element
955 return 0 on success, -1 on failure
957 int tdb_store(TDB_CONTEXT
*tdb
, TDB_DATA key
, TDB_DATA dbuf
, int flag
)
959 struct list_struct rec
;
965 /* find which hash bucket it is in */
966 hash
= tdb_hash(&key
);
967 if (!tdb_keylocked(tdb
, hash
)) return -1;
968 tdb_lock(tdb
, BUCKET(hash
), F_WRLCK
);
970 /* check for it existing, on insert. */
971 if (flag
== TDB_INSERT
) {
972 if (tdb_exists(tdb
, key
)) {
973 tdb
->ecode
= TDB_ERR_EXISTS
;
977 /* first try in-place update, on modify or replace. */
978 if (tdb_update(tdb
, key
, dbuf
) == 0) goto out
;
979 if (flag
== TDB_MODIFY
&& tdb
->ecode
== TDB_ERR_NOEXIST
)
982 /* reset the error code potentially set by the tdb_update() */
983 tdb
->ecode
= TDB_SUCCESS
;
985 /* delete any existing record - if it doesn't exist we don't
986 care. Doing this first reduces fragmentation, and avoids
987 coalescing with `allocated' block before it's updated. */
988 if (flag
!= TDB_INSERT
) tdb_delete(tdb
, key
);
990 /* now we're into insert / modify / replace of a record which
991 * we know could not be optimised by an in-place store (for
992 * various reasons). */
993 if (!(rec_ptr
= tdb_allocate(tdb
, key
.dsize
+ dbuf
.dsize
))) goto fail
;
995 /* read the newly created record, then read hash top into next ptr */
996 if (rec_free_read(tdb
, rec_ptr
, &rec
) == -1) goto fail
;
997 if (ofs_read(tdb
, TDB_HASH_TOP(hash
), &rec
.next
) == -1) goto fail
;
999 rec
.key_len
= key
.dsize
;
1000 rec
.data_len
= dbuf
.dsize
;
1001 rec
.full_hash
= hash
;
1002 rec
.magic
= TDB_MAGIC
;
1004 if (!(p
= (char *)malloc(key
.dsize
+ dbuf
.dsize
))) {
1005 tdb
->ecode
= TDB_ERR_OOM
;
1009 memcpy(p
, key
.dptr
, key
.dsize
);
1010 memcpy(p
+key
.dsize
, dbuf
.dptr
, dbuf
.dsize
);
1011 /* write out and point the top of the hash chain at it */
1012 if (rec_write(tdb
, rec_ptr
, &rec
) == -1
1013 || tdb_write(tdb
, rec_ptr
+sizeof(rec
), p
, key
.dsize
+dbuf
.dsize
)==-1
1014 || ofs_write(tdb
, TDB_HASH_TOP(hash
), &rec_ptr
) == -1) {
1020 tdb_unlock(tdb
, BUCKET(hash
), F_WRLCK
);
1024 /* open the database, creating it if necessary
1026 The open_flags and mode are passed straight to the open call on the
1027 database file. A flags value of O_WRONLY is invalid. The hash size
1028 is advisory, use zero for a default value.
1030 return is NULL on error */
1031 TDB_CONTEXT
*tdb_open(char *name
, int hash_size
, int tdb_flags
,
1032 int open_flags
, mode_t mode
)
1034 TDB_CONTEXT tdb
, *ret
, *i
;
1036 int rev
= 0, locked
;
1038 memset(&tdb
, 0, sizeof(tdb
));
1042 tdb
.lockedkeys
= NULL
;
1043 tdb
.flags
= tdb_flags
;
1045 if ((open_flags
& O_ACCMODE
) == O_WRONLY
) goto fail
;
1046 if (hash_size
== 0) hash_size
= DEFAULT_HASH_SIZE
;
1047 if ((open_flags
& O_ACCMODE
) == O_RDONLY
) {
1049 /* read only databases don't do locking */
1050 tdb
.flags
|= TDB_NOLOCK
;
1053 /* internal databases don't mmap or lock, and start off cleared */
1054 if (tdb
.flags
& TDB_INTERNAL
) {
1055 tdb
.flags
|= (TDB_NOLOCK
| TDB_NOMMAP
);
1056 tdb
.flags
&= ~TDB_CLEAR_IF_FIRST
;
1057 tdb_new_database(&tdb
, hash_size
);
1061 if ((tdb
.fd
= open(name
, open_flags
, mode
)) == -1) goto fail
;
1063 /* ensure there is only one process initialising at once */
1064 tdb_brlock(&tdb
, GLOBAL_LOCK
, F_WRLCK
, F_SETLKW
);
1066 /* we need to zero database if we are the only one with it open */
1067 if ((locked
= (tdb_brlock(&tdb
, ACTIVE_LOCK
, F_WRLCK
, F_SETLK
) == 0))
1068 && (tdb_flags
& TDB_CLEAR_IF_FIRST
)) {
1069 open_flags
|= O_CREAT
;
1070 ftruncate(tdb
.fd
, 0);
1073 if (read(tdb
.fd
, &tdb
.header
, sizeof(tdb
.header
)) != sizeof(tdb
.header
)
1074 || strcmp(tdb
.header
.magic_food
, TDB_MAGIC_FOOD
) != 0
1075 || (tdb
.header
.version
!= TDB_VERSION
1076 && !(rev
= (tdb
.header
.version
==TDB_BYTEREV(TDB_VERSION
))))) {
1077 /* its not a valid database - possibly initialise it */
1078 if (!(open_flags
& O_CREAT
)
1079 || tdb_new_database(&tdb
, hash_size
) == -1) goto fail
;
1080 rev
= (tdb
.flags
& TDB_CONVERT
);
1082 if (!rev
) tdb
.flags
&= ~TDB_CONVERT
;
1084 tdb
.flags
|= TDB_CONVERT
;
1085 convert(&tdb
.header
, sizeof(tdb
.header
));
1088 /* Is it already in the open list? If so, fail. */
1089 for (i
= tdbs
; i
; i
= i
->next
) {
1090 if (i
->device
== st
.st_dev
&& i
->inode
== st
.st_ino
) {
1097 /* map the database and fill in the return structure */
1098 tdb
.name
= (char *)strdup(name
);
1099 tdb
.map_size
= st
.st_size
;
1100 tdb
.device
= st
.st_dev
;
1101 tdb
.inode
= st
.st_ino
;
1102 tdb
.locked
= calloc(tdb
.header
.hash_size
+1, sizeof(tdb
.locked
[0]));
1103 if (!tdb
.locked
) goto fail
;
1104 if (!(tdb
.flags
& TDB_NOMMAP
))
1105 tdb
.map_ptr
= tdb_mmap(st
.st_size
, tdb
.read_only
, tdb
.fd
);
1107 tdb_clear_spinlocks(&tdb
);
1108 tdb_brlock(&tdb
, ACTIVE_LOCK
, F_UNLCK
, F_SETLK
);
1110 /* leave this lock in place to indicate it's in use */
1111 tdb_brlock(&tdb
, ACTIVE_LOCK
, F_RDLCK
, F_SETLKW
);
1114 if (!(ret
= malloc(sizeof(tdb
)))) goto fail
;
1116 tdb_brlock(&tdb
, GLOBAL_LOCK
, F_UNLCK
, F_SETLKW
);
1122 if (tdb
.name
) free(tdb
.name
);
1123 if (tdb
.fd
!= -1) close(tdb
.fd
);
1124 if (tdb
.map_ptr
) tdb_munmap(tdb
.map_ptr
, tdb
.map_size
);
1128 /* close a database */
1129 int tdb_close(TDB_CONTEXT
*tdb
)
1135 if (tdb
->flags
& TDB_INTERNAL
) free(tdb
->map_ptr
);
1136 else tdb_munmap(tdb
->map_ptr
, tdb
->map_size
);
1138 if (tdb
->name
) free(tdb
->name
);
1139 if (tdb
->fd
!= -1) {
1140 ret
= close(tdb
->fd
);
1142 if (tdb
->locked
) free(tdb
->locked
);
1143 if (tdb
->lockedkeys
) free(tdb
->lockedkeys
);
1145 /* Remove from contexts list */
1146 for (i
= &tdbs
; *i
; i
= &(*i
)->next
) {
1153 memset(tdb
, 0, sizeof(*tdb
));
1159 /* lock/unlock entire database */
1160 int tdb_lockall(TDB_CONTEXT
*tdb
)
1164 /* There are no locks on read-only dbs */
1165 if (tdb
->read_only
) return TDB_ERRCODE(TDB_ERR_LOCK
, -1);
1166 if (tdb
->lockedkeys
) return TDB_ERRCODE(TDB_ERR_NOLOCK
, -1);
1167 for (i
= 0; i
< tdb
->header
.hash_size
; i
++) tdb_lock(tdb
, i
, F_WRLCK
);
1170 void tdb_unlockall(TDB_CONTEXT
*tdb
)
1173 for (i
=0; i
< tdb
->header
.hash_size
; i
++) tdb_unlock(tdb
, i
, F_WRLCK
);
1176 int tdb_lockkeys(TDB_CONTEXT
*tdb
, u32 number
, TDB_DATA keys
[])
1180 /* Can't lock more keys if already locked */
1181 if (tdb
->lockedkeys
) return TDB_ERRCODE(TDB_ERR_NOLOCK
, -1);
1182 if (!(tdb
->lockedkeys
= malloc(sizeof(u32
) * (number
+1))))
1183 return TDB_ERRCODE(TDB_ERR_OOM
, -1);
1184 /* First number in array is # keys */
1185 tdb
->lockedkeys
[0] = number
;
1187 /* Insertion sort by bucket */
1188 for (i
= 0; i
< number
; i
++) {
1189 hash
= tdb_hash(&keys
[i
]);
1191 j
< i
&& BUCKET(tdb
->lockedkeys
[j
+1]) < BUCKET(hash
);
1193 memmove(&tdb
->lockedkeys
[j
+2], &tdb
->lockedkeys
[j
+1],
1194 sizeof(u32
) * (i
-j
));
1195 tdb
->lockedkeys
[j
+1] = hash
;
1197 /* Finally, lock in order */
1198 for (i
= 0; i
< number
; i
++) tdb_lock(tdb
, i
, F_WRLCK
);
1202 /* Unlock the keys previously locked by tdb_lockkeys() */
1203 void tdb_unlockkeys(TDB_CONTEXT
*tdb
)
1206 for (i
= 0; i
< tdb
->lockedkeys
[0]; i
++)
1207 tdb_unlock(tdb
, tdb
->lockedkeys
[i
+1], F_WRLCK
);
1208 free(tdb
->lockedkeys
);
1209 tdb
->lockedkeys
= NULL
;
1212 /* lock/unlock one hash chain. This is meant to be used to reduce
1213 contention - it cannot guarantee how many records will be locked */
1214 int tdb_chainlock(TDB_CONTEXT
*tdb
, TDB_DATA key
)
1216 return tdb_lock(tdb
, BUCKET(tdb_hash(&key
)), F_WRLCK
);
1218 void tdb_chainunlock(TDB_CONTEXT
*tdb
, TDB_DATA key
)
1220 tdb_unlock(tdb
, BUCKET(tdb_hash(&key
)), F_WRLCK
);