2 #include <ccan/hash/hash.h>
5 /* All lock info, to detect double-opens (fcntl file locks don't nest!).
 * This is the head of a singly linked list of struct tdb_file, chained
 * through ->next and searched by find_file(). */
6 static struct tdb_file
*files
= NULL
;
/* Walk the global `files` list looking for the entry whose device and
 * inode both equal the arguments.
 *
 * device, ino: identity of the file to look up (tdb_open passes st_dev
 *              and st_ino from stat()).
 *
 * NOTE(review): the body of the match branch and the return statements are
 * not visible in this listing; presumably the matching entry is returned,
 * or NULL when nothing matches -- confirm against the full source. */
8 static struct tdb_file
*find_file(dev_t device
, ino_t ino
)
12 for (i
= files
; i
; i
= i
->next
) {
13 if (i
->device
== device
&& i
->inode
== ino
) {
/* Read from fd into buf with read(2), advancing buf by the number of bytes
 * each call returned.
 *
 * NOTE(review): the loop condition, the handling of read() returning 0 or
 * -1, and the return statements are not visible in this listing.
 * random_number() treats a true result as "buffer filled", so presumably
 * true means all len bytes were read -- confirm. */
21 static bool read_all(int fd
, void *buf
, size_t len
)
25 ret
= read(fd
, buf
, len
);
/* Step past the bytes just read (cast because void * has no arithmetic). */
33 buf
= (char *)buf
+ ret
;
/* Produce a 64-bit value to use as a hash seed (see tdb_new_database()).
 *
 * Sources tried, in the order they appear below:
 *   1. sizeof(ret) bytes from /dev/urandom via read_all();
 *   2. an EGD-style request written to /dev/egd-pool;
 *   3. a mix of getpid() and gettimeofday(), logged as a warning.
 *
 * tdb: context, used here only for logging the fallback warning.
 *
 * NOTE(review): the checks on the open() results, the close() calls and the
 * return statements are not visible in this listing. */
39 static uint64_t random_number(struct tdb_context
*tdb
)
45 fd
= open("/dev/urandom", O_RDONLY
);
47 if (read_all(fd
, &ret
, sizeof(ret
))) {
53 /* FIXME: Untested! Based on Wikipedia protocol description! */
54 fd
= open("/dev/egd-pool", O_RDWR
);
56 /* Command is 1, next byte is size we want to read. */
57 char cmd
[2] = { 1, sizeof(uint64_t) };
58 if (write(fd
, cmd
, sizeof(cmd
)) == sizeof(cmd
)) {
/* Reply buffer: one leading byte plus sizeof(uint64_t) payload bytes;
 * the leading byte is compared with the requested size below. */
59 char reply
[1 + sizeof(uint64_t)];
60 int r
= read(fd
, reply
, sizeof(reply
));
/* NOTE(review): the copy length below is r - 1.  No guard on r is visible in
 * this listing; confirm one exists (e.g. r > 1), because r <= 0 would
 * convert to an enormous size_t length. */
62 /* Copy at least some bytes. */
63 memcpy(&ret
, reply
+1, r
- 1);
/* A full-length reply whose first byte equals the requested size. */
64 if (reply
[0] == sizeof(uint64_t)
65 && r
== sizeof(reply
)) {
74 /* Fallback: pid and time. */
75 gettimeofday(&now
, NULL
);
76 ret
= getpid() * 100132289ULL + now
.tv_sec
* 1000000ULL + now
.tv_usec
;
/* Logged with TDB_SUCCESS at warning level: not an error, but the seed is
 * derived only from the pid and the clock. */
77 tdb_logerr(tdb
, TDB_SUCCESS
, TDB_LOG_WARNING
,
78 "tdb_open: random from getpid and time");
/* Members of the in-memory image built by tdb_new_database(): the database
 * header followed by the initial free table.
 * NOTE(review): the enclosing struct declaration line is not visible in
 * this listing; these appear to be the fields of struct new_database
 * (tdb_new_database() uses newdb.hdr, newdb.ftable and
 * offsetof(struct new_database, ftable)) -- confirm. */
83 struct tdb_header hdr
;
84 struct tdb_freetable ftable
;
87 /* Initialise a new database.
 *
 * Builds a struct new_database (header + empty free table) on the stack,
 * then either copies it into a malloc'ed buffer (TDB_INTERNAL) or writes it
 * to tdb->file->fd after seeking to 0 and truncating the file.
 *
 * tdb:  context being opened; its flags, hash_fn and file are used.
 * seed: optional seed attribute; seed->seed is used for hdr.hash_seed.
 * hdr:  caller's header buffer.
 *
 * Returns an enum TDB_ERROR; the visible error paths return TDB_ERR_OOM
 * (allocation) and TDB_ERR_IO (seek/truncate/write) via tdb_logerr().
 *
 * NOTE(review): several lines are not visible in this listing, including
 * the test that chooses between seed->seed and random_number(), the copy
 * into *hdr, and the success return. */
88 static enum TDB_ERROR
tdb_new_database(struct tdb_context
*tdb
,
89 struct tdb_attribute_seed
*seed
,
90 struct tdb_header
*hdr
)
92 /* We make it up in memory, then write it out if not internal */
93 struct new_database newdb
;
94 unsigned int magic_len
;
98 /* Fill in the header */
99 newdb
.hdr
.version
= TDB_VERSION
;
/* Seed from the caller's attribute ... */
101 newdb
.hdr
.hash_seed
= seed
->seed
;
/* ... or a freshly generated one (the selecting condition is not visible). */
103 newdb
.hdr
.hash_seed
= random_number(tdb
);
/* hash_test holds the hash of TDB_HASH_MAGIC under this database's hash
 * function; tdb_open() recomputes it to detect a different hash function. */
104 newdb
.hdr
.hash_test
= TDB_HASH_MAGIC
;
105 newdb
.hdr
.hash_test
= tdb
->hash_fn(&newdb
.hdr
.hash_test
,
106 sizeof(newdb
.hdr
.hash_test
),
109 newdb
.hdr
.recovery
= 0;
110 newdb
.hdr
.features_used
= newdb
.hdr
.features_offered
= TDB_FEATURE_MASK
;
111 newdb
.hdr
.seqnum
= 0;
112 memset(newdb
.hdr
.reserved
, 0, sizeof(newdb
.hdr
.reserved
));
113 /* Initial hashes are empty. */
114 memset(newdb
.hdr
.hashtable
, 0, sizeof(newdb
.hdr
.hashtable
));
/* The free table sits directly after the header in the image, so its file
 * offset is its offset within struct new_database. */
117 newdb
.hdr
.free_table
= offsetof(struct new_database
, ftable
);
118 memset(&newdb
.ftable
, 0, sizeof(newdb
.ftable
));
/* Record header for the free table; both length arguments shown are the
 * table size minus its own record header. */
119 ecode
= set_header(NULL
, &newdb
.ftable
.hdr
, TDB_FTABLE_MAGIC
, 0,
120 sizeof(newdb
.ftable
) - sizeof(newdb
.ftable
.hdr
),
121 sizeof(newdb
.ftable
) - sizeof(newdb
.ftable
.hdr
),
123 if (ecode
!= TDB_SUCCESS
) {
/* Zero the whole magic field first so the bytes after the string are
 * deterministic. */
128 memset(newdb
.hdr
.magic_food
, 0, sizeof(newdb
.hdr
.magic_food
));
129 strcpy(newdb
.hdr
.magic_food
, TDB_MAGIC_FOOD
);
131 /* This creates an endian-converted database, as if read from disk */
132 magic_len
= sizeof(newdb
.hdr
.magic_food
);
/* Arguments of a call whose name is not visible in this listing: it is
 * given everything in newdb after the magic string. */
134 (char *)&newdb
.hdr
+ magic_len
, sizeof(newdb
) - magic_len
);
/* Internal database: the image lives in heap memory, no file involved. */
138 if (tdb
->flags
& TDB_INTERNAL
) {
139 tdb
->file
->map_size
= sizeof(newdb
);
140 tdb
->file
->map_ptr
= malloc(tdb
->file
->map_size
);
141 if (!tdb
->file
->map_ptr
) {
142 return tdb_logerr(tdb
, TDB_ERR_OOM
, TDB_LOG_ERROR
,
144 " failed to allocate");
146 memcpy(tdb
->file
->map_ptr
, &newdb
, tdb
->file
->map_size
);
/* File-backed database: rewind, truncate to empty, then write the image. */
149 if (lseek(tdb
->file
->fd
, 0, SEEK_SET
) == -1) {
150 return tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
152 " failed to seek: %s", strerror(errno
));
155 if (ftruncate(tdb
->file
->fd
, 0) == -1) {
156 return tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
158 " failed to truncate: %s", strerror(errno
));
161 rlen
= write(tdb
->file
->fd
, &newdb
, sizeof(newdb
));
/* Anything other than a complete write is reported as an I/O error. */
162 if (rlen
!= sizeof(newdb
)) {
165 return tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
166 "tdb_new_database: %zi writing header: %s",
167 rlen
, strerror(errno
));
/* Allocate tdb->file and initialise its lock bookkeeping: no lock records,
 * no all-record lock held, and a reference count of 1.
 *
 * Returns an enum TDB_ERROR; the visible failure path logs and returns
 * TDB_ERR_OOM.
 *
 * NOTE(review): the NULL test guarding the error return and the success
 * return are not visible in this listing. */
172 static enum TDB_ERROR
tdb_new_file(struct tdb_context
*tdb
)
174 tdb
->file
= malloc(sizeof(*tdb
->file
));
176 return tdb_logerr(tdb
, TDB_ERR_OOM
, TDB_LOG_ERROR
,
177 "tdb_open: cannot alloc tdb_file structure");
178 tdb
->file
->num_lockrecs
= 0;
179 tdb
->file
->lockrecs
= NULL
;
180 tdb
->file
->allrecord_lock
.count
= 0;
/* The creating context holds the first reference. */
181 tdb
->file
->refcnt
= 1;
/* Apply one attribute to a tdb context, dispatching on attr->base.attr.
 *
 *   TDB_ATTRIBUTE_LOG    - installs log_fn / log_data.
 *   TDB_ATTRIBUTE_FLOCK  - installs lock_fn / unlock_fn / lock_data.
 *   TDB_ATTRIBUTE_HASH, _SEED, _OPENHOOK - rejected with TDB_ERR_EINVAL
 *                          ("cannot set ... after opening").
 *   TDB_ATTRIBUTE_STATS  - rejected with TDB_ERR_EINVAL.
 *   anything else        - rejected with TDB_ERR_EINVAL (unknown type).
 *
 * Every visible error path also stores the error in tdb->last_error.
 *
 * NOTE(review): the break statements, the default label, some tdb_logerr()
 * arguments and the success return are not visible in this listing. */
185 enum TDB_ERROR
tdb_set_attribute(struct tdb_context
*tdb
,
186 const union tdb_attribute
*attr
)
188 switch (attr
->base
.attr
) {
189 case TDB_ATTRIBUTE_LOG
:
190 tdb
->log_fn
= attr
->log
.fn
;
191 tdb
->log_data
= attr
->log
.data
;
/* These three share one error path; the nested conditional only selects the
 * attribute name for the message. */
193 case TDB_ATTRIBUTE_HASH
:
194 case TDB_ATTRIBUTE_SEED
:
195 case TDB_ATTRIBUTE_OPENHOOK
:
196 return tdb
->last_error
197 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
200 " cannot set %s after opening",
201 attr
->base
.attr
== TDB_ATTRIBUTE_HASH
202 ? "TDB_ATTRIBUTE_HASH"
203 : attr
->base
.attr
== TDB_ATTRIBUTE_SEED
204 ? "TDB_ATTRIBUTE_SEED"
205 : "TDB_ATTRIBUTE_OPENHOOK");
206 case TDB_ATTRIBUTE_STATS
:
207 return tdb
->last_error
208 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
211 " cannot set TDB_ATTRIBUTE_STATS");
212 case TDB_ATTRIBUTE_FLOCK
:
213 tdb
->lock_fn
= attr
->flock
.lock
;
214 tdb
->unlock_fn
= attr
->flock
.unlock
;
215 tdb
->lock_data
= attr
->flock
.data
;
/* Unrecognised attribute type. */
218 return tdb
->last_error
219 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
222 " unknown attribute type %u",
/* Default hash function installed by tdb_open(): hash64_stable() over the
 * key bytes with the given seed, returned with its upper and lower 32-bit
 * halves exchanged.
 *
 * key, length: bytes to hash.
 * seed:        hash seed.
 *
 * NOTE(review): the final parameter of this function is not visible in this
 * listing. */
228 static uint64_t jenkins_hash(const void *key
, size_t length
, uint64_t seed
,
232 /* hash64_stable assumes lower bits are more important; they are a
233 * slightly better hash. We use the upper bits first, so swap them. */
234 ret
= hash64_stable((const unsigned char *)key
, length
, seed
);
235 return (ret
>> 32) | (ret
<< 32);
/* Fill in *attr with the context's current value for the attribute type
 * named by attr->base.attr.
 *
 *   TDB_ATTRIBUTE_LOG      - copies log_fn / log_data; one path returns
 *                            TDB_ERR_NOEXIST instead.
 *   TDB_ATTRIBUTE_HASH     - copies hash_fn / hash_data.
 *   TDB_ATTRIBUTE_SEED     - copies hash_seed.
 *   TDB_ATTRIBUTE_STATS    - copies at most the caller-supplied
 *                            attr->stats.size bytes of tdb->stats.
 *   TDB_ATTRIBUTE_FLOCK    - copies lock_fn / unlock_fn / lock_data.
 *   TDB_ATTRIBUTE_OPENHOOK - rejected with TDB_ERR_EINVAL.
 *   anything else          - rejected with TDB_ERR_EINVAL (unknown type).
 *
 * Visible error paths also store the error in tdb->last_error.
 *
 * NOTE(review): the break statements, the condition guarding the
 * TDB_ERR_NOEXIST return (presumably "no log function set" -- confirm), the
 * default label and the success return are not visible in this listing. */
238 enum TDB_ERROR
tdb_get_attribute(struct tdb_context
*tdb
,
239 union tdb_attribute
*attr
)
241 switch (attr
->base
.attr
) {
242 case TDB_ATTRIBUTE_LOG
:
244 return tdb
->last_error
= TDB_ERR_NOEXIST
;
245 attr
->log
.fn
= tdb
->log_fn
;
246 attr
->log
.data
= tdb
->log_data
;
248 case TDB_ATTRIBUTE_HASH
:
249 attr
->hash
.fn
= tdb
->hash_fn
;
250 attr
->hash
.data
= tdb
->hash_data
;
252 case TDB_ATTRIBUTE_SEED
:
253 attr
->seed
.seed
= tdb
->hash_seed
;
255 case TDB_ATTRIBUTE_OPENHOOK
:
256 return tdb
->last_error
257 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
260 " cannot get TDB_ATTRIBUTE_OPENHOOK");
261 case TDB_ATTRIBUTE_STATS
: {
/* Copy no more than the smaller of the caller's declared size and our own
 * stats structure size. */
262 size_t size
= attr
->stats
.size
;
263 if (size
> tdb
->stats
.size
)
264 size
= tdb
->stats
.size
;
265 memcpy(&attr
->stats
, &tdb
->stats
, size
);
268 case TDB_ATTRIBUTE_FLOCK
:
269 attr
->flock
.lock
= tdb
->lock_fn
;
270 attr
->flock
.unlock
= tdb
->unlock_fn
;
271 attr
->flock
.data
= tdb
->lock_data
;
/* Unrecognised attribute type. */
274 return tdb
->last_error
275 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
278 " unknown attribute type %u",
/* The returned attribute is handed back unchained. */
281 attr
->base
.next
= NULL
;
/* Reset one attribute type on a tdb context.
 *
 *   TDB_ATTRIBUTE_FLOCK - restores lock_fn / unlock_fn to tdb_fcntl_lock /
 *                         tdb_fcntl_unlock.
 *   TDB_ATTRIBUTE_HASH, _SEED, _OPENHOOK, _STATS and unknown types - logged
 *                         as TDB_ERR_EINVAL; the function returns void, so
 *                         the error is only reported through tdb_logerr().
 *
 * NOTE(review): the switch header, the TDB_ATTRIBUTE_LOG case body, the
 * break statements and the default label are not visible in this listing. */
285 void tdb_unset_attribute(struct tdb_context
*tdb
,
286 enum tdb_attribute_type type
)
289 case TDB_ATTRIBUTE_LOG
:
292 case TDB_ATTRIBUTE_HASH
:
293 case TDB_ATTRIBUTE_SEED
:
294 case TDB_ATTRIBUTE_OPENHOOK
:
295 tdb_logerr(tdb
, TDB_ERR_EINVAL
, TDB_LOG_USE_ERROR
,
296 "tdb_unset_attribute: cannot unset %s after opening",
297 type
== TDB_ATTRIBUTE_HASH
298 ? "TDB_ATTRIBUTE_HASH"
299 : type
== TDB_ATTRIBUTE_SEED
300 ? "TDB_ATTRIBUTE_SEED"
301 : "TDB_ATTRIBUTE_OPENHOOK");
303 case TDB_ATTRIBUTE_STATS
:
/* NOTE(review): the two adjacent string literals below concatenate with no
 * space after the colon ("tdb_unset_attribute:cannot unset ..."). */
304 tdb_logerr(tdb
, TDB_ERR_EINVAL
,
306 "tdb_unset_attribute:"
307 "cannot unset TDB_ATTRIBUTE_STATS");
309 case TDB_ATTRIBUTE_FLOCK
:
/* Back to the same defaults tdb_open() installs. */
310 tdb
->lock_fn
= tdb_fcntl_lock
;
311 tdb
->unlock_fn
= tdb_fcntl_unlock
;
314 tdb_logerr(tdb
, TDB_ERR_EINVAL
,
316 "tdb_unset_attribute: unknown attribute type %u",
/* Open (and if necessary create) a tdb database and return its context.
 *
 * name:       path of the database file; a copy is stored directly after
 *             the context structure.
 * tdb_flags:  TDB_* flags; anything outside TDB_INTERNAL, TDB_NOLOCK,
 *             TDB_NOMMAP, TDB_CONVERT, TDB_NOSYNC, TDB_SEQNUM and
 *             TDB_ALLOW_NESTING is rejected.
 * open_flags: open(2) flags; O_WRONLY access is rejected, O_RDONLY makes
 *             the context read-only.
 * mode:       permission bits passed to open(2).
 * attr:       optional linked list of attributes (chained via base.next).
 *
 * Visible steps, in order:
 *   1. allocate the context and install defaults (fcntl locking,
 *      jenkins_hash, zeroed stats);
 *   2. walk the attribute list;
 *   3. validate flags and access mode;
 *   4. for TDB_INTERNAL, build the database in memory and finish early;
 *   5. otherwise reuse an already-open file with the same device/inode via
 *      find_file(), or open the file and create a new tdb_file for it;
 *   6. under the open lock: run the open hook, read or create the header,
 *      check magic, version (detecting byte-swapped files) and hash
 *      function, and mask unknown feature bits when writable;
 *   7. refresh the mapping, run recovery if needed, initialise the free
 *      table, and add a new file to the global list.
 * The tail of the function maps the TDB error to an errno value and releases
 * everything acquired so far.
 *
 * NOTE(review): many lines (local declarations, gotos/labels, braces, the
 * returns) are not visible in this listing; presumably the function returns
 * NULL with errno set on failure -- confirm against the full source. */
321 struct tdb_context
*tdb_open(const char *name
, int tdb_flags
,
322 int open_flags
, mode_t mode
,
323 union tdb_attribute
*attr
)
325 struct tdb_context
*tdb
;
331 struct tdb_header hdr
;
332 struct tdb_attribute_seed
*seed
= NULL
;
333 struct tdb_attribute_openhook
*openhook
= NULL
;
335 enum TDB_ERROR ecode
;
/* One allocation holds the context plus, when name is non-NULL, a copy of
 * the name immediately after it. */
338 tdb
= malloc(sizeof(*tdb
) + (name
? strlen(name
) + 1 : 0));
344 /* Set name immediately for logging functions. */
/* NOTE(review): no NULL guard on name is visible around this strcpy, while
 * the allocation above allows name == NULL -- confirm a guard exists. */
346 tdb
->name
= strcpy((char *)(tdb
+ 1), name
);
350 tdb
->direct_access
= 0;
351 tdb
->flags
= tdb_flags
;
353 tdb
->transaction
= NULL
;
355 tdb
->last_error
= TDB_SUCCESS
;
/* Defaults, overridable through attributes below. */
357 tdb
->lock_fn
= tdb_fcntl_lock
;
358 tdb
->unlock_fn
= tdb_fcntl_unlock
;
359 tdb
->hash_fn
= jenkins_hash
;
360 memset(&tdb
->stats
, 0, sizeof(tdb
->stats
));
361 tdb
->stats
.base
.attr
= TDB_ATTRIBUTE_STATS
;
362 tdb
->stats
.size
= sizeof(tdb
->stats
);
/* Attribute walk: hash, seed and open hook are handled here because
 * tdb_set_attribute() refuses them; everything else is delegated to it. */
366 switch (attr
->base
.attr
) {
367 case TDB_ATTRIBUTE_HASH
:
368 tdb
->hash_fn
= attr
->hash
.fn
;
369 tdb
->hash_data
= attr
->hash
.data
;
371 case TDB_ATTRIBUTE_SEED
:
374 case TDB_ATTRIBUTE_OPENHOOK
:
375 openhook
= &attr
->openhook
;
378 /* These are set as normal. */
379 ecode
= tdb_set_attribute(tdb
, attr
);
380 if (ecode
!= TDB_SUCCESS
)
383 attr
= attr
->base
.next
;
/* Reject any flag bit outside the supported set. */
386 if (tdb_flags
& ~(TDB_INTERNAL
| TDB_NOLOCK
| TDB_NOMMAP
| TDB_CONVERT
387 | TDB_NOSYNC
| TDB_SEQNUM
| TDB_ALLOW_NESTING
)) {
/* NOTE(review): tdb_flags is an int but is formatted with %u. */
388 ecode
= tdb_logerr(tdb
, TDB_ERR_EINVAL
, TDB_LOG_USE_ERROR
,
389 "tdb_open: unknown flags %u", tdb_flags
);
393 if ((open_flags
& O_ACCMODE
) == O_WRONLY
) {
394 ecode
= tdb_logerr(tdb
, TDB_ERR_EINVAL
, TDB_LOG_USE_ERROR
,
395 "tdb_open: can't open tdb %s write-only",
/* Access mode decides both the read_only flag and the mmap protection. */
400 if ((open_flags
& O_ACCMODE
) == O_RDONLY
) {
401 tdb
->read_only
= true;
402 tdb
->mmap_flags
= PROT_READ
;
405 tdb
->read_only
= false;
406 tdb
->mmap_flags
= PROT_READ
| PROT_WRITE
;
410 /* internal databases don't need any of the rest. */
411 if (tdb
->flags
& TDB_INTERNAL
) {
/* Internal databases are forced to no locking and no mmap. */
412 tdb
->flags
|= (TDB_NOLOCK
| TDB_NOMMAP
);
413 ecode
= tdb_new_file(tdb
);
414 if (ecode
!= TDB_SUCCESS
) {
418 ecode
= tdb_new_database(tdb
, seed
, &hdr
);
419 if (ecode
!= TDB_SUCCESS
) {
/* Only the seed needs converting here; it is then cached in the context. */
422 tdb_convert(tdb
, &hdr
.hash_seed
, sizeof(hdr
.hash_seed
));
423 tdb
->hash_seed
= hdr
.hash_seed
;
424 tdb_ftable_init(tdb
);
/* If the path already exists, look for a tdb_file already open on the same
 * device/inode so it can be shared. */
428 if (stat(name
, &st
) != -1)
429 tdb
->file
= find_file(st
.st_dev
, st
.st_ino
);
434 if ((fd
= open(name
, open_flags
, mode
)) == -1) {
435 /* errno set by open(2) */
437 tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
438 "tdb_open: could not open file %s: %s",
439 name
, strerror(errno
));
443 /* on exec, don't inherit the fd */
/* NOTE(review): neither fcntl() result is checked here; if F_GETFD fails,
 * v is -1 when it is OR-ed with FD_CLOEXEC. */
444 v
= fcntl(fd
, F_GETFD
, 0);
445 fcntl(fd
, F_SETFD
, v
| FD_CLOEXEC
);
/* Re-stat through the descriptor so device/inode describe the file that was
 * actually opened. */
447 if (fstat(fd
, &st
) == -1) {
449 tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
450 "tdb_open: could not stat open %s: %s",
451 name
, strerror(errno
));
456 ecode
= tdb_new_file(tdb
);
457 if (ecode
!= TDB_SUCCESS
) {
462 tdb
->file
->next
= files
;
464 tdb
->file
->device
= st
.st_dev
;
465 tdb
->file
->inode
= st
.st_ino
;
466 tdb
->file
->map_ptr
= NULL
;
/* Provisional size: just the header, until the real size is established by
 * the oob() call further down. */
467 tdb
->file
->map_size
= sizeof(struct tdb_header
);
470 /* ensure there is only one process initialising at once */
471 ecode
= tdb_lock_open(tdb
, openlock
, TDB_LOCK_WAIT
|TDB_LOCK_NOCHECK
);
472 if (ecode
!= TDB_SUCCESS
) {
477 /* call their open hook if they gave us one. */
479 ecode
= openhook
->fn(tdb
->file
->fd
, openhook
->data
);
480 if (ecode
!= TDB_SUCCESS
) {
481 tdb_logerr(tdb
, ecode
, TDB_LOG_ERROR
,
482 "tdb_open: open hook failed");
/* Setting O_CREAT here allows the empty-file test below to create a new
 * database (the condition guarding this line is not visible). */
485 open_flags
|= O_CREAT
;
488 /* If they used O_TRUNC, read will return 0. */
489 rlen
= pread(tdb
->file
->fd
, &hdr
, sizeof(hdr
), 0);
/* Empty file and creation allowed: write a fresh database. */
490 if (rlen
== 0 && (open_flags
& O_CREAT
)) {
491 ecode
= tdb_new_database(tdb
, seed
, &hdr
);
492 if (ecode
!= TDB_SUCCESS
) {
495 } else if (rlen
< 0) {
496 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
497 "tdb_open: error %s reading %s",
498 strerror(errno
), name
);
/* Short header or wrong magic string: not a tdb file. */
500 } else if (rlen
< sizeof(hdr
)
501 || strcmp(hdr
.magic_food
, TDB_MAGIC_FOOD
) != 0) {
502 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
503 "tdb_open: %s is not a tdb file", name
);
/* A byte-swapped version number means the file was written with the other
 * endianness: enable conversion.  Any other mismatch is an error. */
507 if (hdr
.version
!= TDB_VERSION
) {
508 if (hdr
.version
== bswap_64(TDB_VERSION
))
509 tdb
->flags
|= TDB_CONVERT
;
512 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
514 " %s is unknown version 0x%llx",
515 name
, (long long)hdr
.version
);
520 tdb_convert(tdb
, &hdr
, sizeof(hdr
));
521 tdb
->hash_seed
= hdr
.hash_seed
;
/* Recompute the hash of TDB_HASH_MAGIC with this context's hash function and
 * seed; a mismatch with the stored value means a different hash function. */
522 hash_test
= TDB_HASH_MAGIC
;
523 hash_test
= tdb_hash(tdb
, &hash_test
, sizeof(hash_test
));
524 if (hdr
.hash_test
!= hash_test
) {
525 /* wrong hash variant */
526 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
528 " %s uses a different hash function",
533 /* Clear any features we don't understand. */
534 if ((open_flags
& O_ACCMODE
) != O_RDONLY
) {
535 hdr
.features_used
&= TDB_FEATURE_MASK
;
536 if (tdb_write_convert(tdb
, offsetof(struct tdb_header
,
539 sizeof(hdr
.features_used
)) == -1)
543 tdb_unlock_open(tdb
, openlock
);
545 /* This makes sure we have the current map_size and mmap. */
546 tdb
->methods
->oob(tdb
, tdb
->file
->map_size
+ 1, true);
548 /* Now it's fully formed, recover if necessary. */
549 berr
= tdb_needs_recovery(tdb
);
550 if (unlikely(berr
!= false)) {
555 ecode
= tdb_lock_and_recover(tdb
);
556 if (ecode
!= TDB_SUCCESS
) {
561 ecode
= tdb_ftable_init(tdb
);
562 if (ecode
!= TDB_SUCCESS
) {
566 /* Add to linked list if we're new. */
567 if (tdb
->file
->refcnt
== 1)
572 /* Map ecode to some logical errno. */
574 case TDB_ERR_CORRUPT
:
579 saved_errno
= EWOULDBLOCK
;
582 saved_errno
= ENOMEM
;
585 saved_errno
= EINVAL
;
588 saved_errno
= EINVAL
;
/* Failure cleanup: drop our locks, and if ours was the last reference to the
 * file, release its mapping (heap buffer for internal databases, mmap
 * otherwise), its descriptor and its lock records. */
597 tdb_lock_cleanup(tdb
);
598 if (--tdb
->file
->refcnt
== 0) {
599 assert(tdb
->file
->num_lockrecs
== 0);
600 if (tdb
->file
->map_ptr
) {
601 if (tdb
->flags
& TDB_INTERNAL
) {
602 free(tdb
->file
->map_ptr
);
604 tdb_munmap(tdb
->file
);
606 if (close(tdb
->file
->fd
) != 0)
607 tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
608 "tdb_open: failed to close tdb fd"
609 " on error: %s", strerror(errno
));
610 free(tdb
->file
->lockrecs
);
620 int tdb_close(struct tdb_context
*tdb
)
624 tdb_trace(tdb
, "tdb_close");
626 if (tdb
->transaction
) {
627 tdb_transaction_cancel(tdb
);
630 if (tdb
->file
->map_ptr
) {
631 if (tdb
->flags
& TDB_INTERNAL
)
632 free(tdb
->file
->map_ptr
);
634 tdb_munmap(tdb
->file
);
639 tdb_lock_cleanup(tdb
);
640 if (--tdb
->file
->refcnt
== 0) {
641 ret
= close(tdb
->file
->fd
);
643 /* Remove from files list */
644 for (i
= &files
; *i
; i
= &(*i
)->next
) {
645 if (*i
== tdb
->file
) {
646 *i
= tdb
->file
->next
;
650 free(tdb
->file
->lockrecs
);