2 Trivial Database 2: opening and closing TDBs
3 Copyright (C) Rusty Russell 2010
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 3 of the License, or (at your option) any later version.
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include <ccan/hash/hash.h>
22 /* all lock info, to detect double-opens (fcntl file don't nest!) */
23 static struct tdb_file
*files
= NULL
;
25 static struct tdb_file
*find_file(dev_t device
, ino_t ino
)
29 for (i
= files
; i
; i
= i
->next
) {
30 if (i
->device
== device
&& i
->inode
== ino
) {
38 static bool read_all(int fd
, void *buf
, size_t len
)
42 ret
= read(fd
, buf
, len
);
50 buf
= (char *)buf
+ ret
;
56 static uint64_t random_number(struct tdb_context
*tdb
)
62 fd
= open("/dev/urandom", O_RDONLY
);
64 if (read_all(fd
, &ret
, sizeof(ret
))) {
70 /* FIXME: Untested! Based on Wikipedia protocol description! */
71 fd
= open("/dev/egd-pool", O_RDWR
);
73 /* Command is 1, next byte is size we want to read. */
74 char cmd
[2] = { 1, sizeof(uint64_t) };
75 if (write(fd
, cmd
, sizeof(cmd
)) == sizeof(cmd
)) {
76 char reply
[1 + sizeof(uint64_t)];
77 int r
= read(fd
, reply
, sizeof(reply
));
79 /* Copy at least some bytes. */
80 memcpy(&ret
, reply
+1, r
- 1);
81 if (reply
[0] == sizeof(uint64_t)
82 && r
== sizeof(reply
)) {
91 /* Fallback: pid and time. */
92 gettimeofday(&now
, NULL
);
93 ret
= getpid() * 100132289ULL + now
.tv_sec
* 1000000ULL + now
.tv_usec
;
94 tdb_logerr(tdb
, TDB_SUCCESS
, TDB_LOG_WARNING
,
95 "tdb_open: random from getpid and time");
100 struct tdb_header hdr
;
101 struct tdb_freetable ftable
;
104 /* initialise a new database */
105 static enum TDB_ERROR
tdb_new_database(struct tdb_context
*tdb
,
106 struct tdb_attribute_seed
*seed
,
107 struct tdb_header
*hdr
)
109 /* We make it up in memory, then write it out if not internal */
110 struct new_database newdb
;
111 unsigned int magic_len
;
113 enum TDB_ERROR ecode
;
115 /* Fill in the header */
116 newdb
.hdr
.version
= TDB_VERSION
;
118 newdb
.hdr
.hash_seed
= seed
->seed
;
120 newdb
.hdr
.hash_seed
= random_number(tdb
);
121 newdb
.hdr
.hash_test
= TDB_HASH_MAGIC
;
122 newdb
.hdr
.hash_test
= tdb
->hash_fn(&newdb
.hdr
.hash_test
,
123 sizeof(newdb
.hdr
.hash_test
),
126 newdb
.hdr
.recovery
= 0;
127 newdb
.hdr
.features_used
= newdb
.hdr
.features_offered
= TDB_FEATURE_MASK
;
128 newdb
.hdr
.seqnum
= 0;
129 memset(newdb
.hdr
.reserved
, 0, sizeof(newdb
.hdr
.reserved
));
130 /* Initial hashes are empty. */
131 memset(newdb
.hdr
.hashtable
, 0, sizeof(newdb
.hdr
.hashtable
));
134 newdb
.hdr
.free_table
= offsetof(struct new_database
, ftable
);
135 memset(&newdb
.ftable
, 0, sizeof(newdb
.ftable
));
136 ecode
= set_header(NULL
, &newdb
.ftable
.hdr
, TDB_FTABLE_MAGIC
, 0,
137 sizeof(newdb
.ftable
) - sizeof(newdb
.ftable
.hdr
),
138 sizeof(newdb
.ftable
) - sizeof(newdb
.ftable
.hdr
),
140 if (ecode
!= TDB_SUCCESS
) {
145 memset(newdb
.hdr
.magic_food
, 0, sizeof(newdb
.hdr
.magic_food
));
146 strcpy(newdb
.hdr
.magic_food
, TDB_MAGIC_FOOD
);
148 /* This creates an endian-converted database, as if read from disk */
149 magic_len
= sizeof(newdb
.hdr
.magic_food
);
151 (char *)&newdb
.hdr
+ magic_len
, sizeof(newdb
) - magic_len
);
155 if (tdb
->flags
& TDB_INTERNAL
) {
156 tdb
->file
->map_size
= sizeof(newdb
);
157 tdb
->file
->map_ptr
= malloc(tdb
->file
->map_size
);
158 if (!tdb
->file
->map_ptr
) {
159 return tdb_logerr(tdb
, TDB_ERR_OOM
, TDB_LOG_ERROR
,
161 " failed to allocate");
163 memcpy(tdb
->file
->map_ptr
, &newdb
, tdb
->file
->map_size
);
166 if (lseek(tdb
->file
->fd
, 0, SEEK_SET
) == -1) {
167 return tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
169 " failed to seek: %s", strerror(errno
));
172 if (ftruncate(tdb
->file
->fd
, 0) == -1) {
173 return tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
175 " failed to truncate: %s", strerror(errno
));
178 rlen
= write(tdb
->file
->fd
, &newdb
, sizeof(newdb
));
179 if (rlen
!= sizeof(newdb
)) {
182 return tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
183 "tdb_new_database: %zi writing header: %s",
184 rlen
, strerror(errno
));
189 static enum TDB_ERROR
tdb_new_file(struct tdb_context
*tdb
)
191 tdb
->file
= malloc(sizeof(*tdb
->file
));
193 return tdb_logerr(tdb
, TDB_ERR_OOM
, TDB_LOG_ERROR
,
194 "tdb_open: cannot alloc tdb_file structure");
195 tdb
->file
->num_lockrecs
= 0;
196 tdb
->file
->lockrecs
= NULL
;
197 tdb
->file
->allrecord_lock
.count
= 0;
198 tdb
->file
->refcnt
= 1;
202 enum TDB_ERROR
tdb_set_attribute(struct tdb_context
*tdb
,
203 const union tdb_attribute
*attr
)
205 switch (attr
->base
.attr
) {
206 case TDB_ATTRIBUTE_LOG
:
207 tdb
->log_fn
= attr
->log
.fn
;
208 tdb
->log_data
= attr
->log
.data
;
210 case TDB_ATTRIBUTE_HASH
:
211 case TDB_ATTRIBUTE_SEED
:
212 case TDB_ATTRIBUTE_OPENHOOK
:
213 return tdb
->last_error
214 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
217 " cannot set %s after opening",
218 attr
->base
.attr
== TDB_ATTRIBUTE_HASH
219 ? "TDB_ATTRIBUTE_HASH"
220 : attr
->base
.attr
== TDB_ATTRIBUTE_SEED
221 ? "TDB_ATTRIBUTE_SEED"
222 : "TDB_ATTRIBUTE_OPENHOOK");
223 case TDB_ATTRIBUTE_STATS
:
224 return tdb
->last_error
225 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
228 " cannot set TDB_ATTRIBUTE_STATS");
229 case TDB_ATTRIBUTE_FLOCK
:
230 tdb
->lock_fn
= attr
->flock
.lock
;
231 tdb
->unlock_fn
= attr
->flock
.unlock
;
232 tdb
->lock_data
= attr
->flock
.data
;
235 return tdb
->last_error
236 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
239 " unknown attribute type %u",
245 static uint64_t jenkins_hash(const void *key
, size_t length
, uint64_t seed
,
249 /* hash64_stable assumes lower bits are more important; they are a
250 * slightly better hash. We use the upper bits first, so swap them. */
251 ret
= hash64_stable((const unsigned char *)key
, length
, seed
);
252 return (ret
>> 32) | (ret
<< 32);
255 enum TDB_ERROR
tdb_get_attribute(struct tdb_context
*tdb
,
256 union tdb_attribute
*attr
)
258 switch (attr
->base
.attr
) {
259 case TDB_ATTRIBUTE_LOG
:
261 return tdb
->last_error
= TDB_ERR_NOEXIST
;
262 attr
->log
.fn
= tdb
->log_fn
;
263 attr
->log
.data
= tdb
->log_data
;
265 case TDB_ATTRIBUTE_HASH
:
266 attr
->hash
.fn
= tdb
->hash_fn
;
267 attr
->hash
.data
= tdb
->hash_data
;
269 case TDB_ATTRIBUTE_SEED
:
270 attr
->seed
.seed
= tdb
->hash_seed
;
272 case TDB_ATTRIBUTE_OPENHOOK
:
273 return tdb
->last_error
274 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
277 " cannot get TDB_ATTRIBUTE_OPENHOOK");
278 case TDB_ATTRIBUTE_STATS
: {
279 size_t size
= attr
->stats
.size
;
280 if (size
> tdb
->stats
.size
)
281 size
= tdb
->stats
.size
;
282 memcpy(&attr
->stats
, &tdb
->stats
, size
);
285 case TDB_ATTRIBUTE_FLOCK
:
286 attr
->flock
.lock
= tdb
->lock_fn
;
287 attr
->flock
.unlock
= tdb
->unlock_fn
;
288 attr
->flock
.data
= tdb
->lock_data
;
291 return tdb
->last_error
292 = tdb_logerr(tdb
, TDB_ERR_EINVAL
,
295 " unknown attribute type %u",
298 attr
->base
.next
= NULL
;
302 void tdb_unset_attribute(struct tdb_context
*tdb
,
303 enum tdb_attribute_type type
)
306 case TDB_ATTRIBUTE_LOG
:
309 case TDB_ATTRIBUTE_HASH
:
310 case TDB_ATTRIBUTE_SEED
:
311 case TDB_ATTRIBUTE_OPENHOOK
:
312 tdb_logerr(tdb
, TDB_ERR_EINVAL
, TDB_LOG_USE_ERROR
,
313 "tdb_unset_attribute: cannot unset %s after opening",
314 type
== TDB_ATTRIBUTE_HASH
315 ? "TDB_ATTRIBUTE_HASH"
316 : type
== TDB_ATTRIBUTE_SEED
317 ? "TDB_ATTRIBUTE_SEED"
318 : "TDB_ATTRIBUTE_OPENHOOK");
320 case TDB_ATTRIBUTE_STATS
:
321 tdb_logerr(tdb
, TDB_ERR_EINVAL
,
323 "tdb_unset_attribute:"
324 "cannot unset TDB_ATTRIBUTE_STATS");
326 case TDB_ATTRIBUTE_FLOCK
:
327 tdb
->lock_fn
= tdb_fcntl_lock
;
328 tdb
->unlock_fn
= tdb_fcntl_unlock
;
331 tdb_logerr(tdb
, TDB_ERR_EINVAL
,
333 "tdb_unset_attribute: unknown attribute type %u",
338 struct tdb_context
*tdb_open(const char *name
, int tdb_flags
,
339 int open_flags
, mode_t mode
,
340 union tdb_attribute
*attr
)
342 struct tdb_context
*tdb
;
348 struct tdb_header hdr
;
349 struct tdb_attribute_seed
*seed
= NULL
;
350 struct tdb_attribute_openhook
*openhook
= NULL
;
352 enum TDB_ERROR ecode
;
355 tdb
= malloc(sizeof(*tdb
) + (name
? strlen(name
) + 1 : 0));
361 /* Set name immediately for logging functions. */
363 tdb
->name
= strcpy((char *)(tdb
+ 1), name
);
367 tdb
->direct_access
= 0;
368 tdb
->flags
= tdb_flags
;
370 tdb
->transaction
= NULL
;
372 tdb
->last_error
= TDB_SUCCESS
;
374 tdb
->lock_fn
= tdb_fcntl_lock
;
375 tdb
->unlock_fn
= tdb_fcntl_unlock
;
376 tdb
->hash_fn
= jenkins_hash
;
377 memset(&tdb
->stats
, 0, sizeof(tdb
->stats
));
378 tdb
->stats
.base
.attr
= TDB_ATTRIBUTE_STATS
;
379 tdb
->stats
.size
= sizeof(tdb
->stats
);
383 switch (attr
->base
.attr
) {
384 case TDB_ATTRIBUTE_HASH
:
385 tdb
->hash_fn
= attr
->hash
.fn
;
386 tdb
->hash_data
= attr
->hash
.data
;
388 case TDB_ATTRIBUTE_SEED
:
391 case TDB_ATTRIBUTE_OPENHOOK
:
392 openhook
= &attr
->openhook
;
395 /* These are set as normal. */
396 ecode
= tdb_set_attribute(tdb
, attr
);
397 if (ecode
!= TDB_SUCCESS
)
400 attr
= attr
->base
.next
;
403 if (tdb_flags
& ~(TDB_INTERNAL
| TDB_NOLOCK
| TDB_NOMMAP
| TDB_CONVERT
404 | TDB_NOSYNC
| TDB_SEQNUM
| TDB_ALLOW_NESTING
)) {
405 ecode
= tdb_logerr(tdb
, TDB_ERR_EINVAL
, TDB_LOG_USE_ERROR
,
406 "tdb_open: unknown flags %u", tdb_flags
);
410 if ((open_flags
& O_ACCMODE
) == O_WRONLY
) {
411 ecode
= tdb_logerr(tdb
, TDB_ERR_EINVAL
, TDB_LOG_USE_ERROR
,
412 "tdb_open: can't open tdb %s write-only",
417 if ((open_flags
& O_ACCMODE
) == O_RDONLY
) {
418 tdb
->read_only
= true;
419 tdb
->mmap_flags
= PROT_READ
;
422 tdb
->read_only
= false;
423 tdb
->mmap_flags
= PROT_READ
| PROT_WRITE
;
427 /* internal databases don't need any of the rest. */
428 if (tdb
->flags
& TDB_INTERNAL
) {
429 tdb
->flags
|= (TDB_NOLOCK
| TDB_NOMMAP
);
430 ecode
= tdb_new_file(tdb
);
431 if (ecode
!= TDB_SUCCESS
) {
435 ecode
= tdb_new_database(tdb
, seed
, &hdr
);
436 if (ecode
!= TDB_SUCCESS
) {
439 tdb_convert(tdb
, &hdr
.hash_seed
, sizeof(hdr
.hash_seed
));
440 tdb
->hash_seed
= hdr
.hash_seed
;
441 tdb_ftable_init(tdb
);
445 if (stat(name
, &st
) != -1)
446 tdb
->file
= find_file(st
.st_dev
, st
.st_ino
);
451 if ((fd
= open(name
, open_flags
, mode
)) == -1) {
452 /* errno set by open(2) */
454 tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
455 "tdb_open: could not open file %s: %s",
456 name
, strerror(errno
));
460 /* on exec, don't inherit the fd */
461 v
= fcntl(fd
, F_GETFD
, 0);
462 fcntl(fd
, F_SETFD
, v
| FD_CLOEXEC
);
464 if (fstat(fd
, &st
) == -1) {
466 tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
467 "tdb_open: could not stat open %s: %s",
468 name
, strerror(errno
));
473 ecode
= tdb_new_file(tdb
);
474 if (ecode
!= TDB_SUCCESS
) {
479 tdb
->file
->next
= files
;
481 tdb
->file
->device
= st
.st_dev
;
482 tdb
->file
->inode
= st
.st_ino
;
483 tdb
->file
->map_ptr
= NULL
;
484 tdb
->file
->map_size
= sizeof(struct tdb_header
);
487 /* ensure there is only one process initialising at once */
488 ecode
= tdb_lock_open(tdb
, openlock
, TDB_LOCK_WAIT
|TDB_LOCK_NOCHECK
);
489 if (ecode
!= TDB_SUCCESS
) {
494 /* call their open hook if they gave us one. */
496 ecode
= openhook
->fn(tdb
->file
->fd
, openhook
->data
);
497 if (ecode
!= TDB_SUCCESS
) {
498 tdb_logerr(tdb
, ecode
, TDB_LOG_ERROR
,
499 "tdb_open: open hook failed");
502 open_flags
|= O_CREAT
;
505 /* If they used O_TRUNC, read will return 0. */
506 rlen
= pread(tdb
->file
->fd
, &hdr
, sizeof(hdr
), 0);
507 if (rlen
== 0 && (open_flags
& O_CREAT
)) {
508 ecode
= tdb_new_database(tdb
, seed
, &hdr
);
509 if (ecode
!= TDB_SUCCESS
) {
512 } else if (rlen
< 0) {
513 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
514 "tdb_open: error %s reading %s",
515 strerror(errno
), name
);
517 } else if (rlen
< sizeof(hdr
)
518 || strcmp(hdr
.magic_food
, TDB_MAGIC_FOOD
) != 0) {
519 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
520 "tdb_open: %s is not a tdb file", name
);
524 if (hdr
.version
!= TDB_VERSION
) {
525 if (hdr
.version
== bswap_64(TDB_VERSION
))
526 tdb
->flags
|= TDB_CONVERT
;
529 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
531 " %s is unknown version 0x%llx",
532 name
, (long long)hdr
.version
);
537 tdb_convert(tdb
, &hdr
, sizeof(hdr
));
538 tdb
->hash_seed
= hdr
.hash_seed
;
539 hash_test
= TDB_HASH_MAGIC
;
540 hash_test
= tdb_hash(tdb
, &hash_test
, sizeof(hash_test
));
541 if (hdr
.hash_test
!= hash_test
) {
542 /* wrong hash variant */
543 ecode
= tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
545 " %s uses a different hash function",
550 /* Clear any features we don't understand. */
551 if ((open_flags
& O_ACCMODE
) != O_RDONLY
) {
552 hdr
.features_used
&= TDB_FEATURE_MASK
;
553 if (tdb_write_convert(tdb
, offsetof(struct tdb_header
,
556 sizeof(hdr
.features_used
)) == -1)
560 tdb_unlock_open(tdb
, openlock
);
562 /* This make sure we have current map_size and mmap. */
563 tdb
->methods
->oob(tdb
, tdb
->file
->map_size
+ 1, true);
565 /* Now it's fully formed, recover if necessary. */
566 berr
= tdb_needs_recovery(tdb
);
567 if (unlikely(berr
!= false)) {
572 ecode
= tdb_lock_and_recover(tdb
);
573 if (ecode
!= TDB_SUCCESS
) {
578 ecode
= tdb_ftable_init(tdb
);
579 if (ecode
!= TDB_SUCCESS
) {
583 /* Add to linked list if we're new. */
584 if (tdb
->file
->refcnt
== 1)
589 /* Map ecode to some logical errno. */
591 case TDB_ERR_CORRUPT
:
596 saved_errno
= EWOULDBLOCK
;
599 saved_errno
= ENOMEM
;
602 saved_errno
= EINVAL
;
605 saved_errno
= EINVAL
;
614 tdb_lock_cleanup(tdb
);
615 if (--tdb
->file
->refcnt
== 0) {
616 assert(tdb
->file
->num_lockrecs
== 0);
617 if (tdb
->file
->map_ptr
) {
618 if (tdb
->flags
& TDB_INTERNAL
) {
619 free(tdb
->file
->map_ptr
);
621 tdb_munmap(tdb
->file
);
623 if (close(tdb
->file
->fd
) != 0)
624 tdb_logerr(tdb
, TDB_ERR_IO
, TDB_LOG_ERROR
,
625 "tdb_open: failed to close tdb fd"
626 " on error: %s", strerror(errno
));
627 free(tdb
->file
->lockrecs
);
637 int tdb_close(struct tdb_context
*tdb
)
641 tdb_trace(tdb
, "tdb_close");
643 if (tdb
->transaction
) {
644 tdb_transaction_cancel(tdb
);
647 if (tdb
->file
->map_ptr
) {
648 if (tdb
->flags
& TDB_INTERNAL
)
649 free(tdb
->file
->map_ptr
);
651 tdb_munmap(tdb
->file
);
656 tdb_lock_cleanup(tdb
);
657 if (--tdb
->file
->refcnt
== 0) {
658 ret
= close(tdb
->file
->fd
);
660 /* Remove from files list */
661 for (i
= &files
; *i
; i
= &(*i
)->next
) {
662 if (*i
== tdb
->file
) {
663 *i
= tdb
->file
->next
;
667 free(tdb
->file
->lockrecs
);