2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
9 Copyright (C) Rusty Russell 2010
11 ** NOTE! The following LGPL license applies to the ntdb
12 ** library. This does NOT imply that all of Samba is released
15 This library is free software; you can redistribute it and/or
16 modify it under the terms of the GNU Lesser General Public
17 License as published by the Free Software Foundation; either
18 version 3 of the License, or (at your option) any later version.
20 This library is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 Lesser General Public License for more details.
25 You should have received a copy of the GNU Lesser General Public
26 License along with this library; if not, see <http://www.gnu.org/licenses/>.
29 #include <ccan/likely/likely.h>
/*
 * Release every saved mapping queued on ntdb->file->old_mmaps.
 *
 * Asserts that ntdb->file->direct_count is 0 before touching the list.
 * Each node's map_ptr is released with ntdb->free_fn (under the
 * NTDB_INTERNAL test) or with munmap(), and the node itself is then
 * released with ntdb->free_fn.
 *
 * NOTE(review): this listing omits some lines (braces, and presumably an
 * "else" between the free_fn and munmap calls) - confirm against the
 * complete source.
 */
31 static void free_old_mmaps(struct ntdb_context
*ntdb
)
33 struct ntdb_old_mmap
*i
;
35 assert(ntdb
->file
->direct_count
== 0);
/* Pop nodes off the head of the list until it is empty. */
37 while ((i
= ntdb
->file
->old_mmaps
) != NULL
) {
38 ntdb
->file
->old_mmaps
= i
->next
;
/* NTDB_INTERNAL: the saved region is handed to ntdb->free_fn. */
39 if (ntdb
->flags
& NTDB_INTERNAL
) {
40 ntdb
->free_fn(i
->map_ptr
, ntdb
->alloc_data
);
/* Other visible path: the saved region is released with munmap(). */
42 munmap(i
->map_ptr
, i
->map_size
);
/* The list node itself is released through ntdb->free_fn. */
44 ntdb
->free_fn(i
, ntdb
->alloc_data
);
/*
 * Record the current mapping (ntdb->file->map_ptr / map_size) in a newly
 * allocated struct ntdb_old_mmap and push it on the head of
 * ntdb->file->old_mmaps.
 *
 * Asserts that ntdb->file->direct_count is non-zero.
 * Visible return: the result of ntdb_logerr(..., NTDB_ERR_OOM, ...) with
 * the message "save_old_map alloc failed".
 *
 * NOTE(review): the test guarding that return and the function's final
 * return are not present in this listing - confirm.
 */
48 static enum NTDB_ERROR
save_old_map(struct ntdb_context
*ntdb
)
50 struct ntdb_old_mmap
*old
;
52 assert(ntdb
->file
->direct_count
);
/* The node is allocated with ntdb->file as the first alloc_fn argument. */
54 old
= ntdb
->alloc_fn(ntdb
->file
, sizeof(*old
), ntdb
->alloc_data
);
56 return ntdb_logerr(ntdb
, NTDB_ERR_OOM
, NTDB_LOG_ERROR
,
57 "save_old_map alloc failed");
/* Link the new node in front of the existing list and copy the mapping. */
59 old
->next
= ntdb
->file
->old_mmaps
;
60 old
->map_ptr
= ntdb
->file
->map_ptr
;
61 old
->map_size
= ntdb
->file
->map_size
;
62 ntdb
->file
->old_mmaps
= old
;
/*
 * Drop the current mapping of ntdb->file.
 *
 * Visible steps: a test for fd == -1, a test for a NULL map_ptr, deferral
 * to save_old_map() while direct_count is non-zero, otherwise munmap() of
 * the region followed by resetting map_ptr to NULL.
 *
 * NOTE(review): the bodies of the first two tests and the final return
 * are missing from this listing.
 */
67 enum NTDB_ERROR
ntdb_munmap(struct ntdb_context
*ntdb
)
69 if (ntdb
->file
->fd
== -1) {
73 if (!ntdb
->file
->map_ptr
) {
77 /* We can't unmap now if there are accessors. */
78 if (ntdb
->file
->direct_count
) {
79 return save_old_map(ntdb
);
/* No accessors: unmap and forget the pointer. */
81 munmap(ntdb
->file
->map_ptr
, ntdb
->file
->map_size
);
82 ntdb
->file
->map_ptr
= NULL
;
/*
 * Map ntdb->file->fd with MAP_SHARED from offset 0 for map_size bytes and
 * store the result in ntdb->file->map_ptr.
 *
 * Visible behaviour:
 *  - NTDB_INTERNAL is tested first, and NTDB_NOMMAP is tested when
 *    HAVE_INCOHERENT_MMAP is not defined (the bodies are not shown here).
 *  - mmap_flags is PROT_READ for O_RDONLY opens; PROT_READ | PROT_WRITE is
 *    the other visible assignment.
 *  - mmap() is only attempted when map_size survives a cast to size_t;
 *    MAP_FAILED is the other value assigned to map_ptr.
 *  - On MAP_FAILED, map_ptr is reset to NULL; with HAVE_INCOHERENT_MMAP the
 *    failure is returned as NTDB_ERR_IO, and the other visible call logs it
 *    as an NTDB_LOG_WARNING with NTDB_SUCCESS.
 *
 * NOTE(review): several lines (else branches, returns, the mmap protection
 * argument) are absent from this listing.
 */
87 enum NTDB_ERROR
ntdb_mmap(struct ntdb_context
*ntdb
)
91 if (ntdb
->flags
& NTDB_INTERNAL
)
94 #ifndef HAVE_INCOHERENT_MMAP
95 if (ntdb
->flags
& NTDB_NOMMAP
)
/* Choose the protection from how the file was opened. */
99 if ((ntdb
->open_flags
& O_ACCMODE
) == O_RDONLY
)
100 mmap_flags
= PROT_READ
;
102 mmap_flags
= PROT_READ
| PROT_WRITE
;
104 /* size_t can be smaller than off_t. */
105 if ((size_t)ntdb
->file
->map_size
== ntdb
->file
->map_size
) {
106 ntdb
->file
->map_ptr
= mmap(NULL
, ntdb
->file
->map_size
,
108 MAP_SHARED
, ntdb
->file
->fd
, 0);
110 ntdb
->file
->map_ptr
= MAP_FAILED
;
113 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
115 if (ntdb
->file
->map_ptr
== MAP_FAILED
) {
/* Normalise the failure marker so later code can test map_ptr for NULL. */
116 ntdb
->file
->map_ptr
= NULL
;
117 #ifdef HAVE_INCOHERENT_MMAP
118 /* Incoherent mmap means everyone must mmap! */
119 return ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
120 "ntdb_mmap failed for size %lld (%s)",
121 (long long)ntdb
->file
->map_size
,
/* Warning-level log carrying NTDB_SUCCESS rather than an error code. */
124 ntdb_logerr(ntdb
, NTDB_SUCCESS
, NTDB_LOG_WARNING
,
125 "ntdb_mmap failed for size %lld (%s)",
126 (long long)ntdb
->file
->map_size
, strerror(errno
));
/* Check for an out-of-bounds access: if it is out of bounds, then
   see if the database has been expanded by someone else and, if so,
   expand our view of it to match (unmap, update size, remap).
   Note that "len" is the minimum length needed for the db.
   If probe is true, len being too large isn't a failure. */
// Out-of-bounds handler for file-backed and internal databases.
//
// Visible behaviour:
//  - len + off wrapping around is logged and returned as NTDB_ERR_IO.
//  - For NTDB_INTERNAL databases an NTDB_ERR_IO "beyond internal" message
//    is logged.
//  - Otherwise the expand lock is taken with F_RDLCK, the file is fstat()ed
//    and the lock is dropped again (also on the fstat failure path).
//  - If the file is still shorter than off + len, NTDB_ERR_IO is logged.
//  - Otherwise the file is unmapped, map_size is set to st.st_size and the
//    result of ntdb_mmap() is returned.
//
// NOTE(review): the last parameter, the declaration of st, the probe
// handling and most return statements are missing from this listing.
// NOTE(review): two format strings use %llu while the arguments are cast
// to (long long); the signedness does not match - confirm and align.
139 static enum NTDB_ERROR
ntdb_normal_oob(struct ntdb_context
*ntdb
,
140 ntdb_off_t off
, ntdb_len_t len
,
144 enum NTDB_ERROR ecode
;
// Wrap-around check on the requested range.
146 if (len
+ off
< len
) {
150 return ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
151 "ntdb_oob off %llu len %llu wrap\n",
152 (long long)off
, (long long)len
);
155 if (ntdb
->flags
& NTDB_INTERNAL
) {
159 ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
160 "ntdb_oob len %lld beyond internal"
162 (long long)(off
+ len
),
163 (long long)ntdb
->file
->map_size
);
// The file size is sampled while holding the expand lock for reading.
167 ecode
= ntdb_lock_expand(ntdb
, F_RDLCK
);
168 if (ecode
!= NTDB_SUCCESS
) {
172 if (fstat(ntdb
->file
->fd
, &st
) != 0) {
173 ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
174 "Failed to fstat file: %s", strerror(errno
));
175 ntdb_unlock_expand(ntdb
, F_RDLCK
);
179 ntdb_unlock_expand(ntdb
, F_RDLCK
);
// Still too short after re-checking the real file size.
181 if (st
.st_size
< off
+ len
) {
185 ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
186 "ntdb_oob len %llu beyond eof at %llu",
187 (long long)(off
+ len
), (long long)st
.st_size
);
191 /* Unmap, update size, remap */
192 ecode
= ntdb_munmap(ntdb
);
197 ntdb
->file
->map_size
= st
.st_size
;
198 return ntdb_mmap(ntdb
);
201 /* Endian conversion: we only ever deal with 8 byte quantities */
/*
 * Byte-swap buf in place, treating it as size / 8 uint64_t values, when
 * NTDB_CONVERT is set in ntdb->flags and buf is non-NULL.
 * size must be a multiple of 8 (asserted).
 *
 * NOTE(review): the return statement is not present in this listing.
 */
202 void *ntdb_convert(const struct ntdb_context
*ntdb
, void *buf
, ntdb_len_t size
)
204 assert(size
% 8 == 0);
205 if (unlikely((ntdb
->flags
& NTDB_CONVERT
)) && buf
) {
206 uint64_t i
, *p
= (uint64_t *)buf
;
/* Swap each 64-bit word with bswap_64(). */
207 for (i
= 0; i
< size
/ 8; i
++)
208 p
[i
] = bswap_64(p
[i
]);
213 /* Return first non-zero offset in offset array, or end, or -ve error. */
214 /* FIXME: Return the off? */
/*
 * Scans the ntdb_off_t entries [start, end) of the array located at file
 * offset "base". The entries are obtained with ntdb_access_read()
 * (convert == false) and released with ntdb_access_release().
 * An access error is returned as NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val)).
 *
 * NOTE(review): the loop body and the return statement are missing from
 * this listing.
 */
215 uint64_t ntdb_find_nonzero_off(struct ntdb_context
*ntdb
,
216 ntdb_off_t base
, uint64_t start
, uint64_t end
)
221 /* Zero vs non-zero is the same unconverted: minor optimization. */
222 val
= ntdb_access_read(ntdb
, base
+ start
* sizeof(ntdb_off_t
),
223 (end
- start
) * sizeof(ntdb_off_t
), false);
224 if (NTDB_PTR_IS_ERR(val
)) {
225 return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val
));
/* i is relative to start, not an absolute index. */
228 for (i
= 0; i
< (end
- start
); i
++) {
232 ntdb_access_release(ntdb
, val
);
236 /* Return first zero offset in num offset array, or num, or -ve error. */
/*
 * Scans num ntdb_off_t entries starting at file offset "off". The entries
 * are obtained with ntdb_access_read() (convert == false) and released
 * with ntdb_access_release().
 * An access error is returned as NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val)).
 *
 * NOTE(review): the rest of the parameter list, the loop body and the
 * return statement are missing from this listing.
 */
237 uint64_t ntdb_find_zero_off(struct ntdb_context
*ntdb
, ntdb_off_t off
,
243 /* Zero vs non-zero is the same unconverted: minor optimization. */
244 val
= ntdb_access_read(ntdb
, off
, num
* sizeof(ntdb_off_t
), false);
245 if (NTDB_PTR_IS_ERR(val
)) {
246 return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(val
));
249 for (i
= 0; i
< num
; i
++) {
253 ntdb_access_release(ntdb
, val
);
/*
 * Zero len bytes of the database starting at off.
 *
 * Visible behaviour: a direct pointer is requested from ntdb->io->direct
 * in write mode; an error pointer is returned as its error code. The other
 * visible path writes from a zero-filled 8192-byte stack buffer through
 * ntdb->io->twrite, at most sizeof(buf) bytes per call.
 * Asserts that the database is not NTDB_RDONLY.
 *
 * NOTE(review): the use of p on success, the loop around the twrite call
 * and the final return are not present in this listing.
 */
257 enum NTDB_ERROR
zero_out(struct ntdb_context
*ntdb
, ntdb_off_t off
, ntdb_len_t len
)
259 char buf
[8192] = { 0 };
260 void *p
= ntdb
->io
->direct(ntdb
, off
, len
, true);
261 enum NTDB_ERROR ecode
= NTDB_SUCCESS
;
263 assert(!(ntdb
->flags
& NTDB_RDONLY
));
264 if (NTDB_PTR_IS_ERR(p
)) {
265 return NTDB_PTR_ERR(p
);
/* Chunk size: the smaller of the remaining length and the buffer size. */
272 unsigned todo
= len
< sizeof(buf
) ? len
: sizeof(buf
);
273 ecode
= ntdb
->io
->twrite(ntdb
, off
, buf
, todo
);
274 if (ecode
!= NTDB_SUCCESS
) {
283 /* write a lump of data at a specified offset */
/*
 * Visible behaviour:
 *  - NTDB_RDONLY databases are rejected with NTDB_ERR_RDONLY.
 *  - The range is validated with ntdb_oob(ntdb, off, len, false).
 *  - With a mapping present, the data is memcpy()ed to map_ptr + off.
 *  - The other visible path uses pwrite() on ntdb->file->fd and reports
 *    NTDB_ERR_IO through ntdb_logerr().
 *
 * NOTE(review): the declaration of ret, the checks on it, the contents of
 * the HAVE_INCOHERENT_MMAP section and the final return are missing from
 * this listing.
 */
284 static enum NTDB_ERROR
ntdb_write(struct ntdb_context
*ntdb
, ntdb_off_t off
,
285 const void *buf
, ntdb_len_t len
)
287 enum NTDB_ERROR ecode
;
289 if (ntdb
->flags
& NTDB_RDONLY
) {
290 return ntdb_logerr(ntdb
, NTDB_ERR_RDONLY
, NTDB_LOG_USE_ERROR
,
291 "Write to read-only database");
294 ecode
= ntdb_oob(ntdb
, off
, len
, false);
295 if (ecode
!= NTDB_SUCCESS
) {
/* Mapped: copy straight into the mapping. */
299 if (ntdb
->file
->map_ptr
) {
300 memcpy(off
+ (char *)ntdb
->file
->map_ptr
, buf
, len
);
302 #ifdef HAVE_INCOHERENT_MMAP
/* File-descriptor write at the same offset. */
306 ret
= pwrite(ntdb
->file
->fd
, buf
, len
, off
);
308 /* This shouldn't happen: we avoid sparse files. */
312 return ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
313 "ntdb_write: %zi at %zu len=%zu (%s)",
314 ret
, (size_t)off
, (size_t)len
,
322 /* read a lump of data at a specified offset */
/*
 * Visible behaviour:
 *  - The range is validated with ntdb_oob(ntdb, off, len, false).
 *  - With a mapping present, len bytes are memcpy()ed from map_ptr + off
 *    into buf.
 *  - The other visible path uses pread() on ntdb->file->fd and reports
 *    NTDB_ERR_IO through ntdb_logerr(), including the current map_size.
 *
 * NOTE(review): the check on the pread() result, the contents of the
 * HAVE_INCOHERENT_MMAP section and the final return are missing from this
 * listing.
 */
323 static enum NTDB_ERROR
ntdb_read(struct ntdb_context
*ntdb
, ntdb_off_t off
,
324 void *buf
, ntdb_len_t len
)
326 enum NTDB_ERROR ecode
;
328 ecode
= ntdb_oob(ntdb
, off
, len
, false);
329 if (ecode
!= NTDB_SUCCESS
) {
/* Mapped: copy straight out of the mapping. */
333 if (ntdb
->file
->map_ptr
) {
334 memcpy(buf
, off
+ (char *)ntdb
->file
->map_ptr
, len
);
336 #ifdef HAVE_INCOHERENT_MMAP
/* File-descriptor read at the same offset. */
339 ssize_t r
= pread(ntdb
->file
->fd
, buf
, len
, off
);
341 return ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
342 "ntdb_read failed with %zi at %zu "
343 "len=%zu (%s) map_size=%zu",
344 r
, (size_t)off
, (size_t)len
,
346 (size_t)ntdb
->file
->map_size
);
/*
 * Write len bytes from rec at off through ntdb->io->twrite.
 *
 * When NTDB_CONVERT is set, rec is first copied into a temporary buffer
 * obtained from ntdb->alloc_fn, the copy is passed through ntdb_convert()
 * and written, and the buffer is released with ntdb->free_fn. The other
 * visible path writes rec unchanged.
 * Allocation failure is reported as NTDB_ERR_OOM via ntdb_logerr().
 *
 * NOTE(review): the allocation-failure test, the else keyword and the
 * final return are missing from this listing.
 */
353 enum NTDB_ERROR
ntdb_write_convert(struct ntdb_context
*ntdb
, ntdb_off_t off
,
354 const void *rec
, size_t len
)
356 enum NTDB_ERROR ecode
;
358 if (unlikely((ntdb
->flags
& NTDB_CONVERT
))) {
359 void *conv
= ntdb
->alloc_fn(ntdb
, len
, ntdb
->alloc_data
);
361 return ntdb_logerr(ntdb
, NTDB_ERR_OOM
, NTDB_LOG_ERROR
,
362 "ntdb_write: no memory converting"
/* rec is const, so the conversion is applied to a private copy. */
365 memcpy(conv
, rec
, len
);
366 ecode
= ntdb
->io
->twrite(ntdb
, off
,
367 ntdb_convert(ntdb
, conv
, len
), len
);
368 ntdb
->free_fn(conv
, ntdb
->alloc_data
);
/* Write of the caller's buffer as-is. */
370 ecode
= ntdb
->io
->twrite(ntdb
, off
, rec
, len
);
/*
 * Read len bytes at off into rec through ntdb->io->tread, then pass rec
 * through ntdb_convert(). In the visible lines the conversion call follows
 * the read without a check on ecode.
 *
 * NOTE(review): the return statement is missing from this listing.
 */
375 enum NTDB_ERROR
ntdb_read_convert(struct ntdb_context
*ntdb
, ntdb_off_t off
,
376 void *rec
, size_t len
)
378 enum NTDB_ERROR ecode
= ntdb
->io
->tread(ntdb
, off
, rec
, len
);
379 ntdb_convert(ntdb
, rec
, len
);
/*
 * Allocate a buffer of prefix + len bytes (1 byte when that sum is zero)
 * with ntdb->alloc_fn and read len bytes from file offset "offset" into
 * it, starting prefix bytes into the buffer.
 *
 * Visible error paths: NTDB_ERR_PTR(NTDB_ERR_OOM) after logging an
 * allocation failure; on a tread failure the buffer is released with
 * ntdb->free_fn and NTDB_ERR_PTR(ecode) is returned.
 *
 * NOTE(review): the declaration of buf, the allocation-failure test and
 * the success return are missing from this listing.
 */
383 static void *_ntdb_alloc_read(struct ntdb_context
*ntdb
, ntdb_off_t offset
,
384 ntdb_len_t len
, unsigned int prefix
)
387 enum NTDB_ERROR ecode
;
389 /* some systems don't like zero length malloc */
390 buf
= ntdb
->alloc_fn(ntdb
, prefix
+ len
? prefix
+ len
: 1,
393 ntdb_logerr(ntdb
, NTDB_ERR_OOM
, NTDB_LOG_ERROR
,
394 "ntdb_alloc_read alloc failed len=%zu",
395 (size_t)(prefix
+ len
));
396 return NTDB_ERR_PTR(NTDB_ERR_OOM
);
/* The first prefix bytes are left for the caller; data lands after them. */
398 ecode
= ntdb
->io
->tread(ntdb
, offset
, buf
+prefix
, len
);
399 if (unlikely(ecode
!= NTDB_SUCCESS
)) {
400 ntdb
->free_fn(buf
, ntdb
->alloc_data
);
401 return NTDB_ERR_PTR(ecode
);
407 /* read a lump of data, allocating the space for it */
/* Forwards to _ntdb_alloc_read() with a prefix of 0 and returns its result. */
408 void *ntdb_alloc_read(struct ntdb_context
*ntdb
, ntdb_off_t offset
, ntdb_len_t len
)
410 return _ntdb_alloc_read(ntdb
, offset
, len
, 0);
/*
 * Write the contents of buf (size bytes long) to ntdb->file->fd with
 * pwrite(), starting at off, to cover len bytes. Each visible pwrite()
 * call writes the smaller of len and size.
 * A failure is reported as NTDB_ERR_IO through ntdb_logerr().
 *
 * NOTE(review): the loop, the checks on ret, part of the log message and
 * the final return are missing from this listing.
 */
413 static enum NTDB_ERROR
fill(struct ntdb_context
*ntdb
,
414 const void *buf
, size_t size
,
415 ntdb_off_t off
, ntdb_len_t len
)
/* Never write more than the pattern buffer holds. */
418 size_t n
= len
> size
? size
: len
;
419 ssize_t ret
= pwrite(ntdb
->file
->fd
, buf
, n
, off
);
424 return ntdb_logerr(ntdb
, NTDB_ERR_IO
, NTDB_LOG_ERROR
,
426 " %zi at %zu len=%zu (%s)",
427 ret
, (size_t)off
, (size_t)len
,
436 /* expand a file. we prefer to use ftruncate, as that is what posix
437 says to use for mmap expansion */
/*
 * Grow the database by "addition" bytes.
 *
 * Visible behaviour:
 *  - Asserts that map_size + addition is a multiple of NTDB_PGSIZE.
 *  - NTDB_RDONLY databases are rejected with NTDB_ERR_RDONLY.
 *  - NTDB_INTERNAL: while direct accessors exist, the old region is saved
 *    with save_old_map() and a new region is obtained from ntdb->alloc_fn
 *    and filled with memcpy(); the other visible call resizes through
 *    ntdb->expand_fn. Failure is logged as NTDB_ERR_OOM. map_ptr and
 *    map_size are then updated.
 *  - Otherwise: the file is unmapped, ftruncate()d to the new size, the
 *    new range is written by fill() from a buffer memset to 0x43,
 *    map_size is increased and the result of ntdb_mmap() is returned.
 *
 * NOTE(review): the "addition" parameter declaration, the declarations of
 * new and buf, several tests, else branches and returns are missing from
 * this listing.
 */
438 static enum NTDB_ERROR
ntdb_expand_file(struct ntdb_context
*ntdb
,
442 enum NTDB_ERROR ecode
;
444 assert((ntdb
->file
->map_size
+ addition
) % NTDB_PGSIZE
== 0);
445 if (ntdb
->flags
& NTDB_RDONLY
) {
446 return ntdb_logerr(ntdb
, NTDB_ERR_RDONLY
, NTDB_LOG_USE_ERROR
,
447 "Expand on read-only database");
450 if (ntdb
->flags
& NTDB_INTERNAL
) {
453 /* Can't free it if we have direct accesses. */
454 if (ntdb
->file
->direct_count
) {
455 ecode
= save_old_map(ntdb
);
/* Fresh region of the new size; the old contents are copied across. */
459 new = ntdb
->alloc_fn(ntdb
->file
,
460 ntdb
->file
->map_size
+ addition
,
463 memcpy(new, ntdb
->file
->map_ptr
,
464 ntdb
->file
->map_size
);
/* Resize of the existing region through the expand callback. */
467 new = ntdb
->expand_fn(ntdb
->file
->map_ptr
,
468 ntdb
->file
->map_size
+ addition
,
472 return ntdb_logerr(ntdb
, NTDB_ERR_OOM
, NTDB_LOG_ERROR
,
473 "No memory to expand database");
475 ntdb
->file
->map_ptr
= new;
476 ntdb
->file
->map_size
+= addition
;
479 /* Unmap before trying to write; old NTDB claimed OpenBSD had
480 * problem with this otherwise. */
481 ecode
= ntdb_munmap(ntdb
);
486 /* If this fails, we try to fill anyway. */
487 if (ftruncate(ntdb
->file
->fd
, ntdb
->file
->map_size
+ addition
))
490 /* now fill the file with something. This ensures that the
491 file isn't sparse, which would be very bad if we ran out of
492 disk. This must be done with write, not via mmap */
493 memset(buf
, 0x43, sizeof(buf
));
494 ecode
= fill(ntdb
, buf
, sizeof(buf
), ntdb
->file
->map_size
,
496 if (ecode
!= NTDB_SUCCESS
)
/* Only account for the new space once it has been written. */
498 ntdb
->file
->map_size
+= addition
;
499 return ntdb_mmap(ntdb
);
/*
 * Obtain read access to len bytes at off.
 *
 * Visible behaviour:
 *  - Without NTDB_CONVERT, a direct pointer is requested from
 *    ntdb->io->direct (write_mode == false).
 *  - Another path reads the data with _ntdb_alloc_read(), reserving
 *    sizeof(struct ntdb_access_hdr) bytes in front of it, and links the
 *    header onto ntdb->access.
 *  - ntdb_convert() is applied to the returned data on one path, and
 *    ntdb->file->direct_count is incremented on another.
 *
 * NOTE(review): the declaration of ret, the conditions selecting between
 * these paths, the remaining header fields and the return statements are
 * missing from this listing.
 */
503 const void *ntdb_access_read(struct ntdb_context
*ntdb
,
504 ntdb_off_t off
, ntdb_len_t len
, bool convert
)
508 if (likely(!(ntdb
->flags
& NTDB_CONVERT
))) {
509 ret
= ntdb
->io
->direct(ntdb
, off
, len
, false);
511 if (NTDB_PTR_IS_ERR(ret
)) {
516 struct ntdb_access_hdr
*hdr
;
/* The header is allocated as a prefix of the data buffer. */
517 hdr
= _ntdb_alloc_read(ntdb
, off
, len
, sizeof(*hdr
));
518 if (NTDB_PTR_IS_ERR(hdr
)) {
521 hdr
->next
= ntdb
->access
;
525 ntdb_convert(ntdb
, (void *)ret
, len
);
/* Count of outstanding direct accesses on this file. */
528 ntdb
->file
->direct_count
++;
/*
 * Obtain write access to len bytes at off.
 *
 * Visible behaviour:
 *  - NTDB_RDONLY databases are rejected: the error is logged and
 *    NTDB_ERR_PTR(NTDB_ERR_RDONLY) is returned.
 *  - Without NTDB_CONVERT, a direct pointer is requested from
 *    ntdb->io->direct (write_mode == true).
 *  - Another path reads the data with _ntdb_alloc_read(), reserving
 *    sizeof(struct ntdb_access_hdr) bytes in front of it, links the header
 *    onto ntdb->access and records the convert flag in it.
 *  - ntdb_convert() is applied to the returned data on one path, and
 *    ntdb->file->direct_count is incremented on another.
 *
 * NOTE(review): the declaration of ret, the conditions selecting between
 * these paths, the remaining header fields and the return statements are
 * missing from this listing.
 */
534 void *ntdb_access_write(struct ntdb_context
*ntdb
,
535 ntdb_off_t off
, ntdb_len_t len
, bool convert
)
539 if (ntdb
->flags
& NTDB_RDONLY
) {
540 ntdb_logerr(ntdb
, NTDB_ERR_RDONLY
, NTDB_LOG_USE_ERROR
,
541 "Write to read-only database");
542 return NTDB_ERR_PTR(NTDB_ERR_RDONLY
);
545 if (likely(!(ntdb
->flags
& NTDB_CONVERT
))) {
546 ret
= ntdb
->io
->direct(ntdb
, off
, len
, true);
548 if (NTDB_PTR_IS_ERR(ret
)) {
554 struct ntdb_access_hdr
*hdr
;
/* The header is allocated as a prefix of the data buffer. */
555 hdr
= _ntdb_alloc_read(ntdb
, off
, len
, sizeof(*hdr
));
556 if (NTDB_PTR_IS_ERR(hdr
)) {
559 hdr
->next
= ntdb
->access
;
/* Stored in the header alongside the data. */
563 hdr
->convert
= convert
;
566 ntdb_convert(ntdb
, (void *)ret
, len
);
/* Count of outstanding direct accesses on this file. */
568 ntdb
->file
->direct_count
++;
/*
 * Walk the ntdb->access list by pointer-to-pointer (hp starts at
 * &ntdb->access and advances through each node's next field), looking for
 * the header that belongs to p.
 *
 * NOTE(review): the match test and the return statements are missing from
 * this listing.
 */
573 static struct ntdb_access_hdr
**find_hdr(struct ntdb_context
*ntdb
, const void *p
)
575 struct ntdb_access_hdr
**hp
;
577 for (hp
= &ntdb
->access
; *hp
; hp
= &(*hp
)->next
) {
/*
 * Release an access pointer p.
 *
 * Visible behaviour: the header for p is looked up with find_hdr(); a
 * header is released with ntdb->free_fn; on another path direct_count is
 * decremented and, when it reaches 0, free_old_mmaps() is called.
 *
 * NOTE(review): the test on hp, the unlinking of the header and the
 * else/brace lines are missing from this listing.
 */
584 void ntdb_access_release(struct ntdb_context
*ntdb
, const void *p
)
586 struct ntdb_access_hdr
*hdr
, **hp
= find_hdr(ntdb
, p
);
591 ntdb
->free_fn(hdr
, ntdb
->alloc_data
);
/* Last direct accessor gone: saved mappings can now be released. */
593 if (--ntdb
->file
->direct_count
== 0) {
594 free_old_mmaps(ntdb
);
/*
 * Commit and release a write-access pointer p.
 *
 * Visible behaviour: the header for p is looked up with find_hdr(); the
 * data is written back at hdr->off for hdr->len bytes with either
 * ntdb_write_convert() or ntdb_write(), and the header is released with
 * ntdb->free_fn. On another path direct_count is decremented (calling
 * free_old_mmaps() when it reaches 0) and ecode is set to NTDB_SUCCESS.
 *
 * NOTE(review): the test on hp, the condition choosing between the two
 * write calls, the unlinking of the header and the return statement are
 * missing from this listing.
 */
599 enum NTDB_ERROR
ntdb_access_commit(struct ntdb_context
*ntdb
, void *p
)
601 struct ntdb_access_hdr
*hdr
, **hp
= find_hdr(ntdb
, p
);
602 enum NTDB_ERROR ecode
;
607 ecode
= ntdb_write_convert(ntdb
, hdr
->off
, p
, hdr
->len
);
609 ecode
= ntdb_write(ntdb
, hdr
->off
, p
, hdr
->len
);
611 ntdb
->free_fn(hdr
, ntdb
->alloc_data
);
/* Last direct accessor gone: saved mappings can now be released. */
613 if (--ntdb
->file
->direct_count
== 0) {
614 free_old_mmaps(ntdb
);
616 ecode
= NTDB_SUCCESS
;
/*
 * Return a pointer into the mapping for the range [off, off + len).
 *
 * Visible behaviour: a NULL map_ptr is tested first (its body is not shown
 * here); the range is then validated with ntdb_oob(ntdb, off, len, false),
 * a failure being returned as NTDB_ERR_PTR(ecode); otherwise
 * map_ptr + off is returned.
 *
 * NOTE(review): the last parameter and the body of the map_ptr test are
 * missing from this listing.
 */
622 static void *ntdb_direct(struct ntdb_context
*ntdb
, ntdb_off_t off
, size_t len
,
625 enum NTDB_ERROR ecode
;
627 if (unlikely(!ntdb
->file
->map_ptr
))
630 ecode
= ntdb_oob(ntdb
, off
, len
, false);
631 if (unlikely(ecode
!= NTDB_SUCCESS
))
632 return NTDB_ERR_PTR(ecode
);
633 return (char *)ntdb
->file
->map_ptr
+ off
;
/*
 * Read one offset value at off without conversion.
 *
 * Visible behaviour: a direct pointer of sizeof(*p) bytes is requested
 * with ntdb_direct(); an error pointer is returned as
 * NTDB_ERR_TO_OFF(NTDB_PTR_ERR(p)). The other visible path reads into ret
 * with ntdb_read() and returns NTDB_ERR_TO_OFF(ecode) on failure.
 *
 * NOTE(review): the rest of the parameter list, the declarations of p and
 * ret, the use of p on success and the final return are missing from this
 * listing.
 */
636 static ntdb_off_t
ntdb_read_normal_off(struct ntdb_context
*ntdb
,
640 enum NTDB_ERROR ecode
;
643 p
= ntdb_direct(ntdb
, off
, sizeof(*p
), false);
644 if (NTDB_PTR_IS_ERR(p
)) {
645 return NTDB_ERR_TO_OFF(NTDB_PTR_ERR(p
));
/* Copying read through ntdb_read(). */
651 ecode
= ntdb_read(ntdb
, off
, &ret
, sizeof(ret
));
652 if (ecode
!= NTDB_SUCCESS
) {
653 return NTDB_ERR_TO_OFF(ecode
);
/*
 * Read one offset value at off with conversion: the value is read into ret
 * with ntdb_read_convert(), and a failure is returned as
 * NTDB_ERR_TO_OFF(ecode).
 *
 * NOTE(review): the rest of the parameter list, the declaration of ret and
 * the success return are missing from this listing.
 */
658 static ntdb_off_t
ntdb_read_convert_off(struct ntdb_context
*ntdb
,
662 enum NTDB_ERROR ecode
;
664 ecode
= ntdb_read_convert(ntdb
, off
, &ret
, sizeof(ret
));
665 if (ecode
!= NTDB_SUCCESS
) {
666 return NTDB_ERR_TO_OFF(ecode
);
/*
 * Write the offset value val at off without conversion.
 *
 * Visible behaviour: a direct pointer of sizeof(*p) bytes is requested
 * with ntdb_direct() in write mode; an error pointer is returned as
 * NTDB_PTR_ERR(p). The other visible path returns the result of
 * ntdb_write() of val.
 *
 * NOTE(review): the declaration of p and its use on success are missing
 * from this listing.
 */
671 static enum NTDB_ERROR
ntdb_write_normal_off(struct ntdb_context
*ntdb
,
672 ntdb_off_t off
, ntdb_off_t val
)
676 p
= ntdb_direct(ntdb
, off
, sizeof(*p
), true);
677 if (NTDB_PTR_IS_ERR(p
)) {
678 return NTDB_PTR_ERR(p
);
/* Copying write through ntdb_write(). */
684 return ntdb_write(ntdb
, off
, &val
, sizeof(val
));
/*
 * Write the offset value val at off with conversion: forwards to
 * ntdb_write_convert() with sizeof(val) bytes and returns its result.
 */
687 static enum NTDB_ERROR
ntdb_write_convert_off(struct ntdb_context
*ntdb
,
688 ntdb_off_t off
, ntdb_off_t val
)
690 return ntdb_write_convert(ntdb
, off
, &val
, sizeof(val
));
/*
 * Advance the seqnum field of struct ntdb_header.
 *
 * Visible behaviour:
 *  - Without NTDB_CONVERT, a direct pointer to the field is requested from
 *    ntdb->io->direct in write mode and tested with likely(direct).
 *  - The other visible path reads the field with ntdb_read_off(), and when
 *    that is not an error, checks whether (int64_t)seq is negative and
 *    stores seq back with ntdb_write_off().
 *
 * NOTE(review): the declarations of direct and seq and the statements that
 * modify them are missing from this listing.
 * NOTE(review): in the direct path the "< 0" comparison is written outside
 * the unlikely(...) parentheses, so it compares the macro's result rather
 * than (*direct) + 1. If unlikely() yields 0 or 1 the test can never be
 * true - confirm, and move the closing parenthesis after "< 0".
 * NOTE(review): direct is only tested for non-NULL here, not with
 * NTDB_PTR_IS_ERR() - confirm that io->direct cannot return an error
 * pointer for this range.
 */
693 void ntdb_inc_seqnum(struct ntdb_context
*ntdb
)
697 if (likely(!(ntdb
->flags
& NTDB_CONVERT
))) {
700 direct
= ntdb
->io
->direct(ntdb
,
701 offsetof(struct ntdb_header
, seqnum
),
702 sizeof(*direct
), true);
703 if (likely(direct
)) {
704 /* Don't let it go negative, even briefly */
705 if (unlikely((*direct
) + 1) < 0)
/* Read, adjust and write back through the offset helpers. */
712 seq
= ntdb_read_off(ntdb
, offsetof(struct ntdb_header
, seqnum
));
713 if (!NTDB_OFF_IS_ERR(seq
)) {
715 if (unlikely((int64_t)seq
< 0))
717 ntdb_write_off(ntdb
, offsetof(struct ntdb_header
, seqnum
), seq
);
/*
 * Method table selected by ntdb_io_init() when NTDB_CONVERT is not set.
 * The visible entries are the non-converting offset read/write helpers.
 * NOTE(review): the other initializer entries and the closing brace are
 * missing from this listing.
 */
721 static const struct ntdb_methods io_methods
= {
727 ntdb_read_normal_off
,
728 ntdb_write_normal_off
,
/*
 * Method table selected by ntdb_io_init() when NTDB_CONVERT is set.
 * The visible entries are the converting offset read/write helpers.
 * NOTE(review): the other initializer entries and the closing brace are
 * missing from this listing.
 */
731 static const struct ntdb_methods io_convert_methods
= {
737 ntdb_read_convert_off
,
738 ntdb_write_convert_off
,
/* Initialise the default methods table. */
/*
 * Point ntdb->io at one of the two static method tables:
 * io_convert_methods under the NTDB_CONVERT test, io_methods in the other
 * visible assignment.
 * NOTE(review): the "else" between the two assignments is presumably on a
 * line missing from this listing - confirm.
 */
744 void ntdb_io_init(struct ntdb_context
*ntdb
)
746 if (ntdb
->flags
& NTDB_CONVERT
)
747 ntdb
->io
= &io_convert_methods
;
749 ntdb
->io
= &io_methods
;