2 Unix SMB/CIFS implementation.
3 Database interface wrapper around ctdbd
4 Copyright (C) Volker Lendecke 2007-2009
5 Copyright (C) Michael Adam 2009
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "system/filesys.h"
23 #include "lib/tdb_wrap/tdb_wrap.h"
25 #include "dbwrap/dbwrap.h"
26 #include "dbwrap/dbwrap_ctdb.h"
27 #include "dbwrap/dbwrap_rbt.h"
28 #include "lib/param/param.h"
30 #ifdef CLUSTER_SUPPORT
33 * It is not possible to include ctdb.h and tdb_compat.h (included via
34 * some other include above) without warnings. This fixes those
42 #ifdef typesafe_cb_preargs
43 #undef typesafe_cb_preargs
46 #ifdef typesafe_cb_postargs
47 #undef typesafe_cb_postargs
51 #include "ctdb_private.h"
52 #include "ctdbd_conn.h"
53 #include "dbwrap/dbwrap.h"
54 #include "dbwrap/dbwrap_private.h"
55 #include "dbwrap/dbwrap_ctdb.h"
59 struct db_ctdb_transaction_handle
{
60 struct db_ctdb_ctx
*ctx
;
62 * we store the writes done under a transaction:
64 struct ctdb_marshall_buffer
*m_write
;
71 struct db_context
*db
;
72 struct tdb_wrap
*wtdb
;
74 struct db_ctdb_transaction_handle
*transaction
;
75 struct g_lock_ctx
*lock_ctx
;
79 struct db_ctdb_ctx
*ctdb_ctx
;
80 struct ctdb_ltdb_header header
;
81 struct timeval lock_time
;
84 static NTSTATUS
tdb_error_to_ntstatus(struct tdb_context
*tdb
)
86 enum TDB_ERROR tret
= tdb_error(tdb
);
88 return map_nt_error_from_tdb(tret
);
91 struct db_ctdb_ltdb_parse_state
{
92 void (*parser
)(TDB_DATA key
, struct ctdb_ltdb_header
*header
,
93 TDB_DATA data
, void *private_data
);
97 static int db_ctdb_ltdb_parser(TDB_DATA key
, TDB_DATA data
,
100 struct db_ctdb_ltdb_parse_state
*state
=
101 (struct db_ctdb_ltdb_parse_state
*)private_data
;
103 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
107 key
, (struct ctdb_ltdb_header
*)data
.dptr
,
108 make_tdb_data(data
.dptr
+ sizeof(struct ctdb_ltdb_header
),
109 data
.dsize
- sizeof(struct ctdb_ltdb_header
)),
110 state
->private_data
);
114 static NTSTATUS
db_ctdb_ltdb_parse(
115 struct db_ctdb_ctx
*db
, TDB_DATA key
,
116 void (*parser
)(TDB_DATA key
, struct ctdb_ltdb_header
*header
,
117 TDB_DATA data
, void *private_data
),
120 struct db_ctdb_ltdb_parse_state state
;
123 state
.parser
= parser
;
124 state
.private_data
= private_data
;
126 ret
= tdb_parse_record(db
->wtdb
->tdb
, key
, db_ctdb_ltdb_parser
,
129 return NT_STATUS_NOT_FOUND
;
135 * Store a record together with the ctdb record header
136 * in the local copy of the database.
138 static NTSTATUS
db_ctdb_ltdb_store(struct db_ctdb_ctx
*db
,
140 struct ctdb_ltdb_header
*header
,
143 TALLOC_CTX
*tmp_ctx
= talloc_stackframe();
147 rec
.dsize
= data
.dsize
+ sizeof(struct ctdb_ltdb_header
);
148 rec
.dptr
= (uint8_t *)talloc_size(tmp_ctx
, rec
.dsize
);
150 if (rec
.dptr
== NULL
) {
151 talloc_free(tmp_ctx
);
152 return NT_STATUS_NO_MEMORY
;
155 memcpy(rec
.dptr
, header
, sizeof(struct ctdb_ltdb_header
));
156 memcpy(sizeof(struct ctdb_ltdb_header
) + (uint8_t *)rec
.dptr
, data
.dptr
, data
.dsize
);
158 ret
= tdb_store(db
->wtdb
->tdb
, key
, rec
, TDB_REPLACE
);
160 talloc_free(tmp_ctx
);
162 return (ret
== 0) ? NT_STATUS_OK
163 : tdb_error_to_ntstatus(db
->wtdb
->tdb
);
168 form a ctdb_rec_data record from a key/data pair
170 static struct ctdb_rec_data
*db_ctdb_marshall_record(TALLOC_CTX
*mem_ctx
, uint32_t reqid
,
172 struct ctdb_ltdb_header
*header
,
176 struct ctdb_rec_data
*d
;
178 length
= offsetof(struct ctdb_rec_data
, data
) + key
.dsize
+
179 data
.dsize
+ sizeof(*header
);
180 d
= (struct ctdb_rec_data
*)talloc_size(mem_ctx
, length
);
186 d
->keylen
= key
.dsize
;
187 memcpy(&d
->data
[0], key
.dptr
, key
.dsize
);
189 d
->datalen
= data
.dsize
+ sizeof(*header
);
190 memcpy(&d
->data
[key
.dsize
], header
, sizeof(*header
));
191 memcpy(&d
->data
[key
.dsize
+sizeof(*header
)], data
.dptr
, data
.dsize
);
196 /* helper function for marshalling multiple records */
197 static struct ctdb_marshall_buffer
*db_ctdb_marshall_add(TALLOC_CTX
*mem_ctx
,
198 struct ctdb_marshall_buffer
*m
,
202 struct ctdb_ltdb_header
*header
,
205 struct ctdb_rec_data
*r
;
206 size_t m_size
, r_size
;
207 struct ctdb_marshall_buffer
*m2
= NULL
;
209 r
= db_ctdb_marshall_record(talloc_tos(), reqid
, key
, header
, data
);
216 m
= (struct ctdb_marshall_buffer
*)talloc_zero_size(
217 mem_ctx
, offsetof(struct ctdb_marshall_buffer
, data
));
224 m_size
= talloc_get_size(m
);
225 r_size
= talloc_get_size(r
);
227 m2
= (struct ctdb_marshall_buffer
*)talloc_realloc_size(
228 mem_ctx
, m
, m_size
+ r_size
);
234 memcpy(m_size
+ (uint8_t *)m2
, r
, r_size
);
243 /* we've finished marshalling, return a data blob with the marshalled records */
244 static TDB_DATA
db_ctdb_marshall_finish(struct ctdb_marshall_buffer
*m
)
247 data
.dptr
= (uint8_t *)m
;
248 data
.dsize
= talloc_get_size(m
);
253 loop over a marshalling buffer
255 - pass r==NULL to start
256 - loop the number of times indicated by m->count
258 static struct ctdb_rec_data
*db_ctdb_marshall_loop_next_key(
259 struct ctdb_marshall_buffer
*m
, struct ctdb_rec_data
*r
, TDB_DATA
*key
)
262 r
= (struct ctdb_rec_data
*)&m
->data
[0];
264 r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
267 key
->dptr
= &r
->data
[0];
268 key
->dsize
= r
->keylen
;
272 static bool db_ctdb_marshall_buf_parse(
273 struct ctdb_rec_data
*r
, uint32_t *reqid
,
274 struct ctdb_ltdb_header
**header
, TDB_DATA
*data
)
276 if (r
->datalen
< sizeof(struct ctdb_ltdb_header
)) {
282 data
->dptr
= &r
->data
[r
->keylen
] + sizeof(struct ctdb_ltdb_header
);
283 data
->dsize
= r
->datalen
- sizeof(struct ctdb_ltdb_header
);
285 *header
= (struct ctdb_ltdb_header
*)&r
->data
[r
->keylen
];
291 * CTDB transaction destructor
293 static int db_ctdb_transaction_destructor(struct db_ctdb_transaction_handle
*h
)
297 status
= g_lock_unlock(h
->ctx
->lock_ctx
, h
->lock_name
);
298 if (!NT_STATUS_IS_OK(status
)) {
299 DEBUG(0, ("g_lock_unlock failed for %s: %s\n", h
->lock_name
,
307 * CTDB dbwrap API: transaction_start function
308 * starts a transaction on a persistent database
310 static int db_ctdb_transaction_start(struct db_context
*db
)
312 struct db_ctdb_transaction_handle
*h
;
314 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
317 if (!db
->persistent
) {
318 DEBUG(0,("transactions not supported on non-persistent database 0x%08x\n",
323 if (ctx
->transaction
) {
324 ctx
->transaction
->nesting
++;
325 DEBUG(5, (__location__
" transaction start on db 0x%08x: nesting %d -> %d\n",
326 ctx
->db_id
, ctx
->transaction
->nesting
- 1, ctx
->transaction
->nesting
));
330 h
= talloc_zero(db
, struct db_ctdb_transaction_handle
);
332 DEBUG(0,(__location__
" oom for transaction handle\n"));
338 h
->lock_name
= talloc_asprintf(h
, "transaction_db_0x%08x",
339 (unsigned int)ctx
->db_id
);
340 if (h
->lock_name
== NULL
) {
341 DEBUG(0, ("talloc_asprintf failed\n"));
347 * Wait a day, i.e. forever...
349 status
= g_lock_lock(ctx
->lock_ctx
, h
->lock_name
, G_LOCK_WRITE
,
350 timeval_set(86400, 0));
351 if (!NT_STATUS_IS_OK(status
)) {
352 DEBUG(0, ("g_lock_lock failed: %s\n", nt_errstr(status
)));
357 talloc_set_destructor(h
, db_ctdb_transaction_destructor
);
359 ctx
->transaction
= h
;
361 DEBUG(5,(__location__
" transaction started on db 0x%08x\n", ctx
->db_id
));
366 static bool parse_newest_in_marshall_buffer(
367 struct ctdb_marshall_buffer
*buf
, TDB_DATA key
,
368 void (*parser
)(TDB_DATA key
, struct ctdb_ltdb_header
*header
,
369 TDB_DATA data
, void *private_data
),
372 struct ctdb_rec_data
*rec
= NULL
;
373 struct ctdb_ltdb_header
*h
= NULL
;
382 * Walk the list of records written during this
383 * transaction. If we want to read one we have already
384 * written, return the last written sample. Thus we do not do
385 * a "break;" for the first hit, this record might have been
389 for (i
=0; i
<buf
->count
; i
++) {
393 rec
= db_ctdb_marshall_loop_next_key(buf
, rec
, &tkey
);
398 if (!tdb_data_equal(key
, tkey
)) {
402 if (!db_ctdb_marshall_buf_parse(rec
, &reqid
, &h
, &data
)) {
411 parser(key
, h
, data
, private_data
);
416 struct pull_newest_from_marshall_buffer_state
{
417 struct ctdb_ltdb_header
*pheader
;
422 static void pull_newest_from_marshall_buffer_parser(
423 TDB_DATA key
, struct ctdb_ltdb_header
*header
,
424 TDB_DATA data
, void *private_data
)
426 struct pull_newest_from_marshall_buffer_state
*state
=
427 (struct pull_newest_from_marshall_buffer_state
*)private_data
;
429 if (state
->pheader
!= NULL
) {
430 memcpy(state
->pheader
, header
, sizeof(*state
->pheader
));
432 if (state
->pdata
!= NULL
) {
433 state
->pdata
->dsize
= data
.dsize
;
434 state
->pdata
->dptr
= (uint8_t *)talloc_memdup(
435 state
->mem_ctx
, data
.dptr
, data
.dsize
);
439 static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer
*buf
,
441 struct ctdb_ltdb_header
*pheader
,
445 struct pull_newest_from_marshall_buffer_state state
;
447 state
.pheader
= pheader
;
448 state
.mem_ctx
= mem_ctx
;
451 if (!parse_newest_in_marshall_buffer(
452 buf
, key
, pull_newest_from_marshall_buffer_parser
,
456 if ((pdata
!= NULL
) && (pdata
->dsize
!= 0) && (pdata
->dptr
== NULL
)) {
463 static NTSTATUS
db_ctdb_store_transaction(struct db_record
*rec
, TDB_DATA data
, int flag
);
464 static NTSTATUS
db_ctdb_delete_transaction(struct db_record
*rec
);
466 static struct db_record
*db_ctdb_fetch_locked_transaction(struct db_ctdb_ctx
*ctx
,
470 struct db_record
*result
;
473 if (!(result
= talloc(mem_ctx
, struct db_record
))) {
474 DEBUG(0, ("talloc failed\n"));
478 result
->private_data
= ctx
->transaction
;
480 result
->key
.dsize
= key
.dsize
;
481 result
->key
.dptr
= (uint8_t *)talloc_memdup(result
, key
.dptr
,
483 if (result
->key
.dptr
== NULL
) {
484 DEBUG(0, ("talloc failed\n"));
489 result
->store
= db_ctdb_store_transaction
;
490 result
->delete_rec
= db_ctdb_delete_transaction
;
492 if (pull_newest_from_marshall_buffer(ctx
->transaction
->m_write
, key
,
493 NULL
, result
, &result
->value
)) {
497 ctdb_data
= tdb_fetch_compat(ctx
->wtdb
->tdb
, key
);
498 if (ctdb_data
.dptr
== NULL
) {
499 /* create the record */
500 result
->value
= tdb_null
;
504 result
->value
.dsize
= ctdb_data
.dsize
- sizeof(struct ctdb_ltdb_header
);
505 result
->value
.dptr
= NULL
;
507 if ((result
->value
.dsize
!= 0)
508 && !(result
->value
.dptr
= (uint8_t *)talloc_memdup(
509 result
, ctdb_data
.dptr
+ sizeof(struct ctdb_ltdb_header
),
510 result
->value
.dsize
))) {
511 DEBUG(0, ("talloc failed\n"));
515 SAFE_FREE(ctdb_data
.dptr
);
520 static int db_ctdb_record_destructor(struct db_record
**recp
)
522 struct db_record
*rec
= talloc_get_type_abort(*recp
, struct db_record
);
523 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
524 rec
->private_data
, struct db_ctdb_transaction_handle
);
525 int ret
= h
->ctx
->db
->transaction_commit(h
->ctx
->db
);
527 DEBUG(0,(__location__
" transaction_commit failed\n"));
533 auto-create a transaction for persistent databases
535 static struct db_record
*db_ctdb_fetch_locked_persistent(struct db_ctdb_ctx
*ctx
,
540 struct db_record
*rec
, **recp
;
542 res
= db_ctdb_transaction_start(ctx
->db
);
547 rec
= db_ctdb_fetch_locked_transaction(ctx
, mem_ctx
, key
);
549 ctx
->db
->transaction_cancel(ctx
->db
);
553 /* destroy this transaction when we release the lock */
554 recp
= talloc(rec
, struct db_record
*);
556 ctx
->db
->transaction_cancel(ctx
->db
);
561 talloc_set_destructor(recp
, db_ctdb_record_destructor
);
567 stores a record inside a transaction
569 static NTSTATUS
db_ctdb_transaction_store(struct db_ctdb_transaction_handle
*h
,
570 TDB_DATA key
, TDB_DATA data
)
572 TALLOC_CTX
*tmp_ctx
= talloc_new(h
);
574 struct ctdb_ltdb_header header
;
578 /* we need the header so we can update the RSN */
580 if (!pull_newest_from_marshall_buffer(h
->m_write
, key
, &header
,
583 rec
= tdb_fetch_compat(h
->ctx
->wtdb
->tdb
, key
);
585 if (rec
.dptr
!= NULL
) {
586 memcpy(&header
, rec
.dptr
,
587 sizeof(struct ctdb_ltdb_header
));
588 rec
.dsize
-= sizeof(struct ctdb_ltdb_header
);
591 * a special case, we are writing the same
592 * data that is there now
594 if (data
.dsize
== rec
.dsize
&&
596 rec
.dptr
+ sizeof(struct ctdb_ltdb_header
),
599 talloc_free(tmp_ctx
);
606 header
.dmaster
= get_my_vnn();
609 h
->m_write
= db_ctdb_marshall_add(h
, h
->m_write
, h
->ctx
->db_id
, 0, key
, &header
, data
);
610 if (h
->m_write
== NULL
) {
611 DEBUG(0,(__location__
" Failed to add to marshalling record\n"));
612 talloc_free(tmp_ctx
);
613 return NT_STATUS_NO_MEMORY
;
616 talloc_free(tmp_ctx
);
622 a record store inside a transaction
624 static NTSTATUS
db_ctdb_store_transaction(struct db_record
*rec
, TDB_DATA data
, int flag
)
626 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
627 rec
->private_data
, struct db_ctdb_transaction_handle
);
630 status
= db_ctdb_transaction_store(h
, rec
->key
, data
);
635 a record delete inside a transaction
637 static NTSTATUS
db_ctdb_delete_transaction(struct db_record
*rec
)
639 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
640 rec
->private_data
, struct db_ctdb_transaction_handle
);
643 status
= db_ctdb_transaction_store(h
, rec
->key
, tdb_null
);
647 static void db_ctdb_fetch_db_seqnum_parser(
648 TDB_DATA key
, struct ctdb_ltdb_header
*header
,
649 TDB_DATA data
, void *private_data
)
651 uint64_t *seqnum
= (uint64_t *)private_data
;
653 if (data
.dsize
!= sizeof(uint64_t)) {
657 memcpy(seqnum
, data
.dptr
, sizeof(*seqnum
));
661 * Fetch the db sequence number of a persistent db directly from the db.
663 static NTSTATUS
db_ctdb_fetch_db_seqnum_from_db(struct db_ctdb_ctx
*db
,
669 if (seqnum
== NULL
) {
670 return NT_STATUS_INVALID_PARAMETER
;
673 key
= string_term_tdb_data(CTDB_DB_SEQNUM_KEY
);
675 status
= db_ctdb_ltdb_parse(
676 db
, key
, db_ctdb_fetch_db_seqnum_parser
, seqnum
);
678 if (NT_STATUS_IS_OK(status
)) {
681 if (NT_STATUS_EQUAL(status
, NT_STATUS_NOT_FOUND
)) {
689 * Store the database sequence number inside a transaction.
691 static NTSTATUS
db_ctdb_store_db_seqnum(struct db_ctdb_transaction_handle
*h
,
695 const char *keyname
= CTDB_DB_SEQNUM_KEY
;
699 key
= string_term_tdb_data(keyname
);
701 data
.dptr
= (uint8_t *)&seqnum
;
702 data
.dsize
= sizeof(uint64_t);
704 status
= db_ctdb_transaction_store(h
, key
, data
);
712 static int db_ctdb_transaction_commit(struct db_context
*db
)
714 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
718 struct db_ctdb_transaction_handle
*h
= ctx
->transaction
;
719 uint64_t old_seqnum
, new_seqnum
;
723 DEBUG(0,(__location__
" transaction commit with no open transaction on db 0x%08x\n", ctx
->db_id
));
727 if (h
->nested_cancel
) {
728 db
->transaction_cancel(db
);
729 DEBUG(5,(__location__
" Failed transaction commit after nested cancel\n"));
733 if (h
->nesting
!= 0) {
735 DEBUG(5, (__location__
" transaction commit on db 0x%08x: nesting %d -> %d\n",
736 ctx
->db_id
, ctx
->transaction
->nesting
+ 1, ctx
->transaction
->nesting
));
740 if (h
->m_write
== NULL
) {
742 * No changes were made, so don't change the seqnum,
743 * don't push to other node, just exit with success.
749 DEBUG(5,(__location__
" transaction commit on db 0x%08x\n", ctx
->db_id
));
752 * As the last db action before committing, bump the database sequence
753 * number. Note that this undoes all changes to the seqnum records
754 * performed under the transaction. This record is not meant to be
755 * modified by user interaction. It is for internal use only...
757 rets
= db_ctdb_fetch_db_seqnum_from_db(ctx
, &old_seqnum
);
758 if (!NT_STATUS_IS_OK(rets
)) {
759 DEBUG(1, (__location__
" failed to fetch the db sequence number "
760 "in transaction commit on db 0x%08x\n", ctx
->db_id
));
765 new_seqnum
= old_seqnum
+ 1;
767 rets
= db_ctdb_store_db_seqnum(h
, new_seqnum
);
768 if (!NT_STATUS_IS_OK(rets
)) {
769 DEBUG(1, (__location__
"failed to store the db sequence number "
770 " in transaction commit on db 0x%08x\n", ctx
->db_id
));
776 /* tell ctdbd to commit to the other nodes */
777 rets
= ctdbd_control_local(messaging_ctdbd_connection(),
778 CTDB_CONTROL_TRANS3_COMMIT
,
780 db_ctdb_marshall_finish(h
->m_write
),
781 NULL
, NULL
, &status
);
782 if (!NT_STATUS_IS_OK(rets
) || status
!= 0) {
784 * The TRANS3_COMMIT control should only possibly fail when a
785 * recovery has been running concurrently. In any case, the db
786 * will be the same on all nodes, either the new copy or the
787 * old copy. This can be detected by comparing the old and new
788 * local sequence numbers.
790 rets
= db_ctdb_fetch_db_seqnum_from_db(ctx
, &new_seqnum
);
791 if (!NT_STATUS_IS_OK(rets
)) {
792 DEBUG(1, (__location__
" failed to refetch db sequence "
793 "number after failed TRANS3_COMMIT\n"));
798 if (new_seqnum
== old_seqnum
) {
799 /* Recovery prevented all our changes: retry. */
802 if (new_seqnum
!= (old_seqnum
+ 1)) {
803 DEBUG(0, (__location__
" ERROR: new_seqnum[%lu] != "
804 "old_seqnum[%lu] + (0 or 1) after failed "
805 "TRANS3_COMMIT - this should not happen!\n",
806 (unsigned long)new_seqnum
,
807 (unsigned long)old_seqnum
));
812 * Recovery propagated our changes to all nodes, completing
813 * our commit for us - succeed.
820 h
->ctx
->transaction
= NULL
;
829 static int db_ctdb_transaction_cancel(struct db_context
*db
)
831 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
833 struct db_ctdb_transaction_handle
*h
= ctx
->transaction
;
836 DEBUG(0,(__location__
" transaction cancel with no open transaction on db 0x%08x\n", ctx
->db_id
));
840 if (h
->nesting
!= 0) {
842 h
->nested_cancel
= true;
843 DEBUG(5, (__location__
" transaction cancel on db 0x%08x: nesting %d -> %d\n",
844 ctx
->db_id
, ctx
->transaction
->nesting
+ 1, ctx
->transaction
->nesting
));
848 DEBUG(5,(__location__
" Cancel transaction on db 0x%08x\n", ctx
->db_id
));
850 ctx
->transaction
= NULL
;
856 static NTSTATUS
db_ctdb_store(struct db_record
*rec
, TDB_DATA data
, int flag
)
858 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
859 rec
->private_data
, struct db_ctdb_rec
);
861 return db_ctdb_ltdb_store(crec
->ctdb_ctx
, rec
->key
, &(crec
->header
), data
);
866 #ifdef HAVE_CTDB_CONTROL_SCHEDULE_FOR_DELETION_DECL
867 static NTSTATUS
db_ctdb_send_schedule_for_deletion(struct db_record
*rec
)
870 struct ctdb_control_schedule_for_deletion
*dd
;
873 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
874 rec
->private_data
, struct db_ctdb_rec
);
876 indata
.dsize
= offsetof(struct ctdb_control_schedule_for_deletion
, key
) + rec
->key
.dsize
;
877 indata
.dptr
= talloc_zero_array(crec
, uint8_t, indata
.dsize
);
878 if (indata
.dptr
== NULL
) {
879 DEBUG(0, (__location__
" talloc failed!\n"));
880 return NT_STATUS_NO_MEMORY
;
883 dd
= (struct ctdb_control_schedule_for_deletion
*)(void *)indata
.dptr
;
884 dd
->db_id
= crec
->ctdb_ctx
->db_id
;
885 dd
->hdr
= crec
->header
;
886 dd
->keylen
= rec
->key
.dsize
;
887 memcpy(dd
->key
, rec
->key
.dptr
, rec
->key
.dsize
);
889 status
= ctdbd_control_local(messaging_ctdbd_connection(),
890 CTDB_CONTROL_SCHEDULE_FOR_DELETION
,
891 crec
->ctdb_ctx
->db_id
,
892 CTDB_CTRL_FLAG_NOREPLY
, /* flags */
897 talloc_free(indata
.dptr
);
899 if (!NT_STATUS_IS_OK(status
) || cstatus
!= 0) {
900 DEBUG(1, (__location__
" Error sending local control "
901 "SCHEDULE_FOR_DELETION: %s, cstatus = %d\n",
902 nt_errstr(status
), cstatus
));
903 if (NT_STATUS_IS_OK(status
)) {
904 status
= NT_STATUS_UNSUCCESSFUL
;
912 static NTSTATUS
db_ctdb_delete(struct db_record
*rec
)
918 * We have to store the header with empty data. TODO: Fix the
924 status
= db_ctdb_store(rec
, data
, 0);
925 if (!NT_STATUS_IS_OK(status
)) {
929 #ifdef HAVE_CTDB_CONTROL_SCHEDULE_FOR_DELETION_DECL
930 status
= db_ctdb_send_schedule_for_deletion(rec
);
936 static int db_ctdb_record_destr(struct db_record
* data
)
938 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
939 data
->private_data
, struct db_ctdb_rec
);
942 DEBUG(10, (DEBUGLEVEL
> 10
943 ? "Unlocking db %u key %s\n"
944 : "Unlocking db %u key %.20s\n",
945 (int)crec
->ctdb_ctx
->db_id
,
946 hex_encode_talloc(data
, (unsigned char *)data
->key
.dptr
,
949 tdb_chainunlock(crec
->ctdb_ctx
->wtdb
->tdb
, data
->key
);
951 threshold
= lp_ctdb_locktime_warn_threshold();
952 if (threshold
!= 0) {
953 double timediff
= timeval_elapsed(&crec
->lock_time
);
954 if ((timediff
* 1000) > threshold
) {
957 key
= hex_encode_talloc(data
,
958 (unsigned char *)data
->key
.dptr
,
960 DEBUG(0, ("Held tdb lock on db %s, key %s %f seconds\n",
961 tdb_name(crec
->ctdb_ctx
->wtdb
->tdb
), key
,
970 * Check whether we have a valid local copy of the given record,
971 * either for reading or for writing.
973 static bool db_ctdb_can_use_local_hdr(const struct ctdb_ltdb_header
*hdr
,
976 #ifdef HAVE_CTDB_WANT_READONLY_DECL
977 if (hdr
->dmaster
!= get_my_vnn()) {
978 /* If we're not dmaster, it must be r/o copy. */
979 return read_only
&& (hdr
->flags
& CTDB_REC_RO_HAVE_READONLY
);
983 * If we want write access, no one may have r/o copies.
985 return read_only
|| !(hdr
->flags
& CTDB_REC_RO_HAVE_DELEGATIONS
);
987 return (hdr
->dmaster
== get_my_vnn());
991 static bool db_ctdb_can_use_local_copy(TDB_DATA ctdb_data
, bool read_only
)
993 if (ctdb_data
.dptr
== NULL
)
996 if (ctdb_data
.dsize
< sizeof(struct ctdb_ltdb_header
))
999 return db_ctdb_can_use_local_hdr(
1000 (struct ctdb_ltdb_header
*)ctdb_data
.dptr
, read_only
);
1003 static struct db_record
*fetch_locked_internal(struct db_ctdb_ctx
*ctx
,
1004 TALLOC_CTX
*mem_ctx
,
1008 struct db_record
*result
;
1009 struct db_ctdb_rec
*crec
;
1012 int migrate_attempts
= 0;
1015 if (!(result
= talloc(mem_ctx
, struct db_record
))) {
1016 DEBUG(0, ("talloc failed\n"));
1020 if (!(crec
= talloc_zero(result
, struct db_ctdb_rec
))) {
1021 DEBUG(0, ("talloc failed\n"));
1022 TALLOC_FREE(result
);
1026 result
->db
= ctx
->db
;
1027 result
->private_data
= (void *)crec
;
1028 crec
->ctdb_ctx
= ctx
;
1030 result
->key
.dsize
= key
.dsize
;
1031 result
->key
.dptr
= (uint8_t *)talloc_memdup(result
, key
.dptr
,
1033 if (result
->key
.dptr
== NULL
) {
1034 DEBUG(0, ("talloc failed\n"));
1035 TALLOC_FREE(result
);
1040 * Do a blocking lock on the record
1044 if (DEBUGLEVEL
>= 10) {
1045 char *keystr
= hex_encode_talloc(result
, key
.dptr
, key
.dsize
);
1046 DEBUG(10, (DEBUGLEVEL
> 10
1047 ? "Locking db %u key %s\n"
1048 : "Locking db %u key %.20s\n",
1049 (int)crec
->ctdb_ctx
->db_id
, keystr
));
1050 TALLOC_FREE(keystr
);
1054 ? tdb_chainlock_nonblock(ctx
->wtdb
->tdb
, key
)
1055 : tdb_chainlock(ctx
->wtdb
->tdb
, key
);
1057 DEBUG(3, ("tdb_chainlock failed\n"));
1058 TALLOC_FREE(result
);
1062 result
->store
= db_ctdb_store
;
1063 result
->delete_rec
= db_ctdb_delete
;
1064 talloc_set_destructor(result
, db_ctdb_record_destr
);
1066 ctdb_data
= tdb_fetch_compat(ctx
->wtdb
->tdb
, key
);
1069 * See if we have a valid record and we are the dmaster. If so, we can
1070 * take the shortcut and just return it.
1073 if (!db_ctdb_can_use_local_copy(ctdb_data
, false)) {
1074 SAFE_FREE(ctdb_data
.dptr
);
1075 tdb_chainunlock(ctx
->wtdb
->tdb
, key
);
1076 talloc_set_destructor(result
, NULL
);
1078 if (tryonly
&& (migrate_attempts
!= 0)) {
1079 DEBUG(5, ("record migrated away again\n"));
1080 TALLOC_FREE(result
);
1084 migrate_attempts
+= 1;
1086 DEBUG(10, ("ctdb_data.dptr = %p, dmaster = %u (%u) %u\n",
1087 ctdb_data
.dptr
, ctdb_data
.dptr
?
1088 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->dmaster
: -1,
1091 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->flags
: 0));
1093 status
= ctdbd_migrate(messaging_ctdbd_connection(), ctx
->db_id
,
1095 if (!NT_STATUS_IS_OK(status
)) {
1096 DEBUG(5, ("ctdb_migrate failed: %s\n",
1097 nt_errstr(status
)));
1098 TALLOC_FREE(result
);
1101 /* now it's migrated, try again */
1105 if (migrate_attempts
> 10) {
1106 DEBUG(0, ("db_ctdb_fetch_locked for %s key %s needed %d "
1107 "attempts\n", tdb_name(ctx
->wtdb
->tdb
),
1108 hex_encode_talloc(talloc_tos(),
1109 (unsigned char *)key
.dptr
,
1114 GetTimeOfDay(&crec
->lock_time
);
1116 memcpy(&crec
->header
, ctdb_data
.dptr
, sizeof(crec
->header
));
1118 result
->value
.dsize
= ctdb_data
.dsize
- sizeof(crec
->header
);
1119 result
->value
.dptr
= NULL
;
1121 if ((result
->value
.dsize
!= 0)
1122 && !(result
->value
.dptr
= (uint8_t *)talloc_memdup(
1123 result
, ctdb_data
.dptr
+ sizeof(crec
->header
),
1124 result
->value
.dsize
))) {
1125 DEBUG(0, ("talloc failed\n"));
1126 TALLOC_FREE(result
);
1129 SAFE_FREE(ctdb_data
.dptr
);
1134 static struct db_record
*db_ctdb_fetch_locked(struct db_context
*db
,
1135 TALLOC_CTX
*mem_ctx
,
1138 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1139 struct db_ctdb_ctx
);
1141 if (ctx
->transaction
!= NULL
) {
1142 return db_ctdb_fetch_locked_transaction(ctx
, mem_ctx
, key
);
1145 if (db
->persistent
) {
1146 return db_ctdb_fetch_locked_persistent(ctx
, mem_ctx
, key
);
1149 return fetch_locked_internal(ctx
, mem_ctx
, key
, false);
1152 static struct db_record
*db_ctdb_try_fetch_locked(struct db_context
*db
,
1153 TALLOC_CTX
*mem_ctx
,
1156 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1157 struct db_ctdb_ctx
);
1159 if (ctx
->transaction
!= NULL
) {
1160 return db_ctdb_fetch_locked_transaction(ctx
, mem_ctx
, key
);
1163 if (db
->persistent
) {
1164 return db_ctdb_fetch_locked_persistent(ctx
, mem_ctx
, key
);
1167 return fetch_locked_internal(ctx
, mem_ctx
, key
, true);
1170 struct db_ctdb_parse_record_state
{
1171 void (*parser
)(TDB_DATA key
, TDB_DATA data
, void *private_data
);
1173 bool ask_for_readonly_copy
;
1177 static void db_ctdb_parse_record_parser(
1178 TDB_DATA key
, struct ctdb_ltdb_header
*header
,
1179 TDB_DATA data
, void *private_data
)
1181 struct db_ctdb_parse_record_state
*state
=
1182 (struct db_ctdb_parse_record_state
*)private_data
;
1183 state
->parser(key
, data
, state
->private_data
);
1186 static void db_ctdb_parse_record_parser_nonpersistent(
1187 TDB_DATA key
, struct ctdb_ltdb_header
*header
,
1188 TDB_DATA data
, void *private_data
)
1190 struct db_ctdb_parse_record_state
*state
=
1191 (struct db_ctdb_parse_record_state
*)private_data
;
1193 if (db_ctdb_can_use_local_hdr(header
, true)) {
1194 state
->parser(key
, data
, state
->private_data
);
1198 * We found something in the db, so it seems that this record,
1199 * while not usable locally right now, is popular. Ask for a
1202 state
->ask_for_readonly_copy
= true;
1206 static NTSTATUS
db_ctdb_parse_record(struct db_context
*db
, TDB_DATA key
,
1207 void (*parser
)(TDB_DATA key
,
1209 void *private_data
),
1212 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(
1213 db
->private_data
, struct db_ctdb_ctx
);
1214 struct db_ctdb_parse_record_state state
;
1218 state
.parser
= parser
;
1219 state
.private_data
= private_data
;
1221 if (ctx
->transaction
!= NULL
) {
1222 struct db_ctdb_transaction_handle
*h
= ctx
->transaction
;
1226 * Transactions only happen for persistent db's.
1229 found
= parse_newest_in_marshall_buffer(
1230 h
->m_write
, key
, db_ctdb_parse_record_parser
, &state
);
1233 return NT_STATUS_OK
;
1237 if (db
->persistent
) {
1239 * Persistent db, but not found in the transaction buffer
1241 return db_ctdb_ltdb_parse(
1242 ctx
, key
, db_ctdb_parse_record_parser
, &state
);
1246 state
.ask_for_readonly_copy
= false;
1248 status
= db_ctdb_ltdb_parse(
1249 ctx
, key
, db_ctdb_parse_record_parser_nonpersistent
, &state
);
1250 if (NT_STATUS_IS_OK(status
) && state
.done
) {
1251 return NT_STATUS_OK
;
1254 status
= ctdbd_fetch(messaging_ctdbd_connection(), ctx
->db_id
, key
,
1255 talloc_tos(), &data
, state
.ask_for_readonly_copy
);
1256 if (!NT_STATUS_IS_OK(status
)) {
1259 parser(key
, data
, private_data
);
1260 TALLOC_FREE(data
.dptr
);
1261 return NT_STATUS_OK
;
1264 struct traverse_state
{
1265 struct db_context
*db
;
1266 int (*fn
)(struct db_record
*rec
, void *private_data
);
1271 static void traverse_callback(TDB_DATA key
, TDB_DATA data
, void *private_data
)
1273 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1274 struct db_record
*rec
;
1275 TALLOC_CTX
*tmp_ctx
= talloc_new(state
->db
);
1276 /* we have to give them a locked record to prevent races */
1277 rec
= db_ctdb_fetch_locked(state
->db
, tmp_ctx
, key
);
1278 if (rec
&& rec
->value
.dsize
> 0) {
1279 state
->fn(rec
, state
->private_data
);
1281 talloc_free(tmp_ctx
);
1284 static int traverse_persistent_callback(TDB_CONTEXT
*tdb
, TDB_DATA kbuf
, TDB_DATA dbuf
,
1287 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1288 struct db_record
*rec
;
1289 TALLOC_CTX
*tmp_ctx
= talloc_new(state
->db
);
1293 * Skip the __db_sequence_number__ key:
1294 * This is used for persistent transactions internally.
1296 if (kbuf
.dsize
== strlen(CTDB_DB_SEQNUM_KEY
) + 1 &&
1297 strcmp((const char*)kbuf
.dptr
, CTDB_DB_SEQNUM_KEY
) == 0)
1302 /* we have to give them a locked record to prevent races */
1303 rec
= db_ctdb_fetch_locked(state
->db
, tmp_ctx
, kbuf
);
1304 if (rec
&& rec
->value
.dsize
> 0) {
1305 ret
= state
->fn(rec
, state
->private_data
);
1309 talloc_free(tmp_ctx
);
1313 /* wrapper to use traverse_persistent_callback with dbwrap */
1314 static int traverse_persistent_callback_dbwrap(struct db_record
*rec
, void* data
)
1316 return traverse_persistent_callback(NULL
, rec
->key
, rec
->value
, data
);
1320 static int db_ctdb_traverse(struct db_context
*db
,
1321 int (*fn
)(struct db_record
*rec
,
1322 void *private_data
),
1326 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1327 struct db_ctdb_ctx
);
1328 struct traverse_state state
;
1332 state
.private_data
= private_data
;
1335 if (db
->persistent
) {
1336 struct tdb_context
*ltdb
= ctx
->wtdb
->tdb
;
1339 /* for persistent databases we don't need to do a ctdb traverse,
1340 we can do a faster local traverse */
1341 ret
= tdb_traverse(ltdb
, traverse_persistent_callback
, &state
);
1345 if (ctx
->transaction
&& ctx
->transaction
->m_write
) {
1347 * we now have to handle keys not yet
1348 * present at transaction start
1350 struct db_context
*newkeys
= db_open_rbt(talloc_tos());
1351 struct ctdb_marshall_buffer
*mbuf
= ctx
->transaction
->m_write
;
1352 struct ctdb_rec_data
*rec
=NULL
;
1356 if (newkeys
== NULL
) {
1360 for (i
=0; i
<mbuf
->count
; i
++) {
1362 rec
= db_ctdb_marshall_loop_next_key(
1364 SMB_ASSERT(rec
!= NULL
);
1366 if (!tdb_exists(ltdb
, key
)) {
1367 dbwrap_store(newkeys
, key
, tdb_null
, 0);
1370 status
= dbwrap_traverse(newkeys
,
1371 traverse_persistent_callback_dbwrap
,
1374 talloc_free(newkeys
);
1375 if (!NT_STATUS_IS_OK(status
)) {
1383 status
= ctdbd_traverse(ctx
->db_id
, traverse_callback
, &state
);
1384 if (!NT_STATUS_IS_OK(status
)) {
1390 static NTSTATUS
db_ctdb_store_deny(struct db_record
*rec
, TDB_DATA data
, int flag
)
1392 return NT_STATUS_MEDIA_WRITE_PROTECTED
;
1395 static NTSTATUS
db_ctdb_delete_deny(struct db_record
*rec
)
1397 return NT_STATUS_MEDIA_WRITE_PROTECTED
;
1400 static void traverse_read_callback(TDB_DATA key
, TDB_DATA data
, void *private_data
)
1402 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1403 struct db_record rec
;
1406 rec
.store
= db_ctdb_store_deny
;
1407 rec
.delete_rec
= db_ctdb_delete_deny
;
1408 rec
.private_data
= state
->db
;
1409 state
->fn(&rec
, state
->private_data
);
/*
 * Per-record callback that db_ctdb_traverse_read() passes to
 * tdb_traverse_read() for persistent databases.
 *
 * private_data is the caller's struct traverse_state.  Visible behaviour:
 *  - a key whose size and contents match CTDB_DB_SEQNUM_KEY (including the
 *    terminating NUL) is special-cased;
 *  - a record whose value is no larger than struct ctdb_ltdb_header is
 *    treated as a deleted record;
 *  - otherwise the ctdb_ltdb_header is stripped from the front of the
 *    value and the user callback state->fn is called with a record whose
 *    store/delete_rec hooks refuse all modifications.
 *
 * Returns the value returned by state->fn for records that reach the user
 * callback.
 *
 * NOTE(review): the bodies of the two special-case branches, the trailing
 * parameter declaration, and the statements that copy kbuf/dbuf into rec
 * are not visible in this excerpt -- presumably the branches return 0 so
 * that the traverse continues; confirm against the full file.
 */
1413 static int traverse_persistent_callback_read(TDB_CONTEXT
*tdb
, TDB_DATA kbuf
, TDB_DATA dbuf
,
1416 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1417 struct db_record rec
;
1420 * Skip the __db_sequence_number__ key:
1421 * This is used for persistent transactions internally.
/*
 * The size test (strlen + 1) means the stored key must include its
 * terminating NUL before the strcmp() is attempted.
 */
1423 if (kbuf
.dsize
== strlen(CTDB_DB_SEQNUM_KEY
) + 1 &&
1424 strcmp((const char*)kbuf
.dptr
, CTDB_DB_SEQNUM_KEY
) == 0)
/* Writes and deletes attempted through this record are refused. */
1431 rec
.store
= db_ctdb_store_deny
;
1432 rec
.delete_rec
= db_ctdb_delete_deny
;
1433 rec
.private_data
= state
->db
;
/* A value holding nothing beyond the ltdb header carries no user data. */
1435 if (rec
.value
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1436 /* a deleted record */
/* Strip the ltdb header so the user callback sees only the payload. */
1439 rec
.value
.dsize
-= sizeof(struct ctdb_ltdb_header
);
1440 rec
.value
.dptr
+= sizeof(struct ctdb_ltdb_header
);
/* The user callback's return value is passed straight back to tdb. */
1443 return state
->fn(&rec
, state
->private_data
);
/*
 * Read-only traverse of a ctdb-backed database.
 *
 * db            database handle; db->private_data must be a
 *               struct db_ctdb_ctx (checked with talloc_get_type_abort).
 * fn            user callback invoked for each record.
 * private_data  opaque pointer forwarded to fn.
 *
 * For persistent databases the local tdb is walked directly with
 * tdb_traverse_read() and traverse_persistent_callback_read, and the
 * result of tdb_traverse_read() is returned as is.  For all other
 * databases the traverse is delegated to ctdbd_traverse() with
 * traverse_read_callback.
 *
 * In both cases the records handed to fn carry store/delete hooks that
 * refuse modification.
 *
 * NOTE(review): only the assignment of state.private_data is visible in
 * this excerpt, yet the callbacks read state->db and state->fn -- confirm
 * both are initialised.  The declaration of status and the return values
 * of the non-persistent path are likewise not visible here.
 */
1446 static int db_ctdb_traverse_read(struct db_context
*db
,
1447 int (*fn
)(struct db_record
*rec
,
1448 void *private_data
),
1452 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1453 struct db_ctdb_ctx
);
1454 struct traverse_state state
;
1458 state
.private_data
= private_data
;
1461 if (db
->persistent
) {
1462 /* for persistent databases we don't need to do a ctdb traverse,
1463 we can do a faster local traverse */
1464 return tdb_traverse_read(ctx
->wtdb
->tdb
, traverse_persistent_callback_read
, &state
);
/* Non-persistent database: let ctdbd drive the traverse. */
1467 status
= ctdbd_traverse(ctx
->db_id
, traverse_read_callback
, &state
);
1468 if (!NT_STATUS_IS_OK(status
)) {
1474 static int db_ctdb_get_seqnum(struct db_context
*db
)
1476 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1477 struct db_ctdb_ctx
);
1478 return tdb_get_seqnum(ctx
->wtdb
->tdb
);
/*
 * Expose the ctdb database id as an opaque byte identifier.
 *
 * db     database handle; db->private_data must be a struct db_ctdb_ctx
 *        (checked with talloc_get_type_abort).
 * id     out: set to the address of ctx->db_id, viewed as bytes.
 * idlen  out: set to sizeof(ctx->db_id).
 *
 * No copy is made: *id points into the db_ctdb_ctx itself, so it is only
 * usable while that context exists.
 *
 * NOTE(review): the declaration of the idlen parameter is not visible in
 * this excerpt -- confirm its type against the full file.
 */
1481 static void db_ctdb_id(struct db_context
*db
, const uint8_t **id
,
1484 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(
1485 db
->private_data
, struct db_ctdb_ctx
);
/* The bytes are in the host's in-memory representation of db_id. */
1487 *id
= (uint8_t *)&ctx
->db_id
;
1488 *idlen
= sizeof(ctx
->db_id
);
/*
 * Open a database through ctdbd and wrap it in a struct db_context.
 *
 * mem_ctx     talloc parent of the returned db_context.
 * hash_size   passed on to tdb_wrap_open() for the local tdb.
 * tdb_flags   passed unmodified to ctdbd_db_attach(); the absence of
 *             TDB_CLEAR_IF_FIRST marks the database as persistent.  Only
 *             TDB_SEQNUM is kept for the local tdb_wrap_open() call.
 * open_flags  only O_CREAT is examined in the visible code: if set, the
 *             database file is chmod()ed to mode.
 * mode        permission bits applied when O_CREAT is given.
 * lock_order  stored in the db_context and also sent to ctdbd as the
 *             database priority (CTDB_CONTROL_SET_DB_PRIORITY).
 *
 * Visible sequence: check lp_clustering(), allocate the db_context and the
 * private db_ctdb_ctx, obtain the ctdbd connection, attach to the
 * database, look up its path, set the priority, open the local tdb, create
 * a g_lock context for persistent databases, and finally install the
 * method table.  Every visible failure path frees result, which also
 * releases the db_ctdb_ctx allocated as its talloc child.
 *
 * NOTE(review): the name parameter, several local declarations (db_path,
 * status, cstatus), all return statements and part of the tdb_wrap_open()
 * argument list are not visible in this excerpt -- presumably the failure
 * paths return NULL and success returns result; confirm against the full
 * file.
 */
1491 struct db_context
*db_open_ctdb(TALLOC_CTX
*mem_ctx
,
1493 int hash_size
, int tdb_flags
,
1494 int open_flags
, mode_t mode
,
1495 enum dbwrap_lock_order lock_order
)
1497 struct db_context
*result
;
1498 struct db_ctdb_ctx
*db_ctdb
;
1500 struct ctdbd_connection
*conn
;
1501 struct loadparm_context
*lp_ctx
;
1502 struct ctdb_db_priority prio
;
1506 if (!lp_clustering()) {
1507 DEBUG(10, ("Clustering disabled -- no ctdb\n"));
/*
 * result is NULL inside this branch, so the TALLOC_FREE() below has
 * nothing to release.
 */
1511 if (!(result
= talloc_zero(mem_ctx
, struct db_context
))) {
1512 DEBUG(0, ("talloc failed\n"));
1513 TALLOC_FREE(result
);
/*
 * NOTE(review): talloc() does not zero the allocation.  In the visible
 * code transaction, db, db_id and wtdb are assigned, but lock_ctx is only
 * assigned for persistent databases -- confirm nothing reads it otherwise,
 * or consider talloc_zero().
 */
1517 if (!(db_ctdb
= talloc(result
, struct db_ctdb_ctx
))) {
1518 DEBUG(0, ("talloc failed\n"));
1519 TALLOC_FREE(result
);
1523 db_ctdb
->transaction
= NULL
;
1524 db_ctdb
->db
= result
;
/* Failure path for the connection; its condition is not visible here. */
1526 conn
= messaging_ctdbd_connection();
1528 DEBUG(1, ("Could not connect to ctdb\n"));
1529 TALLOC_FREE(result
);
/* Attach with the caller's original tdb_flags; db_id is filled in here. */
1533 if (!NT_STATUS_IS_OK(ctdbd_db_attach(conn
, name
, &db_ctdb
->db_id
, tdb_flags
))) {
1534 DEBUG(0, ("ctdbd_db_attach failed for %s\n", name
));
1535 TALLOC_FREE(result
);
/*
 * NOTE(review): no check of db_path is visible before it is handed to
 * chmod(), loadparm_init_s3() and tdb_wrap_open() -- confirm whether
 * ctdbd_dbpath() can fail and, if so, add a NULL check.
 */
1539 db_path
= ctdbd_dbpath(conn
, db_ctdb
, db_ctdb
->db_id
);
/* Databases opened without TDB_CLEAR_IF_FIRST are flagged persistent. */
1541 result
->persistent
= ((tdb_flags
& TDB_CLEAR_IF_FIRST
) == 0);
1542 result
->lock_order
= lock_order
;
1544 /* only pass through specific flags */
1545 tdb_flags
&= TDB_SEQNUM
;
1547 /* honor permissions if user has specified O_CREAT */
/* NOTE(review): the return value of chmod() is ignored. */
1548 if (open_flags
& O_CREAT
) {
1549 chmod(db_path
, mode
);
/* Tell ctdbd which priority (the dbwrap lock order) this database has. */
1552 prio
.db_id
= db_ctdb
->db_id
;
1553 prio
.priority
= lock_order
;
1555 status
= ctdbd_control_local(
1556 conn
, CTDB_CONTROL_SET_DB_PRIORITY
, 0, 0,
1557 make_tdb_data((uint8_t *)&prio
, sizeof(prio
)),
1558 NULL
, NULL
, &cstatus
);
/* Both the transport status and the control's own status must succeed. */
1560 if (!NT_STATUS_IS_OK(status
) || (cstatus
!= 0)) {
1561 DEBUG(1, ("CTDB_CONTROL_SET_DB_PRIORITY failed: %s, %d\n",
1562 nt_errstr(status
), cstatus
));
1563 TALLOC_FREE(result
);
/*
 * The loadparm context is created with db_path as its talloc parent and is
 * unlinked from db_path again right after tdb_wrap_open().
 */
1567 lp_ctx
= loadparm_init_s3(db_path
, loadparm_s3_helpers());
1569 db_ctdb
->wtdb
= tdb_wrap_open(db_ctdb
, db_path
, hash_size
, tdb_flags
,
1571 talloc_unlink(db_path
, lp_ctx
);
1572 if (db_ctdb
->wtdb
== NULL
) {
1573 DEBUG(0, ("Could not open tdb %s: %s\n", db_path
, strerror(errno
)));
1574 TALLOC_FREE(result
);
/* The path is no longer needed once the tdb has been opened. */
1577 talloc_free(db_path
);
/* Only persistent databases get a g_lock context. */
1579 if (result
->persistent
) {
1580 db_ctdb
->lock_ctx
= g_lock_ctx_init(db_ctdb
,
1581 ctdb_conn_msg_ctx(conn
));
1582 if (db_ctdb
->lock_ctx
== NULL
) {
1583 DEBUG(0, ("g_lock_ctx_init failed\n"));
1584 TALLOC_FREE(result
);
/* Install the ctdb backend's methods on the generic db_context. */
1589 result
->private_data
= (void *)db_ctdb
;
1590 result
->fetch_locked
= db_ctdb_fetch_locked
;
1591 result
->try_fetch_locked
= db_ctdb_try_fetch_locked
;
1592 result
->parse_record
= db_ctdb_parse_record
;
1593 result
->traverse
= db_ctdb_traverse
;
1594 result
->traverse_read
= db_ctdb_traverse_read
;
1595 result
->get_seqnum
= db_ctdb_get_seqnum
;
1596 result
->transaction_start
= db_ctdb_transaction_start
;
1597 result
->transaction_commit
= db_ctdb_transaction_commit
;
1598 result
->transaction_cancel
= db_ctdb_transaction_cancel
;
1599 result
->id
= db_ctdb_id
;
1600 result
->stored_callback
= NULL
;
1602 DEBUG(3,("db_open_ctdb: opened database '%s' with dbid 0x%x\n",
1603 name
, db_ctdb
->db_id
));
1608 #else /* CLUSTER_SUPPORT */
1610 struct db_context
*db_open_ctdb(TALLOC_CTX
*mem_ctx
,
1612 int hash_size
, int tdb_flags
,
1613 int open_flags
, mode_t mode
,
1614 enum dbwrap_lock_order lock_order
)
1616 DEBUG(3, ("db_open_ctdb: no cluster support!\n"));