2 Unix SMB/CIFS implementation.
3 Database interface wrapper around ctdbd
4 Copyright (C) Volker Lendecke 2007-2009
5 Copyright (C) Michael Adam 2009
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #ifdef CLUSTER_SUPPORT
24 #include "ctdb_private.h"
25 #include "ctdbd_conn.h"
28 struct db_ctdb_transaction_handle
{
29 struct db_ctdb_ctx
*ctx
;
31 * we store the reads and writes done under a transaction:
32 * - one list stores both reads and writes (m_all),
33 * - the other just writes (m_write)
35 struct ctdb_marshall_buffer
*m_all
;
36 struct ctdb_marshall_buffer
*m_write
;
43 struct db_context
*db
;
44 struct tdb_wrap
*wtdb
;
46 struct db_ctdb_transaction_handle
*transaction
;
47 struct g_lock_ctx
*lock_ctx
;
51 struct db_ctdb_ctx
*ctdb_ctx
;
52 struct ctdb_ltdb_header header
;
53 struct timeval lock_time
;
56 static NTSTATUS
tdb_error_to_ntstatus(struct tdb_context
*tdb
)
59 enum TDB_ERROR tret
= tdb_error(tdb
);
63 status
= NT_STATUS_OBJECT_NAME_COLLISION
;
66 status
= NT_STATUS_OBJECT_NAME_NOT_FOUND
;
69 status
= NT_STATUS_INTERNAL_DB_CORRUPTION
;
78 * fetch a record from the tdb, separating out the header
79 * information and returning the body of the record.
81 static NTSTATUS
db_ctdb_ltdb_fetch(struct db_ctdb_ctx
*db
,
83 struct ctdb_ltdb_header
*header
,
90 rec
= tdb_fetch(db
->wtdb
->tdb
, key
);
91 if (rec
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
92 status
= NT_STATUS_NOT_FOUND
;
97 header
->dmaster
= (uint32_t)-1;
104 *header
= *(struct ctdb_ltdb_header
*)rec
.dptr
;
108 data
->dsize
= rec
.dsize
- sizeof(struct ctdb_ltdb_header
);
109 if (data
->dsize
== 0) {
112 data
->dptr
= (unsigned char *)talloc_memdup(mem_ctx
,
114 + sizeof(struct ctdb_ltdb_header
),
116 if (data
->dptr
== NULL
) {
117 status
= NT_STATUS_NO_MEMORY
;
123 status
= NT_STATUS_OK
;
131 * Store a record together with the ctdb record header
132 * in the local copy of the database.
134 static NTSTATUS
db_ctdb_ltdb_store(struct db_ctdb_ctx
*db
,
136 struct ctdb_ltdb_header
*header
,
139 TALLOC_CTX
*tmp_ctx
= talloc_stackframe();
143 rec
.dsize
= data
.dsize
+ sizeof(struct ctdb_ltdb_header
);
144 rec
.dptr
= (uint8_t *)talloc_size(tmp_ctx
, rec
.dsize
);
146 if (rec
.dptr
== NULL
) {
147 talloc_free(tmp_ctx
);
148 return NT_STATUS_NO_MEMORY
;
151 memcpy(rec
.dptr
, header
, sizeof(struct ctdb_ltdb_header
));
152 memcpy(sizeof(struct ctdb_ltdb_header
) + (uint8_t *)rec
.dptr
, data
.dptr
, data
.dsize
);
154 ret
= tdb_store(db
->wtdb
->tdb
, key
, rec
, TDB_REPLACE
);
156 talloc_free(tmp_ctx
);
158 return (ret
== 0) ? NT_STATUS_OK
159 : tdb_error_to_ntstatus(db
->wtdb
->tdb
);
164 form a ctdb_rec_data record from a key/data pair
166 note that header may be NULL. If not NULL then it is included in the data portion
169 static struct ctdb_rec_data
*db_ctdb_marshall_record(TALLOC_CTX
*mem_ctx
, uint32_t reqid
,
171 struct ctdb_ltdb_header
*header
,
175 struct ctdb_rec_data
*d
;
177 length
= offsetof(struct ctdb_rec_data
, data
) + key
.dsize
+
178 data
.dsize
+ (header
?sizeof(*header
):0);
179 d
= (struct ctdb_rec_data
*)talloc_size(mem_ctx
, length
);
185 d
->keylen
= key
.dsize
;
186 memcpy(&d
->data
[0], key
.dptr
, key
.dsize
);
188 d
->datalen
= data
.dsize
+ sizeof(*header
);
189 memcpy(&d
->data
[key
.dsize
], header
, sizeof(*header
));
190 memcpy(&d
->data
[key
.dsize
+sizeof(*header
)], data
.dptr
, data
.dsize
);
192 d
->datalen
= data
.dsize
;
193 memcpy(&d
->data
[key
.dsize
], data
.dptr
, data
.dsize
);
199 /* helper function for marshalling multiple records */
200 static struct ctdb_marshall_buffer
*db_ctdb_marshall_add(TALLOC_CTX
*mem_ctx
,
201 struct ctdb_marshall_buffer
*m
,
205 struct ctdb_ltdb_header
*header
,
208 struct ctdb_rec_data
*r
;
209 size_t m_size
, r_size
;
210 struct ctdb_marshall_buffer
*m2
= NULL
;
212 r
= db_ctdb_marshall_record(talloc_tos(), reqid
, key
, header
, data
);
219 m
= (struct ctdb_marshall_buffer
*)talloc_zero_size(
220 mem_ctx
, offsetof(struct ctdb_marshall_buffer
, data
));
227 m_size
= talloc_get_size(m
);
228 r_size
= talloc_get_size(r
);
230 m2
= (struct ctdb_marshall_buffer
*)talloc_realloc_size(
231 mem_ctx
, m
, m_size
+ r_size
);
237 memcpy(m_size
+ (uint8_t *)m2
, r
, r_size
);
246 /* we've finished marshalling, return a data blob with the marshalled records */
247 static TDB_DATA
db_ctdb_marshall_finish(struct ctdb_marshall_buffer
*m
)
250 data
.dptr
= (uint8_t *)m
;
251 data
.dsize
= talloc_get_size(m
);
256 loop over a marshalling buffer
258 - pass r==NULL to start
259 - loop the number of times indicated by m->count
261 static struct ctdb_rec_data
*db_ctdb_marshall_loop_next(struct ctdb_marshall_buffer
*m
, struct ctdb_rec_data
*r
,
263 struct ctdb_ltdb_header
*header
,
264 TDB_DATA
*key
, TDB_DATA
*data
)
267 r
= (struct ctdb_rec_data
*)&m
->data
[0];
269 r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
277 key
->dptr
= &r
->data
[0];
278 key
->dsize
= r
->keylen
;
281 data
->dptr
= &r
->data
[r
->keylen
];
282 data
->dsize
= r
->datalen
;
283 if (header
!= NULL
) {
284 data
->dptr
+= sizeof(*header
);
285 data
->dsize
-= sizeof(*header
);
289 if (header
!= NULL
) {
290 if (r
->datalen
< sizeof(*header
)) {
293 *header
= *(struct ctdb_ltdb_header
*)&r
->data
[r
->keylen
];
300 * CTDB transaction destructor
302 static int db_ctdb_transaction_destructor(struct db_ctdb_transaction_handle
*h
)
306 status
= g_lock_unlock(h
->ctx
->lock_ctx
, h
->lock_name
);
307 if (!NT_STATUS_IS_OK(status
)) {
308 DEBUG(0, ("g_lock_unlock failed: %s\n", nt_errstr(status
)));
315 * CTDB dbwrap API: transaction_start function
316 * starts a transaction on a persistent database
318 static int db_ctdb_transaction_start(struct db_context
*db
)
320 struct db_ctdb_transaction_handle
*h
;
322 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
325 if (!db
->persistent
) {
326 DEBUG(0,("transactions not supported on non-persistent database 0x%08x\n",
331 if (ctx
->transaction
) {
332 ctx
->transaction
->nesting
++;
336 h
= talloc_zero(db
, struct db_ctdb_transaction_handle
);
338 DEBUG(0,(__location__
" oom for transaction handle\n"));
344 h
->lock_name
= talloc_asprintf(h
, "transaction_db_0x%08x",
345 (unsigned int)ctx
->db_id
);
346 if (h
->lock_name
== NULL
) {
347 DEBUG(0, ("talloc_asprintf failed\n"));
353 * Wait a day, i.e. forever...
355 status
= g_lock_lock(ctx
->lock_ctx
, h
->lock_name
, G_LOCK_WRITE
,
356 timeval_set(86400, 0));
357 if (!NT_STATUS_IS_OK(status
)) {
358 DEBUG(0, ("g_lock_lock failed: %s\n", nt_errstr(status
)));
363 talloc_set_destructor(h
, db_ctdb_transaction_destructor
);
365 ctx
->transaction
= h
;
367 DEBUG(5,(__location__
" Started transaction on db 0x%08x\n", ctx
->db_id
));
372 static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer
*buf
,
374 struct ctdb_ltdb_header
*pheader
,
378 struct ctdb_rec_data
*rec
= NULL
;
379 struct ctdb_ltdb_header h
;
392 * Walk the list of records written during this
393 * transaction. If we want to read one we have already
394 * written, return the last written sample. Thus we do not do
395 * a "break;" for the first hit, this record might have been
399 for (i
=0; i
<buf
->count
; i
++) {
400 TDB_DATA tkey
, tdata
;
402 struct ctdb_ltdb_header hdr
;
406 rec
= db_ctdb_marshall_loop_next(buf
, rec
, &reqid
, &hdr
, &tkey
,
412 if (tdb_data_equal(key
, tkey
)) {
424 data
.dptr
= (uint8_t *)talloc_memdup(mem_ctx
, data
.dptr
,
426 if ((data
.dsize
!= 0) && (data
.dptr
== NULL
)) {
432 if (pheader
!= NULL
) {
440 fetch a record inside a transaction
442 static int db_ctdb_transaction_fetch(struct db_ctdb_ctx
*db
,
444 TDB_DATA key
, TDB_DATA
*data
)
446 struct db_ctdb_transaction_handle
*h
= db
->transaction
;
450 found
= pull_newest_from_marshall_buffer(h
->m_write
, key
, NULL
,
456 status
= db_ctdb_ltdb_fetch(h
->ctx
, key
, NULL
, mem_ctx
, data
);
458 if (NT_STATUS_EQUAL(status
, NT_STATUS_NOT_FOUND
)) {
460 } else if (!NT_STATUS_IS_OK(status
)) {
464 h
->m_all
= db_ctdb_marshall_add(h
, h
->m_all
, h
->ctx
->db_id
, 1, key
,
466 if (h
->m_all
== NULL
) {
467 DEBUG(0,(__location__
" Failed to add to marshalling "
470 talloc_free(data
->dptr
);
478 static NTSTATUS
db_ctdb_store_transaction(struct db_record
*rec
, TDB_DATA data
, int flag
);
479 static NTSTATUS
db_ctdb_delete_transaction(struct db_record
*rec
);
481 static struct db_record
*db_ctdb_fetch_locked_transaction(struct db_ctdb_ctx
*ctx
,
485 struct db_record
*result
;
488 if (!(result
= talloc(mem_ctx
, struct db_record
))) {
489 DEBUG(0, ("talloc failed\n"));
493 result
->private_data
= ctx
->transaction
;
495 result
->key
.dsize
= key
.dsize
;
496 result
->key
.dptr
= (uint8
*)talloc_memdup(result
, key
.dptr
, key
.dsize
);
497 if (result
->key
.dptr
== NULL
) {
498 DEBUG(0, ("talloc failed\n"));
503 result
->store
= db_ctdb_store_transaction
;
504 result
->delete_rec
= db_ctdb_delete_transaction
;
506 if (pull_newest_from_marshall_buffer(ctx
->transaction
->m_write
, key
,
507 NULL
, result
, &result
->value
)) {
511 ctdb_data
= tdb_fetch(ctx
->wtdb
->tdb
, key
);
512 if (ctdb_data
.dptr
== NULL
) {
513 /* create the record */
514 result
->value
= tdb_null
;
518 result
->value
.dsize
= ctdb_data
.dsize
- sizeof(struct ctdb_ltdb_header
);
519 result
->value
.dptr
= NULL
;
521 if ((result
->value
.dsize
!= 0)
522 && !(result
->value
.dptr
= (uint8
*)talloc_memdup(
523 result
, ctdb_data
.dptr
+ sizeof(struct ctdb_ltdb_header
),
524 result
->value
.dsize
))) {
525 DEBUG(0, ("talloc failed\n"));
529 SAFE_FREE(ctdb_data
.dptr
);
534 static int db_ctdb_record_destructor(struct db_record
**recp
)
536 struct db_record
*rec
= talloc_get_type_abort(*recp
, struct db_record
);
537 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
538 rec
->private_data
, struct db_ctdb_transaction_handle
);
539 int ret
= h
->ctx
->db
->transaction_commit(h
->ctx
->db
);
541 DEBUG(0,(__location__
" transaction_commit failed\n"));
547 auto-create a transaction for persistent databases
549 static struct db_record
*db_ctdb_fetch_locked_persistent(struct db_ctdb_ctx
*ctx
,
554 struct db_record
*rec
, **recp
;
556 res
= db_ctdb_transaction_start(ctx
->db
);
561 rec
= db_ctdb_fetch_locked_transaction(ctx
, mem_ctx
, key
);
563 ctx
->db
->transaction_cancel(ctx
->db
);
567 /* destroy this transaction when we release the lock */
568 recp
= talloc(rec
, struct db_record
*);
570 ctx
->db
->transaction_cancel(ctx
->db
);
575 talloc_set_destructor(recp
, db_ctdb_record_destructor
);
581 stores a record inside a transaction
583 static NTSTATUS
db_ctdb_transaction_store(struct db_ctdb_transaction_handle
*h
,
584 TDB_DATA key
, TDB_DATA data
)
586 TALLOC_CTX
*tmp_ctx
= talloc_new(h
);
588 struct ctdb_ltdb_header header
;
592 /* we need the header so we can update the RSN */
594 if (!pull_newest_from_marshall_buffer(h
->m_write
, key
, &header
,
597 rec
= tdb_fetch(h
->ctx
->wtdb
->tdb
, key
);
599 if (rec
.dptr
!= NULL
) {
600 memcpy(&header
, rec
.dptr
,
601 sizeof(struct ctdb_ltdb_header
));
602 rec
.dsize
-= sizeof(struct ctdb_ltdb_header
);
605 * a special case, we are writing the same
606 * data that is there now
608 if (data
.dsize
== rec
.dsize
&&
610 rec
.dptr
+ sizeof(struct ctdb_ltdb_header
),
613 talloc_free(tmp_ctx
);
620 header
.dmaster
= get_my_vnn();
623 h
->m_all
= db_ctdb_marshall_add(h
, h
->m_all
, h
->ctx
->db_id
, 0, key
,
625 if (h
->m_all
== NULL
) {
626 DEBUG(0,(__location__
" Failed to add to marshalling "
628 talloc_free(tmp_ctx
);
629 return NT_STATUS_NO_MEMORY
;
632 h
->m_write
= db_ctdb_marshall_add(h
, h
->m_write
, h
->ctx
->db_id
, 0, key
, &header
, data
);
633 if (h
->m_write
== NULL
) {
634 DEBUG(0,(__location__
" Failed to add to marshalling record\n"));
635 talloc_free(tmp_ctx
);
636 return NT_STATUS_NO_MEMORY
;
639 talloc_free(tmp_ctx
);
645 a record store inside a transaction
647 static NTSTATUS
db_ctdb_store_transaction(struct db_record
*rec
, TDB_DATA data
, int flag
)
649 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
650 rec
->private_data
, struct db_ctdb_transaction_handle
);
653 status
= db_ctdb_transaction_store(h
, rec
->key
, data
);
658 a record delete inside a transaction
660 static NTSTATUS
db_ctdb_delete_transaction(struct db_record
*rec
)
662 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
663 rec
->private_data
, struct db_ctdb_transaction_handle
);
666 status
= db_ctdb_transaction_store(h
, rec
->key
, tdb_null
);
671 * Fetch the db sequence number of a persistent db directly from the db.
673 static NTSTATUS
db_ctdb_fetch_db_seqnum_from_db(struct db_ctdb_ctx
*db
,
677 const char *keyname
= CTDB_DB_SEQNUM_KEY
;
680 struct ctdb_ltdb_header header
;
681 TALLOC_CTX
*mem_ctx
= talloc_stackframe();
683 if (seqnum
== NULL
) {
684 return NT_STATUS_INVALID_PARAMETER
;
687 key
= string_term_tdb_data(keyname
);
689 status
= db_ctdb_ltdb_fetch(db
, key
, &header
, mem_ctx
, &data
);
690 if (!NT_STATUS_IS_OK(status
) &&
691 !NT_STATUS_EQUAL(status
, NT_STATUS_NOT_FOUND
))
696 status
= NT_STATUS_OK
;
698 if (data
.dsize
!= sizeof(uint64_t)) {
703 *seqnum
= *(uint64_t *)data
.dptr
;
706 TALLOC_FREE(mem_ctx
);
711 * Store the database sequence number inside a transaction.
713 static NTSTATUS
db_ctdb_store_db_seqnum(struct db_ctdb_transaction_handle
*h
,
717 const char *keyname
= CTDB_DB_SEQNUM_KEY
;
721 key
= string_term_tdb_data(keyname
);
723 data
.dptr
= (uint8_t *)&seqnum
;
724 data
.dsize
= sizeof(uint64_t);
726 status
= db_ctdb_transaction_store(h
, key
, data
);
734 static int db_ctdb_transaction_commit(struct db_context
*db
)
736 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
740 struct db_ctdb_transaction_handle
*h
= ctx
->transaction
;
741 uint64_t old_seqnum
, new_seqnum
;
745 DEBUG(0,(__location__
" transaction commit with no open transaction on db 0x%08x\n", ctx
->db_id
));
749 if (h
->nested_cancel
) {
750 db
->transaction_cancel(db
);
751 DEBUG(5,(__location__
" Failed transaction commit after nested cancel\n"));
755 if (h
->nesting
!= 0) {
760 if (h
->m_write
== NULL
) {
762 * No changes were made, so don't change the seqnum,
763 * don't push to other node, just exit with success.
769 DEBUG(5,(__location__
" Commit transaction on db 0x%08x\n", ctx
->db_id
));
772 * As the last db action before committing, bump the database sequence
773 * number. Note that this undoes all changes to the seqnum records
774 * performed under the transaction. This record is not meant to be
775 * modified by user interaction. It is for internal use only...
777 rets
= db_ctdb_fetch_db_seqnum_from_db(ctx
, &old_seqnum
);
778 if (!NT_STATUS_IS_OK(rets
)) {
779 DEBUG(1, (__location__
" failed to fetch the db sequence number "
780 "in transaction commit on db 0x%08x\n", ctx
->db_id
));
785 new_seqnum
= old_seqnum
+ 1;
787 rets
= db_ctdb_store_db_seqnum(h
, new_seqnum
);
788 if (!NT_STATUS_IS_OK(rets
)) {
789 DEBUG(1, (__location__
"failed to store the db sequence number "
790 " in transaction commit on db 0x%08x\n", ctx
->db_id
));
796 /* tell ctdbd to commit to the other nodes */
797 rets
= ctdbd_control_local(messaging_ctdbd_connection(procid_self()),
798 CTDB_CONTROL_TRANS3_COMMIT
,
800 db_ctdb_marshall_finish(h
->m_write
),
801 NULL
, NULL
, &status
);
802 if (!NT_STATUS_IS_OK(rets
) || status
!= 0) {
804 * The TRANS3_COMMIT control should only possibly fail when a
805 * recovery has been running concurrently. In any case, the db
806 * will be the same on all nodes, either the new copy or the
807 * old copy. This can be detected by comparing the old and new
808 * local sequence numbers.
810 rets
= db_ctdb_fetch_db_seqnum_from_db(ctx
, &new_seqnum
);
811 if (!NT_STATUS_IS_OK(rets
)) {
812 DEBUG(1, (__location__
" failed to refetch db sequence "
813 "number after failed TRANS3_COMMIT\n"));
818 if (new_seqnum
== old_seqnum
) {
819 /* Recovery prevented all our changes: retry. */
821 } else if (new_seqnum
!= (old_seqnum
+ 1)) {
822 DEBUG(0, (__location__
" ERROR: new_seqnum[%lu] != "
823 "old_seqnum[%lu] + (0 or 1) after failed "
824 "TRANS3_COMMIT - this should not happen!\n",
825 (unsigned long)new_seqnum
,
826 (unsigned long)old_seqnum
));
831 * Recovery propagated our changes to all nodes, completing
832 * our commit for us - succeed.
839 h
->ctx
->transaction
= NULL
;
848 static int db_ctdb_transaction_cancel(struct db_context
*db
)
850 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
852 struct db_ctdb_transaction_handle
*h
= ctx
->transaction
;
855 DEBUG(0,(__location__
" transaction cancel with no open transaction on db 0x%08x\n", ctx
->db_id
));
859 if (h
->nesting
!= 0) {
861 h
->nested_cancel
= true;
865 DEBUG(5,(__location__
" Cancel transaction on db 0x%08x\n", ctx
->db_id
));
867 ctx
->transaction
= NULL
;
873 static NTSTATUS
db_ctdb_store(struct db_record
*rec
, TDB_DATA data
, int flag
)
875 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
876 rec
->private_data
, struct db_ctdb_rec
);
878 return db_ctdb_ltdb_store(crec
->ctdb_ctx
, rec
->key
, &(crec
->header
), data
);
883 static NTSTATUS
db_ctdb_delete(struct db_record
*rec
)
888 * We have to store the header with empty data. TODO: Fix the
894 return db_ctdb_store(rec
, data
, 0);
898 static int db_ctdb_record_destr(struct db_record
* data
)
900 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
901 data
->private_data
, struct db_ctdb_rec
);
904 DEBUG(10, (DEBUGLEVEL
> 10
905 ? "Unlocking db %u key %s\n"
906 : "Unlocking db %u key %.20s\n",
907 (int)crec
->ctdb_ctx
->db_id
,
908 hex_encode_talloc(data
, (unsigned char *)data
->key
.dptr
,
911 if (tdb_chainunlock(crec
->ctdb_ctx
->wtdb
->tdb
, data
->key
) != 0) {
912 DEBUG(0, ("tdb_chainunlock failed\n"));
916 threshold
= lp_ctdb_locktime_warn_threshold();
917 if (threshold
!= 0) {
918 double timediff
= timeval_elapsed(&crec
->lock_time
);
919 if ((timediff
* 1000) > threshold
) {
920 DEBUG(0, ("Held tdb lock %f seconds\n", timediff
));
927 static struct db_record
*fetch_locked_internal(struct db_ctdb_ctx
*ctx
,
931 struct db_record
*result
;
932 struct db_ctdb_rec
*crec
;
935 int migrate_attempts
= 0;
937 if (!(result
= talloc(mem_ctx
, struct db_record
))) {
938 DEBUG(0, ("talloc failed\n"));
942 if (!(crec
= TALLOC_ZERO_P(result
, struct db_ctdb_rec
))) {
943 DEBUG(0, ("talloc failed\n"));
948 result
->private_data
= (void *)crec
;
949 crec
->ctdb_ctx
= ctx
;
951 result
->key
.dsize
= key
.dsize
;
952 result
->key
.dptr
= (uint8
*)talloc_memdup(result
, key
.dptr
, key
.dsize
);
953 if (result
->key
.dptr
== NULL
) {
954 DEBUG(0, ("talloc failed\n"));
960 * Do a blocking lock on the record
964 if (DEBUGLEVEL
>= 10) {
965 char *keystr
= hex_encode_talloc(result
, key
.dptr
, key
.dsize
);
966 DEBUG(10, (DEBUGLEVEL
> 10
967 ? "Locking db %u key %s\n"
968 : "Locking db %u key %.20s\n",
969 (int)crec
->ctdb_ctx
->db_id
, keystr
));
973 if (tdb_chainlock(ctx
->wtdb
->tdb
, key
) != 0) {
974 DEBUG(3, ("tdb_chainlock failed\n"));
979 result
->store
= db_ctdb_store
;
980 result
->delete_rec
= db_ctdb_delete
;
981 talloc_set_destructor(result
, db_ctdb_record_destr
);
983 ctdb_data
= tdb_fetch(ctx
->wtdb
->tdb
, key
);
986 * See if we have a valid record and we are the dmaster. If so, we can
987 * take the shortcut and just return it.
990 if ((ctdb_data
.dptr
== NULL
) ||
991 (ctdb_data
.dsize
< sizeof(struct ctdb_ltdb_header
)) ||
992 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->dmaster
!= get_my_vnn()
994 || (random() % 2 != 0)
997 SAFE_FREE(ctdb_data
.dptr
);
998 tdb_chainunlock(ctx
->wtdb
->tdb
, key
);
999 talloc_set_destructor(result
, NULL
);
1001 migrate_attempts
+= 1;
1003 DEBUG(10, ("ctdb_data.dptr = %p, dmaster = %u (%u)\n",
1004 ctdb_data
.dptr
, ctdb_data
.dptr
?
1005 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->dmaster
: -1,
1008 status
= ctdbd_migrate(
1009 messaging_ctdbd_connection(procid_self()), ctx
->db_id
,
1011 if (!NT_STATUS_IS_OK(status
)) {
1012 DEBUG(5, ("ctdb_migrate failed: %s\n",
1013 nt_errstr(status
)));
1014 TALLOC_FREE(result
);
1017 /* now it's migrated, try again */
1021 if (migrate_attempts
> 10) {
1022 DEBUG(0, ("db_ctdb_fetch_locked needed %d attempts\n",
1026 GetTimeOfDay(&crec
->lock_time
);
1028 memcpy(&crec
->header
, ctdb_data
.dptr
, sizeof(crec
->header
));
1030 result
->value
.dsize
= ctdb_data
.dsize
- sizeof(crec
->header
);
1031 result
->value
.dptr
= NULL
;
1033 if ((result
->value
.dsize
!= 0)
1034 && !(result
->value
.dptr
= (uint8
*)talloc_memdup(
1035 result
, ctdb_data
.dptr
+ sizeof(crec
->header
),
1036 result
->value
.dsize
))) {
1037 DEBUG(0, ("talloc failed\n"));
1038 TALLOC_FREE(result
);
1041 SAFE_FREE(ctdb_data
.dptr
);
1046 static struct db_record
*db_ctdb_fetch_locked(struct db_context
*db
,
1047 TALLOC_CTX
*mem_ctx
,
1050 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1051 struct db_ctdb_ctx
);
1053 if (ctx
->transaction
!= NULL
) {
1054 return db_ctdb_fetch_locked_transaction(ctx
, mem_ctx
, key
);
1057 if (db
->persistent
) {
1058 return db_ctdb_fetch_locked_persistent(ctx
, mem_ctx
, key
);
1061 return fetch_locked_internal(ctx
, mem_ctx
, key
);
1065 fetch (unlocked, no migration) operation on ctdb
1067 static int db_ctdb_fetch(struct db_context
*db
, TALLOC_CTX
*mem_ctx
,
1068 TDB_DATA key
, TDB_DATA
*data
)
1070 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1071 struct db_ctdb_ctx
);
1075 if (ctx
->transaction
) {
1076 return db_ctdb_transaction_fetch(ctx
, mem_ctx
, key
, data
);
1079 /* try a direct fetch */
1080 ctdb_data
= tdb_fetch(ctx
->wtdb
->tdb
, key
);
1083 * See if we have a valid record and we are the dmaster. If so, we can
1084 * take the shortcut and just return it.
1085 * we bypass the dmaster check for persistent databases
1087 if ((ctdb_data
.dptr
!= NULL
) &&
1088 (ctdb_data
.dsize
>= sizeof(struct ctdb_ltdb_header
)) &&
1090 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->dmaster
== get_my_vnn())) {
1091 /* we are the dmaster - avoid the ctdb protocol op */
1093 data
->dsize
= ctdb_data
.dsize
- sizeof(struct ctdb_ltdb_header
);
1094 if (data
->dsize
== 0) {
1095 SAFE_FREE(ctdb_data
.dptr
);
1100 data
->dptr
= (uint8
*)talloc_memdup(
1101 mem_ctx
, ctdb_data
.dptr
+sizeof(struct ctdb_ltdb_header
),
1104 SAFE_FREE(ctdb_data
.dptr
);
1106 if (data
->dptr
== NULL
) {
1112 SAFE_FREE(ctdb_data
.dptr
);
1114 /* we weren't able to get it locally - ask ctdb to fetch it for us */
1115 status
= ctdbd_fetch(messaging_ctdbd_connection(procid_self()),
1116 ctx
->db_id
, key
, mem_ctx
, data
);
1117 if (!NT_STATUS_IS_OK(status
)) {
1118 DEBUG(5, ("ctdbd_fetch failed: %s\n", nt_errstr(status
)));
1125 struct traverse_state
{
1126 struct db_context
*db
;
1127 int (*fn
)(struct db_record
*rec
, void *private_data
);
1131 static void traverse_callback(TDB_DATA key
, TDB_DATA data
, void *private_data
)
1133 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1134 struct db_record
*rec
;
1135 TALLOC_CTX
*tmp_ctx
= talloc_new(state
->db
);
1136 /* we have to give them a locked record to prevent races */
1137 rec
= db_ctdb_fetch_locked(state
->db
, tmp_ctx
, key
);
1138 if (rec
&& rec
->value
.dsize
> 0) {
1139 state
->fn(rec
, state
->private_data
);
1141 talloc_free(tmp_ctx
);
1144 static int traverse_persistent_callback(TDB_CONTEXT
*tdb
, TDB_DATA kbuf
, TDB_DATA dbuf
,
1147 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1148 struct db_record
*rec
;
1149 TALLOC_CTX
*tmp_ctx
= talloc_new(state
->db
);
1151 /* we have to give them a locked record to prevent races */
1152 rec
= db_ctdb_fetch_locked(state
->db
, tmp_ctx
, kbuf
);
1153 if (rec
&& rec
->value
.dsize
> 0) {
1154 ret
= state
->fn(rec
, state
->private_data
);
1156 talloc_free(tmp_ctx
);
1160 static int db_ctdb_traverse(struct db_context
*db
,
1161 int (*fn
)(struct db_record
*rec
,
1162 void *private_data
),
1165 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1166 struct db_ctdb_ctx
);
1167 struct traverse_state state
;
1171 state
.private_data
= private_data
;
1173 if (db
->persistent
) {
1174 /* for persistent databases we don't need to do a ctdb traverse,
1175 we can do a faster local traverse */
1176 return tdb_traverse(ctx
->wtdb
->tdb
, traverse_persistent_callback
, &state
);
1180 ctdbd_traverse(ctx
->db_id
, traverse_callback
, &state
);
1184 static NTSTATUS
db_ctdb_store_deny(struct db_record
*rec
, TDB_DATA data
, int flag
)
1186 return NT_STATUS_MEDIA_WRITE_PROTECTED
;
1189 static NTSTATUS
db_ctdb_delete_deny(struct db_record
*rec
)
1191 return NT_STATUS_MEDIA_WRITE_PROTECTED
;
1194 static void traverse_read_callback(TDB_DATA key
, TDB_DATA data
, void *private_data
)
1196 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1197 struct db_record rec
;
1200 rec
.store
= db_ctdb_store_deny
;
1201 rec
.delete_rec
= db_ctdb_delete_deny
;
1202 rec
.private_data
= state
->db
;
1203 state
->fn(&rec
, state
->private_data
);
1206 static int traverse_persistent_callback_read(TDB_CONTEXT
*tdb
, TDB_DATA kbuf
, TDB_DATA dbuf
,
1209 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1210 struct db_record rec
;
1213 rec
.store
= db_ctdb_store_deny
;
1214 rec
.delete_rec
= db_ctdb_delete_deny
;
1215 rec
.private_data
= state
->db
;
1217 if (rec
.value
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1218 /* a deleted record */
1221 rec
.value
.dsize
-= sizeof(struct ctdb_ltdb_header
);
1222 rec
.value
.dptr
+= sizeof(struct ctdb_ltdb_header
);
1224 return state
->fn(&rec
, state
->private_data
);
1227 static int db_ctdb_traverse_read(struct db_context
*db
,
1228 int (*fn
)(struct db_record
*rec
,
1229 void *private_data
),
1232 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1233 struct db_ctdb_ctx
);
1234 struct traverse_state state
;
1238 state
.private_data
= private_data
;
1240 if (db
->persistent
) {
1241 /* for persistent databases we don't need to do a ctdb traverse,
1242 we can do a faster local traverse */
1243 return tdb_traverse_read(ctx
->wtdb
->tdb
, traverse_persistent_callback_read
, &state
);
1246 ctdbd_traverse(ctx
->db_id
, traverse_read_callback
, &state
);
1250 static int db_ctdb_get_seqnum(struct db_context
*db
)
1252 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1253 struct db_ctdb_ctx
);
1254 return tdb_get_seqnum(ctx
->wtdb
->tdb
);
1257 static int db_ctdb_get_flags(struct db_context
*db
)
1259 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1260 struct db_ctdb_ctx
);
1261 return tdb_get_flags(ctx
->wtdb
->tdb
);
1264 struct db_context
*db_open_ctdb(TALLOC_CTX
*mem_ctx
,
1266 int hash_size
, int tdb_flags
,
1267 int open_flags
, mode_t mode
)
1269 struct db_context
*result
;
1270 struct db_ctdb_ctx
*db_ctdb
;
1272 struct ctdbd_connection
*conn
;
1274 if (!lp_clustering()) {
1275 DEBUG(10, ("Clustering disabled -- no ctdb\n"));
1279 if (!(result
= TALLOC_ZERO_P(mem_ctx
, struct db_context
))) {
1280 DEBUG(0, ("talloc failed\n"));
1281 TALLOC_FREE(result
);
1285 if (!(db_ctdb
= TALLOC_P(result
, struct db_ctdb_ctx
))) {
1286 DEBUG(0, ("talloc failed\n"));
1287 TALLOC_FREE(result
);
1291 db_ctdb
->transaction
= NULL
;
1292 db_ctdb
->db
= result
;
1294 conn
= messaging_ctdbd_connection(procid_self());
1296 if (!NT_STATUS_IS_OK(ctdbd_db_attach(conn
, name
, &db_ctdb
->db_id
, tdb_flags
))) {
1297 DEBUG(0, ("ctdbd_db_attach failed for %s\n", name
));
1298 TALLOC_FREE(result
);
1302 db_path
= ctdbd_dbpath(conn
, db_ctdb
, db_ctdb
->db_id
);
1304 result
->persistent
= ((tdb_flags
& TDB_CLEAR_IF_FIRST
) == 0);
1306 /* only pass through specific flags */
1307 tdb_flags
&= TDB_SEQNUM
;
1309 /* honor permissions if user has specified O_CREAT */
1310 if (open_flags
& O_CREAT
) {
1311 chmod(db_path
, mode
);
1314 db_ctdb
->wtdb
= tdb_wrap_open(db_ctdb
, db_path
, hash_size
, tdb_flags
, O_RDWR
, 0);
1315 if (db_ctdb
->wtdb
== NULL
) {
1316 DEBUG(0, ("Could not open tdb %s: %s\n", db_path
, strerror(errno
)));
1317 TALLOC_FREE(result
);
1320 talloc_free(db_path
);
1322 if (result
->persistent
) {
1323 db_ctdb
->lock_ctx
= g_lock_ctx_init(db_ctdb
,
1324 ctdb_conn_msg_ctx(conn
));
1325 if (db_ctdb
->lock_ctx
== NULL
) {
1326 DEBUG(0, ("g_lock_ctx_init failed\n"));
1327 TALLOC_FREE(result
);
1332 result
->private_data
= (void *)db_ctdb
;
1333 result
->fetch_locked
= db_ctdb_fetch_locked
;
1334 result
->fetch
= db_ctdb_fetch
;
1335 result
->traverse
= db_ctdb_traverse
;
1336 result
->traverse_read
= db_ctdb_traverse_read
;
1337 result
->get_seqnum
= db_ctdb_get_seqnum
;
1338 result
->get_flags
= db_ctdb_get_flags
;
1339 result
->transaction_start
= db_ctdb_transaction_start
;
1340 result
->transaction_commit
= db_ctdb_transaction_commit
;
1341 result
->transaction_cancel
= db_ctdb_transaction_cancel
;
1343 DEBUG(3,("db_open_ctdb: opened database '%s' with dbid 0x%x\n",
1344 name
, db_ctdb
->db_id
));