2 Unix SMB/CIFS implementation.
3 Database interface wrapper around ctdbd
4 Copyright (C) Volker Lendecke 2007-2009
5 Copyright (C) Michael Adam 2009
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "system/filesys.h"
23 #include "lib/util/tdb_wrap.h"
26 #ifdef CLUSTER_SUPPORT
28 #include "ctdb_private.h"
29 #include "ctdbd_conn.h"
33 struct db_ctdb_transaction_handle
{
34 struct db_ctdb_ctx
*ctx
;
36 * we store the reads and writes done under a transaction:
37 * - one list stores both reads and writes (m_all),
38 * - the other just writes (m_write)
40 struct ctdb_marshall_buffer
*m_all
;
41 struct ctdb_marshall_buffer
*m_write
;
48 struct db_context
*db
;
49 struct tdb_wrap
*wtdb
;
51 struct db_ctdb_transaction_handle
*transaction
;
52 struct g_lock_ctx
*lock_ctx
;
56 struct db_ctdb_ctx
*ctdb_ctx
;
57 struct ctdb_ltdb_header header
;
58 struct timeval lock_time
;
61 static NTSTATUS
tdb_error_to_ntstatus(struct tdb_context
*tdb
)
64 enum TDB_ERROR tret
= tdb_error(tdb
);
68 status
= NT_STATUS_OBJECT_NAME_COLLISION
;
71 status
= NT_STATUS_OBJECT_NAME_NOT_FOUND
;
74 status
= NT_STATUS_INTERNAL_DB_CORRUPTION
;
83 * fetch a record from the tdb, separating out the header
84 * information and returning the body of the record.
86 static NTSTATUS
db_ctdb_ltdb_fetch(struct db_ctdb_ctx
*db
,
88 struct ctdb_ltdb_header
*header
,
95 rec
= tdb_fetch(db
->wtdb
->tdb
, key
);
96 if (rec
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
97 status
= NT_STATUS_NOT_FOUND
;
102 header
->dmaster
= (uint32_t)-1;
109 *header
= *(struct ctdb_ltdb_header
*)rec
.dptr
;
113 data
->dsize
= rec
.dsize
- sizeof(struct ctdb_ltdb_header
);
114 if (data
->dsize
== 0) {
117 data
->dptr
= (unsigned char *)talloc_memdup(mem_ctx
,
119 + sizeof(struct ctdb_ltdb_header
),
121 if (data
->dptr
== NULL
) {
122 status
= NT_STATUS_NO_MEMORY
;
128 status
= NT_STATUS_OK
;
136 * Store a record together with the ctdb record header
137 * in the local copy of the database.
139 static NTSTATUS
db_ctdb_ltdb_store(struct db_ctdb_ctx
*db
,
141 struct ctdb_ltdb_header
*header
,
144 TALLOC_CTX
*tmp_ctx
= talloc_stackframe();
148 rec
.dsize
= data
.dsize
+ sizeof(struct ctdb_ltdb_header
);
149 rec
.dptr
= (uint8_t *)talloc_size(tmp_ctx
, rec
.dsize
);
151 if (rec
.dptr
== NULL
) {
152 talloc_free(tmp_ctx
);
153 return NT_STATUS_NO_MEMORY
;
156 memcpy(rec
.dptr
, header
, sizeof(struct ctdb_ltdb_header
));
157 memcpy(sizeof(struct ctdb_ltdb_header
) + (uint8_t *)rec
.dptr
, data
.dptr
, data
.dsize
);
159 ret
= tdb_store(db
->wtdb
->tdb
, key
, rec
, TDB_REPLACE
);
161 talloc_free(tmp_ctx
);
163 return (ret
== 0) ? NT_STATUS_OK
164 : tdb_error_to_ntstatus(db
->wtdb
->tdb
);
169 form a ctdb_rec_data record from a key/data pair
171 note that header may be NULL. If not NULL then it is included in the data portion
174 static struct ctdb_rec_data
*db_ctdb_marshall_record(TALLOC_CTX
*mem_ctx
, uint32_t reqid
,
176 struct ctdb_ltdb_header
*header
,
180 struct ctdb_rec_data
*d
;
182 length
= offsetof(struct ctdb_rec_data
, data
) + key
.dsize
+
183 data
.dsize
+ (header
?sizeof(*header
):0);
184 d
= (struct ctdb_rec_data
*)talloc_size(mem_ctx
, length
);
190 d
->keylen
= key
.dsize
;
191 memcpy(&d
->data
[0], key
.dptr
, key
.dsize
);
193 d
->datalen
= data
.dsize
+ sizeof(*header
);
194 memcpy(&d
->data
[key
.dsize
], header
, sizeof(*header
));
195 memcpy(&d
->data
[key
.dsize
+sizeof(*header
)], data
.dptr
, data
.dsize
);
197 d
->datalen
= data
.dsize
;
198 memcpy(&d
->data
[key
.dsize
], data
.dptr
, data
.dsize
);
204 /* helper function for marshalling multiple records */
205 static struct ctdb_marshall_buffer
*db_ctdb_marshall_add(TALLOC_CTX
*mem_ctx
,
206 struct ctdb_marshall_buffer
*m
,
210 struct ctdb_ltdb_header
*header
,
213 struct ctdb_rec_data
*r
;
214 size_t m_size
, r_size
;
215 struct ctdb_marshall_buffer
*m2
= NULL
;
217 r
= db_ctdb_marshall_record(talloc_tos(), reqid
, key
, header
, data
);
224 m
= (struct ctdb_marshall_buffer
*)talloc_zero_size(
225 mem_ctx
, offsetof(struct ctdb_marshall_buffer
, data
));
232 m_size
= talloc_get_size(m
);
233 r_size
= talloc_get_size(r
);
235 m2
= (struct ctdb_marshall_buffer
*)talloc_realloc_size(
236 mem_ctx
, m
, m_size
+ r_size
);
242 memcpy(m_size
+ (uint8_t *)m2
, r
, r_size
);
251 /* we've finished marshalling, return a data blob with the marshalled records */
252 static TDB_DATA
db_ctdb_marshall_finish(struct ctdb_marshall_buffer
*m
)
255 data
.dptr
= (uint8_t *)m
;
256 data
.dsize
= talloc_get_size(m
);
261 loop over a marshalling buffer
263 - pass r==NULL to start
264 - loop the number of times indicated by m->count
266 static struct ctdb_rec_data
*db_ctdb_marshall_loop_next(struct ctdb_marshall_buffer
*m
, struct ctdb_rec_data
*r
,
268 struct ctdb_ltdb_header
*header
,
269 TDB_DATA
*key
, TDB_DATA
*data
)
272 r
= (struct ctdb_rec_data
*)&m
->data
[0];
274 r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
282 key
->dptr
= &r
->data
[0];
283 key
->dsize
= r
->keylen
;
286 data
->dptr
= &r
->data
[r
->keylen
];
287 data
->dsize
= r
->datalen
;
288 if (header
!= NULL
) {
289 data
->dptr
+= sizeof(*header
);
290 data
->dsize
-= sizeof(*header
);
294 if (header
!= NULL
) {
295 if (r
->datalen
< sizeof(*header
)) {
298 *header
= *(struct ctdb_ltdb_header
*)&r
->data
[r
->keylen
];
305 * CTDB transaction destructor
307 static int db_ctdb_transaction_destructor(struct db_ctdb_transaction_handle
*h
)
311 status
= g_lock_unlock(h
->ctx
->lock_ctx
, h
->lock_name
);
312 if (!NT_STATUS_IS_OK(status
)) {
313 DEBUG(0, ("g_lock_unlock failed: %s\n", nt_errstr(status
)));
320 * CTDB dbwrap API: transaction_start function
321 * starts a transaction on a persistent database
323 static int db_ctdb_transaction_start(struct db_context
*db
)
325 struct db_ctdb_transaction_handle
*h
;
327 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
330 if (!db
->persistent
) {
331 DEBUG(0,("transactions not supported on non-persistent database 0x%08x\n",
336 if (ctx
->transaction
) {
337 ctx
->transaction
->nesting
++;
341 h
= talloc_zero(db
, struct db_ctdb_transaction_handle
);
343 DEBUG(0,(__location__
" oom for transaction handle\n"));
349 h
->lock_name
= talloc_asprintf(h
, "transaction_db_0x%08x",
350 (unsigned int)ctx
->db_id
);
351 if (h
->lock_name
== NULL
) {
352 DEBUG(0, ("talloc_asprintf failed\n"));
358 * Wait a day, i.e. forever...
360 status
= g_lock_lock(ctx
->lock_ctx
, h
->lock_name
, G_LOCK_WRITE
,
361 timeval_set(86400, 0));
362 if (!NT_STATUS_IS_OK(status
)) {
363 DEBUG(0, ("g_lock_lock failed: %s\n", nt_errstr(status
)));
368 talloc_set_destructor(h
, db_ctdb_transaction_destructor
);
370 ctx
->transaction
= h
;
372 DEBUG(5,(__location__
" Started transaction on db 0x%08x\n", ctx
->db_id
));
377 static bool pull_newest_from_marshall_buffer(struct ctdb_marshall_buffer
*buf
,
379 struct ctdb_ltdb_header
*pheader
,
383 struct ctdb_rec_data
*rec
= NULL
;
384 struct ctdb_ltdb_header h
;
397 * Walk the list of records written during this
398 * transaction. If we want to read one we have already
399 * written, return the last written sample. Thus we do not do
400 * a "break;" for the first hit, this record might have been
404 for (i
=0; i
<buf
->count
; i
++) {
405 TDB_DATA tkey
, tdata
;
407 struct ctdb_ltdb_header hdr
;
411 rec
= db_ctdb_marshall_loop_next(buf
, rec
, &reqid
, &hdr
, &tkey
,
417 if (tdb_data_equal(key
, tkey
)) {
429 data
.dptr
= (uint8_t *)talloc_memdup(mem_ctx
, data
.dptr
,
431 if ((data
.dsize
!= 0) && (data
.dptr
== NULL
)) {
437 if (pheader
!= NULL
) {
445 fetch a record inside a transaction
447 static int db_ctdb_transaction_fetch(struct db_ctdb_ctx
*db
,
449 TDB_DATA key
, TDB_DATA
*data
)
451 struct db_ctdb_transaction_handle
*h
= db
->transaction
;
455 found
= pull_newest_from_marshall_buffer(h
->m_write
, key
, NULL
,
461 status
= db_ctdb_ltdb_fetch(h
->ctx
, key
, NULL
, mem_ctx
, data
);
463 if (NT_STATUS_EQUAL(status
, NT_STATUS_NOT_FOUND
)) {
465 } else if (!NT_STATUS_IS_OK(status
)) {
469 h
->m_all
= db_ctdb_marshall_add(h
, h
->m_all
, h
->ctx
->db_id
, 1, key
,
471 if (h
->m_all
== NULL
) {
472 DEBUG(0,(__location__
" Failed to add to marshalling "
475 talloc_free(data
->dptr
);
483 * Fetch a record from a persistent database
484 * without record locking and without an active transaction.
486 * This just fetches from the local database copy.
487 * Since the databases are kept in sync cluster-wide,
488 * there is no point in doing a ctdb call to fetch the
489 * record from the lmaster. It even does harm, since migration
490 * of records bumps their RSN and hence renders the persistent
491 * database inconsistent.
493 static int db_ctdb_fetch_persistent(struct db_ctdb_ctx
*db
,
495 TDB_DATA key
, TDB_DATA
*data
)
499 status
= db_ctdb_ltdb_fetch(db
, key
, NULL
, mem_ctx
, data
);
501 if (NT_STATUS_EQUAL(status
, NT_STATUS_NOT_FOUND
)) {
503 } else if (!NT_STATUS_IS_OK(status
)) {
510 static NTSTATUS
db_ctdb_store_transaction(struct db_record
*rec
, TDB_DATA data
, int flag
);
511 static NTSTATUS
db_ctdb_delete_transaction(struct db_record
*rec
);
513 static struct db_record
*db_ctdb_fetch_locked_transaction(struct db_ctdb_ctx
*ctx
,
517 struct db_record
*result
;
520 if (!(result
= talloc(mem_ctx
, struct db_record
))) {
521 DEBUG(0, ("talloc failed\n"));
525 result
->private_data
= ctx
->transaction
;
527 result
->key
.dsize
= key
.dsize
;
528 result
->key
.dptr
= (uint8
*)talloc_memdup(result
, key
.dptr
, key
.dsize
);
529 if (result
->key
.dptr
== NULL
) {
530 DEBUG(0, ("talloc failed\n"));
535 result
->store
= db_ctdb_store_transaction
;
536 result
->delete_rec
= db_ctdb_delete_transaction
;
538 if (pull_newest_from_marshall_buffer(ctx
->transaction
->m_write
, key
,
539 NULL
, result
, &result
->value
)) {
543 ctdb_data
= tdb_fetch(ctx
->wtdb
->tdb
, key
);
544 if (ctdb_data
.dptr
== NULL
) {
545 /* create the record */
546 result
->value
= tdb_null
;
550 result
->value
.dsize
= ctdb_data
.dsize
- sizeof(struct ctdb_ltdb_header
);
551 result
->value
.dptr
= NULL
;
553 if ((result
->value
.dsize
!= 0)
554 && !(result
->value
.dptr
= (uint8
*)talloc_memdup(
555 result
, ctdb_data
.dptr
+ sizeof(struct ctdb_ltdb_header
),
556 result
->value
.dsize
))) {
557 DEBUG(0, ("talloc failed\n"));
561 SAFE_FREE(ctdb_data
.dptr
);
566 static int db_ctdb_record_destructor(struct db_record
**recp
)
568 struct db_record
*rec
= talloc_get_type_abort(*recp
, struct db_record
);
569 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
570 rec
->private_data
, struct db_ctdb_transaction_handle
);
571 int ret
= h
->ctx
->db
->transaction_commit(h
->ctx
->db
);
573 DEBUG(0,(__location__
" transaction_commit failed\n"));
579 auto-create a transaction for persistent databases
581 static struct db_record
*db_ctdb_fetch_locked_persistent(struct db_ctdb_ctx
*ctx
,
586 struct db_record
*rec
, **recp
;
588 res
= db_ctdb_transaction_start(ctx
->db
);
593 rec
= db_ctdb_fetch_locked_transaction(ctx
, mem_ctx
, key
);
595 ctx
->db
->transaction_cancel(ctx
->db
);
599 /* destroy this transaction when we release the lock */
600 recp
= talloc(rec
, struct db_record
*);
602 ctx
->db
->transaction_cancel(ctx
->db
);
607 talloc_set_destructor(recp
, db_ctdb_record_destructor
);
613 stores a record inside a transaction
615 static NTSTATUS
db_ctdb_transaction_store(struct db_ctdb_transaction_handle
*h
,
616 TDB_DATA key
, TDB_DATA data
)
618 TALLOC_CTX
*tmp_ctx
= talloc_new(h
);
620 struct ctdb_ltdb_header header
;
624 /* we need the header so we can update the RSN */
626 if (!pull_newest_from_marshall_buffer(h
->m_write
, key
, &header
,
629 rec
= tdb_fetch(h
->ctx
->wtdb
->tdb
, key
);
631 if (rec
.dptr
!= NULL
) {
632 memcpy(&header
, rec
.dptr
,
633 sizeof(struct ctdb_ltdb_header
));
634 rec
.dsize
-= sizeof(struct ctdb_ltdb_header
);
637 * a special case, we are writing the same
638 * data that is there now
640 if (data
.dsize
== rec
.dsize
&&
642 rec
.dptr
+ sizeof(struct ctdb_ltdb_header
),
645 talloc_free(tmp_ctx
);
652 header
.dmaster
= get_my_vnn();
655 h
->m_all
= db_ctdb_marshall_add(h
, h
->m_all
, h
->ctx
->db_id
, 0, key
,
657 if (h
->m_all
== NULL
) {
658 DEBUG(0,(__location__
" Failed to add to marshalling "
660 talloc_free(tmp_ctx
);
661 return NT_STATUS_NO_MEMORY
;
664 h
->m_write
= db_ctdb_marshall_add(h
, h
->m_write
, h
->ctx
->db_id
, 0, key
, &header
, data
);
665 if (h
->m_write
== NULL
) {
666 DEBUG(0,(__location__
" Failed to add to marshalling record\n"));
667 talloc_free(tmp_ctx
);
668 return NT_STATUS_NO_MEMORY
;
671 talloc_free(tmp_ctx
);
677 a record store inside a transaction
679 static NTSTATUS
db_ctdb_store_transaction(struct db_record
*rec
, TDB_DATA data
, int flag
)
681 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
682 rec
->private_data
, struct db_ctdb_transaction_handle
);
685 status
= db_ctdb_transaction_store(h
, rec
->key
, data
);
690 a record delete inside a transaction
692 static NTSTATUS
db_ctdb_delete_transaction(struct db_record
*rec
)
694 struct db_ctdb_transaction_handle
*h
= talloc_get_type_abort(
695 rec
->private_data
, struct db_ctdb_transaction_handle
);
698 status
= db_ctdb_transaction_store(h
, rec
->key
, tdb_null
);
703 * Fetch the db sequence number of a persistent db directly from the db.
705 static NTSTATUS
db_ctdb_fetch_db_seqnum_from_db(struct db_ctdb_ctx
*db
,
709 const char *keyname
= CTDB_DB_SEQNUM_KEY
;
712 struct ctdb_ltdb_header header
;
713 TALLOC_CTX
*mem_ctx
= talloc_stackframe();
715 if (seqnum
== NULL
) {
716 return NT_STATUS_INVALID_PARAMETER
;
719 key
= string_term_tdb_data(keyname
);
721 status
= db_ctdb_ltdb_fetch(db
, key
, &header
, mem_ctx
, &data
);
722 if (!NT_STATUS_IS_OK(status
) &&
723 !NT_STATUS_EQUAL(status
, NT_STATUS_NOT_FOUND
))
728 status
= NT_STATUS_OK
;
730 if (data
.dsize
!= sizeof(uint64_t)) {
735 *seqnum
= *(uint64_t *)data
.dptr
;
738 TALLOC_FREE(mem_ctx
);
743 * Store the database sequence number inside a transaction.
745 static NTSTATUS
db_ctdb_store_db_seqnum(struct db_ctdb_transaction_handle
*h
,
749 const char *keyname
= CTDB_DB_SEQNUM_KEY
;
753 key
= string_term_tdb_data(keyname
);
755 data
.dptr
= (uint8_t *)&seqnum
;
756 data
.dsize
= sizeof(uint64_t);
758 status
= db_ctdb_transaction_store(h
, key
, data
);
766 static int db_ctdb_transaction_commit(struct db_context
*db
)
768 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
772 struct db_ctdb_transaction_handle
*h
= ctx
->transaction
;
773 uint64_t old_seqnum
, new_seqnum
;
777 DEBUG(0,(__location__
" transaction commit with no open transaction on db 0x%08x\n", ctx
->db_id
));
781 if (h
->nested_cancel
) {
782 db
->transaction_cancel(db
);
783 DEBUG(5,(__location__
" Failed transaction commit after nested cancel\n"));
787 if (h
->nesting
!= 0) {
792 if (h
->m_write
== NULL
) {
794 * No changes were made, so don't change the seqnum,
795 * don't push to other node, just exit with success.
801 DEBUG(5,(__location__
" Commit transaction on db 0x%08x\n", ctx
->db_id
));
804 * As the last db action before committing, bump the database sequence
805 * number. Note that this undoes all changes to the seqnum records
806 * performed under the transaction. This record is not meant to be
807 * modified by user interaction. It is for internal use only...
809 rets
= db_ctdb_fetch_db_seqnum_from_db(ctx
, &old_seqnum
);
810 if (!NT_STATUS_IS_OK(rets
)) {
811 DEBUG(1, (__location__
" failed to fetch the db sequence number "
812 "in transaction commit on db 0x%08x\n", ctx
->db_id
));
817 new_seqnum
= old_seqnum
+ 1;
819 rets
= db_ctdb_store_db_seqnum(h
, new_seqnum
);
820 if (!NT_STATUS_IS_OK(rets
)) {
821 DEBUG(1, (__location__
"failed to store the db sequence number "
822 " in transaction commit on db 0x%08x\n", ctx
->db_id
));
828 /* tell ctdbd to commit to the other nodes */
829 rets
= ctdbd_control_local(messaging_ctdbd_connection(),
830 CTDB_CONTROL_TRANS3_COMMIT
,
832 db_ctdb_marshall_finish(h
->m_write
),
833 NULL
, NULL
, &status
);
834 if (!NT_STATUS_IS_OK(rets
) || status
!= 0) {
836 * The TRANS3_COMMIT control should only possibly fail when a
837 * recovery has been running concurrently. In any case, the db
838 * will be the same on all nodes, either the new copy or the
839 * old copy. This can be detected by comparing the old and new
840 * local sequence numbers.
842 rets
= db_ctdb_fetch_db_seqnum_from_db(ctx
, &new_seqnum
);
843 if (!NT_STATUS_IS_OK(rets
)) {
844 DEBUG(1, (__location__
" failed to refetch db sequence "
845 "number after failed TRANS3_COMMIT\n"));
850 if (new_seqnum
== old_seqnum
) {
851 /* Recovery prevented all our changes: retry. */
853 } else if (new_seqnum
!= (old_seqnum
+ 1)) {
854 DEBUG(0, (__location__
" ERROR: new_seqnum[%lu] != "
855 "old_seqnum[%lu] + (0 or 1) after failed "
856 "TRANS3_COMMIT - this should not happen!\n",
857 (unsigned long)new_seqnum
,
858 (unsigned long)old_seqnum
));
863 * Recovery propagated our changes to all nodes, completing
864 * our commit for us - succeed.
871 h
->ctx
->transaction
= NULL
;
880 static int db_ctdb_transaction_cancel(struct db_context
*db
)
882 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
884 struct db_ctdb_transaction_handle
*h
= ctx
->transaction
;
887 DEBUG(0,(__location__
" transaction cancel with no open transaction on db 0x%08x\n", ctx
->db_id
));
891 if (h
->nesting
!= 0) {
893 h
->nested_cancel
= true;
897 DEBUG(5,(__location__
" Cancel transaction on db 0x%08x\n", ctx
->db_id
));
899 ctx
->transaction
= NULL
;
905 static NTSTATUS
db_ctdb_store(struct db_record
*rec
, TDB_DATA data
, int flag
)
907 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
908 rec
->private_data
, struct db_ctdb_rec
);
910 return db_ctdb_ltdb_store(crec
->ctdb_ctx
, rec
->key
, &(crec
->header
), data
);
915 #ifdef HAVE_CTDB_CONTROL_SCHEDULE_FOR_DELETION_DECL
916 static NTSTATUS
db_ctdb_send_schedule_for_deletion(struct db_record
*rec
)
919 struct ctdb_control_schedule_for_deletion
*dd
;
922 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
923 rec
->private_data
, struct db_ctdb_rec
);
925 indata
.dsize
= offsetof(struct ctdb_control_schedule_for_deletion
, key
) + rec
->key
.dsize
;
926 indata
.dptr
= talloc_zero_array(crec
, uint8_t, indata
.dsize
);
927 if (indata
.dptr
== NULL
) {
928 DEBUG(0, (__location__
" talloc failed!\n"));
929 return NT_STATUS_NO_MEMORY
;
932 dd
= (struct ctdb_control_schedule_for_deletion
*)(void *)indata
.dptr
;
933 dd
->db_id
= crec
->ctdb_ctx
->db_id
;
934 dd
->hdr
= crec
->header
;
935 dd
->keylen
= rec
->key
.dsize
;
936 memcpy(dd
->key
, rec
->key
.dptr
, rec
->key
.dsize
);
938 status
= ctdbd_control_local(messaging_ctdbd_connection(),
939 CTDB_CONTROL_SCHEDULE_FOR_DELETION
,
940 crec
->ctdb_ctx
->db_id
,
941 CTDB_CTRL_FLAG_NOREPLY
, /* flags */
946 talloc_free(indata
.dptr
);
948 if (!NT_STATUS_IS_OK(status
) || cstatus
!= 0) {
949 DEBUG(1, (__location__
" Error sending local control "
950 "SCHEDULE_FOR_DELETION: %s, cstatus = %d\n",
951 nt_errstr(status
), cstatus
));
952 if (NT_STATUS_IS_OK(status
)) {
953 status
= NT_STATUS_UNSUCCESSFUL
;
961 static NTSTATUS
db_ctdb_delete(struct db_record
*rec
)
967 * We have to store the header with empty data. TODO: Fix the
973 status
= db_ctdb_store(rec
, data
, 0);
974 if (!NT_STATUS_IS_OK(status
)) {
978 #ifdef HAVE_CTDB_CONTROL_SCHEDULE_FOR_DELETION_DECL
979 status
= db_ctdb_send_schedule_for_deletion(rec
);
985 static int db_ctdb_record_destr(struct db_record
* data
)
987 struct db_ctdb_rec
*crec
= talloc_get_type_abort(
988 data
->private_data
, struct db_ctdb_rec
);
991 DEBUG(10, (DEBUGLEVEL
> 10
992 ? "Unlocking db %u key %s\n"
993 : "Unlocking db %u key %.20s\n",
994 (int)crec
->ctdb_ctx
->db_id
,
995 hex_encode_talloc(data
, (unsigned char *)data
->key
.dptr
,
998 if (tdb_chainunlock(crec
->ctdb_ctx
->wtdb
->tdb
, data
->key
) != 0) {
999 DEBUG(0, ("tdb_chainunlock failed\n"));
1003 threshold
= lp_ctdb_locktime_warn_threshold();
1004 if (threshold
!= 0) {
1005 double timediff
= timeval_elapsed(&crec
->lock_time
);
1006 if ((timediff
* 1000) > threshold
) {
1007 DEBUG(0, ("Held tdb lock %f seconds\n", timediff
));
1014 static struct db_record
*fetch_locked_internal(struct db_ctdb_ctx
*ctx
,
1015 TALLOC_CTX
*mem_ctx
,
1018 struct db_record
*result
;
1019 struct db_ctdb_rec
*crec
;
1022 int migrate_attempts
= 0;
1024 if (!(result
= talloc(mem_ctx
, struct db_record
))) {
1025 DEBUG(0, ("talloc failed\n"));
1029 if (!(crec
= TALLOC_ZERO_P(result
, struct db_ctdb_rec
))) {
1030 DEBUG(0, ("talloc failed\n"));
1031 TALLOC_FREE(result
);
1035 result
->private_data
= (void *)crec
;
1036 crec
->ctdb_ctx
= ctx
;
1038 result
->key
.dsize
= key
.dsize
;
1039 result
->key
.dptr
= (uint8
*)talloc_memdup(result
, key
.dptr
, key
.dsize
);
1040 if (result
->key
.dptr
== NULL
) {
1041 DEBUG(0, ("talloc failed\n"));
1042 TALLOC_FREE(result
);
1047 * Do a blocking lock on the record
1051 if (DEBUGLEVEL
>= 10) {
1052 char *keystr
= hex_encode_talloc(result
, key
.dptr
, key
.dsize
);
1053 DEBUG(10, (DEBUGLEVEL
> 10
1054 ? "Locking db %u key %s\n"
1055 : "Locking db %u key %.20s\n",
1056 (int)crec
->ctdb_ctx
->db_id
, keystr
));
1057 TALLOC_FREE(keystr
);
1060 if (tdb_chainlock(ctx
->wtdb
->tdb
, key
) != 0) {
1061 DEBUG(3, ("tdb_chainlock failed\n"));
1062 TALLOC_FREE(result
);
1066 result
->store
= db_ctdb_store
;
1067 result
->delete_rec
= db_ctdb_delete
;
1068 talloc_set_destructor(result
, db_ctdb_record_destr
);
1070 ctdb_data
= tdb_fetch(ctx
->wtdb
->tdb
, key
);
1073 * See if we have a valid record and we are the dmaster. If so, we can
1074 * take the shortcut and just return it.
1077 if ((ctdb_data
.dptr
== NULL
) ||
1078 (ctdb_data
.dsize
< sizeof(struct ctdb_ltdb_header
)) ||
1079 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->dmaster
!= get_my_vnn()
1081 || (random() % 2 != 0)
1084 SAFE_FREE(ctdb_data
.dptr
);
1085 tdb_chainunlock(ctx
->wtdb
->tdb
, key
);
1086 talloc_set_destructor(result
, NULL
);
1088 migrate_attempts
+= 1;
1090 DEBUG(10, ("ctdb_data.dptr = %p, dmaster = %u (%u)\n",
1091 ctdb_data
.dptr
, ctdb_data
.dptr
?
1092 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->dmaster
: -1,
1095 status
= ctdbd_migrate(messaging_ctdbd_connection(), ctx
->db_id
,
1097 if (!NT_STATUS_IS_OK(status
)) {
1098 DEBUG(5, ("ctdb_migrate failed: %s\n",
1099 nt_errstr(status
)));
1100 TALLOC_FREE(result
);
1103 /* now it's migrated, try again */
1107 if (migrate_attempts
> 10) {
1108 DEBUG(0, ("db_ctdb_fetch_locked needed %d attempts\n",
1112 GetTimeOfDay(&crec
->lock_time
);
1114 memcpy(&crec
->header
, ctdb_data
.dptr
, sizeof(crec
->header
));
1116 result
->value
.dsize
= ctdb_data
.dsize
- sizeof(crec
->header
);
1117 result
->value
.dptr
= NULL
;
1119 if ((result
->value
.dsize
!= 0)
1120 && !(result
->value
.dptr
= (uint8
*)talloc_memdup(
1121 result
, ctdb_data
.dptr
+ sizeof(crec
->header
),
1122 result
->value
.dsize
))) {
1123 DEBUG(0, ("talloc failed\n"));
1124 TALLOC_FREE(result
);
1127 SAFE_FREE(ctdb_data
.dptr
);
1132 static struct db_record
*db_ctdb_fetch_locked(struct db_context
*db
,
1133 TALLOC_CTX
*mem_ctx
,
1136 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1137 struct db_ctdb_ctx
);
1139 if (ctx
->transaction
!= NULL
) {
1140 return db_ctdb_fetch_locked_transaction(ctx
, mem_ctx
, key
);
1143 if (db
->persistent
) {
1144 return db_ctdb_fetch_locked_persistent(ctx
, mem_ctx
, key
);
1147 return fetch_locked_internal(ctx
, mem_ctx
, key
);
1151 fetch (unlocked, no migration) operation on ctdb
1153 static int db_ctdb_fetch(struct db_context
*db
, TALLOC_CTX
*mem_ctx
,
1154 TDB_DATA key
, TDB_DATA
*data
)
1156 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1157 struct db_ctdb_ctx
);
1161 if (ctx
->transaction
) {
1162 return db_ctdb_transaction_fetch(ctx
, mem_ctx
, key
, data
);
1165 if (db
->persistent
) {
1166 return db_ctdb_fetch_persistent(ctx
, mem_ctx
, key
, data
);
1169 /* try a direct fetch */
1170 ctdb_data
= tdb_fetch(ctx
->wtdb
->tdb
, key
);
1173 * See if we have a valid record and we are the dmaster. If so, we can
1174 * take the shortcut and just return it.
1175 * we bypass the dmaster check for persistent databases
1177 if ((ctdb_data
.dptr
!= NULL
) &&
1178 (ctdb_data
.dsize
>= sizeof(struct ctdb_ltdb_header
)) &&
1179 ((struct ctdb_ltdb_header
*)ctdb_data
.dptr
)->dmaster
== get_my_vnn())
1181 /* we are the dmaster - avoid the ctdb protocol op */
1183 data
->dsize
= ctdb_data
.dsize
- sizeof(struct ctdb_ltdb_header
);
1184 if (data
->dsize
== 0) {
1185 SAFE_FREE(ctdb_data
.dptr
);
1190 data
->dptr
= (uint8
*)talloc_memdup(
1191 mem_ctx
, ctdb_data
.dptr
+sizeof(struct ctdb_ltdb_header
),
1194 SAFE_FREE(ctdb_data
.dptr
);
1196 if (data
->dptr
== NULL
) {
1202 SAFE_FREE(ctdb_data
.dptr
);
1204 /* we weren't able to get it locally - ask ctdb to fetch it for us */
1205 status
= ctdbd_fetch(messaging_ctdbd_connection(), ctx
->db_id
, key
,
1207 if (!NT_STATUS_IS_OK(status
)) {
1208 DEBUG(5, ("ctdbd_fetch failed: %s\n", nt_errstr(status
)));
1215 struct traverse_state
{
1216 struct db_context
*db
;
1217 int (*fn
)(struct db_record
*rec
, void *private_data
);
1221 static void traverse_callback(TDB_DATA key
, TDB_DATA data
, void *private_data
)
1223 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1224 struct db_record
*rec
;
1225 TALLOC_CTX
*tmp_ctx
= talloc_new(state
->db
);
1226 /* we have to give them a locked record to prevent races */
1227 rec
= db_ctdb_fetch_locked(state
->db
, tmp_ctx
, key
);
1228 if (rec
&& rec
->value
.dsize
> 0) {
1229 state
->fn(rec
, state
->private_data
);
1231 talloc_free(tmp_ctx
);
1234 static int traverse_persistent_callback(TDB_CONTEXT
*tdb
, TDB_DATA kbuf
, TDB_DATA dbuf
,
1237 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1238 struct db_record
*rec
;
1239 TALLOC_CTX
*tmp_ctx
= talloc_new(state
->db
);
1241 /* we have to give them a locked record to prevent races */
1242 rec
= db_ctdb_fetch_locked(state
->db
, tmp_ctx
, kbuf
);
1243 if (rec
&& rec
->value
.dsize
> 0) {
1244 ret
= state
->fn(rec
, state
->private_data
);
1246 talloc_free(tmp_ctx
);
1250 /* wrapper to use traverse_persistent_callback with dbwrap */
1251 static int traverse_persistent_callback_dbwrap(struct db_record
*rec
, void* data
)
1253 return traverse_persistent_callback(NULL
, rec
->key
, rec
->value
, data
);
1257 static int db_ctdb_traverse(struct db_context
*db
,
1258 int (*fn
)(struct db_record
*rec
,
1259 void *private_data
),
1262 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1263 struct db_ctdb_ctx
);
1264 struct traverse_state state
;
1268 state
.private_data
= private_data
;
1270 if (db
->persistent
) {
1271 struct tdb_context
*ltdb
= ctx
->wtdb
->tdb
;
1274 /* for persistent databases we don't need to do a ctdb traverse,
1275 we can do a faster local traverse */
1276 ret
= tdb_traverse(ltdb
, traverse_persistent_callback
, &state
);
1280 if (ctx
->transaction
&& ctx
->transaction
->m_write
) {
1282 * we now have to handle keys not yet
1283 * present at transaction start
1285 struct db_context
*newkeys
= db_open_rbt(talloc_tos());
1286 struct ctdb_marshall_buffer
*mbuf
= ctx
->transaction
->m_write
;
1287 struct ctdb_rec_data
*rec
=NULL
;
1292 if (newkeys
== NULL
) {
1296 for (i
=0; i
<mbuf
->count
; i
++) {
1298 rec
=db_ctdb_marshall_loop_next(mbuf
, rec
,
1301 SMB_ASSERT(rec
!= NULL
);
1303 if (!tdb_exists(ltdb
, key
)) {
1304 dbwrap_store(newkeys
, key
, tdb_null
, 0);
1307 status
= dbwrap_traverse(newkeys
,
1308 traverse_persistent_callback_dbwrap
,
1311 talloc_free(newkeys
);
1312 if (!NT_STATUS_IS_OK(status
)) {
1321 ctdbd_traverse(ctx
->db_id
, traverse_callback
, &state
);
1325 static NTSTATUS
db_ctdb_store_deny(struct db_record
*rec
, TDB_DATA data
, int flag
)
1327 return NT_STATUS_MEDIA_WRITE_PROTECTED
;
1330 static NTSTATUS
db_ctdb_delete_deny(struct db_record
*rec
)
1332 return NT_STATUS_MEDIA_WRITE_PROTECTED
;
1335 static void traverse_read_callback(TDB_DATA key
, TDB_DATA data
, void *private_data
)
1337 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1338 struct db_record rec
;
1341 rec
.store
= db_ctdb_store_deny
;
1342 rec
.delete_rec
= db_ctdb_delete_deny
;
1343 rec
.private_data
= state
->db
;
1344 state
->fn(&rec
, state
->private_data
);
1347 static int traverse_persistent_callback_read(TDB_CONTEXT
*tdb
, TDB_DATA kbuf
, TDB_DATA dbuf
,
1350 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1351 struct db_record rec
;
1354 rec
.store
= db_ctdb_store_deny
;
1355 rec
.delete_rec
= db_ctdb_delete_deny
;
1356 rec
.private_data
= state
->db
;
1358 if (rec
.value
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1359 /* a deleted record */
1362 rec
.value
.dsize
-= sizeof(struct ctdb_ltdb_header
);
1363 rec
.value
.dptr
+= sizeof(struct ctdb_ltdb_header
);
1365 return state
->fn(&rec
, state
->private_data
);
1368 static int db_ctdb_traverse_read(struct db_context
*db
,
1369 int (*fn
)(struct db_record
*rec
,
1370 void *private_data
),
1373 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1374 struct db_ctdb_ctx
);
1375 struct traverse_state state
;
1379 state
.private_data
= private_data
;
1381 if (db
->persistent
) {
1382 /* for persistent databases we don't need to do a ctdb traverse,
1383 we can do a faster local traverse */
1384 return tdb_traverse_read(ctx
->wtdb
->tdb
, traverse_persistent_callback_read
, &state
);
1387 ctdbd_traverse(ctx
->db_id
, traverse_read_callback
, &state
);
1391 static int db_ctdb_get_seqnum(struct db_context
*db
)
1393 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1394 struct db_ctdb_ctx
);
1395 return tdb_get_seqnum(ctx
->wtdb
->tdb
);
1398 static int db_ctdb_get_flags(struct db_context
*db
)
1400 struct db_ctdb_ctx
*ctx
= talloc_get_type_abort(db
->private_data
,
1401 struct db_ctdb_ctx
);
1402 return tdb_get_flags(ctx
->wtdb
->tdb
);
1405 struct db_context
*db_open_ctdb(TALLOC_CTX
*mem_ctx
,
1407 int hash_size
, int tdb_flags
,
1408 int open_flags
, mode_t mode
)
1410 struct db_context
*result
;
1411 struct db_ctdb_ctx
*db_ctdb
;
1413 struct ctdbd_connection
*conn
;
1415 if (!lp_clustering()) {
1416 DEBUG(10, ("Clustering disabled -- no ctdb\n"));
1420 if (!(result
= TALLOC_ZERO_P(mem_ctx
, struct db_context
))) {
1421 DEBUG(0, ("talloc failed\n"));
1422 TALLOC_FREE(result
);
1426 if (!(db_ctdb
= TALLOC_P(result
, struct db_ctdb_ctx
))) {
1427 DEBUG(0, ("talloc failed\n"));
1428 TALLOC_FREE(result
);
1432 db_ctdb
->transaction
= NULL
;
1433 db_ctdb
->db
= result
;
1435 conn
= messaging_ctdbd_connection();
1437 DEBUG(1, ("Could not connect to ctdb\n"));
1438 TALLOC_FREE(result
);
1442 if (!NT_STATUS_IS_OK(ctdbd_db_attach(conn
, name
, &db_ctdb
->db_id
, tdb_flags
))) {
1443 DEBUG(0, ("ctdbd_db_attach failed for %s\n", name
));
1444 TALLOC_FREE(result
);
1448 db_path
= ctdbd_dbpath(conn
, db_ctdb
, db_ctdb
->db_id
);
1450 result
->persistent
= ((tdb_flags
& TDB_CLEAR_IF_FIRST
) == 0);
1452 /* only pass through specific flags */
1453 tdb_flags
&= TDB_SEQNUM
;
1455 /* honor permissions if user has specified O_CREAT */
1456 if (open_flags
& O_CREAT
) {
1457 chmod(db_path
, mode
);
1460 db_ctdb
->wtdb
= tdb_wrap_open(db_ctdb
, db_path
, hash_size
, tdb_flags
, O_RDWR
, 0);
1461 if (db_ctdb
->wtdb
== NULL
) {
1462 DEBUG(0, ("Could not open tdb %s: %s\n", db_path
, strerror(errno
)));
1463 TALLOC_FREE(result
);
1466 talloc_free(db_path
);
1468 if (result
->persistent
) {
1469 db_ctdb
->lock_ctx
= g_lock_ctx_init(db_ctdb
,
1470 ctdb_conn_msg_ctx(conn
));
1471 if (db_ctdb
->lock_ctx
== NULL
) {
1472 DEBUG(0, ("g_lock_ctx_init failed\n"));
1473 TALLOC_FREE(result
);
1478 result
->private_data
= (void *)db_ctdb
;
1479 result
->fetch_locked
= db_ctdb_fetch_locked
;
1480 result
->fetch
= db_ctdb_fetch
;
1481 result
->traverse
= db_ctdb_traverse
;
1482 result
->traverse_read
= db_ctdb_traverse_read
;
1483 result
->get_seqnum
= db_ctdb_get_seqnum
;
1484 result
->get_flags
= db_ctdb_get_flags
;
1485 result
->transaction_start
= db_ctdb_transaction_start
;
1486 result
->transaction_commit
= db_ctdb_transaction_commit
;
1487 result
->transaction_cancel
= db_ctdb_transaction_cancel
;
1489 DEBUG(3,("db_open_ctdb: opened database '%s' with dbid 0x%x\n",
1490 name
, db_ctdb
->db_id
));