/* ctdb/server/ctdb_recover.c */
/*
   ctdb recovery code

   Copyright (C) Andrew Tridgell  2007
   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "replace.h"
#include "system/time.h"
#include "system/network.h"
#include "system/filesys.h"
#include "system/wait.h"

#include <talloc.h>
#include <tevent.h>
#include <tdb.h>

#include "lib/tdb_wrap/tdb_wrap.h"
#include "lib/util/dlinklist.h"
#include "lib/util/debug.h"
#include "lib/util/time.h"
#include "lib/util/util_process.h"

#include "ctdb_private.h"
#include "ctdb_client.h"

#include "common/system.h"
#include "common/common.h"
#include "common/logging.h"

#include "ctdb_cluster_mutex.h"
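/*
  Return a copy of the current vnn map to the caller, marshalled as a
  struct ctdb_vnn_map_wire (generation number followed by the map entries).
 */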
int
ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
{
	struct ctdb_vnn_map_wire *map;
	size_t len;

	CHECK_CONTROL_DATA_SIZE(0);

	len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
	map = talloc_size(outdata, len);
	CTDB_NO_MEMORY(ctdb, map);

	map->generation = ctdb->vnn_map->generation;
	map->size = ctdb->vnn_map->size;
	memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);

	outdata->dsize = len;
	outdata->dptr  = (uint8_t *)map;

	return 0;
}

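/*
  Install the vnn map supplied in indata, replacing the current one.
  Only permitted while the node is in recovery.
 */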
int
ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
{
	struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;

	if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
		DEBUG(DEBUG_ERR, ("Attempt to set vnnmap when not in recovery\n"));
		return -1;
	}

	talloc_free(ctdb->vnn_map);

	ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
	CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);

	ctdb->vnn_map->generation = map->generation;
	ctdb->vnn_map->size       = map->size;
	ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
	CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);

	memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);

	return 0;
}

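/*
  Return the list of attached databases as a struct ctdb_dbid_map_old,
  one entry (db_id and flags) per database.
 */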
int
ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
{
	uint32_t i, len;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_dbid_map_old *dbid_map;

	CHECK_CONTROL_DATA_SIZE(0);

	len = 0;
	for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
		len++;
	}

	outdata->dsize = offsetof(struct ctdb_dbid_map_old, dbs) + sizeof(dbid_map->dbs[0])*len;
	outdata->dptr  = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
	if (!outdata->dptr) {
		DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
		exit(1);
	}

	dbid_map = (struct ctdb_dbid_map_old *)outdata->dptr;
	dbid_map->num = len;
	for (i = 0, ctdb_db = ctdb->db_list; ctdb_db; i++, ctdb_db = ctdb_db->next) {
		dbid_map->dbs[i].db_id = ctdb_db->db_id;
		dbid_map->dbs[i].flags = ctdb_db->db_flags;
	}

	return 0;
}

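/*
  Return the node map (flags and addresses of all configured nodes)
  built by ctdb_node_list_to_map().
 */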
int
ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
{
	CHECK_CONTROL_DATA_SIZE(0);

	outdata->dptr = (unsigned char *)ctdb_node_list_to_map(ctdb->nodes,
							       ctdb->num_nodes,
							       outdata);
	if (outdata->dptr == NULL) {
		return -1;
	}

	outdata->dsize = talloc_get_size(outdata->dptr);

	return 0;
}

/*
  reload the nodes file
 */
int
ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
{
	unsigned int i, num_nodes;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node **nodes;

	tmp_ctx = talloc_new(ctdb);

	/* steal the old nodes file for a while */
	talloc_steal(tmp_ctx, ctdb->nodes);
	nodes = ctdb->nodes;
	ctdb->nodes = NULL;
	num_nodes = ctdb->num_nodes;
	ctdb->num_nodes = 0;

	/* load the new nodes file */
	ctdb_load_nodes_file(ctdb);

	for (i=0; i<ctdb->num_nodes; i++) {
		/* keep any identical pre-existing nodes and connections */
		if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
			talloc_free(ctdb->nodes[i]);
			ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
			continue;
		}

		if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
			continue;
		}

		/* any new or different nodes must be added */
		if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
			DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
			ctdb_fatal(ctdb, "failed to add node. shutting down\n");
		}
		if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
			DEBUG(DEBUG_CRIT, (__location__ " methods->connect_node failed at %d\n", i));
			ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
		}
	}

	/* tell the recovery daemon to reload the nodes file too */
	ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);

	talloc_free(tmp_ctx);

	return 0;
}

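/*
  State and traverse callback used by ctdb_control_db_pull(): records
  are marshalled into buffers and streamed to the requesting node as
  srvid messages, flushing whenever a buffer grows beyond the
  rec_buffer_size_limit tunable.
 */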
struct db_pull_state {
	struct ctdb_context *ctdb;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_marshall_buffer *recs;
	uint32_t pnn;
	uint64_t srvid;
	uint32_t num_records;
};

static int traverse_db_pull(struct tdb_context *tdb, TDB_DATA key,
			    TDB_DATA data, void *private_data)
{
	struct db_pull_state *state = (struct db_pull_state *)private_data;
	struct ctdb_marshall_buffer *recs;

	recs = ctdb_marshall_add(state->ctdb, state->recs,
				 state->ctdb_db->db_id, 0, key, NULL, data);
	if (recs == NULL) {
		TALLOC_FREE(state->recs);
		return -1;
	}
	state->recs = recs;

	if (talloc_get_size(state->recs) >=
			state->ctdb->tunable.rec_buffer_size_limit) {
		TDB_DATA buffer;
		int ret;

		buffer = ctdb_marshall_finish(state->recs);
		ret = ctdb_daemon_send_message(state->ctdb, state->pnn,
					       state->srvid, buffer);
		if (ret != 0) {
			TALLOC_FREE(state->recs);
			return -1;
		}

		state->num_records += state->recs->count;
		TALLOC_FREE(state->recs);
	}

	return 0;
}

int32_t ctdb_control_db_pull(struct ctdb_context *ctdb,
			     struct ctdb_req_control_old *c,
			     TDB_DATA indata, TDB_DATA *outdata)
{
	struct ctdb_pulldb_ext *pulldb_ext;
	struct ctdb_db_context *ctdb_db;
	struct db_pull_state state;
	int ret;

	pulldb_ext = (struct ctdb_pulldb_ext *)indata.dptr;

	ctdb_db = find_ctdb_db(ctdb, pulldb_ext->db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n",
				 pulldb_ext->db_id));
		return -1;
	}

	if (!ctdb_db_frozen(ctdb_db)) {
		DEBUG(DEBUG_ERR,
		      ("rejecting ctdb_control_db_pull when not frozen\n"));
		return -1;
	}

	if (ctdb_db->unhealthy_reason) {
		/* this is just a warning, as the tdb should be empty anyway */
		DEBUG(DEBUG_WARNING,
		      ("db(%s) unhealthy in ctdb_control_db_pull: %s\n",
		       ctdb_db->db_name, ctdb_db->unhealthy_reason));
	}

	state.ctdb = ctdb;
	state.ctdb_db = ctdb_db;
	state.recs = NULL;
	state.pnn = c->hdr.srcnode;
	state.srvid = pulldb_ext->srvid;
	state.num_records = 0;

	/* If the records are invalid, we are done */
	if (ctdb_db->invalid_records) {
		goto done;
	}

	if (ctdb_lockdb_mark(ctdb_db) != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get lock on entire db - failing\n"));
		return -1;
	}

	ret = tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_db_pull, &state);
	if (ret == -1) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to traverse db '%s'\n",
		       ctdb_db->db_name));
		ctdb_lockdb_unmark(ctdb_db);
		return -1;
	}

	/* Last few records */
	if (state.recs != NULL) {
		TDB_DATA buffer;

		buffer = ctdb_marshall_finish(state.recs);
		ret = ctdb_daemon_send_message(state.ctdb, state.pnn,
					       state.srvid, buffer);
		if (ret != 0) {
			TALLOC_FREE(state.recs);
			ctdb_lockdb_unmark(ctdb_db);
			return -1;
		}

		state.num_records += state.recs->count;
		TALLOC_FREE(state.recs);
	}

	ctdb_lockdb_unmark(ctdb_db);

done:
	outdata->dptr = talloc_size(outdata, sizeof(uint32_t));
	if (outdata->dptr == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
		return -1;
	}

	memcpy(outdata->dptr, (uint8_t *)&state.num_records, sizeof(uint32_t));
	outdata->dsize = sizeof(uint32_t);

	return 0;
}

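/*
  State and srvid message handler used by ctdb_control_db_push_start()
  and ctdb_control_db_push_confirm(): marshalled record buffers arriving
  on the registered srvid are unpacked and stored into the local tdb.
 */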
struct db_push_state {
	struct ctdb_context *ctdb;
	struct ctdb_db_context *ctdb_db;
	uint64_t srvid;
	uint32_t num_records;
	bool failed;
};

static void db_push_msg_handler(uint64_t srvid, TDB_DATA indata,
				void *private_data)
{
	struct db_push_state *state = talloc_get_type(
		private_data, struct db_push_state);
	struct ctdb_marshall_buffer *recs;
	struct ctdb_rec_data_old *rec;
	unsigned int i;
	int ret;

	if (state->failed) {
		return;
	}

	recs = (struct ctdb_marshall_buffer *)indata.dptr;
	rec = (struct ctdb_rec_data_old *)&recs->data[0];

	DEBUG(DEBUG_INFO, ("starting push of %u records for dbid 0x%x\n",
			   recs->count, recs->db_id));

	for (i=0; i<recs->count; i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;

		key.dptr = &rec->data[0];
		key.dsize = rec->keylen;
		data.dptr = &rec->data[key.dsize];
		data.dsize = rec->datalen;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			goto failed;
		}

		hdr = (struct ctdb_ltdb_header *)data.dptr;
		/* Strip off any read only record flags.
		 * All readonly records are revoked implicitly by a recovery.
		 */
		hdr->flags &= ~CTDB_REC_RO_FLAGS;

		data.dptr += sizeof(*hdr);
		data.dsize -= sizeof(*hdr);

		ret = ctdb_ltdb_store(state->ctdb_db, key, hdr, data);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Unable to store record\n"));
			goto failed;
		}

		rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
	}

	DEBUG(DEBUG_DEBUG, ("finished push of %u records for dbid 0x%x\n",
			    recs->count, recs->db_id));

	state->num_records += recs->count;
	return;

failed:
	state->failed = true;
}

int32_t ctdb_control_db_push_start(struct ctdb_context *ctdb, TDB_DATA indata)
{
	struct ctdb_pulldb_ext *pulldb_ext;
	struct ctdb_db_context *ctdb_db;
	struct db_push_state *state;
	int ret;

	pulldb_ext = (struct ctdb_pulldb_ext *)indata.dptr;

	ctdb_db = find_ctdb_db(ctdb, pulldb_ext->db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Unknown db 0x%08x\n", pulldb_ext->db_id));
		return -1;
	}

	if (!ctdb_db_frozen(ctdb_db)) {
		DEBUG(DEBUG_ERR,
		      ("rejecting ctdb_control_db_push_start when not frozen\n"));
		return -1;
	}

	if (ctdb_db->push_started) {
		DEBUG(DEBUG_WARNING,
		      (__location__ " DB push already started for %s\n",
		       ctdb_db->db_name));

		/* De-register old state */
		state = (struct db_push_state *)ctdb_db->push_state;
		if (state != NULL) {
			srvid_deregister(ctdb->srv, state->srvid, state);
			talloc_free(state);
			ctdb_db->push_state = NULL;
		}
	}

	state = talloc_zero(ctdb_db, struct db_push_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
		return -1;
	}

	state->ctdb = ctdb;
	state->ctdb_db = ctdb_db;
	state->srvid = pulldb_ext->srvid;
	state->failed = false;

	ret = srvid_register(ctdb->srv, state, state->srvid,
			     db_push_msg_handler, state);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to register srvid for db push\n"));
		talloc_free(state);
		return -1;
	}

	if (ctdb_lockdb_mark(ctdb_db) != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get lock on entire db - failing\n"));
		srvid_deregister(ctdb->srv, state->srvid, state);
		talloc_free(state);
		return -1;
	}

	ctdb_db->push_started = true;
	ctdb_db->push_state = state;

	return 0;
}

int32_t ctdb_control_db_push_confirm(struct ctdb_context *ctdb,
				     TDB_DATA indata, TDB_DATA *outdata)
{
	uint32_t db_id;
	struct ctdb_db_context *ctdb_db;
	struct db_push_state *state;

	db_id = *(uint32_t *)indata.dptr;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id));
		return -1;
	}

	if (!ctdb_db_frozen(ctdb_db)) {
		DEBUG(DEBUG_ERR,
		      ("rejecting ctdb_control_db_push_confirm when not frozen\n"));
		return -1;
	}

	if (!ctdb_db->push_started) {
		DEBUG(DEBUG_ERR, (__location__ " DB push not started\n"));
		return -1;
	}

	if (ctdb_db_readonly(ctdb_db)) {
		DEBUG(DEBUG_ERR,
		      ("Clearing the tracking database for dbid 0x%x\n",
		       ctdb_db->db_id));
		if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
			DEBUG(DEBUG_ERR,
			      ("Failed to wipe tracking database for 0x%x."
			       " Dropping read-only delegation support\n",
			       ctdb_db->db_id));
			tdb_close(ctdb_db->rottdb);
			ctdb_db->rottdb = NULL;
			ctdb_db_reset_readonly(ctdb_db);
		}

		while (ctdb_db->revokechild_active != NULL) {
			talloc_free(ctdb_db->revokechild_active);
		}
	}

	ctdb_lockdb_unmark(ctdb_db);

	state = (struct db_push_state *)ctdb_db->push_state;
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Missing push db state\n"));
		return -1;
	}

	srvid_deregister(ctdb->srv, state->srvid, state);

	outdata->dptr = talloc_size(outdata, sizeof(uint32_t));
	if (outdata->dptr == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Memory allocation error\n"));
		talloc_free(state);
		ctdb_db->push_state = NULL;
		return -1;
	}

	memcpy(outdata->dptr, (uint8_t *)&state->num_records, sizeof(uint32_t));
	outdata->dsize = sizeof(uint32_t);

	talloc_free(state);
	ctdb_db->push_started = false;
	ctdb_db->push_state = NULL;

	return 0;
}

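/*
  Callback invoked with the result of the recovery lock test performed
  when leaving recovery; it sends the deferred reply for
  ctdb_control_set_recmode().
 */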
struct set_recmode_state {
	struct ctdb_context *ctdb;
	struct ctdb_req_control_old *c;
};

static void set_recmode_handler(char status,
				double latency,
				void *private_data)
{
	struct set_recmode_state *state = talloc_get_type_abort(
		private_data, struct set_recmode_state);
	int s = 0;
	const char *err = NULL;

	switch (status) {
	case '0':
		/* Mutex taken */
		DEBUG(DEBUG_ERR,
		      ("ERROR: Daemon able to take recovery lock on \"%s\" during recovery\n",
		       state->ctdb->recovery_lock));
		s = -1;
		err = "Took recovery lock from daemon during recovery - probably a cluster filesystem lock coherence problem";
		break;

	case '1':
		/* Contention */
		DEBUG(DEBUG_DEBUG, (__location__ " Recovery lock check OK\n"));
		state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
		ctdb_process_deferred_attach(state->ctdb);

		s = 0;

		CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock",
					    reclock.ctdbd, latency);
		break;

	case '2':
		/* Timeout.  Consider this a success, not a failure,
		 * as we failed to set the recovery lock which is what
		 * we wanted.  This can be caused by the cluster
		 * filesystem being very slow to arbitrate locks
		 * immediately after a node failure. */
		DEBUG(DEBUG_WARNING,
		      (__location__
		       " Timed out getting recovery lock, allowing recmode set anyway\n"));
		state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
		ctdb_process_deferred_attach(state->ctdb);

		s = 0;
		break;

	default:
		DEBUG(DEBUG_ERR,
		      ("Unexpected error when testing recovery lock\n"));
		s = -1;
		err = "Unexpected error when testing recovery lock";
	}

	ctdb_request_control_reply(state->ctdb, state->c, NULL, s, err);
	talloc_free(state);
}

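/*
  Timer callback: the node has been in recovery for too long, so
  release all public IP addresses.
 */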
static void
ctdb_drop_all_ips_event(struct tevent_context *ev, struct tevent_timer *te,
			struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPs\n"));
	talloc_free(ctdb->release_ips_ctx);
	ctdb->release_ips_ctx = NULL;

	ctdb_release_all_ips(ctdb);
}

/*
 * Set up an event to drop all public ips if we remain in recovery for too
 * long
 */
int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
{
	if (ctdb->release_ips_ctx != NULL) {
		talloc_free(ctdb->release_ips_ctx);
	}
	ctdb->release_ips_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);

	tevent_add_timer(ctdb->ev, ctdb->release_ips_ctx,
			 timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0),
			 ctdb_drop_all_ips_event, ctdb);
	return 0;
}

/*
  set the recovery mode
 */
int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
				 struct ctdb_req_control_old *c,
				 TDB_DATA indata, bool *async_reply,
				 const char **errormsg)
{
	uint32_t recmode = *(uint32_t *)indata.dptr;
	struct ctdb_db_context *ctdb_db;
	struct set_recmode_state *state;
	struct ctdb_cluster_mutex_handle *h;

	if (recmode == ctdb->recovery_mode) {
		D_INFO("Recovery mode already set to %s\n",
		       recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE");
		return 0;
	}

	D_NOTICE("Recovery mode set to %s\n",
		 recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "ACTIVE");

	/* if we enter recovery but stay in recovery for too long
	   we will eventually drop all our ip addresses
	*/
	if (recmode == CTDB_RECOVERY_ACTIVE) {
		if (ctdb_deferred_drop_all_ips(ctdb) != 0) {
			D_ERR("Failed to set up deferred drop all ips\n");
		}

		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
		return 0;
	}

	/* From this point: recmode == CTDB_RECOVERY_NORMAL
	 *
	 * Therefore, what follows is special handling when setting
	 * recovery mode back to normal */

	TALLOC_FREE(ctdb->release_ips_ctx);

	for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) {
		if (ctdb_db->generation != ctdb->vnn_map->generation) {
			DEBUG(DEBUG_ERR,
			      ("Inconsistent DB generation %u for %s\n",
			       ctdb_db->generation, ctdb_db->db_name));
			DEBUG(DEBUG_ERR, ("Recovery mode set to ACTIVE\n"));
			return -1;
		}
	}

	/* force the databases to thaw */
	if (ctdb_db_all_frozen(ctdb)) {
		ctdb_control_thaw(ctdb, false);
	}

	if (ctdb->recovery_lock == NULL) {
		/* Not using recovery lock file */
		ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
		ctdb_process_deferred_attach(ctdb);
		return 0;
	}

	state = talloc_zero(ctdb, struct set_recmode_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
		return -1;
	}
	state->ctdb = ctdb;
	state->c = NULL;

	h = ctdb_cluster_mutex(state, ctdb, ctdb->recovery_lock, 5,
			       set_recmode_handler, state, NULL, NULL);
	if (h == NULL) {
		talloc_free(state);
		return -1;
	}

	state->c = talloc_steal(state, c);
	*async_reply = true;

	return 0;
}

/*
  delete a record as part of the vacuum process
  only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
  use non-blocking locks

  return 0 if the record was successfully deleted (i.e. it does not exist
  when the function returns)
  or !0 if the record still exists in the tdb after returning.
 */
static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data_old *rec)
{
	TDB_DATA key, data, data2;
	struct ctdb_ltdb_header *hdr, *hdr2;

	/* these are really internal tdb functions - but we need them here for
	   non-blocking lock of the freelist */
	int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
	int tdb_unlock(struct tdb_context *tdb, int list, int ltype);

	key.dsize = rec->keylen;
	key.dptr  = &rec->data[0];
	data.dsize = rec->datalen;
	data.dptr = &rec->data[rec->keylen];

	if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
		DBG_INFO("Called delete on record where we are lmaster\n");
		return -1;
	}

	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
		DBG_ERR("Bad record size\n");
		return -1;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;

	/* use a non-blocking lock */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
		DBG_INFO("Failed to get non-blocking chain lock\n");
		return -1;
	}

	data2 = tdb_fetch(ctdb_db->ltdb->tdb, key);
	if (data2.dptr == NULL) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		return 0;
	}

	if (data2.dsize < sizeof(struct ctdb_ltdb_header)) {
		if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
			if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
				DBG_ERR("Failed to delete corrupt record\n");
			}
			tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
			DBG_ERR("Deleted corrupt record\n");
		}
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		free(data2.dptr);
		return 0;
	}

	hdr2 = (struct ctdb_ltdb_header *)data2.dptr;

	if (hdr2->rsn > hdr->rsn) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Skipping record with rsn=%llu - called with rsn=%llu\n",
			 (unsigned long long)hdr2->rsn,
			 (unsigned long long)hdr->rsn);
		free(data2.dptr);
		return -1;
	}

	/* do not allow deleting records that have readonly flags set. */
	if (hdr->flags & CTDB_REC_RO_FLAGS) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Skipping record with readonly flags set\n");
		free(data2.dptr);
		return -1;
	}
	if (hdr2->flags & CTDB_REC_RO_FLAGS) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Skipping record with readonly flags set locally\n");
		free(data2.dptr);
		return -1;
	}

	if (hdr2->dmaster == ctdb->pnn) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Attempted delete record where we are the dmaster\n");
		free(data2.dptr);
		return -1;
	}

	if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Failed to get non-blocking freelist lock\n");
		free(data2.dptr);
		return -1;
	}

	if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
		tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DBG_INFO("Failed to delete record\n");
		free(data2.dptr);
		return -1;
	}

	tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
	tdb_chainunlock(ctdb_db->ltdb->tdb, key);
	free(data2.dptr);
	return 0;
}

struct recovery_callback_state {
	struct ctdb_req_control_old *c;
};

/*
  called when the 'recovered' event script has finished
 */
static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
{
	struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);

	CTDB_INCREMENT_STAT(ctdb, num_recoveries);

	if (status != 0) {
		DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
		if (status == -ETIMEDOUT) {
			ctdb_ban_self(ctdb);
		}
	}

	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
	talloc_free(state);

	gettimeofday(&ctdb->last_recovery_finished, NULL);

	if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
		ctdb_set_runstate(ctdb, CTDB_RUNSTATE_STARTUP);
	}
}

/*
  recovery has finished
 */
int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
				  struct ctdb_req_control_old *c,
				  bool *async_reply)
{
	int ret;
	struct recovery_callback_state *state;

	DEBUG(DEBUG_ERR,("Recovery has finished\n"));

	ctdb_persistent_finish_trans3_commits(ctdb);

	state = talloc(ctdb, struct recovery_callback_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->c = c;

	ret = ctdb_event_script_callback(ctdb, state,
					 ctdb_end_recovery_callback,
					 state,
					 CTDB_EVENT_RECOVERED, "%s", "");

	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
		talloc_free(state);
		return -1;
	}

	/* tell the control that we will reply asynchronously */
	state->c = talloc_steal(state, c);
	*async_reply = true;
	return 0;
}

/*
  called when the 'startrecovery' event script has finished
 */
static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
{
	struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);

	if (status != 0) {
		DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
	}

	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
	talloc_free(state);
}

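/*
  Run the 'startrecovery' event scripts; the control reply is sent from
  ctdb_start_recovery_callback() once they complete.
 */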
static void run_start_recovery_event(struct ctdb_context *ctdb,
				     struct recovery_callback_state *state)
{
	int ret;

	ret = ctdb_event_script_callback(ctdb, state,
					 ctdb_start_recovery_callback,
					 state,
					 CTDB_EVENT_START_RECOVERY,
					 "%s", "");

	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Unable to run startrecovery event\n"));
		ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
		talloc_free(state);
		return;
	}

	return;
}

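/*
  Compare the local recovery lock setting with the one reported by the
  recovery master; both being unset counts as equal.
 */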
static bool reclock_strings_equal(const char *a, const char *b)
{
	return (a == NULL && b == NULL) ||
		(a != NULL && b != NULL && strcmp(a, b) == 0);
}

static void start_recovery_reclock_callback(struct ctdb_context *ctdb,
					    int32_t status,
					    TDB_DATA data,
					    const char *errormsg,
					    void *private_data)
{
	struct recovery_callback_state *state = talloc_get_type_abort(
		private_data, struct recovery_callback_state);
	const char *local = ctdb->recovery_lock;
	const char *remote = NULL;

	if (status != 0) {
		DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
		ctdb_request_control_reply(ctdb, state->c, NULL,
					   status, errormsg);
		talloc_free(state);
		return;
	}

	/* Check reclock consistency */
	if (data.dsize > 0) {
		/* Ensure NUL-termination */
		data.dptr[data.dsize-1] = '\0';
		remote = (const char *)data.dptr;
	}
	if (! reclock_strings_equal(local, remote)) {
		/* Inconsistent */
		ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
		DEBUG(DEBUG_ERR,
		      ("Recovery lock configuration inconsistent: "
		       "recmaster has %s, this node has %s, shutting down\n",
		       remote == NULL ? "NULL" : remote,
		       local == NULL ? "NULL" : local));
		talloc_free(state);
		ctdb_shutdown_sequence(ctdb, 1);
	}
	DEBUG(DEBUG_INFO,
	      ("Recovery lock consistency check successful\n"));

	run_start_recovery_event(ctdb, state);
}

/* Check recovery lock consistency and run eventscripts for the
 * "startrecovery" event */
int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
				    struct ctdb_req_control_old *c,
				    bool *async_reply)
{
	int ret;
	struct recovery_callback_state *state;
	uint32_t recmaster = c->hdr.srcnode;

	DEBUG(DEBUG_ERR, ("Recovery has started\n"));
	gettimeofday(&ctdb->last_recovery_started, NULL);

	state = talloc(ctdb, struct recovery_callback_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->c = c;

	/* Although the recovery master sent this node a start
	 * recovery control, this node might still think the recovery
	 * master is disconnected.  In this case defer the recovery
	 * lock consistency check. */
	if (ctdb->nodes[recmaster]->flags & NODE_FLAGS_DISCONNECTED) {
		run_start_recovery_event(ctdb, state);
	} else {
		/* Ask the recovery master about its reclock setting */
		ret = ctdb_daemon_send_control(ctdb,
					       recmaster,
					       0,
					       CTDB_CONTROL_GET_RECLOCK_FILE,
					       0, 0,
					       tdb_null,
					       start_recovery_reclock_callback,
					       state);

		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
			talloc_free(state);
			return -1;
		}
	}

	/* tell the control that we will reply asynchronously */
	state->c = talloc_steal(state, c);
	*async_reply = true;

	return 0;
}

/*
  try to delete all these records as part of the vacuuming process
  and return the records we failed to delete
 */
int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
{
	struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
	struct ctdb_db_context *ctdb_db;
	unsigned int i;
	struct ctdb_rec_data_old *rec;
	struct ctdb_marshall_buffer *records;

	if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, reply->db_id);
	if (!ctdb_db) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
		return -1;
	}

	DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
			   reply->count, reply->db_id));

	/* create a blob to send back the records we couldn't delete */
	records = (struct ctdb_marshall_buffer *)
			talloc_zero_size(outdata,
					 offsetof(struct ctdb_marshall_buffer, data));
	if (records == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
		return -1;
	}
	records->db_id = ctdb_db->db_id;

	rec = (struct ctdb_rec_data_old *)&reply->data[0];
	for (i=0;i<reply->count;i++) {
		TDB_DATA key, data;

		key.dptr = &rec->data[0];
		key.dsize = rec->keylen;
		data.dptr = &rec->data[key.dsize];
		data.dsize = rec->datalen;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
			talloc_free(records);
			return -1;
		}

		/* If we can't delete the record we must add it to the reply
		   so the lmaster knows it may not purge this record
		*/
		if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
			size_t old_size;
			struct ctdb_ltdb_header *hdr;

			hdr = (struct ctdb_ltdb_header *)data.dptr;
			data.dptr += sizeof(*hdr);
			data.dsize -= sizeof(*hdr);

			DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));

			old_size = talloc_get_size(records);
			records = talloc_realloc_size(outdata, records, old_size + rec->length);
			if (records == NULL) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
				return -1;
			}
			records->count++;
			memcpy(old_size+(uint8_t *)records, rec, rec->length);
		}

		rec = (struct ctdb_rec_data_old *)(rec->length + (uint8_t *)rec);
	}

	*outdata = ctdb_marshall_finish(records);

	return 0;
}

/*
  report capabilities
 */
int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
{
	uint32_t *capabilities = NULL;

	capabilities = talloc(outdata, uint32_t);
	CTDB_NO_MEMORY(ctdb, capabilities);
	*capabilities = ctdb->capabilities;

	outdata->dsize = sizeof(uint32_t);
	outdata->dptr = (uint8_t *)capabilities;

	return 0;
}

/* The recovery daemon will ping us at regular intervals.
   If we haven't been pinged for a while we assume the recovery
   daemon is inoperable and we restart.
*/
static void ctdb_recd_ping_timeout(struct tevent_context *ev,
				   struct tevent_timer *te,
				   struct timeval t, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
	uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);

	DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));

	if (*count < ctdb->tunable.recd_ping_failcount) {
		(*count)++;
		tevent_add_timer(ctdb->ev, ctdb->recd_ping_count,
				 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
				 ctdb_recd_ping_timeout, ctdb);
		return;
	}

	DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n"));

	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}

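/*
  Called when the recovery daemon pings us: reset the ping failure
  counter and re-arm the timeout that would restart the recovery daemon.
 */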
int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
{
	talloc_free(ctdb->recd_ping_count);

	ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
	CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);

	if (ctdb->tunable.recd_ping_timeout != 0) {
		tevent_add_timer(ctdb->ev, ctdb->recd_ping_count,
				 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
				 ctdb_recd_ping_timeout, ctdb);
	}

	return 0;
}

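/*
  Make this node inactive: invalidate the database generations, force
  recovery mode to ACTIVE, schedule a database freeze and drop all
  public IP addresses.
 */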
void ctdb_node_become_inactive(struct ctdb_context *ctdb)
{
	struct ctdb_db_context *ctdb_db;

	D_WARNING("Making node INACTIVE\n");

	/*
	 * Do not service database calls - reset generation to invalid
	 * so this node ignores any REQ/REPLY CALL/DMASTER
	 */
	ctdb->vnn_map->generation = INVALID_GENERATION;
	for (ctdb_db = ctdb->db_list; ctdb_db != NULL; ctdb_db = ctdb_db->next) {
		ctdb_db->generation = INVALID_GENERATION;
	}

	/*
	 * Although this bypasses the control, the only thing missing
	 * is the deferred drop of all public IPs, which isn't
	 * necessary because they are dropped below
	 */
	if (ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
		D_NOTICE("Recovery mode set to ACTIVE\n");
		ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
	}

	/*
	 * Initiate database freeze - this will be scheduled for
	 * immediate execution and will be in progress long before the
	 * calling control returns
	 */
	ctdb_daemon_send_control(ctdb,
				 ctdb->pnn,
				 0,
				 CTDB_CONTROL_FREEZE,
				 0,
				 CTDB_CTRL_FLAG_NOREPLY,
				 tdb_null,
				 NULL,
				 NULL);

	D_NOTICE("Dropping all public IP addresses\n");
	ctdb_release_all_ips(ctdb);
}

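/*
  Controls to stop and continue this node: stopping sets
  NODE_FLAGS_STOPPED and makes the node inactive, continuing clears
  the flag.
 */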
int32_t ctdb_control_stop_node(struct ctdb_context *ctdb)
{
	DEBUG(DEBUG_ERR, ("Stopping node\n"));
	ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;

	ctdb_node_become_inactive(ctdb);

	return 0;
}

int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
{
	DEBUG(DEBUG_ERR, ("Continue node\n"));
	ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;

	return 0;
}