/*
 * ctdb_mutex_ceph_rados_helper: fix deadlock via lock renewals
 * Samba.git: ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
 * blob 7ef76c26e02826eac43f49d37d69a244cb16ab40
 */
/*
   CTDB mutex helper using Ceph librados locks

   Copyright (C) David Disseldorp 2016-2018

   Based on ctdb_mutex_fcntl_helper.c, which is:
   Copyright (C) Martin Schwenke 2015

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "replace.h"

#include "tevent.h"
#include "talloc.h"
#include "rados/librados.h"

#define CTDB_MUTEX_CEPH_LOCK_NAME	"ctdb_reclock_mutex"
#define CTDB_MUTEX_CEPH_LOCK_COOKIE	CTDB_MUTEX_CEPH_LOCK_NAME
#define CTDB_MUTEX_CEPH_LOCK_DESC	"CTDB recovery lock"

/*
 * During failover it may take up to <lock duration> seconds before the
 * newly elected recovery master can obtain the lock.
 */
#define CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT	10

#define CTDB_MUTEX_STATUS_HOLDING "0"
#define CTDB_MUTEX_STATUS_CONTENDED "1"
#define CTDB_MUTEX_STATUS_TIMEOUT "2"
#define CTDB_MUTEX_STATUS_ERROR "3"
43 static char *progname = NULL;
45 static int ctdb_mutex_rados_ctx_create(const char *ceph_cluster_name,
46 const char *ceph_auth_name,
47 const char *pool_name,
48 rados_t *_ceph_cluster,
49 rados_ioctx_t *_ioctx)
51 rados_t ceph_cluster = NULL;
52 rados_ioctx_t ioctx = NULL;
53 int ret;
55 ret = rados_create2(&ceph_cluster, ceph_cluster_name, ceph_auth_name, 0);
56 if (ret < 0) {
57 fprintf(stderr, "%s: failed to initialise Ceph cluster %s as %s"
58 " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
59 strerror(-ret));
60 return ret;
63 /* path=NULL tells librados to use default locations */
64 ret = rados_conf_read_file(ceph_cluster, NULL);
65 if (ret < 0) {
66 fprintf(stderr, "%s: failed to parse Ceph cluster config"
67 " - (%s)\n", progname, strerror(-ret));
68 rados_shutdown(ceph_cluster);
69 return ret;
72 ret = rados_connect(ceph_cluster);
73 if (ret < 0) {
74 fprintf(stderr, "%s: failed to connect to Ceph cluster %s as %s"
75 " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
76 strerror(-ret));
77 rados_shutdown(ceph_cluster);
78 return ret;
82 ret = rados_ioctx_create(ceph_cluster, pool_name, &ioctx);
83 if (ret < 0) {
84 fprintf(stderr, "%s: failed to create Ceph ioctx for pool %s"
85 " - (%s)\n", progname, pool_name, strerror(-ret));
86 rados_shutdown(ceph_cluster);
87 return ret;
90 *_ceph_cluster = ceph_cluster;
91 *_ioctx = ioctx;
93 return 0;
96 static int ctdb_mutex_rados_lock(rados_ioctx_t *ioctx,
97 const char *oid,
98 uint64_t lock_duration_s,
99 uint8_t flags)
101 int ret;
102 struct timeval tv = { lock_duration_s, 0 };
104 ret = rados_lock_exclusive(ioctx, oid,
105 CTDB_MUTEX_CEPH_LOCK_NAME,
106 CTDB_MUTEX_CEPH_LOCK_COOKIE,
107 CTDB_MUTEX_CEPH_LOCK_DESC,
108 lock_duration_s == 0 ? NULL : &tv,
109 flags);
110 if ((ret == -EEXIST) || (ret == -EBUSY)) {
111 /* lock contention */
112 return ret;
113 } else if (ret < 0) {
114 /* unexpected failure */
115 fprintf(stderr,
116 "%s: Failed to get lock on RADOS object '%s' - (%s)\n",
117 progname, oid, strerror(-ret));
118 return ret;
121 /* lock obtained */
122 return 0;
125 static int ctdb_mutex_rados_unlock(rados_ioctx_t *ioctx,
126 const char *oid)
128 int ret;
130 ret = rados_unlock(ioctx, oid,
131 CTDB_MUTEX_CEPH_LOCK_NAME,
132 CTDB_MUTEX_CEPH_LOCK_COOKIE);
133 if (ret < 0) {
134 fprintf(stderr,
135 "%s: Failed to drop lock on RADOS object '%s' - (%s)\n",
136 progname, oid, strerror(-ret));
137 return ret;
140 return 0;
143 struct ctdb_mutex_rados_state {
144 bool holding_mutex;
145 const char *ceph_cluster_name;
146 const char *ceph_auth_name;
147 const char *pool_name;
148 const char *object;
149 uint64_t lock_duration_s;
150 int ppid;
151 struct tevent_context *ev;
152 struct tevent_signal *sigterm_ev;
153 struct tevent_signal *sigint_ev;
154 struct tevent_timer *ppid_timer_ev;
155 struct tevent_timer *renew_timer_ev;
156 rados_t ceph_cluster;
157 rados_ioctx_t ioctx;
160 static void ctdb_mutex_rados_sigterm_cb(struct tevent_context *ev,
161 struct tevent_signal *se,
162 int signum,
163 int count,
164 void *siginfo,
165 void *private_data)
167 struct ctdb_mutex_rados_state *cmr_state = private_data;
168 int ret = 0;
170 if (!cmr_state->holding_mutex) {
171 fprintf(stderr, "Sigterm callback invoked without mutex!\n");
172 ret = -EINVAL;
175 talloc_free(cmr_state);
176 exit(ret ? 1 : 0);
179 static void ctdb_mutex_rados_ppid_timer_cb(struct tevent_context *ev,
180 struct tevent_timer *te,
181 struct timeval current_time,
182 void *private_data)
184 struct ctdb_mutex_rados_state *cmr_state = private_data;
185 int ret = 0;
187 if (!cmr_state->holding_mutex) {
188 fprintf(stderr, "Timer callback invoked without mutex!\n");
189 ret = -EINVAL;
190 goto err_ctx_cleanup;
193 if ((kill(cmr_state->ppid, 0) == 0) || (errno != ESRCH)) {
194 /* parent still around, keep waiting */
195 cmr_state->ppid_timer_ev = tevent_add_timer(cmr_state->ev,
196 cmr_state,
197 tevent_timeval_current_ofs(5, 0),
198 ctdb_mutex_rados_ppid_timer_cb,
199 cmr_state);
200 if (cmr_state->ppid_timer_ev == NULL) {
201 fprintf(stderr, "Failed to create timer event\n");
202 /* rely on signal cb */
204 return;
207 /* parent ended, drop lock (via destructor) and exit */
208 err_ctx_cleanup:
209 talloc_free(cmr_state);
210 exit(ret ? 1 : 0);
213 #define USECS_IN_SEC 1000000
215 static void ctdb_mutex_rados_lock_renew_timer_cb(struct tevent_context *ev,
216 struct tevent_timer *te,
217 struct timeval current_time,
218 void *private_data)
220 struct ctdb_mutex_rados_state *cmr_state = private_data;
221 struct timeval tv;
222 int ret;
224 ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object,
225 cmr_state->lock_duration_s,
226 LIBRADOS_LOCK_FLAG_RENEW);
227 if (ret == -EBUSY) {
228 /* should never get -EEXIST on renewal */
229 fprintf(stderr, "Lock contention during renew: %d\n", ret);
230 goto err_ctx_cleanup;
231 } else if (ret < 0) {
232 fprintf(stderr, "Lock renew failed\n");
233 goto err_ctx_cleanup;
236 tv = tevent_timeval_current_ofs(0,
237 cmr_state->lock_duration_s * (USECS_IN_SEC / 2));
238 cmr_state->renew_timer_ev = tevent_add_timer(cmr_state->ev,
239 cmr_state,
241 ctdb_mutex_rados_lock_renew_timer_cb,
242 cmr_state);
243 if (cmr_state->renew_timer_ev == NULL) {
244 fprintf(stderr, "Failed to create timer event\n");
245 goto err_ctx_cleanup;
248 return;
250 err_ctx_cleanup:
251 /* drop lock (via destructor) and exit */
252 talloc_free(cmr_state);
253 exit(1);
256 static int ctdb_mutex_rados_state_destroy(struct ctdb_mutex_rados_state *cmr_state)
258 if (cmr_state->holding_mutex) {
259 ctdb_mutex_rados_unlock(cmr_state->ioctx, cmr_state->object);
261 if (cmr_state->ioctx != NULL) {
262 rados_ioctx_destroy(cmr_state->ioctx);
264 if (cmr_state->ceph_cluster != NULL) {
265 rados_shutdown(cmr_state->ceph_cluster);
267 return 0;
270 int main(int argc, char *argv[])
272 int ret;
273 struct ctdb_mutex_rados_state *cmr_state;
275 progname = argv[0];
277 if ((argc != 5) && (argc != 6)) {
278 fprintf(stderr, "Usage: %s <Ceph Cluster> <Ceph user> "
279 "<RADOS pool> <RADOS object> "
280 "[lock duration secs]\n",
281 progname);
282 ret = -EINVAL;
283 goto err_out;
286 ret = setvbuf(stdout, NULL, _IONBF, 0);
287 if (ret != 0) {
288 fprintf(stderr, "Failed to configure unbuffered stdout I/O\n");
291 cmr_state = talloc_zero(NULL, struct ctdb_mutex_rados_state);
292 if (cmr_state == NULL) {
293 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
294 ret = -ENOMEM;
295 goto err_out;
298 talloc_set_destructor(cmr_state, ctdb_mutex_rados_state_destroy);
299 cmr_state->ceph_cluster_name = argv[1];
300 cmr_state->ceph_auth_name = argv[2];
301 cmr_state->pool_name = argv[3];
302 cmr_state->object = argv[4];
303 if (argc == 6) {
304 /* optional lock duration provided */
305 char *endptr = NULL;
306 cmr_state->lock_duration_s = strtoull(argv[5], &endptr, 0);
307 if ((endptr == argv[5]) || (*endptr != '\0')) {
308 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
309 ret = -EINVAL;
310 goto err_ctx_cleanup;
312 } else {
313 cmr_state->lock_duration_s
314 = CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT;
317 cmr_state->ppid = getppid();
318 if (cmr_state->ppid == 1) {
320 * The original parent is gone and the process has
321 * been reparented to init. This can happen if the
322 * helper is started just as the parent is killed
323 * during shutdown. The error message doesn't need to
324 * be stellar, since there won't be anything around to
325 * capture and log it...
327 fprintf(stderr, "%s: PPID == 1\n", progname);
328 ret = -EPIPE;
329 goto err_ctx_cleanup;
332 cmr_state->ev = tevent_context_init(cmr_state);
333 if (cmr_state->ev == NULL) {
334 fprintf(stderr, "tevent_context_init failed\n");
335 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
336 ret = -ENOMEM;
337 goto err_ctx_cleanup;
340 /* wait for sigterm */
341 cmr_state->sigterm_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGTERM, 0,
342 ctdb_mutex_rados_sigterm_cb,
343 cmr_state);
344 if (cmr_state->sigterm_ev == NULL) {
345 fprintf(stderr, "Failed to create term signal event\n");
346 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
347 ret = -ENOMEM;
348 goto err_ctx_cleanup;
351 cmr_state->sigint_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGINT, 0,
352 ctdb_mutex_rados_sigterm_cb,
353 cmr_state);
354 if (cmr_state->sigint_ev == NULL) {
355 fprintf(stderr, "Failed to create int signal event\n");
356 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
357 ret = -ENOMEM;
358 goto err_ctx_cleanup;
361 /* periodically check parent */
362 cmr_state->ppid_timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
363 tevent_timeval_current_ofs(5, 0),
364 ctdb_mutex_rados_ppid_timer_cb,
365 cmr_state);
366 if (cmr_state->ppid_timer_ev == NULL) {
367 fprintf(stderr, "Failed to create timer event\n");
368 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
369 ret = -ENOMEM;
370 goto err_ctx_cleanup;
373 ret = ctdb_mutex_rados_ctx_create(cmr_state->ceph_cluster_name,
374 cmr_state->ceph_auth_name,
375 cmr_state->pool_name,
376 &cmr_state->ceph_cluster,
377 &cmr_state->ioctx);
378 if (ret < 0) {
379 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
380 goto err_ctx_cleanup;
383 ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object,
384 cmr_state->lock_duration_s,
386 if ((ret == -EEXIST) || (ret == -EBUSY)) {
387 fprintf(stdout, CTDB_MUTEX_STATUS_CONTENDED);
388 goto err_ctx_cleanup;
389 } else if (ret < 0) {
390 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
391 goto err_ctx_cleanup;
393 cmr_state->holding_mutex = true;
395 if (cmr_state->lock_duration_s != 0) {
397 * renew (reobtain) the lock, using a period of half the lock
398 * duration. Convert to usecs to avoid rounding.
400 struct timeval tv = tevent_timeval_current_ofs(0,
401 cmr_state->lock_duration_s * (USECS_IN_SEC / 2));
402 cmr_state->renew_timer_ev = tevent_add_timer(cmr_state->ev,
403 cmr_state,
405 ctdb_mutex_rados_lock_renew_timer_cb,
406 cmr_state);
407 if (cmr_state->renew_timer_ev == NULL) {
408 fprintf(stderr, "Failed to create timer event\n");
409 fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
410 ret = -ENOMEM;
411 goto err_ctx_cleanup;
415 fprintf(stdout, CTDB_MUTEX_STATUS_HOLDING);
417 /* wait for the signal / timer events to do their work */
418 ret = tevent_loop_wait(cmr_state->ev);
419 if (ret < 0) {
420 goto err_ctx_cleanup;
422 err_ctx_cleanup:
423 talloc_free(cmr_state);
424 err_out:
425 return ret ? 1 : 0;