/*
   CTDB mutex helper using Ceph librados locks

   Copyright (C) David Disseldorp 2016-2018

   Based on ctdb_mutex_fcntl_helper.c, which is:
   Copyright (C) Martin Schwenke 2015

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
27 #include "rados/librados.h"
/*
 * Lock name, cookie and description handed to the librados lock calls
 * in this file. The cookie is the same string as the lock name.
 */
29 #define CTDB_MUTEX_CEPH_LOCK_NAME "ctdb_reclock_mutex"
30 #define CTDB_MUTEX_CEPH_LOCK_COOKIE CTDB_MUTEX_CEPH_LOCK_NAME
31 #define CTDB_MUTEX_CEPH_LOCK_DESC "CTDB recovery lock"
33 * During failover it may take up to <lock duration> seconds before the
34 * newly elected recovery master can obtain the lock.
/* Lock duration, in seconds, used by main() when no duration argument is given. */
36 #define CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT 10
/*
 * Status strings that main() writes to stdout.
 * NOTE(review): CTDB_MUTEX_STATUS_TIMEOUT is not written anywhere in the
 * visible part of this file - confirm whether it is still needed.
 */
38 #define CTDB_MUTEX_STATUS_HOLDING "0"
39 #define CTDB_MUTEX_STATUS_CONTENDED "1"
40 #define CTDB_MUTEX_STATUS_TIMEOUT "2"
41 #define CTDB_MUTEX_STATUS_ERROR "3"
/*
 * Program name used as a prefix in stderr messages. Starts out NULL.
 * NOTE(review): the assignment is not visible in this listing - presumably
 * set early in main(); confirm.
 */
43 static char *progname
= NULL
;
/*
 * Set up a librados cluster handle and an I/O context for pool_name.
 *
 * Visible steps, in order: rados_create2() with the given cluster and
 * auth names, rados_conf_read_file() with a NULL path, rados_connect(),
 * then rados_ioctx_create() for pool_name. Every step has its own stderr
 * message; the messages for the second and later steps are followed by
 * rados_shutdown() of the cluster handle. The cluster handle is stored
 * through _ceph_cluster.
 *
 * NOTE(review): this listing is missing lines (declaration of ret, the
 * failure checks, the return statements and, presumably, the store
 * through _ioctx) - confirm against the complete file.
 */
45 static int ctdb_mutex_rados_ctx_create(const char *ceph_cluster_name
,
46 const char *ceph_auth_name
,
47 const char *pool_name
,
48 rados_t
*_ceph_cluster
,
49 rados_ioctx_t
*_ioctx
)
/* work on local handles; the output pointer is only written further down */
51 rados_t ceph_cluster
= NULL
;
52 rados_ioctx_t ioctx
= NULL
;
/* the last argument (flags) is passed as 0 */
55 ret
= rados_create2(&ceph_cluster
, ceph_cluster_name
, ceph_auth_name
, 0);
57 fprintf(stderr
, "%s: failed to initialise Ceph cluster %s as %s"
58 " - (%s)\n", progname
, ceph_cluster_name
, ceph_auth_name
,
63 /* path=NULL tells librados to use default locations */
64 ret
= rados_conf_read_file(ceph_cluster
, NULL
);
/* ret is negated before being handed to strerror() */
66 fprintf(stderr
, "%s: failed to parse Ceph cluster config"
67 " - (%s)\n", progname
, strerror(-ret
));
68 rados_shutdown(ceph_cluster
);
72 ret
= rados_connect(ceph_cluster
);
74 fprintf(stderr
, "%s: failed to connect to Ceph cluster %s as %s"
75 " - (%s)\n", progname
, ceph_cluster_name
, ceph_auth_name
,
77 rados_shutdown(ceph_cluster
);
82 ret
= rados_ioctx_create(ceph_cluster
, pool_name
, &ioctx
);
84 fprintf(stderr
, "%s: failed to create Ceph ioctx for pool %s"
85 " - (%s)\n", progname
, pool_name
, strerror(-ret
));
86 rados_shutdown(ceph_cluster
);
/* hand the cluster handle back to the caller */
90 *_ceph_cluster
= ceph_cluster
;
/*
 * Take (or, depending on the flags passed by the caller, renew) the
 * exclusive lock named CTDB_MUTEX_CEPH_LOCK_NAME on RADOS object oid via
 * rados_lock_exclusive().
 *
 * A lock_duration_s of 0 passes a NULL duration to librados; any other
 * value is passed as a struct timeval holding that many whole seconds.
 * -EEXIST and -EBUSY are classified as lock contention, any other
 * negative value as an unexpected failure, which is reported on stderr.
 *
 * NOTE(review): parameter lines (oid and the flags argument) and the
 * return statement are missing from this listing.
 * NOTE(review): ioctx is declared as rados_ioctx_t * but is passed
 * straight to rados_lock_exclusive(), and callers pass cmr_state->ioctx -
 * confirm the intended parameter type.
 */
96 static int ctdb_mutex_rados_lock(rados_ioctx_t
*ioctx
,
98 uint64_t lock_duration_s
,
/* seconds = lock_duration_s, microseconds = 0 */
102 struct timeval tv
= { lock_duration_s
, 0 };
104 ret
= rados_lock_exclusive(ioctx
, oid
,
105 CTDB_MUTEX_CEPH_LOCK_NAME
,
106 CTDB_MUTEX_CEPH_LOCK_COOKIE
,
107 CTDB_MUTEX_CEPH_LOCK_DESC
,
108 lock_duration_s
== 0 ? NULL
: &tv
,
110 if ((ret
== -EEXIST
) || (ret
== -EBUSY
)) {
111 /* lock contention */
113 } else if (ret
< 0) {
114 /* unexpected failure */
116 "%s: Failed to get lock on RADOS object '%s' - (%s)\n",
117 progname
, oid
, strerror(-ret
));
/*
 * Release the lock named CTDB_MUTEX_CEPH_LOCK_NAME (same cookie as used
 * when locking) on RADOS object oid via rados_unlock(). A failure message
 * including strerror(-ret) is written to stderr.
 *
 * NOTE(review): the oid parameter line, the failure check and the return
 * statement are missing from this listing.
 */
125 static int ctdb_mutex_rados_unlock(rados_ioctx_t
*ioctx
,
130 ret
= rados_unlock(ioctx
, oid
,
131 CTDB_MUTEX_CEPH_LOCK_NAME
,
132 CTDB_MUTEX_CEPH_LOCK_COOKIE
);
135 "%s: Failed to drop lock on RADOS object '%s' - (%s)\n",
136 progname
, oid
, strerror(-ret
));
/*
 * Per-process helper state. Allocated with talloc_zero() in main(), which
 * also registers ctdb_mutex_rados_state_destroy() as its talloc destructor.
 *
 * NOTE(review): other code in this file also uses the members ioctx,
 * object, ppid and holding_mutex; their declarations are missing from
 * this listing.
 */
143 struct ctdb_mutex_rados_state
{
/* taken from argv[1], argv[2] and argv[3] in main() */
145 const char *ceph_cluster_name
;
146 const char *ceph_auth_name
;
147 const char *pool_name
;
/* lock duration in seconds; 0 makes the lock call pass a NULL duration */
149 uint64_t lock_duration_s
;
/* tevent context and the events registered on it by main() and the callbacks */
151 struct tevent_context
*ev
;
152 struct tevent_signal
*sigterm_ev
;
153 struct tevent_signal
*sigint_ev
;
154 struct tevent_timer
*ppid_timer_ev
;
155 struct tevent_timer
*renew_timer_ev
;
/* cluster handle filled in by ctdb_mutex_rados_ctx_create(); shut down by the destructor when non-NULL */
156 rados_t ceph_cluster
;
/*
 * tevent signal handler; main() registers it for both SIGTERM and SIGINT.
 *
 * private_data is the struct ctdb_mutex_rados_state. A message is printed
 * to stderr if the handler runs while holding_mutex is false. The state is
 * released with talloc_free(), which triggers the destructor registered
 * in main().
 *
 * NOTE(review): the remaining parameter lines and the statements that
 * end the process are missing from this listing.
 */
160 static void ctdb_mutex_rados_sigterm_cb(struct tevent_context
*ev
,
161 struct tevent_signal
*se
,
167 struct ctdb_mutex_rados_state
*cmr_state
= private_data
;
170 if (!cmr_state
->holding_mutex
) {
171 fprintf(stderr
, "Sigterm callback invoked without mutex!\n");
175 talloc_free(cmr_state
);
/*
 * tevent timer handler that checks whether the parent process recorded in
 * cmr_state->ppid still exists.
 *
 * The parent counts as gone only when kill(ppid, 0) fails with ESRCH; a
 * successful call, or a failure with any other errno, counts as parent
 * still present. In that case the timer is re-armed 5 seconds ahead with
 * this same handler. When the parent is gone, the state is freed with
 * talloc_free(), which drops the lock through the destructor.
 *
 * NOTE(review): the private_data parameter line, the err_ctx_cleanup
 * label, the return statements and the exit call are missing from this
 * listing.
 */
179 static void ctdb_mutex_rados_ppid_timer_cb(struct tevent_context
*ev
,
180 struct tevent_timer
*te
,
181 struct timeval current_time
,
184 struct ctdb_mutex_rados_state
*cmr_state
= private_data
;
/* this timer is only expected to fire while the lock is held */
187 if (!cmr_state
->holding_mutex
) {
188 fprintf(stderr
, "Timer callback invoked without mutex!\n");
190 goto err_ctx_cleanup
;
/* signal 0 sends nothing; it only probes for the existence of the pid */
193 if ((kill(cmr_state
->ppid
, 0) == 0) || (errno
!= ESRCH
)) {
194 /* parent still around, keep waiting */
195 cmr_state
->ppid_timer_ev
= tevent_add_timer(cmr_state
->ev
,
197 tevent_timeval_current_ofs(5, 0),
198 ctdb_mutex_rados_ppid_timer_cb
,
200 if (cmr_state
->ppid_timer_ev
== NULL
) {
201 fprintf(stderr
, "Failed to create timer event\n");
202 /* rely on signal cb */
207 /* parent ended, drop lock (via destructor) and exit */
209 talloc_free(cmr_state
);
/* Microseconds per second; used to express half the lock duration as a microsecond offset. */
213 #define USECS_IN_SEC 1000000
/*
 * tevent timer handler that renews the RADOS lock.
 *
 * Calls ctdb_mutex_rados_lock() with LIBRADOS_LOCK_FLAG_RENEW and the
 * configured lock duration. Contention or any other failure jumps to
 * err_ctx_cleanup, where the state is freed (dropping the lock through
 * the destructor). On success the timer is re-armed with this same
 * handler at an offset of lock_duration_s * (USECS_IN_SEC / 2)
 * microseconds, i.e. half the lock duration.
 *
 * NOTE(review): the private_data parameter line, local declarations, the
 * contention check, the err_ctx_cleanup label and the exit call are
 * missing from this listing.
 * NOTE(review): the half-duration product is passed as the microsecond
 * argument of tevent_timeval_current_ofs(); confirm it cannot exceed the
 * range of that parameter for large lock durations.
 */
215 static void ctdb_mutex_rados_lock_renew_timer_cb(struct tevent_context
*ev
,
216 struct tevent_timer
*te
,
217 struct timeval current_time
,
220 struct ctdb_mutex_rados_state
*cmr_state
= private_data
;
/* re-take the lock already held, flagged as a renewal */
224 ret
= ctdb_mutex_rados_lock(cmr_state
->ioctx
, cmr_state
->object
,
225 cmr_state
->lock_duration_s
,
226 LIBRADOS_LOCK_FLAG_RENEW
);
228 /* should never get -EEXIST on renewal */
229 fprintf(stderr
, "Lock contention during renew: %d\n", ret
);
230 goto err_ctx_cleanup
;
231 } else if (ret
< 0) {
232 fprintf(stderr
, "Lock renew failed\n");
233 goto err_ctx_cleanup
;
/* schedule the next renewal half a lock duration from now */
236 tv
= tevent_timeval_current_ofs(0,
237 cmr_state
->lock_duration_s
* (USECS_IN_SEC
/ 2));
238 cmr_state
->renew_timer_ev
= tevent_add_timer(cmr_state
->ev
,
241 ctdb_mutex_rados_lock_renew_timer_cb
,
243 if (cmr_state
->renew_timer_ev
== NULL
) {
244 fprintf(stderr
, "Failed to create timer event\n");
245 goto err_ctx_cleanup
;
251 /* drop lock (via destructor) and exit */
252 talloc_free(cmr_state
);
/*
 * talloc destructor for struct ctdb_mutex_rados_state (registered in
 * main() with talloc_set_destructor()).
 *
 * Teardown order: release the RADOS lock if holding_mutex is set, destroy
 * the I/O context if it is non-NULL, then shut down the cluster handle if
 * it is non-NULL. The return value of ctdb_mutex_rados_unlock() is not
 * used on the visible line.
 *
 * NOTE(review): braces and the return statement are missing from this
 * listing.
 */
256 static int ctdb_mutex_rados_state_destroy(struct ctdb_mutex_rados_state
*cmr_state
)
258 if (cmr_state
->holding_mutex
) {
259 ctdb_mutex_rados_unlock(cmr_state
->ioctx
, cmr_state
->object
);
261 if (cmr_state
->ioctx
!= NULL
) {
262 rados_ioctx_destroy(cmr_state
->ioctx
);
264 if (cmr_state
->ceph_cluster
!= NULL
) {
265 rados_shutdown(cmr_state
->ceph_cluster
);
/*
 * Entry point.
 *
 * Accepts four mandatory arguments (Ceph cluster, Ceph user, RADOS pool,
 * RADOS object) and an optional fifth (lock duration in seconds), as
 * printed by the usage message. One of the CTDB_MUTEX_STATUS_* strings is
 * written to stdout to report the outcome; diagnostics go to stderr.
 *
 * Visible sequence: switch stdout to unbuffered mode, allocate the state
 * and register its destructor, parse the arguments, record the parent
 * pid, create the tevent context, register the SIGTERM and SIGINT
 * handlers and the parent-check timer, connect to Ceph, take the lock,
 * arm the renewal timer when the lock duration is non-zero, report
 * HOLDING and enter the tevent loop.
 *
 * NOTE(review): lines are missing from this listing (local declarations,
 * several conditions and braces, the err_ctx_cleanup label and the
 * return statements), so exit codes cannot be confirmed from here.
 */
270 int main(int argc
, char *argv
[])
273 struct ctdb_mutex_rados_state
*cmr_state
;
/* argc counts the program name, so 5 or 6 means four or five arguments */
277 if ((argc
!= 5) && (argc
!= 6)) {
278 fprintf(stderr
, "Usage: %s <Ceph Cluster> <Ceph user> "
279 "<RADOS pool> <RADOS object> "
280 "[lock duration secs]\n",
/* _IONBF: status strings are written without stdio buffering */
286 ret
= setvbuf(stdout
, NULL
, _IONBF
, 0);
288 fprintf(stderr
, "Failed to configure unbuffered stdout I/O\n");
/* zero-initialised state; no talloc parent */
291 cmr_state
= talloc_zero(NULL
, struct ctdb_mutex_rados_state
);
292 if (cmr_state
== NULL
) {
293 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
/* from here on, freeing cmr_state runs ctdb_mutex_rados_state_destroy() */
298 talloc_set_destructor(cmr_state
, ctdb_mutex_rados_state_destroy
);
299 cmr_state
->ceph_cluster_name
= argv
[1];
300 cmr_state
->ceph_auth_name
= argv
[2];
301 cmr_state
->pool_name
= argv
[3];
302 cmr_state
->object
= argv
[4];
304 /* optional lock duration provided */
/*
 * Base 0 lets strtoull() pick the base from the prefix of the string.
 * Input that is empty or has trailing characters is rejected below.
 * NOTE(review): no overflow (ERANGE) check is visible here - confirm.
 */
306 cmr_state
->lock_duration_s
= strtoull(argv
[5], &endptr
, 0);
307 if ((endptr
== argv
[5]) || (*endptr
!= '\0')) {
308 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
310 goto err_ctx_cleanup
;
313 cmr_state
->lock_duration_s
314 = CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT
;
/* remember the parent pid; the parent-check timer callback probes it */
317 cmr_state
->ppid
= getppid();
318 if (cmr_state
->ppid
== 1) {
320 * The original parent is gone and the process has
321 * been reparented to init. This can happen if the
322 * helper is started just as the parent is killed
323 * during shutdown. The error message doesn't need to
324 * be stellar, since there won't be anything around to
325 * capture and log it...
327 fprintf(stderr
, "%s: PPID == 1\n", progname
);
329 goto err_ctx_cleanup
;
/* the tevent context is allocated as a talloc child of cmr_state */
332 cmr_state
->ev
= tevent_context_init(cmr_state
);
333 if (cmr_state
->ev
== NULL
) {
334 fprintf(stderr
, "tevent_context_init failed\n");
335 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
337 goto err_ctx_cleanup
;
340 /* wait for sigterm */
341 cmr_state
->sigterm_ev
= tevent_add_signal(cmr_state
->ev
, cmr_state
, SIGTERM
, 0,
342 ctdb_mutex_rados_sigterm_cb
,
344 if (cmr_state
->sigterm_ev
== NULL
) {
345 fprintf(stderr
, "Failed to create term signal event\n");
346 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
348 goto err_ctx_cleanup
;
/* SIGINT shares the SIGTERM handler */
351 cmr_state
->sigint_ev
= tevent_add_signal(cmr_state
->ev
, cmr_state
, SIGINT
, 0,
352 ctdb_mutex_rados_sigterm_cb
,
354 if (cmr_state
->sigint_ev
== NULL
) {
355 fprintf(stderr
, "Failed to create int signal event\n");
356 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
358 goto err_ctx_cleanup
;
361 /* periodically check parent */
362 cmr_state
->ppid_timer_ev
= tevent_add_timer(cmr_state
->ev
, cmr_state
,
363 tevent_timeval_current_ofs(5, 0),
364 ctdb_mutex_rados_ppid_timer_cb
,
366 if (cmr_state
->ppid_timer_ev
== NULL
) {
367 fprintf(stderr
, "Failed to create timer event\n");
368 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
370 goto err_ctx_cleanup
;
/* connect to the cluster; the handle is stored in cmr_state->ceph_cluster */
373 ret
= ctdb_mutex_rados_ctx_create(cmr_state
->ceph_cluster_name
,
374 cmr_state
->ceph_auth_name
,
375 cmr_state
->pool_name
,
376 &cmr_state
->ceph_cluster
,
379 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
380 goto err_ctx_cleanup
;
/* initial lock attempt; contention and other failures get different status strings */
383 ret
= ctdb_mutex_rados_lock(cmr_state
->ioctx
, cmr_state
->object
,
384 cmr_state
->lock_duration_s
,
386 if ((ret
== -EEXIST
) || (ret
== -EBUSY
)) {
387 fprintf(stdout
, CTDB_MUTEX_STATUS_CONTENDED
);
388 goto err_ctx_cleanup
;
389 } else if (ret
< 0) {
390 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
391 goto err_ctx_cleanup
;
/* once set, the destructor releases the lock when cmr_state is freed */
393 cmr_state
->holding_mutex
= true;
/* a duration of 0 is locked with a NULL duration, so no renewal timer is armed */
395 if (cmr_state
->lock_duration_s
!= 0) {
397 * renew (reobtain) the lock, using a period of half the lock
398 * duration. Convert to usecs to avoid rounding.
400 struct timeval tv
= tevent_timeval_current_ofs(0,
401 cmr_state
->lock_duration_s
* (USECS_IN_SEC
/ 2));
402 cmr_state
->renew_timer_ev
= tevent_add_timer(cmr_state
->ev
,
405 ctdb_mutex_rados_lock_renew_timer_cb
,
407 if (cmr_state
->renew_timer_ev
== NULL
) {
408 fprintf(stderr
, "Failed to create timer event\n");
409 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
411 goto err_ctx_cleanup
;
/* report success only after the lock is held and all events are registered */
415 fprintf(stdout
, CTDB_MUTEX_STATUS_HOLDING
);
417 /* wait for the signal / timer events to do their work */
418 ret
= tevent_loop_wait(cmr_state
->ev
);
420 goto err_ctx_cleanup
;
/* freeing the state runs the destructor (unlock, ioctx destroy, cluster shutdown) */
423 talloc_free(cmr_state
);