From d497d7e4a796c6bda153e7f10fe9478724b855ab Mon Sep 17 00:00:00 2001 From: "Roland C. Dowdeswell" Date: Fri, 10 May 2019 19:13:26 +0100 Subject: [PATCH] krb5_sendto_kdc: failover for multiple AAAA/A RRs on one domain We found that the libraries behaviour when dealing with domains with more than one entry in them is slightly suboptimal. The situation was kdc1 IN A 1.2.3.4 kdc1 IN AAAA ff02::1 I.e. a single hostmame with both IPv6 and IPv4 addresses. When we run krb5_sendto_kdc on a box with only IPv4 addresses, there is a 3s delay before it fails back to the IPv4 address. This is because the library sets the 2nd address on each hostname to be 3s in the future and each additional one another 3s. We change wait_response() s.t. if one is able to make progress, we iterate over the list of hosts and move them all 1s forward. We also modify submit_request() to skip hosts if host_connect() fails. --- lib/krb5/send_to_kdc.c | 68 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/lib/krb5/send_to_kdc.c b/lib/krb5/send_to_kdc.c index 14dc77cab..123a20d47 100644 --- a/lib/krb5/send_to_kdc.c +++ b/lib/krb5/send_to_kdc.c @@ -329,6 +329,7 @@ static void debug_host(krb5_context context, int level, struct host *host, const char *fmt, ...) { const char *proto = "unknown"; + const char *state; char name[NI_MAXHOST], port[NI_MAXSERV]; char *text = NULL; va_list ap; @@ -354,8 +355,17 @@ debug_host(krb5_context context, int level, struct host *host, const char *fmt, name, sizeof(name), port, sizeof(port), NI_NUMERICHOST) != 0) name[0] = '\0'; - _krb5_debug(context, level, "%s: %s %s:%s (%s) tid: %08x", text, - proto, name, port, host->hi->hostname, host->tid); + switch (host->state) { + case CONNECT: state = "CONNECT"; break; + case CONNECTING: state = "CONNECTING"; break; + case CONNECTED: state = "CONNECTED"; break; + case WAITING_REPLY: state = "WAITING_REPLY"; break; + case DEAD: state = "DEAD"; break; + default: state = "unknown"; break; + } + + _krb5_debug(context, level, "%s: %s %s:%s (%s) state=%s tid: %08x", text, + proto, name, port, host->hi->hostname, state, host->tid); free(text); } @@ -896,11 +906,18 @@ submit_request(krb5_context context, krb5_sendto_ctx ctx, krb5_krbhst_info *hi) host->tries = host->fun->ntries; /* - * Connect directly next host, wait a host_timeout for each next address + * Connect directly next host, wait a host_timeout for each next address. + * We try host_connect() here, checking the return code because as we do + * non-blocking connects, any error here indicates that the address is just + * offline. That is, it's something like "No route to host" which is not + * worth retrying. And so, we fail directly and immediately to the next + * address for this host without enqueueing the address for retries. */ - if (submitted_host == 0) + if (submitted_host == 0) { host_connect(context, ctx, host); - else { + if (host->state == DEAD) + continue; + } else { debug_host(context, 5, host, "Queuing host in future (in %ds), its the %lu address on the same name", (int)(context->host_timeout * submitted_host), submitted_host + 1); @@ -908,16 +925,14 @@ submit_request(krb5_context context, krb5_sendto_ctx ctx, krb5_krbhst_info *hi) } heim_array_append_value(ctx->hosts, host); - heim_release(host); - submitted_host++; } if (freeai) freeaddrinfo(ai); - if (!submitted_host) + if (submitted_host == 0) return KRB5_KDC_UNREACH; return 0; @@ -928,7 +943,7 @@ struct wait_ctx { krb5_sendto_ctx ctx; fd_set rfds; fd_set wfds; - unsigned max_fd; + int max_fd; int got_reply; time_t timenow; }; @@ -939,16 +954,16 @@ wait_setup(heim_object_t obj, void *iter_ctx, int *stop) struct wait_ctx *wait_ctx = iter_ctx; struct host *h = (struct host *)obj; + if (h->state == CONNECT) { + if (h->timeout >= wait_ctx->timenow) + return; + host_connect(wait_ctx->context, wait_ctx->ctx, h); + } + /* skip dead hosts */ if (h->state == DEAD) return; - if (h->state == CONNECT) { - if (h->timeout < wait_ctx->timenow) - host_connect(wait_ctx->context, wait_ctx->ctx, h); - return; - } - /* if host timed out, dec tries and (retry or kill host) */ if (h->timeout < wait_ctx->timenow) { heim_assert(h->tries != 0, "tries should not reach 0"); @@ -976,6 +991,7 @@ wait_setup(heim_object_t obj, void *iter_ctx, int *stop) FD_SET(h->fd, &wait_ctx->wfds); break; default: + debug_host(wait_ctx->context, 5, h, "invalid sendto host state"); heim_abort("invalid sendto host state"); } if (h->fd > wait_ctx->max_fd) @@ -990,6 +1006,15 @@ wait_filter_dead(heim_object_t obj, void *ctx) } static void +wait_accelerate(heim_object_t obj, void *ctx, int *stop) +{ + struct host *h = (struct host *)obj; + + if (h->state == CONNECT && h->timeout > 0) + h->timeout--; +} + +static void wait_process(heim_object_t obj, void *ctx, int *stop) { struct wait_ctx *wait_ctx = ctx; @@ -1022,7 +1047,7 @@ wait_response(krb5_context context, int *action, krb5_sendto_ctx ctx) wait_ctx.ctx = ctx; FD_ZERO(&wait_ctx.rfds); FD_ZERO(&wait_ctx.wfds); - wait_ctx.max_fd = 0; + wait_ctx.max_fd = -1; /* oh, we have a reply, it must be a plugin that got it for us */ if (ctx->response.length) { @@ -1048,6 +1073,17 @@ wait_response(krb5_context context, int *action, krb5_sendto_ctx ctx) return 0; } + if (wait_ctx.max_fd == -1) { + /* + * If we don't find a host which can make progress, then + * we accelerate the process by moving all of the contestants + * up by 1s. + */ + _krb5_debug(context, 5, "wait_response: moving the contestants forward"); + heim_array_iterate_f(ctx->hosts, &wait_ctx, wait_accelerate); + return 0; + } + tv.tv_sec = 1; tv.tv_usec = 0; -- 2.11.4.GIT