dns query core: detect udp truncation at recv time
[musl.git] / src / network / res_msend.c
blob11c6aa0edfc43eca7db0c4307c49c93a459d6a83
1 #include <sys/socket.h>
2 #include <netinet/in.h>
3 #include <netinet/tcp.h>
4 #include <netdb.h>
5 #include <arpa/inet.h>
6 #include <stdint.h>
7 #include <string.h>
8 #include <poll.h>
9 #include <time.h>
10 #include <ctype.h>
11 #include <unistd.h>
12 #include <errno.h>
13 #include <pthread.h>
14 #include "stdio_impl.h"
15 #include "syscall.h"
16 #include "lookup.h"
18 static void cleanup(void *p)
20 struct pollfd *pfd = p;
21 for (int i=0; pfd[i].fd >= -1; i++)
22 if (pfd[i].fd >= 0) __syscall(SYS_close, pfd[i].fd);
/* Return a timestamp in milliseconds, intended only for computing
 * elapsed-time differences (timeouts and retry intervals).
 *
 * The monotonic clock is used so that stepping the wall clock while a
 * query is in flight cannot shorten or extend the timeout. The realtime
 * clock is consulted only if the monotonic clock is unavailable (ENOSYS).
 * The result may wrap where unsigned long is 32 bits; callers are expected
 * to compare timestamps by unsigned subtraction. */
static unsigned long mtime(void)
{
	struct timespec ts;
	if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0 && errno == ENOSYS)
		clock_gettime(CLOCK_REALTIME, &ts);
	return (unsigned long)ts.tv_sec * 1000
		+ ts.tv_nsec / 1000000;
}
/* Create a non-blocking TCP socket for the nameserver address sa (length
 * sl, address family `family`) and start delivering the query q of ql
 * bytes, framed with a 2-byte big-endian length prefix.
 *
 * The new descriptor is stored in pfd->fd with pfd->events = POLLOUT.
 * If TCP_FASTOPEN_CONNECT can be enabled, the framed query is submitted
 * right away with MSG_FASTOPEN; when the whole frame was accepted,
 * pfd->events is switched to POLLIN. Otherwise an ordinary non-blocking
 * connect() is started.
 *
 * Returns the number of bytes of the framed query already sent (0 when
 * nothing has been sent yet and the connection is established or in
 * progress), or -1 on failure, in which case the socket is closed and
 * pfd->fd is reset to -1. */
static int start_tcp(struct pollfd *pfd, int family, const void *sa, socklen_t sl, const unsigned char *q, int ql)
{
	uint8_t prefix[2] = { ql>>8, ql };
	struct iovec parts[2] = {
		{ .iov_base = prefix, .iov_len = 2 },
		{ .iov_base = (void *)q, .iov_len = ql }
	};
	struct msghdr msg = {
		.msg_name = (void *)sa,
		.msg_namelen = sl,
		.msg_iov = parts,
		.msg_iovlen = 2
	};
	int enable = 1;
	int sock = socket(family, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);

	pfd->fd = sock;
	pfd->events = POLLOUT;

	if (setsockopt(sock, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
	               &enable, sizeof enable) == 0) {
		int sent = sendmsg(sock, &msg, MSG_FASTOPEN|MSG_NOSIGNAL);
		if (sent == ql+2)
			pfd->events = POLLIN;
		if (sent >= 0)
			return sent;
		if (errno == EINPROGRESS)
			return 0;
		/* Any other error: fall back to a plain connect below. */
	}

	if (connect(sock, sa, sl) == 0 || errno == EINPROGRESS)
		return 0;

	close(sock);
	pfd->fd = -1;
	return -1;
}
/* Advance the scatter/gather list of mh past its first n bytes.
 *
 * Leading iovec entries that are fully covered by n are dropped by
 * bumping mh->msg_iov and decrementing mh->msg_iovlen. If entries
 * remain, the first one is trimmed in place (base moved forward, length
 * reduced) by whatever part of n is left over. The iovec array that
 * mh->msg_iov points to is modified. */
static void step_mh(struct msghdr *mh, size_t n)
{
	for (;;) {
		/* Everything consumed: nothing left to trim. */
		if (!mh->msg_iovlen)
			return;
		if (n < mh->msg_iov->iov_len)
			break;
		n -= mh->msg_iov->iov_len;
		mh->msg_iov++;
		mh->msg_iovlen--;
	}

	struct iovec *head = mh->msg_iov;
	head->iov_base = (char *)head->iov_base + n;
	head->iov_len -= n;
}
74 /* Internal contract for __res_msend[_rc]: asize must be >=512, nqueries
75 * must be sufficiently small to be safe as VLA size. In practice it's
76 * either 1 or 2, anyway. */
78 int __res_msend_rc(int nqueries, const unsigned char *const *queries,
79 const int *qlens, unsigned char *const *answers, int *alens, int asize,
80 const struct resolvconf *conf)
82 int fd;
83 int timeout, attempts, retry_interval, servfail_retry;
84 union {
85 struct sockaddr_in sin;
86 struct sockaddr_in6 sin6;
87 } sa = {0}, ns[MAXNS] = {{0}};
88 socklen_t sl = sizeof sa.sin;
89 int nns = 0;
90 int family = AF_INET;
91 int rlen;
92 int next;
93 int i, j;
94 int cs;
95 struct pollfd pfd[nqueries+2];
96 int qpos[nqueries], apos[nqueries];
97 unsigned char alen_buf[nqueries][2];
98 int r;
99 unsigned long t0, t1, t2;
101 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
103 timeout = 1000*conf->timeout;
104 attempts = conf->attempts;
106 for (nns=0; nns<conf->nns; nns++) {
107 const struct address *iplit = &conf->ns[nns];
108 if (iplit->family == AF_INET) {
109 memcpy(&ns[nns].sin.sin_addr, iplit->addr, 4);
110 ns[nns].sin.sin_port = htons(53);
111 ns[nns].sin.sin_family = AF_INET;
112 } else {
113 sl = sizeof sa.sin6;
114 memcpy(&ns[nns].sin6.sin6_addr, iplit->addr, 16);
115 ns[nns].sin6.sin6_port = htons(53);
116 ns[nns].sin6.sin6_scope_id = iplit->scopeid;
117 ns[nns].sin6.sin6_family = family = AF_INET6;
121 /* Get local address and open/bind a socket */
122 fd = socket(family, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
124 /* Handle case where system lacks IPv6 support */
125 if (fd < 0 && family == AF_INET6 && errno == EAFNOSUPPORT) {
126 for (i=0; i<nns && conf->ns[nns].family == AF_INET6; i++);
127 if (i==nns) {
128 pthread_setcancelstate(cs, 0);
129 return -1;
131 fd = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
132 family = AF_INET;
133 sl = sizeof sa.sin;
135 sa.sin.sin_family = family;
136 if (fd < 0 || bind(fd, (void *)&sa, sl) < 0) {
137 if (fd >= 0) close(fd);
138 pthread_setcancelstate(cs, 0);
139 return -1;
142 /* Past this point, there are no errors. Each individual query will
143 * yield either no reply (indicated by zero length) or an answer
144 * packet which is up to the caller to interpret. */
146 for (i=0; i<nqueries; i++) pfd[i].fd = -1;
147 pfd[nqueries].fd = fd;
148 pfd[nqueries].events = POLLIN;
149 pfd[nqueries+1].fd = -2;
151 pthread_cleanup_push(cleanup, pfd);
152 pthread_setcancelstate(cs, 0);
154 /* Convert any IPv4 addresses in a mixed environment to v4-mapped */
155 if (family == AF_INET6) {
156 setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &(int){0}, sizeof 0);
157 for (i=0; i<nns; i++) {
158 if (ns[i].sin.sin_family != AF_INET) continue;
159 memcpy(ns[i].sin6.sin6_addr.s6_addr+12,
160 &ns[i].sin.sin_addr, 4);
161 memcpy(ns[i].sin6.sin6_addr.s6_addr,
162 "\0\0\0\0\0\0\0\0\0\0\xff\xff", 12);
163 ns[i].sin6.sin6_family = AF_INET6;
164 ns[i].sin6.sin6_flowinfo = 0;
165 ns[i].sin6.sin6_scope_id = 0;
169 memset(alens, 0, sizeof *alens * nqueries);
171 retry_interval = timeout / attempts;
172 next = 0;
173 t0 = t2 = mtime();
174 t1 = t2 - retry_interval;
176 for (; t2-t0 < timeout; t2=mtime()) {
177 /* This is the loop exit condition: that all queries
178 * have an accepted answer. */
179 for (i=0; i<nqueries && alens[i]>0; i++);
180 if (i==nqueries) break;
182 if (t2-t1 >= retry_interval) {
183 /* Query all configured namservers in parallel */
184 for (i=0; i<nqueries; i++)
185 if (!alens[i])
186 for (j=0; j<nns; j++)
187 sendto(fd, queries[i],
188 qlens[i], MSG_NOSIGNAL,
189 (void *)&ns[j], sl);
190 t1 = t2;
191 servfail_retry = 2 * nqueries;
194 /* Wait for a response, or until time to retry */
195 if (poll(pfd, nqueries+1, t1+retry_interval-t2) <= 0) continue;
197 while (next < nqueries) {
198 struct msghdr mh = {
199 .msg_name = (void *)&sa,
200 .msg_namelen = sl,
201 .msg_iovlen = 1,
202 .msg_iov = (struct iovec []){
203 { .iov_base = (void *)answers[next],
204 .iov_len = asize }
207 rlen = recvmsg(fd, &mh, 0);
208 if (rlen < 0) break;
210 /* Ignore non-identifiable packets */
211 if (rlen < 4) continue;
213 /* Ignore replies from addresses we didn't send to */
214 for (j=0; j<nns && memcmp(ns+j, &sa, sl); j++);
215 if (j==nns) continue;
217 /* Find which query this answer goes with, if any */
218 for (i=next; i<nqueries && (
219 answers[next][0] != queries[i][0] ||
220 answers[next][1] != queries[i][1] ); i++);
221 if (i==nqueries) continue;
222 if (alens[i]) continue;
224 /* Only accept positive or negative responses;
225 * retry immediately on server failure, and ignore
226 * all other codes such as refusal. */
227 switch (answers[next][3] & 15) {
228 case 0:
229 case 3:
230 break;
231 case 2:
232 if (servfail_retry && servfail_retry--)
233 sendto(fd, queries[i],
234 qlens[i], MSG_NOSIGNAL,
235 (void *)&ns[j], sl);
236 default:
237 continue;
240 /* Store answer in the right slot, or update next
241 * available temp slot if it's already in place. */
242 alens[i] = rlen;
243 if (i == next)
244 for (; next<nqueries && alens[next]; next++);
245 else
246 memcpy(answers[i], answers[next], rlen);
248 /* Ignore further UDP if all slots full or TCP-mode */
249 if (next == nqueries) pfd[nqueries].events = 0;
251 /* If answer is truncated (TC bit), fallback to TCP */
252 if ((answers[i][2] & 2) || (mh.msg_flags & MSG_TRUNC)) {
253 alens[i] = -1;
254 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
255 r = start_tcp(pfd+i, family, ns+j, sl, queries[i], qlens[i]);
256 pthread_setcancelstate(cs, 0);
257 if (r >= 0) {
258 qpos[i] = r;
259 apos[i] = 0;
261 continue;
265 for (i=0; i<nqueries; i++) if (pfd[i].revents & POLLOUT) {
266 struct msghdr mh = {
267 .msg_iovlen = 2,
268 .msg_iov = (struct iovec [2]){
269 { .iov_base = (uint8_t[]){ qlens[i]>>8, qlens[i] }, .iov_len = 2 },
270 { .iov_base = (void *)queries[i], .iov_len = qlens[i] } }
272 step_mh(&mh, qpos[i]);
273 r = sendmsg(pfd[i].fd, &mh, MSG_NOSIGNAL);
274 if (r < 0) goto out;
275 qpos[i] += r;
276 if (qpos[i] == qlens[i]+2)
277 pfd[i].events = POLLIN;
280 for (i=0; i<nqueries; i++) if (pfd[i].revents & POLLIN) {
281 struct msghdr mh = {
282 .msg_iovlen = 2,
283 .msg_iov = (struct iovec [2]){
284 { .iov_base = alen_buf[i], .iov_len = 2 },
285 { .iov_base = answers[i], .iov_len = asize } }
287 step_mh(&mh, apos[i]);
288 r = recvmsg(pfd[i].fd, &mh, 0);
289 if (r < 0) goto out;
290 apos[i] += r;
291 if (apos[i] < 2) continue;
292 int alen = alen_buf[i][0]*256 + alen_buf[i][1];
293 if (alen < 13) goto out;
294 if (apos[i] < alen+2 && apos[i] < asize+2)
295 continue;
296 int rcode = answers[i][3] & 15;
297 if (rcode != 0 && rcode != 3)
298 goto out;
300 /* Storing the length here commits the accepted answer.
301 * Immediately close TCP socket so as not to consume
302 * resources we no longer need. */
303 alens[i] = alen;
304 __syscall(SYS_close, pfd[i].fd);
305 pfd[i].fd = -1;
308 out:
309 pthread_cleanup_pop(1);
311 /* Disregard any incomplete TCP results */
312 for (i=0; i<nqueries; i++) if (alens[i]<0) alens[i] = 0;
314 return 0;
317 int __res_msend(int nqueries, const unsigned char *const *queries,
318 const int *qlens, unsigned char *const *answers, int *alens, int asize)
320 struct resolvconf conf;
321 if (__get_resolv_conf(&conf, 0, 0) < 0) return -1;
322 return __res_msend_rc(nqueries, queries, qlens, answers, alens, asize, &conf);