Add BIND 9.2.4rc7.
[dragonfly.git] / contrib / bind-9.2.4rc7 / lib / isc / unix / socket.c
blob373817ca80bd8803285edd9634aa4eb118524b90
1 /*
2 * Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1998-2003 Internet Software Consortium.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
18 /* $Id: socket.c,v 1.207.2.29 2004/07/01 04:51:44 marka Exp $ */
20 #include <config.h>
22 #include <sys/param.h>
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/uio.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <stddef.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
35 #include <isc/buffer.h>
36 #include <isc/bufferlist.h>
37 #include <isc/condition.h>
38 #include <isc/formatcheck.h>
39 #include <isc/list.h>
40 #include <isc/log.h>
41 #include <isc/mem.h>
42 #include <isc/msgs.h>
43 #include <isc/mutex.h>
44 #include <isc/net.h>
45 #include <isc/platform.h>
46 #include <isc/print.h>
47 #include <isc/region.h>
48 #include <isc/socket.h>
49 #include <isc/strerror.h>
50 #include <isc/task.h>
51 #include <isc/thread.h>
52 #include <isc/util.h>
54 #include "errno2result.h"
56 #ifndef ISC_PLATFORM_USETHREADS
57 #include "socket_p.h"
58 #endif /* ISC_PLATFORM_USETHREADS */
/*
 * ISC_SOCKADDR_LEN_T is left alone if already defined, taken from
 * _BSD_SOCKLEN_T_ when that is defined, and otherwise is unsigned int.
 */
61 * Some systems define the socket length argument as an int, some as size_t,
62 * some as socklen_t. This is here so it can be easily changed if needed.
64 #ifndef ISC_SOCKADDR_LEN_T
65 #ifdef _BSD_SOCKLEN_T_
66 #define ISC_SOCKADDR_LEN_T _BSD_SOCKLEN_T_
67 #else
68 #define ISC_SOCKADDR_LEN_T unsigned int
69 #endif
70 #endif
/*
 * SOFT_ERROR(e) is true for EAGAIN, EWOULDBLOCK, EINTR and for an errno
 * value of zero.
 */
73 * Define what the possible "soft" errors can be. These are non-fatal returns
74 * of various network related functions, like recv() and so on.
76 * For some reason, BSDI (and perhaps others) will sometimes return <0
77 * from recv() but will have errno==0. This is broken, but we have to
78 * work around it here.
80 #define SOFT_ERROR(e) ((e) == EAGAIN || \
81 (e) == EWOULDBLOCK || \
82 (e) == EINTR || \
83 (e) == 0)
/*
 * DLVL(x) expands to the three arguments (category, module, level) that
 * the logging helpers in this file take; the level is ISC_LOG_DEBUG(x).
 */
85 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
88 * DLVL(90) -- Function entry/exit and other tracing.
89 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
90 * DLVL(60) -- Socket data send/receive
91 * DLVL(50) -- Event tracing, including receiving/sending completion events.
92 * DLVL(20) -- Socket creation/destruction.
94 #define TRACE_LEVEL 90
95 #define CORRECTNESS_LEVEL 70
96 #define IOEVENT_LEVEL 60
97 #define EVENT_LEVEL 50
98 #define CREATION_LEVEL 20
/* Shorthand argument triples, one per debug level above. */
100 #define TRACE DLVL(TRACE_LEVEL)
101 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
102 #define IOEVENT DLVL(IOEVENT_LEVEL)
103 #define EVENT DLVL(EVENT_LEVEL)
104 #define CREATION DLVL(CREATION_LEVEL)
/* Internal readable/writable events are plain isc_event_t objects. */
106 typedef isc_event_t intev_t;
108 #define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
109 #define VALID_SOCKET(t) ISC_MAGIC_VALID(t, SOCKET_MAGIC)
/*
 * USE_CMSG is switched on when either ISC_PLATFORM_HAVEIPV6 or
 * SO_TIMESTAMP is defined (see both conditionals below).
 */
112 * IPv6 control information. If the socket is an IPv6 socket we want
113 * to collect the destination address and interface so the client can
114 * set them on outgoing packets.
116 #ifdef ISC_PLATFORM_HAVEIPV6
117 #ifndef USE_CMSG
118 #define USE_CMSG 1
119 #endif
120 #endif
123 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
124 * a setsockopt() like interface to request timestamps, and if the OS
125 * doesn't do it for us, call gettimeofday() on every UDP receive?
127 #ifdef SO_TIMESTAMP
128 #ifndef USE_CMSG
129 #define USE_CMSG 1
130 #endif
131 #endif
/* NRETRIES bounds the EINTR retry loop in doio_send(). */
134 * The number of times a send operation is repeated if the result is EINTR.
136 #define NRETRIES 10
/*
 * Per-socket state.  The fields are grouped by the lock that protects
 * them, as marked by the "Not locked" / "Locked by socket lock" notes.
 */
138 struct isc_socket {
139 /* Not locked. */
140 unsigned int magic;
141 isc_socketmgr_t *manager;
142 isc_mutex_t lock;
143 isc_sockettype_t type;
145 /* Locked by socket lock. */
146 ISC_LINK(isc_socket_t) link;
/* SOCK_DEAD() treats a socket whose reference count is zero as dead. */
147 unsigned int references;
/* Descriptor passed to recvmsg()/sendmsg(); also indexes the manager's fds[]. */
148 int fd;
149 int pf;
/* Pending requests, one list per direction, plus the single connect request. */
151 ISC_LIST(isc_socketevent_t) send_list;
152 ISC_LIST(isc_socketevent_t) recv_list;
153 ISC_LIST(isc_socket_newconnev_t) accept_list;
154 isc_socket_connev_t *connect_ev;
157 * Internal events. Posted when a descriptor is readable or
158 * writable. These are statically allocated and never freed.
159 * They will be set to non-purgable before use.
161 intev_t readable_ev;
162 intev_t writable_ev;
164 isc_sockaddr_t address; /* remote address */
/* One-bit state flags. */
166 unsigned int pending_recv : 1,
167 pending_send : 1,
168 pending_accept : 1,
169 listener : 1, /* listener socket */
170 connected : 1,
171 connecting : 1, /* connect pending */
172 bound : 1; /* bound to local addr */
/* Target of the extra one-byte iovec that build_msghdr_recv() appends for UDP. */
174 #ifdef ISC_NET_RECVOVERFLOW
175 unsigned char overflow; /* used for MSG_TRUNC fake */
176 #endif
/*
 * Control-message buffers: build_msghdr_recv() and build_msghdr_send()
 * point msg_control at these for UDP sockets.
 */
178 char *recvcmsgbuf;
179 ISC_SOCKADDR_LEN_T recvcmsgbuflen;
180 char *sendcmsgbuf;
181 ISC_SOCKADDR_LEN_T sendcmsgbuflen;
/* Magic number and validity check for socket manager objects. */
184 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
185 #define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
/*
 * Socket manager state: the list of sockets, the select() descriptor
 * sets, and per-descriptor bookkeeping.
 */
187 struct isc_socketmgr {
188 /* Not locked. */
189 unsigned int magic;
190 isc_mem_t *mctx;
191 isc_mutex_t lock;
192 /* Locked by manager lock. */
193 ISC_LIST(isc_socket_t) socklist;
194 fd_set read_fds;
195 fd_set write_fds;
/*
 * fds[] and fdstate[] are indexed by file descriptor.  fdstate[] holds
 * CLOSED, MANAGED or CLOSE_PENDING; wakeup_socket() and destroy() use both.
 */
196 isc_socket_t *fds[FD_SETSIZE];
197 int fdstate[FD_SETSIZE];
198 int maxfd;
/*
 * Threaded builds: select_poke() writes to pipe_fds[1] and
 * select_readmsg() reads from pipe_fds[0].
 */
199 #ifdef ISC_PLATFORM_USETHREADS
200 isc_thread_t watcher;
201 isc_condition_t shutdown_ok;
202 int pipe_fds[2];
203 #else /* ISC_PLATFORM_USETHREADS */
204 unsigned int refs;
205 #endif /* ISC_PLATFORM_USETHREADS */
/* Non-threaded builds keep one file-scope manager pointer, initially NULL. */
208 #ifndef ISC_PLATFORM_USETHREADS
209 static isc_socketmgr_t *socketmgr = NULL;
210 #endif /* ISC_PLATFORM_USETHREADS */
/* Values stored in isc_socketmgr.fdstate[]. */
212 #define CLOSED 0 /* this one must be zero */
213 #define MANAGED 1
214 #define CLOSE_PENDING 2
/*
 * The receive side gets one extra iovec slot when ISC_NET_RECVOVERFLOW
 * is defined.
 */
217 * send() and recv() iovec counts
219 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
220 #ifdef ISC_NET_RECVOVERFLOW
221 # define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
222 #else
223 # define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
224 #endif
/* Forward declarations of file-local helpers. */
226 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
227 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
228 static void free_socket(isc_socket_t **);
229 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
230 isc_socket_t **);
231 static void destroy(isc_socket_t **);
232 static void internal_accept(isc_task_t *, isc_event_t *);
233 static void internal_connect(isc_task_t *, isc_event_t *);
234 static void internal_recv(isc_task_t *, isc_event_t *);
235 static void internal_send(isc_task_t *, isc_event_t *);
236 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
237 static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
238 struct msghdr *, struct iovec *, size_t *);
239 static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
240 struct msghdr *, struct iovec *, size_t *);
/*
 * Message codes passed as 'msg' to select_poke() and wakeup_socket().
 * All are negative.  ACCEPT shares its value with READ and CONNECT with
 * WRITE, so wakeup_socket() only needs to test for READ and WRITE.
 */
242 #define SELECT_POKE_SHUTDOWN (-1)
243 #define SELECT_POKE_NOTHING (-2)
244 #define SELECT_POKE_READ (-3)
245 #define SELECT_POKE_ACCEPT (-3) /* Same as _READ */
246 #define SELECT_POKE_WRITE (-4)
247 #define SELECT_POKE_CONNECT (-4) /* Same as _WRITE */
248 #define SELECT_POKE_CLOSE (-5)
/* True when the socket's reference count has dropped to zero. */
250 #define SOCK_DEAD(s) ((s)->references == 0)
252 static void
253 manager_log(isc_socketmgr_t *sockmgr,
254 isc_logcategory_t *category, isc_logmodule_t *module, int level,
255 const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
256 static void
257 manager_log(isc_socketmgr_t *sockmgr,
258 isc_logcategory_t *category, isc_logmodule_t *module, int level,
259 const char *fmt, ...)
261 char msgbuf[2048];
262 va_list ap;
264 if (! isc_log_wouldlog(isc_lctx, level))
265 return;
267 va_start(ap, fmt);
268 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
269 va_end(ap);
271 isc_log_write(isc_lctx, category, module, level,
272 "sockmgr %p: %s", sockmgr, msgbuf);
275 static void
276 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
277 isc_logcategory_t *category, isc_logmodule_t *module, int level,
278 isc_msgcat_t *msgcat, int msgset, int message,
279 const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
280 static void
281 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
282 isc_logcategory_t *category, isc_logmodule_t *module, int level,
283 isc_msgcat_t *msgcat, int msgset, int message,
284 const char *fmt, ...)
286 char msgbuf[2048];
287 char peerbuf[256];
288 va_list ap;
290 if (! isc_log_wouldlog(isc_lctx, level))
291 return;
293 va_start(ap, fmt);
294 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
295 va_end(ap);
297 if (address == NULL) {
298 isc_log_iwrite(isc_lctx, category, module, level,
299 msgcat, msgset, message,
300 "socket %p: %s", sock, msgbuf);
301 } else {
302 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
303 isc_log_iwrite(isc_lctx, category, module, level,
304 msgcat, msgset, message,
305 "socket %p %s: %s", sock, peerbuf, msgbuf);
309 static void
310 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
311 isc_socket_t *sock;
314 * This is a wakeup on a socket. If the socket is not in the
315 * process of being closed, start watching it for either reads
316 * or writes.
319 INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
321 if (manager->fdstate[fd] == CLOSE_PENDING) {
322 manager->fdstate[fd] = CLOSED;
323 FD_CLR(fd, &manager->read_fds);
324 FD_CLR(fd, &manager->write_fds);
325 close(fd);
326 return;
328 if (manager->fdstate[fd] != MANAGED)
329 return;
331 sock = manager->fds[fd];
334 * Set requested bit.
336 if (msg == SELECT_POKE_READ)
337 FD_SET(sock->fd, &manager->read_fds);
338 if (msg == SELECT_POKE_WRITE)
339 FD_SET(sock->fd, &manager->write_fds);
342 #ifdef ISC_PLATFORM_USETHREADS
344 * Poke the select loop when there is something for us to do.
345 * The write is required (by POSIX) to complete. That is, we
346 * will not get partial writes.
348 static void
349 select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
350 int cc;
351 int buf[2];
352 char strbuf[ISC_STRERRORSIZE];
354 buf[0] = fd;
355 buf[1] = msg;
357 do {
358 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
359 #ifdef ENOSR
361 * Treat ENOSR as EAGAIN but loop slowly as it is
362 * unlikely to clear fast.
364 if (cc < 0 && errno == ENOSR) {
365 sleep(1);
366 errno = EAGAIN;
368 #endif
369 } while (cc < 0 && SOFT_ERROR(errno));
371 if (cc < 0) {
372 isc__strerror(errno, strbuf, sizeof(strbuf));
373 FATAL_ERROR(__FILE__, __LINE__,
374 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
375 ISC_MSG_WRITEFAILED,
376 "write() failed "
377 "during watcher poke: %s"),
378 strbuf);
381 INSIST(cc == sizeof(buf));
385 * Read a message on the internal fd.
387 static void
388 select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
389 int buf[2];
390 int cc;
391 char strbuf[ISC_STRERRORSIZE];
393 cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
394 if (cc < 0) {
395 *msg = SELECT_POKE_NOTHING;
396 if (SOFT_ERROR(errno))
397 return;
399 isc__strerror(errno, strbuf, sizeof(strbuf));
400 FATAL_ERROR(__FILE__, __LINE__,
401 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
402 ISC_MSG_READFAILED,
403 "read() failed "
404 "during watcher poke: %s"),
405 strbuf);
407 return;
409 INSIST(cc == sizeof(buf));
411 *fd = buf[0];
412 *msg = buf[1];
414 #else /* ISC_PLATFORM_USETHREADS */
416 * Update the state of the socketmgr when something changes.
418 static void
419 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
420 if (msg == SELECT_POKE_SHUTDOWN)
421 return;
422 else if (fd >= 0)
423 wakeup_socket(manager, fd, msg);
424 return;
426 #endif /* ISC_PLATFORM_USETHREADS */
429 * Make a fd non-blocking.
431 static isc_result_t
432 make_nonblock(int fd) {
433 int ret;
434 int flags;
435 char strbuf[ISC_STRERRORSIZE];
437 flags = fcntl(fd, F_GETFL, 0);
438 flags |= O_NONBLOCK;
439 ret = fcntl(fd, F_SETFL, flags);
441 if (ret == -1) {
442 isc__strerror(errno, strbuf, sizeof(strbuf));
443 UNEXPECTED_ERROR(__FILE__, __LINE__,
444 "fcntl(%d, F_SETFL, %d): %s",
445 fd, flags, strbuf);
447 return (ISC_R_UNEXPECTED);
450 return (ISC_R_SUCCESS);
453 #ifdef USE_CMSG
/*
 * cmsg_len(): wrapper around CMSG_LEN.  When the platform does not
 * define CMSG_LEN, the fallback adds 'len' to the integer value of
 * CMSG_DATA(NULL).
 */
455 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
456 * In order to ensure as much portability as possible, we provide wrapper
457 * functions of these macros.
458 * Note that cmsg_space() could run slow on OSes that do not have
459 * CMSG_SPACE.
461 static inline ISC_SOCKADDR_LEN_T
462 cmsg_len(ISC_SOCKADDR_LEN_T len) {
463 #ifdef CMSG_LEN
464 return (CMSG_LEN(len));
465 #else
466 ISC_SOCKADDR_LEN_T hdrlen;
/*
 * NOTE(review): CMSG_DATA on a NULL header presumably yields the offset
 * of the data area within a cmsghdr, i.e. the header length -- confirm
 * per platform.
 */
468 hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(NULL); /* XXX */
469 return (hdrlen + len);
470 #endif
/*
 * cmsg_space(): wrapper around CMSG_SPACE.  When the platform does not
 * define CMSG_SPACE, the fallback lays out one control message of
 * cmsg_len(len) bytes in a scratch buffer and measures where
 * CMSG_NXTHDR() places the following header.  It returns 0 if
 * CMSG_NXTHDR() yields NULL.
 */
473 static inline ISC_SOCKADDR_LEN_T
474 cmsg_space(ISC_SOCKADDR_LEN_T len) {
475 #ifdef CMSG_SPACE
476 return (CMSG_SPACE(len));
477 #else
478 struct msghdr msg;
479 struct cmsghdr *cmsgp;
481 * XXX: The buffer length is an ad-hoc value, but should be enough
482 * in a practical sense.
484 char dummybuf[sizeof(struct cmsghdr) + 1024];
/* Describe the scratch buffer as the control area of an otherwise empty msghdr. */
486 memset(&msg, 0, sizeof(msg));
487 msg.msg_control = dummybuf;
488 msg.msg_controllen = sizeof(dummybuf);
/* Only cmsg_len of the first header is filled in before calling CMSG_NXTHDR(). */
490 cmsgp = (struct cmsghdr *)dummybuf;
491 cmsgp->cmsg_len = cmsg_len(len);
/* The distance from the buffer start to the next header is the space used. */
493 cmsgp = CMSG_NXTHDR(&msg, cmsgp);
494 if (cmsgp != NULL)
495 return ((char *)cmsgp - (char *)msg.msg_control);
496 else
497 return (0);
498 #endif
500 #endif /* USE_CMSG */
/*
 * process_cmsg(): copy information from a received msghdr into the
 * socket event 'dev'.
 *
 * With ISC_NET_BSD44MSGHDR, the MSG_TRUNC and MSG_CTRUNC flags become
 * event attributes.  With USE_CMSG as well, each control message is
 * examined: an IPV6_PKTINFO message is copied into dev->pktinfo and an
 * SCM_TIMESTAMP message into dev->timestamp, each setting the matching
 * attribute bit.  Without ISC_NET_BSD44MSGHDR the function does nothing.
 */
503 * Process control messages received on a socket.
505 static void
506 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
507 #ifdef USE_CMSG
508 struct cmsghdr *cmsgp;
509 #ifdef ISC_PLATFORM_HAVEIPV6
510 struct in6_pktinfo *pktinfop;
511 #endif
512 #ifdef SO_TIMESTAMP
513 struct timeval *timevalp;
514 #endif
515 #endif
518 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
519 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
520 * They are all here, outside of the CPP tests, because it is
521 * more consistent with the usual ISC coding style.
523 UNUSED(sock);
524 UNUSED(msg);
525 UNUSED(dev);
527 #ifdef ISC_NET_BSD44MSGHDR
/* Report payload truncation to the caller. */
529 #ifdef MSG_TRUNC
530 if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
531 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
532 #endif
/* Report control-data truncation to the caller. */
534 #ifdef MSG_CTRUNC
535 if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
536 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
537 #endif
539 #ifndef USE_CMSG
540 return;
541 #else
/* Nothing to walk when no control data is attached. */
542 if (msg->msg_controllen == 0U || msg->msg_control == NULL)
543 return;
545 #ifdef SO_TIMESTAMP
546 timevalp = NULL;
547 #endif
548 #ifdef ISC_PLATFORM_HAVEIPV6
549 pktinfop = NULL;
550 #endif
/* Visit every control message attached to 'msg'. */
552 cmsgp = CMSG_FIRSTHDR(msg);
553 while (cmsgp != NULL) {
554 socket_log(sock, NULL, TRACE,
555 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
556 "processing cmsg %p", cmsgp);
/* IPv6 packet info: copied into dev->pktinfo; a multicast address also sets the MULTICAST attribute. */
558 #ifdef ISC_PLATFORM_HAVEIPV6
559 if (cmsgp->cmsg_level == IPPROTO_IPV6
560 && cmsgp->cmsg_type == IPV6_PKTINFO) {
562 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
563 memcpy(&dev->pktinfo, pktinfop,
564 sizeof(struct in6_pktinfo));
565 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
566 socket_log(sock, NULL, TRACE,
567 isc_msgcat, ISC_MSGSET_SOCKET,
568 ISC_MSG_IFRECEIVED,
569 "interface received on ifindex %u",
570 dev->pktinfo.ipi6_ifindex);
571 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
572 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
573 goto next;
575 #endif
/* Receive timestamp: tv_usec is multiplied by 1000 to give nanoseconds. */
577 #ifdef SO_TIMESTAMP
578 if (cmsgp->cmsg_level == SOL_SOCKET
579 && cmsgp->cmsg_type == SCM_TIMESTAMP) {
580 timevalp = (struct timeval *)CMSG_DATA(cmsgp);
581 dev->timestamp.seconds = timevalp->tv_sec;
582 dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
583 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
584 goto next;
586 #endif
/* Messages of any other level/type fall through to here unprocessed. */
588 next:
589 cmsgp = CMSG_NXTHDR(msg, cmsgp);
591 #endif /* USE_CMSG */
593 #endif /* ISC_NET_BSD44MSGHDR */
597 * Construct an iov array and attach it to the msghdr passed in. This is
598 * the SEND constructor, which will use the used region of the buffer
599 * (if using a buffer list) or will use the internal region (if a single
600 * buffer I/O is requested).
602 * Nothing can be NULL, and the done event must list at least one buffer
603 * on the buffer linked list for this function to be meaningful.
605 * If write_countp != NULL, *write_countp will hold the number of bytes
606 * this transaction can send.
608 static void
609 build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
610 struct msghdr *msg, struct iovec *iov, size_t *write_countp)
612 unsigned int iovcount;
613 isc_buffer_t *buffer;
614 isc_region_t used;
615 size_t write_count;
616 size_t skip_count;
618 memset(msg, 0, sizeof(*msg));
620 if (sock->type == isc_sockettype_udp) {
621 msg->msg_name = (void *)&dev->address.type.sa;
622 msg->msg_namelen = dev->address.length;
623 } else {
624 msg->msg_name = NULL;
625 msg->msg_namelen = 0;
628 buffer = ISC_LIST_HEAD(dev->bufferlist);
629 write_count = 0;
630 iovcount = 0;
633 * Single buffer I/O? Skip what we've done so far in this region.
635 if (buffer == NULL) {
636 write_count = dev->region.length - dev->n;
637 iov[0].iov_base = (void *)(dev->region.base + dev->n);
638 iov[0].iov_len = write_count;
639 iovcount = 1;
641 goto config;
645 * Multibuffer I/O.
646 * Skip the data in the buffer list that we have already written.
648 skip_count = dev->n;
649 while (buffer != NULL) {
650 REQUIRE(ISC_BUFFER_VALID(buffer));
651 if (skip_count < isc_buffer_usedlength(buffer))
652 break;
653 skip_count -= isc_buffer_usedlength(buffer);
654 buffer = ISC_LIST_NEXT(buffer, link);
657 while (buffer != NULL) {
658 INSIST(iovcount < MAXSCATTERGATHER_SEND);
660 isc_buffer_usedregion(buffer, &used);
662 if (used.length > 0) {
663 iov[iovcount].iov_base = (void *)(used.base
664 + skip_count);
665 iov[iovcount].iov_len = used.length - skip_count;
666 write_count += (used.length - skip_count);
667 skip_count = 0;
668 iovcount++;
670 buffer = ISC_LIST_NEXT(buffer, link);
673 INSIST(skip_count == 0U);
675 config:
676 msg->msg_iov = iov;
677 msg->msg_iovlen = iovcount;
679 #ifdef ISC_NET_BSD44MSGHDR
680 msg->msg_control = NULL;
681 msg->msg_controllen = 0;
682 msg->msg_flags = 0;
683 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIPV6)
684 if ((sock->type == isc_sockettype_udp)
685 && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
686 struct cmsghdr *cmsgp;
687 struct in6_pktinfo *pktinfop;
689 socket_log(sock, NULL, TRACE,
690 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
691 "sendto pktinfo data, ifindex %u",
692 dev->pktinfo.ipi6_ifindex);
694 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
695 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
696 msg->msg_control = (void *)sock->sendcmsgbuf;
698 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
699 cmsgp->cmsg_level = IPPROTO_IPV6;
700 cmsgp->cmsg_type = IPV6_PKTINFO;
701 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
702 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
703 memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
705 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
706 #else /* ISC_NET_BSD44MSGHDR */
707 msg->msg_accrights = NULL;
708 msg->msg_accrightslen = 0;
709 #endif /* ISC_NET_BSD44MSGHDR */
711 if (write_countp != NULL)
712 *write_countp = write_count;
716 * Construct an iov array and attach it to the msghdr passed in. This is
717 * the RECV constructor, which will use the avialable region of the buffer
718 * (if using a buffer list) or will use the internal region (if a single
719 * buffer I/O is requested).
721 * Nothing can be NULL, and the done event must list at least one buffer
722 * on the buffer linked list for this function to be meaningful.
724 * If read_countp != NULL, *read_countp will hold the number of bytes
725 * this transaction can receive.
727 static void
728 build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
729 struct msghdr *msg, struct iovec *iov, size_t *read_countp)
731 unsigned int iovcount;
732 isc_buffer_t *buffer;
733 isc_region_t available;
734 size_t read_count;
736 memset(msg, 0, sizeof(struct msghdr));
738 if (sock->type == isc_sockettype_udp) {
739 memset(&dev->address, 0, sizeof(dev->address));
740 msg->msg_name = (void *)&dev->address.type.sa;
741 msg->msg_namelen = sizeof(dev->address.type);
742 #ifdef ISC_NET_RECVOVERFLOW
743 /* If needed, steal one iovec for overflow detection. */
744 maxiov--;
745 #endif
746 } else { /* TCP */
747 msg->msg_name = NULL;
748 msg->msg_namelen = 0;
749 dev->address = sock->address;
752 buffer = ISC_LIST_HEAD(dev->bufferlist);
753 read_count = 0;
756 * Single buffer I/O? Skip what we've done so far in this region.
758 if (buffer == NULL) {
759 read_count = dev->region.length - dev->n;
760 iov[0].iov_base = (void *)(dev->region.base + dev->n);
761 iov[0].iov_len = read_count;
762 iovcount = 1;
764 goto config;
768 * Multibuffer I/O.
769 * Skip empty buffers.
771 while (buffer != NULL) {
772 REQUIRE(ISC_BUFFER_VALID(buffer));
773 if (isc_buffer_availablelength(buffer) != 0)
774 break;
775 buffer = ISC_LIST_NEXT(buffer, link);
778 iovcount = 0;
779 while (buffer != NULL) {
780 INSIST(iovcount < MAXSCATTERGATHER_RECV);
782 isc_buffer_availableregion(buffer, &available);
784 if (available.length > 0) {
785 iov[iovcount].iov_base = (void *)(available.base);
786 iov[iovcount].iov_len = available.length;
787 read_count += available.length;
788 iovcount++;
790 buffer = ISC_LIST_NEXT(buffer, link);
793 config:
796 * If needed, set up to receive that one extra byte. Note that
797 * we know there is at least one iov left, since we stole it
798 * at the top of this function.
800 #ifdef ISC_NET_RECVOVERFLOW
801 if (sock->type == isc_sockettype_udp) {
802 iov[iovcount].iov_base = (void *)(&sock->overflow);
803 iov[iovcount].iov_len = 1;
804 iovcount++;
806 #endif
808 msg->msg_iov = iov;
809 msg->msg_iovlen = iovcount;
811 #ifdef ISC_NET_BSD44MSGHDR
812 msg->msg_control = NULL;
813 msg->msg_controllen = 0;
814 msg->msg_flags = 0;
815 #if defined(USE_CMSG)
816 if (sock->type == isc_sockettype_udp) {
817 msg->msg_control = sock->recvcmsgbuf;
818 msg->msg_controllen = sock->recvcmsgbuflen;
820 #endif /* USE_CMSG */
821 #else /* ISC_NET_BSD44MSGHDR */
822 msg->msg_accrights = NULL;
823 msg->msg_accrightslen = 0;
824 #endif /* ISC_NET_BSD44MSGHDR */
826 if (read_countp != NULL)
827 *read_countp = read_count;
830 static void
831 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
832 isc_socketevent_t *dev)
834 if (sock->type == isc_sockettype_udp) {
835 if (address != NULL)
836 dev->address = *address;
837 else
838 dev->address = sock->address;
839 } else if (sock->type == isc_sockettype_tcp) {
840 INSIST(address == NULL);
841 dev->address = sock->address;
845 static isc_socketevent_t *
846 allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
847 isc_taskaction_t action, const void *arg)
849 isc_socketevent_t *ev;
851 ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
852 sock, eventtype,
853 action, arg,
854 sizeof(*ev));
856 if (ev == NULL)
857 return (NULL);
859 ev->result = ISC_R_UNEXPECTED;
860 ISC_LINK_INIT(ev, ev_link);
861 ISC_LIST_INIT(ev->bufferlist);
862 ev->region.base = NULL;
863 ev->n = 0;
864 ev->offset = 0;
865 ev->attributes = 0;
867 return (ev);
870 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr to stdout -- the name
 * pointer and length, every iovec entry, and (with ISC_NET_BSD44MSGHDR)
 * the control buffer pointer and length.
 *
 * The types of msg_namelen, msg_iovlen, iov_len and msg_controllen
 * differ between platforms, so each value is cast to the type its
 * conversion specifier expects, and pointers printed with %p are
 * passed as void *.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %u\n", (void *)msg->msg_name,
	       (unsigned int)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0 ; i < (unsigned int)msg->msg_iovlen ; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
887 #endif
/* Return values of doio_recv() and doio_send(). */
889 #define DOIO_SUCCESS 0 /* i/o ok, event sent */
890 #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */
891 #define DOIO_HARD 2 /* i/o error, event sent */
892 #define DOIO_EOF 3 /* EOF, no event sent */
894 static int
895 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
896 int cc;
897 struct iovec iov[MAXSCATTERGATHER_RECV];
898 size_t read_count;
899 size_t actual_count;
900 struct msghdr msghdr;
901 isc_buffer_t *buffer;
902 int recv_errno;
903 char strbuf[ISC_STRERRORSIZE];
905 build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
907 #if defined(ISC_SOCKET_DEBUG)
908 dump_msg(&msghdr);
909 #endif
911 cc = recvmsg(sock->fd, &msghdr, 0);
912 recv_errno = errno;
914 if (cc < 0) {
915 if (SOFT_ERROR(recv_errno))
916 return (DOIO_SOFT);
918 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
919 isc__strerror(recv_errno, strbuf, sizeof(strbuf));
920 socket_log(sock, NULL, IOEVENT,
921 isc_msgcat, ISC_MSGSET_SOCKET,
922 ISC_MSG_DOIORECV,
923 "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
924 sock->fd, cc, recv_errno, strbuf);
927 #define SOFT_OR_HARD(_system, _isc) \
928 if (recv_errno == _system) { \
929 if (sock->connected) { \
930 dev->result = _isc; \
931 return (DOIO_HARD); \
933 return (DOIO_SOFT); \
935 #define ALWAYS_HARD(_system, _isc) \
936 if (recv_errno == _system) { \
937 dev->result = _isc; \
938 return (DOIO_HARD); \
941 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
942 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
943 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
944 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
945 /* HPUX 11.11 can return EADDRNOTAVAIL. */
946 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
947 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
949 #undef SOFT_OR_HARD
950 #undef ALWAYS_HARD
952 dev->result = isc__errno2result(recv_errno);
953 return (DOIO_HARD);
957 * On TCP, zero length reads indicate EOF, while on
958 * UDP, zero length reads are perfectly valid, although
959 * strange.
961 if ((sock->type == isc_sockettype_tcp) && (cc == 0))
962 return (DOIO_EOF);
964 if (sock->type == isc_sockettype_udp) {
965 dev->address.length = msghdr.msg_namelen;
966 if (isc_sockaddr_getport(&dev->address) == 0) {
967 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
968 socket_log(sock, &dev->address, IOEVENT,
969 isc_msgcat, ISC_MSGSET_SOCKET,
970 ISC_MSG_ZEROPORT,
971 "dropping source port zero packet");
973 return (DOIO_SOFT);
977 socket_log(sock, &dev->address, IOEVENT,
978 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
979 "packet received correctly");
982 * Overflow bit detection. If we received MORE bytes than we should,
983 * this indicates an overflow situation. Set the flag in the
984 * dev entry and adjust how much we read by one.
986 #ifdef ISC_NET_RECVOVERFLOW
987 if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
988 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
989 cc--;
991 #endif
994 * If there are control messages attached, run through them and pull
995 * out the interesting bits.
997 if (sock->type == isc_sockettype_udp)
998 process_cmsg(sock, &msghdr, dev);
1001 * update the buffers (if any) and the i/o count
1003 dev->n += cc;
1004 actual_count = cc;
1005 buffer = ISC_LIST_HEAD(dev->bufferlist);
1006 while (buffer != NULL && actual_count > 0U) {
1007 REQUIRE(ISC_BUFFER_VALID(buffer));
1008 if (isc_buffer_availablelength(buffer) <= actual_count) {
1009 actual_count -= isc_buffer_availablelength(buffer);
1010 isc_buffer_add(buffer,
1011 isc_buffer_availablelength(buffer));
1012 } else {
1013 isc_buffer_add(buffer, actual_count);
1014 actual_count = 0;
1015 break;
1017 buffer = ISC_LIST_NEXT(buffer, link);
1018 if (buffer == NULL) {
1019 INSIST(actual_count == 0U);
1024 * If we read less than we expected, update counters,
1025 * and let the upper layer poke the descriptor.
1027 if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1028 return (DOIO_SOFT);
1031 * Full reads are posted, or partials if partials are ok.
1033 dev->result = ISC_R_SUCCESS;
1034 return (DOIO_SUCCESS);
/*
 * doio_send(): perform one sendmsg() for the send request 'dev' on
 * 'sock', using the msghdr prepared by build_msghdr_send().
 */
1038 * Returns:
1039 * DOIO_SUCCESS The operation succeeded. dev->result contains
1040 * ISC_R_SUCCESS.
1042 * DOIO_HARD A hard or unexpected I/O error was encountered.
1043 * dev->result contains the appropriate error.
1045 * DOIO_SOFT A soft I/O error was encountered. No senddone
1046 * event was sent. The operation should be retried.
1048 * No other return values are possible.
1050 static int
1051 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
1052 int cc;
1053 struct iovec iov[MAXSCATTERGATHER_SEND];
1054 size_t write_count;
1055 struct msghdr msghdr;
1056 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1057 int attempts = 0;
1058 int send_errno;
1059 char strbuf[ISC_STRERRORSIZE];
1061 build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
/* The msghdr is built once and reused unchanged for every EINTR retry. */
1063 resend:
1064 cc = sendmsg(sock->fd, &msghdr, 0);
/* Save errno before any later call can overwrite it. */
1065 send_errno = errno;
1068 * Check for error or block condition.
1070 if (cc < 0) {
/*
 * sendmsg() is called at most NRETRIES times in a row for EINTR; after
 * that EINTR falls through to the soft-error check below.
 */
1071 if (send_errno == EINTR && ++attempts < NRETRIES)
1072 goto resend;
1074 if (SOFT_ERROR(send_errno))
1075 return (DOIO_SOFT);
/*
 * SOFT_OR_HARD: hard error on a connected socket, soft otherwise.
 * ALWAYS_HARD: hard error regardless of connection state.
 * Both expand to code that returns from doio_send().
 */
1077 #define SOFT_OR_HARD(_system, _isc) \
1078 if (send_errno == _system) { \
1079 if (sock->connected) { \
1080 dev->result = _isc; \
1081 return (DOIO_HARD); \
1083 return (DOIO_SOFT); \
1085 #define ALWAYS_HARD(_system, _isc) \
1086 if (send_errno == _system) { \
1087 dev->result = _isc; \
1088 return (DOIO_HARD); \
1091 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1092 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1093 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1094 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1095 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1096 #ifdef EHOSTDOWN
1097 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1098 #endif
1099 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1100 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1101 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1102 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1103 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1105 #undef SOFT_OR_HARD
1106 #undef ALWAYS_HARD
/* Any errno not matched above is logged as unexpected and mapped by isc__errno2result(). */
1109 * The other error types depend on whether or not the
1110 * socket is UDP or TCP. If it is UDP, some errors
1111 * that we expect to be fatal under TCP are merely
1112 * annoying, and are really soft errors.
1114 * However, these soft errors are still returned as
1115 * a status.
1117 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1118 isc__strerror(send_errno, strbuf, sizeof(strbuf));
1119 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1120 addrbuf, strbuf);
1121 dev->result = isc__errno2result(send_errno);
1122 return (DOIO_HARD);
/* A zero-byte send is reported but not treated as a failure; it continues to the short-write check. */
1125 if (cc == 0)
1126 UNEXPECTED_ERROR(__FILE__, __LINE__,
1127 "internal_send: send() %s 0",
1128 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1129 ISC_MSG_RETURNED, "returned"));
1132 * If we write less than we expected, update counters, poke.
/* dev->n accumulates the bytes sent so far; build_msghdr_send() skips that many on the next call. */
1134 dev->n += cc;
1135 if ((size_t)cc != write_count)
1136 return (DOIO_SOFT);
1139 * Exactly what we wanted to write. We're done with this
1140 * entry. Post its completion event.
1142 dev->result = ISC_R_SUCCESS;
1143 return (DOIO_SUCCESS);
1147 * Kill.
1149 * Caller must ensure that the socket is not locked and no external
1150 * references exist.
1152 static void
1153 destroy(isc_socket_t **sockp) {
1154 isc_socket_t *sock = *sockp;
1155 isc_socketmgr_t *manager = sock->manager;
1157 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1158 ISC_MSG_DESTROYING, "destroying");
1160 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1161 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1162 INSIST(ISC_LIST_EMPTY(sock->send_list));
1163 INSIST(sock->connect_ev == NULL);
1164 REQUIRE(sock->fd >= 0 && sock->fd < (int)FD_SETSIZE);
1166 LOCK(&manager->lock);
1169 * No one has this socket open, so the watcher doesn't have to be
1170 * poked, and the socket doesn't have to be locked.
1172 manager->fds[sock->fd] = NULL;
1173 manager->fdstate[sock->fd] = CLOSE_PENDING;
1174 select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
1175 ISC_LIST_UNLINK(manager->socklist, sock, link);
1177 #ifdef ISC_PLATFORM_USETHREADS
1178 if (ISC_LIST_EMPTY(manager->socklist))
1179 SIGNAL(&manager->shutdown_ok);
1180 #endif /* ISC_PLATFORM_USETHREADS */
1183 * XXX should reset manager->maxfd here
1186 UNLOCK(&manager->lock);
1188 free_socket(sockp);
1191 static isc_result_t
1192 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1193 isc_socket_t **socketp)
1195 isc_socket_t *sock;
1196 isc_result_t ret;
1197 ISC_SOCKADDR_LEN_T cmsgbuflen;
1199 sock = isc_mem_get(manager->mctx, sizeof(*sock));
1201 if (sock == NULL)
1202 return (ISC_R_NOMEMORY);
1204 ret = ISC_R_UNEXPECTED;
1206 sock->magic = 0;
1207 sock->references = 0;
1209 sock->manager = manager;
1210 sock->type = type;
1211 sock->fd = -1;
1213 ISC_LINK_INIT(sock, link);
1215 sock->recvcmsgbuf = NULL;
1216 sock->sendcmsgbuf = NULL;
1219 * set up cmsg buffers
1221 cmsgbuflen = 0;
1222 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIPV6)
1223 cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1224 #endif
1225 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
1226 cmsgbuflen += cmsg_space(sizeof(struct timeval));
1227 #endif
1228 sock->recvcmsgbuflen = cmsgbuflen;
1229 if (sock->recvcmsgbuflen != 0) {
1230 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1231 if (sock->recvcmsgbuf == NULL)
1232 goto error;
1235 cmsgbuflen = 0;
1236 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIPV6)
1237 cmsgbuflen = cmsg_space(sizeof(struct in6_pktinfo));
1238 #endif
1239 sock->sendcmsgbuflen = cmsgbuflen;
1240 if (sock->sendcmsgbuflen != 0) {
1241 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
1242 if (sock->sendcmsgbuf == NULL)
1243 goto error;
1247 * set up list of readers and writers to be initially empty
1249 ISC_LIST_INIT(sock->recv_list);
1250 ISC_LIST_INIT(sock->send_list);
1251 ISC_LIST_INIT(sock->accept_list);
1252 sock->connect_ev = NULL;
1253 sock->pending_recv = 0;
1254 sock->pending_send = 0;
1255 sock->pending_accept = 0;
1256 sock->listener = 0;
1257 sock->connected = 0;
1258 sock->connecting = 0;
1259 sock->bound = 0;
1262 * initialize the lock
1264 if (isc_mutex_init(&sock->lock) != ISC_R_SUCCESS) {
1265 sock->magic = 0;
1266 UNEXPECTED_ERROR(__FILE__, __LINE__,
1267 "isc_mutex_init() %s",
1268 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1269 ISC_MSG_FAILED, "failed"));
1270 ret = ISC_R_UNEXPECTED;
1271 goto error;
1275 * Initialize readable and writable events
1277 ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1278 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1279 NULL, sock, sock, NULL, NULL);
1280 ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1281 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1282 NULL, sock, sock, NULL, NULL);
1284 sock->magic = SOCKET_MAGIC;
1285 *socketp = sock;
1287 return (ISC_R_SUCCESS);
1289 error:
1290 if (sock->recvcmsgbuf != NULL)
1291 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
1292 sock->recvcmsgbuflen);
1293 if (sock->sendcmsgbuf != NULL)
1294 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
1295 sock->sendcmsgbuflen);
1296 isc_mem_put(manager->mctx, sock, sizeof(*sock));
1298 return (ret);
1302 * This event requires that the various lists be empty, that the reference
1303 * count be 1, and that the magic number is valid. The other socket bits,
1304 * like the lock, must be initialized as well. The fd associated must be
1305 * marked as closed, by setting it to -1 on close, or this routine will
1306 * also close the socket.
1308 static void
1309 free_socket(isc_socket_t **socketp) {
1310 isc_socket_t *sock = *socketp;
1312 INSIST(sock->references == 0);
1313 INSIST(VALID_SOCKET(sock));
1314 INSIST(!sock->connecting);
1315 INSIST(!sock->pending_recv);
1316 INSIST(!sock->pending_send);
1317 INSIST(!sock->pending_accept);
1318 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1319 INSIST(ISC_LIST_EMPTY(sock->send_list));
1320 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1321 INSIST(!ISC_LINK_LINKED(sock, link));
1323 if (sock->recvcmsgbuf != NULL)
1324 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
1325 sock->recvcmsgbuflen);
1326 if (sock->sendcmsgbuf != NULL)
1327 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
1328 sock->sendcmsgbuflen);
1330 sock->magic = 0;
1332 DESTROYLOCK(&sock->lock);
1334 isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1336 *socketp = NULL;
1340 * Create a new 'type' socket managed by 'manager'. Events
1341 * will be posted to 'task' and when dispatched 'action' will be
1342 * called with 'arg' as the arg value. The new socket is returned
1343 * in 'socketp'.
1345 isc_result_t
1346 isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1347 isc_socket_t **socketp)
1349 isc_socket_t *sock = NULL;
1350 isc_result_t ret;
1351 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
1352 int on = 1;
1353 #endif
1354 char strbuf[ISC_STRERRORSIZE];
1356 REQUIRE(VALID_MANAGER(manager));
1357 REQUIRE(socketp != NULL && *socketp == NULL);
1359 ret = allocate_socket(manager, type, &sock);
1360 if (ret != ISC_R_SUCCESS)
1361 return (ret);
1363 sock->pf = pf;
1364 switch (type) {
1365 case isc_sockettype_udp:
1366 sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1367 break;
1368 case isc_sockettype_tcp:
1369 sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1370 break;
1373 #ifdef F_DUPFD
1375 * Leave a space for stdio to work in.
1377 if (sock->fd >= 0 && sock->fd < 20) {
1378 int new, tmp;
1379 new = fcntl(sock->fd, F_DUPFD, 20);
1380 tmp = errno;
1381 (void)close(sock->fd);
1382 errno = tmp;
1383 sock->fd = new;
1385 #endif
1387 if (sock->fd >= (int)FD_SETSIZE) {
1388 (void)close(sock->fd);
1389 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1390 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1391 isc_msgcat, ISC_MSGSET_SOCKET,
1392 ISC_MSG_TOOMANYFDS,
1393 "%s: too many open file descriptors", "socket");
1394 free_socket(&sock);
1395 return (ISC_R_NORESOURCES);
1398 if (sock->fd < 0) {
1399 free_socket(&sock);
1401 switch (errno) {
1402 case EMFILE:
1403 case ENFILE:
1404 case ENOBUFS:
1405 return (ISC_R_NORESOURCES);
1407 case EPROTONOSUPPORT:
1408 case EPFNOSUPPORT:
1409 case EAFNOSUPPORT:
1411 * Linux 2.2 (and maybe others) return EINVAL instead of
1412 * EAFNOSUPPORT.
1414 case EINVAL:
1415 return (ISC_R_FAMILYNOSUPPORT);
1417 default:
1418 isc__strerror(errno, strbuf, sizeof(strbuf));
1419 UNEXPECTED_ERROR(__FILE__, __LINE__,
1420 "socket() %s: %s",
1421 isc_msgcat_get(isc_msgcat,
1422 ISC_MSGSET_GENERAL,
1423 ISC_MSG_FAILED,
1424 "failed"),
1425 strbuf);
1426 return (ISC_R_UNEXPECTED);
1430 if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
1431 (void)close(sock->fd);
1432 free_socket(&sock);
1433 return (ISC_R_UNEXPECTED);
1436 #ifdef SO_BSDCOMPAT
1437 if (setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
1438 (void *)&on, sizeof(on)) < 0) {
1439 isc__strerror(errno, strbuf, sizeof(strbuf));
1440 UNEXPECTED_ERROR(__FILE__, __LINE__,
1441 "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
1442 sock->fd,
1443 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
1444 ISC_MSG_FAILED, "failed"),
1445 strbuf);
1446 /* Press on... */
1448 #endif
1450 #if defined(USE_CMSG)
1451 if (type == isc_sockettype_udp) {
1453 #if defined(SO_TIMESTAMP)
1454 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1455 (void *)&on, sizeof(on)) < 0
1456 && errno != ENOPROTOOPT) {
1457 isc__strerror(errno, strbuf, sizeof(strbuf));
1458 UNEXPECTED_ERROR(__FILE__, __LINE__,
1459 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1460 sock->fd,
1461 isc_msgcat_get(isc_msgcat,
1462 ISC_MSGSET_GENERAL,
1463 ISC_MSG_FAILED,
1464 "failed"),
1465 strbuf);
1466 /* Press on... */
1468 #endif /* SO_TIMESTAMP */
1470 #if defined(ISC_PLATFORM_HAVEIPV6)
1471 if (pf == AF_INET6 && sock->recvcmsgbuflen == 0) {
1473 * Warn explicitly because this anomaly can be hidden
1474 * in usual operation (and unexpectedly appear later).
1476 UNEXPECTED_ERROR(__FILE__, __LINE__,
1477 "No buffer available to receive "
1478 "IPv6 destination");
1480 #ifdef IPV6_RECVPKTINFO
1481 /* 2292bis */
1482 if ((pf == AF_INET6)
1483 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1484 (void *)&on, sizeof(on)) < 0)) {
1485 isc__strerror(errno, strbuf, sizeof(strbuf));
1486 UNEXPECTED_ERROR(__FILE__, __LINE__,
1487 "setsockopt(%d, IPV6_RECVPKTINFO) "
1488 "%s: %s", sock->fd,
1489 isc_msgcat_get(isc_msgcat,
1490 ISC_MSGSET_GENERAL,
1491 ISC_MSG_FAILED,
1492 "failed"),
1493 strbuf);
1495 #else
1496 /* 2292 */
1497 if ((pf == AF_INET6)
1498 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1499 (void *)&on, sizeof(on)) < 0)) {
1500 isc__strerror(errno, strbuf, sizeof(strbuf));
1501 UNEXPECTED_ERROR(__FILE__, __LINE__,
1502 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1503 sock->fd,
1504 isc_msgcat_get(isc_msgcat,
1505 ISC_MSGSET_GENERAL,
1506 ISC_MSG_FAILED,
1507 "failed"),
1508 strbuf);
1510 #endif /* IPV6_RECVPKTINFO */
1511 #ifdef IPV6_USE_MIN_MTU /*2292bis, not too common yet*/
1512 /* use minimum MTU */
1513 if (pf == AF_INET6) {
1514 (void)setsockopt(sock->fd, IPPROTO_IPV6,
1515 IPV6_USE_MIN_MTU,
1516 (void *)&on, sizeof(on));
1518 #endif
1519 #endif /* ISC_PLATFORM_HAVEIPV6 */
1522 #endif /* USE_CMSG */
1524 sock->references = 1;
1525 *socketp = sock;
1527 LOCK(&manager->lock);
1530 * Note we don't have to lock the socket like we normally would because
1531 * there are no external references to it yet.
1534 manager->fds[sock->fd] = sock;
1535 manager->fdstate[sock->fd] = MANAGED;
1536 ISC_LIST_APPEND(manager->socklist, sock, link);
1537 if (manager->maxfd < sock->fd)
1538 manager->maxfd = sock->fd;
1540 UNLOCK(&manager->lock);
1542 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1543 ISC_MSG_CREATED, "created");
1545 return (ISC_R_SUCCESS);
1549 * Attach to a socket. Caller must explicitly detach when it is done.
1551 void
1552 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1553 REQUIRE(VALID_SOCKET(sock));
1554 REQUIRE(socketp != NULL && *socketp == NULL);
1556 LOCK(&sock->lock);
1557 sock->references++;
1558 UNLOCK(&sock->lock);
1560 *socketp = sock;
1564 * Dereference a socket. If this is the last reference to it, clean things
1565 * up by destroying the socket.
1567 void
1568 isc_socket_detach(isc_socket_t **socketp) {
1569 isc_socket_t *sock;
1570 isc_boolean_t kill_socket = ISC_FALSE;
1572 REQUIRE(socketp != NULL);
1573 sock = *socketp;
1574 REQUIRE(VALID_SOCKET(sock));
1576 LOCK(&sock->lock);
1577 REQUIRE(sock->references > 0);
1578 sock->references--;
1579 if (sock->references == 0)
1580 kill_socket = ISC_TRUE;
1581 UNLOCK(&sock->lock);
1583 if (kill_socket)
1584 destroy(&sock);
1586 *socketp = NULL;
1590 * I/O is possible on a given socket. Schedule an event to this task that
1591 * will call an internal function to do the I/O. This will charge the
1592 * task with the I/O operation and let our select loop handler get back
1593 * to doing something real as fast as possible.
1595 * The socket and manager must be locked before calling this function.
1597 static void
1598 dispatch_recv(isc_socket_t *sock) {
1599 intev_t *iev;
1600 isc_socketevent_t *ev;
1602 INSIST(!sock->pending_recv);
1604 ev = ISC_LIST_HEAD(sock->recv_list);
1605 if (ev == NULL)
1606 return;
1608 sock->pending_recv = 1;
1609 iev = &sock->readable_ev;
1611 socket_log(sock, NULL, EVENT, NULL, 0, 0,
1612 "dispatch_recv: event %p -> task %p", ev, ev->ev_sender);
1614 sock->references++;
1615 iev->ev_sender = sock;
1616 iev->ev_action = internal_recv;
1617 iev->ev_arg = sock;
1619 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1622 static void
1623 dispatch_send(isc_socket_t *sock) {
1624 intev_t *iev;
1625 isc_socketevent_t *ev;
1627 INSIST(!sock->pending_send);
1629 ev = ISC_LIST_HEAD(sock->send_list);
1630 if (ev == NULL)
1631 return;
1633 sock->pending_send = 1;
1634 iev = &sock->writable_ev;
1636 socket_log(sock, NULL, EVENT, NULL, 0, 0,
1637 "dispatch_send: event %p -> task %p", ev, ev->ev_sender);
1639 sock->references++;
1640 iev->ev_sender = sock;
1641 iev->ev_action = internal_send;
1642 iev->ev_arg = sock;
1644 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1648 * Dispatch an internal accept event.
1650 static void
1651 dispatch_accept(isc_socket_t *sock) {
1652 intev_t *iev;
1653 isc_socket_newconnev_t *ev;
1655 INSIST(!sock->pending_accept);
1658 * Are there any done events left, or were they all canceled
1659 * before the manager got the socket lock?
1661 ev = ISC_LIST_HEAD(sock->accept_list);
1662 if (ev == NULL)
1663 return;
1665 sock->pending_accept = 1;
1666 iev = &sock->readable_ev;
1668 sock->references++; /* keep socket around for this internal event */
1669 iev->ev_sender = sock;
1670 iev->ev_action = internal_accept;
1671 iev->ev_arg = sock;
1673 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1676 static void
1677 dispatch_connect(isc_socket_t *sock) {
1678 intev_t *iev;
1679 isc_socket_connev_t *ev;
1681 iev = &sock->writable_ev;
1683 ev = sock->connect_ev;
1684 INSIST(ev != NULL); /* XXX */
1686 INSIST(sock->connecting);
1688 sock->references++; /* keep socket around for this internal event */
1689 iev->ev_sender = sock;
1690 iev->ev_action = internal_connect;
1691 iev->ev_arg = sock;
1693 isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1697 * Dequeue an item off the given socket's read queue, set the result code
1698 * in the done event to the one provided, and send it to the task it was
1699 * destined for.
1701 * If the event to be sent is on a list, remove it before sending. If
1702 * asked to, send and detach from the socket as well.
1704 * Caller must have the socket locked if the event is attached to the socket.
1706 static void
1707 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1708 isc_task_t *task;
1710 task = (*dev)->ev_sender;
1712 (*dev)->ev_sender = sock;
1714 if (ISC_LINK_LINKED(*dev, ev_link))
1715 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1717 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1718 == ISC_SOCKEVENTATTR_ATTACHED)
1719 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1720 else
1721 isc_task_send(task, (isc_event_t **)dev);
1725 * See comments for send_recvdone_event() above.
1727 * Caller must have the socket locked if the event is attached to the socket.
1729 static void
1730 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1731 isc_task_t *task;
1733 INSIST(dev != NULL && *dev != NULL);
1735 task = (*dev)->ev_sender;
1736 (*dev)->ev_sender = sock;
1738 if (ISC_LINK_LINKED(*dev, ev_link))
1739 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1741 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1742 == ISC_SOCKEVENTATTR_ATTACHED)
1743 isc_task_sendanddetach(&task, (isc_event_t **)dev);
1744 else
1745 isc_task_send(task, (isc_event_t **)dev);
1749 * Call accept() on a socket, to get the new file descriptor. The listen
1750 * socket is used as a prototype to create a new isc_socket_t. The new
1751 * socket has one outstanding reference. The task receiving the event
1752 * will be detached from just after the event is delivered.
1754 * On entry to this function, the event delivered is the internal
1755 * readable event, and the first item on the accept_list should be
1756 * the done event we want to send. If the list is empty, this is a no-op,
1757 * so just unlock and return.
1759 static void
1760 internal_accept(isc_task_t *me, isc_event_t *ev) {
1761 isc_socket_t *sock;
1762 isc_socketmgr_t *manager;
1763 isc_socket_newconnev_t *dev;
1764 isc_task_t *task;
1765 ISC_SOCKADDR_LEN_T addrlen;
1766 int fd;
1767 isc_result_t result = ISC_R_SUCCESS;
1768 char strbuf[ISC_STRERRORSIZE];
1770 UNUSED(me);
1772 sock = ev->ev_sender;
1773 INSIST(VALID_SOCKET(sock));
1775 LOCK(&sock->lock);
1776 socket_log(sock, NULL, TRACE,
1777 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1778 "internal_accept called, locked socket");
1780 manager = sock->manager;
1781 INSIST(VALID_MANAGER(manager));
1783 INSIST(sock->listener);
1784 INSIST(sock->pending_accept == 1);
1785 sock->pending_accept = 0;
1787 INSIST(sock->references > 0);
1788 sock->references--; /* the internal event is done with this socket */
1789 if (sock->references == 0) {
1790 UNLOCK(&sock->lock);
1791 destroy(&sock);
1792 return;
1796 * Get the first item off the accept list.
1797 * If it is empty, unlock the socket and return.
1799 dev = ISC_LIST_HEAD(sock->accept_list);
1800 if (dev == NULL) {
1801 UNLOCK(&sock->lock);
1802 return;
1806 * Try to accept the new connection. If the accept fails with
1807 * EAGAIN or EINTR, simply poke the watcher to watch this socket
1808 * again. Also ignore ECONNRESET, which has been reported to
1809 * be spuriously returned on Linux 2.2.19 although it is not
1810 * a documented error for accept(). ECONNABORTED has been
1811 * reported for Solaris 8. The rest are thrown in not because
1812 * we have seen them but because they are ignored by other
1813 * deamons such as BIND 8 and Apache.
1816 addrlen = sizeof(dev->newsocket->address.type);
1817 memset(&dev->newsocket->address.type.sa, 0, addrlen);
1818 fd = accept(sock->fd, &dev->newsocket->address.type.sa,
1819 (void *)&addrlen);
1821 #ifdef F_DUPFD
1823 * Leave a space for stdio to work in.
1825 if (fd >= 0 && fd < 20) {
1826 int new, tmp;
1827 new = fcntl(fd, F_DUPFD, 20);
1828 tmp = errno;
1829 (void)close(fd);
1830 errno = tmp;
1831 fd = new;
1833 #endif
1835 if (fd < 0) {
1836 if (SOFT_ERROR(errno))
1837 goto soft_error;
1838 switch (errno) {
1839 case ENOBUFS:
1840 case ENFILE:
1841 case ENOMEM:
1842 case ECONNRESET:
1843 case ECONNABORTED:
1844 case EHOSTUNREACH:
1845 case EHOSTDOWN:
1846 case ENETUNREACH:
1847 case ENETDOWN:
1848 case ECONNREFUSED:
1849 #ifdef EPROTO
1850 case EPROTO:
1851 #endif
1852 #ifdef ENONET
1853 case ENONET:
1854 #endif
1855 goto soft_error;
1856 default:
1857 break;
1859 isc__strerror(errno, strbuf, sizeof(strbuf));
1860 UNEXPECTED_ERROR(__FILE__, __LINE__,
1861 "internal_accept: accept() %s: %s",
1862 isc_msgcat_get(isc_msgcat,
1863 ISC_MSGSET_GENERAL,
1864 ISC_MSG_FAILED,
1865 "failed"),
1866 strbuf);
1867 fd = -1;
1868 result = ISC_R_UNEXPECTED;
1869 } else {
1870 if (addrlen == 0) {
1871 UNEXPECTED_ERROR(__FILE__, __LINE__,
1872 "internal_accept(): "
1873 "accept() failed to return "
1874 "remote address");
1876 (void)close(fd);
1877 goto soft_error;
1878 } else if (dev->newsocket->address.type.sa.sa_family !=
1879 sock->pf)
1881 UNEXPECTED_ERROR(__FILE__, __LINE__,
1882 "internal_accept(): "
1883 "accept() returned peer address "
1884 "family %u (expected %u)",
1885 dev->newsocket->address.
1886 type.sa.sa_family,
1887 sock->pf);
1888 (void)close(fd);
1889 goto soft_error;
1890 } else if (fd >= (int)FD_SETSIZE) {
1891 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1892 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1893 isc_msgcat, ISC_MSGSET_SOCKET,
1894 ISC_MSG_TOOMANYFDS,
1895 "%s: too many open file descriptors",
1896 "accept");
1897 (void)close(fd);
1898 goto soft_error;
1902 if (fd != -1) {
1903 dev->newsocket->address.length = addrlen;
1904 dev->newsocket->pf = sock->pf;
1908 * Pull off the done event.
1910 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
1913 * Poke watcher if there are more pending accepts.
1915 if (!ISC_LIST_EMPTY(sock->accept_list))
1916 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
1918 UNLOCK(&sock->lock);
1920 if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
1921 close(fd);
1922 fd = -1;
1923 result = ISC_R_UNEXPECTED;
1927 * -1 means the new socket didn't happen.
1929 if (fd != -1) {
1930 LOCK(&manager->lock);
1931 ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
1933 dev->newsocket->fd = fd;
1934 dev->newsocket->bound = 1;
1935 dev->newsocket->connected = 1;
1938 * Save away the remote address
1940 dev->address = dev->newsocket->address;
1942 manager->fds[fd] = dev->newsocket;
1943 manager->fdstate[fd] = MANAGED;
1944 if (manager->maxfd < fd)
1945 manager->maxfd = fd;
1947 socket_log(sock, &dev->newsocket->address, CREATION,
1948 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
1949 "accepted connection, new socket %p",
1950 dev->newsocket);
1952 UNLOCK(&manager->lock);
1953 } else {
1954 dev->newsocket->references--;
1955 free_socket(&dev->newsocket);
1959 * Fill in the done event details and send it off.
1961 dev->result = result;
1962 task = dev->ev_sender;
1963 dev->ev_sender = sock;
1965 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
1966 return;
1968 soft_error:
1969 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
1970 UNLOCK(&sock->lock);
1971 return;
1974 static void
1975 internal_recv(isc_task_t *me, isc_event_t *ev) {
1976 isc_socketevent_t *dev;
1977 isc_socket_t *sock;
1979 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1981 sock = ev->ev_sender;
1982 INSIST(VALID_SOCKET(sock));
1984 LOCK(&sock->lock);
1985 socket_log(sock, NULL, IOEVENT,
1986 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
1987 "internal_recv: task %p got event %p", me, ev);
1989 INSIST(sock->pending_recv == 1);
1990 sock->pending_recv = 0;
1992 INSIST(sock->references > 0);
1993 sock->references--; /* the internal event is done with this socket */
1994 if (sock->references == 0) {
1995 UNLOCK(&sock->lock);
1996 destroy(&sock);
1997 return;
2001 * Try to do as much I/O as possible on this socket. There are no
2002 * limits here, currently.
2004 dev = ISC_LIST_HEAD(sock->recv_list);
2005 while (dev != NULL) {
2006 switch (doio_recv(sock, dev)) {
2007 case DOIO_SOFT:
2008 goto poke;
2010 case DOIO_EOF:
2012 * read of 0 means the remote end was closed.
2013 * Run through the event queue and dispatch all
2014 * the events with an EOF result code.
2016 do {
2017 dev->result = ISC_R_EOF;
2018 send_recvdone_event(sock, &dev);
2019 dev = ISC_LIST_HEAD(sock->recv_list);
2020 } while (dev != NULL);
2021 goto poke;
2023 case DOIO_SUCCESS:
2024 case DOIO_HARD:
2025 send_recvdone_event(sock, &dev);
2026 break;
2029 dev = ISC_LIST_HEAD(sock->recv_list);
2032 poke:
2033 if (!ISC_LIST_EMPTY(sock->recv_list))
2034 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2036 UNLOCK(&sock->lock);
2039 static void
2040 internal_send(isc_task_t *me, isc_event_t *ev) {
2041 isc_socketevent_t *dev;
2042 isc_socket_t *sock;
2044 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2047 * Find out what socket this is and lock it.
2049 sock = (isc_socket_t *)ev->ev_sender;
2050 INSIST(VALID_SOCKET(sock));
2052 LOCK(&sock->lock);
2053 socket_log(sock, NULL, IOEVENT,
2054 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2055 "internal_send: task %p got event %p", me, ev);
2057 INSIST(sock->pending_send == 1);
2058 sock->pending_send = 0;
2060 INSIST(sock->references > 0);
2061 sock->references--; /* the internal event is done with this socket */
2062 if (sock->references == 0) {
2063 UNLOCK(&sock->lock);
2064 destroy(&sock);
2065 return;
2069 * Try to do as much I/O as possible on this socket. There are no
2070 * limits here, currently.
2072 dev = ISC_LIST_HEAD(sock->send_list);
2073 while (dev != NULL) {
2074 switch (doio_send(sock, dev)) {
2075 case DOIO_SOFT:
2076 goto poke;
2078 case DOIO_HARD:
2079 case DOIO_SUCCESS:
2080 send_senddone_event(sock, &dev);
2081 break;
2084 dev = ISC_LIST_HEAD(sock->send_list);
2087 poke:
2088 if (!ISC_LIST_EMPTY(sock->send_list))
2089 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
2091 UNLOCK(&sock->lock);
2094 static void
2095 process_fds(isc_socketmgr_t *manager, int maxfd,
2096 fd_set *readfds, fd_set *writefds)
2098 int i;
2099 isc_socket_t *sock;
2100 isc_boolean_t unlock_sock;
2102 REQUIRE(maxfd <= (int)FD_SETSIZE);
2105 * Process read/writes on other fds here. Avoid locking
2106 * and unlocking twice if both reads and writes are possible.
2108 for (i = 0 ; i < maxfd ; i++) {
2109 #ifdef ISC_PLATFORM_USETHREADS
2110 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
2111 continue;
2112 #endif /* ISC_PLATFORM_USETHREADS */
2114 if (manager->fdstate[i] == CLOSE_PENDING) {
2115 manager->fdstate[i] = CLOSED;
2116 FD_CLR(i, &manager->read_fds);
2117 FD_CLR(i, &manager->write_fds);
2119 close(i);
2121 continue;
2124 sock = manager->fds[i];
2125 unlock_sock = ISC_FALSE;
2126 if (FD_ISSET(i, readfds)) {
2127 if (sock == NULL) {
2128 FD_CLR(i, &manager->read_fds);
2129 goto check_write;
2131 unlock_sock = ISC_TRUE;
2132 LOCK(&sock->lock);
2133 if (!SOCK_DEAD(sock)) {
2134 if (sock->listener)
2135 dispatch_accept(sock);
2136 else
2137 dispatch_recv(sock);
2139 FD_CLR(i, &manager->read_fds);
2141 check_write:
2142 if (FD_ISSET(i, writefds)) {
2143 if (sock == NULL) {
2144 FD_CLR(i, &manager->write_fds);
2145 continue;
2147 if (!unlock_sock) {
2148 unlock_sock = ISC_TRUE;
2149 LOCK(&sock->lock);
2151 if (!SOCK_DEAD(sock)) {
2152 if (sock->connecting)
2153 dispatch_connect(sock);
2154 else
2155 dispatch_send(sock);
2157 FD_CLR(i, &manager->write_fds);
2159 if (unlock_sock)
2160 UNLOCK(&sock->lock);
#ifdef ISC_PLATFORM_USETHREADS
/*
 * Body of the watcher thread.  It loops until told to shut down, always
 * sitting in select().
 *
 * Each time select() returns, control messages arriving on the manager's
 * pipe are drained first (wakeups for individual sockets, or the shutdown
 * request), and then process_fds() posts the appropriate internal event
 * to the task responsible for each ready descriptor.
 *
 * The manager lock is held at all times except across the select() call.
 */
static isc_threadresult_t
watcher(void *uap) {
	isc_socketmgr_t *manager = uap;
	isc_boolean_t done = ISC_FALSE;
	int ctlfd;
	int cc;
	fd_set readfds;
	fd_set writefds;
	int msg, fd;
	int maxfd;
	char strbuf[ISC_STRERRORSIZE];

	/*
	 * Get the control fd here.  This will never change.
	 */
	LOCK(&manager->lock);
	ctlfd = manager->pipe_fds[0];

	while (!done) {
		/* Retry the select until it succeeds. */
		do {
			/* Work on private copies of the watch sets. */
			readfds = manager->read_fds;
			writefds = manager->write_fds;
			maxfd = manager->maxfd + 1;

			UNLOCK(&manager->lock);

			cc = select(maxfd, &readfds, &writefds, NULL, NULL);
			if (cc < 0 && !SOFT_ERROR(errno)) {
				isc__strerror(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__,
					    "select() %s: %s",
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_GENERAL,
							   ISC_MSG_FAILED,
							   "failed"),
					    strbuf);
			}

			LOCK(&manager->lock);
		} while (cc < 0);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, &readfds)) {
			for (;;) {
				select_readmsg(manager, &fd, &msg);

				manager_log(manager, IOEVENT,
					    isc_msgcat_get(isc_msgcat,
							   ISC_MSGSET_SOCKET,
							   ISC_MSG_WATCHERMSG,
							   "watcher got message %d"),
					    msg);

				/*
				 * Nothing to read?
				 */
				if (msg == SELECT_POKE_NOTHING) {
					break;
				}

				/*
				 * Handle shutdown message.  We really should
				 * jump out of this loop right away, but
				 * it doesn't matter if we have to do a little
				 * more work first.
				 */
				if (msg == SELECT_POKE_SHUTDOWN) {
					done = ISC_TRUE;
					break;
				}

				/*
				 * This is a wakeup on a socket.  Look
				 * at the event queue for both read and write,
				 * and decide if we need to watch on it now
				 * or not.
				 */
				wakeup_socket(manager, fd, msg);
			}
		}

		process_fds(manager, maxfd, &readfds, &writefds);
	}

	manager_log(manager, TRACE,
		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				   ISC_MSG_EXITING, "watcher exiting"));

	UNLOCK(&manager->lock);
	return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
2273 * Create a new socket manager.
2275 isc_result_t
2276 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2277 isc_socketmgr_t *manager;
2278 #ifdef ISC_PLATFORM_USETHREADS
2279 char strbuf[ISC_STRERRORSIZE];
2280 #endif
2282 REQUIRE(managerp != NULL && *managerp == NULL);
2284 #ifndef ISC_PLATFORM_USETHREADS
2285 if (socketmgr != NULL) {
2286 socketmgr->refs++;
2287 *managerp = socketmgr;
2288 return (ISC_R_SUCCESS);
2290 #endif /* ISC_PLATFORM_USETHREADS */
2292 manager = isc_mem_get(mctx, sizeof(*manager));
2293 if (manager == NULL)
2294 return (ISC_R_NOMEMORY);
2296 manager->magic = SOCKET_MANAGER_MAGIC;
2297 manager->mctx = NULL;
2298 memset(manager->fds, 0, sizeof(manager->fds));
2299 ISC_LIST_INIT(manager->socklist);
2300 if (isc_mutex_init(&manager->lock) != ISC_R_SUCCESS) {
2301 isc_mem_put(mctx, manager, sizeof(*manager));
2302 UNEXPECTED_ERROR(__FILE__, __LINE__,
2303 "isc_mutex_init() %s",
2304 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2305 ISC_MSG_FAILED, "failed"));
2306 return (ISC_R_UNEXPECTED);
2308 #ifdef ISC_PLATFORM_USETHREADS
2309 if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2310 DESTROYLOCK(&manager->lock);
2311 isc_mem_put(mctx, manager, sizeof(*manager));
2312 UNEXPECTED_ERROR(__FILE__, __LINE__,
2313 "isc_condition_init() %s",
2314 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2315 ISC_MSG_FAILED, "failed"));
2316 return (ISC_R_UNEXPECTED);
2320 * Create the special fds that will be used to wake up the
2321 * select/poll loop when something internal needs to be done.
2323 if (pipe(manager->pipe_fds) != 0) {
2324 DESTROYLOCK(&manager->lock);
2325 isc_mem_put(mctx, manager, sizeof(*manager));
2326 isc__strerror(errno, strbuf, sizeof(strbuf));
2327 UNEXPECTED_ERROR(__FILE__, __LINE__,
2328 "pipe() %s: %s",
2329 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2330 ISC_MSG_FAILED, "failed"),
2331 strbuf);
2333 return (ISC_R_UNEXPECTED);
2336 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
2337 #if 0
2338 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
2339 #endif
2340 #else /* ISC_PLATFORM_USETHREADS */
2341 manager->refs = 1;
2342 #endif /* ISC_PLATFORM_USETHREADS */
2345 * Set up initial state for the select loop
2347 FD_ZERO(&manager->read_fds);
2348 FD_ZERO(&manager->write_fds);
2349 #ifdef ISC_PLATFORM_USETHREADS
2350 FD_SET(manager->pipe_fds[0], &manager->read_fds);
2351 manager->maxfd = manager->pipe_fds[0];
2352 #else /* ISC_PLATFORM_USETHREADS */
2353 manager->maxfd = 0;
2354 #endif /* ISC_PLATFORM_USETHREADS */
2355 memset(manager->fdstate, 0, sizeof(manager->fdstate));
2357 #ifdef ISC_PLATFORM_USETHREADS
2359 * Start up the select/poll thread.
2361 if (isc_thread_create(watcher, manager, &manager->watcher) !=
2362 ISC_R_SUCCESS) {
2363 close(manager->pipe_fds[0]);
2364 close(manager->pipe_fds[1]);
2365 DESTROYLOCK(&manager->lock);
2366 isc_mem_put(mctx, manager, sizeof(*manager));
2367 UNEXPECTED_ERROR(__FILE__, __LINE__,
2368 "isc_thread_create() %s",
2369 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2370 ISC_MSG_FAILED, "failed"));
2371 return (ISC_R_UNEXPECTED);
2373 #endif /* ISC_PLATFORM_USETHREADS */
2374 isc_mem_attach(mctx, &manager->mctx);
2376 #ifndef ISC_PLATFORM_USETHREADS
2377 socketmgr = manager;
2378 #endif /* ISC_PLATFORM_USETHREADS */
2379 *managerp = manager;
2381 return (ISC_R_SUCCESS);
2384 void
2385 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
2386 isc_socketmgr_t *manager;
2387 int i;
2388 isc_mem_t *mctx;
2391 * Destroy a socket manager.
2394 REQUIRE(managerp != NULL);
2395 manager = *managerp;
2396 REQUIRE(VALID_MANAGER(manager));
2398 #ifndef ISC_PLATFORM_USETHREADS
2399 if (manager->refs > 1) {
2400 manager->refs--;
2401 *managerp = NULL;
2402 return;
2404 #endif /* ISC_PLATFORM_USETHREADS */
2406 LOCK(&manager->lock);
2408 #ifdef ISC_PLATFORM_USETHREADS
2410 * Wait for all sockets to be destroyed.
2412 while (!ISC_LIST_EMPTY(manager->socklist)) {
2413 manager_log(manager, CREATION,
2414 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2415 ISC_MSG_SOCKETSREMAIN,
2416 "sockets exist"));
2417 WAIT(&manager->shutdown_ok, &manager->lock);
2419 #else /* ISC_PLATFORM_USETHREADS */
2421 * Hope all sockets have been destroyed.
2423 if (!ISC_LIST_EMPTY(manager->socklist)) {
2424 manager_log(manager, CREATION,
2425 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2426 ISC_MSG_SOCKETSREMAIN,
2427 "sockets exist"));
2428 INSIST(0);
2430 #endif /* ISC_PLATFORM_USETHREADS */
2432 UNLOCK(&manager->lock);
2435 * Here, poke our select/poll thread. Do this by closing the write
2436 * half of the pipe, which will send EOF to the read half.
2437 * This is currently a no-op in the non-threaded case.
2439 select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2441 #ifdef ISC_PLATFORM_USETHREADS
2443 * Wait for thread to exit.
2445 if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
2446 UNEXPECTED_ERROR(__FILE__, __LINE__,
2447 "isc_thread_join() %s",
2448 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2449 ISC_MSG_FAILED, "failed"));
2450 #endif /* ISC_PLATFORM_USETHREADS */
2453 * Clean up.
2455 #ifdef ISC_PLATFORM_USETHREADS
2456 close(manager->pipe_fds[0]);
2457 close(manager->pipe_fds[1]);
2458 (void)isc_condition_destroy(&manager->shutdown_ok);
2459 #endif /* ISC_PLATFORM_USETHREADS */
2461 for (i = 0 ; i < (int)FD_SETSIZE ; i++)
2462 if (manager->fdstate[i] == CLOSE_PENDING)
2463 close(i);
2465 DESTROYLOCK(&manager->lock);
2466 manager->magic = 0;
2467 mctx= manager->mctx;
2468 isc_mem_put(mctx, manager, sizeof(*manager));
2470 isc_mem_detach(&mctx);
2472 *managerp = NULL;
2475 static isc_result_t
2476 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2477 unsigned int flags)
2479 int io_state;
2480 isc_boolean_t have_lock = ISC_FALSE;
2481 isc_task_t *ntask = NULL;
2482 isc_result_t result = ISC_R_SUCCESS;
2484 dev->ev_sender = task;
2486 if (sock->type == isc_sockettype_udp) {
2487 io_state = doio_recv(sock, dev);
2488 } else {
2489 LOCK(&sock->lock);
2490 have_lock = ISC_TRUE;
2492 if (ISC_LIST_EMPTY(sock->recv_list))
2493 io_state = doio_recv(sock, dev);
2494 else
2495 io_state = DOIO_SOFT;
2498 switch (io_state) {
2499 case DOIO_SOFT:
2501 * We couldn't read all or part of the request right now, so
2502 * queue it.
2504 * Attach to socket and to task
2506 isc_task_attach(task, &ntask);
2507 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2509 if (!have_lock) {
2510 LOCK(&sock->lock);
2511 have_lock = ISC_TRUE;
2515 * Enqueue the request. If the socket was previously not being
2516 * watched, poke the watcher to start paying attention to it.
2518 if (ISC_LIST_EMPTY(sock->recv_list))
2519 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2520 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2522 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2523 "socket_recv: event %p -> task %p",
2524 dev, ntask);
2526 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2527 result = ISC_R_INPROGRESS;
2528 break;
2530 case DOIO_EOF:
2531 dev->result = ISC_R_EOF;
2532 /* fallthrough */
2534 case DOIO_HARD:
2535 case DOIO_SUCCESS:
2536 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2537 send_recvdone_event(sock, &dev);
2538 break;
2541 if (have_lock)
2542 UNLOCK(&sock->lock);
2544 return (result);
2547 isc_result_t
2548 isc_socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2549 unsigned int minimum, isc_task_t *task,
2550 isc_taskaction_t action, const void *arg)
2552 isc_socketevent_t *dev;
2553 isc_socketmgr_t *manager;
2554 unsigned int iocount;
2555 isc_buffer_t *buffer;
2557 REQUIRE(VALID_SOCKET(sock));
2558 REQUIRE(buflist != NULL);
2559 REQUIRE(!ISC_LIST_EMPTY(*buflist));
2560 REQUIRE(task != NULL);
2561 REQUIRE(action != NULL);
2563 manager = sock->manager;
2564 REQUIRE(VALID_MANAGER(manager));
2566 iocount = isc_bufferlist_availablecount(buflist);
2567 REQUIRE(iocount > 0);
2569 INSIST(sock->bound);
2571 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2572 if (dev == NULL) {
2573 return (ISC_R_NOMEMORY);
2577 * UDP sockets are always partial read
2579 if (sock->type == isc_sockettype_udp)
2580 dev->minimum = 1;
2581 else {
2582 if (minimum == 0)
2583 dev->minimum = iocount;
2584 else
2585 dev->minimum = minimum;
2589 * Move each buffer from the passed in list to our internal one.
2591 buffer = ISC_LIST_HEAD(*buflist);
2592 while (buffer != NULL) {
2593 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2594 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2595 buffer = ISC_LIST_HEAD(*buflist);
2598 return (socket_recv(sock, dev, task, 0));
2601 isc_result_t
2602 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
2603 isc_task_t *task, isc_taskaction_t action, const void *arg)
2605 isc_socketevent_t *dev;
2606 isc_socketmgr_t *manager;
2608 REQUIRE(VALID_SOCKET(sock));
2609 REQUIRE(action != NULL);
2611 manager = sock->manager;
2612 REQUIRE(VALID_MANAGER(manager));
2614 INSIST(sock->bound);
2616 dev = allocate_socketevent(sock, ISC_SOCKEVENT_RECVDONE, action, arg);
2617 if (dev == NULL)
2618 return (ISC_R_NOMEMORY);
2620 return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
2623 isc_result_t
2624 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region,
2625 unsigned int minimum, isc_task_t *task,
2626 isc_socketevent_t *event, unsigned int flags)
2628 event->ev_sender = sock;
2629 event->result = ISC_R_UNEXPECTED;
2630 ISC_LIST_INIT(event->bufferlist);
2631 event->region = *region;
2632 event->n = 0;
2633 event->offset = 0;
2634 event->attributes = 0;
2637 * UDP sockets are always partial read.
2639 if (sock->type == isc_sockettype_udp)
2640 event->minimum = 1;
2641 else {
2642 if (minimum == 0)
2643 event->minimum = region->length;
2644 else
2645 event->minimum = minimum;
2648 return (socket_recv(sock, event, task, flags));
2651 static isc_result_t
2652 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2653 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2654 unsigned int flags)
2656 int io_state;
2657 isc_boolean_t have_lock = ISC_FALSE;
2658 isc_task_t *ntask = NULL;
2659 isc_result_t result = ISC_R_SUCCESS;
2661 dev->ev_sender = task;
2663 set_dev_address(address, sock, dev);
2664 if (pktinfo != NULL) {
2665 socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
2666 ISC_MSG_PKTINFOPROVIDED,
2667 "pktinfo structure provided, ifindex %u (set to 0)",
2668 pktinfo->ipi6_ifindex);
2670 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2671 dev->pktinfo = *pktinfo;
2673 * Set the pktinfo index to 0 here, to let the kernel decide
2674 * what interface it should send on.
2676 dev->pktinfo.ipi6_ifindex = 0;
2679 if (sock->type == isc_sockettype_udp)
2680 io_state = doio_send(sock, dev);
2681 else {
2682 LOCK(&sock->lock);
2683 have_lock = ISC_TRUE;
2685 if (ISC_LIST_EMPTY(sock->send_list))
2686 io_state = doio_send(sock, dev);
2687 else
2688 io_state = DOIO_SOFT;
2691 switch (io_state) {
2692 case DOIO_SOFT:
2694 * We couldn't send all or part of the request right now, so
2695 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2697 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2698 isc_task_attach(task, &ntask);
2699 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2701 if (!have_lock) {
2702 LOCK(&sock->lock);
2703 have_lock = ISC_TRUE;
2707 * Enqueue the request. If the socket was previously
2708 * not being watched, poke the watcher to start
2709 * paying attention to it.
2711 if (ISC_LIST_EMPTY(sock->send_list))
2712 select_poke(sock->manager, sock->fd,
2713 SELECT_POKE_WRITE);
2714 ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2716 socket_log(sock, NULL, EVENT, NULL, 0, 0,
2717 "socket_send: event %p -> task %p",
2718 dev, ntask);
2720 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2721 result = ISC_R_INPROGRESS;
2722 break;
2725 case DOIO_HARD:
2726 case DOIO_SUCCESS:
2727 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2728 send_senddone_event(sock, &dev);
2729 break;
2732 if (have_lock)
2733 UNLOCK(&sock->lock);
2735 return (result);
2738 isc_result_t
2739 isc_socket_send(isc_socket_t *sock, isc_region_t *region,
2740 isc_task_t *task, isc_taskaction_t action, const void *arg)
2743 * REQUIRE() checking is performed in isc_socket_sendto().
2745 return (isc_socket_sendto(sock, region, task, action, arg, NULL,
2746 NULL));
2749 isc_result_t
2750 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region,
2751 isc_task_t *task, isc_taskaction_t action, const void *arg,
2752 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2754 isc_socketevent_t *dev;
2755 isc_socketmgr_t *manager;
2757 REQUIRE(VALID_SOCKET(sock));
2758 REQUIRE(region != NULL);
2759 REQUIRE(task != NULL);
2760 REQUIRE(action != NULL);
2762 manager = sock->manager;
2763 REQUIRE(VALID_MANAGER(manager));
2765 INSIST(sock->bound);
2767 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2768 if (dev == NULL) {
2769 return (ISC_R_NOMEMORY);
2772 dev->region = *region;
2774 return (socket_send(sock, dev, task, address, pktinfo, 0));
2777 isc_result_t
2778 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2779 isc_task_t *task, isc_taskaction_t action, const void *arg)
2781 return (isc_socket_sendtov(sock, buflist, task, action, arg, NULL,
2782 NULL));
2785 isc_result_t
2786 isc_socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
2787 isc_task_t *task, isc_taskaction_t action, const void *arg,
2788 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
2790 isc_socketevent_t *dev;
2791 isc_socketmgr_t *manager;
2792 unsigned int iocount;
2793 isc_buffer_t *buffer;
2795 REQUIRE(VALID_SOCKET(sock));
2796 REQUIRE(buflist != NULL);
2797 REQUIRE(!ISC_LIST_EMPTY(*buflist));
2798 REQUIRE(task != NULL);
2799 REQUIRE(action != NULL);
2801 manager = sock->manager;
2802 REQUIRE(VALID_MANAGER(manager));
2804 iocount = isc_bufferlist_usedcount(buflist);
2805 REQUIRE(iocount > 0);
2807 dev = allocate_socketevent(sock, ISC_SOCKEVENT_SENDDONE, action, arg);
2808 if (dev == NULL) {
2809 return (ISC_R_NOMEMORY);
2813 * Move each buffer from the passed in list to our internal one.
2815 buffer = ISC_LIST_HEAD(*buflist);
2816 while (buffer != NULL) {
2817 ISC_LIST_DEQUEUE(*buflist, buffer, link);
2818 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2819 buffer = ISC_LIST_HEAD(*buflist);
2822 return (socket_send(sock, dev, task, address, pktinfo, 0));
2825 isc_result_t
2826 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region,
2827 isc_task_t *task,
2828 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2829 isc_socketevent_t *event, unsigned int flags)
2831 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
2832 if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
2833 REQUIRE(sock->type == isc_sockettype_udp);
2834 event->ev_sender = sock;
2835 event->result = ISC_R_UNEXPECTED;
2836 ISC_LIST_INIT(event->bufferlist);
2837 event->region = *region;
2838 event->n = 0;
2839 event->offset = 0;
2840 event->attributes = 0;
2842 return (socket_send(sock, event, task, address, pktinfo, flags));
2845 isc_result_t
2846 isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr) {
2847 char strbuf[ISC_STRERRORSIZE];
2848 int on = 1;
2850 LOCK(&sock->lock);
2852 INSIST(!sock->bound);
2854 if (sock->pf != sockaddr->type.sa.sa_family) {
2855 UNLOCK(&sock->lock);
2856 return (ISC_R_FAMILYMISMATCH);
2859 * Only set SO_REUSEADDR when we want a specific port.
2861 if (isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2862 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2863 sizeof(on)) < 0) {
2864 UNEXPECTED_ERROR(__FILE__, __LINE__,
2865 "setsockopt(%d) %s", sock->fd,
2866 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2867 ISC_MSG_FAILED, "failed"));
2868 /* Press on... */
2870 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2871 UNLOCK(&sock->lock);
2872 switch (errno) {
2873 case EACCES:
2874 return (ISC_R_NOPERM);
2875 case EADDRNOTAVAIL:
2876 return (ISC_R_ADDRNOTAVAIL);
2877 case EADDRINUSE:
2878 return (ISC_R_ADDRINUSE);
2879 case EINVAL:
2880 return (ISC_R_BOUND);
2881 default:
2882 isc__strerror(errno, strbuf, sizeof(strbuf));
2883 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2884 strbuf);
2885 return (ISC_R_UNEXPECTED);
2889 socket_log(sock, sockaddr, TRACE,
2890 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
2891 sock->bound = 1;
2893 UNLOCK(&sock->lock);
2894 return (ISC_R_SUCCESS);
2898 * Set up to listen on a given socket. We do this by creating an internal
2899 * event that will be dispatched when the socket has read activity. The
2900 * watcher will send the internal event to the task when there is a new
2901 * connection.
2903 * Unlike in read, we don't preallocate a done event here. Every time there
2904 * is a new connection we'll have to allocate a new one anyway, so we might
2905 * as well keep things simple rather than having to track them.
2907 isc_result_t
2908 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
2909 char strbuf[ISC_STRERRORSIZE];
2911 REQUIRE(VALID_SOCKET(sock));
2913 LOCK(&sock->lock);
2915 REQUIRE(!sock->listener);
2916 REQUIRE(sock->bound);
2917 REQUIRE(sock->type == isc_sockettype_tcp);
2919 if (backlog == 0)
2920 backlog = SOMAXCONN;
2922 if (listen(sock->fd, (int)backlog) < 0) {
2923 UNLOCK(&sock->lock);
2924 isc__strerror(errno, strbuf, sizeof(strbuf));
2926 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
2928 return (ISC_R_UNEXPECTED);
2931 sock->listener = 1;
2933 UNLOCK(&sock->lock);
2934 return (ISC_R_SUCCESS);
2938 * This should try to do agressive accept() XXXMLG
2940 isc_result_t
2941 isc_socket_accept(isc_socket_t *sock,
2942 isc_task_t *task, isc_taskaction_t action, const void *arg)
2944 isc_socket_newconnev_t *dev;
2945 isc_socketmgr_t *manager;
2946 isc_task_t *ntask = NULL;
2947 isc_socket_t *nsock;
2948 isc_result_t ret;
2949 isc_boolean_t do_poke = ISC_FALSE;
2951 REQUIRE(VALID_SOCKET(sock));
2952 manager = sock->manager;
2953 REQUIRE(VALID_MANAGER(manager));
2955 LOCK(&sock->lock);
2957 REQUIRE(sock->listener);
2960 * Sender field is overloaded here with the task we will be sending
2961 * this event to. Just before the actual event is delivered the
2962 * actual ev_sender will be touched up to be the socket.
2964 dev = (isc_socket_newconnev_t *)
2965 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
2966 action, arg, sizeof(*dev));
2967 if (dev == NULL) {
2968 UNLOCK(&sock->lock);
2969 return (ISC_R_NOMEMORY);
2971 ISC_LINK_INIT(dev, ev_link);
2973 ret = allocate_socket(manager, sock->type, &nsock);
2974 if (ret != ISC_R_SUCCESS) {
2975 isc_event_free(ISC_EVENT_PTR(&dev));
2976 UNLOCK(&sock->lock);
2977 return (ret);
2981 * Attach to socket and to task.
2983 isc_task_attach(task, &ntask);
2984 nsock->references++;
2986 dev->ev_sender = ntask;
2987 dev->newsocket = nsock;
2990 * Poke watcher here. We still have the socket locked, so there
2991 * is no race condition. We will keep the lock for such a short
2992 * bit of time waking it up now or later won't matter all that much.
2994 if (ISC_LIST_EMPTY(sock->accept_list))
2995 do_poke = ISC_TRUE;
2997 ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
2999 if (do_poke)
3000 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
3002 UNLOCK(&sock->lock);
3003 return (ISC_R_SUCCESS);
3006 isc_result_t
3007 isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
3008 isc_task_t *task, isc_taskaction_t action, const void *arg)
3010 isc_socket_connev_t *dev;
3011 isc_task_t *ntask = NULL;
3012 isc_socketmgr_t *manager;
3013 int cc;
3014 char strbuf[ISC_STRERRORSIZE];
3016 REQUIRE(VALID_SOCKET(sock));
3017 REQUIRE(addr != NULL);
3018 REQUIRE(task != NULL);
3019 REQUIRE(action != NULL);
3021 manager = sock->manager;
3022 REQUIRE(VALID_MANAGER(manager));
3023 REQUIRE(addr != NULL);
3025 if (isc_sockaddr_ismulticast(addr))
3026 return (ISC_R_MULTICAST);
3028 LOCK(&sock->lock);
3030 REQUIRE(!sock->connecting);
3032 dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
3033 ISC_SOCKEVENT_CONNECT,
3034 action, arg,
3035 sizeof(*dev));
3036 if (dev == NULL) {
3037 UNLOCK(&sock->lock);
3038 return (ISC_R_NOMEMORY);
3040 ISC_LINK_INIT(dev, ev_link);
3043 * Try to do the connect right away, as there can be only one
3044 * outstanding, and it might happen to complete.
3046 sock->address = *addr;
3047 cc = connect(sock->fd, &addr->type.sa, addr->length);
3048 if (cc < 0) {
3049 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
3050 goto queue;
3052 switch (errno) {
3053 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
3054 ERROR_MATCH(EACCES, ISC_R_NOPERM);
3055 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3056 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3057 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3058 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3059 #ifdef EHOSTDOWN
3060 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3061 #endif
3062 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3063 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3064 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3065 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3066 #undef ERROR_MATCH
3069 sock->connected = 0;
3071 isc__strerror(errno, strbuf, sizeof(strbuf));
3072 UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
3074 UNLOCK(&sock->lock);
3075 isc_event_free(ISC_EVENT_PTR(&dev));
3076 return (ISC_R_UNEXPECTED);
3078 err_exit:
3079 sock->connected = 0;
3080 isc_task_send(task, ISC_EVENT_PTR(&dev));
3082 UNLOCK(&sock->lock);
3083 return (ISC_R_SUCCESS);
3087 * If connect completed, fire off the done event.
3089 if (cc == 0) {
3090 sock->connected = 1;
3091 sock->bound = 1;
3092 dev->result = ISC_R_SUCCESS;
3093 isc_task_send(task, ISC_EVENT_PTR(&dev));
3095 UNLOCK(&sock->lock);
3096 return (ISC_R_SUCCESS);
3099 queue:
3102 * Attach to task.
3104 isc_task_attach(task, &ntask);
3106 sock->connecting = 1;
3108 dev->ev_sender = ntask;
3111 * Poke watcher here. We still have the socket locked, so there
3112 * is no race condition. We will keep the lock for such a short
3113 * bit of time waking it up now or later won't matter all that much.
3115 if (sock->connect_ev == NULL)
3116 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
3118 sock->connect_ev = dev;
3120 UNLOCK(&sock->lock);
3121 return (ISC_R_SUCCESS);
3125 * Called when a socket with a pending connect() finishes.
3127 static void
3128 internal_connect(isc_task_t *me, isc_event_t *ev) {
3129 isc_socket_t *sock;
3130 isc_socket_connev_t *dev;
3131 isc_task_t *task;
3132 int cc;
3133 ISC_SOCKADDR_LEN_T optlen;
3134 char strbuf[ISC_STRERRORSIZE];
3136 UNUSED(me);
3137 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
3139 sock = ev->ev_sender;
3140 INSIST(VALID_SOCKET(sock));
3142 LOCK(&sock->lock);
3145 * When the internal event was sent the reference count was bumped
3146 * to keep the socket around for us. Decrement the count here.
3148 INSIST(sock->references > 0);
3149 sock->references--;
3150 if (sock->references == 0) {
3151 UNLOCK(&sock->lock);
3152 destroy(&sock);
3153 return;
3157 * Has this event been canceled?
3159 dev = sock->connect_ev;
3160 if (dev == NULL) {
3161 INSIST(!sock->connecting);
3162 UNLOCK(&sock->lock);
3163 return;
3166 INSIST(sock->connecting);
3167 sock->connecting = 0;
3170 * Get any possible error status here.
3172 optlen = sizeof(cc);
3173 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
3174 (void *)&cc, (void *)&optlen) < 0)
3175 cc = errno;
3176 else
3177 errno = cc;
3179 if (errno != 0) {
3181 * If the error is EAGAIN, just re-select on this
3182 * fd and pretend nothing strange happened.
3184 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
3185 sock->connecting = 1;
3186 select_poke(sock->manager, sock->fd,
3187 SELECT_POKE_CONNECT);
3188 UNLOCK(&sock->lock);
3190 return;
3194 * Translate other errors into ISC_R_* flavors.
3196 switch (errno) {
3197 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
3198 ERROR_MATCH(EACCES, ISC_R_NOPERM);
3199 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
3200 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
3201 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
3202 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
3203 #ifdef EHOSTDOWN
3204 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
3205 #endif
3206 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
3207 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
3208 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
3209 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
3210 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
3211 #undef ERROR_MATCH
3212 default:
3213 dev->result = ISC_R_UNEXPECTED;
3214 isc__strerror(errno, strbuf, sizeof(strbuf));
3215 UNEXPECTED_ERROR(__FILE__, __LINE__,
3216 "internal_connect: connect() %s",
3217 strbuf);
3219 } else {
3220 dev->result = ISC_R_SUCCESS;
3221 sock->connected = 1;
3222 sock->bound = 1;
3225 sock->connect_ev = NULL;
3227 UNLOCK(&sock->lock);
3229 task = dev->ev_sender;
3230 dev->ev_sender = sock;
3231 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
3234 isc_result_t
3235 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3236 isc_result_t ret;
3238 REQUIRE(VALID_SOCKET(sock));
3239 REQUIRE(addressp != NULL);
3241 LOCK(&sock->lock);
3243 if (sock->connected) {
3244 *addressp = sock->address;
3245 ret = ISC_R_SUCCESS;
3246 } else {
3247 ret = ISC_R_NOTCONNECTED;
3250 UNLOCK(&sock->lock);
3252 return (ret);
3255 isc_result_t
3256 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
3257 ISC_SOCKADDR_LEN_T len;
3258 isc_result_t ret;
3259 char strbuf[ISC_STRERRORSIZE];
3261 REQUIRE(VALID_SOCKET(sock));
3262 REQUIRE(addressp != NULL);
3264 LOCK(&sock->lock);
3266 if (!sock->bound) {
3267 ret = ISC_R_NOTBOUND;
3268 goto out;
3271 ret = ISC_R_SUCCESS;
3273 len = sizeof(addressp->type);
3274 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
3275 isc__strerror(errno, strbuf, sizeof(strbuf));
3276 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
3277 strbuf);
3278 ret = ISC_R_UNEXPECTED;
3279 goto out;
3281 addressp->length = (unsigned int)len;
3283 out:
3284 UNLOCK(&sock->lock);
3286 return (ret);
3290 * Run through the list of events on this socket, and cancel the ones
3291 * queued for task "task" of type "how". "how" is a bitmask.
3293 void
3294 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
3296 REQUIRE(VALID_SOCKET(sock));
3299 * Quick exit if there is nothing to do. Don't even bother locking
3300 * in this case.
3302 if (how == 0)
3303 return;
3305 LOCK(&sock->lock);
3308 * All of these do the same thing, more or less.
3309 * Each will:
3310 * o If the internal event is marked as "posted" try to
3311 * remove it from the task's queue. If this fails, mark it
3312 * as canceled instead, and let the task clean it up later.
3313 * o For each I/O request for that task of that type, post
3314 * its done event with status of "ISC_R_CANCELED".
3315 * o Reset any state needed.
3317 if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
3318 && !ISC_LIST_EMPTY(sock->recv_list)) {
3319 isc_socketevent_t *dev;
3320 isc_socketevent_t *next;
3321 isc_task_t *current_task;
3323 dev = ISC_LIST_HEAD(sock->recv_list);
3325 while (dev != NULL) {
3326 current_task = dev->ev_sender;
3327 next = ISC_LIST_NEXT(dev, ev_link);
3329 if ((task == NULL) || (task == current_task)) {
3330 dev->result = ISC_R_CANCELED;
3331 send_recvdone_event(sock, &dev);
3333 dev = next;
3337 if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
3338 && !ISC_LIST_EMPTY(sock->send_list)) {
3339 isc_socketevent_t *dev;
3340 isc_socketevent_t *next;
3341 isc_task_t *current_task;
3343 dev = ISC_LIST_HEAD(sock->send_list);
3345 while (dev != NULL) {
3346 current_task = dev->ev_sender;
3347 next = ISC_LIST_NEXT(dev, ev_link);
3349 if ((task == NULL) || (task == current_task)) {
3350 dev->result = ISC_R_CANCELED;
3351 send_senddone_event(sock, &dev);
3353 dev = next;
3357 if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
3358 && !ISC_LIST_EMPTY(sock->accept_list)) {
3359 isc_socket_newconnev_t *dev;
3360 isc_socket_newconnev_t *next;
3361 isc_task_t *current_task;
3363 dev = ISC_LIST_HEAD(sock->accept_list);
3364 while (dev != NULL) {
3365 current_task = dev->ev_sender;
3366 next = ISC_LIST_NEXT(dev, ev_link);
3368 if ((task == NULL) || (task == current_task)) {
3370 ISC_LIST_UNLINK(sock->accept_list, dev,
3371 ev_link);
3373 dev->newsocket->references--;
3374 free_socket(&dev->newsocket);
3376 dev->result = ISC_R_CANCELED;
3377 dev->ev_sender = sock;
3378 isc_task_sendanddetach(&current_task,
3379 ISC_EVENT_PTR(&dev));
3382 dev = next;
3387 * Connecting is not a list.
3389 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
3390 && sock->connect_ev != NULL) {
3391 isc_socket_connev_t *dev;
3392 isc_task_t *current_task;
3394 INSIST(sock->connecting);
3395 sock->connecting = 0;
3397 dev = sock->connect_ev;
3398 current_task = dev->ev_sender;
3400 if ((task == NULL) || (task == current_task)) {
3401 sock->connect_ev = NULL;
3403 dev->result = ISC_R_CANCELED;
3404 dev->ev_sender = sock;
3405 isc_task_sendanddetach(&current_task,
3406 ISC_EVENT_PTR(&dev));
3410 UNLOCK(&sock->lock);
3413 isc_sockettype_t
3414 isc_socket_gettype(isc_socket_t *sock) {
3415 REQUIRE(VALID_SOCKET(sock));
3417 return (sock->type);
3420 isc_boolean_t
3421 isc_socket_isbound(isc_socket_t *sock) {
3422 isc_boolean_t val;
3424 LOCK(&sock->lock);
3425 val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
3426 UNLOCK(&sock->lock);
3428 return (val);
3431 #ifndef ISC_PLATFORM_USETHREADS
3432 void
3433 isc__socketmgr_getfdsets(fd_set *readset, fd_set *writeset, int *maxfd) {
3434 if (socketmgr == NULL)
3435 *maxfd = 0;
3436 else {
3437 *readset = socketmgr->read_fds;
3438 *writeset = socketmgr->write_fds;
3439 *maxfd = socketmgr->maxfd + 1;
3443 isc_result_t
3444 isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
3445 isc_socketmgr_t *manager = socketmgr;
3447 if (manager == NULL)
3448 return (ISC_R_NOTFOUND);
3450 process_fds(manager, maxfd, readset, writeset);
3451 return (ISC_R_SUCCESS);
3453 #endif /* ISC_PLATFORM_USETHREADS */