From 88da6203404cc737cdbc49d09c95a3ea552ef7e3 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Tue, 29 Nov 2011 21:38:34 +0800 Subject: [PATCH] accept: Save foreign address earlier, if protocol supports it - Add so_faddr into socket, which records the accepted socket's foreign address. If it is set, kern_accept() will use it directly instead of calling the protocol specific method to extract the foreign address. - Add a protocol specific method, pru_savefaddr, which will save the foreign address into socket.so_faddr if the necessary information is supplied. This protocol method will only be called in the protocol thread. - Pass the foreign address to sonewconn() if possible, so the foreign address can be saved before the accepted socket is put onto the complete list. Currently only IPv4/TCP implements pru_savefaddr. This intends to address the following problems: - Calling pru_accept directly from user context is not MPSAFE; we always race the socket.so_pcb check->use against the protocol thread clearing/freeing socket.so_pcb, though the race window is too tiny to be hit. To make it MPSAFE, we should dispatch pru_accept to the protocol thread. If socket.so_faddr is set here, we race against nothing, and nothing expensive, like putting the current user thread to sleep, will happen. However, if the socket is dropped while it still sits on the complete list, the error will not be delivered in a timely manner, i.e. accept(2) will not return an error, but a later read(2)/write(2) on the socket will deliver the error. - Calling pru_accept directly races against the inpcb.inp_f{addr,port} setup in the protocol thread, since inpcb.inp_f{addr,port} is set up _after_ the accepted socket was put onto the complete list. Timeline (user thread vs. proto thread): (1) proto thread: accepted socket -> comp (inpcb.inp_f{addr,port} are 0 here); (2) user thread: comp -> socket; (3) user thread: pru_accept; (4) proto thread: setup inpcb.inp_f{addr,port}. Returning of 0.0.0.0:0 from accept(2) was observed on heavily loaded web servers. 
--- sys/kern/uipc_socket.c | 14 +++++++++++--- sys/kern/uipc_socket2.c | 16 +++++++++++++++- sys/kern/uipc_syscalls.c | 10 +++++++++- sys/netinet/in_pcb.c | 17 +++++++++++++++++ sys/netinet/in_pcb.h | 1 + sys/netinet/tcp_syncache.c | 25 +++++++++++++++++-------- sys/netinet/tcp_usrreq.c | 9 ++++++++- sys/sys/protosw.h | 7 +++++++ sys/sys/socketvar.h | 4 ++++ 9 files changed, 89 insertions(+), 14 deletions(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 6620fb16ef..118a741545 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -281,6 +281,8 @@ sodealloc(struct socket *so) do_setopt_accept_filter(so, NULL); #endif /* INET */ crfree(so->so_cred); + if (so->so_faddr != NULL) + kfree(so->so_faddr, M_SONAME); kfree(so, M_SOCKET); } @@ -597,14 +599,20 @@ soabort_oncpu(struct socket *so) * so is passed in ref'd, which becomes owned by * the cleared SS_NOFDREF flag. */ +void +soaccept_generic(struct socket *so) +{ + if ((so->so_state & SS_NOFDREF) == 0) + panic("soaccept: !NOFDREF"); + soclrstate(so, SS_NOFDREF); /* owned by lack of SS_NOFDREF */ +} + int soaccept(struct socket *so, struct sockaddr **nam) { int error; - if ((so->so_state & SS_NOFDREF) == 0) - panic("soaccept: !NOFDREF"); - soclrstate(so, SS_NOFDREF); /* owned by lack of SS_NOFDREF */ + soaccept_generic(so); error = so_pru_accept_direct(so, nam); return (error); } diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index cf7da9ac91..dc3ca6a9be 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -327,7 +327,8 @@ sosetport(struct socket *so, lwkt_port_t port) * The reference is implied by so_pcb. 
*/ struct socket * -sonewconn(struct socket *head, int connstatus) +sonewconn_faddr(struct socket *head, int connstatus, + const struct sockaddr *faddr) { struct socket *so; struct socket *sp; @@ -405,6 +406,13 @@ sonewconn(struct socket *head, int connstatus) else so->so_snd.ssb_flags &= ~SSB_AUTOSIZE; + /* + * Save the faddr, if the information is provided and + * the protocol can perform the saving operation. + */ + if (faddr != NULL && so->so_proto->pr_usrreqs->pru_savefaddr != NULL) + so->so_proto->pr_usrreqs->pru_savefaddr(so, faddr); + lwkt_getpooltoken(head); if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); @@ -437,6 +445,12 @@ sonewconn(struct socket *head, int connstatus) return (so); } +struct socket * +sonewconn(struct socket *head, int connstatus) +{ + return sonewconn_faddr(head, connstatus, NULL); +} + /* * Socantsendmore indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index cdb3bf1e7b..35760bdbca 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -356,7 +356,15 @@ accepted: fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, td->td_ucred, NULL); sa = NULL; - error = soaccept(so, &sa); + if (so->so_faddr != NULL) { + sa = so->so_faddr; + so->so_faddr = NULL; + + soaccept_generic(so); + error = 0; + } else { + error = soaccept(so, &sa); + } /* * Set the returned name and namelen as applicable. 
Set the returned diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 2c253788ce..12aa3a8a4f 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1687,3 +1687,20 @@ in_pcblist_global_nomarker(SYSCTL_HANDLER_ARGS, struct xinpcb **xi0, int *nxi0) return 0; } + +void +in_savefaddr(struct socket *so, const struct sockaddr *faddr) +{ + struct sockaddr_in *sin; + + KASSERT(faddr->sa_family == AF_INET, + ("not AF_INET faddr %d\n", faddr->sa_family)); + + sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port; + sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr; + + so->so_faddr = (struct sockaddr *)sin; +} diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 26715bbde3..1bb8e09bef 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -442,6 +442,7 @@ void in_pcbremwildcardhash_oncpu(struct inpcb *, struct inpcbinfo *); void in_pcbremconnhash(struct inpcb *inp); void in_pcbremlists (struct inpcb *inp); int prison_xinpcb (struct thread *p, struct inpcb *inp); +void in_savefaddr (struct socket *so, const struct sockaddr *faddr); int in_pcblist_global(SYSCTL_HANDLER_ARGS); int in_pcblist_global_nomarker(SYSCTL_HANDLER_ARGS, diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 6a5987ffc2..9a7191df73 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -686,6 +686,21 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) #else const boolean_t isipv6 = FALSE; #endif + struct sockaddr_in sin_faddr; + struct sockaddr *faddr; + + if (isipv6) { + /* XXX Not ready yet */ + faddr = NULL; + } else { + /* XXX duplicate later on code */ + faddr = (struct sockaddr *)&sin_faddr; + sin_faddr.sin_family = AF_INET; + sin_faddr.sin_len = sizeof(sin_faddr); + sin_faddr.sin_addr = sc->sc_inc.inc_faddr; + sin_faddr.sin_port = sc->sc_inc.inc_fport; + 
bzero(sin_faddr.sin_zero, sizeof(sin_faddr.sin_zero)); + } /* * Ok, create the full blown connection, and set things up @@ -696,7 +711,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) * Set the protocol processing port for the socket to the current * port (that the connection came in on). */ - so = sonewconn(lso, SS_ISCONNECTED); + so = sonewconn_faddr(lso, SS_ISCONNECTED, faddr); if (so == NULL) { /* * Drop the connection; we will send a RST if the peer @@ -773,7 +788,6 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) } } else { struct in_addr laddr; - struct sockaddr_in sin; inp->inp_options = ip_srcroute(m); if (inp->inp_options == NULL) { @@ -783,15 +797,10 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) inp->inp_route = sc->sc_route; sc->sc_route.ro_rt = NULL; - sin.sin_family = AF_INET; - sin.sin_len = sizeof sin; - sin.sin_addr = sc->sc_inc.inc_faddr; - sin.sin_port = sc->sc_inc.inc_fport; - bzero(sin.sin_zero, sizeof sin.sin_zero); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; - if (in_pcbconnect(inp, (struct sockaddr *)&sin, &thread0)) { + if (in_pcbconnect(inp, faddr, &thread0)) { inp->inp_laddr = laddr; goto abort; } diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 6ffb76c41b..0f3a126914 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -856,6 +856,12 @@ tcp_usr_rcvoob(netmsg_t msg) COMMON_END(PRU_RCVOOB); } +static void +tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) +{ + in_savefaddr(so, faddr); +} + /* xxx - should be const */ struct pr_usrreqs tcp_usrreqs = { .pru_abort = tcp_usr_abort, @@ -876,7 +882,8 @@ struct pr_usrreqs tcp_usrreqs = { .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_setsockaddr_dispatch, .pru_sosend = sosendtcp, - .pru_soreceive = soreceive + .pru_soreceive = soreceive, + .pru_savefaddr = tcp_usr_savefaddr }; #ifdef INET6 
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h index 65acdcd1b5..4081e62bc3 100644 --- a/sys/sys/protosw.h +++ b/sys/sys/protosw.h @@ -226,6 +226,9 @@ struct pru_attach_info { * * pru_soreceive() - called synchronously from user context. Typically * runs generic kernel code and remains synchronous. + * + * pru_savefaddr() - called synchronously by protocol thread. Typically + * saves the foreign address into socket.so_faddr. */ struct pr_usrreqs { void (*pru_abort) (netmsg_t msg); @@ -265,6 +268,10 @@ struct pr_usrreqs { struct uio *uio, struct sockbuf *sio, struct mbuf **controlp, int *flagsp); + + /* synchronously called by protocol thread */ + void (*pru_savefaddr) (struct socket *so, + const struct sockaddr *addr); }; typedef int (*pru_sosend_fn_t) (struct socket *so, struct sockaddr *addr, diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index ab494be669..b3d83f2dc8 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -154,6 +154,7 @@ struct socket { } *so_accf; struct netmsg_base so_clomsg; + struct sockaddr *so_faddr; }; #endif @@ -394,6 +395,7 @@ void soabort (struct socket *so); void soaborta (struct socket *so); void soabort_oncpu (struct socket *so); int soaccept (struct socket *so, struct sockaddr **nam); +void soaccept_generic (struct socket *so); struct socket *soalloc (int waitok); int sobind (struct socket *so, struct sockaddr *nam, struct thread *td); void socantrcvmore (struct socket *so); @@ -417,6 +419,8 @@ void soisreconnecting (struct socket *so); void sosetport (struct socket *so, struct lwkt_port *port); int solisten (struct socket *so, int backlog, struct thread *td); struct socket *sonewconn (struct socket *head, int connstatus); +struct socket *sonewconn_faddr (struct socket *head, int connstatus, + const struct sockaddr *faddr); int sooptcopyin (struct sockopt *sopt, void *buf, size_t len, size_t minlen); int soopt_to_kbuf (struct sockopt *sopt, void *buf, size_t len, -- 2.11.4.GIT