2 * Copyright (c) 1982, 1986, 1988, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
34 #include "opt_inet6.h"
37 #error "IPDIVERT requires INET"
40 #include <sys/param.h>
41 #include <sys/eventhandler.h>
42 #include <sys/kernel.h>
44 #include <sys/malloc.h>
46 #include <sys/module.h>
47 #include <sys/kernel.h>
50 #include <sys/protosw.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/sysctl.h>
57 #include <net/if_var.h>
58 #include <net/netisr.h>
60 #include <netinet/in.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in_var.h>
64 #include <netinet/ip.h>
65 #include <netinet/ip_var.h>
67 #include <netinet/ip6.h>
68 #include <netinet6/ip6_var.h>
71 #include <netinet/sctp_crc32.h>
74 #include <security/mac/mac_framework.h>
81 * Allocate enough space to hold a full IP packet
83 #define DIVSNDQ (65536 + 100)
84 #define DIVRCVQ (65536 + 100)
87 * Divert sockets work in conjunction with ipfw or other packet filters,
88 * see the divert(4) manpage for features.
89 * Packets are selected by the packet filter and tagged with an
90 * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
91 * the packet filter) and information on the matching filter rule for
92 * subsequent reinjection. The divert_port is used to put the packet
93 * on the corresponding divert socket, while the rule number is passed
94 * up (at least partially) as the sin_port in the struct sockaddr.
96 * Packets written to the divert socket carry in sin_addr a
97 * destination address, and in sin_port the number of the filter rule
98 * after which to continue processing.
99 * If the destination address is INADDR_ANY, the packet is treated as
100 * outgoing and sent to ip_output(); otherwise it is treated as
101 * incoming and sent to ip_input().
102 * Further, sin_zero carries some information on the interface,
103 * which can be used in the reinject -- see comments in the code.
105 * On reinjection, processing in ip_input() and ip_output()
106 * will be exactly the same as for the original packet, except that
107 * packet filter processing will start at the rule number after the one
108 * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
109 * will apply the entire ruleset to the packet).
112 /* Internal variables. */
113 static VNET_DEFINE(struct inpcbhead
, divcb
);
114 static VNET_DEFINE(struct inpcbinfo
, divcbinfo
);
116 #define V_divcb VNET(divcb)
117 #define V_divcbinfo VNET(divcbinfo)
119 static u_long div_sendspace
= DIVSNDQ
; /* XXX sysctl ? */
120 static u_long div_recvspace
= DIVRCVQ
; /* XXX sysctl ? */
122 static eventhandler_tag ip_divert_event_tag
;
125 * Initialize divert connection block queue.
128 div_zone_change(void *tag
)
131 uma_zone_set_max(V_divcbinfo
.ipi_zone
, maxsockets
);
135 div_inpcb_init(void *mem
, int size
, int flags
)
137 struct inpcb
*inp
= mem
;
139 INP_LOCK_INIT(inp
, "inp", "divinp");
144 div_inpcb_fini(void *mem
, int size
)
146 struct inpcb
*inp
= mem
;
148 INP_LOCK_DESTROY(inp
);
156 * XXX We don't use the hash list for divert IP, but it's easier to
157 * allocate one-entry hash lists than it is to check all over the
158 * place for hashbase == NULL.
160 in_pcbinfo_init(&V_divcbinfo
, "div", &V_divcb
, 1, 1, "divcb",
161 div_inpcb_init
, div_inpcb_fini
, 0, IPI_HASHFIELDS_NONE
);
165 div_destroy(void *unused __unused
)
168 in_pcbinfo_destroy(&V_divcbinfo
);
170 VNET_SYSUNINIT(divert
, SI_SUB_PROTO_DOMAININIT
, SI_ORDER_ANY
,
174 * IPPROTO_DIVERT is not in the real IP protocol number space; this
175 * function should never be called. Just in case, drop any packets.
178 div_input(struct mbuf
**mp
, int *offp
, int proto
)
180 struct mbuf
*m
= *mp
;
182 KMOD_IPSTAT_INC(ips_noproto
);
184 return (IPPROTO_DONE
);
188 * Divert a packet by passing it up to the divert socket at port 'port'.
190 * Set up generic address and protocol structures for div_input routine,
191 * then pass them along with mbuf chain.
194 divert_packet(struct mbuf
*m
, int incoming
)
200 struct sockaddr_in divsrc
;
203 mtag
= m_tag_locate(m
, MTAG_IPFW_RULE
, 0, NULL
);
209 if (m
->m_len
< sizeof(struct ip
) &&
210 (m
= m_pullup(m
, sizeof(struct ip
))) == NULL
)
212 ip
= mtod(m
, struct ip
*);
214 /* Delayed checksums are currently not compatible with divert. */
215 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
217 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
220 if (m
->m_pkthdr
.csum_flags
& CSUM_SCTP
) {
221 sctp_delayed_cksum(m
, (uint32_t)(ip
->ip_hl
<< 2));
222 m
->m_pkthdr
.csum_flags
&= ~CSUM_SCTP
;
225 bzero(&divsrc
, sizeof(divsrc
));
226 divsrc
.sin_len
= sizeof(divsrc
);
227 divsrc
.sin_family
= AF_INET
;
228 /* record matching rule, in host format */
229 divsrc
.sin_port
= ((struct ipfw_rule_ref
*)(mtag
+1))->rulenum
;
231 * Record receive interface address, if any.
232 * But only for incoming packets.
241 /* Find IP address for receive interface */
242 ifp
= m
->m_pkthdr
.rcvif
;
244 TAILQ_FOREACH(ifa
, &ifp
->if_addrhead
, ifa_link
) {
245 if (ifa
->ifa_addr
->sa_family
!= AF_INET
)
248 ((struct sockaddr_in
*) ifa
->ifa_addr
)->sin_addr
;
251 if_addr_runlock(ifp
);
254 * Record the incoming interface name whenever we have one.
256 if (m
->m_pkthdr
.rcvif
) {
258 * Hide the actual interface name in there in the
259 * sin_zero array. XXX This needs to be moved to a
260 * different sockaddr type for divert, e.g.
261 * sockaddr_div with multiple fields like
262 * sockaddr_dl. Presently we have only 7 bytes
263 * but that will do for now as most interfaces
264 * are 4 or less + 2 or less bytes for unit.
265 * There is probably a faster way of doing this,
266 * possibly taking it from the sockaddr_dl on the iface.
267 * This solves the problem of a P2P link and a LAN interface
268 * having the same address, which can result in the wrong
269 * interface being assigned to the packet when fed back
270 * into the divert socket. Theoretically if the daemon saves
271 * and re-uses the sockaddr_in as suggested in the man pages,
272 * this iface name will come along for the ride.
273 * (see div_output for the other half of this.)
275 strlcpy(divsrc
.sin_zero
, m
->m_pkthdr
.rcvif
->if_xname
,
276 sizeof(divsrc
.sin_zero
));
279 /* Put packet on socket queue, if any */
281 nport
= htons((u_int16_t
)(((struct ipfw_rule_ref
*)(mtag
+1))->info
));
282 INP_INFO_RLOCK(&V_divcbinfo
);
283 LIST_FOREACH(inp
, &V_divcb
, inp_list
) {
284 /* XXX why does only one socket match? */
285 if (inp
->inp_lport
== nport
) {
287 sa
= inp
->inp_socket
;
288 SOCKBUF_LOCK(&sa
->so_rcv
);
289 if (sbappendaddr_locked(&sa
->so_rcv
,
290 (struct sockaddr
*)&divsrc
, m
,
291 (struct mbuf
*)0) == 0) {
292 SOCKBUF_UNLOCK(&sa
->so_rcv
);
293 sa
= NULL
; /* force mbuf reclaim below */
295 sorwakeup_locked(sa
);
300 INP_INFO_RUNLOCK(&V_divcbinfo
);
303 KMOD_IPSTAT_INC(ips_noproto
);
304 KMOD_IPSTAT_DEC(ips_delivered
);
309 * Deliver packet back into the IP processing machinery.
311 * If no address specified, or address is 0.0.0.0, send to ip_output();
312 * otherwise, send to ip_input() and mark as having been received on
313 * the interface with that address.
316 div_output(struct socket
*so
, struct mbuf
*m
, struct sockaddr_in
*sin
,
317 struct mbuf
*control
)
319 struct ip
*const ip
= mtod(m
, struct ip
*);
321 struct ipfw_rule_ref
*dt
;
325 * An mbuf may not have come from userland, but we pretend
328 m
->m_pkthdr
.rcvif
= NULL
;
330 M_SETFIB(m
, so
->so_fibnum
);
333 m_freem(control
); /* XXX */
335 mtag
= m_tag_locate(m
, MTAG_IPFW_RULE
, 0, NULL
);
337 /* this should be normal */
338 mtag
= m_tag_alloc(MTAG_IPFW_RULE
, 0,
339 sizeof(struct ipfw_rule_ref
), M_NOWAIT
| M_ZERO
);
344 m_tag_prepend(m
, mtag
);
346 dt
= (struct ipfw_rule_ref
*)(mtag
+1);
348 /* Loopback avoidance and state recovery */
352 /* set the starting point. We provide a non-zero slot,
353 * but a non-matching chain_id to skip that info and use
354 * the rulenum/rule_id.
356 dt
->slot
= 1; /* dummy, chain_id is invalid */
358 dt
->rulenum
= sin
->sin_port
+1; /* host format ? */
361 * Find receive interface with the given name, stuffed
362 * (if it exists) in the sin_zero[] field.
363 * The name is user supplied data so don't trust its size
364 * or that it is zero terminated.
366 for (i
= 0; i
< sizeof(sin
->sin_zero
) && sin
->sin_zero
[i
]; i
++)
368 if ( i
> 0 && i
< sizeof(sin
->sin_zero
))
369 m
->m_pkthdr
.rcvif
= ifunit(sin
->sin_zero
);
372 /* Reinject packet into the system as incoming or outgoing */
373 if (!sin
|| sin
->sin_addr
.s_addr
== 0) {
374 struct mbuf
*options
= NULL
;
377 dt
->info
|= IPFW_IS_DIVERT
| IPFW_INFO_OUT
;
383 * Don't allow both user specified and setsockopt
384 * options, and don't allow packet length sizes that
387 if ((((ip
->ip_hl
<< 2) != sizeof(struct ip
)) &&
388 inp
->inp_options
!= NULL
) ||
389 ((u_short
)ntohs(ip
->ip_len
) > m
->m_pkthdr
.len
)) {
396 case IPV6_VERSION
>> 4:
398 struct ip6_hdr
*const ip6
= mtod(m
, struct ip6_hdr
*);
400 /* Don't allow packet length sizes that will crash */
401 if (((u_short
)ntohs(ip6
->ip6_plen
) > m
->m_pkthdr
.len
)) {
415 /* Send packet to output processing */
416 KMOD_IPSTAT_INC(ips_rawout
); /* XXX */
419 mac_inpcb_create_mbuf(inp
, m
);
422 * Get ready to inject the packet into ip_output().
423 * Just in case socket options were specified on the
424 * divert socket, we duplicate them. This is done
425 * to avoid having to hold the PCB locks over the call
426 * to ip_output(), as doing this results in a number of
427 * lock ordering complexities.
429 * Note that we set the multicast options argument for
430 * ip_output() to NULL since it should be invariant that
431 * they are not present.
433 KASSERT(inp
->inp_moptions
== NULL
,
434 ("multicast options set on a divert socket"));
436 * XXXCSJP: It is unclear to me whether or not it makes
437 * sense for divert sockets to have options. However,
438 * for now we will duplicate them with the INP locks
439 * held so we can use them in ip_output() without
440 * requiring a reference to the pcb.
442 if (inp
->inp_options
!= NULL
) {
443 options
= m_dup(inp
->inp_options
, M_NOWAIT
);
444 if (options
== NULL
) {
454 error
= ip_output(m
, options
, NULL
,
455 ((so
->so_options
& SO_DONTROUTE
) ? IP_ROUTETOIF
: 0)
456 | IP_ALLOWBROADCAST
| IP_RAWOUTPUT
, NULL
, NULL
);
459 case IPV6_VERSION
>> 4:
460 error
= ip6_output(m
, NULL
, NULL
, 0, NULL
, NULL
, NULL
);
467 dt
->info
|= IPFW_IS_DIVERT
| IPFW_INFO_IN
;
468 if (m
->m_pkthdr
.rcvif
== NULL
) {
470 * No luck with the name, check by IP address.
471 * Clear the port and the ifname to make sure
472 * there are no distractions for ifa_ifwithaddr.
476 bzero(sin
->sin_zero
, sizeof(sin
->sin_zero
));
478 ifa
= ifa_ifwithaddr((struct sockaddr
*) sin
);
480 error
= EADDRNOTAVAIL
;
483 m
->m_pkthdr
.rcvif
= ifa
->ifa_ifp
;
487 mac_socket_create_mbuf(so
, m
);
489 /* Send packet to input processing via netisr */
492 netisr_queue_src(NETISR_IP
, (uintptr_t)so
, m
);
495 case IPV6_VERSION
>> 4:
496 netisr_queue_src(NETISR_IPV6
, (uintptr_t)so
, m
);
513 div_attach(struct socket
*so
, int proto
, struct thread
*td
)
519 KASSERT(inp
== NULL
, ("div_attach: inp != NULL"));
521 error
= priv_check(td
, PRIV_NETINET_DIVERT
);
525 error
= soreserve(so
, div_sendspace
, div_recvspace
);
528 INP_INFO_WLOCK(&V_divcbinfo
);
529 error
= in_pcballoc(so
, &V_divcbinfo
);
531 INP_INFO_WUNLOCK(&V_divcbinfo
);
534 inp
= (struct inpcb
*)so
->so_pcb
;
535 INP_INFO_WUNLOCK(&V_divcbinfo
);
536 inp
->inp_ip_p
= proto
;
537 inp
->inp_vflag
|= INP_IPV4
;
538 inp
->inp_flags
|= INP_HDRINCL
;
544 div_detach(struct socket
*so
)
549 KASSERT(inp
!= NULL
, ("div_detach: inp == NULL"));
550 INP_INFO_WLOCK(&V_divcbinfo
);
554 INP_INFO_WUNLOCK(&V_divcbinfo
);
558 div_bind(struct socket
*so
, struct sockaddr
*nam
, struct thread
*td
)
564 KASSERT(inp
!= NULL
, ("div_bind: inp == NULL"));
565 /* in_pcbbind assumes that nam is a sockaddr_in
566 * and in_pcbbind requires a valid address. Since divert
567 * sockets don't, we need to make sure the address is
568 * filled in properly.
569 * XXX -- divert should not be abusing in_pcbbind
570 * and should probably have its own family.
572 if (nam
->sa_family
!= AF_INET
)
574 ((struct sockaddr_in
*)nam
)->sin_addr
.s_addr
= INADDR_ANY
;
575 INP_INFO_WLOCK(&V_divcbinfo
);
577 INP_HASH_WLOCK(&V_divcbinfo
);
578 error
= in_pcbbind(inp
, nam
, td
->td_ucred
);
579 INP_HASH_WUNLOCK(&V_divcbinfo
);
581 INP_INFO_WUNLOCK(&V_divcbinfo
);
586 div_shutdown(struct socket
*so
)
591 KASSERT(inp
!= NULL
, ("div_shutdown: inp == NULL"));
599 div_send(struct socket
*so
, int flags
, struct mbuf
*m
, struct sockaddr
*nam
,
600 struct mbuf
*control
, struct thread
*td
)
603 /* Packet must have a header (but that's about it) */
604 if (m
->m_len
< sizeof (struct ip
) &&
605 (m
= m_pullup(m
, sizeof (struct ip
))) == NULL
) {
606 KMOD_IPSTAT_INC(ips_toosmall
);
612 return div_output(so
, m
, (struct sockaddr_in
*)nam
, control
);
616 div_ctlinput(int cmd
, struct sockaddr
*sa
, void *vip
)
618 struct in_addr faddr
;
620 faddr
= ((struct sockaddr_in
*)sa
)->sin_addr
;
621 if (sa
->sa_family
!= AF_INET
|| faddr
.s_addr
== INADDR_ANY
)
623 if (PRC_IS_REDIRECT(cmd
))
628 div_pcblist(SYSCTL_HANDLER_ARGS
)
631 struct inpcb
*inp
, **inp_list
;
636 * The process of preparing the PCB list is too time-consuming and
637 * resource-intensive to repeat twice on every request.
639 if (req
->oldptr
== 0) {
640 n
= V_divcbinfo
.ipi_count
;
641 n
+= imax(n
/ 8, 10);
642 req
->oldidx
= 2 * (sizeof xig
) + n
* sizeof(struct xinpcb
);
646 if (req
->newptr
!= 0)
650 * OK, now we're committed to doing something.
652 INP_INFO_RLOCK(&V_divcbinfo
);
653 gencnt
= V_divcbinfo
.ipi_gencnt
;
654 n
= V_divcbinfo
.ipi_count
;
655 INP_INFO_RUNLOCK(&V_divcbinfo
);
657 error
= sysctl_wire_old_buffer(req
,
658 2 * sizeof(xig
) + n
*sizeof(struct xinpcb
));
662 xig
.xig_len
= sizeof xig
;
664 xig
.xig_gen
= gencnt
;
665 xig
.xig_sogen
= so_gencnt
;
666 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
670 inp_list
= malloc(n
* sizeof *inp_list
, M_TEMP
, M_WAITOK
);
671 if (inp_list
== NULL
)
674 INP_INFO_RLOCK(&V_divcbinfo
);
675 for (inp
= LIST_FIRST(V_divcbinfo
.ipi_listhead
), i
= 0; inp
&& i
< n
;
676 inp
= LIST_NEXT(inp
, inp_list
)) {
678 if (inp
->inp_gencnt
<= gencnt
&&
679 cr_canseeinpcb(req
->td
->td_ucred
, inp
) == 0) {
685 INP_INFO_RUNLOCK(&V_divcbinfo
);
689 for (i
= 0; i
< n
; i
++) {
692 if (inp
->inp_gencnt
<= gencnt
) {
694 bzero(&xi
, sizeof(xi
));
695 xi
.xi_len
= sizeof xi
;
696 /* XXX should avoid extra copy */
697 bcopy(inp
, &xi
.xi_inp
, sizeof *inp
);
699 sotoxsocket(inp
->inp_socket
, &xi
.xi_socket
);
701 error
= SYSCTL_OUT(req
, &xi
, sizeof xi
);
705 INP_INFO_WLOCK(&V_divcbinfo
);
706 for (i
= 0; i
< n
; i
++) {
709 if (!in_pcbrele_rlocked(inp
))
712 INP_INFO_WUNLOCK(&V_divcbinfo
);
716 * Give the user an updated idea of our state.
717 * If the generation differs from what we told
718 * her before, she knows that something happened
719 * while we were processing this request, and it
720 * might be necessary to retry.
722 INP_INFO_RLOCK(&V_divcbinfo
);
723 xig
.xig_gen
= V_divcbinfo
.ipi_gencnt
;
724 xig
.xig_sogen
= so_gencnt
;
725 xig
.xig_count
= V_divcbinfo
.ipi_count
;
726 INP_INFO_RUNLOCK(&V_divcbinfo
);
727 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
729 free(inp_list
, M_TEMP
);
734 static SYSCTL_NODE(_net_inet
, IPPROTO_DIVERT
, divert
, CTLFLAG_RW
, 0,
736 SYSCTL_PROC(_net_inet_divert
, OID_AUTO
, pcblist
, CTLTYPE_OPAQUE
| CTLFLAG_RD
,
737 NULL
, 0, div_pcblist
, "S,xinpcb", "List of active divert sockets");
740 struct pr_usrreqs div_usrreqs
= {
741 .pru_attach
= div_attach
,
742 .pru_bind
= div_bind
,
743 .pru_control
= in_control
,
744 .pru_detach
= div_detach
,
745 .pru_peeraddr
= in_getpeeraddr
,
746 .pru_send
= div_send
,
747 .pru_shutdown
= div_shutdown
,
748 .pru_sockaddr
= in_getsockaddr
,
749 .pru_sosetlabel
= in_pcbsosetlabel
752 struct protosw div_protosw
= {
754 .pr_protocol
= IPPROTO_DIVERT
,
755 .pr_flags
= PR_ATOMIC
|PR_ADDR
,
756 .pr_input
= div_input
,
757 .pr_ctlinput
= div_ctlinput
,
758 .pr_ctloutput
= ip_ctloutput
,
760 .pr_usrreqs
= &div_usrreqs
764 div_modevent(module_t mod
, int type
, void *unused
)
771 * Protocol will be initialized by pf_proto_register().
772 * We don't have to register ip_protox because we are not
773 * a true IP protocol that goes over the wire.
775 err
= pf_proto_register(PF_INET
, &div_protosw
);
778 ip_divert_ptr
= divert_packet
;
779 ip_divert_event_tag
= EVENTHANDLER_REGISTER(maxsockets_change
,
780 div_zone_change
, NULL
, EVENTHANDLER_PRI_ANY
);
784 * IPDIVERT may normally not be unloaded because of the
785 * potential race conditions. Tell kldunload we can't be
786 * unloaded unless the unload is forced.
794 * Module ipdivert can only be unloaded if no sockets are
795 * connected. Maybe this can be changed later to forcefully
796 * disconnect any open sockets.
798 * XXXRW: Note that there is a slight race here, as a new
799 * socket open request could be spinning on the lock and then
800 * we destroy the lock.
802 INP_INFO_WLOCK(&V_divcbinfo
);
803 if (V_divcbinfo
.ipi_count
!= 0) {
805 INP_INFO_WUNLOCK(&V_divcbinfo
);
808 ip_divert_ptr
= NULL
;
809 err
= pf_proto_unregister(PF_INET
, IPPROTO_DIVERT
, SOCK_RAW
);
810 INP_INFO_WUNLOCK(&V_divcbinfo
);
814 EVENTHANDLER_DEREGISTER(maxsockets_change
, ip_divert_event_tag
);
823 static moduledata_t ipdivertmod
= {
829 DECLARE_MODULE(ipdivert
, ipdivertmod
, SI_SUB_PROTO_FIREWALL
, SI_ORDER_ANY
);
830 MODULE_DEPEND(ipdivert
, ipfw
, 3, 3, 3);
831 MODULE_VERSION(ipdivert
, 1);