2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
30 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.37 2003/04/15 06:44:45 silby Exp $
31 * $DragonFly: src/sys/netinet/ip_output.c,v 1.37 2007/04/04 06:13:26 dillon Exp $
38 #include "opt_ipdivert.h"
39 #include "opt_ipfilter.h"
40 #include "opt_ipsec.h"
41 #include "opt_mbuf_stress_test.h"
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/malloc.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
52 #include <sys/sysctl.h>
53 #include <sys/thread2.h>
54 #include <sys/in_cksum.h>
57 #include <net/netisr.h>
59 #include <net/route.h>
61 #include <netinet/in.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/ip.h>
64 #include <netinet/in_pcb.h>
65 #include <netinet/in_var.h>
66 #include <netinet/ip_var.h>
68 static MALLOC_DEFINE(M_IPMOPTS
, "ip_moptions", "internet multicast options");
71 #include <netinet6/ipsec.h>
72 #include <netproto/key/key.h>
74 #include <netproto/key/key_debug.h>
76 #define KEYDEBUG(lev,arg)
81 #include <netproto/ipsec/ipsec.h>
82 #include <netproto/ipsec/xform.h>
83 #include <netproto/ipsec/key.h>
86 #include <net/ipfw/ip_fw.h>
87 #include <net/dummynet/ip_dummynet.h>
89 #define print_ip(x, a, y) kprintf("%s %d.%d.%d.%d%s",\
90 x, (ntohl(a.s_addr)>>24)&0xFF,\
91 (ntohl(a.s_addr)>>16)&0xFF,\
92 (ntohl(a.s_addr)>>8)&0xFF,\
93 (ntohl(a.s_addr))&0xFF, y);
97 #ifdef MBUF_STRESS_TEST
98 int mbuf_frag_size
= 0;
99 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, mbuf_frag_size
, CTLFLAG_RW
,
100 &mbuf_frag_size
, 0, "Fragment outgoing mbufs to this size");
103 static struct mbuf
*ip_insertoptions(struct mbuf
*, struct mbuf
*, int *);
104 static struct ifnet
*ip_multicast_if(struct in_addr
*, int *);
105 static void ip_mloopback
106 (struct ifnet
*, struct mbuf
*, struct sockaddr_in
*, int);
107 static int ip_getmoptions
108 (struct sockopt
*, struct ip_moptions
*);
109 static int ip_pcbopts(int, struct mbuf
**, struct mbuf
*);
110 static int ip_setmoptions
111 (struct sockopt
*, struct ip_moptions
**);
113 int ip_optcopy(struct ip
*, struct ip
*);
116 extern struct protosw inetsw
[];
119 * IP output. The packet in mbuf chain m contains a skeletal IP
120 * header (with len, off, ttl, proto, tos, src, dst).
121 * The mbuf chain containing the packet will be freed.
122 * The mbuf opt, if present, will not be freed.
125 ip_output(struct mbuf
*m0
, struct mbuf
*opt
, struct route
*ro
,
126 int flags
, struct ip_moptions
*imo
, struct inpcb
*inp
)
129 struct ifnet
*ifp
= NULL
; /* keep compiler happy */
131 int hlen
= sizeof(struct ip
);
132 int len
, off
, error
= 0;
133 struct sockaddr_in
*dst
= NULL
; /* keep compiler happy */
134 struct in_ifaddr
*ia
= NULL
;
135 int isbroadcast
, sw_csum
;
136 struct in_addr pkt_dst
;
137 struct route iproute
;
139 struct secpolicy
*sp
= NULL
;
140 struct socket
*so
= inp
? inp
->inp_socket
: NULL
;
144 struct secpolicy
*sp
= NULL
;
145 struct tdb_ident
*tdbi
;
146 #endif /* FAST_IPSEC */
147 struct ip_fw_args args
;
148 int src_was_INADDR_ANY
= 0; /* as the name says... */
152 args
.next_hop
= NULL
;
154 /* Grab info from MT_TAG mbufs prepended to the chain. */
155 while (m0
!= NULL
&& m0
->m_type
== MT_TAG
) {
156 switch(m0
->_m_tag_id
) {
157 case PACKET_TAG_DUMMYNET
:
159 * the packet was already tagged, so part of the
160 * processing was already done, and we need to go down.
161 * Get parameters from the header.
163 args
.rule
= ((struct dn_pkt
*)m0
)->rule
;
165 ro
= &((struct dn_pkt
*)m0
)->ro
;
167 dst
= ((struct dn_pkt
*)m0
)->dn_dst
;
168 ifp
= ((struct dn_pkt
*)m0
)->ifp
;
169 flags
= ((struct dn_pkt
*)m0
)->flags
;
171 case PACKET_TAG_IPFORWARD
:
172 args
.next_hop
= (struct sockaddr_in
*)m0
->m_data
;
175 kprintf("ip_output: unrecognised MT_TAG tag %d\n",
182 KASSERT(m
!= NULL
&& (m
->m_flags
& M_PKTHDR
), ("ip_output: no HDR"));
186 bzero(ro
, sizeof *ro
);
189 if (args
.rule
!= NULL
) { /* dummynet already saw us */
190 ip
= mtod(m
, struct ip
*);
191 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2 ;
193 ia
= ifatoia(ro
->ro_rt
->rt_ifa
);
199 m
= ip_insertoptions(m
, opt
, &len
);
203 ip
= mtod(m
, struct ip
*);
204 pkt_dst
= args
.next_hop
? args
.next_hop
->sin_addr
: ip
->ip_dst
;
209 if (!(flags
& (IP_FORWARDING
|IP_RAWOUTPUT
))) {
210 ip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, hlen
>> 2);
212 ip
->ip_id
= ip_newid();
213 ipstat
.ips_localout
++;
215 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
218 dst
= (struct sockaddr_in
*)&ro
->ro_dst
;
220 * If there is a cached route,
221 * check that it is to the same destination
222 * and is still up. If not, free it and try again.
223 * The address family should also be checked in case of sharing the
227 (!(ro
->ro_rt
->rt_flags
& RTF_UP
) ||
228 dst
->sin_family
!= AF_INET
||
229 dst
->sin_addr
.s_addr
!= pkt_dst
.s_addr
)) {
231 ro
->ro_rt
= (struct rtentry
*)NULL
;
233 if (ro
->ro_rt
== NULL
) {
234 bzero(dst
, sizeof *dst
);
235 dst
->sin_family
= AF_INET
;
236 dst
->sin_len
= sizeof *dst
;
237 dst
->sin_addr
= pkt_dst
;
240 * If routing to interface only,
241 * short circuit routing lookup.
243 if (flags
& IP_ROUTETOIF
) {
244 if ((ia
= ifatoia(ifa_ifwithdstaddr(sintosa(dst
)))) == NULL
&&
245 (ia
= ifatoia(ifa_ifwithnet(sintosa(dst
)))) == NULL
) {
246 ipstat
.ips_noroute
++;
252 isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
253 } else if (IN_MULTICAST(ntohl(ip
->ip_dst
.s_addr
)) &&
254 imo
!= NULL
&& imo
->imo_multicast_ifp
!= NULL
) {
256 * Bypass the normal routing lookup for multicast
257 * packets if the interface is specified.
259 ifp
= imo
->imo_multicast_ifp
;
261 isbroadcast
= 0; /* fool gcc */
264 * If this is the case, we probably don't want to allocate
265 * a protocol-cloned route since we didn't get one from the
266 * ULP. This lets TCP do its thing, while not burdening
267 * forwarding or ICMP with the overhead of cloning a route.
268 * Of course, we still want to do any cloning requested by
269 * the link layer, as this is probably required in all cases
270 * for correct operation (as it is for ARP).
272 if (ro
->ro_rt
== NULL
)
273 rtalloc_ign(ro
, RTF_PRCLONING
);
274 if (ro
->ro_rt
== NULL
) {
275 ipstat
.ips_noroute
++;
276 error
= EHOSTUNREACH
;
279 ia
= ifatoia(ro
->ro_rt
->rt_ifa
);
280 ifp
= ro
->ro_rt
->rt_ifp
;
282 if (ro
->ro_rt
->rt_flags
& RTF_GATEWAY
)
283 dst
= (struct sockaddr_in
*)ro
->ro_rt
->rt_gateway
;
284 if (ro
->ro_rt
->rt_flags
& RTF_HOST
)
285 isbroadcast
= (ro
->ro_rt
->rt_flags
& RTF_BROADCAST
);
287 isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
289 if (IN_MULTICAST(ntohl(pkt_dst
.s_addr
))) {
290 struct in_multi
*inm
;
292 m
->m_flags
|= M_MCAST
;
294 * IP destination address is multicast. Make sure "dst"
295 * still points to the address in "ro". (It may have been
296 * changed to point to a gateway address, above.)
298 dst
= (struct sockaddr_in
*)&ro
->ro_dst
;
300 * See if the caller provided any multicast options
303 ip
->ip_ttl
= imo
->imo_multicast_ttl
;
304 if (imo
->imo_multicast_vif
!= -1)
307 ip_mcast_src(imo
->imo_multicast_vif
) :
310 ip
->ip_ttl
= IP_DEFAULT_MULTICAST_TTL
;
312 * Confirm that the outgoing interface supports multicast.
314 if ((imo
== NULL
) || (imo
->imo_multicast_vif
== -1)) {
315 if (!(ifp
->if_flags
& IFF_MULTICAST
)) {
316 ipstat
.ips_noroute
++;
322 * If source address not specified yet, use address
323 * of outgoing interface.
325 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
326 /* Interface may have no addresses. */
328 ip
->ip_src
= IA_SIN(ia
)->sin_addr
;
331 IN_LOOKUP_MULTI(pkt_dst
, ifp
, inm
);
333 (imo
== NULL
|| imo
->imo_multicast_loop
)) {
335 * If we belong to the destination multicast group
336 * on the outgoing interface, and the caller did not
337 * forbid loopback, loop back a copy.
339 ip_mloopback(ifp
, m
, dst
, hlen
);
343 * If we are acting as a multicast router, perform
344 * multicast forwarding as if the packet had just
345 * arrived on the interface to which we are about
346 * to send. The multicast forwarding function
347 * recursively calls this function, using the
348 * IP_FORWARDING flag to prevent infinite recursion.
350 * Multicasts that are looped back by ip_mloopback(),
351 * above, will be forwarded by the ip_input() routine,
354 if (ip_mrouter
&& !(flags
& IP_FORWARDING
)) {
356 * If rsvp daemon is not running, do not
357 * set ip_moptions. This ensures that the packet
358 * is multicast and not just sent down one link
359 * as prescribed by rsvpd.
364 ip_mforward(ip
, ifp
, m
, imo
) != 0) {
372 * Multicasts with a time-to-live of zero may be looped-
373 * back, above, but must not be transmitted on a network.
374 * Also, multicasts addressed to the loopback interface
375 * are not sent -- the above call to ip_mloopback() will
376 * loop back a copy if this host actually belongs to the
377 * destination group on the loopback interface.
379 if (ip
->ip_ttl
== 0 || ifp
->if_flags
& IFF_LOOPBACK
) {
388 * If the source address is not specified yet, use the address
389 * of the outgoing interface. In that case, keep note we did that, so
390 * if the firewall changes the next-hop causing the output
391 * interface to change, we can fix that.
393 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
394 /* Interface may have no addresses. */
396 ip
->ip_src
= IA_SIN(ia
)->sin_addr
;
397 src_was_INADDR_ANY
= 1;
403 * Disable packet drop hack.
404 * Packetdrop should be done by queueing.
408 * Verify that we have any chance at all of being able to queue
409 * the packet or packet fragments
411 if ((ifp
->if_snd
.ifq_len
+ ip
->ip_len
/ ifp
->if_mtu
+ 1) >=
412 ifp
->if_snd
.ifq_maxlen
) {
414 ipstat
.ips_odropped
++;
420 * Look for broadcast address and
421 * verify user is allowed to send
425 if (!(ifp
->if_flags
& IFF_BROADCAST
)) {
426 error
= EADDRNOTAVAIL
;
429 if (!(flags
& IP_ALLOWBROADCAST
)) {
433 /* don't allow broadcast messages to be fragmented */
434 if (ip
->ip_len
> ifp
->if_mtu
) {
438 m
->m_flags
|= M_BCAST
;
440 m
->m_flags
&= ~M_BCAST
;
445 /* get SP for this packet */
447 sp
= ipsec4_getpolicybyaddr(m
, IPSEC_DIR_OUTBOUND
, flags
, &error
);
449 sp
= ipsec4_getpolicybysock(m
, IPSEC_DIR_OUTBOUND
, so
, &error
);
452 ipsecstat
.out_inval
++;
459 switch (sp
->policy
) {
460 case IPSEC_POLICY_DISCARD
:
462 * This packet is just discarded.
464 ipsecstat
.out_polvio
++;
467 case IPSEC_POLICY_BYPASS
:
468 case IPSEC_POLICY_NONE
:
469 /* no need to do IPsec. */
472 case IPSEC_POLICY_IPSEC
:
473 if (sp
->req
== NULL
) {
474 /* acquire a policy */
475 error
= key_spdacquire(sp
);
480 case IPSEC_POLICY_ENTRUST
:
482 kprintf("ip_output: Invalid policy found. %d\n", sp
->policy
);
485 struct ipsec_output_state state
;
486 bzero(&state
, sizeof state
);
488 if (flags
& IP_ROUTETOIF
) {
490 bzero(&iproute
, sizeof iproute
);
493 state
.dst
= (struct sockaddr
*)dst
;
499 * delayed checksums are not currently compatible with IPsec
501 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
503 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
506 ip
->ip_len
= htons(ip
->ip_len
);
507 ip
->ip_off
= htons(ip
->ip_off
);
509 error
= ipsec4_output(&state
, sp
, flags
);
512 if (flags
& IP_ROUTETOIF
) {
514 * if we have tunnel mode SA, we may need to ignore
517 if (state
.ro
!= &iproute
|| state
.ro
->ro_rt
!= NULL
) {
518 flags
&= ~IP_ROUTETOIF
;
523 dst
= (struct sockaddr_in
*)state
.dst
;
525 /* mbuf is already reclaimed in ipsec4_output. */
535 kprintf("ip4_output (ipsec): error code %d\n", error
);
538 /* don't show these error codes to the user */
546 /* be sure to update variables that are affected by ipsec4_output() */
547 ip
= mtod(m
, struct ip
*);
549 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
551 hlen
= ip
->ip_hl
<< 2;
553 if (ro
->ro_rt
== NULL
) {
554 if (!(flags
& IP_ROUTETOIF
)) {
555 kprintf("ip_output: "
556 "can't update route after IPsec processing\n");
557 error
= EHOSTUNREACH
; /*XXX*/
561 ia
= ifatoia(ro
->ro_rt
->rt_ifa
);
562 ifp
= ro
->ro_rt
->rt_ifp
;
565 /* make it flipped, again. */
566 ip
->ip_len
= ntohs(ip
->ip_len
);
567 ip
->ip_off
= ntohs(ip
->ip_off
);
572 * Check the security policy (SP) for the packet and, if
573 * required, do IPsec-related processing. There are two
574 * cases here; the first time a packet is sent through
575 * it will be untagged and handled by ipsec4_checkpolicy.
576 * If the packet is resubmitted to ip_output (e.g. after
577 * AH, ESP, etc. processing), there will be a tag to bypass
578 * the lookup and related policy checking.
580 mtag
= m_tag_find(m
, PACKET_TAG_IPSEC_PENDING_TDB
, NULL
);
583 tdbi
= (struct tdb_ident
*)m_tag_data(mtag
);
584 sp
= ipsec_getpolicy(tdbi
, IPSEC_DIR_OUTBOUND
);
586 error
= -EINVAL
; /* force silent drop */
587 m_tag_delete(m
, mtag
);
589 sp
= ipsec4_checkpolicy(m
, IPSEC_DIR_OUTBOUND
, flags
,
593 * There are four return cases:
594 * sp != NULL apply IPsec policy
595 * sp == NULL, error == 0 no IPsec handling needed
596 * sp == NULL, error == -EINVAL discard packet w/o error
597 * sp == NULL, error != 0 discard packet, report error
600 /* Loop detection, check if ipsec processing already done */
601 KASSERT(sp
->req
!= NULL
, ("ip_output: no ipsec request"));
602 for (mtag
= m_tag_first(m
); mtag
!= NULL
;
603 mtag
= m_tag_next(m
, mtag
)) {
604 if (mtag
->m_tag_cookie
!= MTAG_ABI_COMPAT
)
606 if (mtag
->m_tag_id
!= PACKET_TAG_IPSEC_OUT_DONE
&&
607 mtag
->m_tag_id
!= PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED
)
610 * Check if policy has an SA associated with it.
611 * This can happen when an SP has yet to acquire
612 * an SA; e.g. on first reference. If it occurs,
613 * then we let ipsec4_process_packet do its thing.
615 if (sp
->req
->sav
== NULL
)
617 tdbi
= (struct tdb_ident
*)m_tag_data(mtag
);
618 if (tdbi
->spi
== sp
->req
->sav
->spi
&&
619 tdbi
->proto
== sp
->req
->sav
->sah
->saidx
.proto
&&
620 bcmp(&tdbi
->dst
, &sp
->req
->sav
->sah
->saidx
.dst
,
621 sizeof(union sockaddr_union
)) == 0) {
623 * No IPsec processing is needed, free
626 * NB: null pointer to avoid free at
629 KEY_FREESP(&sp
), sp
= NULL
;
636 * Do delayed checksums now because we send before
637 * this is done in the normal processing path.
639 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
641 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
644 ip
->ip_len
= htons(ip
->ip_len
);
645 ip
->ip_off
= htons(ip
->ip_off
);
647 /* NB: callee frees mbuf */
648 error
= ipsec4_process_packet(m
, sp
->req
, flags
, 0);
650 * Preserve KAME behaviour: ENOENT can be returned
651 * when an SA acquire is in progress. Don't propagate
652 * this to user-level; it confuses applications.
654 * XXX this will go away when the SADB is redone.
665 * Hack: -EINVAL is used to signal that a packet
666 * should be silently discarded. This is typically
667 * because we asked key management for an SA and
668 * it was delayed (e.g. kicked up to IKE).
670 if (error
== -EINVAL
)
674 /* No IPsec processing for this packet. */
678 * If deferred crypto processing is needed, check that
679 * the interface supports it.
681 mtag
= m_tag_find(m
, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED
, NULL
);
682 if (mtag
!= NULL
&& !(ifp
->if_capenable
& IFCAP_IPSEC
)) {
683 /* notify IPsec to do its own crypto */
684 ipsp_skipcrypto_unmark((struct tdb_ident
*)m_tag_data(mtag
));
685 error
= EHOSTUNREACH
;
691 #endif /* FAST_IPSEC */
694 * - Xlate: translate packet's addr/port (NAT).
695 * - Firewall: deny/allow/etc.
696 * - Wrap: fake packet's addr/port <unimpl.>
697 * - Encapsulate: put it in another IP and send out. <unimp.>
701 * Run through list of hooks for output packets.
703 if (pfil_has_hooks(&inet_pfil_hook
)) {
704 error
= pfil_run_hooks(&inet_pfil_hook
, &m
, ifp
, PFIL_OUT
);
705 if (error
!= 0 || m
== NULL
)
707 ip
= mtod(m
, struct ip
*);
711 * Check with the firewall...
712 * but not if we are already being fwd'd from a firewall.
714 if (fw_enable
&& IPFW_LOADED
&& !args
.next_hop
) {
715 struct sockaddr_in
*old
= dst
;
720 off
= ip_fw_chk_ptr(&args
);
725 * On return we must do the following:
726 * m == NULL -> drop the pkt (old interface, deprecated)
727 * (off & IP_FW_PORT_DENY_FLAG) -> drop the pkt (new interface)
728 * 1<=off<= 0xffff -> DIVERT
729 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
730 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
731 * dst != old -> IPFIREWALL_FORWARD
732 * off==0, dst==old -> accept
733 * If some of the above modules are not compiled in, then
734 * we shouldn't have to check the corresponding condition
735 * (because the ipfw control socket should not accept
736 * unsupported rules), but better play safe and drop
737 * packets in case of doubt.
739 if ( (off
& IP_FW_PORT_DENY_FLAG
) || m
== NULL
) {
745 ip
= mtod(m
, struct ip
*);
746 if (off
== 0 && dst
== old
) /* common case */
748 if (DUMMYNET_LOADED
&& (off
& IP_FW_PORT_DYNT_FLAG
)) {
750 * pass the pkt to dummynet. Need to include
751 * pipe number, m, ifp, ro, dst because these are
752 * not recomputed in the next pass.
753 * All other parameters have been already used and
754 * so they are not needed anymore.
755 * XXX note: if the ifp or ro entry are deleted
756 * while a pkt is in dummynet, we are in trouble!
762 error
= ip_dn_io_ptr(m
, off
& 0xffff, DN_TO_IP_OUT
,
767 if (off
!= 0 && !(off
& IP_FW_PORT_DYNT_FLAG
)) {
768 struct mbuf
*clone
= NULL
;
770 /* Clone packet if we're doing a 'tee' */
771 if ((off
& IP_FW_PORT_TEE_FLAG
))
772 clone
= m_dup(m
, MB_DONTWAIT
);
776 * delayed checksums are not currently compatible
777 * with divert sockets.
779 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
781 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
784 /* Restore packet header fields to original values */
785 ip
->ip_len
= htons(ip
->ip_len
);
786 ip
->ip_off
= htons(ip
->ip_off
);
788 /* Deliver packet to divert input routine */
789 divert_packet(m
, 0, off
& 0xffff);
791 /* If 'tee', continue with original packet */
794 ip
= mtod(m
, struct ip
*);
801 /* IPFIREWALL_FORWARD */
803 * Check dst to make sure it is directly reachable on the
804 * interface we previously thought it was.
805 * If it isn't (which may be likely in some situations) we have
806 * to re-route it (ie, find a route for the next-hop and the
807 * associated interface) and set them here. This is nested
808 * forwarding which in most cases is undesirable, except where
809 * such control is nigh impossible. So we do it here.
812 if (off
== 0 && old
!= dst
) { /* FORWARD, dst has changed */
815 * XXX To improve readability, this block should be
816 * changed into a function call as below:
818 error
= ip_ipforward(&m
, &dst
, &ifp
);
821 if (m
== NULL
) /* ip_input consumed the mbuf */
824 struct in_ifaddr
*ia
;
827 * XXX sro_fwd below is static, and a pointer
828 * to it gets passed to routines downstream.
829 * This could have surprisingly bad results in
830 * practice, because its content is overwritten
831 * by subsequent packets.
833 /* There must be a better way to do this next line... */
834 static struct route sro_fwd
;
835 struct route
*ro_fwd
= &sro_fwd
;
838 print_ip("IPFIREWALL_FORWARD: New dst ip: ",
839 dst
->sin_addr
, "\n");
843 * We need to figure out if we have been forwarded
844 * to a local socket. If so, then we should somehow
845 * "loop back" to ip_input, and get directed to the
846 * PCB as if we had received this packet. This is
847 * because it may be difficult to identify the packets
848 * you want to forward until they are being output
849 * and have selected an interface. (e.g. locally
850 * initiated packets) If we used the loopback interface,
851 * we would not be able to control what happens
852 * as the packet runs through ip_input() as
853 * it is done through a ISR.
855 LIST_FOREACH(ia
, INADDR_HASH(dst
->sin_addr
.s_addr
),
858 * If the addr to forward to is one
859 * of ours, we pretend to
860 * be the destination for this packet.
862 if (IA_SIN(ia
)->sin_addr
.s_addr
==
863 dst
->sin_addr
.s_addr
)
866 if (ia
!= NULL
) { /* tell ip_input "dont filter" */
869 tag
.mh_type
= MT_TAG
;
870 tag
.mh_flags
= PACKET_TAG_IPFORWARD
;
871 tag
.mh_data
= (caddr_t
)args
.next_hop
;
874 if (m
->m_pkthdr
.rcvif
== NULL
)
875 m
->m_pkthdr
.rcvif
= ifunit("lo0");
876 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
877 m
->m_pkthdr
.csum_flags
|=
878 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
;
879 m
->m_pkthdr
.csum_data
= 0xffff;
881 m
->m_pkthdr
.csum_flags
|=
882 CSUM_IP_CHECKED
| CSUM_IP_VALID
;
883 ip
->ip_len
= htons(ip
->ip_len
);
884 ip
->ip_off
= htons(ip
->ip_off
);
885 ip_input((struct mbuf
*)&tag
);
888 /* Some of the logic for this was nicked from above.
890 * This rewrites the cached route in a local PCB.
891 * Is this what we want to do?
893 bcopy(dst
, &ro_fwd
->ro_dst
, sizeof *dst
);
894 ro_fwd
->ro_rt
= NULL
;
896 rtalloc_ign(ro_fwd
, RTF_PRCLONING
);
897 if (ro_fwd
->ro_rt
== NULL
) {
898 ipstat
.ips_noroute
++;
899 error
= EHOSTUNREACH
;
903 ia
= ifatoia(ro_fwd
->ro_rt
->rt_ifa
);
904 ifp
= ro_fwd
->ro_rt
->rt_ifp
;
905 ro_fwd
->ro_rt
->rt_use
++;
906 if (ro_fwd
->ro_rt
->rt_flags
& RTF_GATEWAY
)
907 dst
= (struct sockaddr_in
*)
908 ro_fwd
->ro_rt
->rt_gateway
;
909 if (ro_fwd
->ro_rt
->rt_flags
& RTF_HOST
)
911 (ro_fwd
->ro_rt
->rt_flags
& RTF_BROADCAST
);
913 isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
914 if (ro
->ro_rt
!= NULL
)
916 ro
->ro_rt
= ro_fwd
->ro_rt
;
917 dst
= (struct sockaddr_in
*)&ro_fwd
->ro_dst
;
919 #endif /* ... block to be put into a function */
921 * If we added a default src ip earlier,
922 * which would have been gotten from the-then
923 * interface, do it again, from the new one.
925 if (src_was_INADDR_ANY
)
926 ip
->ip_src
= IA_SIN(ia
)->sin_addr
;
931 * if we get here, none of the above matches, and
932 * we have to drop the pkt
935 error
= EACCES
; /* not sure this is the right error msg */
940 /* 127/8 must not appear on wire - RFC1122. */
941 if ((ntohl(ip
->ip_dst
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
||
942 (ntohl(ip
->ip_src
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
) {
943 if (!(ifp
->if_flags
& IFF_LOOPBACK
)) {
944 ipstat
.ips_badaddr
++;
945 error
= EADDRNOTAVAIL
;
950 m
->m_pkthdr
.csum_flags
|= CSUM_IP
;
951 sw_csum
= m
->m_pkthdr
.csum_flags
& ~ifp
->if_hwassist
;
952 if (sw_csum
& CSUM_DELAY_DATA
) {
954 sw_csum
&= ~CSUM_DELAY_DATA
;
956 m
->m_pkthdr
.csum_flags
&= ifp
->if_hwassist
;
959 * If small enough for interface, or the interface will take
960 * care of the fragmentation for us, can just send directly.
962 if (ip
->ip_len
<= ifp
->if_mtu
|| ((ifp
->if_hwassist
& CSUM_FRAGMENT
) &&
963 !(ip
->ip_off
& IP_DF
))) {
964 ip
->ip_len
= htons(ip
->ip_len
);
965 ip
->ip_off
= htons(ip
->ip_off
);
967 if (sw_csum
& CSUM_DELAY_IP
) {
968 if (ip
->ip_vhl
== IP_VHL_BORING
) {
969 ip
->ip_sum
= in_cksum_hdr(ip
);
971 ip
->ip_sum
= in_cksum(m
, hlen
);
975 /* Record statistics for this interface address. */
976 if (!(flags
& IP_FORWARDING
) && ia
) {
977 ia
->ia_ifa
.if_opackets
++;
978 ia
->ia_ifa
.if_obytes
+= m
->m_pkthdr
.len
;
982 /* clean ipsec history once it goes out of the node */
986 #ifdef MBUF_STRESS_TEST
987 if (mbuf_frag_size
&& m
->m_pkthdr
.len
> mbuf_frag_size
) {
988 struct mbuf
*m1
, *m2
;
991 tmp
= length
= m
->m_pkthdr
.len
;
993 while ((length
-= mbuf_frag_size
) >= 1) {
994 m1
= m_split(m
, length
, MB_DONTWAIT
);
998 while (m2
->m_next
!= NULL
)
1002 m
->m_pkthdr
.len
= tmp
;
1005 lwkt_serialize_enter(ifp
->if_serializer
);
1006 error
= (*ifp
->if_output
)(ifp
, m
, (struct sockaddr
*)dst
,
1008 lwkt_serialize_exit(ifp
->if_serializer
);
1012 if (ip
->ip_off
& IP_DF
) {
1015 * This case can happen if the user changed the MTU
1016 * of an interface after enabling IP on it. Because
1017 * most netifs don't keep track of routes pointing to
1018 * them, there is no way for one to update all its
1019 * routes when the MTU is changed.
1021 if ((ro
->ro_rt
->rt_flags
& (RTF_UP
| RTF_HOST
)) &&
1022 !(ro
->ro_rt
->rt_rmx
.rmx_locks
& RTV_MTU
) &&
1023 (ro
->ro_rt
->rt_rmx
.rmx_mtu
> ifp
->if_mtu
)) {
1024 ro
->ro_rt
->rt_rmx
.rmx_mtu
= ifp
->if_mtu
;
1026 ipstat
.ips_cantfrag
++;
1031 * Too large for interface; fragment if possible. If successful,
1032 * on return, m will point to a list of packets to be sent.
1034 error
= ip_fragment(ip
, &m
, ifp
->if_mtu
, ifp
->if_hwassist
, sw_csum
);
1039 m
->m_nextpkt
= NULL
;
1041 /* clean ipsec history once it goes out of the node */
1045 /* Record statistics for this interface address. */
1047 ia
->ia_ifa
.if_opackets
++;
1048 ia
->ia_ifa
.if_obytes
+= m
->m_pkthdr
.len
;
1050 lwkt_serialize_enter(ifp
->if_serializer
);
1051 error
= (*ifp
->if_output
)(ifp
, m
,
1052 (struct sockaddr
*)dst
,
1054 lwkt_serialize_exit(ifp
->if_serializer
);
1061 ipstat
.ips_fragmented
++;
1064 if (ro
== &iproute
&& ro
->ro_rt
!= NULL
) {
1070 KEYDEBUG(KEYDEBUG_IPSEC_STAMP
,
1071 kprintf("DP ip_output call free SP:%p\n", sp
));
1086 * Create a chain of fragments which fit the given mtu. m_frag points to the
1087 * mbuf to be fragmented; on return it points to the chain with the fragments.
1088 * Return 0 if no error. If error, m_frag may contain a partially built
1089 * chain of fragments that should be freed by the caller.
1091 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
1092 * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
1095 ip_fragment(struct ip
*ip
, struct mbuf
**m_frag
, int mtu
,
1096 u_long if_hwassist_flags
, int sw_csum
)
1099 int hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
1100 int len
= (mtu
- hlen
) & ~7; /* size of payload in each fragment */
1102 struct mbuf
*m0
= *m_frag
; /* the original packet */
1104 struct mbuf
**mnext
;
1107 if (ip
->ip_off
& IP_DF
) { /* Fragmentation not allowed */
1108 ipstat
.ips_cantfrag
++;
1113 * Must be able to put at least 8 bytes per fragment.
1119 * If the interface will not calculate checksums on
1120 * fragmented packets, then do it here.
1122 if ((m0
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) &&
1123 !(if_hwassist_flags
& CSUM_IP_FRAGS
)) {
1124 in_delayed_cksum(m0
);
1125 m0
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1128 if (len
> PAGE_SIZE
) {
1130 * Fragment large datagrams such that each segment
1131 * contains a multiple of PAGE_SIZE amount of data,
1132 * plus headers. This enables a receiver to perform
1133 * page-flipping zero-copy optimizations.
1135 * XXX When does this help given that sender and receiver
1136 * could have different page sizes, and also mtu could
1137 * be less than the receiver's page size ?
1142 for (m
= m0
, off
= 0; m
&& (off
+m
->m_len
) <= mtu
; m
= m
->m_next
)
1146 * firstlen (off - hlen) must be aligned on an
1150 goto smart_frag_failure
;
1151 off
= ((off
- hlen
) & ~7) + hlen
;
1152 newlen
= (~PAGE_MASK
) & mtu
;
1153 if ((newlen
+ sizeof(struct ip
)) > mtu
) {
1154 /* we failed, go back to the default */
1165 firstlen
= off
- hlen
;
1166 mnext
= &m0
->m_nextpkt
; /* pointer to next packet */
1169 * Loop through length of segment after first fragment,
1170 * make new header and copy data of each part and link onto chain.
1171 * Here, m0 is the original packet, m is the fragment being created.
1172 * The fragments are linked off the m_nextpkt of the original
1173 * packet, which after processing serves as the first fragment.
1175 for (nfrags
= 1; off
< ip
->ip_len
; off
+= len
, nfrags
++) {
1176 struct ip
*mhip
; /* ip header on the fragment */
1178 int mhlen
= sizeof(struct ip
);
1180 MGETHDR(m
, MB_DONTWAIT
, MT_HEADER
);
1183 ipstat
.ips_odropped
++;
1186 m
->m_flags
|= (m0
->m_flags
& M_MCAST
) | M_FRAG
;
1188 * In the first mbuf, leave room for the link header, then
1189 * copy the original IP header including options. The payload
1190 * goes into an additional mbuf chain returned by m_copy().
1192 m
->m_data
+= max_linkhdr
;
1193 mhip
= mtod(m
, struct ip
*);
1195 if (hlen
> sizeof(struct ip
)) {
1196 mhlen
= ip_optcopy(ip
, mhip
) + sizeof(struct ip
);
1197 mhip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, mhlen
>> 2);
1200 /* XXX do we need to add ip->ip_off below ? */
1201 mhip
->ip_off
= ((off
- hlen
) >> 3) + ip
->ip_off
;
1202 if (off
+ len
>= ip
->ip_len
) { /* last fragment */
1203 len
= ip
->ip_len
- off
;
1204 m
->m_flags
|= M_LASTFRAG
;
1206 mhip
->ip_off
|= IP_MF
;
1207 mhip
->ip_len
= htons((u_short
)(len
+ mhlen
));
1208 m
->m_next
= m_copy(m0
, off
, len
);
1209 if (m
->m_next
== NULL
) { /* copy failed */
1211 error
= ENOBUFS
; /* ??? */
1212 ipstat
.ips_odropped
++;
1215 m
->m_pkthdr
.len
= mhlen
+ len
;
1216 m
->m_pkthdr
.rcvif
= (struct ifnet
*)NULL
;
1217 m
->m_pkthdr
.csum_flags
= m0
->m_pkthdr
.csum_flags
;
1218 mhip
->ip_off
= htons(mhip
->ip_off
);
1220 if (sw_csum
& CSUM_DELAY_IP
)
1221 mhip
->ip_sum
= in_cksum(m
, mhlen
);
1223 mnext
= &m
->m_nextpkt
;
1225 ipstat
.ips_ofragments
+= nfrags
;
1227 /* set first marker for fragment chain */
1228 m0
->m_flags
|= M_FIRSTFRAG
| M_FRAG
;
1229 m0
->m_pkthdr
.csum_data
= nfrags
;
1232 * Update first fragment by trimming what's been copied out
1233 * and updating header.
1235 m_adj(m0
, hlen
+ firstlen
- ip
->ip_len
);
1236 m0
->m_pkthdr
.len
= hlen
+ firstlen
;
1237 ip
->ip_len
= htons((u_short
)m0
->m_pkthdr
.len
);
1238 ip
->ip_off
|= IP_MF
;
1239 ip
->ip_off
= htons(ip
->ip_off
);
1241 if (sw_csum
& CSUM_DELAY_IP
)
1242 ip
->ip_sum
= in_cksum(m0
, hlen
);
1250 in_delayed_cksum(struct mbuf
*m
)
1253 u_short csum
, offset
;
1255 ip
= mtod(m
, struct ip
*);
1256 offset
= IP_VHL_HL(ip
->ip_vhl
) << 2 ;
1257 csum
= in_cksum_skip(m
, ip
->ip_len
, offset
);
1258 if (m
->m_pkthdr
.csum_flags
& CSUM_UDP
&& csum
== 0)
1260 offset
+= m
->m_pkthdr
.csum_data
; /* checksum offset */
1262 if (offset
+ sizeof(u_short
) > m
->m_len
) {
1263 kprintf("delayed m_pullup, m->len: %d off: %d p: %d\n",
1264 m
->m_len
, offset
, ip
->ip_p
);
1267 * this shouldn't happen, but if it does, the
1268 * correct behavior may be to insert the checksum
1269 * in the existing chain instead of rearranging it.
1271 m
= m_pullup(m
, offset
+ sizeof(u_short
));
1273 *(u_short
*)(m
->m_data
+ offset
) = csum
;
1277 * Insert IP options into preformed packet.
1278 * Adjust IP destination as required for IP source routing,
1279 * as indicated by a non-zero in_addr at the start of the options.
1281 * XXX This routine assumes that the packet has no options in place.
1283 static struct mbuf
*
1284 ip_insertoptions(struct mbuf
*m
, struct mbuf
*opt
, int *phlen
)
1286 struct ipoption
*p
= mtod(opt
, struct ipoption
*);
1288 struct ip
*ip
= mtod(m
, struct ip
*);
1291 optlen
= opt
->m_len
- sizeof p
->ipopt_dst
;
1292 if (optlen
+ (u_short
)ip
->ip_len
> IP_MAXPACKET
) {
1294 return (m
); /* XXX should fail */
1296 if (p
->ipopt_dst
.s_addr
)
1297 ip
->ip_dst
= p
->ipopt_dst
;
1298 if (m
->m_flags
& M_EXT
|| m
->m_data
- optlen
< m
->m_pktdat
) {
1299 MGETHDR(n
, MB_DONTWAIT
, MT_HEADER
);
1304 n
->m_pkthdr
.rcvif
= (struct ifnet
*)NULL
;
1305 n
->m_pkthdr
.len
= m
->m_pkthdr
.len
+ optlen
;
1306 m
->m_len
-= sizeof(struct ip
);
1307 m
->m_data
+= sizeof(struct ip
);
1310 m
->m_len
= optlen
+ sizeof(struct ip
);
1311 m
->m_data
+= max_linkhdr
;
1312 memcpy(mtod(m
, void *), ip
, sizeof(struct ip
));
1314 m
->m_data
-= optlen
;
1316 m
->m_pkthdr
.len
+= optlen
;
1317 ovbcopy(ip
, mtod(m
, caddr_t
), sizeof(struct ip
));
1319 ip
= mtod(m
, struct ip
*);
1320 bcopy(p
->ipopt_list
, ip
+ 1, optlen
);
1321 *phlen
= sizeof(struct ip
) + optlen
;
1322 ip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, *phlen
>> 2);
1323 ip
->ip_len
+= optlen
;
1328 * Copy options from ip to jp,
1329 * omitting those not copied during fragmentation.
1332 ip_optcopy(struct ip
*ip
, struct ip
*jp
)
1335 int opt
, optlen
, cnt
;
1337 cp
= (u_char
*)(ip
+ 1);
1338 dp
= (u_char
*)(jp
+ 1);
1339 cnt
= (IP_VHL_HL(ip
->ip_vhl
) << 2) - sizeof(struct ip
);
1340 for (; cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
1342 if (opt
== IPOPT_EOL
)
1344 if (opt
== IPOPT_NOP
) {
1345 /* Preserve for IP mcast tunnel's LSRR alignment. */
1351 KASSERT(cnt
>= IPOPT_OLEN
+ sizeof *cp
,
1352 ("ip_optcopy: malformed ipv4 option"));
1353 optlen
= cp
[IPOPT_OLEN
];
1354 KASSERT(optlen
>= IPOPT_OLEN
+ sizeof *cp
&& optlen
<= cnt
,
1355 ("ip_optcopy: malformed ipv4 option"));
1357 /* bogus lengths should have been caught by ip_dooptions */
1360 if (IPOPT_COPIED(opt
)) {
1361 bcopy(cp
, dp
, optlen
);
1365 for (optlen
= dp
- (u_char
*)(jp
+1); optlen
& 0x3; optlen
++)
1371 * IP socket option processing.
1374 ip_ctloutput(struct socket
*so
, struct sockopt
*sopt
)
1376 struct inpcb
*inp
= so
->so_pcb
;
1380 if (sopt
->sopt_level
!= IPPROTO_IP
) {
1384 switch (sopt
->sopt_dir
) {
1386 switch (sopt
->sopt_name
) {
1393 if (sopt
->sopt_valsize
> MLEN
) {
1397 MGET(m
, sopt
->sopt_td
? MB_WAIT
: MB_DONTWAIT
, MT_HEADER
);
1402 m
->m_len
= sopt
->sopt_valsize
;
1403 error
= sooptcopyin(sopt
, mtod(m
, char *), m
->m_len
,
1406 return (ip_pcbopts(sopt
->sopt_name
, &inp
->inp_options
,
1414 case IP_RECVRETOPTS
:
1415 case IP_RECVDSTADDR
:
1419 error
= sooptcopyin(sopt
, &optval
, sizeof optval
,
1424 switch (sopt
->sopt_name
) {
1426 inp
->inp_ip_tos
= optval
;
1430 inp
->inp_ip_ttl
= optval
;
1433 if (optval
> 0 && optval
<= MAXTTL
)
1434 inp
->inp_ip_minttl
= optval
;
1438 #define OPTSET(bit) \
1440 inp->inp_flags |= bit; \
1442 inp->inp_flags &= ~bit;
1445 OPTSET(INP_RECVOPTS
);
1448 case IP_RECVRETOPTS
:
1449 OPTSET(INP_RECVRETOPTS
);
1452 case IP_RECVDSTADDR
:
1453 OPTSET(INP_RECVDSTADDR
);
1461 OPTSET(INP_RECVTTL
);
1471 case IP_MULTICAST_IF
:
1472 case IP_MULTICAST_VIF
:
1473 case IP_MULTICAST_TTL
:
1474 case IP_MULTICAST_LOOP
:
1475 case IP_ADD_MEMBERSHIP
:
1476 case IP_DROP_MEMBERSHIP
:
1477 error
= ip_setmoptions(sopt
, &inp
->inp_moptions
);
1481 error
= sooptcopyin(sopt
, &optval
, sizeof optval
,
1487 case IP_PORTRANGE_DEFAULT
:
1488 inp
->inp_flags
&= ~(INP_LOWPORT
);
1489 inp
->inp_flags
&= ~(INP_HIGHPORT
);
1492 case IP_PORTRANGE_HIGH
:
1493 inp
->inp_flags
&= ~(INP_LOWPORT
);
1494 inp
->inp_flags
|= INP_HIGHPORT
;
1497 case IP_PORTRANGE_LOW
:
1498 inp
->inp_flags
&= ~(INP_HIGHPORT
);
1499 inp
->inp_flags
|= INP_LOWPORT
;
1508 #if defined(IPSEC) || defined(FAST_IPSEC)
1509 case IP_IPSEC_POLICY
:
1517 if ((error
= soopt_getm(sopt
, &m
)) != 0) /* XXX */
1519 if ((error
= soopt_mcopyin(sopt
, m
)) != 0) /* XXX */
1521 priv
= (sopt
->sopt_td
!= NULL
&&
1522 suser(sopt
->sopt_td
) != 0) ? 0 : 1;
1523 req
= mtod(m
, caddr_t
);
1525 optname
= sopt
->sopt_name
;
1526 error
= ipsec4_set_policy(inp
, optname
, req
, len
, priv
);
1533 error
= ENOPROTOOPT
;
1539 switch (sopt
->sopt_name
) {
1542 if (inp
->inp_options
)
1543 error
= sooptcopyout(sopt
,
1544 mtod(inp
->inp_options
,
1546 inp
->inp_options
->m_len
);
1548 sopt
->sopt_valsize
= 0;
1555 case IP_RECVRETOPTS
:
1556 case IP_RECVDSTADDR
:
1561 switch (sopt
->sopt_name
) {
1564 optval
= inp
->inp_ip_tos
;
1568 optval
= inp
->inp_ip_ttl
;
1571 optval
= inp
->inp_ip_minttl
;
1574 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
1577 optval
= OPTBIT(INP_RECVOPTS
);
1580 case IP_RECVRETOPTS
:
1581 optval
= OPTBIT(INP_RECVRETOPTS
);
1584 case IP_RECVDSTADDR
:
1585 optval
= OPTBIT(INP_RECVDSTADDR
);
1589 optval
= OPTBIT(INP_RECVTTL
);
1593 optval
= OPTBIT(INP_RECVIF
);
1597 if (inp
->inp_flags
& INP_HIGHPORT
)
1598 optval
= IP_PORTRANGE_HIGH
;
1599 else if (inp
->inp_flags
& INP_LOWPORT
)
1600 optval
= IP_PORTRANGE_LOW
;
1606 optval
= OPTBIT(INP_FAITH
);
1609 error
= sooptcopyout(sopt
, &optval
, sizeof optval
);
1612 case IP_MULTICAST_IF
:
1613 case IP_MULTICAST_VIF
:
1614 case IP_MULTICAST_TTL
:
1615 case IP_MULTICAST_LOOP
:
1616 case IP_ADD_MEMBERSHIP
:
1617 case IP_DROP_MEMBERSHIP
:
1618 error
= ip_getmoptions(sopt
, inp
->inp_moptions
);
1621 #if defined(IPSEC) || defined(FAST_IPSEC)
1622 case IP_IPSEC_POLICY
:
1624 struct mbuf
*m
= NULL
;
1629 req
= mtod(m
, caddr_t
);
1632 error
= ipsec4_get_policy(so
->so_pcb
, req
, len
, &m
);
1634 error
= soopt_mcopyout(sopt
, m
); /* XXX */
1642 error
= ENOPROTOOPT
;
1651 * Set up IP options in pcb for insertion in output packets.
1652 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1653 * with destination address if source routed.
1656 ip_pcbopts(int optname
, struct mbuf
**pcbopt
, struct mbuf
*m
)
1662 /* turn off any old options */
1666 if (m
== NULL
|| m
->m_len
== 0) {
1668 * Only turning off any previous options.
1675 if (m
->m_len
% sizeof(int32_t))
1678 * IP first-hop destination address will be stored before
1679 * actual options; move other options back
1680 * and clear it when none present.
1682 if (m
->m_data
+ m
->m_len
+ sizeof(struct in_addr
) >= &m
->m_dat
[MLEN
])
1685 m
->m_len
+= sizeof(struct in_addr
);
1686 cp
= mtod(m
, u_char
*) + sizeof(struct in_addr
);
1687 ovbcopy(mtod(m
, caddr_t
), cp
, cnt
);
1688 bzero(mtod(m
, caddr_t
), sizeof(struct in_addr
));
1690 for (; cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
1691 opt
= cp
[IPOPT_OPTVAL
];
1692 if (opt
== IPOPT_EOL
)
1694 if (opt
== IPOPT_NOP
)
1697 if (cnt
< IPOPT_OLEN
+ sizeof *cp
)
1699 optlen
= cp
[IPOPT_OLEN
];
1700 if (optlen
< IPOPT_OLEN
+ sizeof *cp
|| optlen
> cnt
)
1711 * user process specifies route as:
1713 * D must be our final destination (but we can't
1714 * check that since we may not have connected yet).
1715 * A is first hop destination, which doesn't appear in
1716 * actual IP option, but is stored before the options.
1718 if (optlen
< IPOPT_MINOFF
- 1 + sizeof(struct in_addr
))
1720 m
->m_len
-= sizeof(struct in_addr
);
1721 cnt
-= sizeof(struct in_addr
);
1722 optlen
-= sizeof(struct in_addr
);
1723 cp
[IPOPT_OLEN
] = optlen
;
1725 * Move first hop before start of options.
1727 bcopy(&cp
[IPOPT_OFFSET
+1], mtod(m
, caddr_t
),
1728 sizeof(struct in_addr
));
1730 * Then copy rest of options back
1731 * to close up the deleted entry.
1733 ovbcopy(&cp
[IPOPT_OFFSET
+1] + sizeof(struct in_addr
),
1734 &cp
[IPOPT_OFFSET
+1],
1735 cnt
- (IPOPT_MINOFF
- 1));
1739 if (m
->m_len
> MAX_IPOPTLEN
+ sizeof(struct in_addr
))
/*
 * XXX
 * The whole multicast option thing needs to be re-thought.
 * Several of these options are equally applicable to non-multicast
 * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
 * standard option (IP_TTL).
 */
1758 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1760 static struct ifnet
*
1761 ip_multicast_if(struct in_addr
*a
, int *ifindexp
)
1768 if (ntohl(a
->s_addr
) >> 24 == 0) {
1769 ifindex
= ntohl(a
->s_addr
) & 0xffffff;
1770 if (ifindex
< 0 || if_index
< ifindex
)
1772 ifp
= ifindex2ifnet
[ifindex
];
1774 *ifindexp
= ifindex
;
1776 INADDR_TO_IFP(*a
, ifp
);
1782 * Set the IP multicast options in response to user setsockopt().
1785 ip_setmoptions(struct sockopt
*sopt
, struct ip_moptions
**imop
)
1789 struct in_addr addr
;
1790 struct ip_mreq mreq
;
1792 struct ip_moptions
*imo
= *imop
;
1797 * No multicast option buffer attached to the pcb;
1798 * allocate one and initialize to default values.
1800 imo
= kmalloc(sizeof *imo
, M_IPMOPTS
, M_WAITOK
);
1805 imo
->imo_multicast_ifp
= NULL
;
1806 imo
->imo_multicast_addr
.s_addr
= INADDR_ANY
;
1807 imo
->imo_multicast_vif
= -1;
1808 imo
->imo_multicast_ttl
= IP_DEFAULT_MULTICAST_TTL
;
1809 imo
->imo_multicast_loop
= IP_DEFAULT_MULTICAST_LOOP
;
1810 imo
->imo_num_memberships
= 0;
1813 switch (sopt
->sopt_name
) {
1814 /* store an index number for the vif you wanna use in the send */
1815 case IP_MULTICAST_VIF
:
1816 if (legal_vif_num
== 0) {
1820 error
= sooptcopyin(sopt
, &i
, sizeof i
, sizeof i
);
1823 if (!legal_vif_num(i
) && (i
!= -1)) {
1827 imo
->imo_multicast_vif
= i
;
1830 case IP_MULTICAST_IF
:
1832 * Select the interface for outgoing multicast packets.
1834 error
= sooptcopyin(sopt
, &addr
, sizeof addr
, sizeof addr
);
1838 * INADDR_ANY is used to remove a previous selection.
1839 * When no interface is selected, a default one is
1840 * chosen every time a multicast packet is sent.
1842 if (addr
.s_addr
== INADDR_ANY
) {
1843 imo
->imo_multicast_ifp
= NULL
;
1847 * The selected interface is identified by its local
1848 * IP address. Find the interface and confirm that
1849 * it supports multicasting.
1852 ifp
= ip_multicast_if(&addr
, &ifindex
);
1853 if (ifp
== NULL
|| !(ifp
->if_flags
& IFF_MULTICAST
)) {
1855 error
= EADDRNOTAVAIL
;
1858 imo
->imo_multicast_ifp
= ifp
;
1860 imo
->imo_multicast_addr
= addr
;
1862 imo
->imo_multicast_addr
.s_addr
= INADDR_ANY
;
1866 case IP_MULTICAST_TTL
:
1868 * Set the IP time-to-live for outgoing multicast packets.
1869 * The original multicast API required a char argument,
1870 * which is inconsistent with the rest of the socket API.
1871 * We allow either a char or an int.
1873 if (sopt
->sopt_valsize
== 1) {
1875 error
= sooptcopyin(sopt
, &ttl
, 1, 1);
1878 imo
->imo_multicast_ttl
= ttl
;
1881 error
= sooptcopyin(sopt
, &ttl
, sizeof ttl
, sizeof ttl
);
1887 imo
->imo_multicast_ttl
= ttl
;
1891 case IP_MULTICAST_LOOP
:
1893 * Set the loopback flag for outgoing multicast packets.
1894 * Must be zero or one. The original multicast API required a
1895 * char argument, which is inconsistent with the rest
1896 * of the socket API. We allow either a char or an int.
1898 if (sopt
->sopt_valsize
== 1) {
1901 error
= sooptcopyin(sopt
, &loop
, 1, 1);
1904 imo
->imo_multicast_loop
= !!loop
;
1908 error
= sooptcopyin(sopt
, &loop
, sizeof loop
,
1912 imo
->imo_multicast_loop
= !!loop
;
1916 case IP_ADD_MEMBERSHIP
:
1918 * Add a multicast group membership.
1919 * Group must be a valid IP multicast address.
1921 error
= sooptcopyin(sopt
, &mreq
, sizeof mreq
, sizeof mreq
);
1925 if (!IN_MULTICAST(ntohl(mreq
.imr_multiaddr
.s_addr
))) {
1931 * If no interface address was provided, use the interface of
1932 * the route to the given multicast address.
1934 if (mreq
.imr_interface
.s_addr
== INADDR_ANY
) {
1935 struct sockaddr_in dst
;
1938 bzero(&dst
, sizeof(struct sockaddr_in
));
1939 dst
.sin_len
= sizeof(struct sockaddr_in
);
1940 dst
.sin_family
= AF_INET
;
1941 dst
.sin_addr
= mreq
.imr_multiaddr
;
1942 rt
= rtlookup((struct sockaddr
*)&dst
);
1944 error
= EADDRNOTAVAIL
;
1951 ifp
= ip_multicast_if(&mreq
.imr_interface
, NULL
);
1955 * See if we found an interface, and confirm that it
1956 * supports multicast.
1958 if (ifp
== NULL
|| !(ifp
->if_flags
& IFF_MULTICAST
)) {
1959 error
= EADDRNOTAVAIL
;
1964 * See if the membership already exists or if all the
1965 * membership slots are full.
1967 for (i
= 0; i
< imo
->imo_num_memberships
; ++i
) {
1968 if (imo
->imo_membership
[i
]->inm_ifp
== ifp
&&
1969 imo
->imo_membership
[i
]->inm_addr
.s_addr
1970 == mreq
.imr_multiaddr
.s_addr
)
1973 if (i
< imo
->imo_num_memberships
) {
1978 if (i
== IP_MAX_MEMBERSHIPS
) {
1979 error
= ETOOMANYREFS
;
1984 * Everything looks good; add a new record to the multicast
1985 * address list for the given interface.
1987 if ((imo
->imo_membership
[i
] =
1988 in_addmulti(&mreq
.imr_multiaddr
, ifp
)) == NULL
) {
1993 ++imo
->imo_num_memberships
;
1997 case IP_DROP_MEMBERSHIP
:
1999 * Drop a multicast group membership.
2000 * Group must be a valid IP multicast address.
2002 error
= sooptcopyin(sopt
, &mreq
, sizeof mreq
, sizeof mreq
);
2006 if (!IN_MULTICAST(ntohl(mreq
.imr_multiaddr
.s_addr
))) {
2013 * If an interface address was specified, get a pointer
2014 * to its ifnet structure.
2016 if (mreq
.imr_interface
.s_addr
== INADDR_ANY
)
2019 ifp
= ip_multicast_if(&mreq
.imr_interface
, NULL
);
2021 error
= EADDRNOTAVAIL
;
2027 * Find the membership in the membership array.
2029 for (i
= 0; i
< imo
->imo_num_memberships
; ++i
) {
2031 imo
->imo_membership
[i
]->inm_ifp
== ifp
) &&
2032 imo
->imo_membership
[i
]->inm_addr
.s_addr
==
2033 mreq
.imr_multiaddr
.s_addr
)
2036 if (i
== imo
->imo_num_memberships
) {
2037 error
= EADDRNOTAVAIL
;
2042 * Give up the multicast address record to which the
2043 * membership points.
2045 in_delmulti(imo
->imo_membership
[i
]);
2047 * Remove the gap in the membership array.
2049 for (++i
; i
< imo
->imo_num_memberships
; ++i
)
2050 imo
->imo_membership
[i
-1] = imo
->imo_membership
[i
];
2051 --imo
->imo_num_memberships
;
2061 * If all options have default values, no need to keep the mbuf.
2063 if (imo
->imo_multicast_ifp
== NULL
&&
2064 imo
->imo_multicast_vif
== -1 &&
2065 imo
->imo_multicast_ttl
== IP_DEFAULT_MULTICAST_TTL
&&
2066 imo
->imo_multicast_loop
== IP_DEFAULT_MULTICAST_LOOP
&&
2067 imo
->imo_num_memberships
== 0) {
2068 kfree(*imop
, M_IPMOPTS
);
2076 * Return the IP multicast options in response to user getsockopt().
2079 ip_getmoptions(struct sockopt
*sopt
, struct ip_moptions
*imo
)
2081 struct in_addr addr
;
2082 struct in_ifaddr
*ia
;
2087 switch (sopt
->sopt_name
) {
2088 case IP_MULTICAST_VIF
:
2090 optval
= imo
->imo_multicast_vif
;
2093 error
= sooptcopyout(sopt
, &optval
, sizeof optval
);
2096 case IP_MULTICAST_IF
:
2097 if (imo
== NULL
|| imo
->imo_multicast_ifp
== NULL
)
2098 addr
.s_addr
= INADDR_ANY
;
2099 else if (imo
->imo_multicast_addr
.s_addr
) {
2100 /* return the value user has set */
2101 addr
= imo
->imo_multicast_addr
;
2103 IFP_TO_IA(imo
->imo_multicast_ifp
, ia
);
2104 addr
.s_addr
= (ia
== NULL
) ? INADDR_ANY
2105 : IA_SIN(ia
)->sin_addr
.s_addr
;
2107 error
= sooptcopyout(sopt
, &addr
, sizeof addr
);
2110 case IP_MULTICAST_TTL
:
2112 optval
= coptval
= IP_DEFAULT_MULTICAST_TTL
;
2114 optval
= coptval
= imo
->imo_multicast_ttl
;
2115 if (sopt
->sopt_valsize
== 1)
2116 error
= sooptcopyout(sopt
, &coptval
, 1);
2118 error
= sooptcopyout(sopt
, &optval
, sizeof optval
);
2121 case IP_MULTICAST_LOOP
:
2123 optval
= coptval
= IP_DEFAULT_MULTICAST_LOOP
;
2125 optval
= coptval
= imo
->imo_multicast_loop
;
2126 if (sopt
->sopt_valsize
== 1)
2127 error
= sooptcopyout(sopt
, &coptval
, 1);
2129 error
= sooptcopyout(sopt
, &optval
, sizeof optval
);
2133 error
= ENOPROTOOPT
;
2140 * Discard the IP multicast options.
2143 ip_freemoptions(struct ip_moptions
*imo
)
2148 for (i
= 0; i
< imo
->imo_num_memberships
; ++i
)
2149 in_delmulti(imo
->imo_membership
[i
]);
2150 kfree(imo
, M_IPMOPTS
);
2155 * Routine called from ip_output() to loop back a copy of an IP multicast
2156 * packet to the input queue of a specified interface. Note that this
2157 * calls the output routine of the loopback "driver", but with an interface
2158 * pointer that might NOT be a loopback interface -- evil, but easier than
2159 * replicating that code here.
2162 ip_mloopback(struct ifnet
*ifp
, struct mbuf
*m
, struct sockaddr_in
*dst
,
2168 copym
= m_copypacket(m
, MB_DONTWAIT
);
2169 if (copym
!= NULL
&& (copym
->m_flags
& M_EXT
|| copym
->m_len
< hlen
))
2170 copym
= m_pullup(copym
, hlen
);
2171 if (copym
!= NULL
) {
2173 * if the checksum hasn't been computed, mark it as valid
2175 if (copym
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
2176 in_delayed_cksum(copym
);
2177 copym
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
2178 copym
->m_pkthdr
.csum_flags
|=
2179 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
;
2180 copym
->m_pkthdr
.csum_data
= 0xffff;
2183 * We don't bother to fragment if the IP length is greater
2184 * than the interface's MTU. Can this possibly matter?
2186 ip
= mtod(copym
, struct ip
*);
2187 ip
->ip_len
= htons(ip
->ip_len
);
2188 ip
->ip_off
= htons(ip
->ip_off
);
2190 if (ip
->ip_vhl
== IP_VHL_BORING
) {
2191 ip
->ip_sum
= in_cksum_hdr(ip
);
2193 ip
->ip_sum
= in_cksum(copym
, hlen
);
2197 * It's not clear whether there are any lingering
2198 * reentrancy problems in other areas which might
2199 * be exposed by using ip_input directly (in
2200 * particular, everything which modifies the packet
2201 * in-place). Yet another option is using the
2202 * protosw directly to deliver the looped back
2203 * packet. For the moment, we'll err on the side
2204 * of safety by using if_simloop().
2207 if (dst
->sin_family
!= AF_INET
) {
2208 kprintf("ip_mloopback: bad address family %d\n",
2210 dst
->sin_family
= AF_INET
;
2215 copym
->m_pkthdr
.rcvif
= ifp
;
2218 if_simloop(ifp
, copym
, dst
->sin_family
, 0);