2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
36 #include "opt_ipsec.h"
37 #include "opt_mbuf_stress_test.h"
38 #include "opt_mpath.h"
39 #include "opt_route.h"
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
47 #include <sys/malloc.h>
51 #include <sys/protosw.h>
52 #include <sys/rmlock.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/sysctl.h>
57 #include <sys/ucred.h>
60 #include <net/if_var.h>
61 #include <net/if_llatbl.h>
62 #include <net/netisr.h>
64 #include <net/route.h>
65 #include <net/flowtable.h>
67 #include <net/radix_mpath.h>
69 #include <net/rss_config.h>
72 #include <netinet/in.h>
73 #include <netinet/in_kdtrace.h>
74 #include <netinet/in_systm.h>
75 #include <netinet/ip.h>
76 #include <netinet/in_pcb.h>
77 #include <netinet/in_rss.h>
78 #include <netinet/in_var.h>
79 #include <netinet/ip_var.h>
80 #include <netinet/ip_options.h>
82 #include <netinet/sctp.h>
83 #include <netinet/sctp_crc32.h>
87 #include <netinet/ip_ipsec.h>
88 #include <netipsec/ipsec.h>
91 #include <machine/in_cksum.h>
93 #include <security/mac/mac_framework.h>
/*
 * Stress-test knob, compiled only with MBUF_STRESS_TEST: a read/write
 * integer sysctl named mbuf_frag_size under _net_inet_ip.  ip_output()
 * passes packets longer than a non-zero value through m_fragment().
 */
95 #ifdef MBUF_STRESS_TEST
96 static int mbuf_frag_size
= 0;
97 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, mbuf_frag_size
, CTLFLAG_RW
,
98 &mbuf_frag_size
, 0, "Fragment outgoing mbufs to this size");
/* Forward declaration of the file-local multicast loopback helper. */
101 static void ip_mloopback(struct ifnet
*, const struct mbuf
*, int);
/* Symbols defined outside this file. */
104 extern int in_mcast_loop
;
105 extern struct protosw inetsw
[];
/*
 * Run the outbound packet filter hooks (V_inet_pfil_hook, PFIL_OUT) on the
 * packet referenced by mp and react to what the filter changed.
 *
 *   mp      in/out pointer to the packet; handed to pfil_run_hooks()
 *   ifp     outgoing interface passed to the hooks
 *   inp     PCB passed to the hooks
 *   dst     destination sockaddr owned by the caller; rewritten here when
 *           the filter changed the destination or attached a next hop tag
 *   fibnum  in/out FIB number; updated when M_GETFIB(m) differs afterwards
 *   error   out: result of pfil_run_hooks() or of netisr_queue()
 *
 * Visible return values: 1 means the packet has been consumed (hook error,
 * packet gone, or queued to NETISR_IP), -1 means the caller must redo its
 * lookup.  ip_output() also handles a 0 result as "continue normally".
 *
 * NOTE(review): this listing is missing lines (local declarations, braces,
 * the assignment of m and the final return), so confirm those against the
 * complete file.
 */
108 ip_output_pfil(struct mbuf
**mp
, struct ifnet
*ifp
, struct inpcb
*inp
,
109 struct sockaddr_in
*dst
, int *fibnum
, int *error
)
111 struct m_tag
*fwd_tag
= NULL
;
117 ip
= mtod(m
, struct ip
*);
119 /* Run through list of hooks for output packets. */
/* Remember the destination first so a rewrite by a hook can be detected. */
120 odst
.s_addr
= ip
->ip_dst
.s_addr
;
121 *error
= pfil_run_hooks(&V_inet_pfil_hook
, mp
, ifp
, PFIL_OUT
, inp
);
123 if ((*error
) != 0 || m
== NULL
)
124 return 1; /* Finished */
/* Re-read the header pointer after the hooks have run. */
126 ip
= mtod(m
, struct ip
*);
128 /* See if destination IP address was changed by packet filter. */
129 if (odst
.s_addr
!= ip
->ip_dst
.s_addr
) {
130 m
->m_flags
|= M_SKIP_FIREWALL
;
131 /* If destination is now ourself drop to ip_input(). */
132 if (in_localip(ip
->ip_dst
)) {
133 m
->m_flags
|= M_FASTFWD_OURS
;
134 if (m
->m_pkthdr
.rcvif
== NULL
)
135 m
->m_pkthdr
.rcvif
= V_loif
;
/* Pending delayed checksums are flagged as already valid before queuing. */
136 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
137 m
->m_pkthdr
.csum_flags
|=
138 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
;
139 m
->m_pkthdr
.csum_data
= 0xffff;
141 m
->m_pkthdr
.csum_flags
|=
142 CSUM_IP_CHECKED
| CSUM_IP_VALID
;
144 if (m
->m_pkthdr
.csum_flags
& CSUM_SCTP
)
145 m
->m_pkthdr
.csum_flags
|= CSUM_SCTP_VALID
;
147 *error
= netisr_queue(NETISR_IP
, m
);
148 return 1; /* Finished */
/* New destination is not local: rebuild *dst from the rewritten header. */
151 bzero(dst
, sizeof(*dst
));
152 dst
->sin_family
= AF_INET
;
153 dst
->sin_len
= sizeof(*dst
);
154 dst
->sin_addr
= ip
->ip_dst
;
156 return -1; /* Reloop */
158 /* See if fib was changed by packet filter. */
159 if ((*fibnum
) != M_GETFIB(m
)) {
160 m
->m_flags
|= M_SKIP_FIREWALL
;
161 *fibnum
= M_GETFIB(m
);
162 return -1; /* Reloop for FIB change */
165 /* See if local, if yes, send it to netisr with M_FASTFWD_OURS. */
166 if (m
->m_flags
& M_FASTFWD_OURS
) {
167 if (m
->m_pkthdr
.rcvif
== NULL
)
168 m
->m_pkthdr
.rcvif
= V_loif
;
169 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
170 m
->m_pkthdr
.csum_flags
|=
171 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
;
172 m
->m_pkthdr
.csum_data
= 0xffff;
175 if (m
->m_pkthdr
.csum_flags
& CSUM_SCTP
)
176 m
->m_pkthdr
.csum_flags
|= CSUM_SCTP_VALID
;
178 m
->m_pkthdr
.csum_flags
|=
179 CSUM_IP_CHECKED
| CSUM_IP_VALID
;
181 *error
= netisr_queue(NETISR_IP
, m
);
182 return 1; /* Finished */
184 /* Or forward to some other address? */
/*
 * The next hop sockaddr_in is stored directly after the m_tag header; copy
 * it into *dst, then drop both the flag and the tag so it is used once.
 */
185 if ((m
->m_flags
& M_IP_NEXTHOP
) &&
186 ((fwd_tag
= m_tag_find(m
, PACKET_TAG_IPFORWARD
, NULL
)) != NULL
)) {
187 bcopy((fwd_tag
+1), dst
, sizeof(struct sockaddr_in
));
188 m
->m_flags
|= M_SKIP_FIREWALL
;
189 m
->m_flags
&= ~M_IP_NEXTHOP
;
190 m_tag_delete(m
, fwd_tag
);
192 return -1; /* Reloop for CHANGE of dst */
199 * IP output. The packet in mbuf chain m contains a skeletal IP
200 * header (with len, off, ttl, proto, tos, src, dst).
201 * The mbuf chain containing the packet will be freed.
202 * The mbuf opt, if present, will not be freed.
203 * If route ro is present and has ro_rt initialized, route lookup would be
204 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
205 * then result of route lookup is stored in ro->ro_rt.
207 * In the IP forwarding case, the packet will arrive with options already
208 * inserted, so must have a NULL opt pointer.
/*
 * Parameters as used by the visible code:
 *   m      packet to send; its first mbuf holds the IP header
 *   opt    options handed to ip_insertoptions()
 *   ro     route cache; its ro_dst, ro_rt and ro_lle fields are consulted and
 *          refreshed here, and the local iproute is compared against it at
 *          the end to decide on release
 *   flags  IP_FORWARDING, IP_RAWOUTPUT, IP_SENDONES, IP_ROUTETOIF,
 *          IP_ALLOWBROADCAST and IP_NODEFAULTFLOWID are tested
 *   imo    multicast options (interface, vif, ttl, loop); may be NULL
 *   inp    PCB supplying the FIB number, flow id and route cookie; compared
 *          against NULL when the FIB number is chosen
 *
 * Visible stages, in order: seed FIB and flow id from inp, insert options
 * and fill in version/header length for locally built headers, set up dst
 * and validate any cached route, choose the outgoing interface (IP_SENDONES,
 * IP_ROUTETOIF, explicit multicast interface, or a route lookup), apply the
 * multicast and broadcast rules, call ip_ipsec_output(), run the pfil hooks
 * through ip_output_pfil() with a retry on -1, reject 127/8 addresses on
 * non-loopback interfaces, finish checksums the interface cannot offload,
 * then either hand the packet to ifp->if_output directly or split it with
 * ip_fragment() and send each fragment.
 *
 * NOTE(review): this listing is missing many lines (declarations, braces,
 * labels, gotos and the return), so the exact control flow between the
 * visible statements must be confirmed against the complete file.
 */
211 ip_output(struct mbuf
*m
, struct mbuf
*opt
, struct route
*ro
, int flags
,
212 struct ip_moptions
*imo
, struct inpcb
*inp
)
214 struct rm_priotracker in_ifa_tracker
;
216 struct ifnet
*ifp
= NULL
; /* keep compiler happy */
218 int hlen
= sizeof (struct ip
);
221 struct sockaddr_in
*dst
;
222 const struct sockaddr_in
*gw
;
223 struct in_ifaddr
*ia
;
225 uint16_t ip_len
, ip_off
;
226 struct route iproute
;
227 struct rtentry
*rte
; /* cache for ro->ro_rt */
231 int no_route_but_check_spd
= 0;
236 INP_LOCK_ASSERT(inp
);
237 M_SETFIB(m
, inp
->inp_inc
.inc_fibnum
);
238 if ((flags
& IP_NODEFAULTFLOWID
) == 0) {
239 m
->m_pkthdr
.flowid
= inp
->inp_flowid
;
240 M_HASHTYPE_SET(m
, inp
->inp_flowtype
);
246 bzero(ro
, sizeof (*ro
));
248 ro
->ro_flags
|= RT_LLE_CACHE
;
251 if (ro
->ro_rt
== NULL
)
252 (void )flowtable_lookup(AF_INET
, m
, ro
);
257 m
= ip_insertoptions(m
, opt
, &len
);
259 hlen
= len
; /* ip->ip_hl is updated above */
261 ip
= mtod(m
, struct ip
*);
262 ip_len
= ntohs(ip
->ip_len
);
263 ip_off
= ntohs(ip
->ip_off
);
265 if ((flags
& (IP_FORWARDING
|IP_RAWOUTPUT
)) == 0) {
266 ip
->ip_v
= IPVERSION
;
267 ip
->ip_hl
= hlen
>> 2;
269 IPSTAT_INC(ips_localout
);
271 /* Header already set, fetch hlen from there */
272 hlen
= ip
->ip_hl
<< 2;
278 * dst can be rewritten but always points to &ro->ro_dst.
279 * gw is readonly but can point either to dst OR rt_gateway,
280 * therefore we need restore gw if we're redoing lookup.
282 gw
= dst
= (struct sockaddr_in
*)&ro
->ro_dst
;
283 fibnum
= (inp
!= NULL
) ? inp
->inp_inc
.inc_fibnum
: M_GETFIB(m
);
286 bzero(dst
, sizeof(*dst
));
287 dst
->sin_family
= AF_INET
;
288 dst
->sin_len
= sizeof(*dst
);
289 dst
->sin_addr
= ip
->ip_dst
;
293 * Validate route against routing table additions;
294 * a better/more specific route might have been added.
297 RT_VALIDATE(ro
, &inp
->inp_rt_cookie
, fibnum
);
299 * If there is a cached route,
300 * check that it is to the same destination
301 * and is still up. If not, free it and try again.
302 * The address family should also be checked in case of sharing the
304 * Also check whether routing cache needs invalidation.
307 if (rte
&& ((rte
->rt_flags
& RTF_UP
) == 0 ||
308 rte
->rt_ifp
== NULL
||
309 !RT_LINK_IS_UP(rte
->rt_ifp
) ||
310 dst
->sin_family
!= AF_INET
||
311 dst
->sin_addr
.s_addr
!= ip
->ip_dst
.s_addr
)) {
313 rte
= ro
->ro_rt
= (struct rtentry
*)NULL
;
315 LLE_FREE(ro
->ro_lle
); /* zeros ro_lle */
316 ro
->ro_lle
= (struct llentry
*)NULL
;
321 * If routing to interface only, short circuit routing lookup.
322 * The use of an all-ones broadcast address implies this; an
323 * interface is specified by the broadcast address of an interface,
324 * or the destination address of a ptp interface.
326 if (flags
& IP_SENDONES
) {
327 if ((ia
= ifatoia(ifa_ifwithbroadaddr(sintosa(dst
),
328 M_GETFIB(m
)))) == NULL
&&
329 (ia
= ifatoia(ifa_ifwithdstaddr(sintosa(dst
),
330 M_GETFIB(m
)))) == NULL
) {
331 IPSTAT_INC(ips_noroute
);
336 ip
->ip_dst
.s_addr
= INADDR_BROADCAST
;
337 dst
->sin_addr
= ip
->ip_dst
;
341 } else if (flags
& IP_ROUTETOIF
) {
342 if ((ia
= ifatoia(ifa_ifwithdstaddr(sintosa(dst
),
343 M_GETFIB(m
)))) == NULL
&&
344 (ia
= ifatoia(ifa_ifwithnet(sintosa(dst
), 0,
345 M_GETFIB(m
)))) == NULL
) {
346 IPSTAT_INC(ips_noroute
);
353 isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
354 } else if (IN_MULTICAST(ntohl(ip
->ip_dst
.s_addr
)) &&
355 imo
!= NULL
&& imo
->imo_multicast_ifp
!= NULL
) {
357 * Bypass the normal routing lookup for multicast
358 * packets if the interface is specified.
360 ifp
= imo
->imo_multicast_ifp
;
361 IFP_TO_IA(ifp
, ia
, &in_ifa_tracker
);
364 isbroadcast
= 0; /* fool gcc */
367 * We want to do any cloning requested by the link layer,
368 * as this is probably required in all cases for correct
369 * operation (as it is for ARP).
373 rtalloc_mpath_fib(ro
,
374 ntohl(ip
->ip_src
.s_addr
^ ip
->ip_dst
.s_addr
),
377 in_rtalloc_ign(ro
, 0, fibnum
);
382 (rte
->rt_flags
& RTF_UP
) == 0 ||
383 rte
->rt_ifp
== NULL
||
384 !RT_LINK_IS_UP(rte
->rt_ifp
)) {
387 * There is no route for this packet, but it is
388 * possible that a matching SPD entry exists.
390 no_route_but_check_spd
= 1;
391 mtu
= 0; /* Silence GCC warning. */
394 IPSTAT_INC(ips_noroute
);
395 error
= EHOSTUNREACH
;
398 ia
= ifatoia(rte
->rt_ifa
);
400 counter_u64_add(rte
->rt_pksent
, 1);
401 rt_update_ro_flags(ro
);
402 if (rte
->rt_flags
& RTF_GATEWAY
)
403 gw
= (struct sockaddr_in
*)rte
->rt_gateway
;
404 if (rte
->rt_flags
& RTF_HOST
)
405 isbroadcast
= (rte
->rt_flags
& RTF_BROADCAST
);
407 isbroadcast
= in_broadcast(gw
->sin_addr
, ifp
);
411 * Calculate MTU. If we have a route that is up, use that,
412 * otherwise use the interface's MTU.
414 if (rte
!= NULL
&& (rte
->rt_flags
& (RTF_UP
|RTF_HOST
)))
418 /* Catch a possible divide by zero later. */
419 KASSERT(mtu
> 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
420 __func__
, mtu
, rte
, (rte
!= NULL
) ? rte
->rt_flags
: 0, ifp
));
422 if (IN_MULTICAST(ntohl(ip
->ip_dst
.s_addr
))) {
423 m
->m_flags
|= M_MCAST
;
425 * IP destination address is multicast. Make sure "gw"
426 * still points to the address in "ro". (It may have been
427 * changed to point to a gateway address, above.)
431 * See if the caller provided any multicast options
434 ip
->ip_ttl
= imo
->imo_multicast_ttl
;
435 if (imo
->imo_multicast_vif
!= -1)
438 ip_mcast_src(imo
->imo_multicast_vif
) :
441 ip
->ip_ttl
= IP_DEFAULT_MULTICAST_TTL
;
443 * Confirm that the outgoing interface supports multicast.
445 if ((imo
== NULL
) || (imo
->imo_multicast_vif
== -1)) {
446 if ((ifp
->if_flags
& IFF_MULTICAST
) == 0) {
447 IPSTAT_INC(ips_noroute
);
453 * If source address not specified yet, use address
454 * of outgoing interface.
456 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
457 /* Interface may have no addresses. */
459 ip
->ip_src
= IA_SIN(ia
)->sin_addr
;
462 if ((imo
== NULL
&& in_mcast_loop
) ||
463 (imo
&& imo
->imo_multicast_loop
)) {
465 * Loop back multicast datagram if not expressly
466 * forbidden to do so, even if we are not a member
467 * of the group; ip_input() will filter it later,
468 * thus deferring a hash lookup and mutex acquisition
469 * at the expense of a cheap copy using m_copym().
471 ip_mloopback(ifp
, m
, hlen
);
474 * If we are acting as a multicast router, perform
475 * multicast forwarding as if the packet had just
476 * arrived on the interface to which we are about
477 * to send. The multicast forwarding function
478 * recursively calls this function, using the
479 * IP_FORWARDING flag to prevent infinite recursion.
481 * Multicasts that are looped back by ip_mloopback(),
482 * above, will be forwarded by the ip_input() routine,
485 if (V_ip_mrouter
&& (flags
& IP_FORWARDING
) == 0) {
487 * If rsvp daemon is not running, do not
488 * set ip_moptions. This ensures that the packet
489 * is multicast and not just sent down one link
490 * as prescribed by rsvpd.
495 ip_mforward(ip
, ifp
, m
, imo
) != 0) {
503 * Multicasts with a time-to-live of zero may be looped-
504 * back, above, but must not be transmitted on a network.
505 * Also, multicasts addressed to the loopback interface
506 * are not sent -- the above call to ip_mloopback() will
507 * loop back a copy. ip_input() will drop the copy if
508 * this host does not belong to the destination group on
509 * the loopback interface.
511 if (ip
->ip_ttl
== 0 || ifp
->if_flags
& IFF_LOOPBACK
) {
520 * If the source address is not specified yet, use the address
521 * of the outoing interface.
523 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
524 /* Interface may have no addresses. */
526 ip
->ip_src
= IA_SIN(ia
)->sin_addr
;
531 * Look for broadcast address and
532 * verify user is allowed to send
536 if ((ifp
->if_flags
& IFF_BROADCAST
) == 0) {
537 error
= EADDRNOTAVAIL
;
540 if ((flags
& IP_ALLOWBROADCAST
) == 0) {
544 /* don't allow broadcast messages to be fragmented */
549 m
->m_flags
|= M_BCAST
;
551 m
->m_flags
&= ~M_BCAST
;
556 switch(ip_ipsec_output(&m
, inp
, &error
)) {
563 break; /* Continue with packet processing. */
566 * Check if there was a route for this packet; return error if not.
568 if (no_route_but_check_spd
) {
569 IPSTAT_INC(ips_noroute
);
570 error
= EHOSTUNREACH
;
573 /* Update variables that are affected by ipsec4_output(). */
574 ip
= mtod(m
, struct ip
*);
575 hlen
= ip
->ip_hl
<< 2;
578 /* Jump over all PFIL processing if hooks are not active. */
579 if (PFIL_HOOKED(&V_inet_pfil_hook
)) {
580 switch (ip_output_pfil(&m
, ifp
, inp
, dst
, &fibnum
, &error
)) {
581 case 1: /* Finished */
584 case 0: /* Continue normally */
585 ip
= mtod(m
, struct ip
*);
588 case -1: /* Need to try again */
589 /* Reset everything for a new round */
592 ifa_free(&ia
->ia_ifa
);
593 ro
->ro_prepend
= NULL
;
596 ip
= mtod(m
, struct ip
*);
602 /* 127/8 must not appear on wire - RFC1122. */
603 if ((ntohl(ip
->ip_dst
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
||
604 (ntohl(ip
->ip_src
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
) {
605 if ((ifp
->if_flags
& IFF_LOOPBACK
) == 0) {
606 IPSTAT_INC(ips_badaddr
);
607 error
= EADDRNOTAVAIL
;
612 m
->m_pkthdr
.csum_flags
|= CSUM_IP
;
613 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
& ~ifp
->if_hwassist
) {
615 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
618 if (m
->m_pkthdr
.csum_flags
& CSUM_SCTP
& ~ifp
->if_hwassist
) {
619 sctp_delayed_cksum(m
, (uint32_t)(ip
->ip_hl
<< 2));
620 m
->m_pkthdr
.csum_flags
&= ~CSUM_SCTP
;
625 * If small enough for interface, or the interface will take
626 * care of the fragmentation for us, we can just send directly.
629 (m
->m_pkthdr
.csum_flags
& ifp
->if_hwassist
& CSUM_TSO
) != 0) {
631 if (m
->m_pkthdr
.csum_flags
& CSUM_IP
& ~ifp
->if_hwassist
) {
632 ip
->ip_sum
= in_cksum(m
, hlen
);
633 m
->m_pkthdr
.csum_flags
&= ~CSUM_IP
;
637 * Record statistics for this interface address.
638 * With CSUM_TSO the byte/packet count will be slightly
639 * incorrect because we count the IP+TCP headers only
640 * once instead of for every generated packet.
642 if (!(flags
& IP_FORWARDING
) && ia
) {
643 if (m
->m_pkthdr
.csum_flags
& CSUM_TSO
)
644 counter_u64_add(ia
->ia_ifa
.ifa_opackets
,
645 m
->m_pkthdr
.len
/ m
->m_pkthdr
.tso_segsz
);
647 counter_u64_add(ia
->ia_ifa
.ifa_opackets
, 1);
649 counter_u64_add(ia
->ia_ifa
.ifa_obytes
, m
->m_pkthdr
.len
);
651 #ifdef MBUF_STRESS_TEST
652 if (mbuf_frag_size
&& m
->m_pkthdr
.len
> mbuf_frag_size
)
653 m
= m_fragment(m
, M_NOWAIT
, mbuf_frag_size
);
656 * Reset layer specific mbuf flags
657 * to avoid confusing lower layers.
660 IP_PROBE(send
, NULL
, NULL
, ip
, ifp
, ip
, NULL
);
661 error
= (*ifp
->if_output
)(ifp
, m
,
662 (const struct sockaddr
*)gw
, ro
);
666 /* Balk when DF bit is set or the interface didn't support TSO. */
667 if ((ip_off
& IP_DF
) || (m
->m_pkthdr
.csum_flags
& CSUM_TSO
)) {
669 IPSTAT_INC(ips_cantfrag
);
674 * Too large for interface; fragment if possible. If successful,
675 * on return, m will point to a list of packets to be sent.
677 error
= ip_fragment(ip
, &m
, mtu
, ifp
->if_hwassist
);
684 /* Record statistics for this interface address. */
686 counter_u64_add(ia
->ia_ifa
.ifa_opackets
, 1);
687 counter_u64_add(ia
->ia_ifa
.ifa_obytes
,
691 * Reset layer specific mbuf flags
692 * to avoid confusing upper layers.
696 IP_PROBE(send
, NULL
, NULL
, ip
, ifp
, ip
, NULL
);
697 error
= (*ifp
->if_output
)(ifp
, m
,
698 (const struct sockaddr
*)gw
, ro
);
704 IPSTAT_INC(ips_fragmented
);
708 * Release the route if using our private route, or if
709 * (with flowtable) we don't have our own reference.
711 if (ro
== &iproute
|| ro
->ro_flags
& RT_NORTREF
)
713 else if (rte
== NULL
)
715 * If the caller supplied a route but somehow the reference
716 * to it has been released need to prevent the caller
717 * calling RTFREE on it again.
721 ifa_free(&ia
->ia_ifa
);
729 * Create a chain of fragments which fit the given mtu. m_frag points to the
730 * mbuf to be fragmented; on return it points to the chain with the fragments.
731 * Return 0 if no error. If error, m_frag may contain a partially built
732 * chain of fragments that should be freed by the caller.
734 * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
/*
 * Parameters as used by the visible code:
 *   ip                 header of the packet to split; ip_hl, ip_len and
 *                      ip_off are read, and the header is rewritten at the
 *                      end to describe the first fragment
 *   m_frag             in: *m_frag is the original packet (m0)
 *   mtu                link MTU; per-fragment payload is (mtu - hlen)
 *                      rounded down to a multiple of 8
 *   if_hwassist_flags  offload capabilities; CSUM_IP work not covered by
 *                      these flags is done here with in_cksum()
 *
 * Visible stages: refuse when IP_DF is set, finish delayed data and SCTP
 * checksums on m0, optionally pick page-aligned sizes when the payload
 * length exceeds PAGE_SIZE, then loop building one new header mbuf per
 * fragment (m_gethdr, m_dup_pkthdr, option copy, m_copym of the payload)
 * linked through m_nextpkt, and finally trim m0 with m_adj() so it becomes
 * the first fragment with IP_MF set.
 *
 * NOTE(review): this listing is missing lines (declarations, braces, error
 * exits and the return), so confirm failure handling against the complete
 * file.
 */
737 ip_fragment(struct ip
*ip
, struct mbuf
**m_frag
, int mtu
,
738 u_long if_hwassist_flags
)
741 int hlen
= ip
->ip_hl
<< 2;
742 int len
= (mtu
- hlen
) & ~7; /* size of payload in each fragment */
744 struct mbuf
*m0
= *m_frag
; /* the original packet */
748 uint16_t ip_len
, ip_off
;
750 ip_len
= ntohs(ip
->ip_len
);
751 ip_off
= ntohs(ip
->ip_off
);
753 if (ip_off
& IP_DF
) { /* Fragmentation not allowed */
754 IPSTAT_INC(ips_cantfrag
);
759 * Must be able to put at least 8 bytes per fragment.
765 * If the interface will not calculate checksums on
766 * fragmented packets, then do it here.
768 if (m0
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
769 in_delayed_cksum(m0
);
770 m0
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
773 if (m0
->m_pkthdr
.csum_flags
& CSUM_SCTP
) {
774 sctp_delayed_cksum(m0
, hlen
);
775 m0
->m_pkthdr
.csum_flags
&= ~CSUM_SCTP
;
778 if (len
> PAGE_SIZE
) {
780 * Fragment large datagrams such that each segment
781 * contains a multiple of PAGE_SIZE amount of data,
782 * plus headers. This enables a receiver to perform
783 * page-flipping zero-copy optimizations.
785 * XXX When does this help given that sender and receiver
786 * could have different page sizes, and also mtu could
787 * be less than the receiver's page size ?
791 off
= MIN(mtu
, m0
->m_pkthdr
.len
);
794 * firstlen (off - hlen) must be aligned on an
798 goto smart_frag_failure
;
799 off
= ((off
- hlen
) & ~7) + hlen
;
800 newlen
= (~PAGE_MASK
) & mtu
;
801 if ((newlen
+ sizeof (struct ip
)) > mtu
) {
802 /* we failed, go back the default */
813 firstlen
= off
- hlen
;
814 mnext
= &m0
->m_nextpkt
; /* pointer to next packet */
817 * Loop through length of segment after first fragment,
818 * make new header and copy data of each part and link onto chain.
819 * Here, m0 is the original packet, m is the fragment being created.
820 * The fragments are linked off the m_nextpkt of the original
821 * packet, which after processing serves as the first fragment.
823 for (nfrags
= 1; off
< ip_len
; off
+= len
, nfrags
++) {
824 struct ip
*mhip
; /* ip header on the fragment */
826 int mhlen
= sizeof (struct ip
);
828 m
= m_gethdr(M_NOWAIT
, MT_DATA
);
831 IPSTAT_INC(ips_odropped
);
835 * Make sure the complete packet header gets copied
836 * from the originating mbuf to the newly created
837 * mbuf. This also ensures that existing firewall
838 * classification(s), VLAN tags and so on get copied
839 * to the resulting fragmented packet(s):
841 if (m_dup_pkthdr(m
, m0
, M_NOWAIT
) == 0) {
844 IPSTAT_INC(ips_odropped
);
848 * In the first mbuf, leave room for the link header, then
849 * copy the original IP header including options. The payload
850 * goes into an additional mbuf chain returned by m_copym().
852 m
->m_data
+= max_linkhdr
;
853 mhip
= mtod(m
, struct ip
*);
855 if (hlen
> sizeof (struct ip
)) {
856 mhlen
= ip_optcopy(ip
, mhip
) + sizeof (struct ip
);
857 mhip
->ip_v
= IPVERSION
;
858 mhip
->ip_hl
= mhlen
>> 2;
861 /* XXX do we need to add ip_off below ? */
862 mhip
->ip_off
= ((off
- hlen
) >> 3) + ip_off
;
863 if (off
+ len
>= ip_len
)
866 mhip
->ip_off
|= IP_MF
;
867 mhip
->ip_len
= htons((u_short
)(len
+ mhlen
));
868 m
->m_next
= m_copym(m0
, off
, len
, M_NOWAIT
);
869 if (m
->m_next
== NULL
) { /* copy failed */
871 error
= ENOBUFS
; /* ??? */
872 IPSTAT_INC(ips_odropped
);
875 m
->m_pkthdr
.len
= mhlen
+ len
;
877 mac_netinet_fragment(m0
, m
);
879 mhip
->ip_off
= htons(mhip
->ip_off
);
881 if (m
->m_pkthdr
.csum_flags
& CSUM_IP
& ~if_hwassist_flags
) {
882 mhip
->ip_sum
= in_cksum(m
, mhlen
);
883 m
->m_pkthdr
.csum_flags
&= ~CSUM_IP
;
886 mnext
= &m
->m_nextpkt
;
888 IPSTAT_ADD(ips_ofragments
, nfrags
);
891 * Update first fragment by trimming what's been copied out
892 * and updating header.
894 m_adj(m0
, hlen
+ firstlen
- ip_len
);
895 m0
->m_pkthdr
.len
= hlen
+ firstlen
;
896 ip
->ip_len
= htons((u_short
)m0
->m_pkthdr
.len
);
897 ip
->ip_off
= htons(ip_off
| IP_MF
);
899 if (m0
->m_pkthdr
.csum_flags
& CSUM_IP
& ~if_hwassist_flags
) {
900 ip
->ip_sum
= in_cksum(m0
, hlen
);
901 m0
->m_pkthdr
.csum_flags
&= ~CSUM_IP
;
/*
 * Compute a deferred transport checksum for the packet in m and store it
 * into the packet.
 *
 * The checksum covers ip_len bytes of the chain, skipping the IP header
 * (ip_hl << 2 bytes).  The store location is the header length plus
 * m_pkthdr.csum_data, and it must lie entirely inside a single mbuf; both
 * conditions are enforced with KASSERT.
 *
 * NOTE(review): this listing is missing lines (the statement guarded by the
 * CSUM_UDP test and the body of the chain walk), so confirm those against
 * the complete file.
 */
910 in_delayed_cksum(struct mbuf
*m
)
913 uint16_t csum
, offset
, ip_len
;
915 ip
= mtod(m
, struct ip
*);
916 offset
= ip
->ip_hl
<< 2 ;
917 ip_len
= ntohs(ip
->ip_len
);
918 csum
= in_cksum_skip(m
, ip_len
, offset
);
/* Special case: UDP packet whose computed checksum is zero. */
919 if (m
->m_pkthdr
.csum_flags
& CSUM_UDP
&& csum
== 0)
921 offset
+= m
->m_pkthdr
.csum_data
; /* checksum offset */
923 /* find the mbuf in the chain where the checksum starts*/
924 while ((m
!= NULL
) && (offset
>= m
->m_len
)) {
928 KASSERT(m
!= NULL
, ("in_delayed_cksum: checksum outside mbuf chain."));
929 KASSERT(offset
+ sizeof(u_short
) <= m
->m_len
, ("in_delayed_cksum: checksum split between mbufs."));
/* Write the checksum at the located offset within this mbuf. */
930 *(u_short
*)(m
->m_data
+ offset
) = csum
;
934 * IP socket option processing.
/*
 * Handle a get or set socket option request for the PCB behind so.
 *
 *   so    socket whose PCB (sotoinpcb(so)) is read or modified
 *   sopt  request: sopt_level, sopt_dir, sopt_name and the value buffer
 *
 * Requests whose level is not IPPROTO_IP are examined first: for a
 * SOL_SOCKET set request the code mirrors SO_REUSEADDR and SO_REUSEPORT
 * from so_options into inp_flags2 and copies so_fibnum into the PCB.
 *
 * For IPPROTO_IP the code switches on sopt_dir and then on sopt_name.
 * On the set side, IP options go through ip_pcbopts(), integer options are
 * copied in to optval and stored in PCB fields or toggled with the OPTSET
 * and OPTSET2 macros, multicast options are passed to inp_setmoptions(),
 * port range selections update INP_LOWPORT and INP_HIGHPORT, and IPsec
 * policy goes to ipsec_set_policy().  On the get side, the corresponding
 * values are read back with OPTBIT and OPTBIT2 or the PCB fields and
 * copied out, multicast options come from inp_getmoptions(), and IPsec
 * policy from ipsec_get_policy().  Unhandled names yield ENOPROTOOPT.
 *
 * NOTE(review): this listing is missing many lines (case labels, braces,
 * breaks, locking and the return), so the mapping of option names to the
 * visible statements must be confirmed against the complete file.
 */
937 ip_ctloutput(struct socket
*so
, struct sockopt
*sopt
)
939 struct inpcb
*inp
= sotoinpcb(so
);
947 if (sopt
->sopt_level
!= IPPROTO_IP
) {
950 if (sopt
->sopt_level
== SOL_SOCKET
&&
951 sopt
->sopt_dir
== SOPT_SET
) {
952 switch (sopt
->sopt_name
) {
955 if ((so
->so_options
& SO_REUSEADDR
) != 0)
956 inp
->inp_flags2
|= INP_REUSEADDR
;
958 inp
->inp_flags2
&= ~INP_REUSEADDR
;
964 if ((so
->so_options
& SO_REUSEPORT
) != 0)
965 inp
->inp_flags2
|= INP_REUSEPORT
;
967 inp
->inp_flags2
&= ~INP_REUSEPORT
;
973 inp
->inp_inc
.inc_fibnum
= so
->so_fibnum
;
984 switch (sopt
->sopt_dir
) {
986 switch (sopt
->sopt_name
) {
993 if (sopt
->sopt_valsize
> MLEN
) {
997 m
= m_get(sopt
->sopt_td
? M_WAITOK
: M_NOWAIT
, MT_DATA
);
1002 m
->m_len
= sopt
->sopt_valsize
;
1003 error
= sooptcopyin(sopt
, mtod(m
, char *), m
->m_len
,
1010 error
= ip_pcbopts(inp
, sopt
->sopt_name
, m
);
1016 if (sopt
->sopt_td
!= NULL
) {
1017 error
= priv_check(sopt
->sopt_td
,
1018 PRIV_NETINET_BINDANY
);
1025 case IP_RSS_LISTEN_BUCKET
:
1031 case IP_RECVRETOPTS
:
1032 case IP_RECVDSTADDR
:
1040 case IP_RECVRSSBUCKETID
:
1042 error
= sooptcopyin(sopt
, &optval
, sizeof optval
,
1047 switch (sopt
->sopt_name
) {
1049 inp
->inp_ip_tos
= optval
;
1053 inp
->inp_ip_ttl
= optval
;
1057 if (optval
>= 0 && optval
<= MAXTTL
)
1058 inp
->inp_ip_minttl
= optval
;
1063 #define OPTSET(bit) do { \
1066 inp->inp_flags |= bit; \
1068 inp->inp_flags &= ~bit; \
1072 #define OPTSET2(bit, val) do { \
1075 inp->inp_flags2 |= bit; \
1077 inp->inp_flags2 &= ~bit; \
1082 OPTSET(INP_RECVOPTS
);
1085 case IP_RECVRETOPTS
:
1086 OPTSET(INP_RECVRETOPTS
);
1089 case IP_RECVDSTADDR
:
1090 OPTSET(INP_RECVDSTADDR
);
1094 OPTSET(INP_RECVTTL
);
1102 OPTSET(INP_ONESBCAST
);
1105 OPTSET(INP_DONTFRAG
);
1108 OPTSET(INP_BINDANY
);
1111 OPTSET(INP_RECVTOS
);
1114 OPTSET2(INP_BINDMULTI
, optval
);
1117 OPTSET2(INP_RECVFLOWID
, optval
);
1120 case IP_RSS_LISTEN_BUCKET
:
1121 if ((optval
>= 0) &&
1122 (optval
< rss_getnumbuckets())) {
1123 inp
->inp_rss_listen_bucket
= optval
;
1124 OPTSET2(INP_RSS_BUCKET_SET
, 1);
1129 case IP_RECVRSSBUCKETID
:
1130 OPTSET2(INP_RECVRSSBUCKETID
, optval
);
1139 * Multicast socket options are processed by the in_mcast
1142 case IP_MULTICAST_IF
:
1143 case IP_MULTICAST_VIF
:
1144 case IP_MULTICAST_TTL
:
1145 case IP_MULTICAST_LOOP
:
1146 case IP_ADD_MEMBERSHIP
:
1147 case IP_DROP_MEMBERSHIP
:
1148 case IP_ADD_SOURCE_MEMBERSHIP
:
1149 case IP_DROP_SOURCE_MEMBERSHIP
:
1150 case IP_BLOCK_SOURCE
:
1151 case IP_UNBLOCK_SOURCE
:
1153 case MCAST_JOIN_GROUP
:
1154 case MCAST_LEAVE_GROUP
:
1155 case MCAST_JOIN_SOURCE_GROUP
:
1156 case MCAST_LEAVE_SOURCE_GROUP
:
1157 case MCAST_BLOCK_SOURCE
:
1158 case MCAST_UNBLOCK_SOURCE
:
1159 error
= inp_setmoptions(inp
, sopt
);
1163 error
= sooptcopyin(sopt
, &optval
, sizeof optval
,
1170 case IP_PORTRANGE_DEFAULT
:
1171 inp
->inp_flags
&= ~(INP_LOWPORT
);
1172 inp
->inp_flags
&= ~(INP_HIGHPORT
);
1175 case IP_PORTRANGE_HIGH
:
1176 inp
->inp_flags
&= ~(INP_LOWPORT
);
1177 inp
->inp_flags
|= INP_HIGHPORT
;
1180 case IP_PORTRANGE_LOW
:
1181 inp
->inp_flags
&= ~(INP_HIGHPORT
);
1182 inp
->inp_flags
|= INP_LOWPORT
;
1193 case IP_IPSEC_POLICY
:
1198 if ((error
= soopt_getm(sopt
, &m
)) != 0) /* XXX */
1200 if ((error
= soopt_mcopyin(sopt
, m
)) != 0) /* XXX */
1202 req
= mtod(m
, caddr_t
);
1203 error
= ipsec_set_policy(inp
, sopt
->sopt_name
, req
,
1204 m
->m_len
, (sopt
->sopt_td
!= NULL
) ?
1205 sopt
->sopt_td
->td_ucred
: NULL
);
1212 error
= ENOPROTOOPT
;
1218 switch (sopt
->sopt_name
) {
1221 if (inp
->inp_options
)
1222 error
= sooptcopyout(sopt
,
1223 mtod(inp
->inp_options
,
1225 inp
->inp_options
->m_len
);
1227 sopt
->sopt_valsize
= 0;
1234 case IP_RECVRETOPTS
:
1235 case IP_RECVDSTADDR
:
1248 case IP_RSSBUCKETID
:
1249 case IP_RECVRSSBUCKETID
:
1251 switch (sopt
->sopt_name
) {
1254 optval
= inp
->inp_ip_tos
;
1258 optval
= inp
->inp_ip_ttl
;
1262 optval
= inp
->inp_ip_minttl
;
1265 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
1266 #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0)
1269 optval
= OPTBIT(INP_RECVOPTS
);
1272 case IP_RECVRETOPTS
:
1273 optval
= OPTBIT(INP_RECVRETOPTS
);
1276 case IP_RECVDSTADDR
:
1277 optval
= OPTBIT(INP_RECVDSTADDR
);
1281 optval
= OPTBIT(INP_RECVTTL
);
1285 optval
= OPTBIT(INP_RECVIF
);
1289 if (inp
->inp_flags
& INP_HIGHPORT
)
1290 optval
= IP_PORTRANGE_HIGH
;
1291 else if (inp
->inp_flags
& INP_LOWPORT
)
1292 optval
= IP_PORTRANGE_LOW
;
1298 optval
= OPTBIT(INP_ONESBCAST
);
1301 optval
= OPTBIT(INP_DONTFRAG
);
1304 optval
= OPTBIT(INP_BINDANY
);
1307 optval
= OPTBIT(INP_RECVTOS
);
1310 optval
= inp
->inp_flowid
;
1313 optval
= inp
->inp_flowtype
;
1316 optval
= OPTBIT2(INP_RECVFLOWID
);
1319 case IP_RSSBUCKETID
:
1320 retval
= rss_hash2bucket(inp
->inp_flowid
,
1324 optval
= rss_bucket
;
1328 case IP_RECVRSSBUCKETID
:
1329 optval
= OPTBIT2(INP_RECVRSSBUCKETID
);
1333 optval
= OPTBIT2(INP_BINDMULTI
);
1336 error
= sooptcopyout(sopt
, &optval
, sizeof optval
);
1340 * Multicast socket options are processed by the in_mcast
1343 case IP_MULTICAST_IF
:
1344 case IP_MULTICAST_VIF
:
1345 case IP_MULTICAST_TTL
:
1346 case IP_MULTICAST_LOOP
:
1348 error
= inp_getmoptions(inp
, sopt
);
1352 case IP_IPSEC_POLICY
:
1354 struct mbuf
*m
= NULL
;
1359 req
= mtod(m
, caddr_t
);
1362 error
= ipsec_get_policy(sotoinpcb(so
), req
, len
, &m
);
1364 error
= soopt_mcopyout(sopt
, m
); /* XXX */
1372 error
= ENOPROTOOPT
;
1381 * Routine called from ip_output() to loop back a copy of an IP multicast
1382 * packet to the input queue of a specified interface. Note that this
1383 * calls the output routine of the loopback "driver", but with an interface
1384 * pointer that might NOT be a loopback interface -- evil, but easier than
1385 * replicating that code here.
/*
 * Parameters as used by the visible code:
 *   ifp   interface handed to if_simloop() for the looped copy
 *   m     original packet; only duplicated with m_dup(), never modified
 *   hlen  IP header length in bytes; used for m_pullup() and in_cksum()
 *
 * Visible stages: duplicate the packet with M_NOWAIT, pull the header into
 * a writable first mbuf if needed, finish any CSUM_DELAY_DATA checksum on
 * the copy and flag it as valid, compute the IP header checksum, then pass
 * the copy to if_simloop() with AF_INET.  A failed m_dup() or m_pullup()
 * leaves copym NULL and nothing is looped back.
 *
 * NOTE(review): this listing is missing lines (declarations, braces and any
 * statements between the visible ones), so confirm details against the
 * complete file.
 */
1388 ip_mloopback(struct ifnet
*ifp
, const struct mbuf
*m
, int hlen
)
1394 * Make a deep copy of the packet because we're going to
1395 * modify the pack in order to generate checksums.
1397 copym
= m_dup(m
, M_NOWAIT
);
1398 if (copym
!= NULL
&& (!M_WRITABLE(copym
) || copym
->m_len
< hlen
))
1399 copym
= m_pullup(copym
, hlen
);
1400 if (copym
!= NULL
) {
1401 /* If needed, compute the checksum and mark it as valid. */
1402 if (copym
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
1403 in_delayed_cksum(copym
);
1404 copym
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1405 copym
->m_pkthdr
.csum_flags
|=
1406 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
;
1407 copym
->m_pkthdr
.csum_data
= 0xffff;
1410 * We don't bother to fragment if the IP length is greater
1411 * than the interface's MTU. Can this possibly matter?
1413 ip
= mtod(copym
, struct ip
*);
1415 ip
->ip_sum
= in_cksum(copym
, hlen
);
1416 if_simloop(ifp
, copym
, AF_INET
, 0);