/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
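/*
 * Usage sketch (illustrative): ip_fast_csum() sums iph->ihl 32-bit words,
 * so any IP options are covered as well.  The check field must be zero
 * while the sum is computed, which is why ip_send_check() clears it first.
 * Any code that rewrites header fields can refresh the checksum the same
 * way, e.g. "iph->tos = tos; ip_send_check(iph);".
 */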
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
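/*
 * Note: nf_hook() returns 1 when the LOCAL_OUT hooks accept the packet and
 * the caller is expected to transmit it itself; any other value means
 * netfilter already queued, stole or dropped the skb.  That is why
 * ip_local_out() only calls dst_output() when __ip_local_out() returned 1.
 */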
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}
/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen >> 2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
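/*
 * Usage sketch (illustrative): this helper is typically used for packets
 * sent on behalf of a connection that is not yet fully established - e.g.
 * a TCP SYN-ACK - where the caller passes saddr/daddr and any echoed IP
 * options explicitly instead of relying on connected-socket state.
 */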
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
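/*
 * Note: with IP_PMTUDISC_PROBE the socket is probing the path MTU itself,
 * so the length check in ip_finish_output() below compares against the
 * device MTU rather than the (possibly smaller) MTU cached on the route.
 */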
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
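/*
 * Worked example (illustrative): the single 16-bit store above builds the
 * version/IHL/TOS part of the header in one go.  (4 << 12) is 0x4000 and
 * (5 << 8) is 0x0500, so for tos == 0x10 the stored value is htons(0x4510):
 * version 4, header length 5 words (20 bytes), TOS 0x10.  tot_len is filled
 * in later by __ip_local_out(), and check by ip_send_check().
 */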
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */
	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */
	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when we see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */
		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */
	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer. */
		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */
		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF set on each fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
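/*
 * Worked example (illustrative): with a 1006 byte route MTU and a 20 byte
 * header, the data space per fragment is 986 bytes.  Every fragment except
 * the last must carry a multiple of 8 data bytes (the offset field counts
 * 8-byte units), so the slow path trims len to 984 via "len &= ~7"; only
 * the final fragment may carry the shorter remainder.
 */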
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);
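/*
 * Note: getfrag() callbacks such as this one are handed to
 * __ip_append_data() below.  They copy "len" bytes of payload, starting at
 * "offset" within the source, into "to", and when the hardware cannot
 * checksum (ip_summed != CHECKSUM_PARTIAL) they also fold the partial
 * checksum into skb->csum at the byte parity given by "odd".
 */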
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
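/*
 * Note: rather than building one skb per fragment, the UFO path queues a
 * single oversized skb and records the eventual on-the-wire fragment
 * payload size in gso_size (maxfraglen - fragheaderlen).  The device, or
 * the software GSO fallback, slices it into MTU-sized IP fragments at
 * transmit time.
 */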
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
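/*
 * Worked example (illustrative): with a 1006 byte cork->fragsize and no IP
 * options, fragheaderlen is 20, so maxfraglen = ((1006 - 20) & ~7) + 20 =
 * 984 + 20 = 1004.  Every fragment built above therefore carries 984
 * payload bytes (a multiple of 8, as the fragment offset field requires),
 * and only the last one may be shorter.
 */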
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP: other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {
			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}
/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here.  No matter how transforms
	 * change the size of the packet, it will still come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
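/*
 * Note: the df/local_df handling above implements the socket's PMTU policy.
 * With IP_PMTUDISC_DO (or stronger) the datagram always carries DF and is
 * never fragmented locally; with IP_PMTUDISC_WANT the DF bit is only set
 * when the datagram already fits the route MTU, and local_df keeps
 * ip_fragment() available as a fallback for anything larger.
 */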
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}
/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}
struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}
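/*
 * Note: ip_make_skb() is the "corkless" variant of ip_append_data() plus
 * __ip_make_skb(): the datagram is built on a private queue and cork held
 * on the stack, so a single send can be assembled and handed straight to
 * ip_send_skb() without touching the socket's pending-frames state.
 */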
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(ip_hdr(skb)->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not re-entrant, hence the spinlock.
	   Note that it uses the fact that this function is called
	   with locally disabled BH and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
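/*
 * Note: arg->csumoffset is measured in 16-bit words from the start of the
 * transport header, so the expression above patches the TCP checksum field
 * of the reply in place: the payload checksum accumulated in skb->csum is
 * added to the caller-provided pseudo-header sum in arg->csum and folded
 * down to 16 bits.
 */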
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}