2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * The Internet Protocol (IP) output module.
8 * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Hirokazu Takahashi, <taka@valinux.co.jp>
20 * See ip_input.c for original log
23 * Alan Cox : Missing nonblock feature in ip_build_xmit.
24 * Mike Kilburn : htons() missing in ip_build_xmit.
25 * Bradford Johnson: Fix faulty handling of some frames when
27 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
28 * (in case if packet not accepted by
29 * output firewall rules)
30 * Mike McLagan : Routing by source
31 * Alexey Kuznetsov: use new route cache
32 * Andi Kleen: Fix broken PMTU recovery and remove
33 * some redundant tests.
34 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
35 * Andi Kleen : Replace ip_reply with ip_send_reply.
36 * Andi Kleen : Split fast and slow ip_build_xmit path
37 * for decreased register pressure on x86
38 * and more readability.
39 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
40 * silently drop skb instead of failing with -EPERM.
41 * Detlev Wengorz : Copy protocol for fragments.
42 * Hirokazu Takahashi: HW checksumming for outgoing UDP
44 * Hirokazu Takahashi: sendfile() on UDP works now.
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <linux/skbuff.h>
76 #include <net/checksum.h>
77 #include <net/inetpeer.h>
78 #include <net/checksum.h>
79 #include <linux/igmp.h>
80 #include <linux/netfilter_ipv4.h>
81 #include <linux/netfilter_bridge.h>
82 #include <linux/mroute.h>
83 #include <linux/netlink.h>
84 #include <linux/tcp.h>
86 int sysctl_ip_default_ttl
= IPDEFTTL
;
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__
void ip_send_check(struct iphdr
*iph
)
92 iph
->check
= ip_fast_csum((unsigned char *)iph
, iph
->ihl
);
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff
*newskb
)
98 newskb
->mac
.raw
= newskb
->data
;
99 __skb_pull(newskb
, newskb
->nh
.raw
- newskb
->data
);
100 newskb
->pkt_type
= PACKET_LOOPBACK
;
101 newskb
->ip_summed
= CHECKSUM_UNNECESSARY
;
102 BUG_TRAP(newskb
->dst
);
107 static inline int ip_select_ttl(struct inet_sock
*inet
, struct dst_entry
*dst
)
109 int ttl
= inet
->uc_ttl
;
112 ttl
= dst_metric(dst
, RTAX_HOPLIMIT
);
117 * Add an ip header to a skbuff and send it out.
120 int ip_build_and_send_pkt(struct sk_buff
*skb
, struct sock
*sk
,
121 u32 saddr
, u32 daddr
, struct ip_options
*opt
)
123 struct inet_sock
*inet
= inet_sk(sk
);
124 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
127 /* Build the IP header. */
129 iph
=(struct iphdr
*)skb_push(skb
,sizeof(struct iphdr
) + opt
->optlen
);
131 iph
=(struct iphdr
*)skb_push(skb
,sizeof(struct iphdr
));
135 iph
->tos
= inet
->tos
;
136 if (ip_dont_fragment(sk
, &rt
->u
.dst
))
137 iph
->frag_off
= htons(IP_DF
);
140 iph
->ttl
= ip_select_ttl(inet
, &rt
->u
.dst
);
141 iph
->daddr
= rt
->rt_dst
;
142 iph
->saddr
= rt
->rt_src
;
143 iph
->protocol
= sk
->sk_protocol
;
144 iph
->tot_len
= htons(skb
->len
);
145 ip_select_ident(iph
, &rt
->u
.dst
, sk
);
148 if (opt
&& opt
->optlen
) {
149 iph
->ihl
+= opt
->optlen
>>2;
150 ip_options_build(skb
, opt
, daddr
, rt
, 0);
154 skb
->priority
= sk
->sk_priority
;
157 return NF_HOOK(PF_INET
, NF_IP_LOCAL_OUT
, skb
, NULL
, rt
->u
.dst
.dev
,
161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt
);
163 static inline int ip_finish_output2(struct sk_buff
*skb
)
165 struct dst_entry
*dst
= skb
->dst
;
166 struct hh_cache
*hh
= dst
->hh
;
167 struct net_device
*dev
= dst
->dev
;
168 int hh_len
= LL_RESERVED_SPACE(dev
);
170 /* Be paranoid, rather than too clever. */
171 if (unlikely(skb_headroom(skb
) < hh_len
&& dev
->hard_header
)) {
172 struct sk_buff
*skb2
;
174 skb2
= skb_realloc_headroom(skb
, LL_RESERVED_SPACE(dev
));
180 skb_set_owner_w(skb2
, skb
->sk
);
188 read_lock_bh(&hh
->hh_lock
);
189 hh_alen
= HH_DATA_ALIGN(hh
->hh_len
);
190 memcpy(skb
->data
- hh_alen
, hh
->hh_data
, hh_alen
);
191 read_unlock_bh(&hh
->hh_lock
);
192 skb_push(skb
, hh
->hh_len
);
193 return hh
->hh_output(skb
);
194 } else if (dst
->neighbour
)
195 return dst
->neighbour
->output(skb
);
198 printk(KERN_DEBUG
"ip_finish_output2: No header cache and no neighbour!\n");
203 static inline int ip_finish_output(struct sk_buff
*skb
)
205 struct net_device
*dev
= skb
->dst
->dev
;
208 skb
->protocol
= htons(ETH_P_IP
);
210 return NF_HOOK(PF_INET
, NF_IP_POST_ROUTING
, skb
, NULL
, dev
,
214 int ip_mc_output(struct sk_buff
*skb
)
216 struct sock
*sk
= skb
->sk
;
217 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
218 struct net_device
*dev
= rt
->u
.dst
.dev
;
221 * If the indicated interface is up and running, send the packet.
223 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS
);
226 skb
->protocol
= htons(ETH_P_IP
);
229 * Multicasts are looped back for other local users
232 if (rt
->rt_flags
&RTCF_MULTICAST
) {
233 if ((!sk
|| inet_sk(sk
)->mc_loop
)
234 #ifdef CONFIG_IP_MROUTE
235 /* Small optimization: do not loopback not local frames,
236 which returned after forwarding; they will be dropped
237 by ip_mr_input in any case.
238 Note, that local frames are looped back to be delivered
241 This check is duplicated in ip_mr_input at the moment.
243 && ((rt
->rt_flags
&RTCF_LOCAL
) || !(IPCB(skb
)->flags
&IPSKB_FORWARDED
))
246 struct sk_buff
*newskb
= skb_clone(skb
, GFP_ATOMIC
);
248 NF_HOOK(PF_INET
, NF_IP_POST_ROUTING
, newskb
, NULL
,
250 ip_dev_loopback_xmit
);
253 /* Multicasts with ttl 0 must not go beyond the host */
255 if (skb
->nh
.iph
->ttl
== 0) {
261 if (rt
->rt_flags
&RTCF_BROADCAST
) {
262 struct sk_buff
*newskb
= skb_clone(skb
, GFP_ATOMIC
);
264 NF_HOOK(PF_INET
, NF_IP_POST_ROUTING
, newskb
, NULL
,
265 newskb
->dev
, ip_dev_loopback_xmit
);
268 if (skb
->len
> dst_mtu(&rt
->u
.dst
))
269 return ip_fragment(skb
, ip_finish_output
);
271 return ip_finish_output(skb
);
274 int ip_output(struct sk_buff
*skb
)
276 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS
);
278 if (skb
->len
> dst_mtu(skb
->dst
) &&
279 !(skb_shinfo(skb
)->ufo_size
|| skb_shinfo(skb
)->tso_size
))
280 return ip_fragment(skb
, ip_finish_output
);
282 return ip_finish_output(skb
);
285 int ip_queue_xmit(struct sk_buff
*skb
, int ipfragok
)
287 struct sock
*sk
= skb
->sk
;
288 struct inet_sock
*inet
= inet_sk(sk
);
289 struct ip_options
*opt
= inet
->opt
;
293 /* Skip all of this if the packet is already routed,
294 * f.e. by something like SCTP.
296 rt
= (struct rtable
*) skb
->dst
;
300 /* Make sure we can route this packet. */
301 rt
= (struct rtable
*)__sk_dst_check(sk
, 0);
305 /* Use correct destination address if we have options. */
311 struct flowi fl
= { .oif
= sk
->sk_bound_dev_if
,
314 .saddr
= inet
->saddr
,
315 .tos
= RT_CONN_FLAGS(sk
) } },
316 .proto
= sk
->sk_protocol
,
318 { .sport
= inet
->sport
,
319 .dport
= inet
->dport
} } };
321 /* If this fails, retransmit mechanism of transport layer will
322 * keep trying until route appears or the connection times
325 if (ip_route_output_flow(&rt
, &fl
, sk
, 0))
328 sk_setup_caps(sk
, &rt
->u
.dst
);
330 skb
->dst
= dst_clone(&rt
->u
.dst
);
333 if (opt
&& opt
->is_strictroute
&& rt
->rt_dst
!= rt
->rt_gateway
)
336 /* OK, we know where to send it, allocate and build IP header. */
337 iph
= (struct iphdr
*) skb_push(skb
, sizeof(struct iphdr
) + (opt
? opt
->optlen
: 0));
338 *((__u16
*)iph
) = htons((4 << 12) | (5 << 8) | (inet
->tos
& 0xff));
339 iph
->tot_len
= htons(skb
->len
);
340 if (ip_dont_fragment(sk
, &rt
->u
.dst
) && !ipfragok
)
341 iph
->frag_off
= htons(IP_DF
);
344 iph
->ttl
= ip_select_ttl(inet
, &rt
->u
.dst
);
345 iph
->protocol
= sk
->sk_protocol
;
346 iph
->saddr
= rt
->rt_src
;
347 iph
->daddr
= rt
->rt_dst
;
349 /* Transport layer set skb->h.foo itself. */
351 if (opt
&& opt
->optlen
) {
352 iph
->ihl
+= opt
->optlen
>> 2;
353 ip_options_build(skb
, opt
, inet
->daddr
, rt
, 0);
356 ip_select_ident_more(iph
, &rt
->u
.dst
, sk
,
357 (skb_shinfo(skb
)->tso_segs
?: 1) - 1);
359 /* Add an IP checksum. */
362 skb
->priority
= sk
->sk_priority
;
364 return NF_HOOK(PF_INET
, NF_IP_LOCAL_OUT
, skb
, NULL
, rt
->u
.dst
.dev
,
368 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES
);
370 return -EHOSTUNREACH
;
374 static void ip_copy_metadata(struct sk_buff
*to
, struct sk_buff
*from
)
376 to
->pkt_type
= from
->pkt_type
;
377 to
->priority
= from
->priority
;
378 to
->protocol
= from
->protocol
;
379 dst_release(to
->dst
);
380 to
->dst
= dst_clone(from
->dst
);
383 /* Copy the flags to each fragment. */
384 IPCB(to
)->flags
= IPCB(from
)->flags
;
386 #ifdef CONFIG_NET_SCHED
387 to
->tc_index
= from
->tc_index
;
389 #ifdef CONFIG_NETFILTER
390 to
->nfmark
= from
->nfmark
;
391 /* Connection association is same as pre-frag packet */
392 nf_conntrack_put(to
->nfct
);
393 to
->nfct
= from
->nfct
;
394 nf_conntrack_get(to
->nfct
);
395 to
->nfctinfo
= from
->nfctinfo
;
396 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
397 to
->ipvs_property
= from
->ipvs_property
;
399 #ifdef CONFIG_BRIDGE_NETFILTER
400 nf_bridge_put(to
->nf_bridge
);
401 to
->nf_bridge
= from
->nf_bridge
;
402 nf_bridge_get(to
->nf_bridge
);
408 * This IP datagram is too large to be sent in one piece. Break it up into
409 * smaller pieces (each of size equal to IP header plus
410 * a block of the data of the original IP data part) that will yet fit in a
411 * single device frame, and queue such a frame for sending.
414 int ip_fragment(struct sk_buff
*skb
, int (*output
)(struct sk_buff
*))
419 struct net_device
*dev
;
420 struct sk_buff
*skb2
;
421 unsigned int mtu
, hlen
, left
, len
, ll_rs
;
424 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
430 * Point into the IP datagram header.
435 if (unlikely((iph
->frag_off
& htons(IP_DF
)) && !skb
->local_df
)) {
436 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
,
437 htonl(dst_mtu(&rt
->u
.dst
)));
443 * Setup starting values.
447 mtu
= dst_mtu(&rt
->u
.dst
) - hlen
; /* Size of data space */
449 /* When frag_list is given, use it. First, check its validity:
450 * some transformers could create wrong frag_list or break existing
451 * one, it is not prohibited. In this case fall back to copying.
453 * LATER: this step can be merged to real generation of fragments,
454 * we can switch to copy when see the first bad fragment.
456 if (skb_shinfo(skb
)->frag_list
) {
457 struct sk_buff
*frag
;
458 int first_len
= skb_pagelen(skb
);
460 if (first_len
- hlen
> mtu
||
461 ((first_len
- hlen
) & 7) ||
462 (iph
->frag_off
& htons(IP_MF
|IP_OFFSET
)) ||
466 for (frag
= skb_shinfo(skb
)->frag_list
; frag
; frag
= frag
->next
) {
467 /* Correct geometry. */
468 if (frag
->len
> mtu
||
469 ((frag
->len
& 7) && frag
->next
) ||
470 skb_headroom(frag
) < hlen
)
473 /* Partially cloned skb? */
474 if (skb_shared(frag
))
481 frag
->destructor
= sock_wfree
;
482 skb
->truesize
-= frag
->truesize
;
486 /* Everything is OK. Generate! */
490 frag
= skb_shinfo(skb
)->frag_list
;
491 skb_shinfo(skb
)->frag_list
= NULL
;
492 skb
->data_len
= first_len
- skb_headlen(skb
);
493 skb
->len
= first_len
;
494 iph
->tot_len
= htons(first_len
);
495 iph
->frag_off
= htons(IP_MF
);
499 /* Prepare header of the next frame,
500 * before previous one went down. */
502 frag
->ip_summed
= CHECKSUM_NONE
;
503 frag
->h
.raw
= frag
->data
;
504 frag
->nh
.raw
= __skb_push(frag
, hlen
);
505 memcpy(frag
->nh
.raw
, iph
, hlen
);
507 iph
->tot_len
= htons(frag
->len
);
508 ip_copy_metadata(frag
, skb
);
510 ip_options_fragment(frag
);
511 offset
+= skb
->len
- hlen
;
512 iph
->frag_off
= htons(offset
>>3);
513 if (frag
->next
!= NULL
)
514 iph
->frag_off
|= htons(IP_MF
);
515 /* Ready, complete checksum */
530 IP_INC_STATS(IPSTATS_MIB_FRAGOKS
);
539 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS
);
544 left
= skb
->len
- hlen
; /* Space per frame */
545 ptr
= raw
+ hlen
; /* Where to start from */
547 #ifdef CONFIG_BRIDGE_NETFILTER
548 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
549 * we need to make room for the encapsulating header */
550 ll_rs
= LL_RESERVED_SPACE_EXTRA(rt
->u
.dst
.dev
, nf_bridge_pad(skb
));
551 mtu
-= nf_bridge_pad(skb
);
553 ll_rs
= LL_RESERVED_SPACE(rt
->u
.dst
.dev
);
556 * Fragment the datagram.
559 offset
= (ntohs(iph
->frag_off
) & IP_OFFSET
) << 3;
560 not_last_frag
= iph
->frag_off
& htons(IP_MF
);
563 * Keep copying data until we run out.
568 /* IF: it doesn't fit, use 'mtu' - the data space left */
571 /* IF: we are not sending up to and including the packet end
572 then align the next start on an eight byte boundary */
580 if ((skb2
= alloc_skb(len
+hlen
+ll_rs
, GFP_ATOMIC
)) == NULL
) {
581 NETDEBUG(KERN_INFO
"IP: frag: no memory for new fragment!\n");
587 * Set up data on packet
590 ip_copy_metadata(skb2
, skb
);
591 skb_reserve(skb2
, ll_rs
);
592 skb_put(skb2
, len
+ hlen
);
593 skb2
->nh
.raw
= skb2
->data
;
594 skb2
->h
.raw
= skb2
->data
+ hlen
;
597 * Charge the memory for the fragment to any owner
602 skb_set_owner_w(skb2
, skb
->sk
);
605 * Copy the packet header into the new buffer.
608 memcpy(skb2
->nh
.raw
, skb
->data
, hlen
);
611 * Copy a block of the IP datagram.
613 if (skb_copy_bits(skb
, ptr
, skb2
->h
.raw
, len
))
618 * Fill in the new header fields.
621 iph
->frag_off
= htons((offset
>> 3));
623 /* ANK: dirty, but effective trick. Upgrade options only if
624 * the segment to be fragmented was THE FIRST (otherwise,
625 * options are already fixed) and make it ONCE
626 * on the initial skb, so that all the following fragments
627 * will inherit fixed options.
630 ip_options_fragment(skb
);
633 * Added AC : If we are fragmenting a fragment that's not the
634 * last fragment then keep MF on each bit
636 if (left
> 0 || not_last_frag
)
637 iph
->frag_off
|= htons(IP_MF
);
642 * Put this fragment into the sending queue.
645 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES
);
647 iph
->tot_len
= htons(len
+ hlen
);
656 IP_INC_STATS(IPSTATS_MIB_FRAGOKS
);
661 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS
);
666 ip_generic_getfrag(void *from
, char *to
, int offset
, int len
, int odd
, struct sk_buff
*skb
)
668 struct iovec
*iov
= from
;
670 if (skb
->ip_summed
== CHECKSUM_HW
) {
671 if (memcpy_fromiovecend(to
, iov
, offset
, len
) < 0)
674 unsigned int csum
= 0;
675 if (csum_partial_copy_fromiovecend(to
, iov
, offset
, len
, &csum
) < 0)
677 skb
->csum
= csum_block_add(skb
->csum
, csum
, odd
);
682 static inline unsigned int
683 csum_page(struct page
*page
, int offset
, int copy
)
688 csum
= csum_partial(kaddr
+ offset
, copy
, 0);
693 inline int ip_ufo_append_data(struct sock
*sk
,
694 int getfrag(void *from
, char *to
, int offset
, int len
,
695 int odd
, struct sk_buff
*skb
),
696 void *from
, int length
, int hh_len
, int fragheaderlen
,
697 int transhdrlen
, int mtu
,unsigned int flags
)
702 /* There is support for UDP fragmentation offload by network
703 * device, so create one single skb packet containing complete
706 if ((skb
= skb_peek_tail(&sk
->sk_write_queue
)) == NULL
) {
707 skb
= sock_alloc_send_skb(sk
,
708 hh_len
+ fragheaderlen
+ transhdrlen
+ 20,
709 (flags
& MSG_DONTWAIT
), &err
);
714 /* reserve space for Hardware header */
715 skb_reserve(skb
, hh_len
);
717 /* create space for UDP/IP header */
718 skb_put(skb
,fragheaderlen
+ transhdrlen
);
720 /* initialize network header pointer */
721 skb
->nh
.raw
= skb
->data
;
723 /* initialize protocol header pointer */
724 skb
->h
.raw
= skb
->data
+ fragheaderlen
;
726 skb
->ip_summed
= CHECKSUM_HW
;
728 sk
->sk_sndmsg_off
= 0;
731 err
= skb_append_datato_frags(sk
,skb
, getfrag
, from
,
732 (length
- transhdrlen
));
734 /* specify the length of each IP datagram fragment*/
735 skb_shinfo(skb
)->ufo_size
= (mtu
- fragheaderlen
);
736 __skb_queue_tail(&sk
->sk_write_queue
, skb
);
740 /* There is not enough support to do UFO,
741 * so follow normal path
748 * ip_append_data() and ip_append_page() can make one large IP datagram
749 * from many pieces of data. Each piece will be held on the socket
750 * until ip_push_pending_frames() is called. Each piece can be a page
753 * Not only UDP, other transport protocols - e.g. raw sockets - can use
754 * this interface potentially.
756 * LATER: length must be adjusted by pad at tail, when it is required.
758 int ip_append_data(struct sock
*sk
,
759 int getfrag(void *from
, char *to
, int offset
, int len
,
760 int odd
, struct sk_buff
*skb
),
761 void *from
, int length
, int transhdrlen
,
762 struct ipcm_cookie
*ipc
, struct rtable
*rt
,
765 struct inet_sock
*inet
= inet_sk(sk
);
768 struct ip_options
*opt
= NULL
;
775 unsigned int maxfraglen
, fragheaderlen
;
776 int csummode
= CHECKSUM_NONE
;
781 if (skb_queue_empty(&sk
->sk_write_queue
)) {
787 if (inet
->cork
.opt
== NULL
) {
788 inet
->cork
.opt
= kmalloc(sizeof(struct ip_options
) + 40, sk
->sk_allocation
);
789 if (unlikely(inet
->cork
.opt
== NULL
))
792 memcpy(inet
->cork
.opt
, opt
, sizeof(struct ip_options
)+opt
->optlen
);
793 inet
->cork
.flags
|= IPCORK_OPT
;
794 inet
->cork
.addr
= ipc
->addr
;
796 dst_hold(&rt
->u
.dst
);
797 inet
->cork
.fragsize
= mtu
= dst_mtu(rt
->u
.dst
.path
);
799 inet
->cork
.length
= 0;
800 sk
->sk_sndmsg_page
= NULL
;
801 sk
->sk_sndmsg_off
= 0;
802 if ((exthdrlen
= rt
->u
.dst
.header_len
) != 0) {
804 transhdrlen
+= exthdrlen
;
808 if (inet
->cork
.flags
& IPCORK_OPT
)
809 opt
= inet
->cork
.opt
;
813 mtu
= inet
->cork
.fragsize
;
815 hh_len
= LL_RESERVED_SPACE(rt
->u
.dst
.dev
);
817 fragheaderlen
= sizeof(struct iphdr
) + (opt
? opt
->optlen
: 0);
818 maxfraglen
= ((mtu
- fragheaderlen
) & ~7) + fragheaderlen
;
820 if (inet
->cork
.length
+ length
> 0xFFFF - fragheaderlen
) {
821 ip_local_error(sk
, EMSGSIZE
, rt
->rt_dst
, inet
->dport
, mtu
-exthdrlen
);
826 * transhdrlen > 0 means that this is the first fragment and we wish
827 * it won't be fragmented in the future.
830 length
+ fragheaderlen
<= mtu
&&
831 rt
->u
.dst
.dev
->features
&(NETIF_F_IP_CSUM
|NETIF_F_NO_CSUM
|NETIF_F_HW_CSUM
) &&
833 csummode
= CHECKSUM_HW
;
835 inet
->cork
.length
+= length
;
836 if (((length
> mtu
) && (sk
->sk_protocol
== IPPROTO_UDP
)) &&
837 (rt
->u
.dst
.dev
->features
& NETIF_F_UFO
)) {
839 if(ip_ufo_append_data(sk
, getfrag
, from
, length
, hh_len
,
840 fragheaderlen
, transhdrlen
, mtu
, flags
))
846 /* So, what's going on in the loop below?
848 * We use calculated fragment length to generate chained skb,
849 * each of segments is IP fragment ready for sending to network after
850 * adding appropriate IP header.
853 if ((skb
= skb_peek_tail(&sk
->sk_write_queue
)) == NULL
)
857 /* Check if the remaining data fits into current packet. */
858 copy
= mtu
- skb
->len
;
860 copy
= maxfraglen
- skb
->len
;
863 unsigned int datalen
;
864 unsigned int fraglen
;
865 unsigned int fraggap
;
866 unsigned int alloclen
;
867 struct sk_buff
*skb_prev
;
871 fraggap
= skb_prev
->len
- maxfraglen
;
876 * If remaining data exceeds the mtu,
877 * we know we need more fragment(s).
879 datalen
= length
+ fraggap
;
880 if (datalen
> mtu
- fragheaderlen
)
881 datalen
= maxfraglen
- fragheaderlen
;
882 fraglen
= datalen
+ fragheaderlen
;
884 if ((flags
& MSG_MORE
) &&
885 !(rt
->u
.dst
.dev
->features
&NETIF_F_SG
))
888 alloclen
= datalen
+ fragheaderlen
;
890 /* The last fragment gets additional space at tail.
891 * Note, with MSG_MORE we overallocate on fragments,
892 * because we have no idea what fragment will be
895 if (datalen
== length
)
896 alloclen
+= rt
->u
.dst
.trailer_len
;
899 skb
= sock_alloc_send_skb(sk
,
900 alloclen
+ hh_len
+ 15,
901 (flags
& MSG_DONTWAIT
), &err
);
904 if (atomic_read(&sk
->sk_wmem_alloc
) <=
906 skb
= sock_wmalloc(sk
,
907 alloclen
+ hh_len
+ 15, 1,
909 if (unlikely(skb
== NULL
))
916 * Fill in the control structures
918 skb
->ip_summed
= csummode
;
920 skb_reserve(skb
, hh_len
);
923 * Find where to start putting bytes.
925 data
= skb_put(skb
, fraglen
);
926 skb
->nh
.raw
= data
+ exthdrlen
;
927 data
+= fragheaderlen
;
928 skb
->h
.raw
= data
+ exthdrlen
;
931 skb
->csum
= skb_copy_and_csum_bits(
932 skb_prev
, maxfraglen
,
933 data
+ transhdrlen
, fraggap
, 0);
934 skb_prev
->csum
= csum_sub(skb_prev
->csum
,
937 skb_trim(skb_prev
, maxfraglen
);
940 copy
= datalen
- transhdrlen
- fraggap
;
941 if (copy
> 0 && getfrag(from
, data
+ transhdrlen
, offset
, copy
, fraggap
, skb
) < 0) {
948 length
-= datalen
- fraggap
;
951 csummode
= CHECKSUM_NONE
;
954 * Put the packet on the pending queue.
956 __skb_queue_tail(&sk
->sk_write_queue
, skb
);
963 if (!(rt
->u
.dst
.dev
->features
&NETIF_F_SG
)) {
967 if (getfrag(from
, skb_put(skb
, copy
),
968 offset
, copy
, off
, skb
) < 0) {
969 __skb_trim(skb
, off
);
974 int i
= skb_shinfo(skb
)->nr_frags
;
975 skb_frag_t
*frag
= &skb_shinfo(skb
)->frags
[i
-1];
976 struct page
*page
= sk
->sk_sndmsg_page
;
977 int off
= sk
->sk_sndmsg_off
;
980 if (page
&& (left
= PAGE_SIZE
- off
) > 0) {
983 if (page
!= frag
->page
) {
984 if (i
== MAX_SKB_FRAGS
) {
989 skb_fill_page_desc(skb
, i
, page
, sk
->sk_sndmsg_off
, 0);
990 frag
= &skb_shinfo(skb
)->frags
[i
];
992 } else if (i
< MAX_SKB_FRAGS
) {
993 if (copy
> PAGE_SIZE
)
995 page
= alloc_pages(sk
->sk_allocation
, 0);
1000 sk
->sk_sndmsg_page
= page
;
1001 sk
->sk_sndmsg_off
= 0;
1003 skb_fill_page_desc(skb
, i
, page
, 0, 0);
1004 frag
= &skb_shinfo(skb
)->frags
[i
];
1005 skb
->truesize
+= PAGE_SIZE
;
1006 atomic_add(PAGE_SIZE
, &sk
->sk_wmem_alloc
);
1011 if (getfrag(from
, page_address(frag
->page
)+frag
->page_offset
+frag
->size
, offset
, copy
, skb
->len
, skb
) < 0) {
1015 sk
->sk_sndmsg_off
+= copy
;
1018 skb
->data_len
+= copy
;
1027 inet
->cork
.length
-= length
;
1028 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS
);
1032 ssize_t
ip_append_page(struct sock
*sk
, struct page
*page
,
1033 int offset
, size_t size
, int flags
)
1035 struct inet_sock
*inet
= inet_sk(sk
);
1036 struct sk_buff
*skb
;
1038 struct ip_options
*opt
= NULL
;
1043 unsigned int maxfraglen
, fragheaderlen
, fraggap
;
1048 if (flags
&MSG_PROBE
)
1051 if (skb_queue_empty(&sk
->sk_write_queue
))
1055 if (inet
->cork
.flags
& IPCORK_OPT
)
1056 opt
= inet
->cork
.opt
;
1058 if (!(rt
->u
.dst
.dev
->features
&NETIF_F_SG
))
1061 hh_len
= LL_RESERVED_SPACE(rt
->u
.dst
.dev
);
1062 mtu
= inet
->cork
.fragsize
;
1064 fragheaderlen
= sizeof(struct iphdr
) + (opt
? opt
->optlen
: 0);
1065 maxfraglen
= ((mtu
- fragheaderlen
) & ~7) + fragheaderlen
;
1067 if (inet
->cork
.length
+ size
> 0xFFFF - fragheaderlen
) {
1068 ip_local_error(sk
, EMSGSIZE
, rt
->rt_dst
, inet
->dport
, mtu
);
1072 if ((skb
= skb_peek_tail(&sk
->sk_write_queue
)) == NULL
)
1075 inet
->cork
.length
+= size
;
1076 if ((sk
->sk_protocol
== IPPROTO_UDP
) &&
1077 (rt
->u
.dst
.dev
->features
& NETIF_F_UFO
))
1078 skb_shinfo(skb
)->ufo_size
= (mtu
- fragheaderlen
);
1084 if (skb_shinfo(skb
)->ufo_size
)
1088 /* Check if the remaining data fits into current packet. */
1089 len
= mtu
- skb
->len
;
1091 len
= maxfraglen
- skb
->len
;
1094 struct sk_buff
*skb_prev
;
1100 fraggap
= skb_prev
->len
- maxfraglen
;
1102 alloclen
= fragheaderlen
+ hh_len
+ fraggap
+ 15;
1103 skb
= sock_wmalloc(sk
, alloclen
, 1, sk
->sk_allocation
);
1104 if (unlikely(!skb
)) {
1110 * Fill in the control structures
1112 skb
->ip_summed
= CHECKSUM_NONE
;
1114 skb_reserve(skb
, hh_len
);
1117 * Find where to start putting bytes.
1119 data
= skb_put(skb
, fragheaderlen
+ fraggap
);
1120 skb
->nh
.iph
= iph
= (struct iphdr
*)data
;
1121 data
+= fragheaderlen
;
1125 skb
->csum
= skb_copy_and_csum_bits(
1126 skb_prev
, maxfraglen
,
1128 skb_prev
->csum
= csum_sub(skb_prev
->csum
,
1130 skb_trim(skb_prev
, maxfraglen
);
1134 * Put the packet on the pending queue.
1136 __skb_queue_tail(&sk
->sk_write_queue
, skb
);
1140 i
= skb_shinfo(skb
)->nr_frags
;
1143 if (skb_can_coalesce(skb
, i
, page
, offset
)) {
1144 skb_shinfo(skb
)->frags
[i
-1].size
+= len
;
1145 } else if (i
< MAX_SKB_FRAGS
) {
1147 skb_fill_page_desc(skb
, i
, page
, offset
, len
);
1153 if (skb
->ip_summed
== CHECKSUM_NONE
) {
1155 csum
= csum_page(page
, offset
, len
);
1156 skb
->csum
= csum_block_add(skb
->csum
, csum
, skb
->len
);
1160 skb
->data_len
+= len
;
1167 inet
->cork
.length
-= size
;
1168 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS
);
1173 * Combine all pending IP fragments on the socket into one IP datagram
1174 * and push them out.
1176 int ip_push_pending_frames(struct sock
*sk
)
1178 struct sk_buff
*skb
, *tmp_skb
;
1179 struct sk_buff
**tail_skb
;
1180 struct inet_sock
*inet
= inet_sk(sk
);
1181 struct ip_options
*opt
= NULL
;
1182 struct rtable
*rt
= inet
->cork
.rt
;
1188 if ((skb
= __skb_dequeue(&sk
->sk_write_queue
)) == NULL
)
1190 tail_skb
= &(skb_shinfo(skb
)->frag_list
);
1192 /* move skb->data to ip header from ext header */
1193 if (skb
->data
< skb
->nh
.raw
)
1194 __skb_pull(skb
, skb
->nh
.raw
- skb
->data
);
1195 while ((tmp_skb
= __skb_dequeue(&sk
->sk_write_queue
)) != NULL
) {
1196 __skb_pull(tmp_skb
, skb
->h
.raw
- skb
->nh
.raw
);
1197 *tail_skb
= tmp_skb
;
1198 tail_skb
= &(tmp_skb
->next
);
1199 skb
->len
+= tmp_skb
->len
;
1200 skb
->data_len
+= tmp_skb
->len
;
1201 skb
->truesize
+= tmp_skb
->truesize
;
1202 __sock_put(tmp_skb
->sk
);
1203 tmp_skb
->destructor
= NULL
;
1207 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1208 * to fragment the frame generated here. No matter how transforms
1209 * change the size of the packet, it will come out.
1211 if (inet
->pmtudisc
!= IP_PMTUDISC_DO
)
1214 /* DF bit is set when we want to see DF on outgoing frames.
1215 * If local_df is set too, we still allow to fragment this frame
1217 if (inet
->pmtudisc
== IP_PMTUDISC_DO
||
1218 (skb
->len
<= dst_mtu(&rt
->u
.dst
) &&
1219 ip_dont_fragment(sk
, &rt
->u
.dst
)))
1222 if (inet
->cork
.flags
& IPCORK_OPT
)
1223 opt
= inet
->cork
.opt
;
1225 if (rt
->rt_type
== RTN_MULTICAST
)
1228 ttl
= ip_select_ttl(inet
, &rt
->u
.dst
);
1230 iph
= (struct iphdr
*)skb
->data
;
1234 iph
->ihl
+= opt
->optlen
>>2;
1235 ip_options_build(skb
, opt
, inet
->cork
.addr
, rt
, 0);
1237 iph
->tos
= inet
->tos
;
1238 iph
->tot_len
= htons(skb
->len
);
1241 __ip_select_ident(iph
, &rt
->u
.dst
, 0);
1243 iph
->id
= htons(inet
->id
++);
1246 iph
->protocol
= sk
->sk_protocol
;
1247 iph
->saddr
= rt
->rt_src
;
1248 iph
->daddr
= rt
->rt_dst
;
1251 skb
->priority
= sk
->sk_priority
;
1252 skb
->dst
= dst_clone(&rt
->u
.dst
);
1254 /* Netfilter gets the whole, not yet fragmented skb. */
1255 err
= NF_HOOK(PF_INET
, NF_IP_LOCAL_OUT
, skb
, NULL
,
1256 skb
->dst
->dev
, dst_output
);
1259 err
= inet
->recverr
? net_xmit_errno(err
) : 0;
1265 inet
->cork
.flags
&= ~IPCORK_OPT
;
1266 kfree(inet
->cork
.opt
);
1267 inet
->cork
.opt
= NULL
;
1268 if (inet
->cork
.rt
) {
1269 ip_rt_put(inet
->cork
.rt
);
1270 inet
->cork
.rt
= NULL
;
1275 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS
);
1280 * Throw away all pending data on the socket.
1282 void ip_flush_pending_frames(struct sock
*sk
)
1284 struct inet_sock
*inet
= inet_sk(sk
);
1285 struct sk_buff
*skb
;
1287 while ((skb
= __skb_dequeue_tail(&sk
->sk_write_queue
)) != NULL
)
1290 inet
->cork
.flags
&= ~IPCORK_OPT
;
1291 kfree(inet
->cork
.opt
);
1292 inet
->cork
.opt
= NULL
;
1293 if (inet
->cork
.rt
) {
1294 ip_rt_put(inet
->cork
.rt
);
1295 inet
->cork
.rt
= NULL
;
1301 * Fetch data from kernel space and fill in checksum if needed.
1303 static int ip_reply_glue_bits(void *dptr
, char *to
, int offset
,
1304 int len
, int odd
, struct sk_buff
*skb
)
1308 csum
= csum_partial_copy_nocheck(dptr
+offset
, to
, len
, 0);
1309 skb
->csum
= csum_block_add(skb
->csum
, csum
, odd
);
1314 * Generic function to send a packet as reply to another packet.
1315 * Used to send TCP resets so far. ICMP should use this function too.
1317 * Should run single threaded per socket because it uses the sock
1318 * structure to pass arguments.
1320 * LATER: switch from ip_build_xmit to ip_append_*
1322 void ip_send_reply(struct sock
*sk
, struct sk_buff
*skb
, struct ip_reply_arg
*arg
,
1325 struct inet_sock
*inet
= inet_sk(sk
);
1327 struct ip_options opt
;
1330 struct ipcm_cookie ipc
;
1332 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
1334 if (ip_options_echo(&replyopts
.opt
, skb
))
1337 daddr
= ipc
.addr
= rt
->rt_src
;
1340 if (replyopts
.opt
.optlen
) {
1341 ipc
.opt
= &replyopts
.opt
;
1344 daddr
= replyopts
.opt
.faddr
;
1348 struct flowi fl
= { .nl_u
= { .ip4_u
=
1350 .saddr
= rt
->rt_spec_dst
,
1351 .tos
= RT_TOS(skb
->nh
.iph
->tos
) } },
1352 /* Not quite clean, but right. */
1354 { .sport
= skb
->h
.th
->dest
,
1355 .dport
= skb
->h
.th
->source
} },
1356 .proto
= sk
->sk_protocol
};
1357 if (ip_route_output_key(&rt
, &fl
))
1361 /* And let IP do all the hard work.
1363 This chunk is not reentrant, hence spinlock.
1364 Note that it uses the fact, that this function is called
1365 with locally disabled BH and that sk cannot be already spinlocked.
1368 inet
->tos
= skb
->nh
.iph
->tos
;
1369 sk
->sk_priority
= skb
->priority
;
1370 sk
->sk_protocol
= skb
->nh
.iph
->protocol
;
1371 ip_append_data(sk
, ip_reply_glue_bits
, arg
->iov
->iov_base
, len
, 0,
1372 &ipc
, rt
, MSG_DONTWAIT
);
1373 if ((skb
= skb_peek(&sk
->sk_write_queue
)) != NULL
) {
1374 if (arg
->csumoffset
>= 0)
1375 *((u16
*)skb
->h
.raw
+ arg
->csumoffset
) = csum_fold(csum_add(skb
->csum
, arg
->csum
));
1376 skb
->ip_summed
= CHECKSUM_NONE
;
1377 ip_push_pending_frames(sk
);
1385 void __init
ip_init(void)
1390 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1391 igmp_mc_proc_init();
1395 EXPORT_SYMBOL(ip_fragment
);
1396 EXPORT_SYMBOL(ip_generic_getfrag
);
1397 EXPORT_SYMBOL(ip_queue_xmit
);
1398 EXPORT_SYMBOL(ip_send_check
);