[MIPS] Delete duplicate definitions of break codes.
[linux-2.6/mini2440.git] / net / ipv4 / ip_output.c
blob11c2f68254f0a8a04602222216a03578c0327d20
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * The Internet Protocol (IP) output module.
8 * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Hirokazu Takahashi, <taka@valinux.co.jp>
20 * See ip_input.c for original log
22 * Fixes:
23 * Alan Cox : Missing nonblock feature in ip_build_xmit.
24 * Mike Kilburn : htons() missing in ip_build_xmit.
25 * Bradford Johnson: Fix faulty handling of some frames when
26 * no route is found.
27 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
28 * (in case if packet not accepted by
29 * output firewall rules)
30 * Mike McLagan : Routing by source
31 * Alexey Kuznetsov: use new route cache
32 * Andi Kleen: Fix broken PMTU recovery and remove
33 * some redundant tests.
34 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
35 * Andi Kleen : Replace ip_reply with ip_send_reply.
36 * Andi Kleen : Split fast and slow ip_build_xmit path
37 * for decreased register pressure on x86
38 * and more readability.
39 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
40 * silently drop skb instead of failing with -EPERM.
41 * Detlev Wengorz : Copy protocol for fragments.
42 * Hirokazu Takahashi: HW checksumming for outgoing UDP
43 * datagrams.
44 * Hirokazu Takahashi: sendfile() on UDP works now.
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <linux/skbuff.h>
73 #include <net/sock.h>
74 #include <net/arp.h>
75 #include <net/icmp.h>
76 #include <net/checksum.h>
77 #include <net/inetpeer.h>
78 #include <net/checksum.h>
79 #include <linux/igmp.h>
80 #include <linux/netfilter_ipv4.h>
81 #include <linux/netfilter_bridge.h>
82 #include <linux/mroute.h>
83 #include <linux/netlink.h>
84 #include <linux/tcp.h>
86 int sysctl_ip_default_ttl = IPDEFTTL;
88 /* Generate a checksum for an outgoing IP datagram. */
89 __inline__ void ip_send_check(struct iphdr *iph)
91 iph->check = 0;
92 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
95 /* dev_loopback_xmit for use with netfilter. */
96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
98 newskb->mac.raw = newskb->data;
99 __skb_pull(newskb, newskb->nh.raw - newskb->data);
100 newskb->pkt_type = PACKET_LOOPBACK;
101 newskb->ip_summed = CHECKSUM_UNNECESSARY;
102 BUG_TRAP(newskb->dst);
103 netif_rx(newskb);
104 return 0;
107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
109 int ttl = inet->uc_ttl;
111 if (ttl < 0)
112 ttl = dst_metric(dst, RTAX_HOPLIMIT);
113 return ttl;
117 * Add an ip header to a skbuff and send it out.
120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
121 u32 saddr, u32 daddr, struct ip_options *opt)
123 struct inet_sock *inet = inet_sk(sk);
124 struct rtable *rt = (struct rtable *)skb->dst;
125 struct iphdr *iph;
127 /* Build the IP header. */
128 if (opt)
129 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
130 else
131 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
133 iph->version = 4;
134 iph->ihl = 5;
135 iph->tos = inet->tos;
136 if (ip_dont_fragment(sk, &rt->u.dst))
137 iph->frag_off = htons(IP_DF);
138 else
139 iph->frag_off = 0;
140 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
141 iph->daddr = rt->rt_dst;
142 iph->saddr = rt->rt_src;
143 iph->protocol = sk->sk_protocol;
144 iph->tot_len = htons(skb->len);
145 ip_select_ident(iph, &rt->u.dst, sk);
146 skb->nh.iph = iph;
148 if (opt && opt->optlen) {
149 iph->ihl += opt->optlen>>2;
150 ip_options_build(skb, opt, daddr, rt, 0);
152 ip_send_check(iph);
154 skb->priority = sk->sk_priority;
156 /* Send it out. */
157 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
158 dst_output);
161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
163 static inline int ip_finish_output2(struct sk_buff *skb)
165 struct dst_entry *dst = skb->dst;
166 struct hh_cache *hh = dst->hh;
167 struct net_device *dev = dst->dev;
168 int hh_len = LL_RESERVED_SPACE(dev);
170 /* Be paranoid, rather than too clever. */
171 if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
172 struct sk_buff *skb2;
174 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
175 if (skb2 == NULL) {
176 kfree_skb(skb);
177 return -ENOMEM;
179 if (skb->sk)
180 skb_set_owner_w(skb2, skb->sk);
181 kfree_skb(skb);
182 skb = skb2;
185 if (hh) {
186 int hh_alen;
188 read_lock_bh(&hh->hh_lock);
189 hh_alen = HH_DATA_ALIGN(hh->hh_len);
190 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
191 read_unlock_bh(&hh->hh_lock);
192 skb_push(skb, hh->hh_len);
193 return hh->hh_output(skb);
194 } else if (dst->neighbour)
195 return dst->neighbour->output(skb);
197 if (net_ratelimit())
198 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
199 kfree_skb(skb);
200 return -EINVAL;
203 static inline int ip_finish_output(struct sk_buff *skb)
205 struct net_device *dev = skb->dst->dev;
207 skb->dev = dev;
208 skb->protocol = htons(ETH_P_IP);
210 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
211 ip_finish_output2);
214 int ip_mc_output(struct sk_buff *skb)
216 struct sock *sk = skb->sk;
217 struct rtable *rt = (struct rtable*)skb->dst;
218 struct net_device *dev = rt->u.dst.dev;
221 * If the indicated interface is up and running, send the packet.
223 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
225 skb->dev = dev;
226 skb->protocol = htons(ETH_P_IP);
229 * Multicasts are looped back for other local users
232 if (rt->rt_flags&RTCF_MULTICAST) {
233 if ((!sk || inet_sk(sk)->mc_loop)
234 #ifdef CONFIG_IP_MROUTE
235 /* Small optimization: do not loopback not local frames,
236 which returned after forwarding; they will be dropped
237 by ip_mr_input in any case.
238 Note, that local frames are looped back to be delivered
239 to local recipients.
241 This check is duplicated in ip_mr_input at the moment.
243 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
244 #endif
246 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
247 if (newskb)
248 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
249 newskb->dev,
250 ip_dev_loopback_xmit);
253 /* Multicasts with ttl 0 must not go beyond the host */
255 if (skb->nh.iph->ttl == 0) {
256 kfree_skb(skb);
257 return 0;
261 if (rt->rt_flags&RTCF_BROADCAST) {
262 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
263 if (newskb)
264 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
265 newskb->dev, ip_dev_loopback_xmit);
268 if (skb->len > dst_mtu(&rt->u.dst))
269 return ip_fragment(skb, ip_finish_output);
270 else
271 return ip_finish_output(skb);
274 int ip_output(struct sk_buff *skb)
276 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
278 if (skb->len > dst_mtu(skb->dst) &&
279 !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
280 return ip_fragment(skb, ip_finish_output);
281 else
282 return ip_finish_output(skb);
285 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
287 struct sock *sk = skb->sk;
288 struct inet_sock *inet = inet_sk(sk);
289 struct ip_options *opt = inet->opt;
290 struct rtable *rt;
291 struct iphdr *iph;
293 /* Skip all of this if the packet is already routed,
294 * f.e. by something like SCTP.
296 rt = (struct rtable *) skb->dst;
297 if (rt != NULL)
298 goto packet_routed;
300 /* Make sure we can route this packet. */
301 rt = (struct rtable *)__sk_dst_check(sk, 0);
302 if (rt == NULL) {
303 u32 daddr;
305 /* Use correct destination address if we have options. */
306 daddr = inet->daddr;
307 if(opt && opt->srr)
308 daddr = opt->faddr;
311 struct flowi fl = { .oif = sk->sk_bound_dev_if,
312 .nl_u = { .ip4_u =
313 { .daddr = daddr,
314 .saddr = inet->saddr,
315 .tos = RT_CONN_FLAGS(sk) } },
316 .proto = sk->sk_protocol,
317 .uli_u = { .ports =
318 { .sport = inet->sport,
319 .dport = inet->dport } } };
321 /* If this fails, retransmit mechanism of transport layer will
322 * keep trying until route appears or the connection times
323 * itself out.
325 if (ip_route_output_flow(&rt, &fl, sk, 0))
326 goto no_route;
328 sk_setup_caps(sk, &rt->u.dst);
330 skb->dst = dst_clone(&rt->u.dst);
332 packet_routed:
333 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
334 goto no_route;
336 /* OK, we know where to send it, allocate and build IP header. */
337 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
338 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
339 iph->tot_len = htons(skb->len);
340 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
341 iph->frag_off = htons(IP_DF);
342 else
343 iph->frag_off = 0;
344 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
345 iph->protocol = sk->sk_protocol;
346 iph->saddr = rt->rt_src;
347 iph->daddr = rt->rt_dst;
348 skb->nh.iph = iph;
349 /* Transport layer set skb->h.foo itself. */
351 if (opt && opt->optlen) {
352 iph->ihl += opt->optlen >> 2;
353 ip_options_build(skb, opt, inet->daddr, rt, 0);
356 ip_select_ident_more(iph, &rt->u.dst, sk,
357 (skb_shinfo(skb)->tso_segs ?: 1) - 1);
359 /* Add an IP checksum. */
360 ip_send_check(iph);
362 skb->priority = sk->sk_priority;
364 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
365 dst_output);
367 no_route:
368 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
369 kfree_skb(skb);
370 return -EHOSTUNREACH;
374 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
376 to->pkt_type = from->pkt_type;
377 to->priority = from->priority;
378 to->protocol = from->protocol;
379 dst_release(to->dst);
380 to->dst = dst_clone(from->dst);
381 to->dev = from->dev;
383 /* Copy the flags to each fragment. */
384 IPCB(to)->flags = IPCB(from)->flags;
386 #ifdef CONFIG_NET_SCHED
387 to->tc_index = from->tc_index;
388 #endif
389 #ifdef CONFIG_NETFILTER
390 to->nfmark = from->nfmark;
391 /* Connection association is same as pre-frag packet */
392 nf_conntrack_put(to->nfct);
393 to->nfct = from->nfct;
394 nf_conntrack_get(to->nfct);
395 to->nfctinfo = from->nfctinfo;
396 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
397 to->ipvs_property = from->ipvs_property;
398 #endif
399 #ifdef CONFIG_BRIDGE_NETFILTER
400 nf_bridge_put(to->nf_bridge);
401 to->nf_bridge = from->nf_bridge;
402 nf_bridge_get(to->nf_bridge);
403 #endif
404 #endif
408 * This IP datagram is too large to be sent in one piece. Break it up into
409 * smaller pieces (each of size equal to IP header plus
410 * a block of the data of the original IP data part) that will yet fit in a
411 * single device frame, and queue such a frame for sending.
414 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
416 struct iphdr *iph;
417 int raw = 0;
418 int ptr;
419 struct net_device *dev;
420 struct sk_buff *skb2;
421 unsigned int mtu, hlen, left, len, ll_rs;
422 int offset;
423 int not_last_frag;
424 struct rtable *rt = (struct rtable*)skb->dst;
425 int err = 0;
427 dev = rt->u.dst.dev;
430 * Point into the IP datagram header.
433 iph = skb->nh.iph;
435 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
436 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
437 htonl(dst_mtu(&rt->u.dst)));
438 kfree_skb(skb);
439 return -EMSGSIZE;
443 * Setup starting values.
446 hlen = iph->ihl * 4;
447 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
449 /* When frag_list is given, use it. First, check its validity:
450 * some transformers could create wrong frag_list or break existing
451 * one, it is not prohibited. In this case fall back to copying.
453 * LATER: this step can be merged to real generation of fragments,
454 * we can switch to copy when see the first bad fragment.
456 if (skb_shinfo(skb)->frag_list) {
457 struct sk_buff *frag;
458 int first_len = skb_pagelen(skb);
460 if (first_len - hlen > mtu ||
461 ((first_len - hlen) & 7) ||
462 (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
463 skb_cloned(skb))
464 goto slow_path;
466 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
467 /* Correct geometry. */
468 if (frag->len > mtu ||
469 ((frag->len & 7) && frag->next) ||
470 skb_headroom(frag) < hlen)
471 goto slow_path;
473 /* Partially cloned skb? */
474 if (skb_shared(frag))
475 goto slow_path;
477 BUG_ON(frag->sk);
478 if (skb->sk) {
479 sock_hold(skb->sk);
480 frag->sk = skb->sk;
481 frag->destructor = sock_wfree;
482 skb->truesize -= frag->truesize;
486 /* Everything is OK. Generate! */
488 err = 0;
489 offset = 0;
490 frag = skb_shinfo(skb)->frag_list;
491 skb_shinfo(skb)->frag_list = NULL;
492 skb->data_len = first_len - skb_headlen(skb);
493 skb->len = first_len;
494 iph->tot_len = htons(first_len);
495 iph->frag_off = htons(IP_MF);
496 ip_send_check(iph);
498 for (;;) {
499 /* Prepare header of the next frame,
500 * before previous one went down. */
501 if (frag) {
502 frag->ip_summed = CHECKSUM_NONE;
503 frag->h.raw = frag->data;
504 frag->nh.raw = __skb_push(frag, hlen);
505 memcpy(frag->nh.raw, iph, hlen);
506 iph = frag->nh.iph;
507 iph->tot_len = htons(frag->len);
508 ip_copy_metadata(frag, skb);
509 if (offset == 0)
510 ip_options_fragment(frag);
511 offset += skb->len - hlen;
512 iph->frag_off = htons(offset>>3);
513 if (frag->next != NULL)
514 iph->frag_off |= htons(IP_MF);
515 /* Ready, complete checksum */
516 ip_send_check(iph);
519 err = output(skb);
521 if (err || !frag)
522 break;
524 skb = frag;
525 frag = skb->next;
526 skb->next = NULL;
529 if (err == 0) {
530 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
531 return 0;
534 while (frag) {
535 skb = frag->next;
536 kfree_skb(frag);
537 frag = skb;
539 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
540 return err;
543 slow_path:
544 left = skb->len - hlen; /* Space per frame */
545 ptr = raw + hlen; /* Where to start from */
547 #ifdef CONFIG_BRIDGE_NETFILTER
548 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
549 * we need to make room for the encapsulating header */
550 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
551 mtu -= nf_bridge_pad(skb);
552 #else
553 ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
554 #endif
556 * Fragment the datagram.
559 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
560 not_last_frag = iph->frag_off & htons(IP_MF);
563 * Keep copying data until we run out.
566 while(left > 0) {
567 len = left;
568 /* IF: it doesn't fit, use 'mtu' - the data space left */
569 if (len > mtu)
570 len = mtu;
571 /* IF: we are not sending upto and including the packet end
572 then align the next start on an eight byte boundary */
573 if (len < left) {
574 len &= ~7;
577 * Allocate buffer.
580 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
581 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
582 err = -ENOMEM;
583 goto fail;
587 * Set up data on packet
590 ip_copy_metadata(skb2, skb);
591 skb_reserve(skb2, ll_rs);
592 skb_put(skb2, len + hlen);
593 skb2->nh.raw = skb2->data;
594 skb2->h.raw = skb2->data + hlen;
597 * Charge the memory for the fragment to any owner
598 * it might possess
601 if (skb->sk)
602 skb_set_owner_w(skb2, skb->sk);
605 * Copy the packet header into the new buffer.
608 memcpy(skb2->nh.raw, skb->data, hlen);
611 * Copy a block of the IP datagram.
613 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
614 BUG();
615 left -= len;
618 * Fill in the new header fields.
620 iph = skb2->nh.iph;
621 iph->frag_off = htons((offset >> 3));
623 /* ANK: dirty, but effective trick. Upgrade options only if
624 * the segment to be fragmented was THE FIRST (otherwise,
625 * options are already fixed) and make it ONCE
626 * on the initial skb, so that all the following fragments
627 * will inherit fixed options.
629 if (offset == 0)
630 ip_options_fragment(skb);
633 * Added AC : If we are fragmenting a fragment that's not the
634 * last fragment then keep MF on each bit
636 if (left > 0 || not_last_frag)
637 iph->frag_off |= htons(IP_MF);
638 ptr += len;
639 offset += len;
642 * Put this fragment into the sending queue.
645 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
647 iph->tot_len = htons(len + hlen);
649 ip_send_check(iph);
651 err = output(skb2);
652 if (err)
653 goto fail;
655 kfree_skb(skb);
656 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
657 return err;
659 fail:
660 kfree_skb(skb);
661 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
662 return err;
666 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
668 struct iovec *iov = from;
670 if (skb->ip_summed == CHECKSUM_HW) {
671 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
672 return -EFAULT;
673 } else {
674 unsigned int csum = 0;
675 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
676 return -EFAULT;
677 skb->csum = csum_block_add(skb->csum, csum, odd);
679 return 0;
/*
 * Checksum @copy bytes of @page starting at byte @offset.  The page is
 * mapped with kmap() for the duration of the computation.
 */
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *mapped = kmap(page);
	unsigned int sum = csum_partial(mapped + offset, copy, 0);

	kunmap(page);
	return sum;
}
693 inline int ip_ufo_append_data(struct sock *sk,
694 int getfrag(void *from, char *to, int offset, int len,
695 int odd, struct sk_buff *skb),
696 void *from, int length, int hh_len, int fragheaderlen,
697 int transhdrlen, int mtu,unsigned int flags)
699 struct sk_buff *skb;
700 int err;
702 /* There is support for UDP fragmentation offload by network
703 * device, so create one single skb packet containing complete
704 * udp datagram
706 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
707 skb = sock_alloc_send_skb(sk,
708 hh_len + fragheaderlen + transhdrlen + 20,
709 (flags & MSG_DONTWAIT), &err);
711 if (skb == NULL)
712 return err;
714 /* reserve space for Hardware header */
715 skb_reserve(skb, hh_len);
717 /* create space for UDP/IP header */
718 skb_put(skb,fragheaderlen + transhdrlen);
720 /* initialize network header pointer */
721 skb->nh.raw = skb->data;
723 /* initialize protocol header pointer */
724 skb->h.raw = skb->data + fragheaderlen;
726 skb->ip_summed = CHECKSUM_HW;
727 skb->csum = 0;
728 sk->sk_sndmsg_off = 0;
731 err = skb_append_datato_frags(sk,skb, getfrag, from,
732 (length - transhdrlen));
733 if (!err) {
734 /* specify the length of each IP datagram fragment*/
735 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
736 __skb_queue_tail(&sk->sk_write_queue, skb);
738 return 0;
740 /* There is not enough support do UFO ,
741 * so follow normal path
743 kfree_skb(skb);
744 return err;
748 * ip_append_data() and ip_append_page() can make one large IP datagram
749 * from many pieces of data. Each pieces will be holded on the socket
750 * until ip_push_pending_frames() is called. Each piece can be a page
751 * or non-page data.
753 * Not only UDP, other transport protocols - e.g. raw sockets - can use
754 * this interface potentially.
756 * LATER: length must be adjusted by pad at tail, when it is required.
758 int ip_append_data(struct sock *sk,
759 int getfrag(void *from, char *to, int offset, int len,
760 int odd, struct sk_buff *skb),
761 void *from, int length, int transhdrlen,
762 struct ipcm_cookie *ipc, struct rtable *rt,
763 unsigned int flags)
765 struct inet_sock *inet = inet_sk(sk);
766 struct sk_buff *skb;
768 struct ip_options *opt = NULL;
769 int hh_len;
770 int exthdrlen;
771 int mtu;
772 int copy;
773 int err;
774 int offset = 0;
775 unsigned int maxfraglen, fragheaderlen;
776 int csummode = CHECKSUM_NONE;
778 if (flags&MSG_PROBE)
779 return 0;
781 if (skb_queue_empty(&sk->sk_write_queue)) {
783 * setup for corking.
785 opt = ipc->opt;
786 if (opt) {
787 if (inet->cork.opt == NULL) {
788 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
789 if (unlikely(inet->cork.opt == NULL))
790 return -ENOBUFS;
792 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
793 inet->cork.flags |= IPCORK_OPT;
794 inet->cork.addr = ipc->addr;
796 dst_hold(&rt->u.dst);
797 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
798 inet->cork.rt = rt;
799 inet->cork.length = 0;
800 sk->sk_sndmsg_page = NULL;
801 sk->sk_sndmsg_off = 0;
802 if ((exthdrlen = rt->u.dst.header_len) != 0) {
803 length += exthdrlen;
804 transhdrlen += exthdrlen;
806 } else {
807 rt = inet->cork.rt;
808 if (inet->cork.flags & IPCORK_OPT)
809 opt = inet->cork.opt;
811 transhdrlen = 0;
812 exthdrlen = 0;
813 mtu = inet->cork.fragsize;
815 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
817 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
818 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
820 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
821 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
822 return -EMSGSIZE;
826 * transhdrlen > 0 means that this is the first fragment and we wish
827 * it won't be fragmented in the future.
829 if (transhdrlen &&
830 length + fragheaderlen <= mtu &&
831 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
832 !exthdrlen)
833 csummode = CHECKSUM_HW;
835 inet->cork.length += length;
836 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
837 (rt->u.dst.dev->features & NETIF_F_UFO)) {
839 if(ip_ufo_append_data(sk, getfrag, from, length, hh_len,
840 fragheaderlen, transhdrlen, mtu, flags))
841 goto error;
843 return 0;
846 /* So, what's going on in the loop below?
848 * We use calculated fragment length to generate chained skb,
849 * each of segments is IP fragment ready for sending to network after
850 * adding appropriate IP header.
853 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
854 goto alloc_new_skb;
856 while (length > 0) {
857 /* Check if the remaining data fits into current packet. */
858 copy = mtu - skb->len;
859 if (copy < length)
860 copy = maxfraglen - skb->len;
861 if (copy <= 0) {
862 char *data;
863 unsigned int datalen;
864 unsigned int fraglen;
865 unsigned int fraggap;
866 unsigned int alloclen;
867 struct sk_buff *skb_prev;
868 alloc_new_skb:
869 skb_prev = skb;
870 if (skb_prev)
871 fraggap = skb_prev->len - maxfraglen;
872 else
873 fraggap = 0;
876 * If remaining data exceeds the mtu,
877 * we know we need more fragment(s).
879 datalen = length + fraggap;
880 if (datalen > mtu - fragheaderlen)
881 datalen = maxfraglen - fragheaderlen;
882 fraglen = datalen + fragheaderlen;
884 if ((flags & MSG_MORE) &&
885 !(rt->u.dst.dev->features&NETIF_F_SG))
886 alloclen = mtu;
887 else
888 alloclen = datalen + fragheaderlen;
890 /* The last fragment gets additional space at tail.
891 * Note, with MSG_MORE we overallocate on fragments,
892 * because we have no idea what fragment will be
893 * the last.
895 if (datalen == length)
896 alloclen += rt->u.dst.trailer_len;
898 if (transhdrlen) {
899 skb = sock_alloc_send_skb(sk,
900 alloclen + hh_len + 15,
901 (flags & MSG_DONTWAIT), &err);
902 } else {
903 skb = NULL;
904 if (atomic_read(&sk->sk_wmem_alloc) <=
905 2 * sk->sk_sndbuf)
906 skb = sock_wmalloc(sk,
907 alloclen + hh_len + 15, 1,
908 sk->sk_allocation);
909 if (unlikely(skb == NULL))
910 err = -ENOBUFS;
912 if (skb == NULL)
913 goto error;
916 * Fill in the control structures
918 skb->ip_summed = csummode;
919 skb->csum = 0;
920 skb_reserve(skb, hh_len);
923 * Find where to start putting bytes.
925 data = skb_put(skb, fraglen);
926 skb->nh.raw = data + exthdrlen;
927 data += fragheaderlen;
928 skb->h.raw = data + exthdrlen;
930 if (fraggap) {
931 skb->csum = skb_copy_and_csum_bits(
932 skb_prev, maxfraglen,
933 data + transhdrlen, fraggap, 0);
934 skb_prev->csum = csum_sub(skb_prev->csum,
935 skb->csum);
936 data += fraggap;
937 skb_trim(skb_prev, maxfraglen);
940 copy = datalen - transhdrlen - fraggap;
941 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
942 err = -EFAULT;
943 kfree_skb(skb);
944 goto error;
947 offset += copy;
948 length -= datalen - fraggap;
949 transhdrlen = 0;
950 exthdrlen = 0;
951 csummode = CHECKSUM_NONE;
954 * Put the packet on the pending queue.
956 __skb_queue_tail(&sk->sk_write_queue, skb);
957 continue;
960 if (copy > length)
961 copy = length;
963 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
964 unsigned int off;
966 off = skb->len;
967 if (getfrag(from, skb_put(skb, copy),
968 offset, copy, off, skb) < 0) {
969 __skb_trim(skb, off);
970 err = -EFAULT;
971 goto error;
973 } else {
974 int i = skb_shinfo(skb)->nr_frags;
975 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
976 struct page *page = sk->sk_sndmsg_page;
977 int off = sk->sk_sndmsg_off;
978 unsigned int left;
980 if (page && (left = PAGE_SIZE - off) > 0) {
981 if (copy >= left)
982 copy = left;
983 if (page != frag->page) {
984 if (i == MAX_SKB_FRAGS) {
985 err = -EMSGSIZE;
986 goto error;
988 get_page(page);
989 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
990 frag = &skb_shinfo(skb)->frags[i];
992 } else if (i < MAX_SKB_FRAGS) {
993 if (copy > PAGE_SIZE)
994 copy = PAGE_SIZE;
995 page = alloc_pages(sk->sk_allocation, 0);
996 if (page == NULL) {
997 err = -ENOMEM;
998 goto error;
1000 sk->sk_sndmsg_page = page;
1001 sk->sk_sndmsg_off = 0;
1003 skb_fill_page_desc(skb, i, page, 0, 0);
1004 frag = &skb_shinfo(skb)->frags[i];
1005 skb->truesize += PAGE_SIZE;
1006 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1007 } else {
1008 err = -EMSGSIZE;
1009 goto error;
1011 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1012 err = -EFAULT;
1013 goto error;
1015 sk->sk_sndmsg_off += copy;
1016 frag->size += copy;
1017 skb->len += copy;
1018 skb->data_len += copy;
1020 offset += copy;
1021 length -= copy;
1024 return 0;
1026 error:
1027 inet->cork.length -= length;
1028 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1029 return err;
1032 ssize_t ip_append_page(struct sock *sk, struct page *page,
1033 int offset, size_t size, int flags)
1035 struct inet_sock *inet = inet_sk(sk);
1036 struct sk_buff *skb;
1037 struct rtable *rt;
1038 struct ip_options *opt = NULL;
1039 int hh_len;
1040 int mtu;
1041 int len;
1042 int err;
1043 unsigned int maxfraglen, fragheaderlen, fraggap;
1045 if (inet->hdrincl)
1046 return -EPERM;
1048 if (flags&MSG_PROBE)
1049 return 0;
1051 if (skb_queue_empty(&sk->sk_write_queue))
1052 return -EINVAL;
1054 rt = inet->cork.rt;
1055 if (inet->cork.flags & IPCORK_OPT)
1056 opt = inet->cork.opt;
1058 if (!(rt->u.dst.dev->features&NETIF_F_SG))
1059 return -EOPNOTSUPP;
1061 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1062 mtu = inet->cork.fragsize;
1064 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1065 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1067 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1068 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1069 return -EMSGSIZE;
1072 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1073 return -EINVAL;
1075 inet->cork.length += size;
1076 if ((sk->sk_protocol == IPPROTO_UDP) &&
1077 (rt->u.dst.dev->features & NETIF_F_UFO))
1078 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
1081 while (size > 0) {
1082 int i;
1084 if (skb_shinfo(skb)->ufo_size)
1085 len = size;
1086 else {
1088 /* Check if the remaining data fits into current packet. */
1089 len = mtu - skb->len;
1090 if (len < size)
1091 len = maxfraglen - skb->len;
1093 if (len <= 0) {
1094 struct sk_buff *skb_prev;
1095 char *data;
1096 struct iphdr *iph;
1097 int alloclen;
1099 skb_prev = skb;
1100 fraggap = skb_prev->len - maxfraglen;
1102 alloclen = fragheaderlen + hh_len + fraggap + 15;
1103 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1104 if (unlikely(!skb)) {
1105 err = -ENOBUFS;
1106 goto error;
1110 * Fill in the control structures
1112 skb->ip_summed = CHECKSUM_NONE;
1113 skb->csum = 0;
1114 skb_reserve(skb, hh_len);
1117 * Find where to start putting bytes.
1119 data = skb_put(skb, fragheaderlen + fraggap);
1120 skb->nh.iph = iph = (struct iphdr *)data;
1121 data += fragheaderlen;
1122 skb->h.raw = data;
1124 if (fraggap) {
1125 skb->csum = skb_copy_and_csum_bits(
1126 skb_prev, maxfraglen,
1127 data, fraggap, 0);
1128 skb_prev->csum = csum_sub(skb_prev->csum,
1129 skb->csum);
1130 skb_trim(skb_prev, maxfraglen);
1134 * Put the packet on the pending queue.
1136 __skb_queue_tail(&sk->sk_write_queue, skb);
1137 continue;
1140 i = skb_shinfo(skb)->nr_frags;
1141 if (len > size)
1142 len = size;
1143 if (skb_can_coalesce(skb, i, page, offset)) {
1144 skb_shinfo(skb)->frags[i-1].size += len;
1145 } else if (i < MAX_SKB_FRAGS) {
1146 get_page(page);
1147 skb_fill_page_desc(skb, i, page, offset, len);
1148 } else {
1149 err = -EMSGSIZE;
1150 goto error;
1153 if (skb->ip_summed == CHECKSUM_NONE) {
1154 unsigned int csum;
1155 csum = csum_page(page, offset, len);
1156 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1159 skb->len += len;
1160 skb->data_len += len;
1161 offset += len;
1162 size -= len;
1164 return 0;
1166 error:
1167 inet->cork.length -= size;
1168 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1169 return err;
1173 * Combined all pending IP fragments on the socket as one IP datagram
1174 * and push them out.
1176 int ip_push_pending_frames(struct sock *sk)
1178 struct sk_buff *skb, *tmp_skb;
1179 struct sk_buff **tail_skb;
1180 struct inet_sock *inet = inet_sk(sk);
1181 struct ip_options *opt = NULL;
1182 struct rtable *rt = inet->cork.rt;
1183 struct iphdr *iph;
1184 int df = 0;
1185 __u8 ttl;
1186 int err = 0;
1188 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1189 goto out;
1190 tail_skb = &(skb_shinfo(skb)->frag_list);
1192 /* move skb->data to ip header from ext header */
1193 if (skb->data < skb->nh.raw)
1194 __skb_pull(skb, skb->nh.raw - skb->data);
1195 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1196 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1197 *tail_skb = tmp_skb;
1198 tail_skb = &(tmp_skb->next);
1199 skb->len += tmp_skb->len;
1200 skb->data_len += tmp_skb->len;
1201 skb->truesize += tmp_skb->truesize;
1202 __sock_put(tmp_skb->sk);
1203 tmp_skb->destructor = NULL;
1204 tmp_skb->sk = NULL;
1207 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1208 * to fragment the frame generated here. No matter, what transforms
1209 * how transforms change size of the packet, it will come out.
1211 if (inet->pmtudisc != IP_PMTUDISC_DO)
1212 skb->local_df = 1;
1214 /* DF bit is set when we want to see DF on outgoing frames.
1215 * If local_df is set too, we still allow to fragment this frame
1216 * locally. */
1217 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1218 (skb->len <= dst_mtu(&rt->u.dst) &&
1219 ip_dont_fragment(sk, &rt->u.dst)))
1220 df = htons(IP_DF);
1222 if (inet->cork.flags & IPCORK_OPT)
1223 opt = inet->cork.opt;
1225 if (rt->rt_type == RTN_MULTICAST)
1226 ttl = inet->mc_ttl;
1227 else
1228 ttl = ip_select_ttl(inet, &rt->u.dst);
1230 iph = (struct iphdr *)skb->data;
1231 iph->version = 4;
1232 iph->ihl = 5;
1233 if (opt) {
1234 iph->ihl += opt->optlen>>2;
1235 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1237 iph->tos = inet->tos;
1238 iph->tot_len = htons(skb->len);
1239 iph->frag_off = df;
1240 if (!df) {
1241 __ip_select_ident(iph, &rt->u.dst, 0);
1242 } else {
1243 iph->id = htons(inet->id++);
1245 iph->ttl = ttl;
1246 iph->protocol = sk->sk_protocol;
1247 iph->saddr = rt->rt_src;
1248 iph->daddr = rt->rt_dst;
1249 ip_send_check(iph);
1251 skb->priority = sk->sk_priority;
1252 skb->dst = dst_clone(&rt->u.dst);
1254 /* Netfilter gets whole the not fragmented skb. */
1255 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1256 skb->dst->dev, dst_output);
1257 if (err) {
1258 if (err > 0)
1259 err = inet->recverr ? net_xmit_errno(err) : 0;
1260 if (err)
1261 goto error;
1264 out:
1265 inet->cork.flags &= ~IPCORK_OPT;
1266 kfree(inet->cork.opt);
1267 inet->cork.opt = NULL;
1268 if (inet->cork.rt) {
1269 ip_rt_put(inet->cork.rt);
1270 inet->cork.rt = NULL;
1272 return err;
1274 error:
1275 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1276 goto out;
1280 * Throw away all pending data on the socket.
1282 void ip_flush_pending_frames(struct sock *sk)
1284 struct inet_sock *inet = inet_sk(sk);
1285 struct sk_buff *skb;
1287 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1288 kfree_skb(skb);
1290 inet->cork.flags &= ~IPCORK_OPT;
1291 kfree(inet->cork.opt);
1292 inet->cork.opt = NULL;
1293 if (inet->cork.rt) {
1294 ip_rt_put(inet->cork.rt);
1295 inet->cork.rt = NULL;
1301 * Fetch data from kernel space and fill in checksum if needed.
1303 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1304 int len, int odd, struct sk_buff *skb)
1306 unsigned int csum;
1308 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1309 skb->csum = csum_block_add(skb->csum, csum, odd);
1310 return 0;
1314 * Generic function to send a packet as reply to another packet.
1315 * Used to send TCP resets so far. ICMP should use this function too.
1317 * Should run single threaded per socket because it uses the sock
1318 * structure to pass arguments.
1320 * LATER: switch from ip_build_xmit to ip_append_*
1322 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1323 unsigned int len)
1325 struct inet_sock *inet = inet_sk(sk);
1326 struct {
1327 struct ip_options opt;
1328 char data[40];
1329 } replyopts;
1330 struct ipcm_cookie ipc;
1331 u32 daddr;
1332 struct rtable *rt = (struct rtable*)skb->dst;
1334 if (ip_options_echo(&replyopts.opt, skb))
1335 return;
1337 daddr = ipc.addr = rt->rt_src;
1338 ipc.opt = NULL;
1340 if (replyopts.opt.optlen) {
1341 ipc.opt = &replyopts.opt;
1343 if (ipc.opt->srr)
1344 daddr = replyopts.opt.faddr;
1348 struct flowi fl = { .nl_u = { .ip4_u =
1349 { .daddr = daddr,
1350 .saddr = rt->rt_spec_dst,
1351 .tos = RT_TOS(skb->nh.iph->tos) } },
1352 /* Not quite clean, but right. */
1353 .uli_u = { .ports =
1354 { .sport = skb->h.th->dest,
1355 .dport = skb->h.th->source } },
1356 .proto = sk->sk_protocol };
1357 if (ip_route_output_key(&rt, &fl))
1358 return;
1361 /* And let IP do all the hard work.
1363 This chunk is not reenterable, hence spinlock.
1364 Note that it uses the fact, that this function is called
1365 with locally disabled BH and that sk cannot be already spinlocked.
1367 bh_lock_sock(sk);
1368 inet->tos = skb->nh.iph->tos;
1369 sk->sk_priority = skb->priority;
1370 sk->sk_protocol = skb->nh.iph->protocol;
1371 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1372 &ipc, rt, MSG_DONTWAIT);
1373 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1374 if (arg->csumoffset >= 0)
1375 *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1376 skb->ip_summed = CHECKSUM_NONE;
1377 ip_push_pending_frames(sk);
1380 bh_unlock_sock(sk);
1382 ip_rt_put(rt);
1385 void __init ip_init(void)
1387 ip_rt_init();
1388 inet_initpeers();
1390 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1391 igmp_mc_proc_init();
1392 #endif
/* Symbols from this file made available to modules. */
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);