net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
 *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/mm.h>
  53 #include <linux/string.h>
  54 #include <linux/errno.h>
  55 #include <linux/highmem.h>
  56
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/in.h>
  60 #include <linux/inet.h>
  61 #include <linux/netdevice.h>
  62 #include <linux/etherdevice.h>
  63 #include <linux/proc_fs.h>
  64 #include <linux/stat.h>
  65 #include <linux/init.h>
  66
  67 #include <net/snmp.h>
  68 #include <net/ip.h>
  69 #include <net/protocol.h>
  70 #include <net/route.h>
  71 #include <net/xfrm.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <net/checksum.h>
  79 #include <linux/igmp.h>
  80 #include <linux/netfilter_ipv4.h>
  81 #include <linux/netfilter_bridge.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netlink.h>
  84 #include <linux/tcp.h>
  85
/*
 * System-wide default IP time-to-live, initialised to IPDEFTTL and placed
 * in the read-mostly data section.
 * NOTE(review): not referenced in this part of the file; presumably it is
 * exported as a sysctl tunable -- confirm against the sysctl table.
 */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         skb_reset_mac_header(newskb);
  99         __skb_pull(newskb, skb_network_offset(newskb));
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103         netif_rx(newskb);
 104         return 0;
 105 }
 106
 107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 108 {
 109         int ttl = inet->uc_ttl;
 110
 111         if (ttl < 0)
 112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 113         return ttl;
 114 }
 115
 116 /*
 117  *              Add an ip header to a skbuff and send it out.
 118  *
 119  */
 120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 121                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 122 {
 123         struct inet_sock *inet = inet_sk(sk);
 124         struct rtable *rt = (struct rtable *)skb->dst;
 125         struct iphdr *iph;
 126
 127         /* Build the IP header. */
 128         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 129         skb_reset_network_header(skb);
 130         iph = ip_hdr(skb);
 131         iph->version  = 4;
 132         iph->ihl      = 5;
 133         iph->tos      = inet->tos;
 134         if (ip_dont_fragment(sk, &rt->u.dst))
 135                 iph->frag_off = htons(IP_DF);
 136         else
 137                 iph->frag_off = 0;
 138         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 139         iph->daddr    = rt->rt_dst;
 140         iph->saddr    = rt->rt_src;
 141         iph->protocol = sk->sk_protocol;
 142         iph->tot_len  = htons(skb->len);
 143         ip_select_ident(iph, &rt->u.dst, sk);
 144
 145         if (opt && opt->optlen) {
 146                 iph->ihl += opt->optlen>>2;
 147                 ip_options_build(skb, opt, daddr, rt, 0);
 148         }
 149         ip_send_check(iph);
 150
 151         skb->priority = sk->sk_priority;
 152
 153         /* Send it out. */
 154         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 155                        dst_output);
 156 }
 157
 158 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 159
 160 static inline int ip_finish_output2(struct sk_buff *skb)
 161 {
 162         struct dst_entry *dst = skb->dst;
 163         struct rtable *rt = (struct rtable *)dst;
 164         struct net_device *dev = dst->dev;
 165         int hh_len = LL_RESERVED_SPACE(dev);
 166
 167         if (rt->rt_type == RTN_MULTICAST)
 168                 IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 169         else if (rt->rt_type == RTN_BROADCAST)
 170                 IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
 171
 172         /* Be paranoid, rather than too clever. */
 173         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 174                 struct sk_buff *skb2;
 175
 176                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 177                 if (skb2 == NULL) {
 178                         kfree_skb(skb);
 179                         return -ENOMEM;
 180                 }
 181                 if (skb->sk)
 182                         skb_set_owner_w(skb2, skb->sk);
 183                 kfree_skb(skb);
 184                 skb = skb2;
 185         }
 186
 187         if (dst->hh)
 188                 return neigh_hh_output(dst->hh, skb);
 189         else if (dst->neighbour)
 190                 return dst->neighbour->output(skb);
 191
 192         if (net_ratelimit())
 193                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 194         kfree_skb(skb);
 195         return -EINVAL;
 196 }
 197
 198 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 199 {
 200         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 201
 202         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 203                skb->dst->dev->mtu : dst_mtu(skb->dst);
 204 }
 205
 206 static inline int ip_finish_output(struct sk_buff *skb)
 207 {
 208 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 209         /* Policy lookup after SNAT yielded a new policy */
 210         if (skb->dst->xfrm != NULL) {
 211                 IPCB(skb)->flags |= IPSKB_REROUTED;
 212                 return dst_output(skb);
 213         }
 214 #endif
 215         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 216                 return ip_fragment(skb, ip_finish_output2);
 217         else
 218                 return ip_finish_output2(skb);
 219 }
 220
 221 int ip_mc_output(struct sk_buff *skb)
 222 {
 223         struct sock *sk = skb->sk;
 224         struct rtable *rt = (struct rtable*)skb->dst;
 225         struct net_device *dev = rt->u.dst.dev;
 226
 227         /*
 228          *      If the indicated interface is up and running, send the packet.
 229          */
 230         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 231
 232         skb->dev = dev;
 233         skb->protocol = htons(ETH_P_IP);
 234
 235         /*
 236          *      Multicasts are looped back for other local users
 237          */
 238
 239         if (rt->rt_flags&RTCF_MULTICAST) {
 240                 if ((!sk || inet_sk(sk)->mc_loop)
 241 #ifdef CONFIG_IP_MROUTE
 242                 /* Small optimization: do not loopback not local frames,
 243                    which returned after forwarding; they will be  dropped
 244                    by ip_mr_input in any case.
 245                    Note, that local frames are looped back to be delivered
 246                    to local recipients.
 247
 248                    This check is duplicated in ip_mr_input at the moment.
 249                  */
 250                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 251 #endif
 252                 ) {
 253                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 254                         if (newskb)
 255                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 256                                         newskb->dev,
 257                                         ip_dev_loopback_xmit);
 258                 }
 259
 260                 /* Multicasts with ttl 0 must not go beyond the host */
 261
 262                 if (ip_hdr(skb)->ttl == 0) {
 263                         kfree_skb(skb);
 264                         return 0;
 265                 }
 266         }
 267
 268         if (rt->rt_flags&RTCF_BROADCAST) {
 269                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 270                 if (newskb)
 271                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 272                                 newskb->dev, ip_dev_loopback_xmit);
 273         }
 274
 275         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 276                             ip_finish_output,
 277                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 278 }
 279
 280 int ip_output(struct sk_buff *skb)
 281 {
 282         struct net_device *dev = skb->dst->dev;
 283
 284         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 285
 286         skb->dev = dev;
 287         skb->protocol = htons(ETH_P_IP);
 288
 289         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 290                             ip_finish_output,
 291                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 292 }
 293
 294 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 295 {
 296         struct sock *sk = skb->sk;
 297         struct inet_sock *inet = inet_sk(sk);
 298         struct ip_options *opt = inet->opt;
 299         struct rtable *rt;
 300         struct iphdr *iph;
 301
 302         /* Skip all of this if the packet is already routed,
 303          * f.e. by something like SCTP.
 304          */
 305         rt = (struct rtable *) skb->dst;
 306         if (rt != NULL)
 307                 goto packet_routed;
 308
 309         /* Make sure we can route this packet. */
 310         rt = (struct rtable *)__sk_dst_check(sk, 0);
 311         if (rt == NULL) {
 312                 __be32 daddr;
 313
 314                 /* Use correct destination address if we have options. */
 315                 daddr = inet->daddr;
 316                 if(opt && opt->srr)
 317                         daddr = opt->faddr;
 318
 319                 {
 320                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 321                                             .nl_u = { .ip4_u =
 322                                                       { .daddr = daddr,
 323                                                         .saddr = inet->saddr,
 324                                                         .tos = RT_CONN_FLAGS(sk) } },
 325                                             .proto = sk->sk_protocol,
 326                                             .uli_u = { .ports =
 327                                                        { .sport = inet->sport,
 328                                                          .dport = inet->dport } } };
 329
 330                         /* If this fails, retransmit mechanism of transport layer will
 331                          * keep trying until route appears or the connection times
 332                          * itself out.
 333                          */
 334                         security_sk_classify_flow(sk, &fl);
 335                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 336                                 goto no_route;
 337                 }
 338                 sk_setup_caps(sk, &rt->u.dst);
 339         }
 340         skb->dst = dst_clone(&rt->u.dst);
 341
 342 packet_routed:
 343         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 344                 goto no_route;
 345
 346         /* OK, we know where to send it, allocate and build IP header. */
 347         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 348         skb_reset_network_header(skb);
 349         iph = ip_hdr(skb);
 350         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 351         iph->tot_len = htons(skb->len);
 352         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 353                 iph->frag_off = htons(IP_DF);
 354         else
 355                 iph->frag_off = 0;
 356         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 357         iph->protocol = sk->sk_protocol;
 358         iph->saddr    = rt->rt_src;
 359         iph->daddr    = rt->rt_dst;
 360         /* Transport layer set skb->h.foo itself. */
 361
 362         if (opt && opt->optlen) {
 363                 iph->ihl += opt->optlen >> 2;
 364                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 365         }
 366
 367         ip_select_ident_more(iph, &rt->u.dst, sk,
 368                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 369
 370         /* Add an IP checksum. */
 371         ip_send_check(iph);
 372
 373         skb->priority = sk->sk_priority;
 374
 375         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 376                        dst_output);
 377
 378 no_route:
 379         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 380         kfree_skb(skb);
 381         return -EHOSTUNREACH;
 382 }
 383
 384
 385 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 386 {
 387         to->pkt_type = from->pkt_type;
 388         to->priority = from->priority;
 389         to->protocol = from->protocol;
 390         dst_release(to->dst);
 391         to->dst = dst_clone(from->dst);
 392         to->dev = from->dev;
 393         to->mark = from->mark;
 394
 395         /* Copy the flags to each fragment. */
 396         IPCB(to)->flags = IPCB(from)->flags;
 397
 398 #ifdef CONFIG_NET_SCHED
 399         to->tc_index = from->tc_index;
 400 #endif
 401         nf_copy(to, from);
 402 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 403         to->ipvs_property = from->ipvs_property;
 404 #endif
 405         skb_copy_secmark(to, from);
 406 }
 407
 408 /*
 409  *      This IP datagram is too large to be sent in one piece.  Break it up into
 410  *      smaller pieces (each of size equal to IP header plus
 411  *      a block of the data of the original IP data part) that will yet fit in a
 412  *      single device frame, and queue such a frame for sending.
 413  */
 414
 415 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 416 {
 417         struct iphdr *iph;
 418         int raw = 0;
 419         int ptr;
 420         struct net_device *dev;
 421         struct sk_buff *skb2;
 422         unsigned int mtu, hlen, left, len, ll_rs, pad;
 423         int offset;
 424         __be16 not_last_frag;
 425         struct rtable *rt = (struct rtable*)skb->dst;
 426         int err = 0;
 427
 428         dev = rt->u.dst.dev;
 429
 430         /*
 431          *      Point into the IP datagram header.
 432          */
 433
 434         iph = ip_hdr(skb);
 435
 436         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 437                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 438                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 439                           htonl(ip_skb_dst_mtu(skb)));
 440                 kfree_skb(skb);
 441                 return -EMSGSIZE;
 442         }
 443
 444         /*
 445          *      Setup starting values.
 446          */
 447
 448         hlen = iph->ihl * 4;
 449         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 450         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 451
 452         /* When frag_list is given, use it. First, check its validity:
 453          * some transformers could create wrong frag_list or break existing
 454          * one, it is not prohibited. In this case fall back to copying.
 455          *
 456          * LATER: this step can be merged to real generation of fragments,
 457          * we can switch to copy when see the first bad fragment.
 458          */
 459         if (skb_shinfo(skb)->frag_list) {
 460                 struct sk_buff *frag;
 461                 int first_len = skb_pagelen(skb);
 462
 463                 if (first_len - hlen > mtu ||
 464                     ((first_len - hlen) & 7) ||
 465                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 466                     skb_cloned(skb))
 467                         goto slow_path;
 468
 469                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 470                         /* Correct geometry. */
 471                         if (frag->len > mtu ||
 472                             ((frag->len & 7) && frag->next) ||
 473                             skb_headroom(frag) < hlen)
 474                             goto slow_path;
 475
 476                         /* Partially cloned skb? */
 477                         if (skb_shared(frag))
 478                                 goto slow_path;
 479
 480                         BUG_ON(frag->sk);
 481                         if (skb->sk) {
 482                                 sock_hold(skb->sk);
 483                                 frag->sk = skb->sk;
 484                                 frag->destructor = sock_wfree;
 485                                 skb->truesize -= frag->truesize;
 486                         }
 487                 }
 488
 489                 /* Everything is OK. Generate! */
 490
 491                 err = 0;
 492                 offset = 0;
 493                 frag = skb_shinfo(skb)->frag_list;
 494                 skb_shinfo(skb)->frag_list = NULL;
 495                 skb->data_len = first_len - skb_headlen(skb);
 496                 skb->len = first_len;
 497                 iph->tot_len = htons(first_len);
 498                 iph->frag_off = htons(IP_MF);
 499                 ip_send_check(iph);
 500
 501                 for (;;) {
 502                         /* Prepare header of the next frame,
 503                          * before previous one went down. */
 504                         if (frag) {
 505                                 frag->ip_summed = CHECKSUM_NONE;
 506                                 skb_reset_transport_header(frag);
 507                                 __skb_push(frag, hlen);
 508                                 skb_reset_network_header(frag);
 509                                 memcpy(skb_network_header(frag), iph, hlen);
 510                                 iph = ip_hdr(frag);
 511                                 iph->tot_len = htons(frag->len);
 512                                 ip_copy_metadata(frag, skb);
 513                                 if (offset == 0)
 514                                         ip_options_fragment(frag);
 515                                 offset += skb->len - hlen;
 516                                 iph->frag_off = htons(offset>>3);
 517                                 if (frag->next != NULL)
 518                                         iph->frag_off |= htons(IP_MF);
 519                                 /* Ready, complete checksum */
 520                                 ip_send_check(iph);
 521                         }
 522
 523                         err = output(skb);
 524
 525                         if (!err)
 526                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 527                         if (err || !frag)
 528                                 break;
 529
 530                         skb = frag;
 531                         frag = skb->next;
 532                         skb->next = NULL;
 533                 }
 534
 535                 if (err == 0) {
 536                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 537                         return 0;
 538                 }
 539
 540                 while (frag) {
 541                         skb = frag->next;
 542                         kfree_skb(frag);
 543                         frag = skb;
 544                 }
 545                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 546                 return err;
 547         }
 548
 549 slow_path:
 550         left = skb->len - hlen;         /* Space per frame */
 551         ptr = raw + hlen;               /* Where to start from */
 552
 553         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 554          * we need to make room for the encapsulating header
 555          */
 556         pad = nf_bridge_pad(skb);
 557         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 558         mtu -= pad;
 559
 560         /*
 561          *      Fragment the datagram.
 562          */
 563
 564         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 565         not_last_frag = iph->frag_off & htons(IP_MF);
 566
 567         /*
 568          *      Keep copying data until we run out.
 569          */
 570
 571         while (left > 0) {
 572                 len = left;
 573                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 574                 if (len > mtu)
 575                         len = mtu;
 576                 /* IF: we are not sending upto and including the packet end
 577                    then align the next start on an eight byte boundary */
 578                 if (len < left) {
 579                         len &= ~7;
 580                 }
 581                 /*
 582                  *      Allocate buffer.
 583                  */
 584
 585                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 586                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 587                         err = -ENOMEM;
 588                         goto fail;
 589                 }
 590
 591                 /*
 592                  *      Set up data on packet
 593                  */
 594
 595                 ip_copy_metadata(skb2, skb);
 596                 skb_reserve(skb2, ll_rs);
 597                 skb_put(skb2, len + hlen);
 598                 skb_reset_network_header(skb2);
 599                 skb2->transport_header = skb2->network_header + hlen;
 600
 601                 /*
 602                  *      Charge the memory for the fragment to any owner
 603                  *      it might possess
 604                  */
 605
 606                 if (skb->sk)
 607                         skb_set_owner_w(skb2, skb->sk);
 608
 609                 /*
 610                  *      Copy the packet header into the new buffer.
 611                  */
 612
 613                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 614
 615                 /*
 616                  *      Copy a block of the IP datagram.
 617                  */
 618                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 619                         BUG();
 620                 left -= len;
 621
 622                 /*
 623                  *      Fill in the new header fields.
 624                  */
 625                 iph = ip_hdr(skb2);
 626                 iph->frag_off = htons((offset >> 3));
 627
 628                 /* ANK: dirty, but effective trick. Upgrade options only if
 629                  * the segment to be fragmented was THE FIRST (otherwise,
 630                  * options are already fixed) and make it ONCE
 631                  * on the initial skb, so that all the following fragments
 632                  * will inherit fixed options.
 633                  */
 634                 if (offset == 0)
 635                         ip_options_fragment(skb);
 636
 637                 /*
 638                  *      Added AC : If we are fragmenting a fragment that's not the
 639                  *                 last fragment then keep MF on each bit
 640                  */
 641                 if (left > 0 || not_last_frag)
 642                         iph->frag_off |= htons(IP_MF);
 643                 ptr += len;
 644                 offset += len;
 645
 646                 /*
 647                  *      Put this fragment into the sending queue.
 648                  */
 649                 iph->tot_len = htons(len + hlen);
 650
 651                 ip_send_check(iph);
 652
 653                 err = output(skb2);
 654                 if (err)
 655                         goto fail;
 656
 657                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 658         }
 659         kfree_skb(skb);
 660         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 661         return err;
 662
 663 fail:
 664         kfree_skb(skb);
 665         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 666         return err;
 667 }
 668
 669 EXPORT_SYMBOL(ip_fragment);
 670
 671 int
 672 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 673 {
 674         struct iovec *iov = from;
 675
 676         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 677                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 678                         return -EFAULT;
 679         } else {
 680                 __wsum csum = 0;
 681                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 682                         return -EFAULT;
 683                 skb->csum = csum_block_add(skb->csum, csum, odd);
 684         }
 685         return 0;
 686 }
 687
 688 static inline __wsum
 689 csum_page(struct page *page, int offset, int copy)
 690 {
 691         char *kaddr;
 692         __wsum csum;
 693         kaddr = kmap(page);
 694         csum = csum_partial(kaddr + offset, copy, 0);
 695         kunmap(page);
 696         return csum;
 697 }
 698
 699 static inline int ip_ufo_append_data(struct sock *sk,
 700                         int getfrag(void *from, char *to, int offset, int len,
 701                                int odd, struct sk_buff *skb),
 702                         void *from, int length, int hh_len, int fragheaderlen,
 703                         int transhdrlen, int mtu,unsigned int flags)
 704 {
 705         struct sk_buff *skb;
 706         int err;
 707
 708         /* There is support for UDP fragmentation offload by network
 709          * device, so create one single skb packet containing complete
 710          * udp datagram
 711          */
 712         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 713                 skb = sock_alloc_send_skb(sk,
 714                         hh_len + fragheaderlen + transhdrlen + 20,
 715                         (flags & MSG_DONTWAIT), &err);
 716
 717                 if (skb == NULL)
 718                         return err;
 719
 720                 /* reserve space for Hardware header */
 721                 skb_reserve(skb, hh_len);
 722
 723                 /* create space for UDP/IP header */
 724                 skb_put(skb,fragheaderlen + transhdrlen);
 725
 726                 /* initialize network header pointer */
 727                 skb_reset_network_header(skb);
 728
 729                 /* initialize protocol header pointer */
 730                 skb->transport_header = skb->network_header + fragheaderlen;
 731
 732                 skb->ip_summed = CHECKSUM_PARTIAL;
 733                 skb->csum = 0;
 734                 sk->sk_sndmsg_off = 0;
 735         }
 736
 737         err = skb_append_datato_frags(sk,skb, getfrag, from,
 738                                (length - transhdrlen));
 739         if (!err) {
 740                 /* specify the length of each IP datagram fragment*/
 741                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 742                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 743                 __skb_queue_tail(&sk->sk_write_queue, skb);
 744
 745                 return 0;
 746         }
 747         /* There is not enough support do UFO ,
 748          * so follow normal path
 749          */
 750         kfree_skb(skb);
 751         return err;
 752 }
 753
/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece is held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Not only UDP but also other transport protocols - e.g. raw sockets -
 *      can potentially use this interface.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
 765 int ip_append_data(struct sock *sk,
 766                    int getfrag(void *from, char *to, int offset, int len,
 767                                int odd, struct sk_buff *skb),
 768                    void *from, int length, int transhdrlen,
 769                    struct ipcm_cookie *ipc, struct rtable *rt,
 770                    unsigned int flags)
 771 {
 772         struct inet_sock *inet = inet_sk(sk);
 773         struct sk_buff *skb;
 774
 775         struct ip_options *opt = NULL;
 776         int hh_len;
 777         int exthdrlen;
 778         int mtu;
 779         int copy;
 780         int err;
 781         int offset = 0;
 782         unsigned int maxfraglen, fragheaderlen;
 783         int csummode = CHECKSUM_NONE;
 784
 785         if (flags&MSG_PROBE)
 786                 return 0;
 787
 788         if (skb_queue_empty(&sk->sk_write_queue)) {
 789                 /*
 790                  * setup for corking.
 791                  */
 792                 opt = ipc->opt;
 793                 if (opt) {
 794                         if (inet->cork.opt == NULL) {
 795                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 796                                 if (unlikely(inet->cork.opt == NULL))
 797                                         return -ENOBUFS;
 798                         }
 799                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 800                         inet->cork.flags |= IPCORK_OPT;
 801                         inet->cork.addr = ipc->addr;
 802                 }
 803                 dst_hold(&rt->u.dst);
 804                 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 805                                             rt->u.dst.dev->mtu :
 806                                             dst_mtu(rt->u.dst.path);
 807                 inet->cork.rt = rt;
 808                 inet->cork.length = 0;
 809                 sk->sk_sndmsg_page = NULL;
 810                 sk->sk_sndmsg_off = 0;
 811                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 812                         length += exthdrlen;
 813                         transhdrlen += exthdrlen;
 814                 }
 815         } else {
 816                 rt = inet->cork.rt;
 817                 if (inet->cork.flags & IPCORK_OPT)
 818                         opt = inet->cork.opt;
 819
 820                 transhdrlen = 0;
 821                 exthdrlen = 0;
 822                 mtu = inet->cork.fragsize;
 823         }
 824         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 825
 826         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 827         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 828
 829         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 830                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 831                 return -EMSGSIZE;
 832         }
 833
 834         /*
 835          * transhdrlen > 0 means that this is the first fragment and we wish
 836          * it won't be fragmented in the future.
 837          */
 838         if (transhdrlen &&
 839             length + fragheaderlen <= mtu &&
 840             rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 841             !exthdrlen)
 842                 csummode = CHECKSUM_PARTIAL;
 843
 844         inet->cork.length += length;
 845         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 846                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 847
 848                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 849                                          fragheaderlen, transhdrlen, mtu,
 850                                          flags);
 851                 if (err)
 852                         goto error;
 853                 return 0;
 854         }
 855
 856         /* So, what's going on in the loop below?
 857          *
 858          * We use calculated fragment length to generate chained skb,
 859          * each of segments is IP fragment ready for sending to network after
 860          * adding appropriate IP header.
 861          */
 862
 863         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 864                 goto alloc_new_skb;
 865
 866         while (length > 0) {
 867                 /* Check if the remaining data fits into current packet. */
 868                 copy = mtu - skb->len;
 869                 if (copy < length)
 870                         copy = maxfraglen - skb->len;
 871                 if (copy <= 0) {
 872                         char *data;
 873                         unsigned int datalen;
 874                         unsigned int fraglen;
 875                         unsigned int fraggap;
 876                         unsigned int alloclen;
 877                         struct sk_buff *skb_prev;
 878 alloc_new_skb:
 879                         skb_prev = skb;
 880                         if (skb_prev)
 881                                 fraggap = skb_prev->len - maxfraglen;
 882                         else
 883                                 fraggap = 0;
 884
 885                         /*
 886                          * If remaining data exceeds the mtu,
 887                          * we know we need more fragment(s).
 888                          */
 889                         datalen = length + fraggap;
 890                         if (datalen > mtu - fragheaderlen)
 891                                 datalen = maxfraglen - fragheaderlen;
 892                         fraglen = datalen + fragheaderlen;
 893
 894                         if ((flags & MSG_MORE) &&
 895                             !(rt->u.dst.dev->features&NETIF_F_SG))
 896                                 alloclen = mtu;
 897                         else
 898                                 alloclen = datalen + fragheaderlen;
 899
 900                         /* The last fragment gets additional space at tail.
 901                          * Note, with MSG_MORE we overallocate on fragments,
 902                          * because we have no idea what fragment will be
 903                          * the last.
 904                          */
 905                         if (datalen == length + fraggap)
 906                                 alloclen += rt->u.dst.trailer_len;
 907
 908                         if (transhdrlen) {
 909                                 skb = sock_alloc_send_skb(sk,
 910                                                 alloclen + hh_len + 15,
 911                                                 (flags & MSG_DONTWAIT), &err);
 912                         } else {
 913                                 skb = NULL;
 914                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 915                                     2 * sk->sk_sndbuf)
 916                                         skb = sock_wmalloc(sk,
 917                                                            alloclen + hh_len + 15, 1,
 918                                                            sk->sk_allocation);
 919                                 if (unlikely(skb == NULL))
 920                                         err = -ENOBUFS;
 921                         }
 922                         if (skb == NULL)
 923                                 goto error;
 924
 925                         /*
 926                          *      Fill in the control structures
 927                          */
 928                         skb->ip_summed = csummode;
 929                         skb->csum = 0;
 930                         skb_reserve(skb, hh_len);
 931
 932                         /*
 933                          *      Find where to start putting bytes.
 934                          */
 935                         data = skb_put(skb, fraglen);
 936                         skb_set_network_header(skb, exthdrlen);
 937                         skb->transport_header = (skb->network_header +
 938                                                  fragheaderlen);
 939                         data += fragheaderlen;
 940
 941                         if (fraggap) {
 942                                 skb->csum = skb_copy_and_csum_bits(
 943                                         skb_prev, maxfraglen,
 944                                         data + transhdrlen, fraggap, 0);
 945                                 skb_prev->csum = csum_sub(skb_prev->csum,
 946                                                           skb->csum);
 947                                 data += fraggap;
 948                                 pskb_trim_unique(skb_prev, maxfraglen);
 949                         }
 950
 951                         copy = datalen - transhdrlen - fraggap;
 952                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 953                                 err = -EFAULT;
 954                                 kfree_skb(skb);
 955                                 goto error;
 956                         }
 957
 958                         offset += copy;
 959                         length -= datalen - fraggap;
 960                         transhdrlen = 0;
 961                         exthdrlen = 0;
 962                         csummode = CHECKSUM_NONE;
 963
 964                         /*
 965                          * Put the packet on the pending queue.
 966                          */
 967                         __skb_queue_tail(&sk->sk_write_queue, skb);
 968                         continue;
 969                 }
 970
 971                 if (copy > length)
 972                         copy = length;
 973
 974                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 975                         unsigned int off;
 976
 977                         off = skb->len;
 978                         if (getfrag(from, skb_put(skb, copy),
 979                                         offset, copy, off, skb) < 0) {
 980                                 __skb_trim(skb, off);
 981                                 err = -EFAULT;
 982                                 goto error;
 983                         }
 984                 } else {
 985                         int i = skb_shinfo(skb)->nr_frags;
 986                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 987                         struct page *page = sk->sk_sndmsg_page;
 988                         int off = sk->sk_sndmsg_off;
 989                         unsigned int left;
 990
 991                         if (page && (left = PAGE_SIZE - off) > 0) {
 992                                 if (copy >= left)
 993                                         copy = left;
 994                                 if (page != frag->page) {
 995                                         if (i == MAX_SKB_FRAGS) {
 996                                                 err = -EMSGSIZE;
 997                                                 goto error;
 998                                         }
 999                                         get_page(page);
1000                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1001                                         frag = &skb_shinfo(skb)->frags[i];
1002                                 }
1003                         } else if (i < MAX_SKB_FRAGS) {
1004                                 if (copy > PAGE_SIZE)
1005                                         copy = PAGE_SIZE;
1006                                 page = alloc_pages(sk->sk_allocation, 0);
1007                                 if (page == NULL)  {
1008                                         err = -ENOMEM;
1009                                         goto error;
1010                                 }
1011                                 sk->sk_sndmsg_page = page;
1012                                 sk->sk_sndmsg_off = 0;
1013
1014                                 skb_fill_page_desc(skb, i, page, 0, 0);
1015                                 frag = &skb_shinfo(skb)->frags[i];
1016                                 skb->truesize += PAGE_SIZE;
1017                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1018                         } else {
1019                                 err = -EMSGSIZE;
1020                                 goto error;
1021                         }
1022                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1023                                 err = -EFAULT;
1024                                 goto error;
1025                         }
1026                         sk->sk_sndmsg_off += copy;
1027                         frag->size += copy;
1028                         skb->len += copy;
1029                         skb->data_len += copy;
1030                 }
1031                 offset += copy;
1032                 length -= copy;
1033         }
1034
1035         return 0;
1036
1037 error:
1038         inet->cork.length -= length;
1039         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1040         return err;
1041 }
1042
     /*
      *      Append 'size' bytes of 'page', starting at 'offset', to the
      *      datagram that is already pending on sk->sk_write_queue. The
      *      data is not copied: the page is referenced with get_page() and
      *      attached to an skb as a page fragment (or merged into the last
      *      fragment when skb_can_coalesce() allows it).
      *
      *      A datagram must already have been started: an empty write queue
      *      yields -EINVAL. Route, options and MTU are taken from the cork
      *      state in inet_sk(sk)->cork.
      *
      *      Returns 0 on success (not a byte count, despite the ssize_t
      *      return type), or
      *        -EPERM       if inet->hdrincl is set,
      *        -EINVAL      if nothing is queued,
      *        -EOPNOTSUPP  if the output device lacks NETIF_F_SG,
      *        -EMSGSIZE    if the datagram would exceed 0xFFFF bytes or the
      *                     skb has no free fragment slot,
      *        -ENOBUFS     if a new skb cannot be allocated.
      *      With MSG_PROBE in 'flags' it returns 0 without doing anything.
      */
1043 ssize_t ip_append_page(struct sock *sk, struct page *page,
1044                        int offset, size_t size, int flags)
1045 {
1046         struct inet_sock *inet = inet_sk(sk);
1047         struct sk_buff *skb;
1048         struct rtable *rt;
1049         struct ip_options *opt = NULL;
1050         int hh_len;
1051         int mtu;
1052         int len;
1053         int err;
1054         unsigned int maxfraglen, fragheaderlen, fraggap;
1055
1056         if (inet->hdrincl)
1057                 return -EPERM;
1058
1059         if (flags&MSG_PROBE)
1060                 return 0;
1061
1062         if (skb_queue_empty(&sk->sk_write_queue))
1063                 return -EINVAL;
1064
1065         rt = inet->cork.rt;
1066         if (inet->cork.flags & IPCORK_OPT)
1067                 opt = inet->cork.opt;
1068
1069         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1070                 return -EOPNOTSUPP;
1071
1072         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1073         mtu = inet->cork.fragsize;
1074
             /*
              * Same fragment geometry as in ip_append_data(): per-fragment
              * header size, and the largest fragment length whose payload
              * is a multiple of 8 bytes.
              */
1075         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1076         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1077
1078         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1079                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1080                 return -EMSGSIZE;
1081         }
1082
             /*
              * The queue was found non-empty above, so this check looks
              * purely defensive.
              */
1083         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1084                 return -EINVAL;
1085
             /*
              * Account the data up front; the error path subtracts whatever
              * was not attached.
              */
1086         inet->cork.length += size;
             /*
              * UDP on a UFO-capable device: mark the tail skb for
              * segmentation offload so that all of the data can go into it.
              * NOTE(review): this is done regardless of whether the
              * datagram will actually exceed the mtu - confirm intended.
              */
1087         if ((sk->sk_protocol == IPPROTO_UDP) &&
1088             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1089                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1090                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1091         }
1092
1093
1094         while (size > 0) {
1095                 int i;
1096
                     /*
                      * A GSO skb takes everything that is left; otherwise
                      * compute the room in the tail skb as ip_append_data()
                      * does.
                      */
1097                 if (skb_is_gso(skb))
1098                         len = size;
1099                 else {
1100
1101                         /* Check if the remaining data fits into current packet. */
1102                         len = mtu - skb->len;
1103                         if (len < size)
1104                                 len = maxfraglen - skb->len;
1105                 }
1106                 if (len <= 0) {
                             /*
                              * Tail skb is full: queue a new skb that holds
                              * only the fragment header area plus the bytes
                              * by which the previous skb exceeds maxfraglen.
                              * Page data is attached on the next iteration.
                              */
1107                         struct sk_buff *skb_prev;
1108                         int alloclen;
1109
1110                         skb_prev = skb;
1111                         fraggap = skb_prev->len - maxfraglen;
1112
1113                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1114                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1115                         if (unlikely(!skb)) {
1116                                 err = -ENOBUFS;
1117                                 goto error;
1118                         }
1119
1120                         /*
1121                          *      Fill in the control structures
1122                          */
1123                         skb->ip_summed = CHECKSUM_NONE;
1124                         skb->csum = 0;
1125                         skb_reserve(skb, hh_len);
1126
1127                         /*
1128                          *      Find where to start putting bytes.
1129                          */
1130                         skb_put(skb, fragheaderlen + fraggap);
1131                         skb_reset_network_header(skb);
1132                         skb->transport_header = (skb->network_header +
1133                                                  fragheaderlen);
                             /*
                              * Move the overhanging bytes of the previous skb
                              * behind the new header area, transfer their
                              * checksum contribution, and trim the previous
                              * skb back to maxfraglen.
                              */
1134                         if (fraggap) {
1135                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1136                                                                    maxfraglen,
1137                                                     skb_transport_header(skb),
1138                                                                    fraggap, 0);
1139                                 skb_prev->csum = csum_sub(skb_prev->csum,
1140                                                           skb->csum);
1141                                 pskb_trim_unique(skb_prev, maxfraglen);
1142                         }
1143
1144                         /*
1145                          * Put the packet on the pending queue.
1146                          */
1147                         __skb_queue_tail(&sk->sk_write_queue, skb);
1148                         continue;
1149                 }
1150
1151                 i = skb_shinfo(skb)->nr_frags;
1152                 if (len > size)
1153                         len = size;
                     /*
                      * Extend the last fragment when the new data can be
                      * coalesced with it; otherwise take a page reference
                      * and use a new fragment slot, if one is free.
                      */
1154                 if (skb_can_coalesce(skb, i, page, offset)) {
1155                         skb_shinfo(skb)->frags[i-1].size += len;
1156                 } else if (i < MAX_SKB_FRAGS) {
1157                         get_page(page);
1158                         skb_fill_page_desc(skb, i, page, offset, len);
1159                 } else {
1160                         err = -EMSGSIZE;
1161                         goto error;
1162                 }
1163
                     /*
                      * Software checksum: fold the checksum of the attached
                      * page data into skb->csum at position skb->len.
                      */
1164                 if (skb->ip_summed == CHECKSUM_NONE) {
1165                         __wsum csum;
1166                         csum = csum_page(page, offset, len);
1167                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1168                 }
1169
1170                 skb->len += len;
1171                 skb->data_len += len;
1172                 offset += len;
1173                 size -= len;
1174         }
1175         return 0;
1176
1177 error:
             /* 'size' now holds the bytes that were not attached. */
1178         inet->cork.length -= size;
1179         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1180         return err;
1181 }
1182
1183 /*
1184  *      Combine all pending IP fragments on the socket into one IP datagram
1185  *      and push them out.
      *
      *      The first queued skb becomes the head; every further skb is
      *      chained onto the head's frag_list. The IP header is then built in
      *      the head skb from the cork state and socket settings, and the
      *      datagram is passed through the NF_IP_LOCAL_OUT hook to
      *      dst_output().
      *
      *      The cork state (options and route reference) is always released
      *      before returning, even when nothing was queued.
      *
      *      Returns 0 on success or when the queue was empty, otherwise a
      *      negative error. A positive result from the output path is mapped
      *      through net_xmit_errno() only if inet->recverr is set, and is
      *      otherwise treated as success.
1186  */
1187 int ip_push_pending_frames(struct sock *sk)
1188 {
1189         struct sk_buff *skb, *tmp_skb;
1190         struct sk_buff **tail_skb;
1191         struct inet_sock *inet = inet_sk(sk);
1192         struct ip_options *opt = NULL;
1193         struct rtable *rt = inet->cork.rt;
1194         struct iphdr *iph;
1195         __be16 df = 0;
1196         __u8 ttl;
1197         int err = 0;
1198
1199         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1200                 goto out;
1201         tail_skb = &(skb_shinfo(skb)->frag_list);
1202
1203         /* move skb->data to ip header from ext header */
1204         if (skb->data < skb_network_header(skb))
1205                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Chain the remaining skbs onto the head: strip the header area
              * from each one (its length is taken from the head skb), add
              * its length and truesize to the head, and detach it from the
              * socket so that only the head skb stays owned by it.
              */
1206         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1207                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1208                 *tail_skb = tmp_skb;
1209                 tail_skb = &(tmp_skb->next);
1210                 skb->len += tmp_skb->len;
1211                 skb->data_len += tmp_skb->len;
1212                 skb->truesize += tmp_skb->truesize;
1213                 __sock_put(tmp_skb->sk);
1214                 tmp_skb->destructor = NULL;
1215                 tmp_skb->sk = NULL;
1216         }
1217
1218         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1219          * to fragment the frame generated here. No matter how transforms
1220          * change the size of the packet, it will come out.
1221          */
1222         if (inet->pmtudisc < IP_PMTUDISC_DO)
1223                 skb->local_df = 1;
1224
1225         /* DF bit is set when we want to see DF on outgoing frames.
1226          * If local_df is set too, we still allow to fragment this frame
1227          * locally. */
1228         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1229             (skb->len <= dst_mtu(&rt->u.dst) &&
1230              ip_dont_fragment(sk, &rt->u.dst)))
1231                 df = htons(IP_DF);
1232
1233         if (inet->cork.flags & IPCORK_OPT)
1234                 opt = inet->cork.opt;
1235
1236         if (rt->rt_type == RTN_MULTICAST)
1237                 ttl = inet->mc_ttl;
1238         else
1239                 ttl = ip_select_ttl(inet, &rt->u.dst);
1240
             /*
              * Build the IP header in place at the start of the head skb;
              * ihl starts at 5 words and grows by the option length.
              */
1241         iph = (struct iphdr *)skb->data;
1242         iph->version = 4;
1243         iph->ihl = 5;
1244         if (opt) {
1245                 iph->ihl += opt->optlen>>2;
1246                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1247         }
1248         iph->tos = inet->tos;
1249         iph->tot_len = htons(skb->len);
1250         iph->frag_off = df;
1251         ip_select_ident(iph, &rt->u.dst, sk);
1252         iph->ttl = ttl;
1253         iph->protocol = sk->sk_protocol;
1254         iph->saddr = rt->rt_src;
1255         iph->daddr = rt->rt_dst;
1256         ip_send_check(iph);
1257
1258         skb->priority = sk->sk_priority;
             /* The skb gets its own dst reference; the cork's is dropped at out. */
1259         skb->dst = dst_clone(&rt->u.dst);
1260
1261         /* Netfilter gets the whole, not yet fragmented skb. */
1262         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1263                       skb->dst->dev, dst_output);
1264         if (err) {
                     /*
                      * Positive results are reported only when the socket
                      * asked for errors (recverr); negative ones always.
                      */
1265                 if (err > 0)
1266                         err = inet->recverr ? net_xmit_errno(err) : 0;
1267                 if (err)
1268                         goto error;
1269         }
1270
1271 out:
             /* Release the cork state: saved options and the route reference. */
1272         inet->cork.flags &= ~IPCORK_OPT;
1273         kfree(inet->cork.opt);
1274         inet->cork.opt = NULL;
1275         if (inet->cork.rt) {
1276                 ip_rt_put(inet->cork.rt);
1277                 inet->cork.rt = NULL;
1278         }
1279         return err;
1280
1281 error:
1282         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1283         goto out;
1284 }
1285
1286 /*
1287  *      Throw away all pending data on the socket.
      *
      *      Every skb on sk->sk_write_queue is dequeued (from the tail) and
      *      freed, then the cork state is released in the same way as at the
      *      end of ip_push_pending_frames(): the IPCORK_OPT flag is cleared,
      *      the saved options are freed and the route reference is dropped.
1288  */
1289 void ip_flush_pending_frames(struct sock *sk)
1290 {
1291         struct inet_sock *inet = inet_sk(sk);
1292         struct sk_buff *skb;
1293
1294         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1295                 kfree_skb(skb);
1296
1297         inet->cork.flags &= ~IPCORK_OPT;
1298         kfree(inet->cork.opt);
1299         inet->cork.opt = NULL;
1300         if (inet->cork.rt) {
1301                 ip_rt_put(inet->cork.rt);
1302                 inet->cork.rt = NULL;
1303         }
1304 }
1305
1306
1307 /*
1308  *      Fetch data from kernel space and fill in checksum if needed.
      *
      *      getfrag callback used by ip_send_reply(): copies 'len' bytes from
      *      dptr + offset to 'to' with csum_partial_copy_nocheck() and folds
      *      the resulting checksum into skb->csum, with 'odd' passed as the
      *      position argument of csum_block_add(). Always returns 0.
1309  */
1310 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1311                               int len, int odd, struct sk_buff *skb)
1312 {
1313         __wsum csum;
1314
1315         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1316         skb->csum = csum_block_add(skb->csum, csum, odd);
1317         return 0;
1318 }
1319
1320 /*
1321  *      Generic function to send a packet as reply to another packet.
1322  *      Used to send TCP resets so far. ICMP should use this function too.
1323  *
1324  *      Should run single threaded per socket because it uses the sock
1325  *      structure to pass arguments.
1326  *
1327  *      LATER: switch from ip_build_xmit to ip_append_*
      *      NOTE(review): the body already uses ip_append_data(), so the
      *      LATER note above looks stale.
      *
      *      Parameters:
      *        sk   - socket used to build and send the reply; its tos,
      *               sk_priority and sk_protocol are overwritten from the
      *               packet being answered.
      *        skb  - the packet being answered; supplies the route, the IP
      *               options to echo, the ports, tos, priority and protocol.
      *        arg  - reply description: arg->iov->iov_base is the payload,
      *               arg->csumoffset (if >= 0) is the position of the
      *               checksum field, counted in 16-bit units from the
      *               transport header, and arg->csum is added to the
      *               payload checksum before folding.
      *        len  - number of payload bytes to send.
      *
      *      Nothing is returned: the function gives up silently when the
      *      options cannot be echoed or no output route is found, and the
      *      results of ip_append_data() and ip_push_pending_frames() are
      *      not propagated.
1328  */
1329 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1330                    unsigned int len)
1331 {
1332         struct inet_sock *inet = inet_sk(sk);
             /* struct ip_options followed by room for 40 bytes of option data. */
1333         struct {
1334                 struct ip_options       opt;
1335                 char                    data[40];
1336         } replyopts;
1337         struct ipcm_cookie ipc;
1338         __be32 daddr;
             /* Route of the received packet; replaced by the output route below. */
1339         struct rtable *rt = (struct rtable*)skb->dst;
1340
1341         if (ip_options_echo(&replyopts.opt, skb))
1342                 return;
1343
             /* Reply to the source of the received packet by default. */
1344         daddr = ipc.addr = rt->rt_src;
1345         ipc.opt = NULL;
1346
1347         if (replyopts.opt.optlen) {
1348                 ipc.opt = &replyopts.opt;
1349
                     /* With a source route, route towards its first hop instead. */
1350                 if (ipc.opt->srr)
1351                         daddr = replyopts.opt.faddr;
1352         }
1353
1354         {
                     /*
                      * Flow key for the reply: ports are taken from the
                      * received packet's TCP header, swapped.
                      */
1355                 struct flowi fl = { .nl_u = { .ip4_u =
1356                                               { .daddr = daddr,
1357                                                 .saddr = rt->rt_spec_dst,
1358                                                 .tos = RT_TOS(ip_hdr(skb)->tos) } },
1359                                     /* Not quite clean, but right. */
1360                                     .uli_u = { .ports =
1361                                                { .sport = tcp_hdr(skb)->dest,
1362                                                  .dport = tcp_hdr(skb)->source } },
1363                                     .proto = sk->sk_protocol };
1364                 security_skb_classify_flow(skb, &fl);
1365                 if (ip_route_output_key(&rt, &fl))
1366                         return;
1367         }
1368
1369         /* And let IP do all the hard work.
1370
1371            This chunk is not reenterable, hence spinlock.
1372            Note that it uses the fact, that this function is called
1373            with locally disabled BH and that sk cannot be already spinlocked.
1374          */
1375         bh_lock_sock(sk);
1376         inet->tos = ip_hdr(skb)->tos;
1377         sk->sk_priority = skb->priority;
1378         sk->sk_protocol = ip_hdr(skb)->protocol;
             /*
              * The return value is not checked; whether anything was queued
              * is determined by peeking at the write queue afterwards.
              */
1379         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1380                        &ipc, rt, MSG_DONTWAIT);
1381         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                     /*
                      * Store the final checksum (payload checksum from
                      * ip_reply_glue_bits() plus arg->csum) in the transport
                      * header and mark it as computed in software.
                      */
1382                 if (arg->csumoffset >= 0)
1383                         *((__sum16 *)skb_transport_header(skb) +
1384                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1385                                                                 arg->csum));
1386                 skb->ip_summed = CHECKSUM_NONE;
1387                 ip_push_pending_frames(sk);
1388         }
1389
1390         bh_unlock_sock(sk);
1391
             /* Drop the reference on the output route obtained above. */
1392         ip_rt_put(rt);
1393 }
1394
     /*
      *      IP initialisation: sets up routing via ip_rt_init() and the
      *      peer storage via inet_initpeers(). The IGMP multicast proc
      *      entries are initialised only when both CONFIG_IP_MULTICAST and
      *      CONFIG_PROC_FS are enabled.
      */
1395 void __init ip_init(void)
1396 {
1397         ip_rt_init();
1398         inet_initpeers();
1399
1400 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1401         igmp_mc_proc_init();
1402 #endif
1403 }
1404
     /* Symbols exported from this file (presumably for use by modules). */
1405 EXPORT_SYMBOL(ip_generic_getfrag);
1406 EXPORT_SYMBOL(ip_queue_xmit);
1407 EXPORT_SYMBOL(ip_send_check);