net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readibility.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/mm.h>
  53 #include <linux/string.h>
  54 #include <linux/errno.h>
  55 #include <linux/highmem.h>
  56
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/in.h>
  60 #include <linux/inet.h>
  61 #include <linux/netdevice.h>
  62 #include <linux/etherdevice.h>
  63 #include <linux/proc_fs.h>
  64 #include <linux/stat.h>
  65 #include <linux/init.h>
  66
  67 #include <net/snmp.h>
  68 #include <net/ip.h>
  69 #include <net/protocol.h>
  70 #include <net/route.h>
  71 #include <net/xfrm.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <net/checksum.h>
  79 #include <linux/igmp.h>
  80 #include <linux/netfilter_ipv4.h>
  81 #include <linux/netfilter_bridge.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netlink.h>
  84 #include <linux/tcp.h>
  85
  86 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         skb_reset_mac_header(newskb);
  99         __skb_pull(newskb, skb_network_offset(newskb));
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103         netif_rx(newskb);
 104         return 0;
 105 }
 106
 107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 108 {
 109         int ttl = inet->uc_ttl;
 110
 111         if (ttl < 0)
 112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 113         return ttl;
 114 }
 115
 116 /*
 117  *              Add an ip header to a skbuff and send it out.
 118  *
 119  */
 120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 121                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 122 {
 123         struct inet_sock *inet = inet_sk(sk);
 124         struct rtable *rt = (struct rtable *)skb->dst;
 125         struct iphdr *iph;
 126
 127         /* Build the IP header. */
 128         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 129         skb_reset_network_header(skb);
 130         iph = ip_hdr(skb);
 131         iph->version  = 4;
 132         iph->ihl      = 5;
 133         iph->tos      = inet->tos;
 134         if (ip_dont_fragment(sk, &rt->u.dst))
 135                 iph->frag_off = htons(IP_DF);
 136         else
 137                 iph->frag_off = 0;
 138         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 139         iph->daddr    = rt->rt_dst;
 140         iph->saddr    = rt->rt_src;
 141         iph->protocol = sk->sk_protocol;
 142         iph->tot_len  = htons(skb->len);
 143         ip_select_ident(iph, &rt->u.dst, sk);
 144
 145         if (opt && opt->optlen) {
 146                 iph->ihl += opt->optlen>>2;
 147                 ip_options_build(skb, opt, daddr, rt, 0);
 148         }
 149         ip_send_check(iph);
 150
 151         skb->priority = sk->sk_priority;
 152
 153         /* Send it out. */
 154         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 155                        dst_output);
 156 }
 157
 158 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 159
 160 static inline int ip_finish_output2(struct sk_buff *skb)
 161 {
 162         struct dst_entry *dst = skb->dst;
 163         struct net_device *dev = dst->dev;
 164         int hh_len = LL_RESERVED_SPACE(dev);
 165
 166         /* Be paranoid, rather than too clever. */
 167         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 168                 struct sk_buff *skb2;
 169
 170                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 171                 if (skb2 == NULL) {
 172                         kfree_skb(skb);
 173                         return -ENOMEM;
 174                 }
 175                 if (skb->sk)
 176                         skb_set_owner_w(skb2, skb->sk);
 177                 kfree_skb(skb);
 178                 skb = skb2;
 179         }
 180
 181         if (dst->hh)
 182                 return neigh_hh_output(dst->hh, skb);
 183         else if (dst->neighbour)
 184                 return dst->neighbour->output(skb);
 185
 186         if (net_ratelimit())
 187                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 188         kfree_skb(skb);
 189         return -EINVAL;
 190 }
 191
 192 static inline int ip_finish_output(struct sk_buff *skb)
 193 {
 194 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 195         /* Policy lookup after SNAT yielded a new policy */
 196         if (skb->dst->xfrm != NULL) {
 197                 IPCB(skb)->flags |= IPSKB_REROUTED;
 198                 return dst_output(skb);
 199         }
 200 #endif
 201         if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
 202                 return ip_fragment(skb, ip_finish_output2);
 203         else
 204                 return ip_finish_output2(skb);
 205 }
 206
 207 int ip_mc_output(struct sk_buff *skb)
 208 {
 209         struct sock *sk = skb->sk;
 210         struct rtable *rt = (struct rtable*)skb->dst;
 211         struct net_device *dev = rt->u.dst.dev;
 212
 213         /*
 214          *      If the indicated interface is up and running, send the packet.
 215          */
 216         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 217
 218         skb->dev = dev;
 219         skb->protocol = htons(ETH_P_IP);
 220
 221         /*
 222          *      Multicasts are looped back for other local users
 223          */
 224
 225         if (rt->rt_flags&RTCF_MULTICAST) {
 226                 if ((!sk || inet_sk(sk)->mc_loop)
 227 #ifdef CONFIG_IP_MROUTE
 228                 /* Small optimization: do not loopback not local frames,
 229                    which returned after forwarding; they will be  dropped
 230                    by ip_mr_input in any case.
 231                    Note, that local frames are looped back to be delivered
 232                    to local recipients.
 233
 234                    This check is duplicated in ip_mr_input at the moment.
 235                  */
 236                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 237 #endif
 238                 ) {
 239                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 240                         if (newskb)
 241                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 242                                         newskb->dev,
 243                                         ip_dev_loopback_xmit);
 244                 }
 245
 246                 /* Multicasts with ttl 0 must not go beyond the host */
 247
 248                 if (ip_hdr(skb)->ttl == 0) {
 249                         kfree_skb(skb);
 250                         return 0;
 251                 }
 252         }
 253
 254         if (rt->rt_flags&RTCF_BROADCAST) {
 255                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 256                 if (newskb)
 257                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 258                                 newskb->dev, ip_dev_loopback_xmit);
 259         }
 260
 261         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 262                             ip_finish_output,
 263                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 264 }
 265
 266 int ip_output(struct sk_buff *skb)
 267 {
 268         struct net_device *dev = skb->dst->dev;
 269
 270         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 271
 272         skb->dev = dev;
 273         skb->protocol = htons(ETH_P_IP);
 274
 275         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 276                             ip_finish_output,
 277                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 278 }
 279
 280 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 281 {
 282         struct sock *sk = skb->sk;
 283         struct inet_sock *inet = inet_sk(sk);
 284         struct ip_options *opt = inet->opt;
 285         struct rtable *rt;
 286         struct iphdr *iph;
 287
 288         /* Skip all of this if the packet is already routed,
 289          * f.e. by something like SCTP.
 290          */
 291         rt = (struct rtable *) skb->dst;
 292         if (rt != NULL)
 293                 goto packet_routed;
 294
 295         /* Make sure we can route this packet. */
 296         rt = (struct rtable *)__sk_dst_check(sk, 0);
 297         if (rt == NULL) {
 298                 __be32 daddr;
 299
 300                 /* Use correct destination address if we have options. */
 301                 daddr = inet->daddr;
 302                 if(opt && opt->srr)
 303                         daddr = opt->faddr;
 304
 305                 {
 306                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 307                                             .nl_u = { .ip4_u =
 308                                                       { .daddr = daddr,
 309                                                         .saddr = inet->saddr,
 310                                                         .tos = RT_CONN_FLAGS(sk) } },
 311                                             .proto = sk->sk_protocol,
 312                                             .uli_u = { .ports =
 313                                                        { .sport = inet->sport,
 314                                                          .dport = inet->dport } } };
 315
 316                         /* If this fails, retransmit mechanism of transport layer will
 317                          * keep trying until route appears or the connection times
 318                          * itself out.
 319                          */
 320                         security_sk_classify_flow(sk, &fl);
 321                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 322                                 goto no_route;
 323                 }
 324                 sk_setup_caps(sk, &rt->u.dst);
 325         }
 326         skb->dst = dst_clone(&rt->u.dst);
 327
 328 packet_routed:
 329         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 330                 goto no_route;
 331
 332         /* OK, we know where to send it, allocate and build IP header. */
 333         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 334         skb_reset_network_header(skb);
 335         iph = ip_hdr(skb);
 336         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 337         iph->tot_len = htons(skb->len);
 338         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 339                 iph->frag_off = htons(IP_DF);
 340         else
 341                 iph->frag_off = 0;
 342         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 343         iph->protocol = sk->sk_protocol;
 344         iph->saddr    = rt->rt_src;
 345         iph->daddr    = rt->rt_dst;
 346         /* Transport layer set skb->h.foo itself. */
 347
 348         if (opt && opt->optlen) {
 349                 iph->ihl += opt->optlen >> 2;
 350                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 351         }
 352
 353         ip_select_ident_more(iph, &rt->u.dst, sk,
 354                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 355
 356         /* Add an IP checksum. */
 357         ip_send_check(iph);
 358
 359         skb->priority = sk->sk_priority;
 360
 361         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 362                        dst_output);
 363
 364 no_route:
 365         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 366         kfree_skb(skb);
 367         return -EHOSTUNREACH;
 368 }
 369
 370
 371 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 372 {
 373         to->pkt_type = from->pkt_type;
 374         to->priority = from->priority;
 375         to->protocol = from->protocol;
 376         dst_release(to->dst);
 377         to->dst = dst_clone(from->dst);
 378         to->dev = from->dev;
 379         to->mark = from->mark;
 380
 381         /* Copy the flags to each fragment. */
 382         IPCB(to)->flags = IPCB(from)->flags;
 383
 384 #ifdef CONFIG_NET_SCHED
 385         to->tc_index = from->tc_index;
 386 #endif
 387         nf_copy(to, from);
 388 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 389         to->ipvs_property = from->ipvs_property;
 390 #endif
 391         skb_copy_secmark(to, from);
 392 }
 393
 394 /*
 395  *      This IP datagram is too large to be sent in one piece.  Break it up into
 396  *      smaller pieces (each of size equal to IP header plus
 397  *      a block of the data of the original IP data part) that will yet fit in a
 398  *      single device frame, and queue such a frame for sending.
 399  */
 400
 401 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 402 {
 403         struct iphdr *iph;
 404         int raw = 0;
 405         int ptr;
 406         struct net_device *dev;
 407         struct sk_buff *skb2;
 408         unsigned int mtu, hlen, left, len, ll_rs, pad;
 409         int offset;
 410         __be16 not_last_frag;
 411         struct rtable *rt = (struct rtable*)skb->dst;
 412         int err = 0;
 413
 414         dev = rt->u.dst.dev;
 415
 416         /*
 417          *      Point into the IP datagram header.
 418          */
 419
 420         iph = ip_hdr(skb);
 421
 422         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 423                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 424                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 425                           htonl(dst_mtu(&rt->u.dst)));
 426                 kfree_skb(skb);
 427                 return -EMSGSIZE;
 428         }
 429
 430         /*
 431          *      Setup starting values.
 432          */
 433
 434         hlen = iph->ihl * 4;
 435         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 436         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 437
 438         /* When frag_list is given, use it. First, check its validity:
 439          * some transformers could create wrong frag_list or break existing
 440          * one, it is not prohibited. In this case fall back to copying.
 441          *
 442          * LATER: this step can be merged to real generation of fragments,
 443          * we can switch to copy when see the first bad fragment.
 444          */
 445         if (skb_shinfo(skb)->frag_list) {
 446                 struct sk_buff *frag;
 447                 int first_len = skb_pagelen(skb);
 448
 449                 if (first_len - hlen > mtu ||
 450                     ((first_len - hlen) & 7) ||
 451                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 452                     skb_cloned(skb))
 453                         goto slow_path;
 454
 455                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 456                         /* Correct geometry. */
 457                         if (frag->len > mtu ||
 458                             ((frag->len & 7) && frag->next) ||
 459                             skb_headroom(frag) < hlen)
 460                             goto slow_path;
 461
 462                         /* Partially cloned skb? */
 463                         if (skb_shared(frag))
 464                                 goto slow_path;
 465
 466                         BUG_ON(frag->sk);
 467                         if (skb->sk) {
 468                                 sock_hold(skb->sk);
 469                                 frag->sk = skb->sk;
 470                                 frag->destructor = sock_wfree;
 471                                 skb->truesize -= frag->truesize;
 472                         }
 473                 }
 474
 475                 /* Everything is OK. Generate! */
 476
 477                 err = 0;
 478                 offset = 0;
 479                 frag = skb_shinfo(skb)->frag_list;
 480                 skb_shinfo(skb)->frag_list = NULL;
 481                 skb->data_len = first_len - skb_headlen(skb);
 482                 skb->len = first_len;
 483                 iph->tot_len = htons(first_len);
 484                 iph->frag_off = htons(IP_MF);
 485                 ip_send_check(iph);
 486
 487                 for (;;) {
 488                         /* Prepare header of the next frame,
 489                          * before previous one went down. */
 490                         if (frag) {
 491                                 frag->ip_summed = CHECKSUM_NONE;
 492                                 skb_reset_transport_header(frag);
 493                                 __skb_push(frag, hlen);
 494                                 skb_reset_network_header(frag);
 495                                 memcpy(skb_network_header(frag), iph, hlen);
 496                                 iph = ip_hdr(frag);
 497                                 iph->tot_len = htons(frag->len);
 498                                 ip_copy_metadata(frag, skb);
 499                                 if (offset == 0)
 500                                         ip_options_fragment(frag);
 501                                 offset += skb->len - hlen;
 502                                 iph->frag_off = htons(offset>>3);
 503                                 if (frag->next != NULL)
 504                                         iph->frag_off |= htons(IP_MF);
 505                                 /* Ready, complete checksum */
 506                                 ip_send_check(iph);
 507                         }
 508
 509                         err = output(skb);
 510
 511                         if (!err)
 512                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 513                         if (err || !frag)
 514                                 break;
 515
 516                         skb = frag;
 517                         frag = skb->next;
 518                         skb->next = NULL;
 519                 }
 520
 521                 if (err == 0) {
 522                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 523                         return 0;
 524                 }
 525
 526                 while (frag) {
 527                         skb = frag->next;
 528                         kfree_skb(frag);
 529                         frag = skb;
 530                 }
 531                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 532                 return err;
 533         }
 534
 535 slow_path:
 536         left = skb->len - hlen;         /* Space per frame */
 537         ptr = raw + hlen;               /* Where to start from */
 538
 539         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 540          * we need to make room for the encapsulating header
 541          */
 542         pad = nf_bridge_pad(skb);
 543         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 544         mtu -= pad;
 545
 546         /*
 547          *      Fragment the datagram.
 548          */
 549
 550         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 551         not_last_frag = iph->frag_off & htons(IP_MF);
 552
 553         /*
 554          *      Keep copying data until we run out.
 555          */
 556
 557         while (left > 0) {
 558                 len = left;
 559                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 560                 if (len > mtu)
 561                         len = mtu;
 562                 /* IF: we are not sending upto and including the packet end
 563                    then align the next start on an eight byte boundary */
 564                 if (len < left) {
 565                         len &= ~7;
 566                 }
 567                 /*
 568                  *      Allocate buffer.
 569                  */
 570
 571                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 572                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 573                         err = -ENOMEM;
 574                         goto fail;
 575                 }
 576
 577                 /*
 578                  *      Set up data on packet
 579                  */
 580
 581                 ip_copy_metadata(skb2, skb);
 582                 skb_reserve(skb2, ll_rs);
 583                 skb_put(skb2, len + hlen);
 584                 skb_reset_network_header(skb2);
 585                 skb2->transport_header = skb2->network_header + hlen;
 586
 587                 /*
 588                  *      Charge the memory for the fragment to any owner
 589                  *      it might possess
 590                  */
 591
 592                 if (skb->sk)
 593                         skb_set_owner_w(skb2, skb->sk);
 594
 595                 /*
 596                  *      Copy the packet header into the new buffer.
 597                  */
 598
 599                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 600
 601                 /*
 602                  *      Copy a block of the IP datagram.
 603                  */
 604                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 605                         BUG();
 606                 left -= len;
 607
 608                 /*
 609                  *      Fill in the new header fields.
 610                  */
 611                 iph = ip_hdr(skb2);
 612                 iph->frag_off = htons((offset >> 3));
 613
 614                 /* ANK: dirty, but effective trick. Upgrade options only if
 615                  * the segment to be fragmented was THE FIRST (otherwise,
 616                  * options are already fixed) and make it ONCE
 617                  * on the initial skb, so that all the following fragments
 618                  * will inherit fixed options.
 619                  */
 620                 if (offset == 0)
 621                         ip_options_fragment(skb);
 622
 623                 /*
 624                  *      Added AC : If we are fragmenting a fragment that's not the
 625                  *                 last fragment then keep MF on each bit
 626                  */
 627                 if (left > 0 || not_last_frag)
 628                         iph->frag_off |= htons(IP_MF);
 629                 ptr += len;
 630                 offset += len;
 631
 632                 /*
 633                  *      Put this fragment into the sending queue.
 634                  */
 635                 iph->tot_len = htons(len + hlen);
 636
 637                 ip_send_check(iph);
 638
 639                 err = output(skb2);
 640                 if (err)
 641                         goto fail;
 642
 643                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 644         }
 645         kfree_skb(skb);
 646         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 647         return err;
 648
 649 fail:
 650         kfree_skb(skb);
 651         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 652         return err;
 653 }
 654
 655 EXPORT_SYMBOL(ip_fragment);
 656
 657 int
 658 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 659 {
 660         struct iovec *iov = from;
 661
 662         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 663                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 664                         return -EFAULT;
 665         } else {
 666                 __wsum csum = 0;
 667                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 668                         return -EFAULT;
 669                 skb->csum = csum_block_add(skb->csum, csum, odd);
 670         }
 671         return 0;
 672 }
 673
 674 static inline __wsum
 675 csum_page(struct page *page, int offset, int copy)
 676 {
 677         char *kaddr;
 678         __wsum csum;
 679         kaddr = kmap(page);
 680         csum = csum_partial(kaddr + offset, copy, 0);
 681         kunmap(page);
 682         return csum;
 683 }
 684
 685 static inline int ip_ufo_append_data(struct sock *sk,
 686                         int getfrag(void *from, char *to, int offset, int len,
 687                                int odd, struct sk_buff *skb),
 688                         void *from, int length, int hh_len, int fragheaderlen,
 689                         int transhdrlen, int mtu,unsigned int flags)
 690 {
 691         struct sk_buff *skb;
 692         int err;
 693
 694         /* There is support for UDP fragmentation offload by network
 695          * device, so create one single skb packet containing complete
 696          * udp datagram
 697          */
 698         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 699                 skb = sock_alloc_send_skb(sk,
 700                         hh_len + fragheaderlen + transhdrlen + 20,
 701                         (flags & MSG_DONTWAIT), &err);
 702
 703                 if (skb == NULL)
 704                         return err;
 705
 706                 /* reserve space for Hardware header */
 707                 skb_reserve(skb, hh_len);
 708
 709                 /* create space for UDP/IP header */
 710                 skb_put(skb,fragheaderlen + transhdrlen);
 711
 712                 /* initialize network header pointer */
 713                 skb_reset_network_header(skb);
 714
 715                 /* initialize protocol header pointer */
 716                 skb->transport_header = skb->network_header + fragheaderlen;
 717
 718                 skb->ip_summed = CHECKSUM_PARTIAL;
 719                 skb->csum = 0;
 720                 sk->sk_sndmsg_off = 0;
 721         }
 722
 723         err = skb_append_datato_frags(sk,skb, getfrag, from,
 724                                (length - transhdrlen));
 725         if (!err) {
 726                 /* specify the length of each IP datagram fragment*/
 727                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 728                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 729                 __skb_queue_tail(&sk->sk_write_queue, skb);
 730
 731                 return 0;
 732         }
 733         /* There is not enough support do UFO ,
 734          * so follow normal path
 735          */
 736         kfree_skb(skb);
 737         return err;
 738 }
 739
 740 /*
 741  *      ip_append_data() and ip_append_page() can make one large IP datagram
  742  *      from many pieces of data. Each piece will be held on the socket
 743  *      until ip_push_pending_frames() is called. Each piece can be a page
 744  *      or non-page data.
 745  *
 746  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 747  *      this interface potentially.
 748  *
 749  *      LATER: length must be adjusted by pad at tail, when it is required.
 750  */
 751 int ip_append_data(struct sock *sk,
 752                    int getfrag(void *from, char *to, int offset, int len,
 753                                int odd, struct sk_buff *skb),
 754                    void *from, int length, int transhdrlen,
 755                    struct ipcm_cookie *ipc, struct rtable *rt,
 756                    unsigned int flags)
 757 {
             /*
              * Summary: pull 'length' bytes through getfrag() and queue them on
              * sk->sk_write_queue as a chain of skbs sized from the route MTU.
              * Returns 0 on success (and immediately for MSG_PROBE) or a negative
              * errno.  On the error path the bytes that were not queued are
              * subtracted from inet->cork.length again.
              */
 758         struct inet_sock *inet = inet_sk(sk);
 759         struct sk_buff *skb;
 760
 761         struct ip_options *opt = NULL;
 762         int hh_len;
 763         int exthdrlen;
 764         int mtu;
 765         int copy;
 766         int err;
 767         int offset = 0;
 768         unsigned int maxfraglen, fragheaderlen;
 769         int csummode = CHECKSUM_NONE;
 770
             /* MSG_PROBE: nothing is queued, report success. */
 771         if (flags&MSG_PROBE)
 772                 return 0;
 773
 774         if (skb_queue_empty(&sk->sk_write_queue)) {
 775                 /*
 776                  * setup for corking.
 777                  */
 778                 opt = ipc->opt;
 779                 if (opt) {
                             /* Keep a private copy of the options in the cork state. */
 780                         if (inet->cork.opt == NULL) {
 781                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 782                                 if (unlikely(inet->cork.opt == NULL))
 783                                         return -ENOBUFS;
 784                         }
 785                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 786                         inet->cork.flags |= IPCORK_OPT;
 787                         inet->cork.addr = ipc->addr;
 788                 }
                     /*
                      * The cork keeps its own reference on the route and remembers
                      * the MTU used for this datagram.
                      */
 789                 dst_hold(&rt->u.dst);
 790                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 791                 inet->cork.rt = rt;
 792                 inet->cork.length = 0;
 793                 sk->sk_sndmsg_page = NULL;
 794                 sk->sk_sndmsg_off = 0;
                     /* A non-zero dst header_len is added to both length and transhdrlen. */
 795                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 796                         length += exthdrlen;
 797                         transhdrlen += exthdrlen;
 798                 }
 799         } else {
                     /*
                      * Data is already queued: use the route, options and fragment
                      * size saved in the cork state; the 'rt' and 'transhdrlen'
                      * arguments are overridden here.
                      */
 800                 rt = inet->cork.rt;
 801                 if (inet->cork.flags & IPCORK_OPT)
 802                         opt = inet->cork.opt;
 803
 804                 transhdrlen = 0;
 805                 exthdrlen = 0;
 806                 mtu = inet->cork.fragsize;
 807         }
 808         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 809
             /*
              * fragheaderlen: IP header plus options.
              * maxfraglen: header plus the largest payload that is a multiple
              * of 8 bytes and still fits in the MTU.
              */
 810         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 811         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 812
             /* Refuse to grow the datagram past 0xFFFF bytes, header included. */
 813         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 814                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 815                 return -EMSGSIZE;
 816         }
 817
 818         /*
 819          * transhdrlen > 0 means that this is the first fragment and we wish
 820          * it won't be fragmented in the future.
 821          */
 822         if (transhdrlen &&
 823             length + fragheaderlen <= mtu &&
 824             rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 825             !exthdrlen)
 826                 csummode = CHECKSUM_PARTIAL;
 827
 828         inet->cork.length += length;
             /*
              * UDP payload larger than the MTU on a NETIF_F_UFO device: the whole
              * request is handed to ip_ufo_append_data() instead of being split
              * here.
              */
 829         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 830                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 831
 832                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 833                                          fragheaderlen, transhdrlen, mtu,
 834                                          flags);
 835                 if (err)
 836                         goto error;
 837                 return 0;
 838         }
 839
 840         /* So, what's going on in the loop below?
 841          *
 842          * We use calculated fragment length to generate chained skb,
 843          * each of segments is IP fragment ready for sending to network after
 844          * adding appropriate IP header.
 845          */
 846
             /* Nothing queued yet: enter the loop body at the allocation step. */
 847         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 848                 goto alloc_new_skb;
 849
 850         while (length > 0) {
 851                 /* Check if the remaining data fits into current packet. */
 852                 copy = mtu - skb->len;
 853                 if (copy < length)
 854                         copy = maxfraglen - skb->len;
 855                 if (copy <= 0) {
 856                         char *data;
 857                         unsigned int datalen;
 858                         unsigned int fraglen;
 859                         unsigned int fraggap;
 860                         unsigned int alloclen;
 861                         struct sk_buff *skb_prev;
 862 alloc_new_skb:
 863                         skb_prev = skb;
                             /*
                              * fraggap: bytes the previous skb holds beyond
                              * maxfraglen; they are moved into the new skb below.
                              */
 864                         if (skb_prev)
 865                                 fraggap = skb_prev->len - maxfraglen;
 866                         else
 867                                 fraggap = 0;
 868
 869                         /*
 870                          * If remaining data exceeds the mtu,
 871                          * we know we need more fragment(s).
 872                          */
 873                         datalen = length + fraggap;
 874                         if (datalen > mtu - fragheaderlen)
 875                                 datalen = maxfraglen - fragheaderlen;
 876                         fraglen = datalen + fragheaderlen;
 877
                             /*
                              * With MSG_MORE on a device without scatter-gather,
                              * allocate a full MTU of linear space; otherwise only
                              * what this fragment needs.
                              */
 878                         if ((flags & MSG_MORE) &&
 879                             !(rt->u.dst.dev->features&NETIF_F_SG))
 880                                 alloclen = mtu;
 881                         else
 882                                 alloclen = datalen + fragheaderlen;
 883
 884                         /* The last fragment gets additional space at tail.
 885                          * Note, with MSG_MORE we overallocate on fragments,
 886                          * because we have no idea what fragment will be
 887                          * the last.
 888                          */
 889                         if (datalen == length + fraggap)
 890                                 alloclen += rt->u.dst.trailer_len;
 891
                             /*
                              * The skb carrying the transport header is allocated
                              * with sock_alloc_send_skb(); later ones use
                              * sock_wmalloc(), and only while sk_wmem_alloc does not
                              * exceed twice sk_sndbuf.
                              */
 892                         if (transhdrlen) {
 893                                 skb = sock_alloc_send_skb(sk,
 894                                                 alloclen + hh_len + 15,
 895                                                 (flags & MSG_DONTWAIT), &err);
 896                         } else {
 897                                 skb = NULL;
 898                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 899                                     2 * sk->sk_sndbuf)
 900                                         skb = sock_wmalloc(sk,
 901                                                            alloclen + hh_len + 15, 1,
 902                                                            sk->sk_allocation);
 903                                 if (unlikely(skb == NULL))
 904                                         err = -ENOBUFS;
 905                         }
 906                         if (skb == NULL)
 907                                 goto error;
 908
 909                         /*
 910                          *      Fill in the control structures
 911                          */
 912                         skb->ip_summed = csummode;
 913                         skb->csum = 0;
 914                         skb_reserve(skb, hh_len);
 915
 916                         /*
 917                          *      Find where to start putting bytes.
 918                          */
 919                         data = skb_put(skb, fraglen);
 920                         skb_set_network_header(skb, exthdrlen);
 921                         skb->transport_header = (skb->network_header +
 922                                                  fragheaderlen);
 923                         data += fragheaderlen;
 924
                             /*
                              * Move the overhanging bytes of the previous skb into
                              * this one, transfer their checksum contribution, and
                              * trim the previous skb back to maxfraglen.
                              */
 925                         if (fraggap) {
 926                                 skb->csum = skb_copy_and_csum_bits(
 927                                         skb_prev, maxfraglen,
 928                                         data + transhdrlen, fraggap, 0);
 929                                 skb_prev->csum = csum_sub(skb_prev->csum,
 930                                                           skb->csum);
 931                                 data += fraggap;
 932                                 pskb_trim_unique(skb_prev, maxfraglen);
 933                         }
 934
                             /*
                              * Fetch the new payload; the space for transhdrlen is
                              * skipped, not filled, here.  A getfrag() failure frees
                              * the not-yet-queued skb.
                              */
 935                         copy = datalen - transhdrlen - fraggap;
 936                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 937                                 err = -EFAULT;
 938                                 kfree_skb(skb);
 939                                 goto error;
 940                         }
 941
 942                         offset += copy;
 943                         length -= datalen - fraggap;
                             /*
                              * Only the first skb accounts for the transport and
                              * extension headers and may use CHECKSUM_PARTIAL.
                              */
 944                         transhdrlen = 0;
 945                         exthdrlen = 0;
 946                         csummode = CHECKSUM_NONE;
 947
 948                         /*
 949                          * Put the packet on the pending queue.
 950                          */
 951                         __skb_queue_tail(&sk->sk_write_queue, skb);
 952                         continue;
 953                 }
 954
 955                 if (copy > length)
 956                         copy = length;
 957
 958                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                             /*
                              * No scatter-gather: copy into the linear tail of the
                              * skb and undo the skb_put() if getfrag() fails.
                              */
 959                         unsigned int off;
 960
 961                         off = skb->len;
 962                         if (getfrag(from, skb_put(skb, copy),
 963                                         offset, copy, off, skb) < 0) {
 964                                 __skb_trim(skb, off);
 965                                 err = -EFAULT;
 966                                 goto error;
 967                         }
 968                 } else {
                             /*
                              * Scatter-gather: copy into a page fragment, reusing
                              * sk->sk_sndmsg_page while it still has room.
                              *
                              * NOTE(review): when nr_frags is 0, 'frag' points at
                              * frags[-1]; it is then only read by the
                              * 'page != frag->page' comparison below - confirm this
                              * is intended.
                              */
 969                         int i = skb_shinfo(skb)->nr_frags;
 970                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 971                         struct page *page = sk->sk_sndmsg_page;
 972                         int off = sk->sk_sndmsg_off;
 973                         unsigned int left;
 974
 975                         if (page && (left = PAGE_SIZE - off) > 0) {
 976                                 if (copy >= left)
 977                                         copy = left;
                                     /*
                                      * The cached page is not the last fragment of
                                      * this skb: add a new, empty descriptor for it.
                                      */
 978                                 if (page != frag->page) {
 979                                         if (i == MAX_SKB_FRAGS) {
 980                                                 err = -EMSGSIZE;
 981                                                 goto error;
 982                                         }
 983                                         get_page(page);
 984                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 985                                         frag = &skb_shinfo(skb)->frags[i];
 986                                 }
 987                         } else if (i < MAX_SKB_FRAGS) {
                                     /* No usable cached page: allocate a fresh one. */
 988                                 if (copy > PAGE_SIZE)
 989                                         copy = PAGE_SIZE;
 990                                 page = alloc_pages(sk->sk_allocation, 0);
 991                                 if (page == NULL)  {
 992                                         err = -ENOMEM;
 993                                         goto error;
 994                                 }
 995                                 sk->sk_sndmsg_page = page;
 996                                 sk->sk_sndmsg_off = 0;
 997
 998                                 skb_fill_page_desc(skb, i, page, 0, 0);
 999                                 frag = &skb_shinfo(skb)->frags[i];
                                     /* Charge the new page to the skb and to the socket. */
1000                                 skb->truesize += PAGE_SIZE;
1001                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1002                         } else {
1003                                 err = -EMSGSIZE;
1004                                 goto error;
1005                         }
                             /* Copy to the end of the data already held by the fragment. */
1006                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1007                                 err = -EFAULT;
1008                                 goto error;
1009                         }
1010                         sk->sk_sndmsg_off += copy;
1011                         frag->size += copy;
1012                         skb->len += copy;
1013                         skb->data_len += copy;
1014                 }
1015                 offset += copy;
1016                 length -= copy;
1017         }
1018
1019         return 0;
1020
1021 error:
             /*
              * 'length' now holds the bytes that were not queued: remove them
              * from the cork length and count the discard.
              */
1022         inet->cork.length -= length;
1023         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1024         return err;
1025 }
1026
1027 ssize_t ip_append_page(struct sock *sk, struct page *page,
1028                        int offset, size_t size, int flags)
1029 {
1030         struct inet_sock *inet = inet_sk(sk);
1031         struct sk_buff *skb;
1032         struct rtable *rt;
1033         struct ip_options *opt = NULL;
1034         int hh_len;
1035         int mtu;
1036         int len;
1037         int err;
1038         unsigned int maxfraglen, fragheaderlen, fraggap;
1039
1040         if (inet->hdrincl)
1041                 return -EPERM;
1042
1043         if (flags&MSG_PROBE)
1044                 return 0;
1045
1046         if (skb_queue_empty(&sk->sk_write_queue))
1047                 return -EINVAL;
1048
1049         rt = inet->cork.rt;
1050         if (inet->cork.flags & IPCORK_OPT)
1051                 opt = inet->cork.opt;
1052
1053         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1054                 return -EOPNOTSUPP;
1055
1056         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1057         mtu = inet->cork.fragsize;
1058
1059         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1060         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1061
1062         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1063                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1064                 return -EMSGSIZE;
1065         }
1066
1067         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1068                 return -EINVAL;
1069
1070         inet->cork.length += size;
1071         if ((sk->sk_protocol == IPPROTO_UDP) &&
1072             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1073                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1074                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1075         }
1076
1077
1078         while (size > 0) {
1079                 int i;
1080
1081                 if (skb_is_gso(skb))
1082                         len = size;
1083                 else {
1084
1085                         /* Check if the remaining data fits into current packet. */
1086                         len = mtu - skb->len;
1087                         if (len < size)
1088                                 len = maxfraglen - skb->len;
1089                 }
1090                 if (len <= 0) {
1091                         struct sk_buff *skb_prev;
1092                         int alloclen;
1093
1094                         skb_prev = skb;
1095                         fraggap = skb_prev->len - maxfraglen;
1096
1097                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1098                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1099                         if (unlikely(!skb)) {
1100                                 err = -ENOBUFS;
1101                                 goto error;
1102                         }
1103
1104                         /*
1105                          *      Fill in the control structures
1106                          */
1107                         skb->ip_summed = CHECKSUM_NONE;
1108                         skb->csum = 0;
1109                         skb_reserve(skb, hh_len);
1110
1111                         /*
1112                          *      Find where to start putting bytes.
1113                          */
1114                         skb_put(skb, fragheaderlen + fraggap);
1115                         skb_reset_network_header(skb);
1116                         skb->transport_header = (skb->network_header +
1117                                                  fragheaderlen);
1118                         if (fraggap) {
1119                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1120                                                                    maxfraglen,
1121                                                     skb_transport_header(skb),
1122                                                                    fraggap, 0);
1123                                 skb_prev->csum = csum_sub(skb_prev->csum,
1124                                                           skb->csum);
1125                                 pskb_trim_unique(skb_prev, maxfraglen);
1126                         }
1127
1128                         /*
1129                          * Put the packet on the pending queue.
1130                          */
1131                         __skb_queue_tail(&sk->sk_write_queue, skb);
1132                         continue;
1133                 }
1134
1135                 i = skb_shinfo(skb)->nr_frags;
1136                 if (len > size)
1137                         len = size;
1138                 if (skb_can_coalesce(skb, i, page, offset)) {
1139                         skb_shinfo(skb)->frags[i-1].size += len;
1140                 } else if (i < MAX_SKB_FRAGS) {
1141                         get_page(page);
1142                         skb_fill_page_desc(skb, i, page, offset, len);
1143                 } else {
1144                         err = -EMSGSIZE;
1145                         goto error;
1146                 }
1147
1148                 if (skb->ip_summed == CHECKSUM_NONE) {
1149                         __wsum csum;
1150                         csum = csum_page(page, offset, len);
1151                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1152                 }
1153
1154                 skb->len += len;
1155                 skb->data_len += len;
1156                 offset += len;
1157                 size -= len;
1158         }
1159         return 0;
1160
1161 error:
1162         inet->cork.length -= size;
1163         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1164         return err;
1165 }
1166
1167 /*
1168  *      Combined all pending IP fragments on the socket as one IP datagram
1169  *      and push them out.
1170  */
1171 int ip_push_pending_frames(struct sock *sk)
1172 {
1173         struct sk_buff *skb, *tmp_skb;
1174         struct sk_buff **tail_skb;
1175         struct inet_sock *inet = inet_sk(sk);
1176         struct ip_options *opt = NULL;
1177         struct rtable *rt = inet->cork.rt;
1178         struct iphdr *iph;
1179         __be16 df = 0;
1180         __u8 ttl;
1181         int err = 0;
1182
1183         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1184                 goto out;
1185         tail_skb = &(skb_shinfo(skb)->frag_list);
1186
1187         /* move skb->data to ip header from ext header */
1188         if (skb->data < skb_network_header(skb))
1189                 __skb_pull(skb, skb_network_offset(skb));
1190         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1191                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1192                 *tail_skb = tmp_skb;
1193                 tail_skb = &(tmp_skb->next);
1194                 skb->len += tmp_skb->len;
1195                 skb->data_len += tmp_skb->len;
1196                 skb->truesize += tmp_skb->truesize;
1197                 __sock_put(tmp_skb->sk);
1198                 tmp_skb->destructor = NULL;
1199                 tmp_skb->sk = NULL;
1200         }
1201
1202         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1203          * to fragment the frame generated here. No matter, what transforms
1204          * how transforms change size of the packet, it will come out.
1205          */
1206         if (inet->pmtudisc != IP_PMTUDISC_DO)
1207                 skb->local_df = 1;
1208
1209         /* DF bit is set when we want to see DF on outgoing frames.
1210          * If local_df is set too, we still allow to fragment this frame
1211          * locally. */
1212         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1213             (skb->len <= dst_mtu(&rt->u.dst) &&
1214              ip_dont_fragment(sk, &rt->u.dst)))
1215                 df = htons(IP_DF);
1216
1217         if (inet->cork.flags & IPCORK_OPT)
1218                 opt = inet->cork.opt;
1219
1220         if (rt->rt_type == RTN_MULTICAST)
1221                 ttl = inet->mc_ttl;
1222         else
1223                 ttl = ip_select_ttl(inet, &rt->u.dst);
1224
1225         iph = (struct iphdr *)skb->data;
1226         iph->version = 4;
1227         iph->ihl = 5;
1228         if (opt) {
1229                 iph->ihl += opt->optlen>>2;
1230                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1231         }
1232         iph->tos = inet->tos;
1233         iph->tot_len = htons(skb->len);
1234         iph->frag_off = df;
1235         ip_select_ident(iph, &rt->u.dst, sk);
1236         iph->ttl = ttl;
1237         iph->protocol = sk->sk_protocol;
1238         iph->saddr = rt->rt_src;
1239         iph->daddr = rt->rt_dst;
1240         ip_send_check(iph);
1241
1242         skb->priority = sk->sk_priority;
1243         skb->dst = dst_clone(&rt->u.dst);
1244
1245         /* Netfilter gets whole the not fragmented skb. */
1246         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1247                       skb->dst->dev, dst_output);
1248         if (err) {
1249                 if (err > 0)
1250                         err = inet->recverr ? net_xmit_errno(err) : 0;
1251                 if (err)
1252                         goto error;
1253         }
1254
1255 out:
1256         inet->cork.flags &= ~IPCORK_OPT;
1257         kfree(inet->cork.opt);
1258         inet->cork.opt = NULL;
1259         if (inet->cork.rt) {
1260                 ip_rt_put(inet->cork.rt);
1261                 inet->cork.rt = NULL;
1262         }
1263         return err;
1264
1265 error:
1266         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1267         goto out;
1268 }
1269
1270 /*
1271  *      Throw away all pending data on the socket.
1272  */
1273 void ip_flush_pending_frames(struct sock *sk)
1274 {
1275         struct inet_sock *inet = inet_sk(sk);
1276         struct sk_buff *skb;
1277
1278         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1279                 kfree_skb(skb);
1280
1281         inet->cork.flags &= ~IPCORK_OPT;
1282         kfree(inet->cork.opt);
1283         inet->cork.opt = NULL;
1284         if (inet->cork.rt) {
1285                 ip_rt_put(inet->cork.rt);
1286                 inet->cork.rt = NULL;
1287         }
1288 }
1289
1290
1291 /*
1292  *      Fetch data from kernel space and fill in checksum if needed.
1293  */
1294 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1295                               int len, int odd, struct sk_buff *skb)
1296 {
1297         __wsum csum;
1298
1299         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1300         skb->csum = csum_block_add(skb->csum, csum, odd);
1301         return 0;
1302 }
1303
1304 /*
1305  *      Generic function to send a packet as reply to another packet.
1306  *      Used to send TCP resets so far. ICMP should use this function too.
1307  *
1308  *      Should run single threaded per socket because it uses the sock
1309  *      structure to pass arguments.
1310  *
1311  *      LATER: switch from ip_build_xmit to ip_append_*
1312  */
1313 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1314                    unsigned int len)
1315 {
1316         struct inet_sock *inet = inet_sk(sk);
1317         struct {
1318                 struct ip_options       opt;
1319                 char                    data[40];
1320         } replyopts;
1321         struct ipcm_cookie ipc;
1322         __be32 daddr;
1323         struct rtable *rt = (struct rtable*)skb->dst;
1324
1325         if (ip_options_echo(&replyopts.opt, skb))
1326                 return;
1327
1328         daddr = ipc.addr = rt->rt_src;
1329         ipc.opt = NULL;
1330
1331         if (replyopts.opt.optlen) {
1332                 ipc.opt = &replyopts.opt;
1333
1334                 if (ipc.opt->srr)
1335                         daddr = replyopts.opt.faddr;
1336         }
1337
1338         {
1339                 struct flowi fl = { .nl_u = { .ip4_u =
1340                                               { .daddr = daddr,
1341                                                 .saddr = rt->rt_spec_dst,
1342                                                 .tos = RT_TOS(ip_hdr(skb)->tos) } },
1343                                     /* Not quite clean, but right. */
1344                                     .uli_u = { .ports =
1345                                                { .sport = tcp_hdr(skb)->dest,
1346                                                  .dport = tcp_hdr(skb)->source } },
1347                                     .proto = sk->sk_protocol };
1348                 security_skb_classify_flow(skb, &fl);
1349                 if (ip_route_output_key(&rt, &fl))
1350                         return;
1351         }
1352
1353         /* And let IP do all the hard work.
1354
1355            This chunk is not reenterable, hence spinlock.
1356            Note that it uses the fact, that this function is called
1357            with locally disabled BH and that sk cannot be already spinlocked.
1358          */
1359         bh_lock_sock(sk);
1360         inet->tos = ip_hdr(skb)->tos;
1361         sk->sk_priority = skb->priority;
1362         sk->sk_protocol = ip_hdr(skb)->protocol;
1363         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1364                        &ipc, rt, MSG_DONTWAIT);
1365         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1366                 if (arg->csumoffset >= 0)
1367                         *((__sum16 *)skb_transport_header(skb) +
1368                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1369                                                                 arg->csum));
1370                 skb->ip_summed = CHECKSUM_NONE;
1371                 ip_push_pending_frames(sk);
1372         }
1373
1374         bh_unlock_sock(sk);
1375
1376         ip_rt_put(rt);
1377 }
1378
1379 void __init ip_init(void)
1380 {
1381         ip_rt_init();
1382         inet_initpeers();
1383
1384 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1385         igmp_mc_proc_init();
1386 #endif
1387 }
1388
1389 EXPORT_SYMBOL(ip_generic_getfrag);
1390 EXPORT_SYMBOL(ip_queue_xmit);
1391 EXPORT_SYMBOL(ip_send_check);