net/ipv4/ip_output.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
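
/*
 * Sanity-check sketch (illustrative, not part of the original file):
 * the Internet checksum is its own inverse, so recomputing it over a
 * header whose check field has already been filled in must yield zero;
 * this is exactly how the receive path (ip_rcv) validates headers.
 *
 *	ip_send_check(iph);
 *	BUG_ON(ip_fast_csum((unsigned char *)iph, iph->ihl) != 0);
 */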
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);
	netif_rx(newskb);
	return 0;
}
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}
/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, sk);

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
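
/*
 * Usage sketch (illustrative, not from this file): a transport that has
 * already attached a route to the skb can emit a fully built packet in
 * one call.  The TCP SYN-ACK path does roughly this; "ireq" is assumed
 * connection-request state holding the addresses and echoed options:
 *
 *	err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 *				    ireq->rmt_addr, ireq->opt);
 *	err = net_xmit_eval(err);
 */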
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;
	int hh_len = LL_RESERVED_SPACE(dev);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}
static inline int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb->dst->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags & RTCF_MULTICAST) {
		if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags & RTCF_LOCAL) || !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags & RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rt = (struct rtable *)skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .uli_u = { .ports =
						       { .sport = inet->sport,
							 .dport = inet->dport } } };

			/* If this fails, retransmit mechanism of transport layer will
			 * keep trying until route appears or the connection times
			 * itself out.
			 */
			security_sk_classify_flow(sk, &fl);
			if (ip_route_output_flow(&rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	iph->tot_len = htons(skb->len);
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	/* Transport layer sets skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);

no_route:
	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to the IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs, pad;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = (struct rtable *)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited. In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	pad = nf_bridge_pad(skb);
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
	mtu -= pad;

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

EXPORT_SYMBOL(ip_fragment);
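
/*
 * Worked example (illustrative, not part of the original file): on a
 * link with a 1500-byte MTU, a 4020-byte datagram with a 20-byte header
 * has 4000 bytes of payload and 1480 bytes of data space per fragment.
 * Since 1480 is already a multiple of 8, the slow path above emits
 * payload blocks at byte offsets 0, 1480 and 2960, i.e. frag_off values
 * of 0, 185 and 370 (offset >> 3), with IP_MF set on all but the last.
 */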
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}
static inline int ip_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so
	 * create a single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UFO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);
	return err;
}
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, but other transport protocols - e.g. raw sockets -
 *	can potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options) + opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
					    rt->u.dst.dev->mtu :
					    dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
					 fragheaderlen, transhdrlen, mtu,
					 flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each segment is an IP fragment ready for sending to the network
	 * after adding the appropriate IP header.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
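
/*
 * Worked example (illustrative, not part of the original file): the
 * maxfraglen computation above rounds the per-fragment payload down to
 * a multiple of 8, as IP fragment offsets require.  With mtu = 1006 and
 * no IP options (fragheaderlen = 20), the data space is 986, which
 * rounds down to 984, so maxfraglen = 1004 and each full fragment
 * carries 984 bytes of payload.
 */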
ssize_t ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = inet->cork.rt;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i - 1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}
/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = inet->cork.rt;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO),
	 * we allow fragmenting the frame generated here. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow fragmenting this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->tot_len = htons(skb->len);
	iph->frag_off = df;
	ip_select_ident(iph, &rt->u.dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;
	ip_send_check(iph);

	skb->priority = sk->sk_priority;
	skb->dst = dst_clone(&rt->u.dst);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
		      skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
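
/*
 * Usage sketch (illustrative, not from this file): a datagram sender
 * typically corks data with one or more ip_append_data() calls and then
 * either pushes or discards the queue.  Here "msg", "ipc", "rt" and
 * "transhdrlen" are assumed caller state (the message being sent, the
 * control-message cookie, the route, and the transport header length):
 *
 *	lock_sock(sk);
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     transhdrlen, &ipc, rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 *	release_sock(sk);
 */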
/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr + offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = (struct rtable *)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(ip_hdr(skb)->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = tcp_hdr(skb)->dest,
						 .dport = tcp_hdr(skb)->source } },
				    .proto = sk->sk_protocol };
		security_skb_classify_flow(skb, &fl);
		if (ip_route_output_key(&rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence spinlock.
	   Note that it relies on the fact that this function is called
	   with locally disabled BH and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
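
/*
 * Usage sketch (illustrative, not from this file): a caller such as the
 * TCP reset path fills in a struct ip_reply_arg with the header to echo
 * back, the pseudo-header checksum, and the offset of the checksum
 * field, then hands it to ip_send_reply() on a control socket.  "rep",
 * "saddr", "daddr" and "ctl_sk" are assumed caller state; the payload
 * sum itself is accumulated by ip_reply_glue_bits() above.
 *
 *	struct ip_reply_arg arg = { };
 *
 *	arg.iov[0].iov_base = &rep;
 *	arg.iov[0].iov_len  = sizeof(rep);
 *	arg.csum = csum_tcpudp_nofold(saddr, daddr, arg.iov[0].iov_len,
 *				      IPPROTO_TCP, 0);
 *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *	ip_send_reply(ctl_sk, skb, &arg, arg.iov[0].iov_len);
 */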
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);