release/src-rt/linux/linux-2.6/net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/mm.h>
  53 #include <linux/string.h>
  54 #include <linux/errno.h>
  55 #include <linux/highmem.h>
  56
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/in.h>
  60 #include <linux/inet.h>
  61 #include <linux/netdevice.h>
  62 #include <linux/etherdevice.h>
  63 #include <linux/proc_fs.h>
  64 #include <linux/stat.h>
  65 #include <linux/init.h>
  66
  67 #include <net/snmp.h>
  68 #include <net/ip.h>
  69 #include <net/protocol.h>
  70 #include <net/route.h>
  71 #include <net/xfrm.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <net/checksum.h>
  79 #include <linux/igmp.h>
  80 #include <linux/netfilter_ipv4.h>
  81 #include <linux/netfilter_bridge.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netlink.h>
  84 #include <linux/tcp.h>
  85
  86 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; /* initialised to IPDEFTTL; presumably the sysctl-tunable default TTL -- confirm against users outside this file */
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 int __ip_local_out(struct sk_buff *skb)
  96 {
  97         struct iphdr *iph = ip_hdr(skb);
  98
  99         iph->tot_len = htons(skb->len);
 100         ip_send_check(iph);
 101         return nf_hook(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev,
 102                        dst_output);
 103 }
 104
 105 int ip_local_out(struct sk_buff *skb)
 106 {
 107         int err;
 108
 109         err = __ip_local_out(skb);
 110         if (likely(err == 1))
 111                 err = dst_output(skb);
 112
 113         return err;
 114 }
 115 EXPORT_SYMBOL_GPL(ip_local_out);
 116
 117 /* dev_loopback_xmit for use with netfilter. */
 118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 119 {
 120         skb_reset_mac_header(newskb);
 121         __skb_pull(newskb, skb_network_offset(newskb));
 122         newskb->pkt_type = PACKET_LOOPBACK;
 123         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 124         BUG_TRAP(newskb->dst);
 125         netif_rx(newskb);
 126         return 0;
 127 }
 128
 129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 130 {
 131         int ttl = inet->uc_ttl;
 132
 133         if (ttl < 0)
 134                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 135         return ttl;
 136 }
 137
 138 /*
 139  *              Add an ip header to a skbuff and send it out.
 140  *
 141  */
 142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 143                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 144 {
 145         struct inet_sock *inet = inet_sk(sk);
 146         struct rtable *rt = (struct rtable *)skb->dst;
 147         struct iphdr *iph;
 148
 149         /* Build the IP header. */
 150         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 151         skb_reset_network_header(skb);
 152         iph = ip_hdr(skb);
 153         iph->version  = 4;
 154         iph->ihl      = 5;
 155         iph->tos      = inet->tos;
 156         if (ip_dont_fragment(sk, &rt->u.dst))
 157                 iph->frag_off = htons(IP_DF);
 158         else
 159                 iph->frag_off = 0;
 160         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 161         iph->daddr    = rt->rt_dst;
 162         iph->saddr    = rt->rt_src;
 163         iph->protocol = sk->sk_protocol;
 164         ip_select_ident(iph, &rt->u.dst, sk);
 165
 166         if (opt && opt->optlen) {
 167                 iph->ihl += opt->optlen>>2;
 168                 ip_options_build(skb, opt, daddr, rt, 0);
 169         }
 170
 171         skb->priority = sk->sk_priority;
 172
 173         /* Send it out. */
 174         return ip_local_out(skb);
 175 }
 176
 177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 178
 179 static inline int ip_finish_output2(struct sk_buff *skb)
 180 {
 181         struct dst_entry *dst = skb->dst;
 182         struct rtable *rt = (struct rtable *)dst;
 183         struct net_device *dev = dst->dev;
 184         int hh_len = LL_RESERVED_SPACE(dev);
 185
 186         if (rt->rt_type == RTN_MULTICAST)
 187                 IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 188         else if (rt->rt_type == RTN_BROADCAST)
 189                 IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
 190
 191         /* Be paranoid, rather than too clever. */
 192         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 193                 struct sk_buff *skb2;
 194
 195                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 196                 if (skb2 == NULL) {
 197                         kfree_skb(skb);
 198                         return -ENOMEM;
 199                 }
 200                 if (skb->sk)
 201                         skb_set_owner_w(skb2, skb->sk);
 202                 kfree_skb(skb);
 203                 skb = skb2;
 204         }
 205
 206         if (dst->hh)
 207                 return neigh_hh_output(dst->hh, skb);
 208         else if (dst->neighbour)
 209                 return dst->neighbour->output(skb);
 210
 211         if (net_ratelimit())
 212                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 213         kfree_skb(skb);
 214         return -EINVAL;
 215 }
 216
 217 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 218 {
 219         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 220
 221         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 222                skb->dst->dev->mtu : dst_mtu(skb->dst);
 223 }
 224
 225 static inline int ip_finish_output(struct sk_buff *skb)
 226 {
 227 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 228         /* Policy lookup after SNAT yielded a new policy */
 229         if (skb->dst->xfrm != NULL) {
 230                 IPCB(skb)->flags |= IPSKB_REROUTED;
 231                 return dst_output(skb);
 232         }
 233 #endif
 234         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 235                 return ip_fragment(skb, ip_finish_output2);
 236         else
 237                 return ip_finish_output2(skb);
 238 }
 239
 240 int ip_mc_output(struct sk_buff *skb)
 241 {
 242         struct sock *sk = skb->sk;
 243         struct rtable *rt = (struct rtable*)skb->dst;
 244         struct net_device *dev = rt->u.dst.dev;
 245
 246         /*
 247          *      If the indicated interface is up and running, send the packet.
 248          */
 249         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 250
 251         skb->dev = dev;
 252         skb->protocol = htons(ETH_P_IP);
 253
 254         /*
 255          *      Multicasts are looped back for other local users
 256          */
 257
 258         if (rt->rt_flags&RTCF_MULTICAST) {
 259                 if ((!sk || inet_sk(sk)->mc_loop)
 260 #ifdef CONFIG_IP_MROUTE
 261                 /* Small optimization: do not loopback not local frames,
 262                    which returned after forwarding; they will be  dropped
 263                    by ip_mr_input in any case.
 264                    Note, that local frames are looped back to be delivered
 265                    to local recipients.
 266
 267                    This check is duplicated in ip_mr_input at the moment.
 268                  */
 269                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 270 #endif
 271                 ) {
 272                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 273                         if (newskb)
 274                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 275                                         newskb->dev,
 276                                         ip_dev_loopback_xmit);
 277                 }
 278
 279                 /* Multicasts with ttl 0 must not go beyond the host */
 280
 281                 if (ip_hdr(skb)->ttl == 0) {
 282                         kfree_skb(skb);
 283                         return 0;
 284                 }
 285         }
 286
 287         if (rt->rt_flags&RTCF_BROADCAST) {
 288                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 289                 if (newskb)
 290                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 291                                 newskb->dev, ip_dev_loopback_xmit);
 292         }
 293
 294         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 295                             ip_finish_output,
 296                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 297 }
 298
 299 int ip_output(struct sk_buff *skb)
 300 {
 301         struct net_device *dev = skb->dst->dev;
 302
 303         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 304
 305         skb->dev = dev;
 306         skb->protocol = htons(ETH_P_IP);
 307
 308         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 309                             ip_finish_output,
 310                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 311 }
 312
 313 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 314 {
 315         struct sock *sk = skb->sk;
 316         struct inet_sock *inet = inet_sk(sk);
 317         struct ip_options *opt = inet->opt;
 318         struct rtable *rt;
 319         struct iphdr *iph;
 320
 321         /* Skip all of this if the packet is already routed,
 322          * f.e. by something like SCTP.
 323          */
 324         rt = (struct rtable *) skb->dst;
 325         if (rt != NULL)
 326                 goto packet_routed;
 327
 328         /* Make sure we can route this packet. */
 329         rt = (struct rtable *)__sk_dst_check(sk, 0);
 330         if (rt == NULL) {
 331                 __be32 daddr;
 332
 333                 /* Use correct destination address if we have options. */
 334                 daddr = inet->daddr;
 335                 if(opt && opt->srr)
 336                         daddr = opt->faddr;
 337
 338                 {
 339                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 340                                             .nl_u = { .ip4_u =
 341                                                       { .daddr = daddr,
 342                                                         .saddr = inet->saddr,
 343                                                         .tos = RT_CONN_FLAGS(sk) } },
 344                                             .proto = sk->sk_protocol,
 345                                             .uli_u = { .ports =
 346                                                        { .sport = inet->sport,
 347                                                          .dport = inet->dport } } };
 348
 349                         /* If this fails, retransmit mechanism of transport layer will
 350                          * keep trying until route appears or the connection times
 351                          * itself out.
 352                          */
 353                         security_sk_classify_flow(sk, &fl);
 354                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 355                                 goto no_route;
 356                 }
 357                 sk_setup_caps(sk, &rt->u.dst);
 358         }
 359         skb->dst = dst_clone(&rt->u.dst);
 360
 361 packet_routed:
 362         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 363                 goto no_route;
 364
 365         /* OK, we know where to send it, allocate and build IP header. */
 366         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 367         skb_reset_network_header(skb);
 368         iph = ip_hdr(skb);
 369         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 370         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 371                 iph->frag_off = htons(IP_DF);
 372         else
 373                 iph->frag_off = 0;
 374         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 375         iph->protocol = sk->sk_protocol;
 376         iph->saddr    = rt->rt_src;
 377         iph->daddr    = rt->rt_dst;
 378         /* Transport layer set skb->h.foo itself. */
 379
 380         if (opt && opt->optlen) {
 381                 iph->ihl += opt->optlen >> 2;
 382                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 383         }
 384
 385         ip_select_ident_more(iph, &rt->u.dst, sk,
 386                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 387
 388         skb->priority = sk->sk_priority;
 389
 390         return ip_local_out(skb);
 391
 392 no_route:
 393         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 394         kfree_skb(skb);
 395         return -EHOSTUNREACH;
 396 }
 397
 398
 399 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 400 {
 401         to->pkt_type = from->pkt_type;
 402         to->priority = from->priority;
 403         to->protocol = from->protocol;
 404         dst_release(to->dst);
 405         to->dst = dst_clone(from->dst);
 406         to->dev = from->dev;
 407         to->mark = from->mark;
 408
 409         /* Copy the flags to each fragment. */
 410         IPCB(to)->flags = IPCB(from)->flags;
 411
 412 #ifdef CONFIG_NET_SCHED
 413         to->tc_index = from->tc_index;
 414 #endif
 415         nf_copy(to, from);
 416 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 417         to->ipvs_property = from->ipvs_property;
 418 #endif
 419         skb_copy_secmark(to, from);
 420 }
 421
 422 /*
 423  *      This IP datagram is too large to be sent in one piece.  Break it up into
 424  *      smaller pieces (each of size equal to IP header plus
 425  *      a block of the data of the original IP data part) that will yet fit in a
 426  *      single device frame, and queue such a frame for sending.
 427  */
 428
 429 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 430 {
 431         struct iphdr *iph;
 432         int raw = 0;
 433         int ptr;
 434         struct net_device *dev;
 435         struct sk_buff *skb2;
 436         unsigned int mtu, hlen, left, len, ll_rs, pad;
 437         int offset;
 438         __be16 not_last_frag;
 439         struct rtable *rt = (struct rtable*)skb->dst;
 440         int err = 0;
 441
 442         dev = rt->u.dst.dev;
 443
 444         /*
 445          *      Point into the IP datagram header.
 446          */
 447
 448         iph = ip_hdr(skb);
 449
 450         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 451                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 452                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 453                           htonl(ip_skb_dst_mtu(skb)));
 454                 kfree_skb(skb);
 455                 return -EMSGSIZE;
 456         }
 457
 458         /*
 459          *      Setup starting values.
 460          */
 461
 462         hlen = iph->ihl * 4;
 463         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 464         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 465
 466         /* When frag_list is given, use it. First, check its validity:
 467          * some transformers could create wrong frag_list or break existing
 468          * one, it is not prohibited. In this case fall back to copying.
 469          *
 470          * LATER: this step can be merged to real generation of fragments,
 471          * we can switch to copy when see the first bad fragment.
 472          */
 473         if (skb_shinfo(skb)->frag_list) {
 474                 struct sk_buff *frag, *frag2;
 475                 int first_len = skb_pagelen(skb);
 476
 477                 if (first_len - hlen > mtu ||
 478                     ((first_len - hlen) & 7) ||
 479                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 480                     skb_cloned(skb))
 481                         goto slow_path;
 482
 483                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 484                         /* Correct geometry. */
 485                         if (frag->len > mtu ||
 486                             ((frag->len & 7) && frag->next) ||
 487                             skb_headroom(frag) < hlen)
 488                                 goto slow_path_clean;
 489
 490                         /* Partially cloned skb? */
 491                         if (skb_shared(frag))
 492                                 goto slow_path_clean;
 493
 494                         BUG_ON(frag->sk);
 495                         if (skb->sk) {
 496                                 frag->sk = skb->sk;
 497                                 frag->destructor = sock_wfree;
 498                         }
 499                         skb->truesize -= frag->truesize;
 500                 }
 501
 502                 /* Everything is OK. Generate! */
 503
 504                 err = 0;
 505                 offset = 0;
 506                 frag = skb_shinfo(skb)->frag_list;
 507                 skb_shinfo(skb)->frag_list = NULL;
 508                 skb->data_len = first_len - skb_headlen(skb);
 509                 skb->len = first_len;
 510                 iph->tot_len = htons(first_len);
 511                 iph->frag_off = htons(IP_MF);
 512                 ip_send_check(iph);
 513
 514                 for (;;) {
 515                         /* Prepare header of the next frame,
 516                          * before previous one went down. */
 517                         if (frag) {
 518                                 frag->ip_summed = CHECKSUM_NONE;
 519                                 skb_reset_transport_header(frag);
 520                                 __skb_push(frag, hlen);
 521                                 skb_reset_network_header(frag);
 522                                 memcpy(skb_network_header(frag), iph, hlen);
 523                                 iph = ip_hdr(frag);
 524                                 iph->tot_len = htons(frag->len);
 525                                 ip_copy_metadata(frag, skb);
 526                                 if (offset == 0)
 527                                         ip_options_fragment(frag);
 528                                 offset += skb->len - hlen;
 529                                 iph->frag_off = htons(offset>>3);
 530                                 if (frag->next != NULL)
 531                                         iph->frag_off |= htons(IP_MF);
 532                                 /* Ready, complete checksum */
 533                                 ip_send_check(iph);
 534                         }
 535
 536                         err = output(skb);
 537
 538                         if (!err)
 539                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 540                         if (err || !frag)
 541                                 break;
 542
 543                         skb = frag;
 544                         frag = skb->next;
 545                         skb->next = NULL;
 546                 }
 547
 548                 if (err == 0) {
 549                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 550                         return 0;
 551                 }
 552
 553                 while (frag) {
 554                         skb = frag->next;
 555                         kfree_skb(frag);
 556                         frag = skb;
 557                 }
 558                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 559                 return err;
 560
 561 slow_path_clean:
 562                 for (frag2 = skb_shinfo(skb)->frag_list; frag2; frag2 = frag2->next) {
 563                         if (frag2 == frag)
 564                                 break;
 565                         frag2->sk = NULL;
 566                         frag2->destructor = NULL;
 567                         skb->truesize += frag2->truesize;
 568                 }
 569         }
 570
 571 slow_path:
 572         left = skb->len - hlen;         /* Space per frame */
 573         ptr = raw + hlen;               /* Where to start from */
 574
 575         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 576          * we need to make room for the encapsulating header
 577          */
 578         pad = nf_bridge_pad(skb);
 579         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 580         mtu -= pad;
 581
 582         /*
 583          *      Fragment the datagram.
 584          */
 585
 586         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 587         not_last_frag = iph->frag_off & htons(IP_MF);
 588
 589         /*
 590          *      Keep copying data until we run out.
 591          */
 592
 593         while (left > 0) {
 594                 len = left;
 595                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 596                 if (len > mtu)
 597                         len = mtu;
 598                 /* IF: we are not sending upto and including the packet end
 599                    then align the next start on an eight byte boundary */
 600                 if (len < left) {
 601                         len &= ~7;
 602                 }
 603                 /*
 604                  *      Allocate buffer.
 605                  */
 606
 607                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 608                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 609                         err = -ENOMEM;
 610                         goto fail;
 611                 }
 612
 613                 /*
 614                  *      Set up data on packet
 615                  */
 616
 617                 ip_copy_metadata(skb2, skb);
 618                 skb_reserve(skb2, ll_rs);
 619                 skb_put(skb2, len + hlen);
 620                 skb_reset_network_header(skb2);
 621                 skb2->transport_header = skb2->network_header + hlen;
 622
 623                 /*
 624                  *      Charge the memory for the fragment to any owner
 625                  *      it might possess
 626                  */
 627
 628                 if (skb->sk)
 629                         skb_set_owner_w(skb2, skb->sk);
 630
 631                 /*
 632                  *      Copy the packet header into the new buffer.
 633                  */
 634
 635                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 636
 637                 /*
 638                  *      Copy a block of the IP datagram.
 639                  */
 640                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 641                         BUG();
 642                 left -= len;
 643
 644                 /*
 645                  *      Fill in the new header fields.
 646                  */
 647                 iph = ip_hdr(skb2);
 648                 iph->frag_off = htons((offset >> 3));
 649
 650                 /* ANK: dirty, but effective trick. Upgrade options only if
 651                  * the segment to be fragmented was THE FIRST (otherwise,
 652                  * options are already fixed) and make it ONCE
 653                  * on the initial skb, so that all the following fragments
 654                  * will inherit fixed options.
 655                  */
 656                 if (offset == 0)
 657                         ip_options_fragment(skb);
 658
 659                 /*
 660                  *      Added AC : If we are fragmenting a fragment that's not the
 661                  *                 last fragment then keep MF on each bit
 662                  */
 663                 if (left > 0 || not_last_frag)
 664                         iph->frag_off |= htons(IP_MF);
 665                 ptr += len;
 666                 offset += len;
 667
 668                 /*
 669                  *      Put this fragment into the sending queue.
 670                  */
 671                 iph->tot_len = htons(len + hlen);
 672
 673                 ip_send_check(iph);
 674
 675                 err = output(skb2);
 676                 if (err)
 677                         goto fail;
 678
 679                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 680         }
 681         kfree_skb(skb);
 682         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 683         return err;
 684
 685 fail:
 686         kfree_skb(skb);
 687         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 688         return err;
 689 }
 690
 691 EXPORT_SYMBOL(ip_fragment);
 692
 693 int
 694 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 695 {
 696         struct iovec *iov = from;
 697
 698         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 699                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 700                         return -EFAULT;
 701         } else {
 702                 __wsum csum = 0;
 703                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 704                         return -EFAULT;
 705                 skb->csum = csum_block_add(skb->csum, csum, odd);
 706         }
 707         return 0;
 708 }
 709
 710 static inline __wsum
 711 csum_page(struct page *page, int offset, int copy)
 712 {
 713         char *kaddr;
 714         __wsum csum;
 715         kaddr = kmap(page);
 716         csum = csum_partial(kaddr + offset, copy, 0);
 717         kunmap(page);
 718         return csum;
 719 }
 720
 721 static inline int ip_ufo_append_data(struct sock *sk,
 722                         int getfrag(void *from, char *to, int offset, int len,
 723                                int odd, struct sk_buff *skb),
 724                         void *from, int length, int hh_len, int fragheaderlen,
 725                         int transhdrlen, int mtu,unsigned int flags)
 726 {
 727         struct sk_buff *skb;
 728         int err;
 729
 730         /* There is support for UDP fragmentation offload by network
 731          * device, so create one single skb packet containing complete
 732          * udp datagram
 733          */
 734         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 735                 skb = sock_alloc_send_skb(sk,
 736                         hh_len + fragheaderlen + transhdrlen + 20,
 737                         (flags & MSG_DONTWAIT), &err);
 738
 739                 if (skb == NULL)
 740                         return err;
 741
 742                 /* reserve space for Hardware header */
 743                 skb_reserve(skb, hh_len);
 744
 745                 /* create space for UDP/IP header */
 746                 skb_put(skb,fragheaderlen + transhdrlen);
 747
 748                 /* initialize network header pointer */
 749                 skb_reset_network_header(skb);
 750
 751                 /* initialize protocol header pointer */
 752                 skb->transport_header = skb->network_header + fragheaderlen;
 753
 754                 skb->ip_summed = CHECKSUM_PARTIAL;
 755                 skb->csum = 0;
 756                 sk->sk_sndmsg_off = 0;
 757
 758                 /* specify the length of each IP datagram fragment */
 759                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 760                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 761                 __skb_queue_tail(&sk->sk_write_queue, skb);
 762         }
 763
 764         return skb_append_datato_frags(sk, skb, getfrag, from,
 765                                        (length - transhdrlen));
 766 }
 767
 768 /*
 769  *      ip_append_data() and ip_append_page() can make one large IP datagram
 770  *      from many pieces of data. Each piece will be held on the socket
 771  *      until ip_push_pending_frames() is called. Each piece can be a page
 772  *      or non-page data.
 773  *
 774  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 775  *      this interface potentially.
 776  *
 777  *      LATER: length must be adjusted by pad at tail, when it is required.
 778  */
 779 int ip_append_data(struct sock *sk,
 780                    int getfrag(void *from, char *to, int offset, int len,
 781                                int odd, struct sk_buff *skb),
 782                    void *from, int length, int transhdrlen,
 783                    struct ipcm_cookie *ipc, struct rtable *rt,
 784                    unsigned int flags)
 785 {
 786         struct inet_sock *inet = inet_sk(sk);
 787         struct sk_buff *skb;
 788
 789         struct ip_options *opt = NULL;
 790         int hh_len;
 791         int exthdrlen;
 792         int mtu;
 793         int copy;
 794         int err;
 795         int offset = 0;
 796         unsigned int maxfraglen, fragheaderlen;
 797         int csummode = CHECKSUM_NONE;
 798
 799         if (flags&MSG_PROBE)
 800                 return 0;
 801
 802         if (skb_queue_empty(&sk->sk_write_queue)) {
 803                 /*
 804                  * setup for corking.
 805                  */
 806                 opt = ipc->opt;
 807                 if (opt) {
 808                         if (inet->cork.opt == NULL) {
 809                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 810                                 if (unlikely(inet->cork.opt == NULL))
 811                                         return -ENOBUFS;
 812                         }
 813                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 814                         inet->cork.flags |= IPCORK_OPT;
 815                         inet->cork.addr = ipc->addr;
 816                 }
 817                 dst_hold(&rt->u.dst);
 818                 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 819                                             rt->u.dst.dev->mtu :
 820                                             dst_mtu(rt->u.dst.path);
 821                 inet->cork.rt = rt;
 822                 inet->cork.length = 0;
 823                 sk->sk_sndmsg_page = NULL;
 824                 sk->sk_sndmsg_off = 0;
 825                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 826                         length += exthdrlen;
 827                         transhdrlen += exthdrlen;
 828                 }
 829         } else {
 830                 rt = inet->cork.rt;
 831                 if (inet->cork.flags & IPCORK_OPT)
 832                         opt = inet->cork.opt;
 833
 834                 transhdrlen = 0;
 835                 exthdrlen = 0;
 836                 mtu = inet->cork.fragsize;
 837         }
 838         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 839
 840         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 841         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 842
 843         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 844                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 845                 return -EMSGSIZE;
 846         }
 847
 848         /*
 849          * transhdrlen > 0 means that this is the first fragment and we wish
 850          * it won't be fragmented in the future.
 851          */
 852         if (transhdrlen &&
 853             length + fragheaderlen <= mtu &&
 854             rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 855             !exthdrlen)
 856                 csummode = CHECKSUM_PARTIAL;
 857
 858         inet->cork.length += length;
 859         if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
 860             (sk->sk_protocol == IPPROTO_UDP) &&
 861             (rt->u.dst.dev->features & NETIF_F_UFO)) {
 862                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 863                                          fragheaderlen, transhdrlen, mtu,
 864                                          flags);
 865                 if (err)
 866                         goto error;
 867                 return 0;
 868         }
 869
 870         /* So, what's going on in the loop below?
 871          *
 872          * We use calculated fragment length to generate chained skb,
 873          * each of segments is IP fragment ready for sending to network after
 874          * adding appropriate IP header.
 875          */
 876
 877         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 878                 goto alloc_new_skb;
 879
 880         while (length > 0) {
 881                 /* Check if the remaining data fits into current packet. */
 882                 copy = mtu - skb->len;
 883                 if (copy < length)
 884                         copy = maxfraglen - skb->len;
 885                 if (copy <= 0) {
 886                         char *data;
 887                         unsigned int datalen;
 888                         unsigned int fraglen;
 889                         unsigned int fraggap;
 890                         unsigned int alloclen;
 891                         struct sk_buff *skb_prev;
 892 alloc_new_skb:
 893                         skb_prev = skb;
 894                         if (skb_prev)
 895                                 fraggap = skb_prev->len - maxfraglen;
 896                         else
 897                                 fraggap = 0;
 898
 899                         /*
 900                          * If remaining data exceeds the mtu,
 901                          * we know we need more fragment(s).
 902                          */
 903                         datalen = length + fraggap;
 904                         if (datalen > mtu - fragheaderlen)
 905                                 datalen = maxfraglen - fragheaderlen;
 906                         fraglen = datalen + fragheaderlen;
 907
 908                         if ((flags & MSG_MORE) &&
 909                             !(rt->u.dst.dev->features&NETIF_F_SG))
 910                                 alloclen = mtu;
 911                         else
 912                                 alloclen = datalen + fragheaderlen;
 913
 914                         /* The last fragment gets additional space at tail.
 915                          * Note, with MSG_MORE we overallocate on fragments,
 916                          * because we have no idea what fragment will be
 917                          * the last.
 918                          */
 919                         if (datalen == length + fraggap)
 920                                 alloclen += rt->u.dst.trailer_len;
 921
 922                         if (transhdrlen) {
 923                                 skb = sock_alloc_send_skb(sk,
 924                                                 alloclen + hh_len + 15,
 925                                                 (flags & MSG_DONTWAIT), &err);
 926                         } else {
 927                                 skb = NULL;
 928                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 929                                     2 * sk->sk_sndbuf)
 930                                         skb = sock_wmalloc(sk,
 931                                                            alloclen + hh_len + 15, 1,
 932                                                            sk->sk_allocation);
 933                                 if (unlikely(skb == NULL))
 934                                         err = -ENOBUFS;
 935                         }
 936                         if (skb == NULL)
 937                                 goto error;
 938
 939                         /*
 940                          *      Fill in the control structures
 941                          */
 942                         skb->ip_summed = csummode;
 943                         skb->csum = 0;
 944                         skb_reserve(skb, hh_len);
 945
 946                         /*
 947                          *      Find where to start putting bytes.
 948                          */
 949                         data = skb_put(skb, fraglen);
 950                         skb_set_network_header(skb, exthdrlen);
 951                         skb->transport_header = (skb->network_header +
 952                                                  fragheaderlen);
 953                         data += fragheaderlen;
 954
 955                         if (fraggap) {
 956                                 skb->csum = skb_copy_and_csum_bits(
 957                                         skb_prev, maxfraglen,
 958                                         data + transhdrlen, fraggap, 0);
 959                                 skb_prev->csum = csum_sub(skb_prev->csum,
 960                                                           skb->csum);
 961                                 data += fraggap;
 962                                 pskb_trim_unique(skb_prev, maxfraglen);
 963                         }
 964
 965                         copy = datalen - transhdrlen - fraggap;
 966                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 967                                 err = -EFAULT;
 968                                 kfree_skb(skb);
 969                                 goto error;
 970                         }
 971
 972                         offset += copy;
 973                         length -= datalen - fraggap;
 974                         transhdrlen = 0;
 975                         exthdrlen = 0;
 976                         csummode = CHECKSUM_NONE;
 977
 978                         /*
 979                          * Put the packet on the pending queue.
 980                          */
 981                         __skb_queue_tail(&sk->sk_write_queue, skb);
 982                         continue;
 983                 }
 984
 985                 if (copy > length)
 986                         copy = length;
 987
 988                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 989                         unsigned int off;
 990
 991                         off = skb->len;
 992                         if (getfrag(from, skb_put(skb, copy),
 993                                         offset, copy, off, skb) < 0) {
 994                                 __skb_trim(skb, off);
 995                                 err = -EFAULT;
 996                                 goto error;
 997                         }
 998                 } else {
 999                         int i = skb_shinfo(skb)->nr_frags;
1000                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1001                         struct page *page = sk->sk_sndmsg_page;
1002                         int off = sk->sk_sndmsg_off;
1003                         unsigned int left;
1004
1005                         if (page && (left = PAGE_SIZE - off) > 0) {
1006                                 if (copy >= left)
1007                                         copy = left;
1008                                 if (page != frag->page) {
1009                                         if (i == MAX_SKB_FRAGS) {
1010                                                 err = -EMSGSIZE;
1011                                                 goto error;
1012                                         }
1013                                         get_page(page);
1014                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1015                                         frag = &skb_shinfo(skb)->frags[i];
1016                                 }
1017                         } else if (i < MAX_SKB_FRAGS) {
1018                                 if (copy > PAGE_SIZE)
1019                                         copy = PAGE_SIZE;
1020                                 page = alloc_pages(sk->sk_allocation, 0);
1021                                 if (page == NULL)  {
1022                                         err = -ENOMEM;
1023                                         goto error;
1024                                 }
1025                                 sk->sk_sndmsg_page = page;
1026                                 sk->sk_sndmsg_off = 0;
1027
1028                                 skb_fill_page_desc(skb, i, page, 0, 0);
1029                                 frag = &skb_shinfo(skb)->frags[i];
1030                         } else {
1031                                 err = -EMSGSIZE;
1032                                 goto error;
1033                         }
1034                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1035                                 err = -EFAULT;
1036                                 goto error;
1037                         }
1038                         sk->sk_sndmsg_off += copy;
1039                         frag->size += copy;
1040                         skb->len += copy;
1041                         skb->data_len += copy;
1042                         skb->truesize += copy;
1043                         atomic_add(copy, &sk->sk_wmem_alloc);
1044                 }
1045                 offset += copy;
1046                 length -= copy;
1047         }
1048
1049         return 0;
1050
1051 error:
1052         inet->cork.length -= length;
1053         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1054         return err;
1055 }
1056
1057 ssize_t ip_append_page(struct sock *sk, struct page *page,
1058                        int offset, size_t size, int flags)
1059 {
             /*
              * Append @size bytes of @page, starting at @offset, as page
              * fragments to the datagram already pending on
              * sk->sk_write_queue. The route, options and fragment size
              * come from inet->cork, so a cork must already have been
              * started.
              *
              * Returns 0 on success (not a byte count, despite the ssize_t
              * return type) or a negative errno:
              *   -EPERM      inet->hdrincl is set
              *   -EINVAL     nothing is queued on the socket
              *   -EOPNOTSUPP the output device lacks NETIF_F_SG
              *   -EMSGSIZE   the datagram would exceed 0xFFFF bytes, or the
              *               skb has no free fragment slot
              *   -ENOBUFS    a follow-on skb could not be allocated
              * With MSG_PROBE in @flags it returns 0 without doing anything.
              */
1060         struct inet_sock *inet = inet_sk(sk);
1061         struct sk_buff *skb;
1062         struct rtable *rt;
1063         struct ip_options *opt = NULL;
1064         int hh_len;
1065         int mtu;
1066         int len;
1067         int err;
1068         unsigned int maxfraglen, fragheaderlen, fraggap;
1069
1070         if (inet->hdrincl)
1071                 return -EPERM;
1072
1073         if (flags&MSG_PROBE)
1074                 return 0;
1075
1076         if (skb_queue_empty(&sk->sk_write_queue))
1077                 return -EINVAL;
1078
1079         rt = inet->cork.rt;
1080         if (inet->cork.flags & IPCORK_OPT)
1081                 opt = inet->cork.opt;
1082
             /* Pages can only be attached when the device does SG. */
1083         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1084                 return -EOPNOTSUPP;
1085
1086         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1087         mtu = inet->cork.fragsize;
1088
             /*
              * fragheaderlen: IP header plus options.
              * maxfraglen: header plus the largest payload that is a
              * multiple of 8 bytes and still fits in the mtu.
              */
1089         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1090         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1091
             /* The complete datagram must fit the 16-bit IP total length. */
1092         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1093                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1094                 return -EMSGSIZE;
1095         }
1096
1097         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1098                 return -EINVAL;
1099
             /*
              * Account for the new data up front; the error path below
              * subtracts whatever was not appended.
              */
1100         inet->cork.length += size;
             /*
              * UDP on a UFO-capable device: mark the tail skb as GSO so the
              * loop below puts all data into it (assuming skb_is_gso()
              * keys off the gso_size set here - confirm).
              */
1101         if ((sk->sk_protocol == IPPROTO_UDP) &&
1102             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1103                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1104                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1105         }
1106
1107
1108         while (size > 0) {
1109                 int i;
1110
1111                 if (skb_is_gso(skb))
1112                         len = size;
1113                 else {
1114
1115                         /* Check if the remaining data fits into current packet. */
1116                         len = mtu - skb->len;
                             /*
                              * Not everything fits: more fragments follow,
                              * so fill this skb only up to maxfraglen.
                              */
1117                         if (len < size)
1118                                 len = maxfraglen - skb->len;
1119                 }
1120                 if (len <= 0) {
                             /*
                              * The tail skb is full: start a new one whose
                              * linear part holds only the header room plus
                              * the overhang (fraggap) of the previous skb.
                              */
1121                         struct sk_buff *skb_prev;
1122                         int alloclen;
1123
1124                         skb_prev = skb;
1125                         fraggap = skb_prev->len - maxfraglen;
1126
1127                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1128                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1129                         if (unlikely(!skb)) {
1130                                 err = -ENOBUFS;
1131                                 goto error;
1132                         }
1133
1134                         /*
1135                          *      Fill in the control structures
1136                          */
1137                         skb->ip_summed = CHECKSUM_NONE;
1138                         skb->csum = 0;
1139                         skb_reserve(skb, hh_len);
1140
1141                         /*
1142                          *      Find where to start putting bytes.
1143                          */
1144                         skb_put(skb, fragheaderlen + fraggap);
1145                         skb_reset_network_header(skb);
1146                         skb->transport_header = (skb->network_header +
1147                                                  fragheaderlen);
                             /*
                              * Move the overhang of the previous skb into
                              * this one, transfer its share of the checksum,
                              * and trim the previous skb to maxfraglen.
                              */
1148                         if (fraggap) {
1149                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1150                                                                    maxfraglen,
1151                                                     skb_transport_header(skb),
1152                                                                    fraggap, 0);
1153                                 skb_prev->csum = csum_sub(skb_prev->csum,
1154                                                           skb->csum);
1155                                 pskb_trim_unique(skb_prev, maxfraglen);
1156                         }
1157
1158                         /*
1159                          * Put the packet on the pending queue.
1160                          */
1161                         __skb_queue_tail(&sk->sk_write_queue, skb);
1162                         continue;
1163                 }
1164
1165                 i = skb_shinfo(skb)->nr_frags;
1166                 if (len > size)
1167                         len = size;
                     /*
                      * Extend the last fragment when the helper says the
                      * page data can be coalesced with it; otherwise take a
                      * page reference and add a new fragment, or fail when
                      * all MAX_SKB_FRAGS slots are used.
                      */
1168                 if (skb_can_coalesce(skb, i, page, offset)) {
1169                         skb_shinfo(skb)->frags[i-1].size += len;
1170                 } else if (i < MAX_SKB_FRAGS) {
1171                         get_page(page);
1172                         skb_fill_page_desc(skb, i, page, offset, len);
1173                 } else {
1174                         err = -EMSGSIZE;
1175                         goto error;
1176                 }
1177
                     /*
                      * For CHECKSUM_NONE skbs, fold the checksum of the new
                      * page data into skb->csum at its offset in the skb.
                      */
1178                 if (skb->ip_summed == CHECKSUM_NONE) {
1179                         __wsum csum;
1180                         csum = csum_page(page, offset, len);
1181                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1182                 }
1183
                     /*
                      * Grow the skb and charge the added bytes to the
                      * socket's write memory.
                      */
1184                 skb->len += len;
1185                 skb->data_len += len;
1186                 skb->truesize += len;
1187                 atomic_add(len, &sk->sk_wmem_alloc);
1188                 offset += len;
1189                 size -= len;
1190         }
1191         return 0;
1192
1193 error:
             /*
              * Give back the part of the size that was never appended and
              * count the failure as an output discard.
              */
1194         inet->cork.length -= size;
1195         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1196         return err;
1197 }
1198
1199 /*
1200  *      Combine all pending IP fragments on the socket into one IP datagram
1201  *      and push them out.
      *
      *      The first queued skb becomes the head; every further skb is
      *      chained onto its frag_list. The IP header is then written at the
      *      head's data pointer and the result is handed to ip_local_out().
      *      The cork state (options and route reference) is released on
      *      every path, including when the queue was empty.
      *
      *      Returns 0 on success or when nothing was queued, otherwise the
      *      error derived from ip_local_out().
1202  */
1203 int ip_push_pending_frames(struct sock *sk)
1204 {
1205         struct sk_buff *skb, *tmp_skb;
1206         struct sk_buff **tail_skb;
1207         struct inet_sock *inet = inet_sk(sk);
1208         struct ip_options *opt = NULL;
1209         struct rtable *rt = inet->cork.rt;
1210         struct iphdr *iph;
1211         __be16 df = 0;
1212         __u8 ttl;
1213         int err = 0;
1214
1215         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1216                 goto out;
1217         tail_skb = &(skb_shinfo(skb)->frag_list);
1218
1219         /* move skb->data to ip header from ext header */
1220         if (skb->data < skb_network_header(skb))
1221                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Chain the remaining skbs behind the head. Each one has the
              * header room pulled off (the length is measured on the head
              * skb), its sizes are added to the head, and it is detached
              * from the socket by clearing destructor and sk.
              */
1222         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1223                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1224                 *tail_skb = tmp_skb;
1225                 tail_skb = &(tmp_skb->next);
1226                 skb->len += tmp_skb->len;
1227                 skb->data_len += tmp_skb->len;
1228                 skb->truesize += tmp_skb->truesize;
1229                 tmp_skb->destructor = NULL;
1230                 tmp_skb->sk = NULL;
1231         }
1232
1233         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1234          * to fragment the frame generated here. No matter how transforms
1235          * change the size of the packet, it will come out.
1236          */
1237         if (inet->pmtudisc < IP_PMTUDISC_DO)
1238                 skb->local_df = 1;
1239
1240         /* DF bit is set when we want to see DF on outgoing frames.
1241          * If local_df is set too, we still allow to fragment this frame
1242          * locally. */
1243         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1244             (skb->len <= dst_mtu(&rt->u.dst) &&
1245              ip_dont_fragment(sk, &rt->u.dst)))
1246                 df = htons(IP_DF);
1247
1248         if (inet->cork.flags & IPCORK_OPT)
1249                 opt = inet->cork.opt;
1250
             /* Multicast routes use the socket's multicast TTL. */
1251         if (rt->rt_type == RTN_MULTICAST)
1252                 ttl = inet->mc_ttl;
1253         else
1254                 ttl = ip_select_ttl(inet, &rt->u.dst);
1255
             /*
              * Build the IP header in place. ihl starts at 5 words and
              * grows by the option length, converted from bytes to 32-bit
              * words. tot_len and the header checksum are not written
              * here (presumably ip_local_out() fills them in - confirm).
              */
1256         iph = (struct iphdr *)skb->data;
1257         iph->version = 4;
1258         iph->ihl = 5;
1259         if (opt) {
1260                 iph->ihl += opt->optlen>>2;
1261                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1262         }
1263         iph->tos = inet->tos;
1264         iph->frag_off = df;
1265         ip_select_ident(iph, &rt->u.dst, sk);
1266         iph->ttl = ttl;
1267         iph->protocol = sk->sk_protocol;
1268         iph->saddr = rt->rt_src;
1269         iph->daddr = rt->rt_dst;
1270
1271         skb->priority = sk->sk_priority;
1272         skb->dst = dst_clone(&rt->u.dst);
1273
1274         /* Netfilter gets whole the not fragmented skb. */
1275         err = ip_local_out(skb);
             /*
              * A positive return is mapped through net_xmit_errno() when
              * the socket has recverr set and is dropped (err = 0)
              * otherwise; a negative errno is always reported.
              */
1276         if (err) {
1277                 if (err > 0)
1278                         err = inet->recverr ? net_xmit_errno(err) : 0;
1279                 if (err)
1280                         goto error;
1281         }
1282
1283 out:
             /* Tear down the cork: free the options, drop the route. */
1284         inet->cork.flags &= ~IPCORK_OPT;
1285         kfree(inet->cork.opt);
1286         inet->cork.opt = NULL;
1287         if (inet->cork.rt) {
1288                 ip_rt_put(inet->cork.rt);
1289                 inet->cork.rt = NULL;
1290         }
1291         return err;
1292
1293 error:
1294         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1295         goto out;
1296 }
1297
1298 /*
1299  *      Throw away all pending data on the socket.
      *
      *      Frees every skb on sk->sk_write_queue, taking them from the
      *      tail, and then releases the cork state (IPCORK_OPT flag, saved
      *      options, route reference), the same way the "out" path of
      *      ip_push_pending_frames() does.
1300  */
1301 void ip_flush_pending_frames(struct sock *sk)
1302 {
1303         struct inet_sock *inet = inet_sk(sk);
1304         struct sk_buff *skb;
1305
1306         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1307                 kfree_skb(skb);
1308
             /* Tear down the cork: free the options, drop the route. */
1309         inet->cork.flags &= ~IPCORK_OPT;
1310         kfree(inet->cork.opt);
1311         inet->cork.opt = NULL;
1312         if (inet->cork.rt) {
1313                 ip_rt_put(inet->cork.rt);
1314                 inet->cork.rt = NULL;
1315         }
1316 }
1317
1318
1319 /*
1320  *      Fetch data from kernel space and fill in checksum if needed.
      *
      *      getfrag-style callback, passed to ip_append_data() by
      *      ip_send_reply(). Copies @len bytes from @dptr + @offset to @to
      *      while checksumming them, then folds that partial checksum into
      *      skb->csum with @odd as the block offset. Always returns 0.
1321  */
1322 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1323                               int len, int odd, struct sk_buff *skb)
1324 {
1325         __wsum csum;
1326
1327         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1328         skb->csum = csum_block_add(skb->csum, csum, odd);
1329         return 0;
1330 }
1331
1332 /*
1333  *      Generic function to send a packet as reply to another packet.
1334  *      Used to send TCP resets so far. ICMP should use this function too.
1335  *
1336  *      Should run single threaded per socket because it uses the sock
1337  *      structure to pass arguments.
1338  *
1339  *      LATER: switch from ip_build_xmit to ip_append_*
      *      (this remark looks stale: the body already uses
      *      ip_append_data() and ip_push_pending_frames())
      *
      *      @sk:   socket used to build and send the reply; its tos,
      *             priority, protocol and bound device are overwritten.
      *      @skb:  the packet being replied to.
      *      @arg:  reply description: iov (payload), csum and csumoffset
      *             (checksum fix-up), bound_dev_if (output interface).
      *      @len:  number of payload bytes to send from arg->iov->iov_base.
      *
      *      Returns nothing; every failure (option echo, route lookup,
      *      append) silently results in no reply being sent.
1340  */
1341 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1342                    unsigned int len)
1343 {
1344         struct inet_sock *inet = inet_sk(sk);
             /* ip_options followed by 40 bytes of storage for option data. */
1345         struct {
1346                 struct ip_options       opt;
1347                 char                    data[40];
1348         } replyopts;
1349         struct ipcm_cookie ipc;
1350         __be32 daddr;
1351         struct rtable *rt = (struct rtable*)skb->dst;
1352
             /* Build the reply's options from the incoming packet. */
1353         if (ip_options_echo(&replyopts.opt, skb))
1354                 return;
1355
             /* Reply to the source recorded in the incoming route. */
1356         daddr = ipc.addr = rt->rt_src;
1357         ipc.opt = NULL;
1358
1359         if (replyopts.opt.optlen) {
1360                 ipc.opt = &replyopts.opt;
1361
                     /* With source routing, route towards opt.faddr. */
1362                 if (ipc.opt->srr)
1363                         daddr = replyopts.opt.faddr;
1364         }
1365
1366         {
                     /*
                      * Flow key for the reply: ports are taken from the
                      * incoming header, swapped. The lookup replaces rt
                      * with the output route, which is released by the
                      * ip_rt_put() at the end.
                      */
1367                 struct flowi fl = { .oif = arg->bound_dev_if,
1368                                     .nl_u = { .ip4_u =
1369                                               { .daddr = daddr,
1370                                                 .saddr = rt->rt_spec_dst,
1371                                                 .tos = RT_TOS(ip_hdr(skb)->tos) } },
1372                                     /* Not quite clean, but right. */
1373                                     .uli_u = { .ports =
1374                                                { .sport = tcp_hdr(skb)->dest,
1375                                                  .dport = tcp_hdr(skb)->source } },
1376                                     .proto = sk->sk_protocol };
1377                 security_skb_classify_flow(skb, &fl);
1378                 if (ip_route_output_key(&rt, &fl))
1379                         return;
1380         }
1381
1382         /* And let IP do all the hard work.
1383
1384            This chunk is not reenterable, hence spinlock.
1385            Note that it uses the fact, that this function is called
1386            with locally disabled BH and that sk cannot be already spinlocked.
1387          */
1388         bh_lock_sock(sk);
1389         inet->tos = ip_hdr(skb)->tos;
1390         sk->sk_priority = skb->priority;
1391         sk->sk_protocol = ip_hdr(skb)->protocol;
1392         sk->sk_bound_dev_if = arg->bound_dev_if;
             /*
              * The return value is ignored: if the append failed, the queue
              * stays empty and the block below is skipped.
              */
1393         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1394                        &ipc, rt, MSG_DONTWAIT);
1395         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                     /*
                      * When csumoffset is non-negative, store the folded sum
                      * of skb->csum and arg->csum in the transport header;
                      * csumoffset counts 16-bit words (__sum16 pointer
                      * arithmetic).
                      */
1396                 if (arg->csumoffset >= 0)
1397                         *((__sum16 *)skb_transport_header(skb) +
1398                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1399                                                                 arg->csum));
1400                 skb->ip_summed = CHECKSUM_NONE;
1401                 ip_push_pending_frames(sk);
1402         }
1403
1404         bh_unlock_sock(sk);
1405
1406         ip_rt_put(rt);
1407 }
1408
1409 void __init ip_init(void)
1410 {
             /*
              * IP layer start-up hook: calls ip_rt_init() and
              * inet_initpeers() (presumably routing and inet-peer set-up),
              * plus igmp_mc_proc_init() when both CONFIG_IP_MULTICAST and
              * CONFIG_PROC_FS are enabled.
              */
1411         ip_rt_init();
1412         inet_initpeers();
1413
1414 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1415         igmp_mc_proc_init();
1416 #endif
1417 }
1418
1419 EXPORT_SYMBOL(ip_generic_getfrag);
1420 EXPORT_SYMBOL(ip_queue_xmit);
1421 EXPORT_SYMBOL(ip_send_check);