net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
   38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/mm.h>
  53 #include <linux/string.h>
  54 #include <linux/errno.h>
  55 #include <linux/highmem.h>
  56
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/in.h>
  60 #include <linux/inet.h>
  61 #include <linux/netdevice.h>
  62 #include <linux/etherdevice.h>
  63 #include <linux/proc_fs.h>
  64 #include <linux/stat.h>
  65 #include <linux/init.h>
  66
  67 #include <net/snmp.h>
  68 #include <net/ip.h>
  69 #include <net/protocol.h>
  70 #include <net/route.h>
  71 #include <net/xfrm.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <net/checksum.h>
  79 #include <linux/igmp.h>
  80 #include <linux/netfilter_ipv4.h>
  81 #include <linux/netfilter_bridge.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netlink.h>
  84 #include <linux/tcp.h>
  85
   /*
    * Default TTL value, initialised to IPDEFTTL.  It is not referenced in
    * the visible part of this file; presumably it is exposed as a sysctl
    * knob (going by its name) -- confirm against the sysctl table.
    */
   86 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         newskb->mac.raw = newskb->data;
  99         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103         netif_rx(newskb);
 104         return 0;
 105 }
 106
 107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 108 {
 109         int ttl = inet->uc_ttl;
 110
 111         if (ttl < 0)
 112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 113         return ttl;
 114 }
 115
 116 /*
 117  *              Add an ip header to a skbuff and send it out.
 118  *
 119  */
 120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 121                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 122 {
 123         struct inet_sock *inet = inet_sk(sk);
 124         struct rtable *rt = (struct rtable *)skb->dst;
 125         struct iphdr *iph;
 126
 127         /* Build the IP header. */
 128         if (opt)
 129                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 130         else
 131                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 132
 133         iph->version  = 4;
 134         iph->ihl      = 5;
 135         iph->tos      = inet->tos;
 136         if (ip_dont_fragment(sk, &rt->u.dst))
 137                 iph->frag_off = htons(IP_DF);
 138         else
 139                 iph->frag_off = 0;
 140         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 141         iph->daddr    = rt->rt_dst;
 142         iph->saddr    = rt->rt_src;
 143         iph->protocol = sk->sk_protocol;
 144         iph->tot_len  = htons(skb->len);
 145         ip_select_ident(iph, &rt->u.dst, sk);
 146         skb->nh.iph   = iph;
 147
 148         if (opt && opt->optlen) {
 149                 iph->ihl += opt->optlen>>2;
 150                 ip_options_build(skb, opt, daddr, rt, 0);
 151         }
 152         ip_send_check(iph);
 153
 154         skb->priority = sk->sk_priority;
 155
 156         /* Send it out. */
 157         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 158                        dst_output);
 159 }
 160
 161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 162
 163 static inline int ip_finish_output2(struct sk_buff *skb)
 164 {
 165         struct dst_entry *dst = skb->dst;
 166         struct net_device *dev = dst->dev;
 167         int hh_len = LL_RESERVED_SPACE(dev);
 168
 169         /* Be paranoid, rather than too clever. */
 170         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 171                 struct sk_buff *skb2;
 172
 173                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 174                 if (skb2 == NULL) {
 175                         kfree_skb(skb);
 176                         return -ENOMEM;
 177                 }
 178                 if (skb->sk)
 179                         skb_set_owner_w(skb2, skb->sk);
 180                 kfree_skb(skb);
 181                 skb = skb2;
 182         }
 183
 184         if (dst->hh)
 185                 return neigh_hh_output(dst->hh, skb);
 186         else if (dst->neighbour)
 187                 return dst->neighbour->output(skb);
 188
 189         if (net_ratelimit())
 190                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 191         kfree_skb(skb);
 192         return -EINVAL;
 193 }
 194
 195 static inline int ip_finish_output(struct sk_buff *skb)
 196 {
 197 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 198         /* Policy lookup after SNAT yielded a new policy */
 199         if (skb->dst->xfrm != NULL) {
 200                 IPCB(skb)->flags |= IPSKB_REROUTED;
 201                 return dst_output(skb);
 202         }
 203 #endif
 204         if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
 205                 return ip_fragment(skb, ip_finish_output2);
 206         else
 207                 return ip_finish_output2(skb);
 208 }
 209
 210 int ip_mc_output(struct sk_buff *skb)
 211 {
 212         struct sock *sk = skb->sk;
 213         struct rtable *rt = (struct rtable*)skb->dst;
 214         struct net_device *dev = rt->u.dst.dev;
 215
 216         /*
 217          *      If the indicated interface is up and running, send the packet.
 218          */
 219         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 220
 221         skb->dev = dev;
 222         skb->protocol = htons(ETH_P_IP);
 223
 224         /*
 225          *      Multicasts are looped back for other local users
 226          */
 227
 228         if (rt->rt_flags&RTCF_MULTICAST) {
 229                 if ((!sk || inet_sk(sk)->mc_loop)
 230 #ifdef CONFIG_IP_MROUTE
 231                 /* Small optimization: do not loopback not local frames,
 232                    which returned after forwarding; they will be  dropped
 233                    by ip_mr_input in any case.
 234                    Note, that local frames are looped back to be delivered
 235                    to local recipients.
 236
 237                    This check is duplicated in ip_mr_input at the moment.
 238                  */
 239                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 240 #endif
 241                 ) {
 242                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 243                         if (newskb)
 244                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 245                                         newskb->dev,
 246                                         ip_dev_loopback_xmit);
 247                 }
 248
 249                 /* Multicasts with ttl 0 must not go beyond the host */
 250
 251                 if (skb->nh.iph->ttl == 0) {
 252                         kfree_skb(skb);
 253                         return 0;
 254                 }
 255         }
 256
 257         if (rt->rt_flags&RTCF_BROADCAST) {
 258                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 259                 if (newskb)
 260                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 261                                 newskb->dev, ip_dev_loopback_xmit);
 262         }
 263
 264         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 265                             ip_finish_output,
 266                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 267 }
 268
 269 int ip_output(struct sk_buff *skb)
 270 {
 271         struct net_device *dev = skb->dst->dev;
 272
 273         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 274
 275         skb->dev = dev;
 276         skb->protocol = htons(ETH_P_IP);
 277
 278         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 279                             ip_finish_output,
 280                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 281 }
 282
 283 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 284 {
 285         struct sock *sk = skb->sk;
 286         struct inet_sock *inet = inet_sk(sk);
 287         struct ip_options *opt = inet->opt;
 288         struct rtable *rt;
 289         struct iphdr *iph;
 290
 291         /* Skip all of this if the packet is already routed,
 292          * f.e. by something like SCTP.
 293          */
 294         rt = (struct rtable *) skb->dst;
 295         if (rt != NULL)
 296                 goto packet_routed;
 297
 298         /* Make sure we can route this packet. */
 299         rt = (struct rtable *)__sk_dst_check(sk, 0);
 300         if (rt == NULL) {
 301                 __be32 daddr;
 302
 303                 /* Use correct destination address if we have options. */
 304                 daddr = inet->daddr;
 305                 if(opt && opt->srr)
 306                         daddr = opt->faddr;
 307
 308                 {
 309                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 310                                             .nl_u = { .ip4_u =
 311                                                       { .daddr = daddr,
 312                                                         .saddr = inet->saddr,
 313                                                         .tos = RT_CONN_FLAGS(sk) } },
 314                                             .proto = sk->sk_protocol,
 315                                             .uli_u = { .ports =
 316                                                        { .sport = inet->sport,
 317                                                          .dport = inet->dport } } };
 318
 319                         /* If this fails, retransmit mechanism of transport layer will
 320                          * keep trying until route appears or the connection times
 321                          * itself out.
 322                          */
 323                         security_sk_classify_flow(sk, &fl);
 324                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 325                                 goto no_route;
 326                 }
 327                 sk_setup_caps(sk, &rt->u.dst);
 328         }
 329         skb->dst = dst_clone(&rt->u.dst);
 330
 331 packet_routed:
 332         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 333                 goto no_route;
 334
 335         /* OK, we know where to send it, allocate and build IP header. */
 336         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 337         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 338         iph->tot_len = htons(skb->len);
 339         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 340                 iph->frag_off = htons(IP_DF);
 341         else
 342                 iph->frag_off = 0;
 343         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 344         iph->protocol = sk->sk_protocol;
 345         iph->saddr    = rt->rt_src;
 346         iph->daddr    = rt->rt_dst;
 347         skb->nh.iph   = iph;
 348         /* Transport layer set skb->h.foo itself. */
 349
 350         if (opt && opt->optlen) {
 351                 iph->ihl += opt->optlen >> 2;
 352                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 353         }
 354
 355         ip_select_ident_more(iph, &rt->u.dst, sk,
 356                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 357
 358         /* Add an IP checksum. */
 359         ip_send_check(iph);
 360
 361         skb->priority = sk->sk_priority;
 362
 363         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 364                        dst_output);
 365
 366 no_route:
 367         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 368         kfree_skb(skb);
 369         return -EHOSTUNREACH;
 370 }
 371
 372
 373 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 374 {
 375         to->pkt_type = from->pkt_type;
 376         to->priority = from->priority;
 377         to->protocol = from->protocol;
 378         dst_release(to->dst);
 379         to->dst = dst_clone(from->dst);
 380         to->dev = from->dev;
 381         to->mark = from->mark;
 382
 383         /* Copy the flags to each fragment. */
 384         IPCB(to)->flags = IPCB(from)->flags;
 385
 386 #ifdef CONFIG_NET_SCHED
 387         to->tc_index = from->tc_index;
 388 #endif
 389 #ifdef CONFIG_NETFILTER
 390         /* Connection association is same as pre-frag packet */
 391         nf_conntrack_put(to->nfct);
 392         to->nfct = from->nfct;
 393         nf_conntrack_get(to->nfct);
 394         to->nfctinfo = from->nfctinfo;
 395 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 396         to->ipvs_property = from->ipvs_property;
 397 #endif
 398 #ifdef CONFIG_BRIDGE_NETFILTER
 399         nf_bridge_put(to->nf_bridge);
 400         to->nf_bridge = from->nf_bridge;
 401         nf_bridge_get(to->nf_bridge);
 402 #endif
 403 #endif
 404         skb_copy_secmark(to, from);
 405 }
 406
  407 /*
  408  *      This IP datagram is too large to be sent in one piece.  Break it up into
  409  *      smaller pieces (each of size equal to IP header plus
  410  *      a block of the data of the original IP data part) that will yet fit in a
  411  *      single device frame, and queue such a frame for sending.
       *
       *      @skb:    datagram to split; skb->dst is read as a struct rtable.
       *      @output: called once for every fragment that is produced.
       *
       *      Returns 0 when every fragment was handed to @output without
       *      error, -EMSGSIZE when DF is set and skb->local_df is clear (an
       *      ICMP "fragmentation needed" is sent first), -ENOMEM when a
       *      fragment buffer cannot be allocated, or the first error
       *      returned by @output.
       *
       *      Two strategies are used: if @skb already carries a suitable
       *      frag_list, its members are turned into fragments in place;
       *      otherwise ("slow_path") each fragment is a freshly allocated
       *      skb into which header and payload are copied.
  412  */
  413
  414 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
  415 {
  416         struct iphdr *iph;
  417         int raw = 0;
  418         int ptr;
  419         struct net_device *dev;
  420         struct sk_buff *skb2;
  421         unsigned int mtu, hlen, left, len, ll_rs, pad;
  422         int offset;
  423         __be16 not_last_frag;
  424         struct rtable *rt = (struct rtable*)skb->dst;
  425         int err = 0;
  426
  427         dev = rt->u.dst.dev;
  428
  429         /*
  430          *      Point into the IP datagram header.
  431          */
  432
  433         iph = skb->nh.iph;
  434
              /* DF set and no local override: refuse to fragment, report the MTU. */
  435         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
  436                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
  437                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
  438                           htonl(dst_mtu(&rt->u.dst)));
  439                 kfree_skb(skb);
  440                 return -EMSGSIZE;
  441         }
  442
  443         /*
  444          *      Setup starting values.
  445          */
  446
  447         hlen = iph->ihl * 4;
  448         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
  449         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
  450
  451         /* When frag_list is given, use it. First, check its validity:
  452          * some transformers could create wrong frag_list or break existing
  453          * one, it is not prohibited. In this case fall back to copying.
  454          *
  455          * LATER: this step can be merged to real generation of fragments,
  456          * we can switch to copy when see the first bad fragment.
  457          */
  458         if (skb_shinfo(skb)->frag_list) {
  459                 struct sk_buff *frag;
  460                 int first_len = skb_pagelen(skb);
  461
                      /* The head must be an uncloned, not yet fragmented datagram
                       * whose payload fits the MTU and is a multiple of 8 bytes.
                       */
  462                 if (first_len - hlen > mtu ||
  463                     ((first_len - hlen) & 7) ||
  464                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
  465                     skb_cloned(skb))
  466                         goto slow_path;
  467
  468                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
  469                         /* Correct geometry. */
  470                         if (frag->len > mtu ||
  471                             ((frag->len & 7) && frag->next) ||
  472                             skb_headroom(frag) < hlen)
  473                             goto slow_path;
  474
  475                         /* Partially cloned skb? */
  476                         if (skb_shared(frag))
  477                                 goto slow_path;
  478
  479                         BUG_ON(frag->sk);
                              /* Attach the fragment to the owning socket: take a
                               * socket reference, install sock_wfree as destructor
                               * and move the fragment's truesize out of the head.
                               * NOTE(review): none of this is undone if a later
                               * list member jumps to slow_path -- confirm that
                               * is harmless.
                               */
  480                         if (skb->sk) {
  481                                 sock_hold(skb->sk);
  482                                 frag->sk = skb->sk;
  483                                 frag->destructor = sock_wfree;
  484                                 skb->truesize -= frag->truesize;
  485                         }
  486                 }
  487
  488                 /* Everything is OK. Generate! */
  489
  490                 err = 0;
  491                 offset = 0;
                      /* Detach the list; the head skb becomes the first fragment. */
  492                 frag = skb_shinfo(skb)->frag_list;
  493                 skb_shinfo(skb)->frag_list = NULL;
  494                 skb->data_len = first_len - skb_headlen(skb);
  495                 skb->len = first_len;
  496                 iph->tot_len = htons(first_len);
  497                 iph->frag_off = htons(IP_MF);
  498                 ip_send_check(iph);
  499
                      /* Each pass sends skb and, beforehand, prepares the header
                       * of frag (the fragment that follows it), if there is one.
                       */
  500                 for (;;) {
  501                         /* Prepare header of the next frame,
  502                          * before previous one went down. */
  503                         if (frag) {
  504                                 frag->ip_summed = CHECKSUM_NONE;
  505                                 frag->h.raw = frag->data;
  506                                 frag->nh.raw = __skb_push(frag, hlen);
  507                                 memcpy(frag->nh.raw, iph, hlen);
  508                                 iph = frag->nh.iph;
  509                                 iph->tot_len = htons(frag->len);
  510                                 ip_copy_metadata(frag, skb);
                                      /* offset is still 0 only for the first list member. */
  511                                 if (offset == 0)
  512                                         ip_options_fragment(frag);
                                      /* Advance by the payload carried in skb. */
  513                                 offset += skb->len - hlen;
  514                                 iph->frag_off = htons(offset>>3);
  515                                 if (frag->next != NULL)
  516                                         iph->frag_off |= htons(IP_MF);
  517                                 /* Ready, complete checksum */
  518                                 ip_send_check(iph);
  519                         }
  520
  521                         err = output(skb);
  522
  523                         if (!err)
  524                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
  525                         if (err || !frag)
  526                                 break;
  527
  528                         skb = frag;
  529                         frag = skb->next;
  530                         skb->next = NULL;
  531                 }
  532
  533                 if (err == 0) {
  534                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
  535                         return 0;
  536                 }
  537
                      /* output() failed: free the fragments not yet submitted. */
  538                 while (frag) {
  539                         skb = frag->next;
  540                         kfree_skb(frag);
  541                         frag = skb;
  542                 }
  543                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
  544                 return err;
  545         }
  546
  547 slow_path:
              /* left: payload bytes not yet copied into a fragment.
               * ptr:  read position inside skb; raw is never changed in this
               *       function, so ptr starts at hlen.
               */
  548         left = skb->len - hlen;         /* Space per frame */
  549         ptr = raw + hlen;               /* Where to start from */
  550
  551         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
  552          * we need to make room for the encapsulating header
  553          */
  554         pad = nf_bridge_pad(skb);
  555         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
  556         mtu -= pad;
  557
  558         /*
  559          *      Fragment the datagram.
  560          */
  561
              /* Start from the offset/MF state of the incoming packet, which
               * may itself already be a fragment.
               */
  562         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
  563         not_last_frag = iph->frag_off & htons(IP_MF);
  564
  565         /*
  566          *      Keep copying data until we run out.
  567          */
  568
  569         while(left > 0) {
  570                 len = left;
  571                 /* IF: it doesn't fit, use 'mtu' - the data space left */
  572                 if (len > mtu)
  573                         len = mtu;
  574                 /* IF: we are not sending upto and including the packet end
  575                    then align the next start on an eight byte boundary */
  576                 if (len < left) {
  577                         len &= ~7;
  578                 }
  579                 /*
  580                  *      Allocate buffer.
  581                  */
  582
  583                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
  584                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
  585                         err = -ENOMEM;
  586                         goto fail;
  587                 }
  588
  589                 /*
  590                  *      Set up data on packet
  591                  */
  592
  593                 ip_copy_metadata(skb2, skb);
  594                 skb_reserve(skb2, ll_rs);
  595                 skb_put(skb2, len + hlen);
  596                 skb2->nh.raw = skb2->data;
  597                 skb2->h.raw = skb2->data + hlen;
  598
  599                 /*
  600                  *      Charge the memory for the fragment to any owner
  601                  *      it might possess
  602                  */
  603
  604                 if (skb->sk)
  605                         skb_set_owner_w(skb2, skb->sk);
  606
  607                 /*
  608                  *      Copy the packet header into the new buffer.
  609                  */
  610
  611                 memcpy(skb2->nh.raw, skb->data, hlen);
  612
  613                 /*
  614                  *      Copy a block of the IP datagram.
  615                  */
                      /* A failing copy is treated as a fatal inconsistency. */
  616                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
  617                         BUG();
  618                 left -= len;
  619
  620                 /*
  621                  *      Fill in the new header fields.
  622                  */
  623                 iph = skb2->nh.iph;
  624                 iph->frag_off = htons((offset >> 3));
  625
  626                 /* ANK: dirty, but effective trick. Upgrade options only if
  627                  * the segment to be fragmented was THE FIRST (otherwise,
  628                  * options are already fixed) and make it ONCE
  629                  * on the initial skb, so that all the following fragments
  630                  * will inherit fixed options.
  631                  */
  632                 if (offset == 0)
  633                         ip_options_fragment(skb);
  634
  635                 /*
  636                  *      Added AC : If we are fragmenting a fragment that's not the
  637                  *                 last fragment then keep MF on each bit
  638                  */
  639                 if (left > 0 || not_last_frag)
  640                         iph->frag_off |= htons(IP_MF);
  641                 ptr += len;
  642                 offset += len;
  643
  644                 /*
  645                  *      Put this fragment into the sending queue.
  646                  */
  647                 iph->tot_len = htons(len + hlen);
  648
  649                 ip_send_check(iph);
  650
  651                 err = output(skb2);
  652                 if (err)
  653                         goto fail;
  654
  655                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
  656         }
              /* All payload has been copied out; the original is no longer needed. */
  657         kfree_skb(skb);
  658         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
  659         return err;
  660
  661 fail:
  662         kfree_skb(skb);
  663         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
  664         return err;
  665 }
  666
  667 EXPORT_SYMBOL(ip_fragment);
 668
 669 int
 670 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 671 {
 672         struct iovec *iov = from;
 673
 674         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 675                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 676                         return -EFAULT;
 677         } else {
 678                 __wsum csum = 0;
 679                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 680                         return -EFAULT;
 681                 skb->csum = csum_block_add(skb->csum, csum, odd);
 682         }
 683         return 0;
 684 }
 685
 686 static inline __wsum
 687 csum_page(struct page *page, int offset, int copy)
 688 {
 689         char *kaddr;
 690         __wsum csum;
 691         kaddr = kmap(page);
 692         csum = csum_partial(kaddr + offset, copy, 0);
 693         kunmap(page);
 694         return csum;
 695 }
 696
 697 static inline int ip_ufo_append_data(struct sock *sk,
 698                         int getfrag(void *from, char *to, int offset, int len,
 699                                int odd, struct sk_buff *skb),
 700                         void *from, int length, int hh_len, int fragheaderlen,
 701                         int transhdrlen, int mtu,unsigned int flags)
 702 {
 703         struct sk_buff *skb;
 704         int err;
 705
 706         /* There is support for UDP fragmentation offload by network
 707          * device, so create one single skb packet containing complete
 708          * udp datagram
 709          */
 710         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 711                 skb = sock_alloc_send_skb(sk,
 712                         hh_len + fragheaderlen + transhdrlen + 20,
 713                         (flags & MSG_DONTWAIT), &err);
 714
 715                 if (skb == NULL)
 716                         return err;
 717
 718                 /* reserve space for Hardware header */
 719                 skb_reserve(skb, hh_len);
 720
 721                 /* create space for UDP/IP header */
 722                 skb_put(skb,fragheaderlen + transhdrlen);
 723
 724                 /* initialize network header pointer */
 725                 skb->nh.raw = skb->data;
 726
 727                 /* initialize protocol header pointer */
 728                 skb->h.raw = skb->data + fragheaderlen;
 729
 730                 skb->ip_summed = CHECKSUM_PARTIAL;
 731                 skb->csum = 0;
 732                 sk->sk_sndmsg_off = 0;
 733         }
 734
 735         err = skb_append_datato_frags(sk,skb, getfrag, from,
 736                                (length - transhdrlen));
 737         if (!err) {
 738                 /* specify the length of each IP datagram fragment*/
 739                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 740                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 741                 __skb_queue_tail(&sk->sk_write_queue, skb);
 742
 743                 return 0;
 744         }
 745         /* There is not enough support do UFO ,
 746          * so follow normal path
 747          */
 748         kfree_skb(skb);
 749         return err;
 750 }
 751
 752 /*
 753  *      ip_append_data() and ip_append_page() can make one large IP datagram
  754  *      from many pieces of data. Each piece will be held on the socket
 755  *      until ip_push_pending_frames() is called. Each piece can be a page
 756  *      or non-page data.
 757  *
 758  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 759  *      this interface potentially.
 760  *
 761  *      LATER: length must be adjusted by pad at tail, when it is required.
 762  */
 763 int ip_append_data(struct sock *sk,
 764                    int getfrag(void *from, char *to, int offset, int len,
 765                                int odd, struct sk_buff *skb),
 766                    void *from, int length, int transhdrlen,
 767                    struct ipcm_cookie *ipc, struct rtable *rt,
 768                    unsigned int flags)
 769 {
 770         struct inet_sock *inet = inet_sk(sk);
 771         struct sk_buff *skb;
 772
 773         struct ip_options *opt = NULL;
 774         int hh_len;
 775         int exthdrlen;
 776         int mtu;
 777         int copy;
 778         int err;
 779         int offset = 0;
 780         unsigned int maxfraglen, fragheaderlen;
 781         int csummode = CHECKSUM_NONE;
 782
 783         if (flags&MSG_PROBE)
 784                 return 0;
 785
 786         if (skb_queue_empty(&sk->sk_write_queue)) {
 787                 /*
 788                  * setup for corking.
 789                  */
 790                 opt = ipc->opt;
 791                 if (opt) {
 792                         if (inet->cork.opt == NULL) {
 793                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 794                                 if (unlikely(inet->cork.opt == NULL))
 795                                         return -ENOBUFS;
 796                         }
 797                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 798                         inet->cork.flags |= IPCORK_OPT;
 799                         inet->cork.addr = ipc->addr;
 800                 }
 801                 dst_hold(&rt->u.dst);
 802                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 803                 inet->cork.rt = rt;
 804                 inet->cork.length = 0;
 805                 sk->sk_sndmsg_page = NULL;
 806                 sk->sk_sndmsg_off = 0;
 807                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 808                         length += exthdrlen;
 809                         transhdrlen += exthdrlen;
 810                 }
 811         } else {
 812                 rt = inet->cork.rt;
 813                 if (inet->cork.flags & IPCORK_OPT)
 814                         opt = inet->cork.opt;
 815
 816                 transhdrlen = 0;
 817                 exthdrlen = 0;
 818                 mtu = inet->cork.fragsize;
 819         }
 820         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 821
 822         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 823         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 824
 825         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 826                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 827                 return -EMSGSIZE;
 828         }
 829
 830         /*
 831          * transhdrlen > 0 means that this is the first fragment and we wish
 832          * it won't be fragmented in the future.
 833          */
 834         if (transhdrlen &&
 835             length + fragheaderlen <= mtu &&
 836             rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 837             !exthdrlen)
 838                 csummode = CHECKSUM_PARTIAL;
 839
 840         inet->cork.length += length;
 841         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 842                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 843
 844                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 845                                          fragheaderlen, transhdrlen, mtu,
 846                                          flags);
 847                 if (err)
 848                         goto error;
 849                 return 0;
 850         }
 851
 852         /* So, what's going on in the loop below?
 853          *
 854          * We use calculated fragment length to generate chained skb,
 855          * each of segments is IP fragment ready for sending to network after
 856          * adding appropriate IP header.
 857          */
 858
 859         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 860                 goto alloc_new_skb;
 861
 862         while (length > 0) {
 863                 /* Check if the remaining data fits into current packet. */
 864                 copy = mtu - skb->len;
 865                 if (copy < length)
 866                         copy = maxfraglen - skb->len;
 867                 if (copy <= 0) {
 868                         char *data;
 869                         unsigned int datalen;
 870                         unsigned int fraglen;
 871                         unsigned int fraggap;
 872                         unsigned int alloclen;
 873                         struct sk_buff *skb_prev;
 874 alloc_new_skb:
 875                         skb_prev = skb;
 876                         if (skb_prev)
 877                                 fraggap = skb_prev->len - maxfraglen;
 878                         else
 879                                 fraggap = 0;
 880
 881                         /*
 882                          * If remaining data exceeds the mtu,
 883                          * we know we need more fragment(s).
 884                          */
 885                         datalen = length + fraggap;
 886                         if (datalen > mtu - fragheaderlen)
 887                                 datalen = maxfraglen - fragheaderlen;
 888                         fraglen = datalen + fragheaderlen;
 889
 890                         if ((flags & MSG_MORE) &&
 891                             !(rt->u.dst.dev->features&NETIF_F_SG))
 892                                 alloclen = mtu;
 893                         else
 894                                 alloclen = datalen + fragheaderlen;
 895
 896                         /* The last fragment gets additional space at tail.
 897                          * Note, with MSG_MORE we overallocate on fragments,
 898                          * because we have no idea what fragment will be
 899                          * the last.
 900                          */
 901                         if (datalen == length + fraggap)
 902                                 alloclen += rt->u.dst.trailer_len;
 903
 904                         if (transhdrlen) {
 905                                 skb = sock_alloc_send_skb(sk,
 906                                                 alloclen + hh_len + 15,
 907                                                 (flags & MSG_DONTWAIT), &err);
 908                         } else {
 909                                 skb = NULL;
 910                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 911                                     2 * sk->sk_sndbuf)
 912                                         skb = sock_wmalloc(sk,
 913                                                            alloclen + hh_len + 15, 1,
 914                                                            sk->sk_allocation);
 915                                 if (unlikely(skb == NULL))
 916                                         err = -ENOBUFS;
 917                         }
 918                         if (skb == NULL)
 919                                 goto error;
 920
 921                         /*
 922                          *      Fill in the control structures
 923                          */
 924                         skb->ip_summed = csummode;
 925                         skb->csum = 0;
 926                         skb_reserve(skb, hh_len);
 927
 928                         /*
 929                          *      Find where to start putting bytes.
 930                          */
 931                         data = skb_put(skb, fraglen);
 932                         skb->nh.raw = data + exthdrlen;
 933                         data += fragheaderlen;
 934                         skb->h.raw = data + exthdrlen;
 935
 936                         if (fraggap) {
 937                                 skb->csum = skb_copy_and_csum_bits(
 938                                         skb_prev, maxfraglen,
 939                                         data + transhdrlen, fraggap, 0);
 940                                 skb_prev->csum = csum_sub(skb_prev->csum,
 941                                                           skb->csum);
 942                                 data += fraggap;
 943                                 pskb_trim_unique(skb_prev, maxfraglen);
 944                         }
 945
 946                         copy = datalen - transhdrlen - fraggap;
 947                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 948                                 err = -EFAULT;
 949                                 kfree_skb(skb);
 950                                 goto error;
 951                         }
 952
 953                         offset += copy;
 954                         length -= datalen - fraggap;
 955                         transhdrlen = 0;
 956                         exthdrlen = 0;
 957                         csummode = CHECKSUM_NONE;
 958
 959                         /*
 960                          * Put the packet on the pending queue.
 961                          */
 962                         __skb_queue_tail(&sk->sk_write_queue, skb);
 963                         continue;
 964                 }
 965
 966                 if (copy > length)
 967                         copy = length;
 968
 969                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 970                         unsigned int off;
 971
 972                         off = skb->len;
 973                         if (getfrag(from, skb_put(skb, copy),
 974                                         offset, copy, off, skb) < 0) {
 975                                 __skb_trim(skb, off);
 976                                 err = -EFAULT;
 977                                 goto error;
 978                         }
 979                 } else {
 980                         int i = skb_shinfo(skb)->nr_frags;
 981                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 982                         struct page *page = sk->sk_sndmsg_page;
 983                         int off = sk->sk_sndmsg_off;
 984                         unsigned int left;
 985
 986                         if (page && (left = PAGE_SIZE - off) > 0) {
 987                                 if (copy >= left)
 988                                         copy = left;
 989                                 if (page != frag->page) {
 990                                         if (i == MAX_SKB_FRAGS) {
 991                                                 err = -EMSGSIZE;
 992                                                 goto error;
 993                                         }
 994                                         get_page(page);
 995                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 996                                         frag = &skb_shinfo(skb)->frags[i];
 997                                 }
 998                         } else if (i < MAX_SKB_FRAGS) {
 999                                 if (copy > PAGE_SIZE)
1000                                         copy = PAGE_SIZE;
1001                                 page = alloc_pages(sk->sk_allocation, 0);
1002                                 if (page == NULL)  {
1003                                         err = -ENOMEM;
1004                                         goto error;
1005                                 }
1006                                 sk->sk_sndmsg_page = page;
1007                                 sk->sk_sndmsg_off = 0;
1008
1009                                 skb_fill_page_desc(skb, i, page, 0, 0);
1010                                 frag = &skb_shinfo(skb)->frags[i];
1011                                 skb->truesize += PAGE_SIZE;
1012                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1013                         } else {
1014                                 err = -EMSGSIZE;
1015                                 goto error;
1016                         }
1017                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1018                                 err = -EFAULT;
1019                                 goto error;
1020                         }
1021                         sk->sk_sndmsg_off += copy;
1022                         frag->size += copy;
1023                         skb->len += copy;
1024                         skb->data_len += copy;
1025                 }
1026                 offset += copy;
1027                 length -= copy;
1028         }
1029
1030         return 0;
1031
1032 error:
1033         inet->cork.length -= length;
1034         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1035         return err;
1036 }
1037
1038 ssize_t ip_append_page(struct sock *sk, struct page *page,
1039                        int offset, size_t size, int flags)
1040 {
1041         struct inet_sock *inet = inet_sk(sk);
1042         struct sk_buff *skb;
1043         struct rtable *rt;
1044         struct ip_options *opt = NULL;
1045         int hh_len;
1046         int mtu;
1047         int len;
1048         int err;
1049         unsigned int maxfraglen, fragheaderlen, fraggap;
1050
1051         if (inet->hdrincl)
1052                 return -EPERM;
1053
1054         if (flags&MSG_PROBE)
1055                 return 0;
1056
1057         if (skb_queue_empty(&sk->sk_write_queue))
1058                 return -EINVAL;
1059
1060         rt = inet->cork.rt;
1061         if (inet->cork.flags & IPCORK_OPT)
1062                 opt = inet->cork.opt;
1063
1064         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1065                 return -EOPNOTSUPP;
1066
1067         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1068         mtu = inet->cork.fragsize;
1069
1070         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1071         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1072
1073         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1074                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1075                 return -EMSGSIZE;
1076         }
1077
1078         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1079                 return -EINVAL;
1080
1081         inet->cork.length += size;
1082         if ((sk->sk_protocol == IPPROTO_UDP) &&
1083             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1084                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1085                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1086         }
1087
1088
1089         while (size > 0) {
1090                 int i;
1091
1092                 if (skb_is_gso(skb))
1093                         len = size;
1094                 else {
1095
1096                         /* Check if the remaining data fits into current packet. */
1097                         len = mtu - skb->len;
1098                         if (len < size)
1099                                 len = maxfraglen - skb->len;
1100                 }
1101                 if (len <= 0) {
1102                         struct sk_buff *skb_prev;
1103                         char *data;
1104                         struct iphdr *iph;
1105                         int alloclen;
1106
1107                         skb_prev = skb;
1108                         fraggap = skb_prev->len - maxfraglen;
1109
1110                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1111                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1112                         if (unlikely(!skb)) {
1113                                 err = -ENOBUFS;
1114                                 goto error;
1115                         }
1116
1117                         /*
1118                          *      Fill in the control structures
1119                          */
1120                         skb->ip_summed = CHECKSUM_NONE;
1121                         skb->csum = 0;
1122                         skb_reserve(skb, hh_len);
1123
1124                         /*
1125                          *      Find where to start putting bytes.
1126                          */
1127                         data = skb_put(skb, fragheaderlen + fraggap);
1128                         skb->nh.iph = iph = (struct iphdr *)data;
1129                         data += fragheaderlen;
1130                         skb->h.raw = data;
1131
1132                         if (fraggap) {
1133                                 skb->csum = skb_copy_and_csum_bits(
1134                                         skb_prev, maxfraglen,
1135                                         data, fraggap, 0);
1136                                 skb_prev->csum = csum_sub(skb_prev->csum,
1137                                                           skb->csum);
1138                                 pskb_trim_unique(skb_prev, maxfraglen);
1139                         }
1140
1141                         /*
1142                          * Put the packet on the pending queue.
1143                          */
1144                         __skb_queue_tail(&sk->sk_write_queue, skb);
1145                         continue;
1146                 }
1147
1148                 i = skb_shinfo(skb)->nr_frags;
1149                 if (len > size)
1150                         len = size;
1151                 if (skb_can_coalesce(skb, i, page, offset)) {
1152                         skb_shinfo(skb)->frags[i-1].size += len;
1153                 } else if (i < MAX_SKB_FRAGS) {
1154                         get_page(page);
1155                         skb_fill_page_desc(skb, i, page, offset, len);
1156                 } else {
1157                         err = -EMSGSIZE;
1158                         goto error;
1159                 }
1160
1161                 if (skb->ip_summed == CHECKSUM_NONE) {
1162                         __wsum csum;
1163                         csum = csum_page(page, offset, len);
1164                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1165                 }
1166
1167                 skb->len += len;
1168                 skb->data_len += len;
1169                 offset += len;
1170                 size -= len;
1171         }
1172         return 0;
1173
1174 error:
1175         inet->cork.length -= size;
1176         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1177         return err;
1178 }
1179
1180 /*
1181  *      Combined all pending IP fragments on the socket as one IP datagram
1182  *      and push them out.
1183  */
1184 int ip_push_pending_frames(struct sock *sk)
1185 {
1186         struct sk_buff *skb, *tmp_skb;
1187         struct sk_buff **tail_skb;
1188         struct inet_sock *inet = inet_sk(sk);
1189         struct ip_options *opt = NULL;
1190         struct rtable *rt = inet->cork.rt;
1191         struct iphdr *iph;
1192         __be16 df = 0;
1193         __u8 ttl;
1194         int err = 0;
1195
1196         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1197                 goto out;
1198         tail_skb = &(skb_shinfo(skb)->frag_list);
1199
1200         /* move skb->data to ip header from ext header */
1201         if (skb->data < skb->nh.raw)
1202                 __skb_pull(skb, skb->nh.raw - skb->data);
1203         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1204                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1205                 *tail_skb = tmp_skb;
1206                 tail_skb = &(tmp_skb->next);
1207                 skb->len += tmp_skb->len;
1208                 skb->data_len += tmp_skb->len;
1209                 skb->truesize += tmp_skb->truesize;
1210                 __sock_put(tmp_skb->sk);
1211                 tmp_skb->destructor = NULL;
1212                 tmp_skb->sk = NULL;
1213         }
1214
1215         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1216          * to fragment the frame generated here. No matter, what transforms
1217          * how transforms change size of the packet, it will come out.
1218          */
1219         if (inet->pmtudisc != IP_PMTUDISC_DO)
1220                 skb->local_df = 1;
1221
1222         /* DF bit is set when we want to see DF on outgoing frames.
1223          * If local_df is set too, we still allow to fragment this frame
1224          * locally. */
1225         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1226             (skb->len <= dst_mtu(&rt->u.dst) &&
1227              ip_dont_fragment(sk, &rt->u.dst)))
1228                 df = htons(IP_DF);
1229
1230         if (inet->cork.flags & IPCORK_OPT)
1231                 opt = inet->cork.opt;
1232
1233         if (rt->rt_type == RTN_MULTICAST)
1234                 ttl = inet->mc_ttl;
1235         else
1236                 ttl = ip_select_ttl(inet, &rt->u.dst);
1237
1238         iph = (struct iphdr *)skb->data;
1239         iph->version = 4;
1240         iph->ihl = 5;
1241         if (opt) {
1242                 iph->ihl += opt->optlen>>2;
1243                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1244         }
1245         iph->tos = inet->tos;
1246         iph->tot_len = htons(skb->len);
1247         iph->frag_off = df;
1248         ip_select_ident(iph, &rt->u.dst, sk);
1249         iph->ttl = ttl;
1250         iph->protocol = sk->sk_protocol;
1251         iph->saddr = rt->rt_src;
1252         iph->daddr = rt->rt_dst;
1253         ip_send_check(iph);
1254
1255         skb->priority = sk->sk_priority;
1256         skb->dst = dst_clone(&rt->u.dst);
1257
1258         /* Netfilter gets whole the not fragmented skb. */
1259         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1260                       skb->dst->dev, dst_output);
1261         if (err) {
1262                 if (err > 0)
1263                         err = inet->recverr ? net_xmit_errno(err) : 0;
1264                 if (err)
1265                         goto error;
1266         }
1267
1268 out:
1269         inet->cork.flags &= ~IPCORK_OPT;
1270         kfree(inet->cork.opt);
1271         inet->cork.opt = NULL;
1272         if (inet->cork.rt) {
1273                 ip_rt_put(inet->cork.rt);
1274                 inet->cork.rt = NULL;
1275         }
1276         return err;
1277
1278 error:
1279         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1280         goto out;
1281 }
1282
1283 /*
1284  *      Throw away all pending data on the socket.
1285  */
1286 void ip_flush_pending_frames(struct sock *sk)
1287 {
1288         struct inet_sock *inet = inet_sk(sk);
1289         struct sk_buff *skb;
1290
1291         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1292                 kfree_skb(skb);
1293
1294         inet->cork.flags &= ~IPCORK_OPT;
1295         kfree(inet->cork.opt);
1296         inet->cork.opt = NULL;
1297         if (inet->cork.rt) {
1298                 ip_rt_put(inet->cork.rt);
1299                 inet->cork.rt = NULL;
1300         }
1301 }
1302
1303
1304 /*
1305  *      Fetch data from kernel space and fill in checksum if needed.
1306  */
1307 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1308                               int len, int odd, struct sk_buff *skb)
1309 {
1310         __wsum csum;
1311
1312         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1313         skb->csum = csum_block_add(skb->csum, csum, odd);
1314         return 0;
1315 }
1316
1317 /*
1318  *      Generic function to send a packet as reply to another packet.
1319  *      Used to send TCP resets so far. ICMP should use this function too.
1320  *
1321  *      Should run single threaded per socket because it uses the sock
1322  *      structure to pass arguments.
1323  *
1324  *      LATER: switch from ip_build_xmit to ip_append_*
1325  */
1326 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1327                    unsigned int len)
1328 {
1329         struct inet_sock *inet = inet_sk(sk);
1330         struct {
1331                 struct ip_options       opt;
1332                 char                    data[40];
1333         } replyopts;
1334         struct ipcm_cookie ipc;
1335         __be32 daddr;
1336         struct rtable *rt = (struct rtable*)skb->dst;
1337
1338         if (ip_options_echo(&replyopts.opt, skb))
1339                 return;
1340
1341         daddr = ipc.addr = rt->rt_src;
1342         ipc.opt = NULL;
1343
1344         if (replyopts.opt.optlen) {
1345                 ipc.opt = &replyopts.opt;
1346
1347                 if (ipc.opt->srr)
1348                         daddr = replyopts.opt.faddr;
1349         }
1350
1351         {
1352                 struct flowi fl = { .nl_u = { .ip4_u =
1353                                               { .daddr = daddr,
1354                                                 .saddr = rt->rt_spec_dst,
1355                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1356                                     /* Not quite clean, but right. */
1357                                     .uli_u = { .ports =
1358                                                { .sport = skb->h.th->dest,
1359                                                  .dport = skb->h.th->source } },
1360                                     .proto = sk->sk_protocol };
1361                 security_skb_classify_flow(skb, &fl);
1362                 if (ip_route_output_key(&rt, &fl))
1363                         return;
1364         }
1365
1366         /* And let IP do all the hard work.
1367
1368            This chunk is not reenterable, hence spinlock.
1369            Note that it uses the fact, that this function is called
1370            with locally disabled BH and that sk cannot be already spinlocked.
1371          */
1372         bh_lock_sock(sk);
1373         inet->tos = skb->nh.iph->tos;
1374         sk->sk_priority = skb->priority;
1375         sk->sk_protocol = skb->nh.iph->protocol;
1376         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1377                        &ipc, rt, MSG_DONTWAIT);
1378         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1379                 if (arg->csumoffset >= 0)
1380                         *((__sum16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1381                 skb->ip_summed = CHECKSUM_NONE;
1382                 ip_push_pending_frames(sk);
1383         }
1384
1385         bh_unlock_sock(sk);
1386
1387         ip_rt_put(rt);
1388 }
1389
1390 void __init ip_init(void)
1391 {
1392         ip_rt_init();
1393         inet_initpeers();
1394
1395 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1396         igmp_mc_proc_init();
1397 #endif
1398 }
1399
/* Symbols exported via EXPORT_SYMBOL; their definitions lie outside this excerpt. */
1400 EXPORT_SYMBOL(ip_generic_getfrag);
1401 EXPORT_SYMBOL(ip_queue_xmit);
1402 EXPORT_SYMBOL(ip_send_check);