net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
   38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <net/checksum.h>
  79 #include <linux/igmp.h>
  80 #include <linux/netfilter_ipv4.h>
  81 #include <linux/netfilter_bridge.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netlink.h>
  84 #include <linux/tcp.h>
  85
  86 int sysctl_ip_default_ttl = IPDEFTTL;
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         newskb->mac.raw = newskb->data;
  99         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103         netif_rx(newskb);
 104         return 0;
 105 }
 106
 107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 108 {
 109         int ttl = inet->uc_ttl;
 110
 111         if (ttl < 0)
 112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 113         return ttl;
 114 }
 115
 116 /*
 117  *              Add an ip header to a skbuff and send it out.
 118  *
 119  */
 120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 121                           u32 saddr, u32 daddr, struct ip_options *opt)
 122 {
 123         struct inet_sock *inet = inet_sk(sk);
 124         struct rtable *rt = (struct rtable *)skb->dst;
 125         struct iphdr *iph;
 126
 127         /* Build the IP header. */
 128         if (opt)
 129                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 130         else
 131                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 132
 133         iph->version  = 4;
 134         iph->ihl      = 5;
 135         iph->tos      = inet->tos;
 136         if (ip_dont_fragment(sk, &rt->u.dst))
 137                 iph->frag_off = htons(IP_DF);
 138         else
 139                 iph->frag_off = 0;
 140         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 141         iph->daddr    = rt->rt_dst;
 142         iph->saddr    = rt->rt_src;
 143         iph->protocol = sk->sk_protocol;
 144         iph->tot_len  = htons(skb->len);
 145         ip_select_ident(iph, &rt->u.dst, sk);
 146         skb->nh.iph   = iph;
 147
 148         if (opt && opt->optlen) {
 149                 iph->ihl += opt->optlen>>2;
 150                 ip_options_build(skb, opt, daddr, rt, 0);
 151         }
 152         ip_send_check(iph);
 153
 154         skb->priority = sk->sk_priority;
 155
 156         /* Send it out. */
 157         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 158                        dst_output);
 159 }
 160
 161 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 162
 163 static inline int ip_finish_output2(struct sk_buff *skb)
 164 {
 165         struct dst_entry *dst = skb->dst;
 166         struct hh_cache *hh = dst->hh;
 167         struct net_device *dev = dst->dev;
 168         int hh_len = LL_RESERVED_SPACE(dev);
 169
 170         /* Be paranoid, rather than too clever. */
 171         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 172                 struct sk_buff *skb2;
 173
 174                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 175                 if (skb2 == NULL) {
 176                         kfree_skb(skb);
 177                         return -ENOMEM;
 178                 }
 179                 if (skb->sk)
 180                         skb_set_owner_w(skb2, skb->sk);
 181                 kfree_skb(skb);
 182                 skb = skb2;
 183         }
 184
 185         if (hh) {
 186                 int hh_alen;
 187
 188                 read_lock_bh(&hh->hh_lock);
 189                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 190                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 191                 read_unlock_bh(&hh->hh_lock);
 192                 skb_push(skb, hh->hh_len);
 193                 return hh->hh_output(skb);
 194         } else if (dst->neighbour)
 195                 return dst->neighbour->output(skb);
 196
 197         if (net_ratelimit())
 198                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 199         kfree_skb(skb);
 200         return -EINVAL;
 201 }
 202
 203 static inline int ip_finish_output(struct sk_buff *skb)
 204 {
 205 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 206         /* Policy lookup after SNAT yielded a new policy */
 207         if (skb->dst->xfrm != NULL)
 208                 return xfrm4_output_finish(skb);
 209 #endif
 210         if (skb->len > dst_mtu(skb->dst) &&
 211             !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
 212                 return ip_fragment(skb, ip_finish_output2);
 213         else
 214                 return ip_finish_output2(skb);
 215 }
 216
 217 int ip_mc_output(struct sk_buff *skb)
 218 {
 219         struct sock *sk = skb->sk;
 220         struct rtable *rt = (struct rtable*)skb->dst;
 221         struct net_device *dev = rt->u.dst.dev;
 222
 223         /*
 224          *      If the indicated interface is up and running, send the packet.
 225          */
 226         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 227
 228         skb->dev = dev;
 229         skb->protocol = htons(ETH_P_IP);
 230
 231         /*
 232          *      Multicasts are looped back for other local users
 233          */
 234
 235         if (rt->rt_flags&RTCF_MULTICAST) {
 236                 if ((!sk || inet_sk(sk)->mc_loop)
 237 #ifdef CONFIG_IP_MROUTE
 238                 /* Small optimization: do not loopback not local frames,
 239                    which returned after forwarding; they will be  dropped
 240                    by ip_mr_input in any case.
 241                    Note, that local frames are looped back to be delivered
 242                    to local recipients.
 243
 244                    This check is duplicated in ip_mr_input at the moment.
 245                  */
 246                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 247 #endif
 248                 ) {
 249                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 250                         if (newskb)
 251                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 252                                         newskb->dev,
 253                                         ip_dev_loopback_xmit);
 254                 }
 255
 256                 /* Multicasts with ttl 0 must not go beyond the host */
 257
 258                 if (skb->nh.iph->ttl == 0) {
 259                         kfree_skb(skb);
 260                         return 0;
 261                 }
 262         }
 263
 264         if (rt->rt_flags&RTCF_BROADCAST) {
 265                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 266                 if (newskb)
 267                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 268                                 newskb->dev, ip_dev_loopback_xmit);
 269         }
 270
 271         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 272                        ip_finish_output);
 273 }
 274
 275 int ip_output(struct sk_buff *skb)
 276 {
 277         struct net_device *dev = skb->dst->dev;
 278
 279         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 280
 281         skb->dev = dev;
 282         skb->protocol = htons(ETH_P_IP);
 283
 284         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 285                        ip_finish_output);
 286 }
 287
 288 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 289 {
 290         struct sock *sk = skb->sk;
 291         struct inet_sock *inet = inet_sk(sk);
 292         struct ip_options *opt = inet->opt;
 293         struct rtable *rt;
 294         struct iphdr *iph;
 295
 296         /* Skip all of this if the packet is already routed,
 297          * f.e. by something like SCTP.
 298          */
 299         rt = (struct rtable *) skb->dst;
 300         if (rt != NULL)
 301                 goto packet_routed;
 302
 303         /* Make sure we can route this packet. */
 304         rt = (struct rtable *)__sk_dst_check(sk, 0);
 305         if (rt == NULL) {
 306                 u32 daddr;
 307
 308                 /* Use correct destination address if we have options. */
 309                 daddr = inet->daddr;
 310                 if(opt && opt->srr)
 311                         daddr = opt->faddr;
 312
 313                 {
 314                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 315                                             .nl_u = { .ip4_u =
 316                                                       { .daddr = daddr,
 317                                                         .saddr = inet->saddr,
 318                                                         .tos = RT_CONN_FLAGS(sk) } },
 319                                             .proto = sk->sk_protocol,
 320                                             .uli_u = { .ports =
 321                                                        { .sport = inet->sport,
 322                                                          .dport = inet->dport } } };
 323
 324                         /* If this fails, retransmit mechanism of transport layer will
 325                          * keep trying until route appears or the connection times
 326                          * itself out.
 327                          */
 328                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 329                                 goto no_route;
 330                 }
 331                 sk_setup_caps(sk, &rt->u.dst);
 332         }
 333         skb->dst = dst_clone(&rt->u.dst);
 334
 335 packet_routed:
 336         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 337                 goto no_route;
 338
 339         /* OK, we know where to send it, allocate and build IP header. */
 340         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 341         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 342         iph->tot_len = htons(skb->len);
 343         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 344                 iph->frag_off = htons(IP_DF);
 345         else
 346                 iph->frag_off = 0;
 347         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 348         iph->protocol = sk->sk_protocol;
 349         iph->saddr    = rt->rt_src;
 350         iph->daddr    = rt->rt_dst;
 351         skb->nh.iph   = iph;
 352         /* Transport layer set skb->h.foo itself. */
 353
 354         if (opt && opt->optlen) {
 355                 iph->ihl += opt->optlen >> 2;
 356                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 357         }
 358
 359         ip_select_ident_more(iph, &rt->u.dst, sk,
 360                              (skb_shinfo(skb)->tso_segs ?: 1) - 1);
 361
 362         /* Add an IP checksum. */
 363         ip_send_check(iph);
 364
 365         skb->priority = sk->sk_priority;
 366
 367         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 368                        dst_output);
 369
 370 no_route:
 371         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 372         kfree_skb(skb);
 373         return -EHOSTUNREACH;
 374 }
 375
 376
 377 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 378 {
 379         to->pkt_type = from->pkt_type;
 380         to->priority = from->priority;
 381         to->protocol = from->protocol;
 382         dst_release(to->dst);
 383         to->dst = dst_clone(from->dst);
 384         to->dev = from->dev;
 385
 386         /* Copy the flags to each fragment. */
 387         IPCB(to)->flags = IPCB(from)->flags;
 388
 389 #ifdef CONFIG_NET_SCHED
 390         to->tc_index = from->tc_index;
 391 #endif
 392 #ifdef CONFIG_NETFILTER
 393         to->nfmark = from->nfmark;
 394         /* Connection association is same as pre-frag packet */
 395         nf_conntrack_put(to->nfct);
 396         to->nfct = from->nfct;
 397         nf_conntrack_get(to->nfct);
 398         to->nfctinfo = from->nfctinfo;
 399 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 400         to->ipvs_property = from->ipvs_property;
 401 #endif
 402 #ifdef CONFIG_BRIDGE_NETFILTER
 403         nf_bridge_put(to->nf_bridge);
 404         to->nf_bridge = from->nf_bridge;
 405         nf_bridge_get(to->nf_bridge);
 406 #endif
 407 #endif
 408 }
 409
 410 /*
 411  *      This IP datagram is too large to be sent in one piece.  Break it up into
 412  *      smaller pieces (each of size equal to IP header plus
 413  *      a block of the data of the original IP data part) that will yet fit in a
 414  *      single device frame, and queue such a frame for sending.
 415  */
 416
 417 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 418 {
 419         struct iphdr *iph;
 420         int raw = 0;
 421         int ptr;
 422         struct net_device *dev;
 423         struct sk_buff *skb2;
 424         unsigned int mtu, hlen, left, len, ll_rs;
 425         int offset;
 426         __be16 not_last_frag;
 427         struct rtable *rt = (struct rtable*)skb->dst;
 428         int err = 0;
 429
 430         dev = rt->u.dst.dev;
 431
 432         /*
 433          *      Point into the IP datagram header.
 434          */
 435
 436         iph = skb->nh.iph;
 437
 438         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 439                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 440                           htonl(dst_mtu(&rt->u.dst)));
 441                 kfree_skb(skb);
 442                 return -EMSGSIZE;
 443         }
 444
 445         /*
 446          *      Setup starting values.
 447          */
 448
 449         hlen = iph->ihl * 4;
 450         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 451         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 452
 453         /* When frag_list is given, use it. First, check its validity:
 454          * some transformers could create wrong frag_list or break existing
 455          * one, it is not prohibited. In this case fall back to copying.
 456          *
 457          * LATER: this step can be merged to real generation of fragments,
 458          * we can switch to copy when see the first bad fragment.
 459          */
 460         if (skb_shinfo(skb)->frag_list) {
 461                 struct sk_buff *frag;
 462                 int first_len = skb_pagelen(skb);
 463
 464                 if (first_len - hlen > mtu ||
 465                     ((first_len - hlen) & 7) ||
 466                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 467                     skb_cloned(skb))
 468                         goto slow_path;
 469
 470                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 471                         /* Correct geometry. */
 472                         if (frag->len > mtu ||
 473                             ((frag->len & 7) && frag->next) ||
 474                             skb_headroom(frag) < hlen)
 475                             goto slow_path;
 476
 477                         /* Partially cloned skb? */
 478                         if (skb_shared(frag))
 479                                 goto slow_path;
 480
 481                         BUG_ON(frag->sk);
 482                         if (skb->sk) {
 483                                 sock_hold(skb->sk);
 484                                 frag->sk = skb->sk;
 485                                 frag->destructor = sock_wfree;
 486                                 skb->truesize -= frag->truesize;
 487                         }
 488                 }
 489
 490                 /* Everything is OK. Generate! */
 491
 492                 err = 0;
 493                 offset = 0;
 494                 frag = skb_shinfo(skb)->frag_list;
 495                 skb_shinfo(skb)->frag_list = NULL;
 496                 skb->data_len = first_len - skb_headlen(skb);
 497                 skb->len = first_len;
 498                 iph->tot_len = htons(first_len);
 499                 iph->frag_off = htons(IP_MF);
 500                 ip_send_check(iph);
 501
 502                 for (;;) {
 503                         /* Prepare header of the next frame,
 504                          * before previous one went down. */
 505                         if (frag) {
 506                                 frag->ip_summed = CHECKSUM_NONE;
 507                                 frag->h.raw = frag->data;
 508                                 frag->nh.raw = __skb_push(frag, hlen);
 509                                 memcpy(frag->nh.raw, iph, hlen);
 510                                 iph = frag->nh.iph;
 511                                 iph->tot_len = htons(frag->len);
 512                                 ip_copy_metadata(frag, skb);
 513                                 if (offset == 0)
 514                                         ip_options_fragment(frag);
 515                                 offset += skb->len - hlen;
 516                                 iph->frag_off = htons(offset>>3);
 517                                 if (frag->next != NULL)
 518                                         iph->frag_off |= htons(IP_MF);
 519                                 /* Ready, complete checksum */
 520                                 ip_send_check(iph);
 521                         }
 522
 523                         err = output(skb);
 524
 525                         if (err || !frag)
 526                                 break;
 527
 528                         skb = frag;
 529                         frag = skb->next;
 530                         skb->next = NULL;
 531                 }
 532
 533                 if (err == 0) {
 534                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 535                         return 0;
 536                 }
 537
 538                 while (frag) {
 539                         skb = frag->next;
 540                         kfree_skb(frag);
 541                         frag = skb;
 542                 }
 543                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 544                 return err;
 545         }
 546
 547 slow_path:
 548         left = skb->len - hlen;         /* Space per frame */
 549         ptr = raw + hlen;               /* Where to start from */
 550
 551 #ifdef CONFIG_BRIDGE_NETFILTER
 552         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 553          * we need to make room for the encapsulating header */
 554         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 555         mtu -= nf_bridge_pad(skb);
 556 #else
 557         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 558 #endif
 559         /*
 560          *      Fragment the datagram.
 561          */
 562
 563         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 564         not_last_frag = iph->frag_off & htons(IP_MF);
 565
 566         /*
 567          *      Keep copying data until we run out.
 568          */
 569
 570         while(left > 0) {
 571                 len = left;
 572                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 573                 if (len > mtu)
 574                         len = mtu;
 575                 /* IF: we are not sending upto and including the packet end
 576                    then align the next start on an eight byte boundary */
 577                 if (len < left) {
 578                         len &= ~7;
 579                 }
 580                 /*
 581                  *      Allocate buffer.
 582                  */
 583
 584                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 585                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 586                         err = -ENOMEM;
 587                         goto fail;
 588                 }
 589
 590                 /*
 591                  *      Set up data on packet
 592                  */
 593
 594                 ip_copy_metadata(skb2, skb);
 595                 skb_reserve(skb2, ll_rs);
 596                 skb_put(skb2, len + hlen);
 597                 skb2->nh.raw = skb2->data;
 598                 skb2->h.raw = skb2->data + hlen;
 599
 600                 /*
 601                  *      Charge the memory for the fragment to any owner
 602                  *      it might possess
 603                  */
 604
 605                 if (skb->sk)
 606                         skb_set_owner_w(skb2, skb->sk);
 607
 608                 /*
 609                  *      Copy the packet header into the new buffer.
 610                  */
 611
 612                 memcpy(skb2->nh.raw, skb->data, hlen);
 613
 614                 /*
 615                  *      Copy a block of the IP datagram.
 616                  */
 617                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 618                         BUG();
 619                 left -= len;
 620
 621                 /*
 622                  *      Fill in the new header fields.
 623                  */
 624                 iph = skb2->nh.iph;
 625                 iph->frag_off = htons((offset >> 3));
 626
 627                 /* ANK: dirty, but effective trick. Upgrade options only if
 628                  * the segment to be fragmented was THE FIRST (otherwise,
 629                  * options are already fixed) and make it ONCE
 630                  * on the initial skb, so that all the following fragments
 631                  * will inherit fixed options.
 632                  */
 633                 if (offset == 0)
 634                         ip_options_fragment(skb);
 635
 636                 /*
 637                  *      Added AC : If we are fragmenting a fragment that's not the
 638                  *                 last fragment then keep MF on each bit
 639                  */
 640                 if (left > 0 || not_last_frag)
 641                         iph->frag_off |= htons(IP_MF);
 642                 ptr += len;
 643                 offset += len;
 644
 645                 /*
 646                  *      Put this fragment into the sending queue.
 647                  */
 648
 649                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 650
 651                 iph->tot_len = htons(len + hlen);
 652
 653                 ip_send_check(iph);
 654
 655                 err = output(skb2);
 656                 if (err)
 657                         goto fail;
 658         }
 659         kfree_skb(skb);
 660         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 661         return err;
 662
 663 fail:
 664         kfree_skb(skb);
 665         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 666         return err;
 667 }
 668
 669 int
 670 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 671 {
 672         struct iovec *iov = from;
 673
 674         if (skb->ip_summed == CHECKSUM_HW) {
 675                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 676                         return -EFAULT;
 677         } else {
 678                 unsigned int csum = 0;
 679                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 680                         return -EFAULT;
 681                 skb->csum = csum_block_add(skb->csum, csum, odd);
 682         }
 683         return 0;
 684 }
 685
 686 static inline unsigned int
 687 csum_page(struct page *page, int offset, int copy)
 688 {
 689         char *kaddr;
 690         unsigned int csum;
 691         kaddr = kmap(page);
 692         csum = csum_partial(kaddr + offset, copy, 0);
 693         kunmap(page);
 694         return csum;
 695 }
 696
 697 static inline int ip_ufo_append_data(struct sock *sk,
 698                         int getfrag(void *from, char *to, int offset, int len,
 699                                int odd, struct sk_buff *skb),
 700                         void *from, int length, int hh_len, int fragheaderlen,
 701                         int transhdrlen, int mtu,unsigned int flags)
 702 {
 703         struct sk_buff *skb;
 704         int err;
 705
 706         /* There is support for UDP fragmentation offload by network
 707          * device, so create one single skb packet containing complete
 708          * udp datagram
 709          */
 710         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 711                 skb = sock_alloc_send_skb(sk,
 712                         hh_len + fragheaderlen + transhdrlen + 20,
 713                         (flags & MSG_DONTWAIT), &err);
 714
 715                 if (skb == NULL)
 716                         return err;
 717
 718                 /* reserve space for Hardware header */
 719                 skb_reserve(skb, hh_len);
 720
 721                 /* create space for UDP/IP header */
 722                 skb_put(skb,fragheaderlen + transhdrlen);
 723
 724                 /* initialize network header pointer */
 725                 skb->nh.raw = skb->data;
 726
 727                 /* initialize protocol header pointer */
 728                 skb->h.raw = skb->data + fragheaderlen;
 729
 730                 skb->ip_summed = CHECKSUM_HW;
 731                 skb->csum = 0;
 732                 sk->sk_sndmsg_off = 0;
 733         }
 734
 735         err = skb_append_datato_frags(sk,skb, getfrag, from,
 736                                (length - transhdrlen));
 737         if (!err) {
 738                 /* specify the length of each IP datagram fragment*/
 739                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
 740                 __skb_queue_tail(&sk->sk_write_queue, skb);
 741
 742                 return 0;
 743         }
 744         /* There is not enough support do UFO ,
 745          * so follow normal path
 746          */
 747         kfree_skb(skb);
 748         return err;
 749 }
 750
 751 /*
 752  *      ip_append_data() and ip_append_page() can make one large IP datagram
  753  *      from many pieces of data. Each piece will be held on the socket
 754  *      until ip_push_pending_frames() is called. Each piece can be a page
 755  *      or non-page data.
 756  *
 757  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 758  *      this interface potentially.
 759  *
 760  *      LATER: length must be adjusted by pad at tail, when it is required.
 761  */
 762 int ip_append_data(struct sock *sk,
 763                    int getfrag(void *from, char *to, int offset, int len,
 764                                int odd, struct sk_buff *skb),
 765                    void *from, int length, int transhdrlen,
 766                    struct ipcm_cookie *ipc, struct rtable *rt,
 767                    unsigned int flags)
 768 {
 769         struct inet_sock *inet = inet_sk(sk);
 770         struct sk_buff *skb;
 771
 772         struct ip_options *opt = NULL;
 773         int hh_len;
 774         int exthdrlen;
 775         int mtu;
 776         int copy;
 777         int err;
 778         int offset = 0;
 779         unsigned int maxfraglen, fragheaderlen;
 780         int csummode = CHECKSUM_NONE;
 781
 782         if (flags&MSG_PROBE)
 783                 return 0;
 784
 785         if (skb_queue_empty(&sk->sk_write_queue)) {
 786                 /*
 787                  * setup for corking.
 788                  */
 789                 opt = ipc->opt;
 790                 if (opt) {
 791                         if (inet->cork.opt == NULL) {
 792                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 793                                 if (unlikely(inet->cork.opt == NULL))
 794                                         return -ENOBUFS;
 795                         }
 796                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 797                         inet->cork.flags |= IPCORK_OPT;
 798                         inet->cork.addr = ipc->addr;
 799                 }
 800                 dst_hold(&rt->u.dst);
 801                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 802                 inet->cork.rt = rt;
 803                 inet->cork.length = 0;
 804                 sk->sk_sndmsg_page = NULL;
 805                 sk->sk_sndmsg_off = 0;
 806                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 807                         length += exthdrlen;
 808                         transhdrlen += exthdrlen;
 809                 }
 810         } else {
 811                 rt = inet->cork.rt;
 812                 if (inet->cork.flags & IPCORK_OPT)
 813                         opt = inet->cork.opt;
 814
 815                 transhdrlen = 0;
 816                 exthdrlen = 0;
 817                 mtu = inet->cork.fragsize;
 818         }
 819         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 820
 821         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 822         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 823
 824         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 825                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 826                 return -EMSGSIZE;
 827         }
 828
 829         /*
 830          * transhdrlen > 0 means that this is the first fragment and we wish
 831          * it won't be fragmented in the future.
 832          */
 833         if (transhdrlen &&
 834             length + fragheaderlen <= mtu &&
 835             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 836             !exthdrlen)
 837                 csummode = CHECKSUM_HW;
 838
 839         inet->cork.length += length;
 840         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 841                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 842
 843                 if(ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 844                                fragheaderlen, transhdrlen, mtu, flags))
 845                         goto error;
 846
 847                 return 0;
 848         }
 849
 850         /* So, what's going on in the loop below?
 851          *
 852          * We use calculated fragment length to generate chained skb,
 853          * each of segments is IP fragment ready for sending to network after
 854          * adding appropriate IP header.
 855          */
 856
 857         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 858                 goto alloc_new_skb;
 859
 860         while (length > 0) {
 861                 /* Check if the remaining data fits into current packet. */
 862                 copy = mtu - skb->len;
 863                 if (copy < length)
 864                         copy = maxfraglen - skb->len;
 865                 if (copy <= 0) {
 866                         char *data;
 867                         unsigned int datalen;
 868                         unsigned int fraglen;
 869                         unsigned int fraggap;
 870                         unsigned int alloclen;
 871                         struct sk_buff *skb_prev;
 872 alloc_new_skb:
 873                         skb_prev = skb;
 874                         if (skb_prev)
 875                                 fraggap = skb_prev->len - maxfraglen;
 876                         else
 877                                 fraggap = 0;
 878
 879                         /*
 880                          * If remaining data exceeds the mtu,
 881                          * we know we need more fragment(s).
 882                          */
 883                         datalen = length + fraggap;
 884                         if (datalen > mtu - fragheaderlen)
 885                                 datalen = maxfraglen - fragheaderlen;
 886                         fraglen = datalen + fragheaderlen;
 887
 888                         if ((flags & MSG_MORE) &&
 889                             !(rt->u.dst.dev->features&NETIF_F_SG))
 890                                 alloclen = mtu;
 891                         else
 892                                 alloclen = datalen + fragheaderlen;
 893
 894                         /* The last fragment gets additional space at tail.
 895                          * Note, with MSG_MORE we overallocate on fragments,
 896                          * because we have no idea what fragment will be
 897                          * the last.
 898                          */
 899                         if (datalen == length)
 900                                 alloclen += rt->u.dst.trailer_len;
 901
 902                         if (transhdrlen) {
 903                                 skb = sock_alloc_send_skb(sk,
 904                                                 alloclen + hh_len + 15,
 905                                                 (flags & MSG_DONTWAIT), &err);
 906                         } else {
 907                                 skb = NULL;
 908                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 909                                     2 * sk->sk_sndbuf)
 910                                         skb = sock_wmalloc(sk,
 911                                                            alloclen + hh_len + 15, 1,
 912                                                            sk->sk_allocation);
 913                                 if (unlikely(skb == NULL))
 914                                         err = -ENOBUFS;
 915                         }
 916                         if (skb == NULL)
 917                                 goto error;
 918
 919                         /*
 920                          *      Fill in the control structures
 921                          */
 922                         skb->ip_summed = csummode;
 923                         skb->csum = 0;
 924                         skb_reserve(skb, hh_len);
 925
 926                         /*
 927                          *      Find where to start putting bytes.
 928                          */
 929                         data = skb_put(skb, fraglen);
 930                         skb->nh.raw = data + exthdrlen;
 931                         data += fragheaderlen;
 932                         skb->h.raw = data + exthdrlen;
 933
 934                         if (fraggap) {
 935                                 skb->csum = skb_copy_and_csum_bits(
 936                                         skb_prev, maxfraglen,
 937                                         data + transhdrlen, fraggap, 0);
 938                                 skb_prev->csum = csum_sub(skb_prev->csum,
 939                                                           skb->csum);
 940                                 data += fraggap;
 941                                 skb_trim(skb_prev, maxfraglen);
 942                         }
 943
 944                         copy = datalen - transhdrlen - fraggap;
 945                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 946                                 err = -EFAULT;
 947                                 kfree_skb(skb);
 948                                 goto error;
 949                         }
 950
 951                         offset += copy;
 952                         length -= datalen - fraggap;
 953                         transhdrlen = 0;
 954                         exthdrlen = 0;
 955                         csummode = CHECKSUM_NONE;
 956
 957                         /*
 958                          * Put the packet on the pending queue.
 959                          */
 960                         __skb_queue_tail(&sk->sk_write_queue, skb);
 961                         continue;
 962                 }
 963
 964                 if (copy > length)
 965                         copy = length;
 966
 967                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 968                         unsigned int off;
 969
 970                         off = skb->len;
 971                         if (getfrag(from, skb_put(skb, copy),
 972                                         offset, copy, off, skb) < 0) {
 973                                 __skb_trim(skb, off);
 974                                 err = -EFAULT;
 975                                 goto error;
 976                         }
 977                 } else {
 978                         int i = skb_shinfo(skb)->nr_frags;
 979                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 980                         struct page *page = sk->sk_sndmsg_page;
 981                         int off = sk->sk_sndmsg_off;
 982                         unsigned int left;
 983
 984                         if (page && (left = PAGE_SIZE - off) > 0) {
 985                                 if (copy >= left)
 986                                         copy = left;
 987                                 if (page != frag->page) {
 988                                         if (i == MAX_SKB_FRAGS) {
 989                                                 err = -EMSGSIZE;
 990                                                 goto error;
 991                                         }
 992                                         get_page(page);
 993                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 994                                         frag = &skb_shinfo(skb)->frags[i];
 995                                 }
 996                         } else if (i < MAX_SKB_FRAGS) {
 997                                 if (copy > PAGE_SIZE)
 998                                         copy = PAGE_SIZE;
 999                                 page = alloc_pages(sk->sk_allocation, 0);
1000                                 if (page == NULL)  {
1001                                         err = -ENOMEM;
1002                                         goto error;
1003                                 }
1004                                 sk->sk_sndmsg_page = page;
1005                                 sk->sk_sndmsg_off = 0;
1006
1007                                 skb_fill_page_desc(skb, i, page, 0, 0);
1008                                 frag = &skb_shinfo(skb)->frags[i];
1009                                 skb->truesize += PAGE_SIZE;
1010                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1011                         } else {
1012                                 err = -EMSGSIZE;
1013                                 goto error;
1014                         }
1015                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1016                                 err = -EFAULT;
1017                                 goto error;
1018                         }
1019                         sk->sk_sndmsg_off += copy;
1020                         frag->size += copy;
1021                         skb->len += copy;
1022                         skb->data_len += copy;
1023                 }
1024                 offset += copy;
1025                 length -= copy;
1026         }
1027
1028         return 0;
1029
1030 error:
1031         inet->cork.length -= length;
1032         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1033         return err;
1034 }
1035
1036 ssize_t ip_append_page(struct sock *sk, struct page *page,
1037                        int offset, size_t size, int flags)
1038 {
1039         struct inet_sock *inet = inet_sk(sk);
1040         struct sk_buff *skb;
1041         struct rtable *rt;
1042         struct ip_options *opt = NULL;
1043         int hh_len;
1044         int mtu;
1045         int len;
1046         int err;
1047         unsigned int maxfraglen, fragheaderlen, fraggap;
1048
1049         if (inet->hdrincl)
1050                 return -EPERM;
1051
1052         if (flags&MSG_PROBE)
1053                 return 0;
1054
1055         if (skb_queue_empty(&sk->sk_write_queue))
1056                 return -EINVAL;
1057
1058         rt = inet->cork.rt;
1059         if (inet->cork.flags & IPCORK_OPT)
1060                 opt = inet->cork.opt;
1061
1062         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1063                 return -EOPNOTSUPP;
1064
1065         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1066         mtu = inet->cork.fragsize;
1067
1068         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1069         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1070
1071         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1072                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1073                 return -EMSGSIZE;
1074         }
1075
1076         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1077                 return -EINVAL;
1078
1079         inet->cork.length += size;
1080         if ((sk->sk_protocol == IPPROTO_UDP) &&
1081             (rt->u.dst.dev->features & NETIF_F_UFO))
1082                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
1083
1084
1085         while (size > 0) {
1086                 int i;
1087
1088                 if (skb_shinfo(skb)->ufo_size)
1089                         len = size;
1090                 else {
1091
1092                         /* Check if the remaining data fits into current packet. */
1093                         len = mtu - skb->len;
1094                         if (len < size)
1095                                 len = maxfraglen - skb->len;
1096                 }
1097                 if (len <= 0) {
1098                         struct sk_buff *skb_prev;
1099                         char *data;
1100                         struct iphdr *iph;
1101                         int alloclen;
1102
1103                         skb_prev = skb;
1104                         fraggap = skb_prev->len - maxfraglen;
1105
1106                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1107                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1108                         if (unlikely(!skb)) {
1109                                 err = -ENOBUFS;
1110                                 goto error;
1111                         }
1112
1113                         /*
1114                          *      Fill in the control structures
1115                          */
1116                         skb->ip_summed = CHECKSUM_NONE;
1117                         skb->csum = 0;
1118                         skb_reserve(skb, hh_len);
1119
1120                         /*
1121                          *      Find where to start putting bytes.
1122                          */
1123                         data = skb_put(skb, fragheaderlen + fraggap);
1124                         skb->nh.iph = iph = (struct iphdr *)data;
1125                         data += fragheaderlen;
1126                         skb->h.raw = data;
1127
1128                         if (fraggap) {
1129                                 skb->csum = skb_copy_and_csum_bits(
1130                                         skb_prev, maxfraglen,
1131                                         data, fraggap, 0);
1132                                 skb_prev->csum = csum_sub(skb_prev->csum,
1133                                                           skb->csum);
1134                                 skb_trim(skb_prev, maxfraglen);
1135                         }
1136
1137                         /*
1138                          * Put the packet on the pending queue.
1139                          */
1140                         __skb_queue_tail(&sk->sk_write_queue, skb);
1141                         continue;
1142                 }
1143
1144                 i = skb_shinfo(skb)->nr_frags;
1145                 if (len > size)
1146                         len = size;
1147                 if (skb_can_coalesce(skb, i, page, offset)) {
1148                         skb_shinfo(skb)->frags[i-1].size += len;
1149                 } else if (i < MAX_SKB_FRAGS) {
1150                         get_page(page);
1151                         skb_fill_page_desc(skb, i, page, offset, len);
1152                 } else {
1153                         err = -EMSGSIZE;
1154                         goto error;
1155                 }
1156
1157                 if (skb->ip_summed == CHECKSUM_NONE) {
1158                         unsigned int csum;
1159                         csum = csum_page(page, offset, len);
1160                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1161                 }
1162
1163                 skb->len += len;
1164                 skb->data_len += len;
1165                 offset += len;
1166                 size -= len;
1167         }
1168         return 0;
1169
1170 error:
1171         inet->cork.length -= size;
1172         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1173         return err;
1174 }
1175
1176 /*
1177  *      Combined all pending IP fragments on the socket as one IP datagram
1178  *      and push them out.
1179  */
1180 int ip_push_pending_frames(struct sock *sk)
1181 {
1182         struct sk_buff *skb, *tmp_skb;
1183         struct sk_buff **tail_skb;
1184         struct inet_sock *inet = inet_sk(sk);
1185         struct ip_options *opt = NULL;
1186         struct rtable *rt = inet->cork.rt;
1187         struct iphdr *iph;
1188         __be16 df = 0;
1189         __u8 ttl;
1190         int err = 0;
1191
1192         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1193                 goto out;
1194         tail_skb = &(skb_shinfo(skb)->frag_list);
1195
1196         /* move skb->data to ip header from ext header */
1197         if (skb->data < skb->nh.raw)
1198                 __skb_pull(skb, skb->nh.raw - skb->data);
1199         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1200                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1201                 *tail_skb = tmp_skb;
1202                 tail_skb = &(tmp_skb->next);
1203                 skb->len += tmp_skb->len;
1204                 skb->data_len += tmp_skb->len;
1205                 skb->truesize += tmp_skb->truesize;
1206                 __sock_put(tmp_skb->sk);
1207                 tmp_skb->destructor = NULL;
1208                 tmp_skb->sk = NULL;
1209         }
1210
1211         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1212          * to fragment the frame generated here. No matter, what transforms
1213          * how transforms change size of the packet, it will come out.
1214          */
1215         if (inet->pmtudisc != IP_PMTUDISC_DO)
1216                 skb->local_df = 1;
1217
1218         /* DF bit is set when we want to see DF on outgoing frames.
1219          * If local_df is set too, we still allow to fragment this frame
1220          * locally. */
1221         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1222             (skb->len <= dst_mtu(&rt->u.dst) &&
1223              ip_dont_fragment(sk, &rt->u.dst)))
1224                 df = htons(IP_DF);
1225
1226         if (inet->cork.flags & IPCORK_OPT)
1227                 opt = inet->cork.opt;
1228
1229         if (rt->rt_type == RTN_MULTICAST)
1230                 ttl = inet->mc_ttl;
1231         else
1232                 ttl = ip_select_ttl(inet, &rt->u.dst);
1233
1234         iph = (struct iphdr *)skb->data;
1235         iph->version = 4;
1236         iph->ihl = 5;
1237         if (opt) {
1238                 iph->ihl += opt->optlen>>2;
1239                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1240         }
1241         iph->tos = inet->tos;
1242         iph->tot_len = htons(skb->len);
1243         iph->frag_off = df;
1244         if (!df) {
1245                 __ip_select_ident(iph, &rt->u.dst, 0);
1246         } else {
1247                 iph->id = htons(inet->id++);
1248         }
1249         iph->ttl = ttl;
1250         iph->protocol = sk->sk_protocol;
1251         iph->saddr = rt->rt_src;
1252         iph->daddr = rt->rt_dst;
1253         ip_send_check(iph);
1254
1255         skb->priority = sk->sk_priority;
1256         skb->dst = dst_clone(&rt->u.dst);
1257
1258         /* Netfilter gets whole the not fragmented skb. */
1259         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1260                       skb->dst->dev, dst_output);
1261         if (err) {
1262                 if (err > 0)
1263                         err = inet->recverr ? net_xmit_errno(err) : 0;
1264                 if (err)
1265                         goto error;
1266         }
1267
1268 out:
1269         inet->cork.flags &= ~IPCORK_OPT;
1270         kfree(inet->cork.opt);
1271         inet->cork.opt = NULL;
1272         if (inet->cork.rt) {
1273                 ip_rt_put(inet->cork.rt);
1274                 inet->cork.rt = NULL;
1275         }
1276         return err;
1277
1278 error:
1279         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1280         goto out;
1281 }
1282
1283 /*
1284  *      Throw away all pending data on the socket.
1285  */
1286 void ip_flush_pending_frames(struct sock *sk)
1287 {
1288         struct inet_sock *inet = inet_sk(sk);
1289         struct sk_buff *skb;
1290
1291         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1292                 kfree_skb(skb);
1293
1294         inet->cork.flags &= ~IPCORK_OPT;
1295         kfree(inet->cork.opt);
1296         inet->cork.opt = NULL;
1297         if (inet->cork.rt) {
1298                 ip_rt_put(inet->cork.rt);
1299                 inet->cork.rt = NULL;
1300         }
1301 }
1302
1303
1304 /*
1305  *      Fetch data from kernel space and fill in checksum if needed.
1306  */
1307 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1308                               int len, int odd, struct sk_buff *skb)
1309 {
1310         unsigned int csum;
1311
1312         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1313         skb->csum = csum_block_add(skb->csum, csum, odd);
1314         return 0;
1315 }
1316
1317 /*
1318  *      Generic function to send a packet as reply to another packet.
1319  *      Used to send TCP resets so far. ICMP should use this function too.
1320  *
1321  *      Should run single threaded per socket because it uses the sock
1322  *      structure to pass arguments.
1323  *
1324  *      LATER: switch from ip_build_xmit to ip_append_*
1325  */
1326 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1327                    unsigned int len)
1328 {
1329         struct inet_sock *inet = inet_sk(sk);
1330         struct {
1331                 struct ip_options       opt;
1332                 char                    data[40];
1333         } replyopts;
1334         struct ipcm_cookie ipc;
1335         u32 daddr;
1336         struct rtable *rt = (struct rtable*)skb->dst;
1337
1338         if (ip_options_echo(&replyopts.opt, skb))
1339                 return;
1340
1341         daddr = ipc.addr = rt->rt_src;
1342         ipc.opt = NULL;
1343
1344         if (replyopts.opt.optlen) {
1345                 ipc.opt = &replyopts.opt;
1346
1347                 if (ipc.opt->srr)
1348                         daddr = replyopts.opt.faddr;
1349         }
1350
1351         {
1352                 struct flowi fl = { .nl_u = { .ip4_u =
1353                                               { .daddr = daddr,
1354                                                 .saddr = rt->rt_spec_dst,
1355                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1356                                     /* Not quite clean, but right. */
1357                                     .uli_u = { .ports =
1358                                                { .sport = skb->h.th->dest,
1359                                                  .dport = skb->h.th->source } },
1360                                     .proto = sk->sk_protocol };
1361                 if (ip_route_output_key(&rt, &fl))
1362                         return;
1363         }
1364
1365         /* And let IP do all the hard work.
1366
1367            This chunk is not reenterable, hence spinlock.
1368            Note that it uses the fact, that this function is called
1369            with locally disabled BH and that sk cannot be already spinlocked.
1370          */
1371         bh_lock_sock(sk);
1372         inet->tos = skb->nh.iph->tos;
1373         sk->sk_priority = skb->priority;
1374         sk->sk_protocol = skb->nh.iph->protocol;
1375         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1376                        &ipc, rt, MSG_DONTWAIT);
1377         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1378                 if (arg->csumoffset >= 0)
1379                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1380                 skb->ip_summed = CHECKSUM_NONE;
1381                 ip_push_pending_frames(sk);
1382         }
1383
1384         bh_unlock_sock(sk);
1385
1386         ip_rt_put(rt);
1387 }
1388
1389 void __init ip_init(void)
1390 {
1391         ip_rt_init();
1392         inet_initpeers();
1393
1394 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1395         igmp_mc_proc_init();
1396 #endif
1397 }
1398
1399 EXPORT_SYMBOL(ip_fragment);
1400 EXPORT_SYMBOL(ip_generic_getfrag);
1401 EXPORT_SYMBOL(ip_queue_xmit);
1402 EXPORT_SYMBOL(ip_send_check);