net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
   38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/xfrm.h>
  73 #include <linux/skbuff.h>
  74 #include <net/sock.h>
  75 #include <net/arp.h>
  76 #include <net/icmp.h>
  77 #include <net/checksum.h>
  78 #include <net/inetpeer.h>
  79 #include <net/checksum.h>
  80 #include <linux/igmp.h>
  81 #include <linux/netfilter_ipv4.h>
  82 #include <linux/netfilter_bridge.h>
  83 #include <linux/mroute.h>
  84 #include <linux/netlink.h>
  85 #include <linux/tcp.h>
  86
   87 int sysctl_ip_default_ttl = IPDEFTTL;	/* default TTL, initialised to IPDEFTTL; presumably tunable via sysctl - confirm */
   88
   89 static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*));	/* forward declaration: used by ip_finish_output() before its definition */
  90
  91 /* Generate a checksum for an outgoing IP datagram. */
  92 __inline__ void ip_send_check(struct iphdr *iph)
  93 {
  94         iph->check = 0;
  95         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  96 }
  97
  98 /* dev_loopback_xmit for use with netfilter. */
  99 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 100 {
 101         newskb->mac.raw = newskb->data;
 102         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 103         newskb->pkt_type = PACKET_LOOPBACK;
 104         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 105         BUG_TRAP(newskb->dst);
 106         netif_rx(newskb);
 107         return 0;
 108 }
 109
 110 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 111 {
 112         int ttl = inet->uc_ttl;
 113
 114         if (ttl < 0)
 115                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 116         return ttl;
 117 }
 118
 119 /*
 120  *              Add an ip header to a skbuff and send it out.
 121  *
 122  */
 123 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 124                           u32 saddr, u32 daddr, struct ip_options *opt)
 125 {
 126         struct inet_sock *inet = inet_sk(sk);
 127         struct rtable *rt = (struct rtable *)skb->dst;
 128         struct iphdr *iph;
 129
 130         /* Build the IP header. */
 131         if (opt)
 132                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 133         else
 134                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 135
 136         iph->version  = 4;
 137         iph->ihl      = 5;
 138         iph->tos      = inet->tos;
 139         if (ip_dont_fragment(sk, &rt->u.dst))
 140                 iph->frag_off = htons(IP_DF);
 141         else
 142                 iph->frag_off = 0;
 143         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 144         iph->daddr    = rt->rt_dst;
 145         iph->saddr    = rt->rt_src;
 146         iph->protocol = sk->sk_protocol;
 147         iph->tot_len  = htons(skb->len);
 148         ip_select_ident(iph, &rt->u.dst, sk);
 149         skb->nh.iph   = iph;
 150
 151         if (opt && opt->optlen) {
 152                 iph->ihl += opt->optlen>>2;
 153                 ip_options_build(skb, opt, daddr, rt, 0);
 154         }
 155         ip_send_check(iph);
 156
 157         skb->priority = sk->sk_priority;
 158
 159         /* Send it out. */
 160         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 161                        dst_output);
 162 }
 163
 164 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 165
 166 static inline int ip_finish_output2(struct sk_buff *skb)
 167 {
 168         struct dst_entry *dst = skb->dst;
 169         struct hh_cache *hh = dst->hh;
 170         struct net_device *dev = dst->dev;
 171         int hh_len = LL_RESERVED_SPACE(dev);
 172
 173         /* Be paranoid, rather than too clever. */
 174         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 175                 struct sk_buff *skb2;
 176
 177                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 178                 if (skb2 == NULL) {
 179                         kfree_skb(skb);
 180                         return -ENOMEM;
 181                 }
 182                 if (skb->sk)
 183                         skb_set_owner_w(skb2, skb->sk);
 184                 kfree_skb(skb);
 185                 skb = skb2;
 186         }
 187
 188         if (hh) {
 189                 int hh_alen;
 190
 191                 read_lock_bh(&hh->hh_lock);
 192                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 193                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 194                 read_unlock_bh(&hh->hh_lock);
 195                 skb_push(skb, hh->hh_len);
 196                 return hh->hh_output(skb);
 197         } else if (dst->neighbour)
 198                 return dst->neighbour->output(skb);
 199
 200         if (net_ratelimit())
 201                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 202         kfree_skb(skb);
 203         return -EINVAL;
 204 }
 205
 206 static inline int ip_finish_output(struct sk_buff *skb)
 207 {
 208 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 209         /* Policy lookup after SNAT yielded a new policy */
 210         if (skb->dst->xfrm != NULL)
 211                 return xfrm4_output_finish(skb);
 212 #endif
 213         if (skb->len > dst_mtu(skb->dst) &&
 214             !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
 215                 return ip_fragment(skb, ip_finish_output2);
 216         else
 217                 return ip_finish_output2(skb);
 218 }
 219
 220 int ip_mc_output(struct sk_buff *skb)
 221 {
 222         struct sock *sk = skb->sk;
 223         struct rtable *rt = (struct rtable*)skb->dst;
 224         struct net_device *dev = rt->u.dst.dev;
 225
 226         /*
 227          *      If the indicated interface is up and running, send the packet.
 228          */
 229         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 230
 231         skb->dev = dev;
 232         skb->protocol = htons(ETH_P_IP);
 233
 234         /*
 235          *      Multicasts are looped back for other local users
 236          */
 237
 238         if (rt->rt_flags&RTCF_MULTICAST) {
 239                 if ((!sk || inet_sk(sk)->mc_loop)
 240 #ifdef CONFIG_IP_MROUTE
 241                 /* Small optimization: do not loopback not local frames,
 242                    which returned after forwarding; they will be  dropped
 243                    by ip_mr_input in any case.
 244                    Note, that local frames are looped back to be delivered
 245                    to local recipients.
 246
 247                    This check is duplicated in ip_mr_input at the moment.
 248                  */
 249                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 250 #endif
 251                 ) {
 252                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 253                         if (newskb)
 254                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 255                                         newskb->dev,
 256                                         ip_dev_loopback_xmit);
 257                 }
 258
 259                 /* Multicasts with ttl 0 must not go beyond the host */
 260
 261                 if (skb->nh.iph->ttl == 0) {
 262                         kfree_skb(skb);
 263                         return 0;
 264                 }
 265         }
 266
 267         if (rt->rt_flags&RTCF_BROADCAST) {
 268                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 269                 if (newskb)
 270                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 271                                 newskb->dev, ip_dev_loopback_xmit);
 272         }
 273
 274         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 275                        ip_finish_output);
 276 }
 277
 278 int ip_output(struct sk_buff *skb)
 279 {
 280         struct net_device *dev = skb->dst->dev;
 281
 282         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 283
 284         skb->dev = dev;
 285         skb->protocol = htons(ETH_P_IP);
 286
 287         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 288                        ip_finish_output);
 289 }
 290
 291 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 292 {
 293         struct sock *sk = skb->sk;
 294         struct inet_sock *inet = inet_sk(sk);
 295         struct ip_options *opt = inet->opt;
 296         struct rtable *rt;
 297         struct iphdr *iph;
 298
 299         /* Skip all of this if the packet is already routed,
 300          * f.e. by something like SCTP.
 301          */
 302         rt = (struct rtable *) skb->dst;
 303         if (rt != NULL)
 304                 goto packet_routed;
 305
 306         /* Make sure we can route this packet. */
 307         rt = (struct rtable *)__sk_dst_check(sk, 0);
 308         if (rt == NULL) {
 309                 u32 daddr;
 310
 311                 /* Use correct destination address if we have options. */
 312                 daddr = inet->daddr;
 313                 if(opt && opt->srr)
 314                         daddr = opt->faddr;
 315
 316                 {
 317                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 318                                             .nl_u = { .ip4_u =
 319                                                       { .daddr = daddr,
 320                                                         .saddr = inet->saddr,
 321                                                         .tos = RT_CONN_FLAGS(sk) } },
 322                                             .proto = sk->sk_protocol,
 323                                             .uli_u = { .ports =
 324                                                        { .sport = inet->sport,
 325                                                          .dport = inet->dport } } };
 326
 327                         /* If this fails, retransmit mechanism of transport layer will
 328                          * keep trying until route appears or the connection times
 329                          * itself out.
 330                          */
 331                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 332                                 goto no_route;
 333                 }
 334                 sk_setup_caps(sk, &rt->u.dst);
 335         }
 336         skb->dst = dst_clone(&rt->u.dst);
 337
 338 packet_routed:
 339         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 340                 goto no_route;
 341
 342         /* OK, we know where to send it, allocate and build IP header. */
 343         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 344         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 345         iph->tot_len = htons(skb->len);
 346         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 347                 iph->frag_off = htons(IP_DF);
 348         else
 349                 iph->frag_off = 0;
 350         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 351         iph->protocol = sk->sk_protocol;
 352         iph->saddr    = rt->rt_src;
 353         iph->daddr    = rt->rt_dst;
 354         skb->nh.iph   = iph;
 355         /* Transport layer set skb->h.foo itself. */
 356
 357         if (opt && opt->optlen) {
 358                 iph->ihl += opt->optlen >> 2;
 359                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 360         }
 361
 362         ip_select_ident_more(iph, &rt->u.dst, sk,
 363                              (skb_shinfo(skb)->tso_segs ?: 1) - 1);
 364
 365         /* Add an IP checksum. */
 366         ip_send_check(iph);
 367
 368         skb->priority = sk->sk_priority;
 369
 370         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 371                        dst_output);
 372
 373 no_route:
 374         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 375         kfree_skb(skb);
 376         return -EHOSTUNREACH;
 377 }
 378
 379
 380 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 381 {
 382         to->pkt_type = from->pkt_type;
 383         to->priority = from->priority;
 384         to->protocol = from->protocol;
 385         dst_release(to->dst);
 386         to->dst = dst_clone(from->dst);
 387         to->dev = from->dev;
 388
 389         /* Copy the flags to each fragment. */
 390         IPCB(to)->flags = IPCB(from)->flags;
 391
 392 #ifdef CONFIG_NET_SCHED
 393         to->tc_index = from->tc_index;
 394 #endif
 395 #ifdef CONFIG_NETFILTER
 396         to->nfmark = from->nfmark;
 397         /* Connection association is same as pre-frag packet */
 398         nf_conntrack_put(to->nfct);
 399         to->nfct = from->nfct;
 400         nf_conntrack_get(to->nfct);
 401         to->nfctinfo = from->nfctinfo;
 402 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 403         to->ipvs_property = from->ipvs_property;
 404 #endif
 405 #ifdef CONFIG_BRIDGE_NETFILTER
 406         nf_bridge_put(to->nf_bridge);
 407         to->nf_bridge = from->nf_bridge;
 408         nf_bridge_get(to->nf_bridge);
 409 #endif
 410 #endif
 411 }
 412
 413 /*
 414  *      This IP datagram is too large to be sent in one piece.  Break it up into
 415  *      smaller pieces (each of size equal to IP header plus
 416  *      a block of the data of the original IP data part) that will yet fit in a
 417  *      single device frame, and queue such a frame for sending.
 418  */
 419
 420 static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 421 {
 422         struct iphdr *iph;
 423         int raw = 0;
 424         int ptr;
 425         struct net_device *dev;
 426         struct sk_buff *skb2;
 427         unsigned int mtu, hlen, left, len, ll_rs;
 428         int offset;
 429         __be16 not_last_frag;
 430         struct rtable *rt = (struct rtable*)skb->dst;
 431         int err = 0;
 432
 433         dev = rt->u.dst.dev;
 434
 435         /*
 436          *      Point into the IP datagram header.
 437          */
 438
 439         iph = skb->nh.iph;
 440
 441         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 442                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 443                           htonl(dst_mtu(&rt->u.dst)));
 444                 kfree_skb(skb);
 445                 return -EMSGSIZE;
 446         }
 447
 448         /*
 449          *      Setup starting values.
 450          */
 451
 452         hlen = iph->ihl * 4;
 453         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 454         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 455
 456         /* When frag_list is given, use it. First, check its validity:
 457          * some transformers could create wrong frag_list or break existing
 458          * one, it is not prohibited. In this case fall back to copying.
 459          *
 460          * LATER: this step can be merged to real generation of fragments,
 461          * we can switch to copy when see the first bad fragment.
 462          */
 463         if (skb_shinfo(skb)->frag_list) {
 464                 struct sk_buff *frag;
 465                 int first_len = skb_pagelen(skb);
 466
 467                 if (first_len - hlen > mtu ||
 468                     ((first_len - hlen) & 7) ||
 469                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 470                     skb_cloned(skb))
 471                         goto slow_path;
 472
 473                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 474                         /* Correct geometry. */
 475                         if (frag->len > mtu ||
 476                             ((frag->len & 7) && frag->next) ||
 477                             skb_headroom(frag) < hlen)
 478                             goto slow_path;
 479
 480                         /* Partially cloned skb? */
 481                         if (skb_shared(frag))
 482                                 goto slow_path;
 483
 484                         BUG_ON(frag->sk);
 485                         if (skb->sk) {
 486                                 sock_hold(skb->sk);
 487                                 frag->sk = skb->sk;
 488                                 frag->destructor = sock_wfree;
 489                                 skb->truesize -= frag->truesize;
 490                         }
 491                 }
 492
 493                 /* Everything is OK. Generate! */
 494
 495                 err = 0;
 496                 offset = 0;
 497                 frag = skb_shinfo(skb)->frag_list;
 498                 skb_shinfo(skb)->frag_list = NULL;
 499                 skb->data_len = first_len - skb_headlen(skb);
 500                 skb->len = first_len;
 501                 iph->tot_len = htons(first_len);
 502                 iph->frag_off = htons(IP_MF);
 503                 ip_send_check(iph);
 504
 505                 for (;;) {
 506                         /* Prepare header of the next frame,
 507                          * before previous one went down. */
 508                         if (frag) {
 509                                 frag->ip_summed = CHECKSUM_NONE;
 510                                 frag->h.raw = frag->data;
 511                                 frag->nh.raw = __skb_push(frag, hlen);
 512                                 memcpy(frag->nh.raw, iph, hlen);
 513                                 iph = frag->nh.iph;
 514                                 iph->tot_len = htons(frag->len);
 515                                 ip_copy_metadata(frag, skb);
 516                                 if (offset == 0)
 517                                         ip_options_fragment(frag);
 518                                 offset += skb->len - hlen;
 519                                 iph->frag_off = htons(offset>>3);
 520                                 if (frag->next != NULL)
 521                                         iph->frag_off |= htons(IP_MF);
 522                                 /* Ready, complete checksum */
 523                                 ip_send_check(iph);
 524                         }
 525
 526                         err = output(skb);
 527
 528                         if (err || !frag)
 529                                 break;
 530
 531                         skb = frag;
 532                         frag = skb->next;
 533                         skb->next = NULL;
 534                 }
 535
 536                 if (err == 0) {
 537                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 538                         return 0;
 539                 }
 540
 541                 while (frag) {
 542                         skb = frag->next;
 543                         kfree_skb(frag);
 544                         frag = skb;
 545                 }
 546                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 547                 return err;
 548         }
 549
 550 slow_path:
 551         left = skb->len - hlen;         /* Space per frame */
 552         ptr = raw + hlen;               /* Where to start from */
 553
 554 #ifdef CONFIG_BRIDGE_NETFILTER
 555         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 556          * we need to make room for the encapsulating header */
 557         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 558         mtu -= nf_bridge_pad(skb);
 559 #else
 560         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 561 #endif
 562         /*
 563          *      Fragment the datagram.
 564          */
 565
 566         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 567         not_last_frag = iph->frag_off & htons(IP_MF);
 568
 569         /*
 570          *      Keep copying data until we run out.
 571          */
 572
 573         while(left > 0) {
 574                 len = left;
 575                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 576                 if (len > mtu)
 577                         len = mtu;
 578                 /* IF: we are not sending upto and including the packet end
 579                    then align the next start on an eight byte boundary */
 580                 if (len < left) {
 581                         len &= ~7;
 582                 }
 583                 /*
 584                  *      Allocate buffer.
 585                  */
 586
 587                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 588                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 589                         err = -ENOMEM;
 590                         goto fail;
 591                 }
 592
 593                 /*
 594                  *      Set up data on packet
 595                  */
 596
 597                 ip_copy_metadata(skb2, skb);
 598                 skb_reserve(skb2, ll_rs);
 599                 skb_put(skb2, len + hlen);
 600                 skb2->nh.raw = skb2->data;
 601                 skb2->h.raw = skb2->data + hlen;
 602
 603                 /*
 604                  *      Charge the memory for the fragment to any owner
 605                  *      it might possess
 606                  */
 607
 608                 if (skb->sk)
 609                         skb_set_owner_w(skb2, skb->sk);
 610
 611                 /*
 612                  *      Copy the packet header into the new buffer.
 613                  */
 614
 615                 memcpy(skb2->nh.raw, skb->data, hlen);
 616
 617                 /*
 618                  *      Copy a block of the IP datagram.
 619                  */
 620                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 621                         BUG();
 622                 left -= len;
 623
 624                 /*
 625                  *      Fill in the new header fields.
 626                  */
 627                 iph = skb2->nh.iph;
 628                 iph->frag_off = htons((offset >> 3));
 629
 630                 /* ANK: dirty, but effective trick. Upgrade options only if
 631                  * the segment to be fragmented was THE FIRST (otherwise,
 632                  * options are already fixed) and make it ONCE
 633                  * on the initial skb, so that all the following fragments
 634                  * will inherit fixed options.
 635                  */
 636                 if (offset == 0)
 637                         ip_options_fragment(skb);
 638
 639                 /*
 640                  *      Added AC : If we are fragmenting a fragment that's not the
 641                  *                 last fragment then keep MF on each bit
 642                  */
 643                 if (left > 0 || not_last_frag)
 644                         iph->frag_off |= htons(IP_MF);
 645                 ptr += len;
 646                 offset += len;
 647
 648                 /*
 649                  *      Put this fragment into the sending queue.
 650                  */
 651
 652                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 653
 654                 iph->tot_len = htons(len + hlen);
 655
 656                 ip_send_check(iph);
 657
 658                 err = output(skb2);
 659                 if (err)
 660                         goto fail;
 661         }
 662         kfree_skb(skb);
 663         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 664         return err;
 665
 666 fail:
 667         kfree_skb(skb);
 668         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 669         return err;
 670 }
 671
 672 int
 673 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 674 {
 675         struct iovec *iov = from;
 676
 677         if (skb->ip_summed == CHECKSUM_HW) {
 678                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 679                         return -EFAULT;
 680         } else {
 681                 unsigned int csum = 0;
 682                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 683                         return -EFAULT;
 684                 skb->csum = csum_block_add(skb->csum, csum, odd);
 685         }
 686         return 0;
 687 }
 688
 689 static inline unsigned int
 690 csum_page(struct page *page, int offset, int copy)
 691 {
 692         char *kaddr;
 693         unsigned int csum;
 694         kaddr = kmap(page);
 695         csum = csum_partial(kaddr + offset, copy, 0);
 696         kunmap(page);
 697         return csum;
 698 }
 699
 700 static inline int ip_ufo_append_data(struct sock *sk,
 701                         int getfrag(void *from, char *to, int offset, int len,
 702                                int odd, struct sk_buff *skb),
 703                         void *from, int length, int hh_len, int fragheaderlen,
 704                         int transhdrlen, int mtu,unsigned int flags)
 705 {
 706         struct sk_buff *skb;
 707         int err;
 708
 709         /* There is support for UDP fragmentation offload by network
 710          * device, so create one single skb packet containing complete
 711          * udp datagram
 712          */
 713         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 714                 skb = sock_alloc_send_skb(sk,
 715                         hh_len + fragheaderlen + transhdrlen + 20,
 716                         (flags & MSG_DONTWAIT), &err);
 717
 718                 if (skb == NULL)
 719                         return err;
 720
 721                 /* reserve space for Hardware header */
 722                 skb_reserve(skb, hh_len);
 723
 724                 /* create space for UDP/IP header */
 725                 skb_put(skb,fragheaderlen + transhdrlen);
 726
 727                 /* initialize network header pointer */
 728                 skb->nh.raw = skb->data;
 729
 730                 /* initialize protocol header pointer */
 731                 skb->h.raw = skb->data + fragheaderlen;
 732
 733                 skb->ip_summed = CHECKSUM_HW;
 734                 skb->csum = 0;
 735                 sk->sk_sndmsg_off = 0;
 736         }
 737
 738         err = skb_append_datato_frags(sk,skb, getfrag, from,
 739                                (length - transhdrlen));
 740         if (!err) {
 741                 /* specify the length of each IP datagram fragment*/
 742                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
 743                 __skb_queue_tail(&sk->sk_write_queue, skb);
 744
 745                 return 0;
 746         }
 747         /* There is not enough support do UFO ,
 748          * so follow normal path
 749          */
 750         kfree_skb(skb);
 751         return err;
 752 }
 753
 754 /*
 755  *      ip_append_data() and ip_append_page() can make one large IP datagram
  756  *      from many pieces of data. Each piece will be held on the socket
 757  *      until ip_push_pending_frames() is called. Each piece can be a page
 758  *      or non-page data.
 759  *
 760  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 761  *      this interface potentially.
 762  *
 763  *      LATER: length must be adjusted by pad at tail, when it is required.
 764  */
 765 int ip_append_data(struct sock *sk,
 766                    int getfrag(void *from, char *to, int offset, int len,
 767                                int odd, struct sk_buff *skb),
 768                    void *from, int length, int transhdrlen,
 769                    struct ipcm_cookie *ipc, struct rtable *rt,
 770                    unsigned int flags)
 771 {
             /*
              * Summary of the contract, as implemented below:
              *
              * sk          - socket whose sk_write_queue collects the pending skbs.
              * getfrag     - callback that fills 'to' with 'len' bytes of payload;
              *               a negative return value is reported as -EFAULT.
              * from        - opaque cookie handed back to getfrag().
              * length      - payload bytes to queue with this call.
              * transhdrlen - transport header bytes reserved at the start of the
              *               first skb; forced to 0 when data is already queued.
              * ipc, rt     - IP options/address and route; only consulted when the
              *               write queue is empty, otherwise the values saved in
              *               inet->cork are used.
              * flags       - MSG_PROBE returns 0 at once; MSG_MORE and MSG_DONTWAIT
              *               influence skb allocation.
              *
              * Returns 0 on success, otherwise a negative errno (-ENOBUFS,
              * -EMSGSIZE, -EFAULT, -ENOMEM, or whatever sock_alloc_send_skb() or
              * ip_ufo_append_data() reported).  On the 'error' path the bytes
              * that were not queued are subtracted from inet->cork.length again
              * and IPSTATS_MIB_OUTDISCARDS is incremented.
              */
 772         struct inet_sock *inet = inet_sk(sk);
 773         struct sk_buff *skb;
 774
 775         struct ip_options *opt = NULL;
 776         int hh_len;
 777         int exthdrlen;
 778         int mtu;
 779         int copy;
 780         int err;
 781         int offset = 0;
 782         unsigned int maxfraglen, fragheaderlen;
 783         int csummode = CHECKSUM_NONE;
 784
 785         if (flags&MSG_PROBE)
 786                 return 0;
 787
 788         if (skb_queue_empty(&sk->sk_write_queue)) {
 789                 /*
 790                  * First piece of a datagram: set up the cork state
                      * (options, route, fragment size) that later calls and
                      * ip_push_pending_frames() will reuse.
 791                  */
 792                 opt = ipc->opt;
 793                 if (opt) {
 794                         if (inet->cork.opt == NULL) {
                                     /* 40 extra bytes: room for the option data copied after the struct */
 795                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 796                                 if (unlikely(inet->cork.opt == NULL))
 797                                         return -ENOBUFS;
 798                         }
 799                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 800                         inet->cork.flags |= IPCORK_OPT;
 801                         inet->cork.addr = ipc->addr;
 802                 }
                     /* The cork keeps its own reference on the route. */
 803                 dst_hold(&rt->u.dst);
 804                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 805                 inet->cork.rt = rt;
 806                 inet->cork.length = 0;
 807                 sk->sk_sndmsg_page = NULL;
 808                 sk->sk_sndmsg_off = 0;
 809                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 810                         length += exthdrlen;
 811                         transhdrlen += exthdrlen;
 812                 }
 813         } else {
                     /* Data already queued: continue with the saved cork state. */
 814                 rt = inet->cork.rt;
 815                 if (inet->cork.flags & IPCORK_OPT)
 816                         opt = inet->cork.opt;
 817
 818                 transhdrlen = 0;
 819                 exthdrlen = 0;
 820                 mtu = inet->cork.fragsize;
 821         }
 822         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 823
             /*
              * fragheaderlen: IP header plus options.
              * maxfraglen: largest fragment whose payload length is a
              * multiple of 8 bytes.
              */
 824         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 825         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 826
             /* Header plus all queued payload must stay within 0xFFFF bytes. */
 827         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 828                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 829                 return -EMSGSIZE;
 830         }
 831
 832         /*
 833          * transhdrlen > 0 means that this is the first fragment and we wish
 834          * it won't be fragmented in the future.
 835          */
 836         if (transhdrlen &&
 837             length + fragheaderlen <= mtu &&
 838             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 839             !exthdrlen)
 840                 csummode = CHECKSUM_HW;
 841
 842         inet->cork.length += length;
 843         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 844                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 845
                     /*
                      * Keep the helper's return value: the error path returns
                      * 'err', which used to be left uninitialized when this
                      * call failed.
                      */
 846                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 847                                fragheaderlen, transhdrlen, mtu, flags);
                     if (err)
 848                         goto error;
 849
 850                 return 0;
 851         }
 852
 853         /* So, what's going on in the loop below?
 854          *
 855          * We use calculated fragment length to generate chained skb,
 856          * each of segments is IP fragment ready for sending to network after
 857          * adding appropriate IP header.
 858          */
 859
 860         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 861                 goto alloc_new_skb;
 862
 863         while (length > 0) {
 864                 /* Check if the remaining data fits into current packet. */
 865                 copy = mtu - skb->len;
 866                 if (copy < length)
 867                         copy = maxfraglen - skb->len;
 868                 if (copy <= 0) {
 869                         char *data;
 870                         unsigned int datalen;
 871                         unsigned int fraglen;
 872                         unsigned int fraggap;
 873                         unsigned int alloclen;
 874                         struct sk_buff *skb_prev;
 875 alloc_new_skb:
 876                         skb_prev = skb;
                             /*
                              * fraggap: bytes by which the previous skb exceeds
                              * maxfraglen; they are moved into the new skb below.
                              */
 877                         if (skb_prev)
 878                                 fraggap = skb_prev->len - maxfraglen;
 879                         else
 880                                 fraggap = 0;
 881
 882                         /*
 883                          * If remaining data exceeds the mtu,
 884                          * we know we need more fragment(s).
 885                          */
 886                         datalen = length + fraggap;
 887                         if (datalen > mtu - fragheaderlen)
 888                                 datalen = maxfraglen - fragheaderlen;
 889                         fraglen = datalen + fragheaderlen;
 890
 891                         if ((flags & MSG_MORE) &&
 892                             !(rt->u.dst.dev->features&NETIF_F_SG))
 893                                 alloclen = mtu;
 894                         else
 895                                 alloclen = datalen + fragheaderlen;
 896
 897                         /* The last fragment gets additional space at tail.
 898                          * Note, with MSG_MORE we overallocate on fragments,
 899                          * because we have no idea what fragment will be
 900                          * the last.
 901                          */
 902                         if (datalen == length)
 903                                 alloclen += rt->u.dst.trailer_len;
 904
                             /*
                              * The skb carrying the transport header goes through
                              * sock_alloc_send_skb(); later ones are allocated with
                              * sock_wmalloc() while sk_wmem_alloc does not exceed
                              * twice sk_sndbuf.
                              */
 905                         if (transhdrlen) {
 906                                 skb = sock_alloc_send_skb(sk,
 907                                                 alloclen + hh_len + 15,
 908                                                 (flags & MSG_DONTWAIT), &err);
 909                         } else {
 910                                 skb = NULL;
 911                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 912                                     2 * sk->sk_sndbuf)
 913                                         skb = sock_wmalloc(sk,
 914                                                            alloclen + hh_len + 15, 1,
 915                                                            sk->sk_allocation);
 916                                 if (unlikely(skb == NULL))
 917                                         err = -ENOBUFS;
 918                         }
 919                         if (skb == NULL)
 920                                 goto error;
 921
 922                         /*
 923                          *      Fill in the control structures
 924                          */
 925                         skb->ip_summed = csummode;
 926                         skb->csum = 0;
 927                         skb_reserve(skb, hh_len);
 928
 929                         /*
 930                          *      Find where to start putting bytes.
 931                          */
 932                         data = skb_put(skb, fraglen);
 933                         skb->nh.raw = data + exthdrlen;
 934                         data += fragheaderlen;
 935                         skb->h.raw = data + exthdrlen;
 936
                             /*
                              * Move the overhanging tail of the previous skb into
                              * this one, fix up both checksums and trim the
                              * previous skb back to maxfraglen.
                              */
 937                         if (fraggap) {
 938                                 skb->csum = skb_copy_and_csum_bits(
 939                                         skb_prev, maxfraglen,
 940                                         data + transhdrlen, fraggap, 0);
 941                                 skb_prev->csum = csum_sub(skb_prev->csum,
 942                                                           skb->csum);
 943                                 data += fraggap;
 944                                 skb_trim(skb_prev, maxfraglen);
 945                         }
 946
 947                         copy = datalen - transhdrlen - fraggap;
 948                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 949                                 err = -EFAULT;
 950                                 kfree_skb(skb);
 951                                 goto error;
 952                         }
 953
 954                         offset += copy;
 955                         length -= datalen - fraggap;
                             /* Only the first skb carries the transport/ext headers. */
 956                         transhdrlen = 0;
 957                         exthdrlen = 0;
 958                         csummode = CHECKSUM_NONE;
 959
 960                         /*
 961                          * Put the packet on the pending queue.
 962                          */
 963                         __skb_queue_tail(&sk->sk_write_queue, skb);
 964                         continue;
 965                 }
 966
 967                 if (copy > length)
 968                         copy = length;
 969
 970                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 971                         unsigned int off;
 972
                             /* No scatter-gather: copy into the skb's linear area. */
 973                         off = skb->len;
 974                         if (getfrag(from, skb_put(skb, copy),
 975                                         offset, copy, off, skb) < 0) {
 976                                 __skb_trim(skb, off);
 977                                 err = -EFAULT;
 978                                 goto error;
 979                         }
 980                 } else {
                             /*
                              * Scatter-gather: copy into a page fragment, reusing
                              * the socket's current sndmsg page while it has room.
                              */
 981                         int i = skb_shinfo(skb)->nr_frags;
 982                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 983                         struct page *page = sk->sk_sndmsg_page;
 984                         int off = sk->sk_sndmsg_off;
 985                         unsigned int left;
 986
 987                         if (page && (left = PAGE_SIZE - off) > 0) {
 988                                 if (copy >= left)
 989                                         copy = left;
 990                                 if (page != frag->page) {
 991                                         if (i == MAX_SKB_FRAGS) {
 992                                                 err = -EMSGSIZE;
 993                                                 goto error;
 994                                         }
 995                                         get_page(page);
 996                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 997                                         frag = &skb_shinfo(skb)->frags[i];
 998                                 }
 999                         } else if (i < MAX_SKB_FRAGS) {
1000                                 if (copy > PAGE_SIZE)
1001                                         copy = PAGE_SIZE;
1002                                 page = alloc_pages(sk->sk_allocation, 0);
1003                                 if (page == NULL)  {
1004                                         err = -ENOMEM;
1005                                         goto error;
1006                                 }
1007                                 sk->sk_sndmsg_page = page;
1008                                 sk->sk_sndmsg_off = 0;
1009
1010                                 skb_fill_page_desc(skb, i, page, 0, 0);
1011                                 frag = &skb_shinfo(skb)->frags[i];
                                     /* Charge the new page to the skb and the socket. */
1012                                 skb->truesize += PAGE_SIZE;
1013                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1014                         } else {
1015                                 err = -EMSGSIZE;
1016                                 goto error;
1017                         }
1018                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1019                                 err = -EFAULT;
1020                                 goto error;
1021                         }
1022                         sk->sk_sndmsg_off += copy;
1023                         frag->size += copy;
1024                         skb->len += copy;
1025                         skb->data_len += copy;
1026                 }
1027                 offset += copy;
1028                 length -= copy;
1029         }
1030
1031         return 0;
1032
1033 error:
             /* 'length' is what was not queued; take it back out of the cork total. */
1034         inet->cork.length -= length;
1035         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1036         return err;
1037 }
1038
     /*
      *      Append 'size' bytes of 'page', starting at 'offset', to the datagram
      *      that is already pending on the socket's write queue.
      *
      *      The cork state (route, options, fragment size) saved in inet->cork
      *      is reused, so data must already be queued: an empty write queue
      *      yields -EINVAL.  The page is referenced (get_page) rather than
      *      copied, unless the data can be coalesced into the last fragment.
      *
      *      Returns 0 on success or when MSG_PROBE is set, -EPERM when
      *      inet->hdrincl is set, -EOPNOTSUPP when the output device lacks
      *      NETIF_F_SG, -EMSGSIZE when the datagram would exceed 0xFFFF bytes
      *      or the skb has no free fragment slot, and -ENOBUFS when a new skb
      *      cannot be allocated.  On the 'error' path the bytes that were not
      *      queued are subtracted from inet->cork.length again.
      */
1039 ssize_t ip_append_page(struct sock *sk, struct page *page,
1040                        int offset, size_t size, int flags)
1041 {
1042         struct inet_sock *inet = inet_sk(sk);
1043         struct sk_buff *skb;
1044         struct rtable *rt;
1045         struct ip_options *opt = NULL;
1046         int hh_len;
1047         int mtu;
1048         int len;
1049         int err;
1050         unsigned int maxfraglen, fragheaderlen, fraggap;
1051
1052         if (inet->hdrincl)
1053                 return -EPERM;
1054
1055         if (flags&MSG_PROBE)
1056                 return 0;
1057
             /* Pages can only extend a datagram that is already being built. */
1058         if (skb_queue_empty(&sk->sk_write_queue))
1059                 return -EINVAL;
1060
1061         rt = inet->cork.rt;
1062         if (inet->cork.flags & IPCORK_OPT)
1063                 opt = inet->cork.opt;
1064
1065         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1066                 return -EOPNOTSUPP;
1067
1068         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1069         mtu = inet->cork.fragsize;
1070
             /*
              * fragheaderlen: IP header plus options.
              * maxfraglen: largest fragment whose payload length is a
              * multiple of 8 bytes.
              */
1071         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1072         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1073
             /* Header plus all queued payload must stay within 0xFFFF bytes. */
1074         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1075                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1076                 return -EMSGSIZE;
1077         }
1078
1079         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1080                 return -EINVAL;
1081
1082         inet->cork.length += size;
             /* UDP on a UFO-capable device: record the per-fragment payload size. */
1083         if ((sk->sk_protocol == IPPROTO_UDP) &&
1084             (rt->u.dst.dev->features & NETIF_F_UFO))
1085                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
1086
1087
1088         while (size > 0) {
1089                 int i;
1090
                     /* With ufo_size set, everything is added to the current skb. */
1091                 if (skb_shinfo(skb)->ufo_size)
1092                         len = size;
1093                 else {
1094
1095                         /* Check if the remaining data fits into current packet. */
1096                         len = mtu - skb->len;
1097                         if (len < size)
1098                                 len = maxfraglen - skb->len;
1099                 }
1100                 if (len <= 0) {
                             /*
                              * Current skb is full: start a new one that holds only
                              * the IP header area plus any bytes by which the
                              * previous skb exceeds maxfraglen (fraggap).
                              */
1101                         struct sk_buff *skb_prev;
1102                         char *data;
1103                         struct iphdr *iph;
1104                         int alloclen;
1105
1106                         skb_prev = skb;
1107                         fraggap = skb_prev->len - maxfraglen;
1108
1109                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1110                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1111                         if (unlikely(!skb)) {
1112                                 err = -ENOBUFS;
1113                                 goto error;
1114                         }
1115
1116                         /*
1117                          *      Fill in the control structures
1118                          */
1119                         skb->ip_summed = CHECKSUM_NONE;
1120                         skb->csum = 0;
1121                         skb_reserve(skb, hh_len);
1122
1123                         /*
1124                          *      Find where to start putting bytes.
1125                          */
1126                         data = skb_put(skb, fragheaderlen + fraggap);
1127                         skb->nh.iph = iph = (struct iphdr *)data;
1128                         data += fragheaderlen;
1129                         skb->h.raw = data;
1130
                             /*
                              * Move the overhanging tail of the previous skb into
                              * this one, fix up both checksums and trim the
                              * previous skb back to maxfraglen.
                              */
1131                         if (fraggap) {
1132                                 skb->csum = skb_copy_and_csum_bits(
1133                                         skb_prev, maxfraglen,
1134                                         data, fraggap, 0);
1135                                 skb_prev->csum = csum_sub(skb_prev->csum,
1136                                                           skb->csum);
1137                                 skb_trim(skb_prev, maxfraglen);
1138                         }
1139
1140                         /*
1141                          * Put the packet on the pending queue.
1142                          */
1143                         __skb_queue_tail(&sk->sk_write_queue, skb);
1144                         continue;
1145                 }
1146
1147                 i = skb_shinfo(skb)->nr_frags;
1148                 if (len > size)
1149                         len = size;
                     /*
                      * Extend the last fragment when skb_can_coalesce() allows
                      * it; otherwise take a page reference and use a new
                      * fragment slot, failing if none is free.
                      */
1150                 if (skb_can_coalesce(skb, i, page, offset)) {
1151                         skb_shinfo(skb)->frags[i-1].size += len;
1152                 } else if (i < MAX_SKB_FRAGS) {
1153                         get_page(page);
1154                         skb_fill_page_desc(skb, i, page, offset, len);
1155                 } else {
1156                         err = -EMSGSIZE;
1157                         goto error;
1158                 }
1159
                     /* Software checksum: fold the page data in at offset skb->len. */
1160                 if (skb->ip_summed == CHECKSUM_NONE) {
1161                         unsigned int csum;
1162                         csum = csum_page(page, offset, len);
1163                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1164                 }
1165
1166                 skb->len += len;
1167                 skb->data_len += len;
1168                 offset += len;
1169                 size -= len;
1170         }
1171         return 0;
1172
1173 error:
             /* 'size' is what was not queued; take it back out of the cork total. */
1174         inet->cork.length -= size;
1175         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1176         return err;
1177 }
1178
1179 /*
1180  *      Combine all pending IP fragments on the socket into one IP datagram
1181  *      and push it out.
      *
      *      The first queued skb becomes the head; every further skb has its
      *      header area pulled off and is linked onto the head's frag_list.
      *      The IP header is then filled in from the socket and the corked
      *      route/options, and the result is passed through the
      *      NF_IP_LOCAL_OUT hook to dst_output().
      *
      *      The cork state (options and route reference) is released on every
      *      path, including when the queue was empty.
      *
      *      Returns 0 on success or when nothing was queued.  A negative error
      *      from the output path is returned as is; a positive one is mapped
      *      through net_xmit_errno() when inet->recverr is set and otherwise
      *      reported as 0.
1182  */
1183 int ip_push_pending_frames(struct sock *sk)
1184 {
1185         struct sk_buff *skb, *tmp_skb;
1186         struct sk_buff **tail_skb;
1187         struct inet_sock *inet = inet_sk(sk);
1188         struct ip_options *opt = NULL;
1189         struct rtable *rt = inet->cork.rt;
1190         struct iphdr *iph;
1191         __be16 df = 0;
1192         __u8 ttl;
1193         int err = 0;
1194
1195         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1196                 goto out;
1197         tail_skb = &(skb_shinfo(skb)->frag_list);
1198
1199         /* move skb->data to ip header from ext header */
1200         if (skb->data < skb->nh.raw)
1201                 __skb_pull(skb, skb->nh.raw - skb->data);
             /*
              * Chain the remaining skbs behind the head: strip their header
              * area, add their length and truesize to the head, and detach
              * them from the socket.
              */
1202         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1203                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1204                 *tail_skb = tmp_skb;
1205                 tail_skb = &(tmp_skb->next);
1206                 skb->len += tmp_skb->len;
1207                 skb->data_len += tmp_skb->len;
1208                 skb->truesize += tmp_skb->truesize;
1209                 __sock_put(tmp_skb->sk);
1210                 tmp_skb->destructor = NULL;
1211                 tmp_skb->sk = NULL;
1212         }
1213
1214         /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
1215          * allow the frame generated here to be fragmented. No matter how
1216          * transforms change the size of the packet, it will come out.
1217          */
1218         if (inet->pmtudisc != IP_PMTUDISC_DO)
1219                 skb->local_df = 1;
1220
1221         /* DF bit is set when we want to see DF on outgoing frames.
1222          * If local_df is set too, we still allow this frame to be
1223          * fragmented locally. */
1224         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1225             (skb->len <= dst_mtu(&rt->u.dst) &&
1226              ip_dont_fragment(sk, &rt->u.dst)))
1227                 df = htons(IP_DF);
1228
1229         if (inet->cork.flags & IPCORK_OPT)
1230                 opt = inet->cork.opt;
1231
1232         if (rt->rt_type == RTN_MULTICAST)
1233                 ttl = inet->mc_ttl;
1234         else
1235                 ttl = ip_select_ttl(inet, &rt->u.dst);
1236
             /* Build the IP header in place at the start of the head skb. */
1237         iph = (struct iphdr *)skb->data;
1238         iph->version = 4;
1239         iph->ihl = 5;
1240         if (opt) {
                     /* ihl counts 32-bit words, optlen counts bytes. */
1241                 iph->ihl += opt->optlen>>2;
1242                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1243         }
1244         iph->tos = inet->tos;
1245         iph->tot_len = htons(skb->len);
1246         iph->frag_off = df;
             /* Without DF the id comes from __ip_select_ident(), with DF from the socket counter. */
1247         if (!df) {
1248                 __ip_select_ident(iph, &rt->u.dst, 0);
1249         } else {
1250                 iph->id = htons(inet->id++);
1251         }
1252         iph->ttl = ttl;
1253         iph->protocol = sk->sk_protocol;
1254         iph->saddr = rt->rt_src;
1255         iph->daddr = rt->rt_dst;
1256         ip_send_check(iph);
1257
1258         skb->priority = sk->sk_priority;
1259         skb->dst = dst_clone(&rt->u.dst);
1260
1261         /* Netfilter gets the whole, not yet fragmented skb. */
1262         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1263                       skb->dst->dev, dst_output);
1264         if (err) {
                     /* Positive codes only become errors when IP_RECVERR is on. */
1265                 if (err > 0)
1266                         err = inet->recverr ? net_xmit_errno(err) : 0;
1267                 if (err)
1268                         goto error;
1269         }
1270
1271 out:
             /* Release the cork state: saved options and the route reference. */
1272         inet->cork.flags &= ~IPCORK_OPT;
1273         kfree(inet->cork.opt);
1274         inet->cork.opt = NULL;
1275         if (inet->cork.rt) {
1276                 ip_rt_put(inet->cork.rt);
1277                 inet->cork.rt = NULL;
1278         }
1279         return err;
1280
1281 error:
1282         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1283         goto out;
1284 }
1285
1286 /*
1287  *      Drop every skb still waiting on the socket's write queue and
      *      reset the cork state (saved IP options and held route).
1288  */
1289 void ip_flush_pending_frames(struct sock *sk)
1290 {
1291         struct inet_sock *inet = inet_sk(sk);
1292         struct sk_buff *pending;
1293
             /* Drain from the tail until nothing is left queued. */
1294         for (;;) {
                     pending = __skb_dequeue_tail(&sk->sk_write_queue);
                     if (pending == NULL)
                             break;
1295                 kfree_skb(pending);
             }
1296
             /* Forget the options saved for the corked datagram. */
1297         inet->cork.flags &= ~IPCORK_OPT;
1298         kfree(inet->cork.opt);
1299         inet->cork.opt = NULL;
             /* Release the route reference, if one is still held. */
1300         if (inet->cork.rt != NULL) {
1301                 ip_rt_put(inet->cork.rt);
1302                 inet->cork.rt = NULL;
1303         }
1304 }
1305
1306
1307 /*
1308  *      getfrag() callback used by ip_send_reply(): copy 'len' bytes from
      *      dptr + offset to 'to' and fold their checksum into skb->csum at
      *      position 'odd'.  Always returns 0.
1309  */
1310 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1311                               int len, int odd, struct sk_buff *skb)
1312 {
1313         void *src = dptr + offset;
1315         unsigned int partial = csum_partial_copy_nocheck(src, to, len, 0);
1314
1316         skb->csum = csum_block_add(skb->csum, partial, odd);
1317         return 0;
1318 }
1319
1320 /*
1321  *      Build and transmit an IP packet in reply to 'skb'.
1322  *
      *      The payload is the first 'len' bytes at arg->iov->iov_base.  The
      *      reply goes to the source address of the route attached to 'skb'
      *      (or to the first hop of an echoed source route) and is sent from
      *      that route's rt_spec_dst.  IP options of 'skb' are echoed back.
      *      When arg->csumoffset is not negative, the 16-bit word at that
      *      index behind the transport header receives the folded sum of the
      *      payload checksum and arg->csum.
1323  *
1324  *      The socket itself is used as scratch space (tos, priority and
1325  *      protocol are overwritten), so calls must be serialised per socket.
      *      Nothing is sent if echoing the options or the route lookup fails.
1328  */
1329 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1330                    unsigned int len)
1331 {
1332         struct inet_sock *inet = inet_sk(sk);
1339         struct rtable *rt = (struct rtable*)skb->dst;
1333         struct {
1334                 struct ip_options       opt;
1335                 char                    data[40];
1336         } echoed;
1337         struct ipcm_cookie ipc;
             struct sk_buff *reply;
1338         u32 dst_addr;
1340
1341         if (ip_options_echo(&echoed.opt, skb) != 0)
1342                 return;
1343
1344         ipc.addr = rt->rt_src;
             dst_addr = ipc.addr;
1347         if (echoed.opt.optlen == 0) {
1345                 ipc.opt = NULL;
             } else {
1348                 ipc.opt = &echoed.opt;
                     /* A source route redirects the reply to its first hop. */
1350                 if (echoed.opt.srr)
1351                         dst_addr = echoed.opt.faddr;
1352         }
1353
1354         {
                     /* Ports are swapped relative to the packet being answered. */
1355                 struct flowi key = {
1356                         .nl_u = { .ip4_u = { .daddr = dst_addr,
1357                                              .saddr = rt->rt_spec_dst,
1358                                              .tos = RT_TOS(skb->nh.iph->tos) } },
1360                         .uli_u = { .ports = { .sport = skb->h.th->dest,
1362                                               .dport = skb->h.th->source } },
1363                         .proto = sk->sk_protocol,
                     };

1364                 if (ip_route_output_key(&rt, &key) != 0)
1365                         return;
1366         }
1367
1368         /*
1370          * Let the regular append/push path do the work.  It is not
1371          * reentrant, hence the socket spinlock; this relies on the caller
1372          * running with BHs disabled and on sk not being locked already.
1373          */
1374         bh_lock_sock(sk);
1375         inet->tos = skb->nh.iph->tos;
1376         sk->sk_priority = skb->priority;
1377         sk->sk_protocol = skb->nh.iph->protocol;
1378         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1379                        &ipc, rt, MSG_DONTWAIT);
1380         reply = skb_peek(&sk->sk_write_queue);
             if (reply != NULL) {
1381                 if (arg->csumoffset >= 0) {
                             u16 *csum_slot = (u16 *)reply->h.raw + arg->csumoffset;

1382                         *csum_slot = csum_fold(csum_add(reply->csum, arg->csum));
                     }
1383                 reply->ip_summed = CHECKSUM_NONE;
1384                 ip_push_pending_frames(sk);
1385         }
1387         bh_unlock_sock(sk);
1388
1389         ip_rt_put(rt);
1390 }
1391
     /*
      *      Boot-time initialisation: calls ip_rt_init() and inet_initpeers(),
      *      and igmp_mc_proc_init() when both CONFIG_IP_MULTICAST and
      *      CONFIG_PROC_FS are configured.
      */
1392 void __init ip_init(void)
1393 {
1394         ip_rt_init();
1395         inet_initpeers();
1396
1397 #ifdef CONFIG_IP_MULTICAST
     #ifdef CONFIG_PROC_FS
1398         igmp_mc_proc_init();
     #endif /* CONFIG_PROC_FS */
1399 #endif /* CONFIG_IP_MULTICAST */
1400 }
1401
1402 EXPORT_SYMBOL(ip_generic_getfrag);
1403 EXPORT_SYMBOL(ip_queue_xmit);
1404 EXPORT_SYMBOL(ip_send_check);