net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
   38  *                                      and better readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/mm.h>
  53 #include <linux/string.h>
  54 #include <linux/errno.h>
  55 #include <linux/highmem.h>
  56
  57 #include <linux/socket.h>
  58 #include <linux/sockios.h>
  59 #include <linux/in.h>
  60 #include <linux/inet.h>
  61 #include <linux/netdevice.h>
  62 #include <linux/etherdevice.h>
  63 #include <linux/proc_fs.h>
  64 #include <linux/stat.h>
  65 #include <linux/init.h>
  66
  67 #include <net/snmp.h>
  68 #include <net/ip.h>
  69 #include <net/protocol.h>
  70 #include <net/route.h>
  71 #include <net/xfrm.h>
  72 #include <linux/skbuff.h>
  73 #include <net/sock.h>
  74 #include <net/arp.h>
  75 #include <net/icmp.h>
  76 #include <net/checksum.h>
  77 #include <net/inetpeer.h>
  78 #include <net/checksum.h>
  79 #include <linux/igmp.h>
  80 #include <linux/netfilter_ipv4.h>
  81 #include <linux/netfilter_bridge.h>
  82 #include <linux/mroute.h>
  83 #include <linux/netlink.h>
  84 #include <linux/tcp.h>
  85
  86 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  87
  88 /* Generate a checksum for an outgoing IP datagram. */
  89 __inline__ void ip_send_check(struct iphdr *iph)
  90 {
  91         iph->check = 0;
  92         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  93 }
  94
  95 /* dev_loopback_xmit for use with netfilter. */
  96 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
  97 {
  98         skb_reset_mac_header(newskb);
  99         __skb_pull(newskb, skb_network_offset(newskb));
 100         newskb->pkt_type = PACKET_LOOPBACK;
 101         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 102         BUG_TRAP(newskb->dst);
 103         netif_rx(newskb);
 104         return 0;
 105 }
 106
 107 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 108 {
 109         int ttl = inet->uc_ttl;
 110
 111         if (ttl < 0)
 112                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 113         return ttl;
 114 }
 115
 116 /*
 117  *              Add an ip header to a skbuff and send it out.
 118  *
 119  */
 120 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 121                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 122 {
 123         struct inet_sock *inet = inet_sk(sk);
 124         struct rtable *rt = (struct rtable *)skb->dst;
 125         struct iphdr *iph;
 126
 127         /* Build the IP header. */
 128         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 129         skb_reset_network_header(skb);
 130         iph = ip_hdr(skb);
 131         iph->version  = 4;
 132         iph->ihl      = 5;
 133         iph->tos      = inet->tos;
 134         if (ip_dont_fragment(sk, &rt->u.dst))
 135                 iph->frag_off = htons(IP_DF);
 136         else
 137                 iph->frag_off = 0;
 138         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 139         iph->daddr    = rt->rt_dst;
 140         iph->saddr    = rt->rt_src;
 141         iph->protocol = sk->sk_protocol;
 142         iph->tot_len  = htons(skb->len);
 143         ip_select_ident(iph, &rt->u.dst, sk);
 144
 145         if (opt && opt->optlen) {
 146                 iph->ihl += opt->optlen>>2;
 147                 ip_options_build(skb, opt, daddr, rt, 0);
 148         }
 149         ip_send_check(iph);
 150
 151         skb->priority = sk->sk_priority;
 152
 153         /* Send it out. */
 154         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 155                        dst_output);
 156 }
 157
 158 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 159
 160 static inline int ip_finish_output2(struct sk_buff *skb)
 161 {
 162         struct dst_entry *dst = skb->dst;
 163         struct rtable *rt = (struct rtable *)dst;
 164         struct net_device *dev = dst->dev;
 165         int hh_len = LL_RESERVED_SPACE(dev);
 166
 167         if (rt->rt_type == RTN_MULTICAST)
 168                 IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
 169         else if (rt->rt_type == RTN_BROADCAST)
 170                 IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
 171
 172         /* Be paranoid, rather than too clever. */
 173         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 174                 struct sk_buff *skb2;
 175
 176                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 177                 if (skb2 == NULL) {
 178                         kfree_skb(skb);
 179                         return -ENOMEM;
 180                 }
 181                 if (skb->sk)
 182                         skb_set_owner_w(skb2, skb->sk);
 183                 kfree_skb(skb);
 184                 skb = skb2;
 185         }
 186
 187         if (dst->hh)
 188                 return neigh_hh_output(dst->hh, skb);
 189         else if (dst->neighbour)
 190                 return dst->neighbour->output(skb);
 191
 192         if (net_ratelimit())
 193                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 194         kfree_skb(skb);
 195         return -EINVAL;
 196 }
 197
 198 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 199 {
 200         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 201
 202         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 203                skb->dst->dev->mtu : dst_mtu(skb->dst);
 204 }
 205
 206 static inline int ip_finish_output(struct sk_buff *skb)
 207 {
 208 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 209         /* Policy lookup after SNAT yielded a new policy */
 210         if (skb->dst->xfrm != NULL) {
 211                 IPCB(skb)->flags |= IPSKB_REROUTED;
 212                 return dst_output(skb);
 213         }
 214 #endif
 215         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 216                 return ip_fragment(skb, ip_finish_output2);
 217         else
 218                 return ip_finish_output2(skb);
 219 }
 220
 221 int ip_mc_output(struct sk_buff *skb)
 222 {
 223         struct sock *sk = skb->sk;
 224         struct rtable *rt = (struct rtable*)skb->dst;
 225         struct net_device *dev = rt->u.dst.dev;
 226
 227         /*
 228          *      If the indicated interface is up and running, send the packet.
 229          */
 230         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 231
 232         skb->dev = dev;
 233         skb->protocol = htons(ETH_P_IP);
 234
 235         /*
 236          *      Multicasts are looped back for other local users
 237          */
 238
 239         if (rt->rt_flags&RTCF_MULTICAST) {
 240                 if ((!sk || inet_sk(sk)->mc_loop)
 241 #ifdef CONFIG_IP_MROUTE
 242                 /* Small optimization: do not loopback not local frames,
 243                    which returned after forwarding; they will be  dropped
 244                    by ip_mr_input in any case.
 245                    Note, that local frames are looped back to be delivered
 246                    to local recipients.
 247
 248                    This check is duplicated in ip_mr_input at the moment.
 249                  */
 250                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 251 #endif
 252                 ) {
 253                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 254                         if (newskb)
 255                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 256                                         newskb->dev,
 257                                         ip_dev_loopback_xmit);
 258                 }
 259
 260                 /* Multicasts with ttl 0 must not go beyond the host */
 261
 262                 if (ip_hdr(skb)->ttl == 0) {
 263                         kfree_skb(skb);
 264                         return 0;
 265                 }
 266         }
 267
 268         if (rt->rt_flags&RTCF_BROADCAST) {
 269                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 270                 if (newskb)
 271                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 272                                 newskb->dev, ip_dev_loopback_xmit);
 273         }
 274
 275         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
 276                             ip_finish_output,
 277                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 278 }
 279
 280 int ip_output(struct sk_buff *skb)
 281 {
 282         struct net_device *dev = skb->dst->dev;
 283
 284         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 285
 286         skb->dev = dev;
 287         skb->protocol = htons(ETH_P_IP);
 288
 289         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 290                             ip_finish_output,
 291                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 292 }
 293
 294 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 295 {
 296         struct sock *sk = skb->sk;
 297         struct inet_sock *inet = inet_sk(sk);
 298         struct ip_options *opt = inet->opt;
 299         struct rtable *rt;
 300         struct iphdr *iph;
 301
 302         /* Skip all of this if the packet is already routed,
 303          * f.e. by something like SCTP.
 304          */
 305         rt = (struct rtable *) skb->dst;
 306         if (rt != NULL)
 307                 goto packet_routed;
 308
 309         /* Make sure we can route this packet. */
 310         rt = (struct rtable *)__sk_dst_check(sk, 0);
 311         if (rt == NULL) {
 312                 __be32 daddr;
 313
 314                 /* Use correct destination address if we have options. */
 315                 daddr = inet->daddr;
 316                 if(opt && opt->srr)
 317                         daddr = opt->faddr;
 318
 319                 {
 320                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 321                                             .nl_u = { .ip4_u =
 322                                                       { .daddr = daddr,
 323                                                         .saddr = inet->saddr,
 324                                                         .tos = RT_CONN_FLAGS(sk) } },
 325                                             .proto = sk->sk_protocol,
 326                                             .uli_u = { .ports =
 327                                                        { .sport = inet->sport,
 328                                                          .dport = inet->dport } } };
 329
 330                         /* If this fails, retransmit mechanism of transport layer will
 331                          * keep trying until route appears or the connection times
 332                          * itself out.
 333                          */
 334                         security_sk_classify_flow(sk, &fl);
 335                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 336                                 goto no_route;
 337                 }
 338                 sk_setup_caps(sk, &rt->u.dst);
 339         }
 340         skb->dst = dst_clone(&rt->u.dst);
 341
 342 packet_routed:
 343         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 344                 goto no_route;
 345
 346         /* OK, we know where to send it, allocate and build IP header. */
 347         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 348         skb_reset_network_header(skb);
 349         iph = ip_hdr(skb);
 350         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 351         iph->tot_len = htons(skb->len);
 352         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 353                 iph->frag_off = htons(IP_DF);
 354         else
 355                 iph->frag_off = 0;
 356         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 357         iph->protocol = sk->sk_protocol;
 358         iph->saddr    = rt->rt_src;
 359         iph->daddr    = rt->rt_dst;
 360         /* Transport layer set skb->h.foo itself. */
 361
 362         if (opt && opt->optlen) {
 363                 iph->ihl += opt->optlen >> 2;
 364                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 365         }
 366
 367         ip_select_ident_more(iph, &rt->u.dst, sk,
 368                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 369
 370         /* Add an IP checksum. */
 371         ip_send_check(iph);
 372
 373         skb->priority = sk->sk_priority;
 374
 375         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 376                        dst_output);
 377
 378 no_route:
 379         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 380         kfree_skb(skb);
 381         return -EHOSTUNREACH;
 382 }
 383
 384
 385 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 386 {
 387         to->pkt_type = from->pkt_type;
 388         to->priority = from->priority;
 389         to->protocol = from->protocol;
 390         dst_release(to->dst);
 391         to->dst = dst_clone(from->dst);
 392         to->dev = from->dev;
 393         to->mark = from->mark;
 394
 395         /* Copy the flags to each fragment. */
 396         IPCB(to)->flags = IPCB(from)->flags;
 397
 398 #ifdef CONFIG_NET_SCHED
 399         to->tc_index = from->tc_index;
 400 #endif
 401         nf_copy(to, from);
 402 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 403     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 404         to->nf_trace = from->nf_trace;
 405 #endif
 406 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 407         to->ipvs_property = from->ipvs_property;
 408 #endif
 409         skb_copy_secmark(to, from);
 410 }
 411
 412 /*
 413  *      This IP datagram is too large to be sent in one piece.  Break it up into
 414  *      smaller pieces (each of size equal to IP header plus
 415  *      a block of the data of the original IP data part) that will yet fit in a
 416  *      single device frame, and queue such a frame for sending.
 417  */
 418
 419 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 420 {
 421         struct iphdr *iph;
 422         int raw = 0;
 423         int ptr;
 424         struct net_device *dev;
 425         struct sk_buff *skb2;
 426         unsigned int mtu, hlen, left, len, ll_rs, pad;
 427         int offset;
 428         __be16 not_last_frag;
 429         struct rtable *rt = (struct rtable*)skb->dst;
 430         int err = 0;
 431
 432         dev = rt->u.dst.dev;
 433
 434         /*
 435          *      Point into the IP datagram header.
 436          */
 437
 438         iph = ip_hdr(skb);
 439
 440         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 441                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 442                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 443                           htonl(ip_skb_dst_mtu(skb)));
 444                 kfree_skb(skb);
 445                 return -EMSGSIZE;
 446         }
 447
 448         /*
 449          *      Setup starting values.
 450          */
 451
 452         hlen = iph->ihl * 4;
 453         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 454         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 455
 456         /* When frag_list is given, use it. First, check its validity:
 457          * some transformers could create wrong frag_list or break existing
 458          * one, it is not prohibited. In this case fall back to copying.
 459          *
 460          * LATER: this step can be merged to real generation of fragments,
 461          * we can switch to copy when see the first bad fragment.
 462          */
 463         if (skb_shinfo(skb)->frag_list) {
 464                 struct sk_buff *frag;
 465                 int first_len = skb_pagelen(skb);
 466
 467                 if (first_len - hlen > mtu ||
 468                     ((first_len - hlen) & 7) ||
 469                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 470                     skb_cloned(skb))
 471                         goto slow_path;
 472
 473                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 474                         /* Correct geometry. */
 475                         if (frag->len > mtu ||
 476                             ((frag->len & 7) && frag->next) ||
 477                             skb_headroom(frag) < hlen)
 478                             goto slow_path;
 479
 480                         /* Partially cloned skb? */
 481                         if (skb_shared(frag))
 482                                 goto slow_path;
 483
 484                         BUG_ON(frag->sk);
 485                         if (skb->sk) {
 486                                 sock_hold(skb->sk);
 487                                 frag->sk = skb->sk;
 488                                 frag->destructor = sock_wfree;
 489                                 skb->truesize -= frag->truesize;
 490                         }
 491                 }
 492
 493                 /* Everything is OK. Generate! */
 494
 495                 err = 0;
 496                 offset = 0;
 497                 frag = skb_shinfo(skb)->frag_list;
 498                 skb_shinfo(skb)->frag_list = NULL;
 499                 skb->data_len = first_len - skb_headlen(skb);
 500                 skb->len = first_len;
 501                 iph->tot_len = htons(first_len);
 502                 iph->frag_off = htons(IP_MF);
 503                 ip_send_check(iph);
 504
 505                 for (;;) {
 506                         /* Prepare header of the next frame,
 507                          * before previous one went down. */
 508                         if (frag) {
 509                                 frag->ip_summed = CHECKSUM_NONE;
 510                                 skb_reset_transport_header(frag);
 511                                 __skb_push(frag, hlen);
 512                                 skb_reset_network_header(frag);
 513                                 memcpy(skb_network_header(frag), iph, hlen);
 514                                 iph = ip_hdr(frag);
 515                                 iph->tot_len = htons(frag->len);
 516                                 ip_copy_metadata(frag, skb);
 517                                 if (offset == 0)
 518                                         ip_options_fragment(frag);
 519                                 offset += skb->len - hlen;
 520                                 iph->frag_off = htons(offset>>3);
 521                                 if (frag->next != NULL)
 522                                         iph->frag_off |= htons(IP_MF);
 523                                 /* Ready, complete checksum */
 524                                 ip_send_check(iph);
 525                         }
 526
 527                         err = output(skb);
 528
 529                         if (!err)
 530                                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 531                         if (err || !frag)
 532                                 break;
 533
 534                         skb = frag;
 535                         frag = skb->next;
 536                         skb->next = NULL;
 537                 }
 538
 539                 if (err == 0) {
 540                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 541                         return 0;
 542                 }
 543
 544                 while (frag) {
 545                         skb = frag->next;
 546                         kfree_skb(frag);
 547                         frag = skb;
 548                 }
 549                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 550                 return err;
 551         }
 552
 553 slow_path:
 554         left = skb->len - hlen;         /* Space per frame */
 555         ptr = raw + hlen;               /* Where to start from */
 556
 557         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 558          * we need to make room for the encapsulating header
 559          */
 560         pad = nf_bridge_pad(skb);
 561         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 562         mtu -= pad;
 563
 564         /*
 565          *      Fragment the datagram.
 566          */
 567
 568         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 569         not_last_frag = iph->frag_off & htons(IP_MF);
 570
 571         /*
 572          *      Keep copying data until we run out.
 573          */
 574
 575         while (left > 0) {
 576                 len = left;
 577                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 578                 if (len > mtu)
 579                         len = mtu;
 580                 /* IF: we are not sending upto and including the packet end
 581                    then align the next start on an eight byte boundary */
 582                 if (len < left) {
 583                         len &= ~7;
 584                 }
 585                 /*
 586                  *      Allocate buffer.
 587                  */
 588
 589                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 590                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 591                         err = -ENOMEM;
 592                         goto fail;
 593                 }
 594
 595                 /*
 596                  *      Set up data on packet
 597                  */
 598
 599                 ip_copy_metadata(skb2, skb);
 600                 skb_reserve(skb2, ll_rs);
 601                 skb_put(skb2, len + hlen);
 602                 skb_reset_network_header(skb2);
 603                 skb2->transport_header = skb2->network_header + hlen;
 604
 605                 /*
 606                  *      Charge the memory for the fragment to any owner
 607                  *      it might possess
 608                  */
 609
 610                 if (skb->sk)
 611                         skb_set_owner_w(skb2, skb->sk);
 612
 613                 /*
 614                  *      Copy the packet header into the new buffer.
 615                  */
 616
 617                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 618
 619                 /*
 620                  *      Copy a block of the IP datagram.
 621                  */
 622                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 623                         BUG();
 624                 left -= len;
 625
 626                 /*
 627                  *      Fill in the new header fields.
 628                  */
 629                 iph = ip_hdr(skb2);
 630                 iph->frag_off = htons((offset >> 3));
 631
 632                 /* ANK: dirty, but effective trick. Upgrade options only if
 633                  * the segment to be fragmented was THE FIRST (otherwise,
 634                  * options are already fixed) and make it ONCE
 635                  * on the initial skb, so that all the following fragments
 636                  * will inherit fixed options.
 637                  */
 638                 if (offset == 0)
 639                         ip_options_fragment(skb);
 640
 641                 /*
 642                  *      Added AC : If we are fragmenting a fragment that's not the
 643                  *                 last fragment then keep MF on each bit
 644                  */
 645                 if (left > 0 || not_last_frag)
 646                         iph->frag_off |= htons(IP_MF);
 647                 ptr += len;
 648                 offset += len;
 649
 650                 /*
 651                  *      Put this fragment into the sending queue.
 652                  */
 653                 iph->tot_len = htons(len + hlen);
 654
 655                 ip_send_check(iph);
 656
 657                 err = output(skb2);
 658                 if (err)
 659                         goto fail;
 660
 661                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 662         }
 663         kfree_skb(skb);
 664         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 665         return err;
 666
 667 fail:
 668         kfree_skb(skb);
 669         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 670         return err;
 671 }
 672
 673 EXPORT_SYMBOL(ip_fragment);
 674
 675 int
 676 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 677 {
 678         struct iovec *iov = from;
 679
 680         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 681                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 682                         return -EFAULT;
 683         } else {
 684                 __wsum csum = 0;
 685                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 686                         return -EFAULT;
 687                 skb->csum = csum_block_add(skb->csum, csum, odd);
 688         }
 689         return 0;
 690 }
 691
 692 static inline __wsum
 693 csum_page(struct page *page, int offset, int copy)
 694 {
 695         char *kaddr;
 696         __wsum csum;
 697         kaddr = kmap(page);
 698         csum = csum_partial(kaddr + offset, copy, 0);
 699         kunmap(page);
 700         return csum;
 701 }
 702
 703 static inline int ip_ufo_append_data(struct sock *sk,
 704                         int getfrag(void *from, char *to, int offset, int len,
 705                                int odd, struct sk_buff *skb),
 706                         void *from, int length, int hh_len, int fragheaderlen,
 707                         int transhdrlen, int mtu,unsigned int flags)
 708 {
 709         struct sk_buff *skb;
 710         int err;
 711
 712         /* There is support for UDP fragmentation offload by network
 713          * device, so create one single skb packet containing complete
 714          * udp datagram
 715          */
 716         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 717                 skb = sock_alloc_send_skb(sk,
 718                         hh_len + fragheaderlen + transhdrlen + 20,
 719                         (flags & MSG_DONTWAIT), &err);
 720
 721                 if (skb == NULL)
 722                         return err;
 723
 724                 /* reserve space for Hardware header */
 725                 skb_reserve(skb, hh_len);
 726
 727                 /* create space for UDP/IP header */
 728                 skb_put(skb,fragheaderlen + transhdrlen);
 729
 730                 /* initialize network header pointer */
 731                 skb_reset_network_header(skb);
 732
 733                 /* initialize protocol header pointer */
 734                 skb->transport_header = skb->network_header + fragheaderlen;
 735
 736                 skb->ip_summed = CHECKSUM_PARTIAL;
 737                 skb->csum = 0;
 738                 sk->sk_sndmsg_off = 0;
 739         }
 740
 741         err = skb_append_datato_frags(sk,skb, getfrag, from,
 742                                (length - transhdrlen));
 743         if (!err) {
 744                 /* specify the length of each IP datagram fragment*/
 745                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 746                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 747                 __skb_queue_tail(&sk->sk_write_queue, skb);
 748
 749                 return 0;
 750         }
 751         /* There is not enough support do UFO ,
 752          * so follow normal path
 753          */
 754         kfree_skb(skb);
 755         return err;
 756 }
 757
 758 /*
 759  *      ip_append_data() and ip_append_page() can make one large IP datagram
  760  *      from many pieces of data. Each piece will be held on the socket
 761  *      until ip_push_pending_frames() is called. Each piece can be a page
 762  *      or non-page data.
 763  *
 764  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 765  *      this interface potentially.
 766  *
 767  *      LATER: length must be adjusted by pad at tail, when it is required.
 768  */
 769 int ip_append_data(struct sock *sk,
 770                    int getfrag(void *from, char *to, int offset, int len,
 771                                int odd, struct sk_buff *skb),
 772                    void *from, int length, int transhdrlen,
 773                    struct ipcm_cookie *ipc, struct rtable *rt,
 774                    unsigned int flags)
 775 {
 776         struct inet_sock *inet = inet_sk(sk);
 777         struct sk_buff *skb;
 778
 779         struct ip_options *opt = NULL;
 780         int hh_len;
 781         int exthdrlen;
 782         int mtu;
 783         int copy;
 784         int err;
 785         int offset = 0;
 786         unsigned int maxfraglen, fragheaderlen;
 787         int csummode = CHECKSUM_NONE;
 788
 789         if (flags&MSG_PROBE)
 790                 return 0;
 791
 792         if (skb_queue_empty(&sk->sk_write_queue)) {
 793                 /*
 794                  * setup for corking.
 795                  */
 796                 opt = ipc->opt;
 797                 if (opt) {
 798                         if (inet->cork.opt == NULL) {
 799                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 800                                 if (unlikely(inet->cork.opt == NULL))
 801                                         return -ENOBUFS;
 802                         }
 803                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 804                         inet->cork.flags |= IPCORK_OPT;
 805                         inet->cork.addr = ipc->addr;
 806                 }
 807                 dst_hold(&rt->u.dst);
 808                 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 809                                             rt->u.dst.dev->mtu :
 810                                             dst_mtu(rt->u.dst.path);
 811                 inet->cork.rt = rt;
 812                 inet->cork.length = 0;
 813                 sk->sk_sndmsg_page = NULL;
 814                 sk->sk_sndmsg_off = 0;
 815                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 816                         length += exthdrlen;
 817                         transhdrlen += exthdrlen;
 818                 }
 819         } else {
 820                 rt = inet->cork.rt;
 821                 if (inet->cork.flags & IPCORK_OPT)
 822                         opt = inet->cork.opt;
 823
 824                 transhdrlen = 0;
 825                 exthdrlen = 0;
 826                 mtu = inet->cork.fragsize;
 827         }
 828         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 829
 830         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 831         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 832
 833         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 834                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 835                 return -EMSGSIZE;
 836         }
 837
 838         /*
 839          * transhdrlen > 0 means that this is the first fragment and we wish
 840          * it won't be fragmented in the future.
 841          */
 842         if (transhdrlen &&
 843             length + fragheaderlen <= mtu &&
 844             rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
 845             !exthdrlen)
 846                 csummode = CHECKSUM_PARTIAL;
 847
 848         inet->cork.length += length;
 849         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
 850                         (rt->u.dst.dev->features & NETIF_F_UFO)) {
 851
 852                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 853                                          fragheaderlen, transhdrlen, mtu,
 854                                          flags);
 855                 if (err)
 856                         goto error;
 857                 return 0;
 858         }
 859
 860         /* So, what's going on in the loop below?
 861          *
 862          * We use calculated fragment length to generate chained skb,
 863          * each of segments is IP fragment ready for sending to network after
 864          * adding appropriate IP header.
 865          */
 866
 867         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 868                 goto alloc_new_skb;
 869
 870         while (length > 0) {
 871                 /* Check if the remaining data fits into current packet. */
 872                 copy = mtu - skb->len;
 873                 if (copy < length)
 874                         copy = maxfraglen - skb->len;
 875                 if (copy <= 0) {
 876                         char *data;
 877                         unsigned int datalen;
 878                         unsigned int fraglen;
 879                         unsigned int fraggap;
 880                         unsigned int alloclen;
 881                         struct sk_buff *skb_prev;
 882 alloc_new_skb:
 883                         skb_prev = skb;
 884                         if (skb_prev)
 885                                 fraggap = skb_prev->len - maxfraglen;
 886                         else
 887                                 fraggap = 0;
 888
 889                         /*
 890                          * If remaining data exceeds the mtu,
 891                          * we know we need more fragment(s).
 892                          */
 893                         datalen = length + fraggap;
 894                         if (datalen > mtu - fragheaderlen)
 895                                 datalen = maxfraglen - fragheaderlen;
 896                         fraglen = datalen + fragheaderlen;
 897
 898                         if ((flags & MSG_MORE) &&
 899                             !(rt->u.dst.dev->features&NETIF_F_SG))
 900                                 alloclen = mtu;
 901                         else
 902                                 alloclen = datalen + fragheaderlen;
 903
 904                         /* The last fragment gets additional space at tail.
 905                          * Note, with MSG_MORE we overallocate on fragments,
 906                          * because we have no idea what fragment will be
 907                          * the last.
 908                          */
 909                         if (datalen == length + fraggap)
 910                                 alloclen += rt->u.dst.trailer_len;
 911
 912                         if (transhdrlen) {
 913                                 skb = sock_alloc_send_skb(sk,
 914                                                 alloclen + hh_len + 15,
 915                                                 (flags & MSG_DONTWAIT), &err);
 916                         } else {
 917                                 skb = NULL;
 918                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 919                                     2 * sk->sk_sndbuf)
 920                                         skb = sock_wmalloc(sk,
 921                                                            alloclen + hh_len + 15, 1,
 922                                                            sk->sk_allocation);
 923                                 if (unlikely(skb == NULL))
 924                                         err = -ENOBUFS;
 925                         }
 926                         if (skb == NULL)
 927                                 goto error;
 928
 929                         /*
 930                          *      Fill in the control structures
 931                          */
 932                         skb->ip_summed = csummode;
 933                         skb->csum = 0;
 934                         skb_reserve(skb, hh_len);
 935
 936                         /*
 937                          *      Find where to start putting bytes.
 938                          */
 939                         data = skb_put(skb, fraglen);
 940                         skb_set_network_header(skb, exthdrlen);
 941                         skb->transport_header = (skb->network_header +
 942                                                  fragheaderlen);
 943                         data += fragheaderlen;
 944
 945                         if (fraggap) {
 946                                 skb->csum = skb_copy_and_csum_bits(
 947                                         skb_prev, maxfraglen,
 948                                         data + transhdrlen, fraggap, 0);
 949                                 skb_prev->csum = csum_sub(skb_prev->csum,
 950                                                           skb->csum);
 951                                 data += fraggap;
 952                                 pskb_trim_unique(skb_prev, maxfraglen);
 953                         }
 954
 955                         copy = datalen - transhdrlen - fraggap;
 956                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 957                                 err = -EFAULT;
 958                                 kfree_skb(skb);
 959                                 goto error;
 960                         }
 961
 962                         offset += copy;
 963                         length -= datalen - fraggap;
 964                         transhdrlen = 0;
 965                         exthdrlen = 0;
 966                         csummode = CHECKSUM_NONE;
 967
 968                         /*
 969                          * Put the packet on the pending queue.
 970                          */
 971                         __skb_queue_tail(&sk->sk_write_queue, skb);
 972                         continue;
 973                 }
 974
 975                 if (copy > length)
 976                         copy = length;
 977
 978                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 979                         unsigned int off;
 980
 981                         off = skb->len;
 982                         if (getfrag(from, skb_put(skb, copy),
 983                                         offset, copy, off, skb) < 0) {
 984                                 __skb_trim(skb, off);
 985                                 err = -EFAULT;
 986                                 goto error;
 987                         }
 988                 } else {
 989                         int i = skb_shinfo(skb)->nr_frags;
 990                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 991                         struct page *page = sk->sk_sndmsg_page;
 992                         int off = sk->sk_sndmsg_off;
 993                         unsigned int left;
 994
 995                         if (page && (left = PAGE_SIZE - off) > 0) {
 996                                 if (copy >= left)
 997                                         copy = left;
 998                                 if (page != frag->page) {
 999                                         if (i == MAX_SKB_FRAGS) {
1000                                                 err = -EMSGSIZE;
1001                                                 goto error;
1002                                         }
1003                                         get_page(page);
1004                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1005                                         frag = &skb_shinfo(skb)->frags[i];
1006                                 }
1007                         } else if (i < MAX_SKB_FRAGS) {
1008                                 if (copy > PAGE_SIZE)
1009                                         copy = PAGE_SIZE;
1010                                 page = alloc_pages(sk->sk_allocation, 0);
1011                                 if (page == NULL)  {
1012                                         err = -ENOMEM;
1013                                         goto error;
1014                                 }
1015                                 sk->sk_sndmsg_page = page;
1016                                 sk->sk_sndmsg_off = 0;
1017
1018                                 skb_fill_page_desc(skb, i, page, 0, 0);
1019                                 frag = &skb_shinfo(skb)->frags[i];
1020                                 skb->truesize += PAGE_SIZE;
1021                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1022                         } else {
1023                                 err = -EMSGSIZE;
1024                                 goto error;
1025                         }
1026                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1027                                 err = -EFAULT;
1028                                 goto error;
1029                         }
1030                         sk->sk_sndmsg_off += copy;
1031                         frag->size += copy;
1032                         skb->len += copy;
1033                         skb->data_len += copy;
1034                 }
1035                 offset += copy;
1036                 length -= copy;
1037         }
1038
1039         return 0;
1040
1041 error:
1042         inet->cork.length -= length;
1043         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1044         return err;
1045 }
1046
     /*
      *      Append @size bytes of @page, starting at @offset, to the datagram
      *      that is already pending on sk->sk_write_queue. The page is not
      *      copied: a reference is taken with get_page() and the data is
      *      attached to the tail skb as a page fragment.
      *
      *      Returns 0 on success and for MSG_PROBE, otherwise a negative errno:
      *      -EPERM       inet->hdrincl is set
      *      -EINVAL      nothing is queued on the socket yet
      *      -EOPNOTSUPP  the output device lacks NETIF_F_SG
      *      -EMSGSIZE    the datagram would exceed 0xFFFF bytes, or the tail
      *                   skb already has MAX_SKB_FRAGS fragments
      *      -ENOBUFS     a new skb could not be allocated
      */
1047 ssize_t ip_append_page(struct sock *sk, struct page *page,
1048                        int offset, size_t size, int flags)
1049 {
1050         struct inet_sock *inet = inet_sk(sk);
1051         struct sk_buff *skb;
1052         struct rtable *rt;
1053         struct ip_options *opt = NULL;
1054         int hh_len;
1055         int mtu;
1056         int len;
1057         int err;
1058         unsigned int maxfraglen, fragheaderlen, fraggap;
1059
1060         if (inet->hdrincl)
1061                 return -EPERM;
1062
1063         if (flags&MSG_PROBE)
1064                 return 0;
1065
1066         if (skb_queue_empty(&sk->sk_write_queue))
1067                 return -EINVAL;
1068
             /* Route, options and fragment size come from the cork state. */
1069         rt = inet->cork.rt;
1070         if (inet->cork.flags & IPCORK_OPT)
1071                 opt = inet->cork.opt;
1072
1073         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1074                 return -EOPNOTSUPP;
1075
1076         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1077         mtu = inet->cork.fragsize;
1078
             /* Header bytes carried by every fragment, and the largest
              * fragment length whose payload is a multiple of 8 bytes.
              */
1079         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1080         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1081
1082         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1083                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1084                 return -EMSGSIZE;
1085         }
1086
1087         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1088                 return -EINVAL;
1089
             /* Count the new bytes up front; the error path gives back the
              * part that was not attached. On a UFO-capable device the tail
              * skb is marked as GSO, and the loop below then adds all of
              * @size to it without splitting.
              */
1090         inet->cork.length += size;
1091         if ((sk->sk_protocol == IPPROTO_UDP) &&
1092             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1093                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1094                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1095         }
1096
1097
1098         while (size > 0) {
1099                 int i;
1100
1101                 if (skb_is_gso(skb))
1102                         len = size;
1103                 else {
1104
1105                         /* Check if the remaining data fits into current packet. */
1106                         len = mtu - skb->len;
1107                         if (len < size)
1108                                 len = maxfraglen - skb->len;
1109                 }
1110                 if (len <= 0) {
                             /* Tail skb is full: open a new one that holds just
                              * the header plus any bytes of the previous skb
                              * lying beyond maxfraglen.
                              */
1111                         struct sk_buff *skb_prev;
1112                         int alloclen;
1113
1114                         skb_prev = skb;
1115                         fraggap = skb_prev->len - maxfraglen;
1116
1117                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1118                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1119                         if (unlikely(!skb)) {
1120                                 err = -ENOBUFS;
1121                                 goto error;
1122                         }
1123
1124                         /*
1125                          *      Fill in the control structures
1126                          */
1127                         skb->ip_summed = CHECKSUM_NONE;
1128                         skb->csum = 0;
1129                         skb_reserve(skb, hh_len);
1130
1131                         /*
1132                          *      Find where to start putting bytes.
1133                          */
1134                         skb_put(skb, fragheaderlen + fraggap);
1135                         skb_reset_network_header(skb);
1136                         skb->transport_header = (skb->network_header +
1137                                                  fragheaderlen);
1138                         if (fraggap) {
1139                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1140                                                                    maxfraglen,
1141                                                     skb_transport_header(skb),
1142                                                                    fraggap, 0);
1143                                 skb_prev->csum = csum_sub(skb_prev->csum,
1144                                                           skb->csum);
1145                                 pskb_trim_unique(skb_prev, maxfraglen);
1146                         }
1147
1148                         /*
1149                          * Put the packet on the pending queue.
1150                          */
1151                         __skb_queue_tail(&sk->sk_write_queue, skb);
1152                         continue;
1153                 }
1154
1155                 i = skb_shinfo(skb)->nr_frags;
1156                 if (len > size)
1157                         len = size;
                     /* Extend the last page fragment when skb_can_coalesce()
                      * allows it, otherwise take a page reference and add a
                      * new fragment.
                      */
1158                 if (skb_can_coalesce(skb, i, page, offset)) {
1159                         skb_shinfo(skb)->frags[i-1].size += len;
1160                 } else if (i < MAX_SKB_FRAGS) {
1161                         get_page(page);
1162                         skb_fill_page_desc(skb, i, page, offset, len);
1163                 } else {
1164                         err = -EMSGSIZE;
1165                         goto error;
1166                 }
1167
                     /* Software checksum: fold the page data into skb->csum. */
1168                 if (skb->ip_summed == CHECKSUM_NONE) {
1169                         __wsum csum;
1170                         csum = csum_page(page, offset, len);
1171                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1172                 }
1173
1174                 skb->len += len;
1175                 skb->data_len += len;
1176                 offset += len;
1177                 size -= len;
1178         }
1179         return 0;
1180
1181 error:
             /* size now holds only the bytes that were never attached. */
1182         inet->cork.length -= size;
1183         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1184         return err;
1185 }
1186
1187 /*
1188  *      Combine all pending IP fragments on the socket as one IP datagram
1189  *      and push them out.
      *
      *      The first skb on sk->sk_write_queue becomes the head; every further
      *      skb is chained onto the head's frag_list. The IP header is then
      *      filled in at the head's data pointer and the result is handed to
      *      the NF_IP_LOCAL_OUT hook with dst_output as the continuation.
      *
      *      The corked options and route are released before returning on
      *      every path, including the one where the write queue was empty.
      *
      *      Returns 0 on success or a negative errno. A positive result from
      *      the hook is translated with net_xmit_errno() only when
      *      inet->recverr is set; otherwise it is reported as 0.
1190  */
1191 int ip_push_pending_frames(struct sock *sk)
1192 {
1193         struct sk_buff *skb, *tmp_skb;
1194         struct sk_buff **tail_skb;
1195         struct inet_sock *inet = inet_sk(sk);
1196         struct ip_options *opt = NULL;
1197         struct rtable *rt = inet->cork.rt;
1198         struct iphdr *iph;
1199         __be16 df = 0;
1200         __u8 ttl;
1201         int err = 0;
1202
1203         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1204                 goto out;
1205         tail_skb = &(skb_shinfo(skb)->frag_list);
1206
1207         /* move skb->data to ip header from ext header */
1208         if (skb->data < skb_network_header(skb))
1209                 __skb_pull(skb, skb_network_offset(skb));
             /* Chain every remaining skb onto the head's frag_list after pulling
              * skb_network_header_len(skb) bytes off its front; its length and
              * truesize are added to the head and its socket link is cleared.
              */
1210         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1211                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1212                 *tail_skb = tmp_skb;
1213                 tail_skb = &(tmp_skb->next);
1214                 skb->len += tmp_skb->len;
1215                 skb->data_len += tmp_skb->len;
1216                 skb->truesize += tmp_skb->truesize;
1217                 __sock_put(tmp_skb->sk);
1218                 tmp_skb->destructor = NULL;
1219                 tmp_skb->sk = NULL;
1220         }
1221
1222         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1223          * to fragment the frame generated here. No matter, what transforms
1224          * how transforms change size of the packet, it will come out.
1225          */
1226         if (inet->pmtudisc < IP_PMTUDISC_DO)
1227                 skb->local_df = 1;
1228
1229         /* DF bit is set when we want to see DF on outgoing frames.
1230          * If local_df is set too, we still allow to fragment this frame
1231          * locally. */
1232         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1233             (skb->len <= dst_mtu(&rt->u.dst) &&
1234              ip_dont_fragment(sk, &rt->u.dst)))
1235                 df = htons(IP_DF);
1236
1237         if (inet->cork.flags & IPCORK_OPT)
1238                 opt = inet->cork.opt;
1239
1240         if (rt->rt_type == RTN_MULTICAST)
1241                 ttl = inet->mc_ttl;
1242         else
1243                 ttl = ip_select_ttl(inet, &rt->u.dst);
1244
             /* Build the IP header in place at the start of the head skb. */
1245         iph = (struct iphdr *)skb->data;
1246         iph->version = 4;
1247         iph->ihl = 5;
1248         if (opt) {
1249                 iph->ihl += opt->optlen>>2;
1250                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1251         }
1252         iph->tos = inet->tos;
1253         iph->tot_len = htons(skb->len);
1254         iph->frag_off = df;
1255         ip_select_ident(iph, &rt->u.dst, sk);
1256         iph->ttl = ttl;
1257         iph->protocol = sk->sk_protocol;
1258         iph->saddr = rt->rt_src;
1259         iph->daddr = rt->rt_dst;
1260         ip_send_check(iph);
1261
1262         skb->priority = sk->sk_priority;
1263         skb->dst = dst_clone(&rt->u.dst);
1264
1265         /* Netfilter gets whole the not fragmented skb. */
1266         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1267                       skb->dst->dev, dst_output);
1268         if (err) {
1269                 if (err > 0)
1270                         err = inet->recverr ? net_xmit_errno(err) : 0;
1271                 if (err)
1272                         goto error;
1273         }
1274
1275 out:
             /* Drop the corked options and route; reached on every path. */
1276         inet->cork.flags &= ~IPCORK_OPT;
1277         kfree(inet->cork.opt);
1278         inet->cork.opt = NULL;
1279         if (inet->cork.rt) {
1280                 ip_rt_put(inet->cork.rt);
1281                 inet->cork.rt = NULL;
1282         }
1283         return err;
1284
1285 error:
1286         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1287         goto out;
1288 }
1289
1290 /*
1291  *      Throw away all pending data on the socket.
      *
      *      Frees every skb still on sk->sk_write_queue, then clears the cork
      *      state: the IPCORK_OPT flag, the saved options and the held route.
1292  */
1293 void ip_flush_pending_frames(struct sock *sk)
1294 {
1295         struct inet_sock *inet = inet_sk(sk);
1296         struct sk_buff *skb;
1297
             /* Drain the queue from the tail, freeing each skb. */
1298         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1299                 kfree_skb(skb);
1300
1301         inet->cork.flags &= ~IPCORK_OPT;
1302         kfree(inet->cork.opt);
1303         inet->cork.opt = NULL;
1304         if (inet->cork.rt) {
1305                 ip_rt_put(inet->cork.rt);
1306                 inet->cork.rt = NULL;
1307         }
1308 }
1309
1310
1311 /*
1312  *      Fetch data from kernel space and fill in checksum if needed.
      *
      *      Callback with the getfrag signature expected by ip_append_data():
      *      copies @len bytes from @dptr + @offset to @to using
      *      csum_partial_copy_nocheck() and adds the resulting checksum to
      *      skb->csum at block offset @odd. Always returns 0.
1313  */
1314 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1315                               int len, int odd, struct sk_buff *skb)
1316 {
1317         __wsum csum;
1318
1319         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1320         skb->csum = csum_block_add(skb->csum, csum, odd);
1321         return 0;
1322 }
1323
1324 /*
1325  *      Generic function to send a packet as reply to another packet.
1326  *      Used to send TCP resets so far. ICMP should use this function too.
1327  *
1328  *      Should run single threaded per socket because it uses the sock
1329  *      structure to pass arguments.
1330  *
1331  *      LATER: switch from ip_build_xmit to ip_append_*
      *      NOTE(review): the body already calls ip_append_data(), so the
      *      LATER item above looks stale - confirm before removing it.
      *
      *      @sk:   socket used to build the reply; its tos, sk_priority,
      *             sk_protocol and sk_bound_dev_if are overwritten here
      *      @skb:  the packet being answered; its route, IP header and ports
      *             provide the reply's addressing
      *      @arg:  reply description: iov (payload), csum and csumoffset
      *             (transport checksum fix-up) and bound_dev_if
      *      @len:  number of payload bytes taken from arg->iov->iov_base
      *
      *      Nothing is returned. The function gives up silently when
      *      ip_options_echo() or the route lookup fails, and sends nothing
      *      when ip_append_data() left the write queue empty.
1332  */
1333 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1334                    unsigned int len)
1335 {
1336         struct inet_sock *inet = inet_sk(sk);
1337         struct {
1338                 struct ip_options       opt;
1339                 char                    data[40];
1340         } replyopts;
1341         struct ipcm_cookie ipc;
1342         __be32 daddr;
1343         struct rtable *rt = (struct rtable*)skb->dst;
1344
             /* Build the reply's options from @skb; give up on failure. */
1345         if (ip_options_echo(&replyopts.opt, skb))
1346                 return;
1347
             /* Reply to the source recorded in the incoming route, unless the
              * echoed options have srr set, in which case faddr is used.
              */
1348         daddr = ipc.addr = rt->rt_src;
1349         ipc.opt = NULL;
1350
1351         if (replyopts.opt.optlen) {
1352                 ipc.opt = &replyopts.opt;
1353
1354                 if (ipc.opt->srr)
1355                         daddr = replyopts.opt.faddr;
1356         }
1357
             /* Look up the output route; on success rt is replaced by the
              * route for the reply and released at the end of the function.
              */
1358         {
1359                 struct flowi fl = { .oif = arg->bound_dev_if,
1360                                     .nl_u = { .ip4_u =
1361                                               { .daddr = daddr,
1362                                                 .saddr = rt->rt_spec_dst,
1363                                                 .tos = RT_TOS(ip_hdr(skb)->tos) } },
1364                                     /* Not quite clean, but right. */
1365                                     .uli_u = { .ports =
1366                                                { .sport = tcp_hdr(skb)->dest,
1367                                                  .dport = tcp_hdr(skb)->source } },
1368                                     .proto = sk->sk_protocol };
1369                 security_skb_classify_flow(skb, &fl);
1370                 if (ip_route_output_key(&rt, &fl))
1371                         return;
1372         }
1373
1374         /* And let IP do all the hard work.
1375
1376            This chunk is not reenterable, hence spinlock.
1377            Note that it uses the fact, that this function is called
1378            with locally disabled BH and that sk cannot be already spinlocked.
1379          */
1380         bh_lock_sock(sk);
1381         inet->tos = ip_hdr(skb)->tos;
1382         sk->sk_priority = skb->priority;
1383         sk->sk_protocol = ip_hdr(skb)->protocol;
1384         sk->sk_bound_dev_if = arg->bound_dev_if;
             /* The result of ip_append_data() is not checked; the reply is
              * pushed only if something ended up on the write queue.
              */
1385         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1386                        &ipc, rt, MSG_DONTWAIT);
1387         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                     /* Store the final transport checksum: the sum gathered in
                      * skb->csum plus arg->csum, folded to 16 bits.
                      */
1388                 if (arg->csumoffset >= 0)
1389                         *((__sum16 *)skb_transport_header(skb) +
1390                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1391                                                                 arg->csum));
1392                 skb->ip_summed = CHECKSUM_NONE;
1393                 ip_push_pending_frames(sk);
1394         }
1395
1396         bh_unlock_sock(sk);
1397
1398         ip_rt_put(rt);
1399 }
1400
     /*
      *      IP layer start-up: calls ip_rt_init() and inet_initpeers(), and
      *      igmp_mc_proc_init() when both CONFIG_IP_MULTICAST and
      *      CONFIG_PROC_FS are defined.
      */
1401 void __init ip_init(void)
1402 {
1403         ip_rt_init();
1404         inet_initpeers();
1405
1406 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1407         igmp_mc_proc_init();
1408 #endif
1409 }
1410
     /* Functions from this file made available through EXPORT_SYMBOL(). */
1411 EXPORT_SYMBOL(ip_generic_getfrag);
1412 EXPORT_SYMBOL(ip_queue_xmit);
1413 EXPORT_SYMBOL(ip_send_check);