net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Donald Becker, <becker@super.org>
  11  *              Alan Cox, <Alan.Cox@linux.org>
  12  *              Richard Underwood
  13  *              Stefan Becker, <stefanb@yello.ping.de>
  14  *              Jorge Cwik, <jorge@laser.satlink.net>
  15  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  16  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  17  *
  18  *      See ip_input.c for original log
  19  *
  20  *      Fixes:
  21  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  22  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  23  *              Bradford Johnson:       Fix faulty handling of some frames when
  24  *                                      no route is found.
  25  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  26  *                                      (in case if packet not accepted by
  27  *                                      output firewall rules)
  28  *              Mike McLagan    :       Routing by source
  29  *              Alexey Kuznetsov:       use new route cache
  30  *              Andi Kleen:             Fix broken PMTU recovery and remove
  31  *                                      some redundant tests.
  32  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  33  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  34  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  35  *                                      for decreased register pressure on x86
  36  *                                      and more readability.
  37  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  38  *                                      silently drop skb instead of failing with -EPERM.
  39  *              Detlev Wengorz  :       Copy protocol for fragments.
  40  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  41  *                                      datagrams.
  42  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  43  */
  44
  45 #include <asm/uaccess.h>
  46 #include <asm/system.h>
  47 #include <linux/module.h>
  48 #include <linux/types.h>
  49 #include <linux/kernel.h>
  50 #include <linux/mm.h>
  51 #include <linux/string.h>
  52 #include <linux/errno.h>
  53 #include <linux/highmem.h>
  54 #include <linux/slab.h>
  55
  56 #include <linux/socket.h>
  57 #include <linux/sockios.h>
  58 #include <linux/in.h>
  59 #include <linux/inet.h>
  60 #include <linux/netdevice.h>
  61 #include <linux/etherdevice.h>
  62 #include <linux/proc_fs.h>
  63 #include <linux/stat.h>
  64 #include <linux/init.h>
  65
  66 #include <net/snmp.h>
  67 #include <net/ip.h>
  68 #include <net/protocol.h>
  69 #include <net/route.h>
  70 #include <net/xfrm.h>
  71 #include <linux/skbuff.h>
  72 #include <net/sock.h>
  73 #include <net/arp.h>
  74 #include <net/icmp.h>
  75 #include <net/checksum.h>
  76 #include <net/inetpeer.h>
  77 #include <linux/igmp.h>
  78 #include <linux/netfilter_ipv4.h>
  79 #include <linux/netfilter_bridge.h>
  80 #include <linux/mroute.h>
  81 #include <linux/netlink.h>
  82 #include <linux/tcp.h>
  83
     /*
      * Default TTL value, initialised to IPDEFTTL.
      * NOTE(review): the name suggests this is tunable through sysctl; the
      * registration is not visible in this part of the file -- confirm.
      */
  84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  85
  86 /* Generate a checksum for an outgoing IP datagram. */
  87 __inline__ void ip_send_check(struct iphdr *iph)
  88 {
  89         iph->check = 0;
  90         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  91 }
  92
  93 int __ip_local_out(struct sk_buff *skb)
  94 {
  95         struct iphdr *iph = ip_hdr(skb);
  96
  97         iph->tot_len = htons(skb->len);
  98         ip_send_check(iph);
  99         return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
 100                        skb_dst(skb)->dev, dst_output);
 101 }
 102
 103 int ip_local_out(struct sk_buff *skb)
 104 {
 105         int err;
 106
 107         err = __ip_local_out(skb);
 108         if (likely(err == 1))
 109                 err = dst_output(skb);
 110
 111         return err;
 112 }
 113 EXPORT_SYMBOL_GPL(ip_local_out);
 114
 115 /* dev_loopback_xmit for use with netfilter. */
 116 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 117 {
 118         skb_reset_mac_header(newskb);
 119         __skb_pull(newskb, skb_network_offset(newskb));
 120         newskb->pkt_type = PACKET_LOOPBACK;
 121         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 122         WARN_ON(!skb_dst(newskb));
 123         netif_rx_ni(newskb);
 124         return 0;
 125 }
 126
 127 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 128 {
 129         int ttl = inet->uc_ttl;
 130
 131         if (ttl < 0)
 132                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 133         return ttl;
 134 }
 135
 136 /*
 137  *              Add an ip header to a skbuff and send it out.
 138  *
 139  */
 140 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 141                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 142 {
 143         struct inet_sock *inet = inet_sk(sk);
 144         struct rtable *rt = skb_rtable(skb);
 145         struct iphdr *iph;
 146
 147         /* Build the IP header. */
 148         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 149         skb_reset_network_header(skb);
 150         iph = ip_hdr(skb);
 151         iph->version  = 4;
 152         iph->ihl      = 5;
 153         iph->tos      = inet->tos;
 154         if (ip_dont_fragment(sk, &rt->u.dst))
 155                 iph->frag_off = htons(IP_DF);
 156         else
 157                 iph->frag_off = 0;
 158         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 159         iph->daddr    = rt->rt_dst;
 160         iph->saddr    = rt->rt_src;
 161         iph->protocol = sk->sk_protocol;
 162         ip_select_ident(iph, &rt->u.dst, sk);
 163
 164         if (opt && opt->optlen) {
 165                 iph->ihl += opt->optlen>>2;
 166                 ip_options_build(skb, opt, daddr, rt, 0);
 167         }
 168
 169         skb->priority = sk->sk_priority;
 170         skb->mark = sk->sk_mark;
 171
 172         /* Send it out. */
 173         return ip_local_out(skb);
 174 }
 175
 176 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 177
 178 static inline int ip_finish_output2(struct sk_buff *skb)
 179 {
 180         struct dst_entry *dst = skb_dst(skb);
 181         struct rtable *rt = (struct rtable *)dst;
 182         struct net_device *dev = dst->dev;
 183         unsigned int hh_len = LL_RESERVED_SPACE(dev);
 184
 185         if (rt->rt_type == RTN_MULTICAST) {
 186                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
 187         } else if (rt->rt_type == RTN_BROADCAST)
 188                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
 189
 190         /* Be paranoid, rather than too clever. */
 191         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 192                 struct sk_buff *skb2;
 193
 194                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 195                 if (skb2 == NULL) {
 196                         kfree_skb(skb);
 197                         return -ENOMEM;
 198                 }
 199                 if (skb->sk)
 200                         skb_set_owner_w(skb2, skb->sk);
 201                 kfree_skb(skb);
 202                 skb = skb2;
 203         }
 204
 205         if (dst->hh)
 206                 return neigh_hh_output(dst->hh, skb);
 207         else if (dst->neighbour)
 208                 return dst->neighbour->output(skb);
 209
 210         if (net_ratelimit())
 211                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 212         kfree_skb(skb);
 213         return -EINVAL;
 214 }
 215
 216 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 217 {
 218         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 219
 220         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 221                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 222 }
 223
 224 static int ip_finish_output(struct sk_buff *skb)
 225 {
 226 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 227         /* Policy lookup after SNAT yielded a new policy */
 228         if (skb_dst(skb)->xfrm != NULL) {
 229                 IPCB(skb)->flags |= IPSKB_REROUTED;
 230                 return dst_output(skb);
 231         }
 232 #endif
 233         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 234                 return ip_fragment(skb, ip_finish_output2);
 235         else
 236                 return ip_finish_output2(skb);
 237 }
 238
 239 int ip_mc_output(struct sk_buff *skb)
 240 {
 241         struct sock *sk = skb->sk;
 242         struct rtable *rt = skb_rtable(skb);
 243         struct net_device *dev = rt->u.dst.dev;
 244
 245         /*
 246          *      If the indicated interface is up and running, send the packet.
 247          */
 248         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 249
 250         skb->dev = dev;
 251         skb->protocol = htons(ETH_P_IP);
 252
 253         /*
 254          *      Multicasts are looped back for other local users
 255          */
 256
 257         if (rt->rt_flags&RTCF_MULTICAST) {
 258                 if (sk_mc_loop(sk)
 259 #ifdef CONFIG_IP_MROUTE
 260                 /* Small optimization: do not loopback not local frames,
 261                    which returned after forwarding; they will be  dropped
 262                    by ip_mr_input in any case.
 263                    Note, that local frames are looped back to be delivered
 264                    to local recipients.
 265
 266                    This check is duplicated in ip_mr_input at the moment.
 267                  */
 268                     &&
 269                     ((rt->rt_flags & RTCF_LOCAL) ||
 270                      !(IPCB(skb)->flags & IPSKB_FORWARDED))
 271 #endif
 272                    ) {
 273                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 274                         if (newskb)
 275                                 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 276                                         newskb, NULL, newskb->dev,
 277                                         ip_dev_loopback_xmit);
 278                 }
 279
 280                 /* Multicasts with ttl 0 must not go beyond the host */
 281
 282                 if (ip_hdr(skb)->ttl == 0) {
 283                         kfree_skb(skb);
 284                         return 0;
 285                 }
 286         }
 287
 288         if (rt->rt_flags&RTCF_BROADCAST) {
 289                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 290                 if (newskb)
 291                         NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
 292                                 NULL, newskb->dev, ip_dev_loopback_xmit);
 293         }
 294
 295         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
 296                             skb->dev, ip_finish_output,
 297                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 298 }
 299
 300 int ip_output(struct sk_buff *skb)
 301 {
 302         struct net_device *dev = skb_dst(skb)->dev;
 303
 304         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 305
 306         skb->dev = dev;
 307         skb->protocol = htons(ETH_P_IP);
 308
 309         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
 310                             ip_finish_output,
 311                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 312 }
 313
 314 int ip_queue_xmit(struct sk_buff *skb)
 315 {
 316         struct sock *sk = skb->sk;
 317         struct inet_sock *inet = inet_sk(sk);
 318         struct ip_options *opt = inet->opt;
 319         struct rtable *rt;
 320         struct iphdr *iph;
 321
 322         /* Skip all of this if the packet is already routed,
 323          * f.e. by something like SCTP.
 324          */
 325         rt = skb_rtable(skb);
 326         if (rt != NULL)
 327                 goto packet_routed;
 328
 329         /* Make sure we can route this packet. */
 330         rt = (struct rtable *)__sk_dst_check(sk, 0);
 331         if (rt == NULL) {
 332                 __be32 daddr;
 333
 334                 /* Use correct destination address if we have options. */
 335                 daddr = inet->inet_daddr;
 336                 if(opt && opt->srr)
 337                         daddr = opt->faddr;
 338
 339                 {
 340                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 341                                             .mark = sk->sk_mark,
 342                                             .nl_u = { .ip4_u =
 343                                                       { .daddr = daddr,
 344                                                         .saddr = inet->inet_saddr,
 345                                                         .tos = RT_CONN_FLAGS(sk) } },
 346                                             .proto = sk->sk_protocol,
 347                                             .flags = inet_sk_flowi_flags(sk),
 348                                             .uli_u = { .ports =
 349                                                        { .sport = inet->inet_sport,
 350                                                          .dport = inet->inet_dport } } };
 351
 352                         /* If this fails, retransmit mechanism of transport layer will
 353                          * keep trying until route appears or the connection times
 354                          * itself out.
 355                          */
 356                         security_sk_classify_flow(sk, &fl);
 357                         if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
 358                                 goto no_route;
 359                 }
 360                 sk_setup_caps(sk, &rt->u.dst);
 361         }
 362         skb_dst_set(skb, dst_clone(&rt->u.dst));
 363
 364 packet_routed:
 365         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 366                 goto no_route;
 367
 368         /* OK, we know where to send it, allocate and build IP header. */
 369         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 370         skb_reset_network_header(skb);
 371         iph = ip_hdr(skb);
 372         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 373         if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df)
 374                 iph->frag_off = htons(IP_DF);
 375         else
 376                 iph->frag_off = 0;
 377         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 378         iph->protocol = sk->sk_protocol;
 379         iph->saddr    = rt->rt_src;
 380         iph->daddr    = rt->rt_dst;
 381         /* Transport layer set skb->h.foo itself. */
 382
 383         if (opt && opt->optlen) {
 384                 iph->ihl += opt->optlen >> 2;
 385                 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
 386         }
 387
 388         ip_select_ident_more(iph, &rt->u.dst, sk,
 389                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 390
 391         skb->priority = sk->sk_priority;
 392         skb->mark = sk->sk_mark;
 393
 394         return ip_local_out(skb);
 395
 396 no_route:
 397         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 398         kfree_skb(skb);
 399         return -EHOSTUNREACH;
 400 }
 401
 402
 403 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 404 {
 405         to->pkt_type = from->pkt_type;
 406         to->priority = from->priority;
 407         to->protocol = from->protocol;
 408         skb_dst_drop(to);
 409         skb_dst_set(to, dst_clone(skb_dst(from)));
 410         to->dev = from->dev;
 411         to->mark = from->mark;
 412
 413         /* Copy the flags to each fragment. */
 414         IPCB(to)->flags = IPCB(from)->flags;
 415
 416 #ifdef CONFIG_NET_SCHED
 417         to->tc_index = from->tc_index;
 418 #endif
 419         nf_copy(to, from);
 420 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 421     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 422         to->nf_trace = from->nf_trace;
 423 #endif
 424 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 425         to->ipvs_property = from->ipvs_property;
 426 #endif
 427         skb_copy_secmark(to, from);
 428 }
 429
 430 /*
 431  *      This IP datagram is too large to be sent in one piece.  Break it up into
 432  *      smaller pieces (each of size equal to IP header plus
 433  *      a block of the data of the original IP data part) that will yet fit in a
 434  *      single device frame, and queue such a frame for sending.
 435  */
 436
 437 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 438 {
 439         struct iphdr *iph;
 440         int raw = 0;
 441         int ptr;
 442         struct net_device *dev;
 443         struct sk_buff *skb2;
 444         unsigned int mtu, hlen, left, len, ll_rs, pad;
 445         int offset;
 446         __be16 not_last_frag;
 447         struct rtable *rt = skb_rtable(skb);
 448         int err = 0;
 449
 450         dev = rt->u.dst.dev;
 451
 452         /*
 453          *      Point into the IP datagram header.
 454          */
 455
 456         iph = ip_hdr(skb);
 457
 458         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 459                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 460                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 461                           htonl(ip_skb_dst_mtu(skb)));
 462                 kfree_skb(skb);
 463                 return -EMSGSIZE;
 464         }
 465
 466         /*
 467          *      Setup starting values.
 468          */
 469
 470         hlen = iph->ihl * 4;
 471         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 472 #ifdef CONFIG_BRIDGE_NETFILTER
 473         if (skb->nf_bridge)
 474                 mtu -= nf_bridge_mtu_reduction(skb);
 475 #endif
 476         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 477
 478         /* When frag_list is given, use it. First, check its validity:
 479          * some transformers could create wrong frag_list or break existing
 480          * one, it is not prohibited. In this case fall back to copying.
 481          *
 482          * LATER: this step can be merged to real generation of fragments,
 483          * we can switch to copy when see the first bad fragment.
 484          */
 485         if (skb_has_frags(skb)) {
 486                 struct sk_buff *frag;
 487                 int first_len = skb_pagelen(skb);
 488                 int truesizes = 0;
 489
 490                 if (first_len - hlen > mtu ||
 491                     ((first_len - hlen) & 7) ||
 492                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 493                     skb_cloned(skb))
 494                         goto slow_path;
 495
 496                 skb_walk_frags(skb, frag) {
 497                         /* Correct geometry. */
 498                         if (frag->len > mtu ||
 499                             ((frag->len & 7) && frag->next) ||
 500                             skb_headroom(frag) < hlen)
 501                             goto slow_path;
 502
 503                         /* Partially cloned skb? */
 504                         if (skb_shared(frag))
 505                                 goto slow_path;
 506
 507                         BUG_ON(frag->sk);
 508                         if (skb->sk) {
 509                                 frag->sk = skb->sk;
 510                                 frag->destructor = sock_wfree;
 511                         }
 512                         truesizes += frag->truesize;
 513                 }
 514
 515                 /* Everything is OK. Generate! */
 516
 517                 err = 0;
 518                 offset = 0;
 519                 frag = skb_shinfo(skb)->frag_list;
 520                 skb_frag_list_init(skb);
 521                 skb->data_len = first_len - skb_headlen(skb);
 522                 skb->truesize -= truesizes;
 523                 skb->len = first_len;
 524                 iph->tot_len = htons(first_len);
 525                 iph->frag_off = htons(IP_MF);
 526                 ip_send_check(iph);
 527
 528                 for (;;) {
 529                         /* Prepare header of the next frame,
 530                          * before previous one went down. */
 531                         if (frag) {
 532                                 frag->ip_summed = CHECKSUM_NONE;
 533                                 skb_reset_transport_header(frag);
 534                                 __skb_push(frag, hlen);
 535                                 skb_reset_network_header(frag);
 536                                 memcpy(skb_network_header(frag), iph, hlen);
 537                                 iph = ip_hdr(frag);
 538                                 iph->tot_len = htons(frag->len);
 539                                 ip_copy_metadata(frag, skb);
 540                                 if (offset == 0)
 541                                         ip_options_fragment(frag);
 542                                 offset += skb->len - hlen;
 543                                 iph->frag_off = htons(offset>>3);
 544                                 if (frag->next != NULL)
 545                                         iph->frag_off |= htons(IP_MF);
 546                                 /* Ready, complete checksum */
 547                                 ip_send_check(iph);
 548                         }
 549
 550                         err = output(skb);
 551
 552                         if (!err)
 553                                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 554                         if (err || !frag)
 555                                 break;
 556
 557                         skb = frag;
 558                         frag = skb->next;
 559                         skb->next = NULL;
 560                 }
 561
 562                 if (err == 0) {
 563                         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 564                         return 0;
 565                 }
 566
 567                 while (frag) {
 568                         skb = frag->next;
 569                         kfree_skb(frag);
 570                         frag = skb;
 571                 }
 572                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 573                 return err;
 574         }
 575
 576 slow_path:
 577         left = skb->len - hlen;         /* Space per frame */
 578         ptr = raw + hlen;               /* Where to start from */
 579
 580         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 581          * we need to make room for the encapsulating header
 582          */
 583         pad = nf_bridge_pad(skb);
 584         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
 585         mtu -= pad;
 586
 587         /*
 588          *      Fragment the datagram.
 589          */
 590
 591         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 592         not_last_frag = iph->frag_off & htons(IP_MF);
 593
 594         /*
 595          *      Keep copying data until we run out.
 596          */
 597
 598         while (left > 0) {
 599                 len = left;
 600                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 601                 if (len > mtu)
 602                         len = mtu;
 603                 /* IF: we are not sending upto and including the packet end
 604                    then align the next start on an eight byte boundary */
 605                 if (len < left) {
 606                         len &= ~7;
 607                 }
 608                 /*
 609                  *      Allocate buffer.
 610                  */
 611
 612                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 613                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 614                         err = -ENOMEM;
 615                         goto fail;
 616                 }
 617
 618                 /*
 619                  *      Set up data on packet
 620                  */
 621
 622                 ip_copy_metadata(skb2, skb);
 623                 skb_reserve(skb2, ll_rs);
 624                 skb_put(skb2, len + hlen);
 625                 skb_reset_network_header(skb2);
 626                 skb2->transport_header = skb2->network_header + hlen;
 627
 628                 /*
 629                  *      Charge the memory for the fragment to any owner
 630                  *      it might possess
 631                  */
 632
 633                 if (skb->sk)
 634                         skb_set_owner_w(skb2, skb->sk);
 635
 636                 /*
 637                  *      Copy the packet header into the new buffer.
 638                  */
 639
 640                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 641
 642                 /*
 643                  *      Copy a block of the IP datagram.
 644                  */
 645                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 646                         BUG();
 647                 left -= len;
 648
 649                 /*
 650                  *      Fill in the new header fields.
 651                  */
 652                 iph = ip_hdr(skb2);
 653                 iph->frag_off = htons((offset >> 3));
 654
 655                 /* ANK: dirty, but effective trick. Upgrade options only if
 656                  * the segment to be fragmented was THE FIRST (otherwise,
 657                  * options are already fixed) and make it ONCE
 658                  * on the initial skb, so that all the following fragments
 659                  * will inherit fixed options.
 660                  */
 661                 if (offset == 0)
 662                         ip_options_fragment(skb);
 663
 664                 /*
 665                  *      Added AC : If we are fragmenting a fragment that's not the
 666                  *                 last fragment then keep MF on each bit
 667                  */
 668                 if (left > 0 || not_last_frag)
 669                         iph->frag_off |= htons(IP_MF);
 670                 ptr += len;
 671                 offset += len;
 672
 673                 /*
 674                  *      Put this fragment into the sending queue.
 675                  */
 676                 iph->tot_len = htons(len + hlen);
 677
 678                 ip_send_check(iph);
 679
 680                 err = output(skb2);
 681                 if (err)
 682                         goto fail;
 683
 684                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 685         }
 686         kfree_skb(skb);
 687         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 688         return err;
 689
 690 fail:
 691         kfree_skb(skb);
 692         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 693         return err;
 694 }
 695
 696 EXPORT_SYMBOL(ip_fragment);
 697
 698 int
 699 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 700 {
 701         struct iovec *iov = from;
 702
 703         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 704                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 705                         return -EFAULT;
 706         } else {
 707                 __wsum csum = 0;
 708                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 709                         return -EFAULT;
 710                 skb->csum = csum_block_add(skb->csum, csum, odd);
 711         }
 712         return 0;
 713 }
 714
 715 static inline __wsum
 716 csum_page(struct page *page, int offset, int copy)
 717 {
 718         char *kaddr;
 719         __wsum csum;
 720         kaddr = kmap(page);
 721         csum = csum_partial(kaddr + offset, copy, 0);
 722         kunmap(page);
 723         return csum;
 724 }
 725
 726 static inline int ip_ufo_append_data(struct sock *sk,
 727                         int getfrag(void *from, char *to, int offset, int len,
 728                                int odd, struct sk_buff *skb),
 729                         void *from, int length, int hh_len, int fragheaderlen,
 730                         int transhdrlen, int mtu, unsigned int flags)
 731 {
 732         struct sk_buff *skb;
 733         int err;
 734
 735         /* There is support for UDP fragmentation offload by network
 736          * device, so create one single skb packet containing complete
 737          * udp datagram
 738          */
 739         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
 740                 skb = sock_alloc_send_skb(sk,
 741                         hh_len + fragheaderlen + transhdrlen + 20,
 742                         (flags & MSG_DONTWAIT), &err);
 743
 744                 if (skb == NULL)
 745                         return err;
 746
 747                 /* reserve space for Hardware header */
 748                 skb_reserve(skb, hh_len);
 749
 750                 /* create space for UDP/IP header */
 751                 skb_put(skb, fragheaderlen + transhdrlen);
 752
 753                 /* initialize network header pointer */
 754                 skb_reset_network_header(skb);
 755
 756                 /* initialize protocol header pointer */
 757                 skb->transport_header = skb->network_header + fragheaderlen;
 758
 759                 skb->ip_summed = CHECKSUM_PARTIAL;
 760                 skb->csum = 0;
 761                 sk->sk_sndmsg_off = 0;
 762
 763                 /* specify the length of each IP datagram fragment */
 764                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 765                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 766                 __skb_queue_tail(&sk->sk_write_queue, skb);
 767         }
 768
 769         return skb_append_datato_frags(sk, skb, getfrag, from,
 770                                        (length - transhdrlen));
 771 }
 772
 773 /*
 774  *      ip_append_data() and ip_append_page() can make one large IP datagram
  775  *      from many pieces of data. Each piece will be held on the socket
 776  *      until ip_push_pending_frames() is called. Each piece can be a page
 777  *      or non-page data.
 778  *
 779  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 780  *      this interface potentially.
 781  *
 782  *      LATER: length must be adjusted by pad at tail, when it is required.
 783  */
 784 int ip_append_data(struct sock *sk,
 785                    int getfrag(void *from, char *to, int offset, int len,
 786                                int odd, struct sk_buff *skb),
 787                    void *from, int length, int transhdrlen,
 788                    struct ipcm_cookie *ipc, struct rtable **rtp,
 789                    unsigned int flags)
 790 {
 791         struct inet_sock *inet = inet_sk(sk);
 792         struct sk_buff *skb;
 793
 794         struct ip_options *opt = NULL;
 795         int hh_len;
 796         int exthdrlen;
 797         int mtu;
 798         int copy;
 799         int err;
 800         int offset = 0;
 801         unsigned int maxfraglen, fragheaderlen;
 802         int csummode = CHECKSUM_NONE;
 803         struct rtable *rt;
 804
 805         if (flags&MSG_PROBE)
 806                 return 0;
 807
 808         if (skb_queue_empty(&sk->sk_write_queue)) {
 809                 /*
 810                  * setup for corking.
 811                  */
 812                 opt = ipc->opt;
 813                 if (opt) {
 814                         if (inet->cork.opt == NULL) {
 815                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 816                                 if (unlikely(inet->cork.opt == NULL))
 817                                         return -ENOBUFS;
 818                         }
 819                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 820                         inet->cork.flags |= IPCORK_OPT;
 821                         inet->cork.addr = ipc->addr;
 822                 }
 823                 rt = *rtp;
 824                 if (unlikely(!rt))
 825                         return -EFAULT;
 826                 /*
 827                  * We steal reference to this route, caller should not release it
 828                  */
 829                 *rtp = NULL;
 830                 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
 831                                             rt->u.dst.dev->mtu :
 832                                             dst_mtu(rt->u.dst.path);
 833                 inet->cork.dst = &rt->u.dst;
 834                 inet->cork.length = 0;
 835                 sk->sk_sndmsg_page = NULL;
 836                 sk->sk_sndmsg_off = 0;
 837                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 838                         length += exthdrlen;
 839                         transhdrlen += exthdrlen;
 840                 }
 841         } else {
 842                 rt = (struct rtable *)inet->cork.dst;
 843                 if (inet->cork.flags & IPCORK_OPT)
 844                         opt = inet->cork.opt;
 845
 846                 transhdrlen = 0;
 847                 exthdrlen = 0;
 848                 mtu = inet->cork.fragsize;
 849         }
 850         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 851
 852         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 853         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 854
 855         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 856                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
 857                                mtu-exthdrlen);
 858                 return -EMSGSIZE;
 859         }
 860
 861         /*
 862          * transhdrlen > 0 means that this is the first fragment and we wish
 863          * it won't be fragmented in the future.
 864          */
 865         if (transhdrlen &&
 866             length + fragheaderlen <= mtu &&
 867             rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
 868             !exthdrlen)
 869                 csummode = CHECKSUM_PARTIAL;
 870
 871         inet->cork.length += length;
 872         if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
 873             (sk->sk_protocol == IPPROTO_UDP) &&
 874             (rt->u.dst.dev->features & NETIF_F_UFO)) {
 875                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 876                                          fragheaderlen, transhdrlen, mtu,
 877                                          flags);
 878                 if (err)
 879                         goto error;
 880                 return 0;
 881         }
 882
 883         /* So, what's going on in the loop below?
 884          *
 885          * We use calculated fragment length to generate chained skb,
 886          * each of segments is IP fragment ready for sending to network after
 887          * adding appropriate IP header.
 888          */
 889
 890         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 891                 goto alloc_new_skb;
 892
 893         while (length > 0) {
 894                 /* Check if the remaining data fits into current packet. */
 895                 copy = mtu - skb->len;
 896                 if (copy < length)
 897                         copy = maxfraglen - skb->len;
 898                 if (copy <= 0) {
 899                         char *data;
 900                         unsigned int datalen;
 901                         unsigned int fraglen;
 902                         unsigned int fraggap;
 903                         unsigned int alloclen;
 904                         struct sk_buff *skb_prev;
 905 alloc_new_skb:
 906                         skb_prev = skb;
 907                         if (skb_prev)
 908                                 fraggap = skb_prev->len - maxfraglen;
 909                         else
 910                                 fraggap = 0;
 911
 912                         /*
 913                          * If remaining data exceeds the mtu,
 914                          * we know we need more fragment(s).
 915                          */
 916                         datalen = length + fraggap;
 917                         if (datalen > mtu - fragheaderlen)
 918                                 datalen = maxfraglen - fragheaderlen;
 919                         fraglen = datalen + fragheaderlen;
 920
 921                         if ((flags & MSG_MORE) &&
 922                             !(rt->u.dst.dev->features&NETIF_F_SG))
 923                                 alloclen = mtu;
 924                         else
 925                                 alloclen = datalen + fragheaderlen;
 926
 927                         /* The last fragment gets additional space at tail.
 928                          * Note, with MSG_MORE we overallocate on fragments,
 929                          * because we have no idea what fragment will be
 930                          * the last.
 931                          */
 932                         if (datalen == length + fraggap)
 933                                 alloclen += rt->u.dst.trailer_len;
 934
 935                         if (transhdrlen) {
 936                                 skb = sock_alloc_send_skb(sk,
 937                                                 alloclen + hh_len + 15,
 938                                                 (flags & MSG_DONTWAIT), &err);
 939                         } else {
 940                                 skb = NULL;
 941                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 942                                     2 * sk->sk_sndbuf)
 943                                         skb = sock_wmalloc(sk,
 944                                                            alloclen + hh_len + 15, 1,
 945                                                            sk->sk_allocation);
 946                                 if (unlikely(skb == NULL))
 947                                         err = -ENOBUFS;
 948                                 else
 949                                         /* only the initial fragment is
 950                                            time stamped */
 951                                         ipc->shtx.flags = 0;
 952                         }
 953                         if (skb == NULL)
 954                                 goto error;
 955
 956                         /*
 957                          *      Fill in the control structures
 958                          */
 959                         skb->ip_summed = csummode;
 960                         skb->csum = 0;
 961                         skb_reserve(skb, hh_len);
 962                         *skb_tx(skb) = ipc->shtx;
 963
 964                         /*
 965                          *      Find where to start putting bytes.
 966                          */
 967                         data = skb_put(skb, fraglen);
 968                         skb_set_network_header(skb, exthdrlen);
 969                         skb->transport_header = (skb->network_header +
 970                                                  fragheaderlen);
 971                         data += fragheaderlen;
 972
 973                         if (fraggap) {
 974                                 skb->csum = skb_copy_and_csum_bits(
 975                                         skb_prev, maxfraglen,
 976                                         data + transhdrlen, fraggap, 0);
 977                                 skb_prev->csum = csum_sub(skb_prev->csum,
 978                                                           skb->csum);
 979                                 data += fraggap;
 980                                 pskb_trim_unique(skb_prev, maxfraglen);
 981                         }
 982
 983                         copy = datalen - transhdrlen - fraggap;
 984                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 985                                 err = -EFAULT;
 986                                 kfree_skb(skb);
 987                                 goto error;
 988                         }
 989
 990                         offset += copy;
 991                         length -= datalen - fraggap;
 992                         transhdrlen = 0;
 993                         exthdrlen = 0;
 994                         csummode = CHECKSUM_NONE;
 995
 996                         /*
 997                          * Put the packet on the pending queue.
 998                          */
 999                         __skb_queue_tail(&sk->sk_write_queue, skb);
1000                         continue;
1001                 }
1002
1003                 if (copy > length)
1004                         copy = length;
1005
1006                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1007                         unsigned int off;
1008
1009                         off = skb->len;
1010                         if (getfrag(from, skb_put(skb, copy),
1011                                         offset, copy, off, skb) < 0) {
1012                                 __skb_trim(skb, off);
1013                                 err = -EFAULT;
1014                                 goto error;
1015                         }
1016                 } else {
1017                         int i = skb_shinfo(skb)->nr_frags;
1018                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1019                         struct page *page = sk->sk_sndmsg_page;
1020                         int off = sk->sk_sndmsg_off;
1021                         unsigned int left;
1022
1023                         if (page && (left = PAGE_SIZE - off) > 0) {
1024                                 if (copy >= left)
1025                                         copy = left;
1026                                 if (page != frag->page) {
1027                                         if (i == MAX_SKB_FRAGS) {
1028                                                 err = -EMSGSIZE;
1029                                                 goto error;
1030                                         }
1031                                         get_page(page);
1032                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1033                                         frag = &skb_shinfo(skb)->frags[i];
1034                                 }
1035                         } else if (i < MAX_SKB_FRAGS) {
1036                                 if (copy > PAGE_SIZE)
1037                                         copy = PAGE_SIZE;
1038                                 page = alloc_pages(sk->sk_allocation, 0);
1039                                 if (page == NULL)  {
1040                                         err = -ENOMEM;
1041                                         goto error;
1042                                 }
1043                                 sk->sk_sndmsg_page = page;
1044                                 sk->sk_sndmsg_off = 0;
1045
1046                                 skb_fill_page_desc(skb, i, page, 0, 0);
1047                                 frag = &skb_shinfo(skb)->frags[i];
1048                         } else {
1049                                 err = -EMSGSIZE;
1050                                 goto error;
1051                         }
1052                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1053                                 err = -EFAULT;
1054                                 goto error;
1055                         }
1056                         sk->sk_sndmsg_off += copy;
1057                         frag->size += copy;
1058                         skb->len += copy;
1059                         skb->data_len += copy;
1060                         skb->truesize += copy;
1061                         atomic_add(copy, &sk->sk_wmem_alloc);
1062                 }
1063                 offset += copy;
1064                 length -= copy;
1065         }
1066
1067         return 0;
1068
1069 error:
1070         inet->cork.length -= length;
1071         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1072         return err;
1073 }
1074
     /*
      *      Append @size bytes of @page, starting at @offset, to the datagram
      *      that is already pending on the socket. The page is attached to
      *      the tail skb as a fragment with its own page reference; the data
      *      is not copied.
      *
      *      Fails with -EPERM for hdrincl sockets, -EINVAL when nothing is
      *      queued yet, -EOPNOTSUPP when the output device lacks NETIF_F_SG,
      *      -EMSGSIZE when the datagram would exceed 0xFFFF bytes or the skb
      *      has no fragment slot left, and -ENOBUFS when a new skb cannot be
      *      allocated. MSG_PROBE returns 0 without queueing anything.
      *
      *      Returns 0 on success (not the number of bytes appended).
      */
1075 ssize_t ip_append_page(struct sock *sk, struct page *page,
1076                        int offset, size_t size, int flags)
1077 {
1078         struct inet_sock *inet = inet_sk(sk);
1079         struct sk_buff *skb;
1080         struct rtable *rt;
1081         struct ip_options *opt = NULL;
1082         int hh_len;
1083         int mtu;
1084         int len;
1085         int err;
1086         unsigned int maxfraglen, fragheaderlen, fraggap;
1087
1088         if (inet->hdrincl)
1089                 return -EPERM;
1090
1091         if (flags&MSG_PROBE)
1092                 return 0;
1093
             /* Pages can only be added to a datagram that was already started. */
1094         if (skb_queue_empty(&sk->sk_write_queue))
1095                 return -EINVAL;
1096
             /*
              * Route, options and MTU come from the cork state that
              * ip_append_data() stored when the datagram was started.
              */
1097         rt = (struct rtable *)inet->cork.dst;
1098         if (inet->cork.flags & IPCORK_OPT)
1099                 opt = inet->cork.opt;
1100
1101         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1102                 return -EOPNOTSUPP;
1103
1104         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1105         mtu = inet->cork.fragsize;
1106
             /*
              * Same sizing as in ip_append_data(): the fragment payload is
              * rounded down to a multiple of 8 bytes.
              */
1107         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1108         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1109
1110         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1111                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1112                 return -EMSGSIZE;
1113         }
1114
1115         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1116                 return -EINVAL;
1117
1118         inet->cork.length += size;
             /*
              * UDP on a UFO-capable device: mark the tail skb with a GSO size
              * and type. Presumably skb_is_gso() then reports true, so the
              * loop below attaches all of the data to this skb instead of
              * splitting it per MTU - verify against skb_is_gso().
              */
1119         if ((sk->sk_protocol == IPPROTO_UDP) &&
1120             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1121                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1122                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1123         }
1124
1125
1126         while (size > 0) {
1127                 int i;
1128
1129                 if (skb_is_gso(skb))
1130                         len = size;
1131                 else {
1132
1133                         /* Check if the remaining data fits into current packet. */
1134                         len = mtu - skb->len;
1135                         if (len < size)
1136                                 len = maxfraglen - skb->len;
1137                 }
1138                 if (len <= 0) {
                             /*
                              * The tail skb is full. The new skb only needs
                              * linear room for the IP header and the bytes
                              * moved over from skb_prev; the payload itself is
                              * attached as page fragments.
                              */
1139                         struct sk_buff *skb_prev;
1140                         int alloclen;
1141
1142                         skb_prev = skb;
1143                         fraggap = skb_prev->len - maxfraglen;
1144
1145                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1146                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1147                         if (unlikely(!skb)) {
1148                                 err = -ENOBUFS;
1149                                 goto error;
1150                         }
1151
1152                         /*
1153                          *      Fill in the control structures
1154                          */
1155                         skb->ip_summed = CHECKSUM_NONE;
1156                         skb->csum = 0;
1157                         skb_reserve(skb, hh_len);
1158
1159                         /*
1160                          *      Find where to start putting bytes.
1161                          */
1162                         skb_put(skb, fragheaderlen + fraggap);
1163                         skb_reset_network_header(skb);
1164                         skb->transport_header = (skb->network_header +
1165                                                  fragheaderlen);
                             /*
                              * Move the bytes beyond maxfraglen from skb_prev
                              * into the new skb, adjust both checksums and
                              * trim skb_prev.
                              */
1166                         if (fraggap) {
1167                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1168                                                                    maxfraglen,
1169                                                     skb_transport_header(skb),
1170                                                                    fraggap, 0);
1171                                 skb_prev->csum = csum_sub(skb_prev->csum,
1172                                                           skb->csum);
1173                                 pskb_trim_unique(skb_prev, maxfraglen);
1174                         }
1175
1176                         /*
1177                          * Put the packet on the pending queue.
1178                          */
1179                         __skb_queue_tail(&sk->sk_write_queue, skb);
1180                         continue;
1181                 }
1182
1183                 i = skb_shinfo(skb)->nr_frags;
1184                 if (len > size)
1185                         len = size;
                     /*
                      * Grow the last fragment when skb_can_coalesce() allows
                      * it; otherwise take a page reference and fill the next
                      * fragment slot.
                      */
1186                 if (skb_can_coalesce(skb, i, page, offset)) {
1187                         skb_shinfo(skb)->frags[i-1].size += len;
1188                 } else if (i < MAX_SKB_FRAGS) {
1189                         get_page(page);
1190                         skb_fill_page_desc(skb, i, page, offset, len);
1191                 } else {
1192                         err = -EMSGSIZE;
1193                         goto error;
1194                 }
1195
                     /*
                      * Checksum kept in software: add this chunk's checksum at
                      * its offset within the skb.
                      */
1196                 if (skb->ip_summed == CHECKSUM_NONE) {
1197                         __wsum csum;
1198                         csum = csum_page(page, offset, len);
1199                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1200                 }
1201
1202                 skb->len += len;
1203                 skb->data_len += len;
1204                 skb->truesize += len;
1205                 atomic_add(len, &sk->sk_wmem_alloc);
1206                 offset += len;
1207                 size -= len;
1208         }
1209         return 0;
1210
1211 error:
             /* Give back the part of size that was not attached. */
1212         inet->cork.length -= size;
1213         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1214         return err;
1215 }
1216
     /*
      *      Release the corking state kept in @inet: clear IPCORK_OPT, free
      *      the saved IP options and drop the route reference held in
      *      cork.dst. Both pointers are reset to NULL afterwards.
      */
1217 static void ip_cork_release(struct inet_sock *inet)
1218 {
1219         inet->cork.flags &= ~IPCORK_OPT;
1220         kfree(inet->cork.opt);
1221         inet->cork.opt = NULL;
1222         dst_release(inet->cork.dst);
1223         inet->cork.dst = NULL;
1224 }
1225
1226 /*
1227  *      Combine all pending IP fragments on the socket into one IP datagram
1228  *      and push them out.
      *
      *      The first queued skb becomes the head and receives the IP header;
      *      every further skb is chained onto the head's frag_list. The route
      *      reference held in cork.dst is handed over to the head skb, and the
      *      cork state is released on every path.
      *
      *      Returns 0 on success or when nothing was queued; otherwise the
      *      error from ip_local_out(), with positive values mapped through
      *      net_xmit_errno().
1229  */
1230 int ip_push_pending_frames(struct sock *sk)
1231 {
1232         struct sk_buff *skb, *tmp_skb;
1233         struct sk_buff **tail_skb;
1234         struct inet_sock *inet = inet_sk(sk);
1235         struct net *net = sock_net(sk);
1236         struct ip_options *opt = NULL;
1237         struct rtable *rt = (struct rtable *)inet->cork.dst;
1238         struct iphdr *iph;
1239         __be16 df = 0;
1240         __u8 ttl;
1241         int err = 0;
1242
1243         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1244                 goto out;
1245         tail_skb = &(skb_shinfo(skb)->frag_list);
1246
1247         /* move skb->data to ip header from ext header */
1248         if (skb->data < skb_network_header(skb))
1249                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Chain the remaining skbs onto the head: strip the header room
              * from each, add its length and truesize to the head, and detach
              * it from the socket (destructor and sk are cleared).
              */
1250         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1251                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1252                 *tail_skb = tmp_skb;
1253                 tail_skb = &(tmp_skb->next);
1254                 skb->len += tmp_skb->len;
1255                 skb->data_len += tmp_skb->len;
1256                 skb->truesize += tmp_skb->truesize;
1257                 tmp_skb->destructor = NULL;
1258                 tmp_skb->sk = NULL;
1259         }
1260
1261         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1262          * to fragment the frame generated here. No matter how transforms
1263          * change the size of the packet, it will come out.
1264          */
1265         if (inet->pmtudisc < IP_PMTUDISC_DO)
1266                 skb->local_df = 1;
1267
1268         /* DF bit is set when we want to see DF on outgoing frames.
1269          * If local_df is set too, we still allow to fragment this frame
1270          * locally. */
1271         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1272             (skb->len <= dst_mtu(&rt->u.dst) &&
1273              ip_dont_fragment(sk, &rt->u.dst)))
1274                 df = htons(IP_DF);
1275
1276         if (inet->cork.flags & IPCORK_OPT)
1277                 opt = inet->cork.opt;
1278
1279         if (rt->rt_type == RTN_MULTICAST)
1280                 ttl = inet->mc_ttl;
1281         else
1282                 ttl = ip_select_ttl(inet, &rt->u.dst);
1283
             /*
              * Build the IP header at skb->data. ihl starts at 5 32-bit words
              * and grows by the option length when options are corked.
              */
1284         iph = (struct iphdr *)skb->data;
1285         iph->version = 4;
1286         iph->ihl = 5;
1287         if (opt) {
1288                 iph->ihl += opt->optlen>>2;
1289                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1290         }
1291         iph->tos = inet->tos;
1292         iph->frag_off = df;
1293         ip_select_ident(iph, &rt->u.dst, sk);
1294         iph->ttl = ttl;
1295         iph->protocol = sk->sk_protocol;
1296         iph->saddr = rt->rt_src;
1297         iph->daddr = rt->rt_dst;
1298
1299         skb->priority = sk->sk_priority;
1300         skb->mark = sk->sk_mark;
1301         /*
1302          * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1303          * on dst refcount
1304          */
1305         inet->cork.dst = NULL;
1306         skb_dst_set(skb, &rt->u.dst);
1307
1308         if (iph->protocol == IPPROTO_ICMP)
1309                 icmp_out_count(net, ((struct icmphdr *)
1310                         skb_transport_header(skb))->type);
1311
1312         /* Netfilter gets the whole, not yet fragmented skb. */
1313         err = ip_local_out(skb);
1314         if (err) {
                     /* Positive codes are translated by net_xmit_errno(). */
1315                 if (err > 0)
1316                         err = net_xmit_errno(err);
1317                 if (err)
1318                         goto error;
1319         }
1320
1321 out:
1322         ip_cork_release(inet);
1323         return err;
1324
1325 error:
1326         IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1327         goto out;
1328 }
1329
1330 /*
1331  *      Throw away all pending data on the socket.
      *
      *      Every skb on sk_write_queue is dequeued from the tail and freed,
      *      then the cork state (saved options and route reference) is
      *      released.
1332  */
1333 void ip_flush_pending_frames(struct sock *sk)
1334 {
1335         struct sk_buff *skb;
1336
1337         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1338                 kfree_skb(skb);
1339
1340         ip_cork_release(inet_sk(sk));
1341 }
1342
1343
1344 /*
1345  *      Fetch data from kernel space and accumulate its checksum.
      *
      *      getfrag callback passed to ip_append_data() by ip_send_reply().
      *      Copies @len bytes from @dptr + @offset to @to, computing their
      *      checksum during the copy, and adds that checksum to skb->csum at
      *      block offset @odd. Always returns 0.
1346  */
1347 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1348                               int len, int odd, struct sk_buff *skb)
1349 {
1350         __wsum csum;
1351
1352         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1353         skb->csum = csum_block_add(skb->csum, csum, odd);
1354         return 0;
1355 }
1356
1357 /*
1358  *      Generic function to send a packet as reply to another packet.
1359  *      Used to send TCP resets so far. ICMP should use this function too.
1360  *
1361  *      Should run single threaded per socket because it uses the sock
1362  *      structure to pass arguments.
      *
      *      @sk:  socket used as the carrier; its tos, sk_priority, sk_protocol
      *            and sk_bound_dev_if fields are overwritten here
      *      @skb: the packet being answered; supplies the route, the options
      *            to echo, tos, priority, protocol and the ports
      *      @arg: reply description: payload in arg->iov->iov_base, checksum
      *            seed and offset, and the bound device index
      *      @len: number of payload bytes to send
      *
      *      Nothing is reported to the caller: the function returns silently
      *      when the options cannot be echoed or no route is found, and the
      *      return value of ip_append_data() is not checked.
1363  */
1364 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1365                    unsigned int len)
1366 {
1367         struct inet_sock *inet = inet_sk(sk);
             /* Option header followed by room for 40 bytes of option data. */
1368         struct {
1369                 struct ip_options       opt;
1370                 char                    data[40];
1371         } replyopts;
1372         struct ipcm_cookie ipc;
1373         __be32 daddr;
1374         struct rtable *rt = skb_rtable(skb);
1375
1376         if (ip_options_echo(&replyopts.opt, skb))
1377                 return;
1378
             /* Reply to the source address recorded in the incoming route. */
1379         daddr = ipc.addr = rt->rt_src;
1380         ipc.opt = NULL;
1381         ipc.shtx.flags = 0;
1382
1383         if (replyopts.opt.optlen) {
1384                 ipc.opt = &replyopts.opt;
1385
                     /* With a source route option the lookup targets faddr. */
1386                 if (ipc.opt->srr)
1387                         daddr = replyopts.opt.faddr;
1388         }
1389
1390         {
                     /*
                      * Look up the output route; on success rt is replaced
                      * by the result. The ports are the incoming packet's
                      * ports swapped.
                      */
1391                 struct flowi fl = { .oif = arg->bound_dev_if,
1392                                     .nl_u = { .ip4_u =
1393                                               { .daddr = daddr,
1394                                                 .saddr = rt->rt_spec_dst,
1395                                                 .tos = RT_TOS(ip_hdr(skb)->tos) } },
1396                                     /* Not quite clean, but right. */
1397                                     .uli_u = { .ports =
1398                                                { .sport = tcp_hdr(skb)->dest,
1399                                                  .dport = tcp_hdr(skb)->source } },
1400                                     .proto = sk->sk_protocol,
1401                                     .flags = ip_reply_arg_flowi_flags(arg) };
1402                 security_skb_classify_flow(skb, &fl);
1403                 if (ip_route_output_key(sock_net(sk), &rt, &fl))
1404                         return;
1405         }
1406
1407         /* And let IP do all the hard work.
1408
1409            This chunk is not reentrant, hence spinlock.
1410            Note that it uses the fact, that this function is called
1411            with locally disabled BH and that sk cannot be already spinlocked.
1412          */
1413         bh_lock_sock(sk);
1414         inet->tos = ip_hdr(skb)->tos;
1415         sk->sk_priority = skb->priority;
1416         sk->sk_protocol = ip_hdr(skb)->protocol;
1417         sk->sk_bound_dev_if = arg->bound_dev_if;
1418         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1419                        &ipc, &rt, MSG_DONTWAIT);
             /*
              * From here on skb refers to the queued reply, not to the packet
              * being answered. When csumoffset is not negative, the folded
              * checksum is stored csumoffset 16-bit words into the transport
              * header.
              */
1420         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1421                 if (arg->csumoffset >= 0)
1422                         *((__sum16 *)skb_transport_header(skb) +
1423                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1424                                                                 arg->csum));
1425                 skb->ip_summed = CHECKSUM_NONE;
1426                 ip_push_pending_frames(sk);
1427         }
1428
1429         bh_unlock_sock(sk);
1430
             /*
              * ip_append_data() sets rt to NULL when it takes over the route
              * reference, so this only drops a reference it did not take.
              * NOTE(review): assumes ip_rt_put() accepts NULL - confirm.
              */
1431         ip_rt_put(rt);
1432 }
1433
     /*
      *      Initialisation entry point of the IP layer (marked __init): sets
      *      up routing via ip_rt_init() and the peer storage via
      *      inet_initpeers(). The IGMP multicast proc entries are registered
      *      only when both CONFIG_IP_MULTICAST and CONFIG_PROC_FS are enabled.
      */
1434 void __init ip_init(void)
1435 {
1436         ip_rt_init();
1437         inet_initpeers();
1438
1439 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1440         igmp_mc_proc_init();
1441 #endif
1442 }
1443
1444 EXPORT_SYMBOL(ip_generic_getfrag);
1445 EXPORT_SYMBOL(ip_queue_xmit);
1446 EXPORT_SYMBOL(ip_send_check);