net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Donald Becker, <becker@super.org>
  11  *              Alan Cox, <Alan.Cox@linux.org>
  12  *              Richard Underwood
  13  *              Stefan Becker, <stefanb@yello.ping.de>
  14  *              Jorge Cwik, <jorge@laser.satlink.net>
  15  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  16  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  17  *
  18  *      See ip_input.c for original log
  19  *
  20  *      Fixes:
  21  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  22  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  23  *              Bradford Johnson:       Fix faulty handling of some frames when
  24  *                                      no route is found.
  25  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  26  *                                      (in case if packet not accepted by
  27  *                                      output firewall rules)
  28  *              Mike McLagan    :       Routing by source
  29  *              Alexey Kuznetsov:       use new route cache
  30  *              Andi Kleen:             Fix broken PMTU recovery and remove
  31  *                                      some redundant tests.
  32  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  33  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  34  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  35  *                                      for decreased register pressure on x86
  36  *                                      and more readability.
  37  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  38  *                                      silently drop skb instead of failing with -EPERM.
  39  *              Detlev Wengorz  :       Copy protocol for fragments.
  40  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  41  *                                      datagrams.
  42  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  43  */
  44
  45 #include <asm/uaccess.h>
  46 #include <asm/system.h>
  47 #include <linux/module.h>
  48 #include <linux/types.h>
  49 #include <linux/kernel.h>
  50 #include <linux/mm.h>
  51 #include <linux/string.h>
  52 #include <linux/errno.h>
  53 #include <linux/highmem.h>
  54 #include <linux/slab.h>
  55
  56 #include <linux/socket.h>
  57 #include <linux/sockios.h>
  58 #include <linux/in.h>
  59 #include <linux/inet.h>
  60 #include <linux/netdevice.h>
  61 #include <linux/etherdevice.h>
  62 #include <linux/proc_fs.h>
  63 #include <linux/stat.h>
  64 #include <linux/init.h>
  65
  66 #include <net/snmp.h>
  67 #include <net/ip.h>
  68 #include <net/protocol.h>
  69 #include <net/route.h>
  70 #include <net/xfrm.h>
  71 #include <linux/skbuff.h>
  72 #include <net/sock.h>
  73 #include <net/arp.h>
  74 #include <net/icmp.h>
  75 #include <net/checksum.h>
  76 #include <net/inetpeer.h>
  77 #include <linux/igmp.h>
  78 #include <linux/netfilter_ipv4.h>
  79 #include <linux/netfilter_bridge.h>
  80 #include <linux/mroute.h>
  81 #include <linux/netlink.h>
  82 #include <linux/tcp.h>
  83
  84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
     /*
      * Default TTL value, initialised to IPDEFTTL and exported for use
      * outside this file.
      * NOTE(review): nothing in the visible part of this file reads it
      * (ip_select_ttl() falls back to ip4_dst_hoplimit()); presumably the
      * consumer and the sysctl registration live elsewhere - confirm.
      */
  85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
  86
  87 /* Generate a checksum for an outgoing IP datagram. */
  88 __inline__ void ip_send_check(struct iphdr *iph)
  89 {
  90         iph->check = 0;
  91         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  92 }
  93 EXPORT_SYMBOL(ip_send_check);
  94
  95 int __ip_local_out(struct sk_buff *skb)
  96 {
  97         struct iphdr *iph = ip_hdr(skb);
  98
  99         iph->tot_len = htons(skb->len);
 100         ip_send_check(iph);
 101         return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
 102                        skb_dst(skb)->dev, dst_output);
 103 }
 104
 105 int ip_local_out(struct sk_buff *skb)
 106 {
 107         int err;
 108
 109         err = __ip_local_out(skb);
 110         if (likely(err == 1))
 111                 err = dst_output(skb);
 112
 113         return err;
 114 }
 115 EXPORT_SYMBOL_GPL(ip_local_out);
 116
 117 /* dev_loopback_xmit for use with netfilter. */
 118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 119 {
 120         skb_reset_mac_header(newskb);
 121         __skb_pull(newskb, skb_network_offset(newskb));
 122         newskb->pkt_type = PACKET_LOOPBACK;
 123         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 124         WARN_ON(!skb_dst(newskb));
 125         netif_rx_ni(newskb);
 126         return 0;
 127 }
 128
 129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 130 {
 131         int ttl = inet->uc_ttl;
 132
 133         if (ttl < 0)
 134                 ttl = ip4_dst_hoplimit(dst);
 135         return ttl;
 136 }
 137
 138 /*
 139  *              Add an ip header to a skbuff and send it out.
 140  *
 141  */
 142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 143                           __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
 144 {
 145         struct inet_sock *inet = inet_sk(sk);
 146         struct rtable *rt = skb_rtable(skb);
 147         struct iphdr *iph;
 148
 149         /* Build the IP header. */
 150         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
 151         skb_reset_network_header(skb);
 152         iph = ip_hdr(skb);
 153         iph->version  = 4;
 154         iph->ihl      = 5;
 155         iph->tos      = inet->tos;
 156         if (ip_dont_fragment(sk, &rt->dst))
 157                 iph->frag_off = htons(IP_DF);
 158         else
 159                 iph->frag_off = 0;
 160         iph->ttl      = ip_select_ttl(inet, &rt->dst);
 161         iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
 162         iph->saddr    = saddr;
 163         iph->protocol = sk->sk_protocol;
 164         ip_select_ident(iph, &rt->dst, sk);
 165
 166         if (opt && opt->opt.optlen) {
 167                 iph->ihl += opt->opt.optlen>>2;
 168                 ip_options_build(skb, &opt->opt, daddr, rt, 0);
 169         }
 170
 171         skb->priority = sk->sk_priority;
 172         skb->mark = sk->sk_mark;
 173
 174         /* Send it out. */
 175         return ip_local_out(skb);
 176 }
 177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 178
 179 static inline int ip_finish_output2(struct sk_buff *skb)
 180 {
 181         struct dst_entry *dst = skb_dst(skb);
 182         struct rtable *rt = (struct rtable *)dst;
 183         struct net_device *dev = dst->dev;
 184         unsigned int hh_len = LL_RESERVED_SPACE(dev);
 185
 186         if (rt->rt_type == RTN_MULTICAST) {
 187                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
 188         } else if (rt->rt_type == RTN_BROADCAST)
 189                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
 190
 191         /* Be paranoid, rather than too clever. */
 192         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 193                 struct sk_buff *skb2;
 194
 195                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 196                 if (skb2 == NULL) {
 197                         kfree_skb(skb);
 198                         return -ENOMEM;
 199                 }
 200                 if (skb->sk)
 201                         skb_set_owner_w(skb2, skb->sk);
 202                 kfree_skb(skb);
 203                 skb = skb2;
 204         }
 205
 206         if (dst->hh)
 207                 return neigh_hh_output(dst->hh, skb);
 208         else if (dst->neighbour)
 209                 return dst->neighbour->output(skb);
 210
 211         if (net_ratelimit())
 212                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 213         kfree_skb(skb);
 214         return -EINVAL;
 215 }
 216
 217 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 218 {
 219         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 220
 221         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 222                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 223 }
 224
 225 static int ip_finish_output(struct sk_buff *skb)
 226 {
 227 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 228         /* Policy lookup after SNAT yielded a new policy */
 229         if (skb_dst(skb)->xfrm != NULL) {
 230                 IPCB(skb)->flags |= IPSKB_REROUTED;
 231                 return dst_output(skb);
 232         }
 233 #endif
 234         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 235                 return ip_fragment(skb, ip_finish_output2);
 236         else
 237                 return ip_finish_output2(skb);
 238 }
 239
 240 int ip_mc_output(struct sk_buff *skb)
 241 {
 242         struct sock *sk = skb->sk;
 243         struct rtable *rt = skb_rtable(skb);
 244         struct net_device *dev = rt->dst.dev;
 245
 246         /*
 247          *      If the indicated interface is up and running, send the packet.
 248          */
 249         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 250
 251         skb->dev = dev;
 252         skb->protocol = htons(ETH_P_IP);
 253
 254         /*
 255          *      Multicasts are looped back for other local users
 256          */
 257
 258         if (rt->rt_flags&RTCF_MULTICAST) {
 259                 if (sk_mc_loop(sk)
 260 #ifdef CONFIG_IP_MROUTE
 261                 /* Small optimization: do not loopback not local frames,
 262                    which returned after forwarding; they will be  dropped
 263                    by ip_mr_input in any case.
 264                    Note, that local frames are looped back to be delivered
 265                    to local recipients.
 266
 267                    This check is duplicated in ip_mr_input at the moment.
 268                  */
 269                     &&
 270                     ((rt->rt_flags & RTCF_LOCAL) ||
 271                      !(IPCB(skb)->flags & IPSKB_FORWARDED))
 272 #endif
 273                    ) {
 274                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 275                         if (newskb)
 276                                 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 277                                         newskb, NULL, newskb->dev,
 278                                         ip_dev_loopback_xmit);
 279                 }
 280
 281                 /* Multicasts with ttl 0 must not go beyond the host */
 282
 283                 if (ip_hdr(skb)->ttl == 0) {
 284                         kfree_skb(skb);
 285                         return 0;
 286                 }
 287         }
 288
 289         if (rt->rt_flags&RTCF_BROADCAST) {
 290                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 291                 if (newskb)
 292                         NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
 293                                 NULL, newskb->dev, ip_dev_loopback_xmit);
 294         }
 295
 296         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
 297                             skb->dev, ip_finish_output,
 298                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 299 }
 300
 301 int ip_output(struct sk_buff *skb)
 302 {
 303         struct net_device *dev = skb_dst(skb)->dev;
 304
 305         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 306
 307         skb->dev = dev;
 308         skb->protocol = htons(ETH_P_IP);
 309
 310         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
 311                             ip_finish_output,
 312                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 313 }
 314
 315 int ip_queue_xmit(struct sk_buff *skb)
 316 {
             /*
              * Route (if necessary), build the IP header for, and send a
              * packet owned by a socket (skb->sk).
              *
              * Returns the result of ip_local_out() on the normal path, or
              * -EHOSTUNREACH after freeing the skb when no usable route
              * exists (lookup failure, or a strict source route whose route
              * destination differs from its gateway).
              *
              * The socket's IP options are read under rcu_read_lock(), which
              * is held until ip_local_out() has returned or the no_route
              * path is taken.
              */
 317         struct sock *sk = skb->sk;
 318         struct inet_sock *inet = inet_sk(sk);
 319         struct ip_options_rcu *inet_opt;
 320         struct rtable *rt;
 321         struct iphdr *iph;
 322         int res;
 323
 324         /* Skip all of this if the packet is already routed,
 325          * e.g. by something like SCTP.
 326          */
 327         rcu_read_lock();
 328         inet_opt = rcu_dereference(inet->inet_opt);
 329         rt = skb_rtable(skb);
 330         if (rt != NULL)
 331                 goto packet_routed;
 332
 333         /* Make sure we can route this packet. */
 334         rt = (struct rtable *)__sk_dst_check(sk, 0);
 335         if (rt == NULL) {
 336                 struct flowi4 fl4;
 337                 __be32 daddr;
 338
 339                 /* Use correct destination address if we have options. */
 340                 daddr = inet->inet_daddr;
 341                 if (inet_opt && inet_opt->opt.srr)
 342                         daddr = inet_opt->opt.faddr;
 343
 344                 /* If this fails, retransmit mechanism of transport layer will
 345                  * keep trying until route appears or the connection times
 346                  * itself out.
 347                  */
 348                 rt = ip_route_output_ports(sock_net(sk), &fl4, sk,
 349                                            daddr, inet->inet_saddr,
 350                                            inet->inet_dport,
 351                                            inet->inet_sport,
 352                                            sk->sk_protocol,
 353                                            RT_CONN_FLAGS(sk),
 354                                            sk->sk_bound_dev_if);
 355                 if (IS_ERR(rt))
 356                         goto no_route;
                     /* Record the fresh route on the socket. */
 357                 sk_setup_caps(sk, &rt->dst);
 358         }
             /* Attach the socket's route to this skb. */
 359         skb_dst_set_noref(skb, &rt->dst);
 360
 361 packet_routed:
             /* A strict source route is only usable if the route's
              * destination is also its gateway; otherwise give up.
              */
 362         if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_dst != rt->rt_gateway)
 363                 goto no_route;
 364
 365         /* OK, we know where to send it, allocate and build IP header. */
 366         skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
 367         skb_reset_network_header(skb);
 368         iph = ip_hdr(skb);
             /* One 16-bit store covering the first two header bytes:
              * 4 in the top nibble (version), 5 in the next (ihl), and the
              * socket's tos in the low byte.
              */
 369         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
             /* DF is set only when the socket/route ask for it and the skb
              * is not marked local_df.
              */
 370         if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
 371                 iph->frag_off = htons(IP_DF);
 372         else
 373                 iph->frag_off = 0;
 374         iph->ttl      = ip_select_ttl(inet, &rt->dst);
 375         iph->protocol = sk->sk_protocol;
 376         iph->saddr    = rt->rt_src;
 377         iph->daddr    = rt->rt_dst;
 378         /* Transport layer set skb->h.foo itself. */
 379
             /* Options extend the header by optlen bytes (optlen/4 words). */
 380         if (inet_opt && inet_opt->opt.optlen) {
 381                 iph->ihl += inet_opt->opt.optlen >> 2;
 382                 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
 383         }
 384
             /* The last argument is gso_segs - 1, or 0 when gso_segs is 0. */
 385         ip_select_ident_more(iph, &rt->dst, sk,
 386                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 387
 388         skb->priority = sk->sk_priority;
 389         skb->mark = sk->sk_mark;
 390
 391         res = ip_local_out(skb);
 392         rcu_read_unlock();
 393         return res;
 394
 395 no_route:
             /* Leave the RCU section, count the failure, drop the packet. */
 396         rcu_read_unlock();
 397         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 398         kfree_skb(skb);
 399         return -EHOSTUNREACH;
 400 }
 401 EXPORT_SYMBOL(ip_queue_xmit);
 402
 403
 404 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 405 {
 406         to->pkt_type = from->pkt_type;
 407         to->priority = from->priority;
 408         to->protocol = from->protocol;
 409         skb_dst_drop(to);
 410         skb_dst_copy(to, from);
 411         to->dev = from->dev;
 412         to->mark = from->mark;
 413
 414         /* Copy the flags to each fragment. */
 415         IPCB(to)->flags = IPCB(from)->flags;
 416
 417 #ifdef CONFIG_NET_SCHED
 418         to->tc_index = from->tc_index;
 419 #endif
 420         nf_copy(to, from);
 421 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 422     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 423         to->nf_trace = from->nf_trace;
 424 #endif
 425 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 426         to->ipvs_property = from->ipvs_property;
 427 #endif
 428         skb_copy_secmark(to, from);
 429 }
 430
 431 /*
 432  *      This IP datagram is too large to be sent in one piece.  Break it up into
 433  *      smaller pieces (each of size equal to IP header plus
 434  *      a block of the data of the original IP data part) that will yet fit in a
 435  *      single device frame, and queue such a frame for sending.
      *
      *      skb:    datagram to split; its dst is read as a struct rtable.
      *      output: called once for every fragment produced; a non-zero
      *              return stops fragmentation and is passed back.
      *
      *      Two strategies are used.  If the skb already carries a frag_list
      *      whose geometry is suitable, every list member is turned into a
      *      fragment in place (fast path).  Otherwise each fragment is a
      *      newly allocated skb into which header and payload are copied
      *      (slow path), and the original skb is freed at the end.
      *
      *      Returns 0 on success, -EMSGSIZE when DF is set and local_df is
      *      clear (an ICMP "fragmentation needed" is sent and the skb freed),
      *      -ENOMEM when a slow-path allocation fails, or the first non-zero
      *      value returned by output().
 436  */
 437
 438 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 439 {
 440         struct iphdr *iph;
 441         int ptr;
 442         struct net_device *dev;
 443         struct sk_buff *skb2;
 444         unsigned int mtu, hlen, left, len, ll_rs;
 445         int offset;
 446         __be16 not_last_frag;
 447         struct rtable *rt = skb_rtable(skb);
 448         int err = 0;
 449
 450         dev = rt->dst.dev;
 451
 452         /*
 453          *      Point into the IP datagram header.
 454          */
 455
 456         iph = ip_hdr(skb);
 457
             /* Fragmenting is forbidden: tell the sender the MTU and drop. */
 458         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 459                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 460                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 461                           htonl(ip_skb_dst_mtu(skb)));
 462                 kfree_skb(skb);
 463                 return -EMSGSIZE;
 464         }
 465
 466         /*
 467          *      Setup starting values.
 468          */
 469
 470         hlen = iph->ihl * 4;
 471         mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
 472 #ifdef CONFIG_BRIDGE_NETFILTER
 473         if (skb->nf_bridge)
 474                 mtu -= nf_bridge_mtu_reduction(skb);
 475 #endif
 476         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 477
 478         /* When frag_list is given, use it. First, check its validity:
 479          * some transformers could create wrong frag_list or break existing
 480          * one, it is not prohibited. In this case fall back to copying.
 481          *
 482          * LATER: this step can be merged to real generation of fragments,
 483          * we can switch to copy when see the first bad fragment.
 484          */
 485         if (skb_has_frag_list(skb)) {
 486                 struct sk_buff *frag, *frag2;
 487                 int first_len = skb_pagelen(skb);
 488
                     /* The head itself must be a valid first fragment: payload
                      * fits in mtu and is a multiple of 8 bytes, the datagram
                      * is not already a fragment, and the skb is not cloned.
                      */
 489                 if (first_len - hlen > mtu ||
 490                     ((first_len - hlen) & 7) ||
 491                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 492                     skb_cloned(skb))
 493                         goto slow_path;
 494
 495                 skb_walk_frags(skb, frag) {
 496                         /* Correct geometry. */
 497                         if (frag->len > mtu ||
 498                             ((frag->len & 7) && frag->next) ||
 499                             skb_headroom(frag) < hlen)
 500                                 goto slow_path_clean;
 501
 502                         /* Partially cloned skb? */
 503                         if (skb_shared(frag))
 504                                 goto slow_path_clean;
 505
                             /* Charge this frag to the head's socket and move
                              * its truesize out of the head's accounting.
                              */
 506                         BUG_ON(frag->sk);
 507                         if (skb->sk) {
 508                                 frag->sk = skb->sk;
 509                                 frag->destructor = sock_wfree;
 510                         }
 511                         skb->truesize -= frag->truesize;
 512                 }
 513
 514                 /* Everything is OK. Generate! */
 515
                     /* Detach the frag list and shrink the head so that it
                      * describes only its own first_len bytes, flagged MF.
                      */
 516                 err = 0;
 517                 offset = 0;
 518                 frag = skb_shinfo(skb)->frag_list;
 519                 skb_frag_list_init(skb);
 520                 skb->data_len = first_len - skb_headlen(skb);
 521                 skb->len = first_len;
 522                 iph->tot_len = htons(first_len);
 523                 iph->frag_off = htons(IP_MF);
 524                 ip_send_check(iph);
 525
                     /* Each pass sends skb and, beforehand, prepares frag (the
                      * fragment that follows it) by copying skb's header.
                      */
 526                 for (;;) {
 527                         /* Prepare header of the next frame,
 528                          * before previous one went down. */
 529                         if (frag) {
 530                                 frag->ip_summed = CHECKSUM_NONE;
 531                                 skb_reset_transport_header(frag);
 532                                 __skb_push(frag, hlen);
 533                                 skb_reset_network_header(frag);
 534                                 memcpy(skb_network_header(frag), iph, hlen);
 535                                 iph = ip_hdr(frag);
 536                                 iph->tot_len = htons(frag->len);
 537                                 ip_copy_metadata(frag, skb);
                                     /* Only on the first pass (offset still 0):
                                      * fix up the options in this header copy;
                                      * later fragments copy from it.
                                      */
 538                                 if (offset == 0)
 539                                         ip_options_fragment(frag);
                                     /* Advance by the payload of the fragment
                                      * about to be sent; the field holds the
                                      * offset in 8-byte units.
                                      */
 540                                 offset += skb->len - hlen;
 541                                 iph->frag_off = htons(offset>>3);
 542                                 if (frag->next != NULL)
 543                                         iph->frag_off |= htons(IP_MF);
 544                                 /* Ready, complete checksum */
 545                                 ip_send_check(iph);
 546                         }
 547
 548                         err = output(skb);
 549
 550                         if (!err)
 551                                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 552                         if (err || !frag)
 553                                 break;
 554
 555                         skb = frag;
 556                         frag = skb->next;
 557                         skb->next = NULL;
 558                 }
 559
 560                 if (err == 0) {
 561                         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 562                         return 0;
 563                 }
 564
                     /* output() failed: free the fragments not yet passed to it. */
 565                 while (frag) {
 566                         skb = frag->next;
 567                         kfree_skb(frag);
 568                         frag = skb;
 569                 }
 570                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 571                 return err;
 572
 573 slow_path_clean:
                     /* Undo the sk/destructor/truesize changes made to the
                      * frags visited before the one that failed the checks.
                      */
 574                 skb_walk_frags(skb, frag2) {
 575                         if (frag2 == frag)
 576                                 break;
 577                         frag2->sk = NULL;
 578                         frag2->destructor = NULL;
 579                         skb->truesize += frag2->truesize;
 580                 }
 581         }
 582
 583 slow_path:
 584         left = skb->len - hlen;         /* Payload bytes still to be sent */
 585         ptr = hlen;             /* Where to start from */
 586
 587         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 588          * we need to make room for the encapsulating header
 589          */
 590         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
 591
 592         /*
 593          *      Fragment the datagram.
 594          */
 595
             /* Start from the offset and MF state already present in the
              * header, in case the datagram is itself a fragment.
              */
 596         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 597         not_last_frag = iph->frag_off & htons(IP_MF);
 598
 599         /*
 600          *      Keep copying data until we run out.
 601          */
 602
 603         while (left > 0) {
 604                 len = left;
 605                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 606                 if (len > mtu)
 607                         len = mtu;
 608                 /* IF: we are not sending up to and including the packet end
 609                    then align the next start on an eight byte boundary */
 610                 if (len < left) {
 611                         len &= ~7;
 612                 }
 613                 /*
 614                  *      Allocate buffer.
 615                  */
 616
 617                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 618                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 619                         err = -ENOMEM;
 620                         goto fail;
 621                 }
 622
 623                 /*
 624                  *      Set up data on packet
 625                  */
 626
 627                 ip_copy_metadata(skb2, skb);
 628                 skb_reserve(skb2, ll_rs);
 629                 skb_put(skb2, len + hlen);
 630                 skb_reset_network_header(skb2);
 631                 skb2->transport_header = skb2->network_header + hlen;
 632
 633                 /*
 634                  *      Charge the memory for the fragment to any owner
 635                  *      it might possess
 636                  */
 637
 638                 if (skb->sk)
 639                         skb_set_owner_w(skb2, skb->sk);
 640
 641                 /*
 642                  *      Copy the packet header into the new buffer.
 643                  */
 644
 645                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 646
 647                 /*
 648                  *      Copy a block of the IP datagram.
 649                  */
                     /* ptr/len were derived from skb->len above, so a failed
                      * copy is treated as a fatal inconsistency.
                      */
 650                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 651                         BUG();
 652                 left -= len;
 653
 654                 /*
 655                  *      Fill in the new header fields.
 656                  */
 657                 iph = ip_hdr(skb2);
 658                 iph->frag_off = htons((offset >> 3));
 659
 660                 /* ANK: dirty, but effective trick. Upgrade options only if
 661                  * the segment to be fragmented was THE FIRST (otherwise,
 662                  * options are already fixed) and make it ONCE
 663                  * on the initial skb, so that all the following fragments
 664                  * will inherit fixed options.
 665                  */
 666                 if (offset == 0)
 667                         ip_options_fragment(skb);
 668
 669                 /*
 670                  *      Added AC : If we are fragmenting a fragment that's not the
 671                  *                 last fragment then keep MF on each bit
 672                  */
 673                 if (left > 0 || not_last_frag)
 674                         iph->frag_off |= htons(IP_MF);
 675                 ptr += len;
 676                 offset += len;
 677
 678                 /*
 679                  *      Put this fragment into the sending queue.
 680                  */
 681                 iph->tot_len = htons(len + hlen);
 682
 683                 ip_send_check(iph);
 684
 685                 err = output(skb2);
 686                 if (err)
 687                         goto fail;
 688
 689                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 690         }
             /* All payload copied out: the original is no longer needed. */
 691         kfree_skb(skb);
 692         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 693         return err;
 694
 695 fail:
             /* Allocation or output() failed: drop the original and report. */
 696         kfree_skb(skb);
 697         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 698         return err;
 699 }
 700 EXPORT_SYMBOL(ip_fragment);
 701
 702 int
 703 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 704 {
 705         struct iovec *iov = from;
 706
 707         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 708                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 709                         return -EFAULT;
 710         } else {
 711                 __wsum csum = 0;
 712                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 713                         return -EFAULT;
 714                 skb->csum = csum_block_add(skb->csum, csum, odd);
 715         }
 716         return 0;
 717 }
 718 EXPORT_SYMBOL(ip_generic_getfrag);
 719
 720 static inline __wsum
 721 csum_page(struct page *page, int offset, int copy)
 722 {
 723         char *kaddr;
 724         __wsum csum;
 725         kaddr = kmap(page);
 726         csum = csum_partial(kaddr + offset, copy, 0);
 727         kunmap(page);
 728         return csum;
 729 }
 730
 731 static inline int ip_ufo_append_data(struct sock *sk,
 732                         struct sk_buff_head *queue,
 733                         int getfrag(void *from, char *to, int offset, int len,
 734                                int odd, struct sk_buff *skb),
 735                         void *from, int length, int hh_len, int fragheaderlen,
 736                         int transhdrlen, int mtu, unsigned int flags)
 737 {
 738         struct sk_buff *skb;
 739         int err;
 740
 741         /* There is support for UDP fragmentation offload by network
 742          * device, so create one single skb packet containing complete
 743          * udp datagram
 744          */
 745         if ((skb = skb_peek_tail(queue)) == NULL) {
 746                 skb = sock_alloc_send_skb(sk,
 747                         hh_len + fragheaderlen + transhdrlen + 20,
 748                         (flags & MSG_DONTWAIT), &err);
 749
 750                 if (skb == NULL)
 751                         return err;
 752
 753                 /* reserve space for Hardware header */
 754                 skb_reserve(skb, hh_len);
 755
 756                 /* create space for UDP/IP header */
 757                 skb_put(skb, fragheaderlen + transhdrlen);
 758
 759                 /* initialize network header pointer */
 760                 skb_reset_network_header(skb);
 761
 762                 /* initialize protocol header pointer */
 763                 skb->transport_header = skb->network_header + fragheaderlen;
 764
 765                 skb->ip_summed = CHECKSUM_PARTIAL;
 766                 skb->csum = 0;
 767
 768                 /* specify the length of each IP datagram fragment */
 769                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 770                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 771                 __skb_queue_tail(queue, skb);
 772         }
 773
 774         return skb_append_datato_frags(sk, skb, getfrag, from,
 775                                        (length - transhdrlen));
 776 }
 777
 /*
  * __ip_append_data - append @length bytes obtained through @getfrag to @queue.
  *
  * The data is laid out as a chain of skbs, each one sized as a single IP
  * fragment: fragheaderlen bytes of header room followed by payload.
  * @cork supplies the route (cork->dst), the IP options (cork->opt), the
  * fragment size (cork->fragsize) and the running datagram length
  * (cork->length).  @transhdrlen is non-zero only for the first chunk of a
  * datagram; it is cleared once the first skb has been built.
  *
  * UDP sockets on NETIF_F_UFO devices are handed to ip_ufo_append_data()
  * when the data exceeds the mtu or the tail skb is already a GSO skb.
  *
  * Returns 0 on success or a negative errno.  Failures that go through the
  * "error" label give the unconsumed byte count back to cork->length and
  * bump IPSTATS_MIB_OUTDISCARDS; the early -EMSGSIZE return happens before
  * cork->length is touched.
  */
 778 static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
 779                             struct inet_cork *cork,
 780                             int getfrag(void *from, char *to, int offset,
 781                                         int len, int odd, struct sk_buff *skb),
 782                             void *from, int length, int transhdrlen,
 783                             unsigned int flags)
 784 {
 785         struct inet_sock *inet = inet_sk(sk);
 786         struct sk_buff *skb;
 787
 788         struct ip_options *opt = cork->opt;
 789         int hh_len;
 790         int exthdrlen;
 791         int mtu;
 792         int copy;
 793         int err;
 794         int offset = 0;
 795         unsigned int maxfraglen, fragheaderlen;
 796         int csummode = CHECKSUM_NONE;
 797         struct rtable *rt = (struct rtable *)cork->dst;
 798
             /* Extension header room is only accounted on the first chunk. */
 799         exthdrlen = transhdrlen ? rt->dst.header_len : 0;
 800         length += exthdrlen;
 801         transhdrlen += exthdrlen;
 802         mtu = cork->fragsize;
 803
 804         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 805
             /* IP header plus options: the header room of every fragment. */
 806         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
             /* Largest fragment whose payload is a multiple of 8 bytes. */
 807         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 808
             /* Header plus accumulated payload must not exceed 0xFFFF bytes. */
 809         if (cork->length + length > 0xFFFF - fragheaderlen) {
 810                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
 811                                mtu-exthdrlen);
 812                 return -EMSGSIZE;
 813         }
 814
 815         /*
 816          * transhdrlen > 0 means that this is the first fragment and we wish
 817          * it won't be fragmented in the future.
 818          */
 819         if (transhdrlen &&
 820             length + fragheaderlen <= mtu &&
 821             rt->dst.dev->features & NETIF_F_V4_CSUM &&
 822             !exthdrlen)
 823                 csummode = CHECKSUM_PARTIAL;
 824
 825         skb = skb_peek_tail(queue);
 826
             /*
              * Account all bytes up front; the error path subtracts whatever
              * part of 'length' was not consumed.
              */
 827         cork->length += length;
 828         if (((length > mtu) || (skb && skb_is_gso(skb))) &&
 829             (sk->sk_protocol == IPPROTO_UDP) &&
 830             (rt->dst.dev->features & NETIF_F_UFO)) {
 831                 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
 832                                          hh_len, fragheaderlen, transhdrlen,
 833                                          mtu, flags);
 834                 if (err)
 835                         goto error;
 836                 return 0;
 837         }
 838
 839         /* So, what's going on in the loop below?
 840          *
 841          * We use calculated fragment length to generate chained skb,
 842          * each of segments is IP fragment ready for sending to network after
 843          * adding appropriate IP header.
 844          */
 845
             /* Empty queue: jump straight into the allocation branch below. */
 846         if (!skb)
 847                 goto alloc_new_skb;
 848
 849         while (length > 0) {
 850                 /* Check if the remaining data fits into current packet. */
 851                 copy = mtu - skb->len;
 852                 if (copy < length)
 853                         copy = maxfraglen - skb->len;
 854                 if (copy <= 0) {
 855                         char *data;
 856                         unsigned int datalen;
 857                         unsigned int fraglen;
 858                         unsigned int fraggap;
 859                         unsigned int alloclen;
 860                         struct sk_buff *skb_prev;
 861 alloc_new_skb:
 862                         skb_prev = skb;
                             /*
                              * fraggap: bytes by which the previous skb
                              * exceeds maxfraglen; they are moved into the
                              * new skb further down.
                              */
 863                         if (skb_prev)
 864                                 fraggap = skb_prev->len - maxfraglen;
 865                         else
 866                                 fraggap = 0;
 867
 868                         /*
 869                          * If remaining data exceeds the mtu,
 870                          * we know we need more fragment(s).
 871                          */
 872                         datalen = length + fraggap;
 873                         if (datalen > mtu - fragheaderlen)
 874                                 datalen = maxfraglen - fragheaderlen;
 875                         fraglen = datalen + fragheaderlen;
 876
                             /*
                              * Without scatter-gather, more data (MSG_MORE)
                              * can only go into the linear area, so allocate
                              * a full mtu now.
                              */
 877                         if ((flags & MSG_MORE) &&
 878                             !(rt->dst.dev->features&NETIF_F_SG))
 879                                 alloclen = mtu;
 880                         else
 881                                 alloclen = fraglen;
 882
 883                         /* The last fragment gets additional space at tail.
 884                          * Note, with MSG_MORE we overallocate on fragments,
 885                          * because we have no idea what fragment will be
 886                          * the last.
 887                          */
 888                         if (datalen == length + fraggap) {
 889                                 alloclen += rt->dst.trailer_len;
 890                                 /* make sure mtu is not reached */
 891                                 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
 892                                         datalen -= ALIGN(rt->dst.trailer_len, 8);
 893                         }
                             /*
                              * The first skb of a datagram is allocated with
                              * sock_alloc_send_skb() (honouring MSG_DONTWAIT);
                              * later ones use sock_wmalloc() and are refused
                              * once sk_wmem_alloc exceeds twice sk_sndbuf.
                              */
 894                         if (transhdrlen) {
 895                                 skb = sock_alloc_send_skb(sk,
 896                                                 alloclen + hh_len + 15,
 897                                                 (flags & MSG_DONTWAIT), &err);
 898                         } else {
 899                                 skb = NULL;
 900                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 901                                     2 * sk->sk_sndbuf)
 902                                         skb = sock_wmalloc(sk,
 903                                                            alloclen + hh_len + 15, 1,
 904                                                            sk->sk_allocation);
 905                                 if (unlikely(skb == NULL))
 906                                         err = -ENOBUFS;
 907                                 else
 908                                         /* only the initial fragment is
 909                                            time stamped */
 910                                         cork->tx_flags = 0;
 911                         }
 912                         if (skb == NULL)
 913                                 goto error;
 914
 915                         /*
 916                          *      Fill in the control structures
 917                          */
 918                         skb->ip_summed = csummode;
 919                         skb->csum = 0;
 920                         skb_reserve(skb, hh_len);
 921                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
 922
 923                         /*
 924                          *      Find where to start putting bytes.
 925                          */
 926                         data = skb_put(skb, fraglen);
 927                         skb_set_network_header(skb, exthdrlen);
 928                         skb->transport_header = (skb->network_header +
 929                                                  fragheaderlen);
 930                         data += fragheaderlen;
 931
                             /*
                              * Move the overhanging tail of the previous skb
                              * into this one, fixing up both checksums, then
                              * trim the previous skb to maxfraglen.
                              */
 932                         if (fraggap) {
 933                                 skb->csum = skb_copy_and_csum_bits(
 934                                         skb_prev, maxfraglen,
 935                                         data + transhdrlen, fraggap, 0);
 936                                 skb_prev->csum = csum_sub(skb_prev->csum,
 937                                                           skb->csum);
 938                                 data += fraggap;
 939                                 pskb_trim_unique(skb_prev, maxfraglen);
 940                         }
 941
                             /*
                              * Fetch the caller's data to just past the
                              * transport header room.  A failing getfrag()
                              * drops the new skb, which is not yet queued.
                              */
 942                         copy = datalen - transhdrlen - fraggap;
 943                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 944                                 err = -EFAULT;
 945                                 kfree_skb(skb);
 946                                 goto error;
 947                         }
 948
 949                         offset += copy;
 950                         length -= datalen - fraggap;
                             /* Header room and checksum offload apply to the first skb only. */
 951                         transhdrlen = 0;
 952                         exthdrlen = 0;
 953                         csummode = CHECKSUM_NONE;
 954
 955                         /*
 956                          * Put the packet on the pending queue.
 957                          */
 958                         __skb_queue_tail(queue, skb);
 959                         continue;
 960                 }
 961
 962                 if (copy > length)
 963                         copy = length;
 964
 965                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
                             /* No scatter-gather: copy into the linear area. */
 966                         unsigned int off;
 967
 968                         off = skb->len;
 969                         if (getfrag(from, skb_put(skb, copy),
 970                                         offset, copy, off, skb) < 0) {
 971                                 __skb_trim(skb, off);
 972                                 err = -EFAULT;
 973                                 goto error;
 974                         }
 975                 } else {
                             /*
                              * Scatter-gather: copy into a page fragment,
                              * reusing cork->page while it still has room.
                              *
                              * NOTE(review): when nr_frags is 0, 'frag' below
                              * points at frags[-1] and frag->page is read in
                              * the "page != frag->page" test if cork->page is
                              * set -- confirm this is benign.
                              */
 976                         int i = skb_shinfo(skb)->nr_frags;
 977                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 978                         struct page *page = cork->page;
 979                         int off = cork->off;
 980                         unsigned int left;
 981
 982                         if (page && (left = PAGE_SIZE - off) > 0) {
 983                                 if (copy >= left)
 984                                         copy = left;
 985                                 if (page != frag->page) {
 986                                         if (i == MAX_SKB_FRAGS) {
 987                                                 err = -EMSGSIZE;
 988                                                 goto error;
 989                                         }
 990                                         get_page(page);
 991                                         skb_fill_page_desc(skb, i, page, off, 0);
 992                                         frag = &skb_shinfo(skb)->frags[i];
 993                                 }
 994                         } else if (i < MAX_SKB_FRAGS) {
 995                                 if (copy > PAGE_SIZE)
 996                                         copy = PAGE_SIZE;
 997                                 page = alloc_pages(sk->sk_allocation, 0);
 998                                 if (page == NULL)  {
 999                                         err = -ENOMEM;
1000                                         goto error;
1001                                 }
1002                                 cork->page = page;
1003                                 cork->off = 0;
1004
1005                                 skb_fill_page_desc(skb, i, page, 0, 0);
1006                                 frag = &skb_shinfo(skb)->frags[i];
1007                         } else {
1008                                 err = -EMSGSIZE;
1009                                 goto error;
1010                         }
1011                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1012                                 err = -EFAULT;
1013                                 goto error;
1014                         }
                             /* Account the copied bytes in the cork, fragment, skb and socket. */
1015                         cork->off += copy;
1016                         frag->size += copy;
1017                         skb->len += copy;
1018                         skb->data_len += copy;
1019                         skb->truesize += copy;
1020                         atomic_add(copy, &sk->sk_wmem_alloc);
1021                 }
1022                 offset += copy;
1023                 length -= copy;
1024         }
1025
1026         return 0;
1027
1028 error:
             /* Give back the bytes that were accounted above but never queued. */
1029         cork->length -= length;
1030         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1031         return err;
1032 }
1033
1034 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1035                          struct ipcm_cookie *ipc, struct rtable **rtp)
1036 {
1037         struct inet_sock *inet = inet_sk(sk);
1038         struct ip_options_rcu *opt;
1039         struct rtable *rt;
1040
1041         /*
1042          * setup for corking.
1043          */
1044         opt = ipc->opt;
1045         if (opt) {
1046                 if (cork->opt == NULL) {
1047                         cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1048                                             sk->sk_allocation);
1049                         if (unlikely(cork->opt == NULL))
1050                                 return -ENOBUFS;
1051                 }
1052                 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1053                 cork->flags |= IPCORK_OPT;
1054                 cork->addr = ipc->addr;
1055         }
1056         rt = *rtp;
1057         if (unlikely(!rt))
1058                 return -EFAULT;
1059         /*
1060          * We steal reference to this route, caller should not release it
1061          */
1062         *rtp = NULL;
1063         cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1064                          rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1065         cork->dst = &rt->dst;
1066         cork->length = 0;
1067         cork->tx_flags = ipc->tx_flags;
1068         cork->page = NULL;
1069         cork->off = 0;
1070
1071         return 0;
1072 }
1073
1074 /*
1075  *      ip_append_data() and ip_append_page() can make one large IP datagram
1076  *      from many pieces of data. Each pieces will be holded on the socket
1077  *      until ip_push_pending_frames() is called. Each piece can be a page
1078  *      or non-page data.
1079  *
1080  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
1081  *      this interface potentially.
1082  *
1083  *      LATER: length must be adjusted by pad at tail, when it is required.
1084  */
1085 int ip_append_data(struct sock *sk,
1086                    int getfrag(void *from, char *to, int offset, int len,
1087                                int odd, struct sk_buff *skb),
1088                    void *from, int length, int transhdrlen,
1089                    struct ipcm_cookie *ipc, struct rtable **rtp,
1090                    unsigned int flags)
1091 {
1092         struct inet_sock *inet = inet_sk(sk);
1093         int err;
1094
1095         if (flags&MSG_PROBE)
1096                 return 0;
1097
1098         if (skb_queue_empty(&sk->sk_write_queue)) {
1099                 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1100                 if (err)
1101                         return err;
1102         } else {
1103                 transhdrlen = 0;
1104         }
1105
1106         return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork.base, getfrag,
1107                                 from, length, transhdrlen, flags);
1108 }
1109
/*
 * ip_append_page - append @size bytes of @page, starting at @offset, to the
 * datagram pending on sk->sk_write_queue.
 *
 * The page is attached to the tail skb as a page fragment (a reference is
 * taken with get_page() when a new fragment slot is used); no data is
 * copied except for the "fraggap" bytes moved when a new fragment skb has
 * to be started.
 *
 * Returns 0 on success, or:
 *   -EPERM       the socket has hdrincl set
 *   -EINVAL      nothing is pending on the write queue
 *   -EOPNOTSUPP  the output device lacks NETIF_F_SG
 *   -EMSGSIZE    the datagram would exceed 0xFFFF bytes, or the skb has no
 *                free fragment slot left
 *   -ENOBUFS     a new fragment skb could not be allocated
 * With MSG_PROBE set it returns 0 without doing anything.
 */
1110 ssize_t ip_append_page(struct sock *sk, struct page *page,
1111                        int offset, size_t size, int flags)
1112 {
1113         struct inet_sock *inet = inet_sk(sk);
1114         struct sk_buff *skb;
1115         struct rtable *rt;
1116         struct ip_options *opt = NULL;
1117         struct inet_cork *cork;
1118         int hh_len;
1119         int mtu;
1120         int len;
1121         int err;
1122         unsigned int maxfraglen, fragheaderlen, fraggap;
1123
1124         if (inet->hdrincl)
1125                 return -EPERM;
1126
1127         if (flags&MSG_PROBE)
1128                 return 0;
1129
             /* Pages can only extend a datagram that was already started. */
1130         if (skb_queue_empty(&sk->sk_write_queue))
1131                 return -EINVAL;
1132
1133         cork = &inet->cork.base;
1134         rt = (struct rtable *)cork->dst;
1135         if (cork->flags & IPCORK_OPT)
1136                 opt = cork->opt;
1137
1138         if (!(rt->dst.dev->features&NETIF_F_SG))
1139                 return -EOPNOTSUPP;
1140
1141         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1142         mtu = cork->fragsize;
1143
             /* IP header plus options: the header room of every fragment. */
1144         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
             /* Largest fragment whose payload is a multiple of 8 bytes. */
1145         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1146
1147         if (cork->length + size > 0xFFFF - fragheaderlen) {
1148                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1149                 return -EMSGSIZE;
1150         }
1151
1152         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1153                 return -EINVAL;
1154
             /* Account all bytes up front; the error path gives back the rest. */
1155         cork->length += size;
             /*
              * The data no longer fits in one mtu: on a UFO-capable device,
              * mark the tail skb for UDP fragmentation offload.
              */
1156         if ((size + skb->len > mtu) &&
1157             (sk->sk_protocol == IPPROTO_UDP) &&
1158             (rt->dst.dev->features & NETIF_F_UFO)) {
1159                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1160                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1161         }
1162
1163
1164         while (size > 0) {
1165                 int i;
1166
                     /* A GSO skb takes everything that is left. */
1167                 if (skb_is_gso(skb))
1168                         len = size;
1169                 else {
1170
1171                         /* Check if the remaining data fits into current packet. */
1172                         len = mtu - skb->len;
1173                         if (len < size)
1174                                 len = maxfraglen - skb->len;
1175                 }
1176                 if (len <= 0) {
                             /*
                              * Current fragment is full: start a new skb
                              * holding only the header room plus the bytes
                              * by which the previous skb exceeds maxfraglen.
                              */
1177                         struct sk_buff *skb_prev;
1178                         int alloclen;
1179
1180                         skb_prev = skb;
1181                         fraggap = skb_prev->len - maxfraglen;
1182
1183                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1184                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1185                         if (unlikely(!skb)) {
1186                                 err = -ENOBUFS;
1187                                 goto error;
1188                         }
1189
1190                         /*
1191                          *      Fill in the control structures
1192                          */
1193                         skb->ip_summed = CHECKSUM_NONE;
1194                         skb->csum = 0;
1195                         skb_reserve(skb, hh_len);
1196
1197                         /*
1198                          *      Find where to start putting bytes.
1199                          */
1200                         skb_put(skb, fragheaderlen + fraggap);
1201                         skb_reset_network_header(skb);
1202                         skb->transport_header = (skb->network_header +
1203                                                  fragheaderlen);
                             /*
                              * Move the overhanging bytes over, fix up both
                              * checksums and trim the previous skb.
                              */
1204                         if (fraggap) {
1205                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1206                                                                    maxfraglen,
1207                                                     skb_transport_header(skb),
1208                                                                    fraggap, 0);
1209                                 skb_prev->csum = csum_sub(skb_prev->csum,
1210                                                           skb->csum);
1211                                 pskb_trim_unique(skb_prev, maxfraglen);
1212                         }
1213
1214                         /*
1215                          * Put the packet on the pending queue.
1216                          */
1217                         __skb_queue_tail(&sk->sk_write_queue, skb);
1218                         continue;
1219                 }
1220
1221                 i = skb_shinfo(skb)->nr_frags;
1222                 if (len > size)
1223                         len = size;
                     /*
                      * Grow the last page fragment when skb_can_coalesce()
                      * allows it, otherwise use a new slot and take a page
                      * reference.
                      */
1224                 if (skb_can_coalesce(skb, i, page, offset)) {
1225                         skb_shinfo(skb)->frags[i-1].size += len;
1226                 } else if (i < MAX_SKB_FRAGS) {
1227                         get_page(page);
1228                         skb_fill_page_desc(skb, i, page, offset, len);
1229                 } else {
1230                         err = -EMSGSIZE;
1231                         goto error;
1232                 }
1233
                     /* Software checksum: fold the page data into skb->csum. */
1234                 if (skb->ip_summed == CHECKSUM_NONE) {
1235                         __wsum csum;
1236                         csum = csum_page(page, offset, len);
1237                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1238                 }
1239
1240                 skb->len += len;
1241                 skb->data_len += len;
1242                 skb->truesize += len;
1243                 atomic_add(len, &sk->sk_wmem_alloc);
1244                 offset += len;
1245                 size -= len;
1246         }
1247         return 0;
1248
1249 error:
             /* Give back the bytes that were accounted above but never attached. */
1250         cork->length -= size;
1251         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1252         return err;
1253 }
1254
1255 static void ip_cork_release(struct inet_cork *cork)
1256 {
1257         cork->flags &= ~IPCORK_OPT;
1258         kfree(cork->opt);
1259         cork->opt = NULL;
1260         dst_release(cork->dst);
1261         cork->dst = NULL;
1262 }
1263
1264 /*
1265  *      Combine all pending IP fragments on @queue into one IP datagram,
1266  *      fill in its IP header and return it.
      *
      *      The first skb on @queue becomes the head; every further skb is
      *      chained on the head's frag_list.  The route reference held by
      *      @cork is moved to the returned skb and @cork is released.
      *      Returns NULL when @queue is empty (the cork is left untouched).
      *      The caller is responsible for sending the result.
1267  */
1268 struct sk_buff *__ip_make_skb(struct sock *sk,
1269                               struct sk_buff_head *queue,
1270                               struct inet_cork *cork)
1271 {
1272         struct sk_buff *skb, *tmp_skb;
1273         struct sk_buff **tail_skb;
1274         struct inet_sock *inet = inet_sk(sk);
1275         struct net *net = sock_net(sk);
1276         struct ip_options *opt = NULL;
1277         struct rtable *rt = (struct rtable *)cork->dst;
1278         struct iphdr *iph;
1279         __be16 df = 0;
1280         __u8 ttl;
1281
1282         if ((skb = __skb_dequeue(queue)) == NULL)
1283                 goto out;
1284         tail_skb = &(skb_shinfo(skb)->frag_list);
1285
1286         /* move skb->data to ip header from ext header */
1287         if (skb->data < skb_network_header(skb))
1288                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Strip the IP header room from every follower, link it behind
              * the head, account its size in the head skb and detach it from
              * the socket (destructor and sk are cleared).
              */
1289         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1290                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1291                 *tail_skb = tmp_skb;
1292                 tail_skb = &(tmp_skb->next);
1293                 skb->len += tmp_skb->len;
1294                 skb->data_len += tmp_skb->len;
1295                 skb->truesize += tmp_skb->truesize;
1296                 tmp_skb->destructor = NULL;
1297                 tmp_skb->sk = NULL;
1298         }
1299
1300         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1301          * the frame generated here to be fragmented. No matter how
1302          * transforms change the size of the packet, it will go out.
1303          */
1304         if (inet->pmtudisc < IP_PMTUDISC_DO)
1305                 skb->local_df = 1;
1306
1307         /* DF bit is set when we want to see DF on outgoing frames.
1308          * If local_df is set too, we still allow to fragment this frame
1309          * locally. */
1310         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1311             (skb->len <= dst_mtu(&rt->dst) &&
1312              ip_dont_fragment(sk, &rt->dst)))
1313                 df = htons(IP_DF);
1314
1315         if (cork->flags & IPCORK_OPT)
1316                 opt = cork->opt;
1317
1318         if (rt->rt_type == RTN_MULTICAST)
1319                 ttl = inet->mc_ttl;
1320         else
1321                 ttl = ip_select_ttl(inet, &rt->dst);
1322
             /* Build the IP header in place at the start of the head skb. */
1323         iph = (struct iphdr *)skb->data;
1324         iph->version = 4;
1325         iph->ihl = 5;
1326         if (opt) {
                     /* optlen is in bytes, ihl counts 32-bit words. */
1327                 iph->ihl += opt->optlen>>2;
1328                 ip_options_build(skb, opt, cork->addr, rt, 0);
1329         }
1330         iph->tos = inet->tos;
1331         iph->frag_off = df;
1332         ip_select_ident(iph, &rt->dst, sk);
1333         iph->ttl = ttl;
1334         iph->protocol = sk->sk_protocol;
1335         iph->saddr = rt->rt_src;
1336         iph->daddr = rt->rt_dst;
1337
1338         skb->priority = sk->sk_priority;
1339         skb->mark = sk->sk_mark;
1340         /*
1341          * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1342          * on dst refcount
1343          */
1344         cork->dst = NULL;
1345         skb_dst_set(skb, &rt->dst);
1346
1347         if (iph->protocol == IPPROTO_ICMP)
1348                 icmp_out_count(net, ((struct icmphdr *)
1349                         skb_transport_header(skb))->type);
1350
             /* cork->dst is already NULL here, so this only frees the options. */
1351         ip_cork_release(cork);
1352 out:
1353         return skb;
1354 }
1355
1356 int ip_send_skb(struct sk_buff *skb)
1357 {
1358         struct net *net = sock_net(skb->sk);
1359         int err;
1360
1361         err = ip_local_out(skb);
1362         if (err) {
1363                 if (err > 0)
1364                         err = net_xmit_errno(err);
1365                 if (err)
1366                         IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1367         }
1368
1369         return err;
1370 }
1371
/*
 * Turn whatever is pending on @sk into a single skb and send it.
 * Returns 0 when nothing was pending, otherwise the result of
 * ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pending = ip_finish_skb(sk);

        /* Netfilter gets whole the not fragmented skb. */
        return pending ? ip_send_skb(pending) : 0;
}
1383
1384 /*
1385  *      Throw away all pending data on the socket.
1386  */
1387 static void __ip_flush_pending_frames(struct sock *sk,
1388                                       struct sk_buff_head *queue,
1389                                       struct inet_cork *cork)
1390 {
1391         struct sk_buff *skb;
1392
1393         while ((skb = __skb_dequeue_tail(queue)) != NULL)
1394                 kfree_skb(skb);
1395
1396         ip_cork_release(cork);
1397 }
1398
1399 void ip_flush_pending_frames(struct sock *sk)
1400 {
1401         __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1402 }
1403
1404 struct sk_buff *ip_make_skb(struct sock *sk,
1405                             int getfrag(void *from, char *to, int offset,
1406                                         int len, int odd, struct sk_buff *skb),
1407                             void *from, int length, int transhdrlen,
1408                             struct ipcm_cookie *ipc, struct rtable **rtp,
1409                             unsigned int flags)
1410 {
1411         struct inet_cork cork = {};
1412         struct sk_buff_head queue;
1413         int err;
1414
1415         if (flags & MSG_PROBE)
1416                 return NULL;
1417
1418         __skb_queue_head_init(&queue);
1419
1420         err = ip_setup_cork(sk, &cork, ipc, rtp);
1421         if (err)
1422                 return ERR_PTR(err);
1423
1424         err = __ip_append_data(sk, &queue, &cork, getfrag,
1425                                from, length, transhdrlen, flags);
1426         if (err) {
1427                 __ip_flush_pending_frames(sk, &queue, &cork);
1428                 return ERR_PTR(err);
1429         }
1430
1431         return __ip_make_skb(sk, &queue, &cork);
1432 }
1433
1434 /*
1435  *      Fetch data from kernel space and fill in checksum if needed.
1436  */
1437 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1438                               int len, int odd, struct sk_buff *skb)
1439 {
1440         __wsum csum;
1441
1442         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1443         skb->csum = csum_block_add(skb->csum, csum, odd);
1444         return 0;
1445 }
1446
1447 /*
1448  *      Generic function to send a packet as reply to another packet.
1449  *      Used to send TCP resets so far. ICMP should use this function too.
1450  *
1451  *      Should run single threaded per socket because it uses the sock
1452  *      structure to pass arguments.
      *
      *      @sk:  socket used as the carrier; its tos, sk_priority,
      *            sk_protocol and sk_bound_dev_if are overwritten here
      *      @skb: the packet being replied to
      *      @arg: reply payload (arg->iov), checksum data and bound device
      *      @len: number of payload bytes to take from arg->iov->iov_base
      *
      *      Nothing is reported to the caller: the function returns silently
      *      if the options cannot be echoed or no route is found.
1453  */
1454 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1455                    unsigned int len)
1456 {
1457         struct inet_sock *inet = inet_sk(sk);
1458         struct ip_options_data replyopts;
1459         struct ipcm_cookie ipc;
1460         __be32 daddr;
1461         struct rtable *rt = skb_rtable(skb);
1462
1463         if (ip_options_echo(&replyopts.opt.opt, skb))
1464                 return;
1465
             /* Reply goes back to the source recorded on the incoming route. */
1466         daddr = ipc.addr = rt->rt_src;
1467         ipc.opt = NULL;
1468         ipc.tx_flags = 0;
1469
1470         if (replyopts.opt.opt.optlen) {
1471                 ipc.opt = &replyopts.opt;
1472
                     /* With source routing, route towards the first hop instead. */
1473                 if (replyopts.opt.opt.srr)
1474                         daddr = replyopts.opt.opt.faddr;
1475         }
1476
1477         {
1478                 struct flowi4 fl4;
1479
                     /* Ports are swapped: the reply's destination port is the request's source. */
1480                 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1481                                    RT_TOS(ip_hdr(skb)->tos),
1482                                    RT_SCOPE_UNIVERSE, sk->sk_protocol,
1483                                    ip_reply_arg_flowi_flags(arg),
1484                                    daddr, rt->rt_spec_dst,
1485                                    tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1486                 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1487                 rt = ip_route_output_key(sock_net(sk), &fl4);
1488                 if (IS_ERR(rt))
1489                         return;
1490         }
1491
1492         /* And let IP do all the hard work.
1493
1494            This chunk is not reentrant, hence spinlock.
1495            Note that it relies on the fact that this function is called
1496            with locally disabled BH and that sk cannot be already spinlocked.
1497          */
1498         bh_lock_sock(sk);
1499         inet->tos = ip_hdr(skb)->tos;
1500         sk->sk_priority = skb->priority;
1501         sk->sk_protocol = ip_hdr(skb)->protocol;
1502         sk->sk_bound_dev_if = arg->bound_dev_if;
             /*
              * The return value is ignored; success is detected below by
              * looking for a queued skb.
              */
1503         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1504                        &ipc, &rt, MSG_DONTWAIT);
1505         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                     /* Patch the transport checksum, counted in 16-bit words from the transport header. */
1506                 if (arg->csumoffset >= 0)
1507                         *((__sum16 *)skb_transport_header(skb) +
1508                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1509                                                                 arg->csum));
1510                 skb->ip_summed = CHECKSUM_NONE;
1511                 ip_push_pending_frames(sk);
1512         }
1513
1514         bh_unlock_sock(sk);
1515
             /*
              * ip_setup_cork() sets rt to NULL when it takes over the route
              * reference, so this only drops a reference that was not
              * consumed.  NOTE(review): presumably ip_rt_put() tolerates
              * NULL -- confirm.
              */
1516         ip_rt_put(rt);
1517 }
1518
1519 void __init ip_init(void)
1520 {
1521         ip_rt_init();
1522         inet_initpeers();
1523
1524 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1525         igmp_mc_proc_init();
1526 #endif
1527 }