/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
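
/*
 * The local-out path runs in two stages: __ip_local_out() finalizes the
 * IP header (total length, checksum) and runs the NF_INET_LOCAL_OUT hook,
 * while ip_local_out() continues to dst_output() only when the hook
 * verdict is 1, i.e. the packet was accepted and not stolen or queued
 * by netfilter.
 */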
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}
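
/*
 * Note: this entry point is used by transports that already have a
 * routed skb with the transport header in place, e.g. the TCP SYN-ACK
 * path.
 */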
/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
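
/*
 * Final output step: make sure there is enough headroom for the link
 * layer header, then emit the packet through the cached hardware header
 * (dst->hh) or hand it to neighbour resolution.
 */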
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
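
/*
 * Decide between plain transmission and IP fragmentation.  GSO packets
 * pass through untouched; the device or the GSO layer segments them
 * later.
 */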
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
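
/*
 * dst_output() callback for unicast routes: account the packet and run
 * the NF_INET_POST_ROUTING hook, unless the packet was rerouted after
 * NAT and the hook has already been traversed (IPSKB_REROUTED).
 */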
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
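
/*
 * Main transmit routine for TCP and other connected sockets: route the
 * packet (or reuse the route cached on the socket), build the IP header
 * in front of the already-built transport header, and hand the result
 * to ip_local_out().
 */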
int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
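
/* Copy the per-packet metadata that every IP fragment must inherit. */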
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; this is not prohibited. In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when we see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each fragment
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
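
/*
 * Default getfrag() callback for ip_append_data(): copies user iovec
 * data and, unless the hardware will checksum (CHECKSUM_PARTIAL),
 * accumulates the checksum on the fly.
 */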
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
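
/*
 * Core of the corking engine: append 'length' bytes obtained through
 * getfrag() to the pending queue, growing the tail skb or allocating
 * new, fragment-sized skbs as needed.
 */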
static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	exthdrlen = transhdrlen ? rt->dst.header_len : 0;
	length += exthdrlen;
	transhdrlen += exthdrlen;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	skb = skb_peek_tail(queue);

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each segment is an IP fragment ready for sending to the network
	 * after adding the appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap) {
				alloclen += rt->dst.trailer_len;
				/* make sure mtu is not reached */
				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
					datalen -= ALIGN(rt->dst.trailer_len, 8);
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
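
/* Capture options, route and fragment size for a corked sequence. */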
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
				from, length, transhdrlen, flags);
}
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = (struct rtable *)inet->cork.dst;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {
			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}
/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
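
/*
 * Note: positive NET_XMIT return codes are remapped here; congestion
 * notification (NET_XMIT_CN) is not reported to the caller as an error.
 */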
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}
/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
}
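
/*
 * Build a complete datagram on a private queue, without touching the
 * socket's cork state; used by the lockless UDP transmit path.
 */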
struct sk_buff *ip_make_skb(struct sock *sk,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork = {};
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, &queue, &cork);
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	{
		struct flowi4 fl4;

		flowi4_init_output(&fl4, arg->bound_dev_if, 0,
				   RT_TOS(ip_hdr(skb)->tos),
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   ip_reply_arg_flowi_flags(arg),
				   daddr, rt->rt_spec_dst,
				   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
		security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
		rt = ip_route_output_key(sock_net(sk), &fl4);
		if (IS_ERR(rt))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not re-entrant, hence the spinlock.
	   Note that it uses the fact that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}