net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Donald Becker, <becker@super.org>
  11  *              Alan Cox, <Alan.Cox@linux.org>
  12  *              Richard Underwood
  13  *              Stefan Becker, <stefanb@yello.ping.de>
  14  *              Jorge Cwik, <jorge@laser.satlink.net>
  15  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  16  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  17  *
  18  *      See ip_input.c for original log
  19  *
  20  *      Fixes:
  21  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  22  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  23  *              Bradford Johnson:       Fix faulty handling of some frames when
  24  *                                      no route is found.
  25  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  26  *                                      (in case if packet not accepted by
  27  *                                      output firewall rules)
  28  *              Mike McLagan    :       Routing by source
  29  *              Alexey Kuznetsov:       use new route cache
  30  *              Andi Kleen:             Fix broken PMTU recovery and remove
  31  *                                      some redundant tests.
  32  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  33  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  34  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  35  *                                      for decreased register pressure on x86
  36  *                                      and more readability.
  37  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  38  *                                      silently drop skb instead of failing with -EPERM.
  39  *              Detlev Wengorz  :       Copy protocol for fragments.
  40  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  41  *                                      datagrams.
  42  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  43  */
  44
  45 #include <asm/uaccess.h>
  46 #include <asm/system.h>
  47 #include <linux/module.h>
  48 #include <linux/types.h>
  49 #include <linux/kernel.h>
  50 #include <linux/mm.h>
  51 #include <linux/string.h>
  52 #include <linux/errno.h>
  53 #include <linux/highmem.h>
  54 #include <linux/slab.h>
  55
  56 #include <linux/socket.h>
  57 #include <linux/sockios.h>
  58 #include <linux/in.h>
  59 #include <linux/inet.h>
  60 #include <linux/netdevice.h>
  61 #include <linux/etherdevice.h>
  62 #include <linux/proc_fs.h>
  63 #include <linux/stat.h>
  64 #include <linux/init.h>
  65
  66 #include <net/snmp.h>
  67 #include <net/ip.h>
  68 #include <net/protocol.h>
  69 #include <net/route.h>
  70 #include <net/xfrm.h>
  71 #include <linux/skbuff.h>
  72 #include <net/sock.h>
  73 #include <net/arp.h>
  74 #include <net/icmp.h>
  75 #include <net/checksum.h>
  76 #include <net/inetpeer.h>
  77 #include <linux/igmp.h>
  78 #include <linux/netfilter_ipv4.h>
  79 #include <linux/netfilter_bridge.h>
  80 #include <linux/mroute.h>
  81 #include <linux/netlink.h>
  82 #include <linux/tcp.h>
  83
  /*
   * TTL default, initialised to IPDEFTTL and exported for use by modules.
   * NOTE(review): presumably the value behind the ip_default_ttl sysctl;
   * nothing in the visible part of this file reads it - confirm.
   */
  84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
  85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
  86
  87 /* Generate a checksum for an outgoing IP datagram. */
  88 __inline__ void ip_send_check(struct iphdr *iph)
  89 {
  90         iph->check = 0;
  91         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
  92 }
  93 EXPORT_SYMBOL(ip_send_check);
  94
  95 int __ip_local_out(struct sk_buff *skb)
  96 {
  97         struct iphdr *iph = ip_hdr(skb);
  98
  99         iph->tot_len = htons(skb->len);
 100         ip_send_check(iph);
 101         return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
 102                        skb_dst(skb)->dev, dst_output);
 103 }
 104
 /*
  * Send a locally generated packet.
  *
  * __ip_local_out() is run first; only when it returns exactly 1 is
  * dst_output() called here, and its result returned.  Any other value
  * is passed straight back to the caller.
  */
 int ip_local_out(struct sk_buff *skb)
 {
         int rc = __ip_local_out(skb);

         if (unlikely(rc != 1))
                 return rc;

         return dst_output(skb);
 }
 EXPORT_SYMBOL_GPL(ip_local_out);
 116
 117 /* dev_loopback_xmit for use with netfilter. */
 118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 119 {
 120         skb_reset_mac_header(newskb);
 121         __skb_pull(newskb, skb_network_offset(newskb));
 122         newskb->pkt_type = PACKET_LOOPBACK;
 123         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 124         WARN_ON(!skb_dst(newskb));
 125         netif_rx_ni(newskb);
 126         return 0;
 127 }
 128
 129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 130 {
 131         int ttl = inet->uc_ttl;
 132
 133         if (ttl < 0)
 134                 ttl = ip4_dst_hoplimit(dst);
 135         return ttl;
 136 }
 137
 138 /*
 139  *              Add an ip header to a skbuff and send it out.
 140  *
 141  */
 142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 143                           __be32 saddr, __be32 daddr, struct ip_options *opt)
 144 {
 145         struct inet_sock *inet = inet_sk(sk);
 146         struct rtable *rt = skb_rtable(skb);
 147         struct iphdr *iph;
 148
 149         /* Build the IP header. */
 150         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 151         skb_reset_network_header(skb);
 152         iph = ip_hdr(skb);
 153         iph->version  = 4;
 154         iph->ihl      = 5;
 155         iph->tos      = inet->tos;
 156         if (ip_dont_fragment(sk, &rt->dst))
 157                 iph->frag_off = htons(IP_DF);
 158         else
 159                 iph->frag_off = 0;
 160         iph->ttl      = ip_select_ttl(inet, &rt->dst);
 161         iph->daddr    = rt->rt_dst;
 162         iph->saddr    = rt->rt_src;
 163         iph->protocol = sk->sk_protocol;
 164         ip_select_ident(iph, &rt->dst, sk);
 165
 166         if (opt && opt->optlen) {
 167                 iph->ihl += opt->optlen>>2;
 168                 ip_options_build(skb, opt, daddr, rt, 0);
 169         }
 170
 171         skb->priority = sk->sk_priority;
 172         skb->mark = sk->sk_mark;
 173
 174         /* Send it out. */
 175         return ip_local_out(skb);
 176 }
 177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 178
 179 static inline int ip_finish_output2(struct sk_buff *skb)
 180 {
 181         struct dst_entry *dst = skb_dst(skb);
 182         struct rtable *rt = (struct rtable *)dst;
 183         struct net_device *dev = dst->dev;
 184         unsigned int hh_len = LL_RESERVED_SPACE(dev);
 185
 186         if (rt->rt_type == RTN_MULTICAST) {
 187                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
 188         } else if (rt->rt_type == RTN_BROADCAST)
 189                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
 190
 191         /* Be paranoid, rather than too clever. */
 192         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 193                 struct sk_buff *skb2;
 194
 195                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 196                 if (skb2 == NULL) {
 197                         kfree_skb(skb);
 198                         return -ENOMEM;
 199                 }
 200                 if (skb->sk)
 201                         skb_set_owner_w(skb2, skb->sk);
 202                 kfree_skb(skb);
 203                 skb = skb2;
 204         }
 205
 206         if (dst->hh)
 207                 return neigh_hh_output(dst->hh, skb);
 208         else if (dst->neighbour)
 209                 return dst->neighbour->output(skb);
 210
 211         if (net_ratelimit())
 212                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 213         kfree_skb(skb);
 214         return -EINVAL;
 215 }
 216
 217 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 218 {
 219         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 220
 221         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
 222                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 223 }
 224
 225 static int ip_finish_output(struct sk_buff *skb)
 226 {
 227 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 228         /* Policy lookup after SNAT yielded a new policy */
 229         if (skb_dst(skb)->xfrm != NULL) {
 230                 IPCB(skb)->flags |= IPSKB_REROUTED;
 231                 return dst_output(skb);
 232         }
 233 #endif
 234         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
 235                 return ip_fragment(skb, ip_finish_output2);
 236         else
 237                 return ip_finish_output2(skb);
 238 }
 239
 /*
  * Output routine that also loops packets back to the local stack.
  *
  * Updates the OUT statistics, sets skb->dev and skb->protocol, then:
  *  - if the route is flagged RTCF_MULTICAST and sk_mc_loop(sk) allows
  *    it (plus, with CONFIG_IP_MROUTE, the local/forwarded check below),
  *    a clone is sent through POST_ROUTING to ip_dev_loopback_xmit();
  *    a multicast packet with TTL 0 is then freed and 0 is returned;
  *  - if the route is flagged RTCF_BROADCAST, a clone is looped back the
  *    same way.
  * Finally the original skb goes through the POST_ROUTING hook to
  * ip_finish_output(); the hook is bypassed when IPSKB_REROUTED is set.
  * A failed skb_clone() silently skips the loopback copy.
  */
 240 int ip_mc_output(struct sk_buff *skb)
 241 {
 242         struct sock *sk = skb->sk;
 243         struct rtable *rt = skb_rtable(skb);
 244         struct net_device *dev = rt->dst.dev;
 245
 246         /*
 247          *      If the indicated interface is up and running, send the packet.
 248          */
 249         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 250
 251         skb->dev = dev;
 252         skb->protocol = htons(ETH_P_IP);
 253
 254         /*
 255          *      Multicasts are looped back for other local users
 256          */
 257
 258         if (rt->rt_flags&RTCF_MULTICAST) {
 259                 if (sk_mc_loop(sk)
 260 #ifdef CONFIG_IP_MROUTE
 261                 /* Small optimization: do not loopback not local frames,
 262                    which returned after forwarding; they will be  dropped
 263                    by ip_mr_input in any case.
 264                    Note, that local frames are looped back to be delivered
 265                    to local recipients.
 266
 267                    This check is duplicated in ip_mr_input at the moment.
 268                  */
 269                     &&
 270                     ((rt->rt_flags & RTCF_LOCAL) ||
 271                      !(IPCB(skb)->flags & IPSKB_FORWARDED))
 272 #endif
 273                    ) {
 274                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 275                         if (newskb)
 276                                 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 277                                         newskb, NULL, newskb->dev,
 278                                         ip_dev_loopback_xmit);
 279                 }
 280
 281                 /* Multicasts with ttl 0 must not go beyond the host */
 282
 283                 if (ip_hdr(skb)->ttl == 0) {
 284                         kfree_skb(skb);
 285                         return 0;
 286                 }
 287         }
 288
             /* Broadcast routes get an unconditional loopback clone. */
 289         if (rt->rt_flags&RTCF_BROADCAST) {
 290                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 291                 if (newskb)
 292                         NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
 293                                 NULL, newskb->dev, ip_dev_loopback_xmit);
 294         }
 295
 296         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
 297                             skb->dev, ip_finish_output,
 298                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 299 }
 300
 301 int ip_output(struct sk_buff *skb)
 302 {
 303         struct net_device *dev = skb_dst(skb)->dev;
 304
 305         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 306
 307         skb->dev = dev;
 308         skb->protocol = htons(ETH_P_IP);
 309
 310         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
 311                             ip_finish_output,
 312                             !(IPCB(skb)->flags & IPSKB_REROUTED));
 313 }
 314
 /*
  * Route a socket-owned packet (unless it already carries a route),
  * prepend an IPv4 header built from the socket and the route, and send
  * it with ip_local_out().
  *
  * The route lookup, header construction and transmit all happen inside
  * one rcu_read_lock() section.
  * NOTE(review): presumably required because skb_dst_set_noref() attaches
  * the route without taking a reference - confirm against skbuff.h.
  *
  * Returns the result of ip_local_out().  When no route can be found, or
  * strict source routing is requested and the route's destination differs
  * from its gateway, the skb is freed, IPSTATS_MIB_OUTNOROUTES is
  * incremented and -EHOSTUNREACH is returned.
  */
 315 int ip_queue_xmit(struct sk_buff *skb)
 316 {
 317         struct sock *sk = skb->sk;
 318         struct inet_sock *inet = inet_sk(sk);
 319         struct ip_options *opt = inet->opt;
 320         struct rtable *rt;
 321         struct iphdr *iph;
 322         int res;
 323
 324         /* Skip all of this if the packet is already routed,
 325          * f.e. by something like SCTP.
 326          */
 327         rcu_read_lock();
 328         rt = skb_rtable(skb);
 329         if (rt != NULL)
 330                 goto packet_routed;
 331
 332         /* Make sure we can route this packet. */
 333         rt = (struct rtable *)__sk_dst_check(sk, 0);
 334         if (rt == NULL) {
 335                 __be32 daddr;
 336
 337                 /* Use correct destination address if we have options. */
 338                 daddr = inet->inet_daddr;
 339                 if(opt && opt->srr)
 340                         daddr = opt->faddr;
 341
 342                 /* If this fails, retransmit mechanism of transport layer will
 343                  * keep trying until route appears or the connection times
 344                  * itself out.
 345                  */
 346                 rt = ip_route_output_ports(sock_net(sk), sk,
 347                                            daddr, inet->inet_saddr,
 348                                            inet->inet_dport,
 349                                            inet->inet_sport,
 350                                            sk->sk_protocol,
 351                                            RT_CONN_FLAGS(sk),
 352                                            sk->sk_bound_dev_if);
 353                 if (IS_ERR(rt))
 354                         goto no_route;
 355                 sk_setup_caps(sk, &rt->dst);
 356         }
 357         skb_dst_set_noref(skb, &rt->dst);
 358
 359 packet_routed:
             /* Strict source route: the destination must be the next hop itself. */
 360         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 361                 goto no_route;
 362
 363         /* OK, we know where to send it, allocate and build IP header. */
 364         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 365         skb_reset_network_header(skb);
 366         iph = ip_hdr(skb);
             /* One 16-bit store sets version 4, ihl 5 and the TOS byte. */
 367         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 368         if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
 369                 iph->frag_off = htons(IP_DF);
 370         else
 371                 iph->frag_off = 0;
 372         iph->ttl      = ip_select_ttl(inet, &rt->dst);
 373         iph->protocol = sk->sk_protocol;
 374         iph->saddr    = rt->rt_src;
 375         iph->daddr    = rt->rt_dst;
 376         /* Transport layer set skb->h.foo itself. */
 377
 378         if (opt && opt->optlen) {
 379                 iph->ihl += opt->optlen >> 2;
 380                 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
 381         }
 382
             /* The extra count passed is gso_segs - 1 (0 when gso_segs is unset). */
 383         ip_select_ident_more(iph, &rt->dst, sk,
 384                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 385
 386         skb->priority = sk->sk_priority;
 387         skb->mark = sk->sk_mark;
 388
 389         res = ip_local_out(skb);
 390         rcu_read_unlock();
 391         return res;
 392
 393 no_route:
 394         rcu_read_unlock();
 395         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 396         kfree_skb(skb);
 397         return -EHOSTUNREACH;
 398 }
 399 EXPORT_SYMBOL(ip_queue_xmit);
 400
 401
 402 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 403 {
 404         to->pkt_type = from->pkt_type;
 405         to->priority = from->priority;
 406         to->protocol = from->protocol;
 407         skb_dst_drop(to);
 408         skb_dst_copy(to, from);
 409         to->dev = from->dev;
 410         to->mark = from->mark;
 411
 412         /* Copy the flags to each fragment. */
 413         IPCB(to)->flags = IPCB(from)->flags;
 414
 415 #ifdef CONFIG_NET_SCHED
 416         to->tc_index = from->tc_index;
 417 #endif
 418         nf_copy(to, from);
 419 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 420     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 421         to->nf_trace = from->nf_trace;
 422 #endif
 423 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 424         to->ipvs_property = from->ipvs_property;
 425 #endif
 426         skb_copy_secmark(to, from);
 427 }
 428
 429 /*
 430  *      This IP datagram is too large to be sent in one piece.  Break it up into
 431  *      smaller pieces (each of size equal to IP header plus
 432  *      a block of the data of the original IP data part) that will yet fit in a
 433  *      single device frame, and queue such a frame for sending.
      *
      *      @skb:    datagram to fragment; its route supplies the device and MTU.
      *      @output: routine each finished fragment is handed to; a non-zero
      *               return stops fragmentation and is returned to the caller.
      *
      *      Two strategies are used.  If the skb already has a frag_list whose
      *      geometry fits (sizes within the MTU, 8-byte aligned, enough headroom,
      *      nothing shared or cloned), each list member is turned into a fragment
      *      in place (fast path).  Otherwise every fragment is a freshly allocated
      *      skb into which header and payload are copied (slow path).
      *
      *      Returns 0 on success, -EMSGSIZE when DF is set and local_df is not
      *      (an ICMP "fragmentation needed" is sent first), -ENOMEM when a
      *      fragment cannot be allocated, or the error returned by @output.
      *      FRAGOKS / FRAGFAILS / FRAGCREATES statistics are updated accordingly.
 434  */
 435
 436 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 437 {
 438         struct iphdr *iph;
 439         int ptr;
 440         struct net_device *dev;
 441         struct sk_buff *skb2;
 442         unsigned int mtu, hlen, left, len, ll_rs;
 443         int offset;
 444         __be16 not_last_frag;
 445         struct rtable *rt = skb_rtable(skb);
 446         int err = 0;
 447
 448         dev = rt->dst.dev;
 449
 450         /*
 451          *      Point into the IP datagram header.
 452          */
 453
 454         iph = ip_hdr(skb);
 455
             /* DF set and not overridden locally: refuse, and report the MTU. */
 456         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 457                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 458                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 459                           htonl(ip_skb_dst_mtu(skb)));
 460                 kfree_skb(skb);
 461                 return -EMSGSIZE;
 462         }
 463
 464         /*
 465          *      Setup starting values.
 466          */
 467
 468         hlen = iph->ihl * 4;
 469         mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
 470 #ifdef CONFIG_BRIDGE_NETFILTER
 471         if (skb->nf_bridge)
 472                 mtu -= nf_bridge_mtu_reduction(skb);
 473 #endif
 474         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 475
 476         /* When frag_list is given, use it. First, check its validity:
 477          * some transformers could create wrong frag_list or break existing
 478          * one, it is not prohibited. In this case fall back to copying.
 479          *
 480          * LATER: this step can be merged to real generation of fragments,
 481          * we can switch to copy when see the first bad fragment.
 482          */
 483         if (skb_has_frag_list(skb)) {
 484                 struct sk_buff *frag, *frag2;
 485                 int first_len = skb_pagelen(skb);
 486
                     /*
                      * The head must itself be a valid first fragment: payload
                      * within the MTU and a multiple of 8, not already a
                      * fragment, and not cloned.
                      */
 487                 if (first_len - hlen > mtu ||
 488                     ((first_len - hlen) & 7) ||
 489                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 490                     skb_cloned(skb))
 491                         goto slow_path;
 492
 493                 skb_walk_frags(skb, frag) {
 494                         /* Correct geometry. */
 495                         if (frag->len > mtu ||
 496                             ((frag->len & 7) && frag->next) ||
 497                             skb_headroom(frag) < hlen)
 498                                 goto slow_path_clean;
 499
 500                         /* Partially cloned skb? */
 501                         if (skb_shared(frag))
 502                                 goto slow_path_clean;
 503
                             /* Hand socket ownership and accounting to the fragment. */
 504                         BUG_ON(frag->sk);
 505                         if (skb->sk) {
 506                                 frag->sk = skb->sk;
 507                                 frag->destructor = sock_wfree;
 508                         }
 509                         skb->truesize -= frag->truesize;
 510                 }
 511
 512                 /* Everything is OK. Generate! */
 513
 514                 err = 0;
 515                 offset = 0;
 516                 frag = skb_shinfo(skb)->frag_list;
 517                 skb_frag_list_init(skb);
 518                 skb->data_len = first_len - skb_headlen(skb);
 519                 skb->len = first_len;
 520                 iph->tot_len = htons(first_len);
 521                 iph->frag_off = htons(IP_MF);
 522                 ip_send_check(iph);
 523
 524                 for (;;) {
 525                         /* Prepare header of the next frame,
 526                          * before previous one went down. */
 527                         if (frag) {
 528                                 frag->ip_summed = CHECKSUM_NONE;
 529                                 skb_reset_transport_header(frag);
 530                                 __skb_push(frag, hlen);
 531                                 skb_reset_network_header(frag);
 532                                 memcpy(skb_network_header(frag), iph, hlen);
 533                                 iph = ip_hdr(frag);
 534                                 iph->tot_len = htons(frag->len);
 535                                 ip_copy_metadata(frag, skb);
                                     /* Options are rewritten once, on the second frame. */
 536                                 if (offset == 0)
 537                                         ip_options_fragment(frag);
 538                                 offset += skb->len - hlen;
 539                                 iph->frag_off = htons(offset>>3);
 540                                 if (frag->next != NULL)
 541                                         iph->frag_off |= htons(IP_MF);
 542                                 /* Ready, complete checksum */
 543                                 ip_send_check(iph);
 544                         }
 545
 546                         err = output(skb);
 547
 548                         if (!err)
 549                                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 550                         if (err || !frag)
 551                                 break;
 552
 553                         skb = frag;
 554                         frag = skb->next;
 555                         skb->next = NULL;
 556                 }
 557
 558                 if (err == 0) {
 559                         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 560                         return 0;
 561                 }
 562
                     /* output() failed: free the fragments never handed to it. */
 563                 while (frag) {
 564                         skb = frag->next;
 565                         kfree_skb(frag);
 566                         frag = skb;
 567                 }
 568                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 569                 return err;
 570
 571 slow_path_clean:
                     /*
                      * Undo the ownership and truesize changes made to the frags
                      * visited before the one that failed validation.
                      */
 572                 skb_walk_frags(skb, frag2) {
 573                         if (frag2 == frag)
 574                                 break;
 575                         frag2->sk = NULL;
 576                         frag2->destructor = NULL;
 577                         skb->truesize += frag2->truesize;
 578                 }
 579         }
 580
 581 slow_path:
 582         left = skb->len - hlen;         /* Payload bytes still to be sent */
 583         ptr = hlen;             /* Where to start from */
 584
 585         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 586          * we need to make room for the encapsulating header
 587          */
 588         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
 589
 590         /*
 591          *      Fragment the datagram.
 592          */
 593
 594         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 595         not_last_frag = iph->frag_off & htons(IP_MF);
 596
 597         /*
 598          *      Keep copying data until we run out.
 599          */
 600
 601         while (left > 0) {
 602                 len = left;
 603                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 604                 if (len > mtu)
 605                         len = mtu;
 606                 /* IF: we are not sending up to and including the packet end
 607                    then align the next start on an eight byte boundary */
 608                 if (len < left) {
 609                         len &= ~7;
 610                 }
 611                 /*
 612                  *      Allocate buffer.
 613                  */
 614
 615                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 616                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 617                         err = -ENOMEM;
 618                         goto fail;
 619                 }
 620
 621                 /*
 622                  *      Set up data on packet
 623                  */
 624
 625                 ip_copy_metadata(skb2, skb);
 626                 skb_reserve(skb2, ll_rs);
 627                 skb_put(skb2, len + hlen);
 628                 skb_reset_network_header(skb2);
 629                 skb2->transport_header = skb2->network_header + hlen;
 630
 631                 /*
 632                  *      Charge the memory for the fragment to any owner
 633                  *      it might possess
 634                  */
 635
 636                 if (skb->sk)
 637                         skb_set_owner_w(skb2, skb->sk);
 638
 639                 /*
 640                  *      Copy the packet header into the new buffer.
 641                  */
 642
 643                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
 644
 645                 /*
 646                  *      Copy a block of the IP datagram.
 647                  */
 648                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
 649                         BUG();
 650                 left -= len;
 651
 652                 /*
 653                  *      Fill in the new header fields.
 654                  */
 655                 iph = ip_hdr(skb2);
 656                 iph->frag_off = htons((offset >> 3));
 657
 658                 /* ANK: dirty, but effective trick. Upgrade options only if
 659                  * the segment to be fragmented was THE FIRST (otherwise,
 660                  * options are already fixed) and make it ONCE
 661                  * on the initial skb, so that all the following fragments
 662                  * will inherit fixed options.
 663                  */
 664                 if (offset == 0)
 665                         ip_options_fragment(skb);
 666
 667                 /*
 668                  *      Added AC : If we are fragmenting a fragment that's not the
 669                  *                 last fragment then keep MF on each bit
 670                  */
 671                 if (left > 0 || not_last_frag)
 672                         iph->frag_off |= htons(IP_MF);
 673                 ptr += len;
 674                 offset += len;
 675
 676                 /*
 677                  *      Put this fragment into the sending queue.
 678                  */
 679                 iph->tot_len = htons(len + hlen);
 680
 681                 ip_send_check(iph);
 682
 683                 err = output(skb2);
 684                 if (err)
 685                         goto fail;
 686
 687                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 688         }
             /* All payload has been copied out; the original is no longer needed. */
 689         kfree_skb(skb);
 690         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 691         return err;
 692
 693 fail:
 694         kfree_skb(skb);
 695         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 696         return err;
 697 }
 698 EXPORT_SYMBOL(ip_fragment);
 699
 700 int
 701 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 702 {
 703         struct iovec *iov = from;
 704
 705         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 706                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 707                         return -EFAULT;
 708         } else {
 709                 __wsum csum = 0;
 710                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 711                         return -EFAULT;
 712                 skb->csum = csum_block_add(skb->csum, csum, odd);
 713         }
 714         return 0;
 715 }
 716 EXPORT_SYMBOL(ip_generic_getfrag);
 717
 718 static inline __wsum
 719 csum_page(struct page *page, int offset, int copy)
 720 {
 721         char *kaddr;
 722         __wsum csum;
 723         kaddr = kmap(page);
 724         csum = csum_partial(kaddr + offset, copy, 0);
 725         kunmap(page);
 726         return csum;
 727 }
 728
 729 static inline int ip_ufo_append_data(struct sock *sk,
 730                         struct sk_buff_head *queue,
 731                         int getfrag(void *from, char *to, int offset, int len,
 732                                int odd, struct sk_buff *skb),
 733                         void *from, int length, int hh_len, int fragheaderlen,
 734                         int transhdrlen, int mtu, unsigned int flags)
 735 {
 736         struct sk_buff *skb;
 737         int err;
 738
 739         /* There is support for UDP fragmentation offload by network
 740          * device, so create one single skb packet containing complete
 741          * udp datagram
 742          */
 743         if ((skb = skb_peek_tail(queue)) == NULL) {
 744                 skb = sock_alloc_send_skb(sk,
 745                         hh_len + fragheaderlen + transhdrlen + 20,
 746                         (flags & MSG_DONTWAIT), &err);
 747
 748                 if (skb == NULL)
 749                         return err;
 750
 751                 /* reserve space for Hardware header */
 752                 skb_reserve(skb, hh_len);
 753
 754                 /* create space for UDP/IP header */
 755                 skb_put(skb, fragheaderlen + transhdrlen);
 756
 757                 /* initialize network header pointer */
 758                 skb_reset_network_header(skb);
 759
 760                 /* initialize protocol header pointer */
 761                 skb->transport_header = skb->network_header + fragheaderlen;
 762
 763                 skb->ip_summed = CHECKSUM_PARTIAL;
 764                 skb->csum = 0;
 765
 766                 /* specify the length of each IP datagram fragment */
 767                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 768                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 769                 __skb_queue_tail(queue, skb);
 770         }
 771
 772         return skb_append_datato_frags(sk, skb, getfrag, from,
 773                                        (length - transhdrlen));
 774 }
 775
 776 static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
 777                             struct inet_cork *cork,
 778                             int getfrag(void *from, char *to, int offset,
 779                                         int len, int odd, struct sk_buff *skb),
 780                             void *from, int length, int transhdrlen,
 781                             unsigned int flags)
 782 {
             /*
              * Append @length bytes of caller data to the skbs pending on @queue.
              *
              * @sk:          socket whose sk_wmem_alloc/sk_sndbuf govern allocation
              * @queue:       list of pending skbs; new skbs are added at its tail
              * @cork:        per-send state read here: route (dst), options (opt),
              *               fragment size (fragsize), running total (length),
              *               tx_flags and the partially filled page (page/off)
              * @getfrag:     callback that copies @len bytes of @from, starting at
              *               @offset, into @to; a negative return is reported to
              *               the caller as -EFAULT
              * @from:        opaque cookie handed to @getfrag
              * @length:      number of bytes to append
              * @transhdrlen: non-zero when this call starts the datagram; reset
              *               to 0 here once the first skb has been built
              * @flags:       MSG_* flags; MSG_MORE and MSG_DONTWAIT are examined
              *
              * Returns 0 on success or a negative errno.  Failures that reach the
              * "error" label subtract the bytes not yet consumed from cork->length
              * and bump IPSTATS_MIB_OUTDISCARDS; the early -EMSGSIZE return below
              * happens before cork->length is touched.
              */
 783         struct inet_sock *inet = inet_sk(sk);
 784         struct sk_buff *skb;
 785
 786         struct ip_options *opt = cork->opt;
 787         int hh_len;
 788         int exthdrlen;
 789         int mtu;
 790         int copy;
 791         int err;
 792         int offset = 0;
 793         unsigned int maxfraglen, fragheaderlen;
 794         int csummode = CHECKSUM_NONE;
 795         struct rtable *rt = (struct rtable *)cork->dst;
 796
             /*
              * The route's header_len is only accounted together with the
              * transport header, i.e. when transhdrlen is non-zero.
              */
 797         exthdrlen = transhdrlen ? rt->dst.header_len : 0;
 798         length += exthdrlen;
 799         transhdrlen += exthdrlen;
 800         mtu = cork->fragsize;
 801
 802         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 803
             /*
              * fragheaderlen: IP header plus options.  maxfraglen: largest
              * fragment whose payload is a multiple of 8 bytes (the "& ~7").
              */
 804         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 805         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 806
             /*
              * Refuse to grow the pending data beyond 0xFFFF minus the header
              * (presumably the 16-bit IP total-length limit).
              */
 807         if (cork->length + length > 0xFFFF - fragheaderlen) {
 808                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
 809                                mtu-exthdrlen);
 810                 return -EMSGSIZE;
 811         }
 812
 813         /*
 814          * transhdrlen > 0 means that this is the first fragment and we wish
 815          * it won't be fragmented in the future.
 816          */
 817         if (transhdrlen &&
 818             length + fragheaderlen <= mtu &&
 819             rt->dst.dev->features & NETIF_F_V4_CSUM &&
 820             !exthdrlen)
 821                 csummode = CHECKSUM_PARTIAL;
 822
 823         skb = skb_peek_tail(queue);
 824
             /*
              * Account the new bytes up front; the error path subtracts whatever
              * was not consumed.
              */
 825         cork->length += length;
             /*
              * UDP over a UFO-capable device: an oversized send, or a tail skb
              * that is already GSO, is handed to ip_ufo_append_data().
              */
 826         if (((length > mtu) || (skb && skb_is_gso(skb))) &&
 827             (sk->sk_protocol == IPPROTO_UDP) &&
 828             (rt->dst.dev->features & NETIF_F_UFO)) {
 829                 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
 830                                          hh_len, fragheaderlen, transhdrlen,
 831                                          mtu, flags);
 832                 if (err)
 833                         goto error;
 834                 return 0;
 835         }
 836
 837         /* So, what's going on in the loop below?
 838          *
 839          * We use calculated fragment length to generate chained skb,
 840          * each of segments is IP fragment ready for sending to network after
 841          * adding appropriate IP header.
 842          */
 843
             /* Empty queue: jump straight into the allocation branch of the loop. */
 844         if (!skb)
 845                 goto alloc_new_skb;
 846
 847         while (length > 0) {
 848                 /* Check if the remaining data fits into current packet. */
 849                 copy = mtu - skb->len;
                     /* Rest does not fit: fill only up to the 8-byte aligned maxfraglen. */
 850                 if (copy < length)
 851                         copy = maxfraglen - skb->len;
                     /* No room left in the tail skb: start a new one. */
 852                 if (copy <= 0) {
 853                         char *data;
 854                         unsigned int datalen;
 855                         unsigned int fraglen;
 856                         unsigned int fraggap;
 857                         unsigned int alloclen;
 858                         struct sk_buff *skb_prev;
 859 alloc_new_skb:
 860                         skb_prev = skb;
                             /*
                              * fraggap: bytes by which the previous skb exceeds
                              * maxfraglen; they are moved into the new skb below.
                              */
 861                         if (skb_prev)
 862                                 fraggap = skb_prev->len - maxfraglen;
 863                         else
 864                                 fraggap = 0;
 865
 866                         /*
 867                          * If remaining data exceeds the mtu,
 868                          * we know we need more fragment(s).
 869                          */
 870                         datalen = length + fraggap;
 871                         if (datalen > mtu - fragheaderlen)
 872                                 datalen = maxfraglen - fragheaderlen;
 873                         fraglen = datalen + fragheaderlen;
 874
                             /*
                              * With MSG_MORE on a device without scatter-gather a
                              * full mtu is allocated, presumably so that later
                              * appends can be copied into the linear area.
                              */
 875                         if ((flags & MSG_MORE) &&
 876                             !(rt->dst.dev->features&NETIF_F_SG))
 877                                 alloclen = mtu;
 878                         else
 879                                 alloclen = fraglen;
 880
 881                         /* The last fragment gets additional space at tail.
 882                          * Note, with MSG_MORE we overallocate on fragments,
 883                          * because we have no idea what fragment will be
 884                          * the last.
 885                          */
 886                         if (datalen == length + fraggap) {
 887                                 alloclen += rt->dst.trailer_len;
 888                                 /* make sure mtu is not reached */
 889                                 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
 890                                         datalen -= ALIGN(rt->dst.trailer_len, 8);
 891                         }
                             /*
                              * The first skb (transhdrlen != 0) comes from
                              * sock_alloc_send_skb(), which is passed the
                              * MSG_DONTWAIT bit.  Later skbs use sock_wmalloc(),
                              * and only while sk_wmem_alloc does not exceed
                              * twice sk_sndbuf.
                              */
 892                         if (transhdrlen) {
 893                                 skb = sock_alloc_send_skb(sk,
 894                                                 alloclen + hh_len + 15,
 895                                                 (flags & MSG_DONTWAIT), &err);
 896                         } else {
 897                                 skb = NULL;
 898                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 899                                     2 * sk->sk_sndbuf)
 900                                         skb = sock_wmalloc(sk,
 901                                                            alloclen + hh_len + 15, 1,
 902                                                            sk->sk_allocation);
 903                                 if (unlikely(skb == NULL))
 904                                         err = -ENOBUFS;
 905                                 else
 906                                         /* only the initial fragment is
 907                                            time stamped */
 908                                         cork->tx_flags = 0;
 909                         }
 910                         if (skb == NULL)
 911                                 goto error;
 912
 913                         /*
 914                          *      Fill in the control structures
 915                          */
 916                         skb->ip_summed = csummode;
 917                         skb->csum = 0;
 918                         skb_reserve(skb, hh_len);
 919                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
 920
 921                         /*
 922                          *      Find where to start putting bytes.
 923                          */
 924                         data = skb_put(skb, fraglen);
 925                         skb_set_network_header(skb, exthdrlen);
 926                         skb->transport_header = (skb->network_header +
 927                                                  fragheaderlen);
 928                         data += fragheaderlen;
 929
                             /*
                              * Copy the excess tail of the previous skb into this
                              * one, adjust both checksums and trim the previous
                              * skb back to maxfraglen.
                              */
 930                         if (fraggap) {
 931                                 skb->csum = skb_copy_and_csum_bits(
 932                                         skb_prev, maxfraglen,
 933                                         data + transhdrlen, fraggap, 0);
 934                                 skb_prev->csum = csum_sub(skb_prev->csum,
 935                                                           skb->csum);
 936                                 data += fraggap;
 937                                 pskb_trim_unique(skb_prev, maxfraglen);
 938                         }
 939
                             /* Bytes still to be fetched from the caller for this skb. */
 940                         copy = datalen - transhdrlen - fraggap;
 941                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 942                                 err = -EFAULT;
 943                                 kfree_skb(skb);
 944                                 goto error;
 945                         }
 946
 947                         offset += copy;
 948                         length -= datalen - fraggap;
                             /* Header space and checksum offload apply to the first skb only. */
 949                         transhdrlen = 0;
 950                         exthdrlen = 0;
 951                         csummode = CHECKSUM_NONE;
 952
 953                         /*
 954                          * Put the packet on the pending queue.
 955                          */
 956                         __skb_queue_tail(queue, skb);
 957                         continue;
 958                 }
 959
 960                 if (copy > length)
 961                         copy = length;
 962
                     /* No scatter-gather: copy straight into the skb's linear data. */
 963                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
 964                         unsigned int off;
 965
 966                         off = skb->len;
 967                         if (getfrag(from, skb_put(skb, copy),
 968                                         offset, copy, off, skb) < 0) {
 969                                 __skb_trim(skb, off);
 970                                 err = -EFAULT;
 971                                 goto error;
 972                         }
 973                 } else {
                             /*
                              * Scatter-gather: keep filling the page tracked in
                              * cork->page/cork->off, or allocate a fresh page,
                              * and expose it as a page fragment of the tail skb.
                              */
 974                         int i = skb_shinfo(skb)->nr_frags;
 975                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 976                         struct page *page = cork->page;
 977                         int off = cork->off;
 978                         unsigned int left;
 979
 980                         if (page && (left = PAGE_SIZE - off) > 0) {
 981                                 if (copy >= left)
 982                                         copy = left;
                                     /* Page not yet attached to this skb: add a fragment for it. */
 983                                 if (page != frag->page) {
 984                                         if (i == MAX_SKB_FRAGS) {
 985                                                 err = -EMSGSIZE;
 986                                                 goto error;
 987                                         }
 988                                         get_page(page);
 989                                         skb_fill_page_desc(skb, i, page, off, 0);
 990                                         frag = &skb_shinfo(skb)->frags[i];
 991                                 }
 992                         } else if (i < MAX_SKB_FRAGS) {
 993                                 if (copy > PAGE_SIZE)
 994                                         copy = PAGE_SIZE;
 995                                 page = alloc_pages(sk->sk_allocation, 0);
 996                                 if (page == NULL)  {
 997                                         err = -ENOMEM;
 998                                         goto error;
 999                                 }
1000                                 cork->page = page;
1001                                 cork->off = 0;
1002
1003                                 skb_fill_page_desc(skb, i, page, 0, 0);
1004                                 frag = &skb_shinfo(skb)->frags[i];
1005                         } else {
1006                                 err = -EMSGSIZE;
1007                                 goto error;
1008                         }
1009                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1010                                 err = -EFAULT;
1011                                 goto error;
1012                         }
                             /* Account the copied bytes in the cork, fragment, skb and socket. */
1013                         cork->off += copy;
1014                         frag->size += copy;
1015                         skb->len += copy;
1016                         skb->data_len += copy;
1017                         skb->truesize += copy;
1018                         atomic_add(copy, &sk->sk_wmem_alloc);
1019                 }
1020                 offset += copy;
1021                 length -= copy;
1022         }
1023
1024         return 0;
1025
1026 error:
             /* Take back the accounting for the bytes that were never queued. */
1027         cork->length -= length;
1028         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1029         return err;
1030 }
1031
1032 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1033                          struct ipcm_cookie *ipc, struct rtable **rtp)
1034 {
             /*
              * Initialise @cork for a new datagram.
              *
              * Takes over the route in *@rtp (setting *@rtp to NULL), copies the
              * IP options from @ipc when present, and resets the length, tx_flags
              * and page bookkeeping of @cork.
              *
              * Returns 0 on success, -EFAULT if no route was supplied, or
              * -ENOBUFS if the option buffer cannot be allocated.  On failure
              * *@rtp is left untouched and @cork is not modified.
              */
1035         struct inet_sock *inet = inet_sk(sk);
1036         struct ip_options *opt;
1037         struct rtable *rt;
1038
             /*
              * Validate the route before touching cork->opt.  ip_make_skb() uses
              * an on-stack cork and returns straight away on error, so an option
              * buffer allocated before a failing route check would be leaked.
              */
1039         rt = *rtp;
1040         if (unlikely(!rt))
1041                 return -EFAULT;
1042         /*
1043          * setup for corking.
1044          */
1045         opt = ipc->opt;
1046         if (opt) {
1047                 if (cork->opt == NULL) {
1048                         cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1049                                             sk->sk_allocation);
1050                         if (unlikely(cork->opt == NULL))
1051                                 return -ENOBUFS;
1052                 }
                     /* NOTE(review): assumes opt->optlen <= 40, the room allocated above. */
1053                 memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1054                 cork->flags |= IPCORK_OPT;
1055                 cork->addr = ipc->addr;
1056         }
1057         /*
1058          * We steal reference to this route, caller should not release it
1059          */
1060         *rtp = NULL;
             /* IP_PMTUDISC_PROBE uses the device mtu instead of the path mtu. */
1061         cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1062                          rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1063         cork->dst = &rt->dst;
1064         cork->length = 0;
1065         cork->tx_flags = ipc->tx_flags;
1066         cork->page = NULL;
1067         cork->off = 0;
1068
1069         return 0;
1070 }
1071
1072 /*
1073  *      ip_append_data() and ip_append_page() build one large IP datagram
1074  *      out of many pieces of data. The pieces are held on the socket
1075  *      until ip_push_pending_frames() is called. A piece may be page
1076  *      or non-page data.
1077  *
1078  *      The interface is not UDP specific; other transport protocols,
1079  *      e.g. raw sockets, can potentially use it too.
1080  *
1081  *      LATER: length must be adjusted by pad at tail, when it is required.
1082  */
1083 int ip_append_data(struct sock *sk,
1084                    int getfrag(void *from, char *to, int offset, int len,
1085                                int odd, struct sk_buff *skb),
1086                    void *from, int length, int transhdrlen,
1087                    struct ipcm_cookie *ipc, struct rtable **rtp,
1088                    unsigned int flags)
1089 {
1090         struct inet_cork *cork = &inet_sk(sk)->cork;
1091         struct sk_buff_head *queue = &sk->sk_write_queue;
1092
             /* MSG_PROBE: report success without queueing anything. */
1093         if (flags & MSG_PROBE)
1094                 return 0;
1095
1096         if (!skb_queue_empty(queue)) {
                     /* Continuation of a pending datagram: no transport header. */
1097                 transhdrlen = 0;
1098         } else {
                     /* First piece: set up the socket's cork from @ipc and *@rtp. */
1099                 int err = ip_setup_cork(sk, cork, ipc, rtp);
1100
1101                 if (err)
1102                         return err;
1103         }
1104         return __ip_append_data(sk, queue, cork, getfrag,
1105                                 from, length, transhdrlen, flags);
1106 }
1107
1108 ssize_t ip_append_page(struct sock *sk, struct page *page,
1109                        int offset, size_t size, int flags)
1110 {
             /*
              * Attach @size bytes of @page, starting at @offset, as page
              * fragments to the datagram already pending on sk->sk_write_queue.
              *
              * Returns 0 on success (also for MSG_PROBE), or a negative errno:
              *   -EPERM       the socket has hdrincl set
              *   -EINVAL      nothing is pending on the write queue
              *   -EOPNOTSUPP  the output device lacks NETIF_F_SG
              *   -EMSGSIZE    the total would exceed 0xFFFF minus the header,
              *                or the tail skb has no free fragment slot
              *   -ENOBUFS     a new skb could not be allocated
              * Failures that reach the "error" label subtract the bytes not yet
              * attached from cork.length and bump IPSTATS_MIB_OUTDISCARDS.
              */
1111         struct inet_sock *inet = inet_sk(sk);
1112         struct sk_buff *skb;
1113         struct rtable *rt;
1114         struct ip_options *opt = NULL;
1115         int hh_len;
1116         int mtu;
1117         int len;
1118         int err;
1119         unsigned int maxfraglen, fragheaderlen, fraggap;
1120
1121         if (inet->hdrincl)
1122                 return -EPERM;
1123
1124         if (flags&MSG_PROBE)
1125                 return 0;
1126
             /* Pages can only extend a datagram that has already been started. */
1127         if (skb_queue_empty(&sk->sk_write_queue))
1128                 return -EINVAL;
1129
1130         rt = (struct rtable *)inet->cork.dst;
1131         if (inet->cork.flags & IPCORK_OPT)
1132                 opt = inet->cork.opt;
1133
1134         if (!(rt->dst.dev->features&NETIF_F_SG))
1135                 return -EOPNOTSUPP;
1136
1137         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1138         mtu = inet->cork.fragsize;
1139
             /*
              * fragheaderlen: IP header plus options.  maxfraglen: largest
              * fragment whose payload is a multiple of 8 bytes (the "& ~7").
              */
1140         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1141         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1142
1143         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1144                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1145                 return -EMSGSIZE;
1146         }
1147
1148         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1149                 return -EINVAL;
1150
             /* Accounted up front; the error path subtracts what was not attached. */
1151         inet->cork.length += size;
             /*
              * UDP on a UFO-capable device, and the tail skb would outgrow the
              * mtu: mark the tail skb as GSO so that the loop below attaches
              * everything to it instead of starting new skbs.
              */
1152         if ((size + skb->len > mtu) &&
1153             (sk->sk_protocol == IPPROTO_UDP) &&
1154             (rt->dst.dev->features & NETIF_F_UFO)) {
1155                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1156                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1157         }
1158
1159
1160         while (size > 0) {
1161                 int i;
1162
1163                 if (skb_is_gso(skb))
1164                         len = size;
1165                 else {
1166
1167                         /* Check if the remaining data fits into current packet. */
1168                         len = mtu - skb->len;
1169                         if (len < size)
1170                                 len = maxfraglen - skb->len;
1171                 }
                     /* Tail skb is full: queue a new skb and retry. */
1172                 if (len <= 0) {
1173                         struct sk_buff *skb_prev;
1174                         int alloclen;
1175
1176                         skb_prev = skb;
1177                         fraggap = skb_prev->len - maxfraglen;
1178
                             /*
                              * The new skb's linear area only has to hold the IP
                              * header and the fraggap bytes moved over from the
                              * previous skb; page data is attached as fragments.
                              */
1179                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1180                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1181                         if (unlikely(!skb)) {
1182                                 err = -ENOBUFS;
1183                                 goto error;
1184                         }
1185
1186                         /*
1187                          *      Fill in the control structures
1188                          */
1189                         skb->ip_summed = CHECKSUM_NONE;
1190                         skb->csum = 0;
1191                         skb_reserve(skb, hh_len);
1192
1193                         /*
1194                          *      Find where to start putting bytes.
1195                          */
1196                         skb_put(skb, fragheaderlen + fraggap);
1197                         skb_reset_network_header(skb);
1198                         skb->transport_header = (skb->network_header +
1199                                                  fragheaderlen);
                             /*
                              * Copy the excess tail of the previous skb behind the
                              * header, adjust both checksums and trim the previous
                              * skb back to maxfraglen.
                              */
1200                         if (fraggap) {
1201                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1202                                                                    maxfraglen,
1203                                                     skb_transport_header(skb),
1204                                                                    fraggap, 0);
1205                                 skb_prev->csum = csum_sub(skb_prev->csum,
1206                                                           skb->csum);
1207                                 pskb_trim_unique(skb_prev, maxfraglen);
1208                         }
1209
1210                         /*
1211                          * Put the packet on the pending queue.
1212                          */
1213                         __skb_queue_tail(&sk->sk_write_queue, skb);
1214                         continue;
1215                 }
1216
1217                 i = skb_shinfo(skb)->nr_frags;
1218                 if (len > size)
1219                         len = size;
                     /*
                      * Grow the last fragment when skb_can_coalesce() allows it;
                      * otherwise take a page reference and add a new fragment.
                      * With all MAX_SKB_FRAGS slots in use the call fails.
                      */
1220                 if (skb_can_coalesce(skb, i, page, offset)) {
1221                         skb_shinfo(skb)->frags[i-1].size += len;
1222                 } else if (i < MAX_SKB_FRAGS) {
1223                         get_page(page);
1224                         skb_fill_page_desc(skb, i, page, offset, len);
1225                 } else {
1226                         err = -EMSGSIZE;
1227                         goto error;
1228                 }
1229
                     /* Software checksum: fold the new bytes in at offset skb->len. */
1230                 if (skb->ip_summed == CHECKSUM_NONE) {
1231                         __wsum csum;
1232                         csum = csum_page(page, offset, len);
1233                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1234                 }
1235
1236                 skb->len += len;
1237                 skb->data_len += len;
1238                 skb->truesize += len;
1239                 atomic_add(len, &sk->sk_wmem_alloc);
1240                 offset += len;
1241                 size -= len;
1242         }
1243         return 0;
1244
1245 error:
1246         inet->cork.length -= size;
1247         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1248         return err;
1249 }
1250
1251 static void ip_cork_release(struct inet_cork *cork)
1252 {
             /*
              * Drop what @cork holds: clear the IPCORK_OPT flag, free the option
              * buffer and release the route, leaving both pointers NULL.
              */
1253         cork->flags &= ~IPCORK_OPT;
1254         kfree(cork->opt);
1255         cork->opt = NULL;
1256         dst_release(cork->dst);
1257         cork->dst = NULL;
1258 }
1259
1260 /*
1261  *      Combine all pending IP fragments on @queue into one IP datagram,
1262  *      fill in its IP header and return it.
      *
      *      The first queued skb becomes the head; the remaining skbs are
      *      chained on its frag_list.  The route held by @cork is handed to
      *      the returned skb and the rest of @cork is released.
      *
      *      Returns the head skb, or NULL when @queue is empty (in which case
      *      @cork is left untouched).
1263  */
1264 struct sk_buff *__ip_make_skb(struct sock *sk,
1265                               struct sk_buff_head *queue,
1266                               struct inet_cork *cork)
1267 {
1268         struct sk_buff *skb, *tmp_skb;
1269         struct sk_buff **tail_skb;
1270         struct inet_sock *inet = inet_sk(sk);
1271         struct net *net = sock_net(sk);
1272         struct ip_options *opt = NULL;
1273         struct rtable *rt = (struct rtable *)cork->dst;
1274         struct iphdr *iph;
1275         __be16 df = 0;
1276         __u8 ttl;
1277
1278         if ((skb = __skb_dequeue(queue)) == NULL)
1279                 goto out;
1280         tail_skb = &(skb_shinfo(skb)->frag_list);
1281
1282         /* move skb->data to ip header from ext header */
1283         if (skb->data < skb_network_header(skb))
1284                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Chain every further skb behind the head: strip its IP header
              * room, add its length and truesize to the head and detach it
              * from the socket (destructor and sk are cleared).
              */
1285         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1286                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1287                 *tail_skb = tmp_skb;
1288                 tail_skb = &(tmp_skb->next);
1289                 skb->len += tmp_skb->len;
1290                 skb->data_len += tmp_skb->len;
1291                 skb->truesize += tmp_skb->truesize;
1292                 tmp_skb->destructor = NULL;
1293                 tmp_skb->sk = NULL;
1294         }
1295
1296         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1297          * to fragment the frame generated here. No matter how transforms
1298          * change the size of the packet, it will come out.
1299          */
1300         if (inet->pmtudisc < IP_PMTUDISC_DO)
1301                 skb->local_df = 1;
1302
1303         /* DF bit is set when we want to see DF on outgoing frames.
1304          * If local_df is set too, we still allow to fragment this frame
1305          * locally. */
1306         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1307             (skb->len <= dst_mtu(&rt->dst) &&
1308              ip_dont_fragment(sk, &rt->dst)))
1309                 df = htons(IP_DF);
1310
1311         if (cork->flags & IPCORK_OPT)
1312                 opt = cork->opt;
1313
1314         if (rt->rt_type == RTN_MULTICAST)
1315                 ttl = inet->mc_ttl;
1316         else
1317                 ttl = ip_select_ttl(inet, &rt->dst);
1318
             /* Build the IP header at skb->data; ihl is 5 words plus the options. */
1319         iph = (struct iphdr *)skb->data;
1320         iph->version = 4;
1321         iph->ihl = 5;
1322         if (opt) {
1323                 iph->ihl += opt->optlen>>2;
1324                 ip_options_build(skb, opt, cork->addr, rt, 0);
1325         }
1326         iph->tos = inet->tos;
1327         iph->frag_off = df;
1328         ip_select_ident(iph, &rt->dst, sk);
1329         iph->ttl = ttl;
1330         iph->protocol = sk->sk_protocol;
1331         iph->saddr = rt->rt_src;
1332         iph->daddr = rt->rt_dst;
1333
1334         skb->priority = sk->sk_priority;
1335         skb->mark = sk->sk_mark;
1336         /*
1337          * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1338          * on dst refcount
1339          */
1340         cork->dst = NULL;
1341         skb_dst_set(skb, &rt->dst);
1342
1343         if (iph->protocol == IPPROTO_ICMP)
1344                 icmp_out_count(net, ((struct icmphdr *)
1345                         skb_transport_header(skb))->type);
1346
             /* cork->dst is already NULL here, so only the options are dropped. */
1347         ip_cork_release(cork);
1348 out:
1349         return skb;
1350 }
1351
1352 int ip_send_skb(struct sk_buff *skb)
1353 {
             /*
              * Pass @skb to ip_local_out() and normalise the result: positive
              * return values are mapped through net_xmit_errno(), and any error
              * that remains is counted in IPSTATS_MIB_OUTDISCARDS.
              *
              * The namespace is looked up before the send; @skb is not touched
              * again once ip_local_out() has been called.
              */
1354         struct net *net = sock_net(skb->sk);
1355         int rc = ip_local_out(skb);
1356
1357         if (!rc)
1358                 return 0;
1359
1360         if (rc > 0)
1361                 rc = net_xmit_errno(rc);
1362         if (rc)
1363                 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1364
1365         return rc;
1366 }
1367
1368 int ip_push_pending_frames(struct sock *sk)
1369 {
             /*
              * Turn the data pending on @sk into one skb via ip_finish_skb() and
              * send it.  Returns 0 when nothing was pending, otherwise the result
              * of ip_send_skb().
              */
1370         struct sk_buff *pending = ip_finish_skb(sk);
1371         int ret = 0;
1372
1373         /* Netfilter sees the whole datagram, not yet fragmented. */
1374         if (pending)
1375                 ret = ip_send_skb(pending);
1376
1377         return ret;
1378 }
1379
1380 /*
1381  *      Discard everything queued on @queue and release @cork.
1382  */
1383 static void __ip_flush_pending_frames(struct sock *sk,
1384                                       struct sk_buff_head *queue,
1385                                       struct inet_cork *cork)
1386 {
1387         struct sk_buff *skb = __skb_dequeue_tail(queue);
1388
             /* Free the queued skbs one by one, newest first. */
1389         for (; skb; skb = __skb_dequeue_tail(queue))
1390                 kfree_skb(skb);
1391
1392         ip_cork_release(cork);
1393 }
1394
1395 void ip_flush_pending_frames(struct sock *sk)
1396 {
             /* Flush the socket's own write queue and its cork. */
1397         __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1398 }
1399
1400 struct sk_buff *ip_make_skb(struct sock *sk,
1401                             int getfrag(void *from, char *to, int offset,
1402                                         int len, int odd, struct sk_buff *skb),
1403                             void *from, int length, int transhdrlen,
1404                             struct ipcm_cookie *ipc, struct rtable **rtp,
1405                             unsigned int flags)
1406 {
             /*
              * Build a complete datagram in one go, using a private queue and a
              * private cork instead of the socket's write queue.
              *
              * Returns the finished skb, NULL for MSG_PROBE, or an ERR_PTR()
              * carrying the error from ip_setup_cork() or __ip_append_data().
              */
1407         struct sk_buff_head queue;
1408         struct inet_cork cork = {};
1409         int err;
1410
1411         if (flags & MSG_PROBE)
1412                 return NULL;
1413
1414         __skb_queue_head_init(&queue);
1415
1416         err = ip_setup_cork(sk, &cork, ipc, rtp);
1417         if (!err) {
1418                 err = __ip_append_data(sk, &queue, &cork, getfrag,
1419                                        from, length, transhdrlen, flags);
1420                 if (!err)
1421                         return __ip_make_skb(sk, &queue, &cork);
1422
1423                 /* partial build: drop queued skbs and the cork state */
1424                 __ip_flush_pending_frames(sk, &queue, &cork);
1425         }
1426
1427         return ERR_PTR(err);
1428 }
1429
1430 /*
1431  *      getfrag callback for kernel buffers: copy and checksum in one pass.
1432  */
1433 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1434                               int len, int odd, struct sk_buff *skb)
1435 {
             /*
              * Copies @len bytes from @dptr + @offset to @to and folds their
              * checksum into skb->csum at position @odd.  Always returns 0.
              */
1436         const char *src = (const char *)dptr + offset;
1437         __wsum part = csum_partial_copy_nocheck(src, to, len, 0);
1438
1439         skb->csum = csum_block_add(skb->csum, part, odd);
1440         return 0;
1441 }
1442
1443 /*
1444  *      Generic function to send a packet as reply to another packet.
1445  *      Used to send TCP resets so far. ICMP should use this function too.
1446  *
1447  *      Should run single threaded per socket because it uses the sock
1448  *      structure to pass arguments.
      *
      *      @sk:   socket used to carry the reply; its tos, priority, protocol
      *             and bound device are overwritten from @skb and @arg
      *      @skb:  the packet being answered
      *      @arg:  reply payload (arg->iov), checksum seed and offset, and the
      *             device to bind the route lookup to
      *      @len:  number of payload bytes to send
      *
      *      Nothing is returned; the function gives up silently when the
      *      options cannot be echoed or no route is found, and the result of
      *      ip_append_data() is not checked.
1449  */
1450 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1451                    unsigned int len)
1452 {
1453         struct inet_sock *inet = inet_sk(sk);
             /* ip_options followed by room for 40 bytes of option data. */
1454         struct {
1455                 struct ip_options       opt;
1456                 char                    data[40];
1457         } replyopts;
1458         struct ipcm_cookie ipc;
1459         __be32 daddr;
1460         struct rtable *rt = skb_rtable(skb);
1461
1462         if (ip_options_echo(&replyopts.opt, skb))
1463                 return;
1464
             /* Reply to the source recorded in the route of the incoming packet. */
1465         daddr = ipc.addr = rt->rt_src;
1466         ipc.opt = NULL;
1467         ipc.tx_flags = 0;
1468
1469         if (replyopts.opt.optlen) {
1470                 ipc.opt = &replyopts.opt;
1471
                     /* With a source route the lookup targets faddr instead. */
1472                 if (ipc.opt->srr)
1473                         daddr = replyopts.opt.faddr;
1474         }
1475
1476         {
                     /*
                      * Route lookup for the reply.  The ports are taken from
                      * tcp_hdr(skb) with source and destination swapped.
                      */
1477                 struct flowi4 fl4 = {
1478                         .flowi4_oif = arg->bound_dev_if,
1479                         .daddr = daddr,
1480                         .saddr = rt->rt_spec_dst,
1481                         .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
1482                         .fl4_sport = tcp_hdr(skb)->dest,
1483                         .fl4_dport = tcp_hdr(skb)->source,
1484                         .flowi4_proto = sk->sk_protocol,
1485                         .flowi4_flags = ip_reply_arg_flowi_flags(arg),
1486                 };
1487                 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1488                 rt = ip_route_output_key(sock_net(sk), &fl4);
1489                 if (IS_ERR(rt))
1490                         return;
1491         }
1492
1493         /* And let IP do all the hard work.
1494
1495            This chunk is not reenterable, hence spinlock.
1496            Note that it uses the fact, that this function is called
1497            with locally disabled BH and that sk cannot be already spinlocked.
1498          */
1499         bh_lock_sock(sk);
1500         inet->tos = ip_hdr(skb)->tos;
1501         sk->sk_priority = skb->priority;
1502         sk->sk_protocol = ip_hdr(skb)->protocol;
1503         sk->sk_bound_dev_if = arg->bound_dev_if;
1504         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1505                        &ipc, &rt, MSG_DONTWAIT);
             /*
              * From here on skb refers to the queued reply, not the packet being
              * answered.  If nothing was queued there is nothing to send.
              */
1506         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                     /*
                      * A non-negative csumoffset names the 16-bit word in the
                      * transport header that receives the folded checksum.
                      */
1507                 if (arg->csumoffset >= 0)
1508                         *((__sum16 *)skb_transport_header(skb) +
1509                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1510                                                                 arg->csum));
1511                 skb->ip_summed = CHECKSUM_NONE;
1512                 ip_push_pending_frames(sk);
1513         }
1514
1515         bh_unlock_sock(sk);
1516
             /*
              * rt is NULL here if ip_append_data() took over the route
              * (ip_setup_cork() clears *rtp when it does).
              */
1517         ip_rt_put(rt);
1518 }
1519
1520 void __init ip_init(void)
1521 {
             /*
              * IP layer initialisation: calls ip_rt_init() and inet_initpeers(),
              * plus igmp_mc_proc_init() when both CONFIG_IP_MULTICAST and
              * CONFIG_PROC_FS are enabled.
              */
1522         ip_rt_init();
1523         inet_initpeers();
1524
1525 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1526         igmp_mc_proc_init();
1527 #endif
1528 }