net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
   38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/udp.h>
  74 #include <linux/skbuff.h>
  75 #include <net/sock.h>
  76 #include <net/arp.h>
  77 #include <net/icmp.h>
  78 #include <net/raw.h>
  79 #include <net/checksum.h>
  80 #include <net/inetpeer.h>
  81 #include <net/checksum.h>
  82 #include <linux/igmp.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/netfilter_bridge.h>
  85 #include <linux/mroute.h>
  86 #include <linux/netlink.h>
  87
  88 /*
  89  *      Shall we try to damage output packets if routing dev changes?
  90  */
  91
  92 int sysctl_ip_dynaddr;
  93 int sysctl_ip_default_ttl = IPDEFTTL;
  94
  95 /* Generate a checksum for an outgoing IP datagram. */
  96 __inline__ void ip_send_check(struct iphdr *iph)
  97 {
  98         iph->check = 0;
  99         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 100 }
 101
 102 /* dev_loopback_xmit for use with netfilter. */
 103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 104 {
 105         newskb->mac.raw = newskb->data;
 106         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 107         newskb->pkt_type = PACKET_LOOPBACK;
 108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 109         BUG_TRAP(newskb->dst);
 110
 111 #ifdef CONFIG_NETFILTER_DEBUG
 112         nf_debug_ip_loopback_xmit(newskb);
 113 #endif
 114         nf_reset(newskb);
 115         netif_rx(newskb);
 116         return 0;
 117 }
 118
 119 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 120 {
 121         int ttl = inet->uc_ttl;
 122
 123         if (ttl < 0)
 124                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 125         return ttl;
 126 }
 127
 128 /*
 129  *              Add an ip header to a skbuff and send it out.
 130  *
 131  */
 132 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 133                           u32 saddr, u32 daddr, struct ip_options *opt)
 134 {
 135         struct inet_sock *inet = inet_sk(sk);
 136         struct rtable *rt = (struct rtable *)skb->dst;
 137         struct iphdr *iph;
 138
 139         /* Build the IP header. */
 140         if (opt)
 141                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 142         else
 143                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 144
 145         iph->version  = 4;
 146         iph->ihl      = 5;
 147         iph->tos      = inet->tos;
 148         if (ip_dont_fragment(sk, &rt->u.dst))
 149                 iph->frag_off = htons(IP_DF);
 150         else
 151                 iph->frag_off = 0;
 152         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 153         iph->daddr    = rt->rt_dst;
 154         iph->saddr    = rt->rt_src;
 155         iph->protocol = sk->sk_protocol;
 156         iph->tot_len  = htons(skb->len);
 157         ip_select_ident(iph, &rt->u.dst, sk);
 158         skb->nh.iph   = iph;
 159
 160         if (opt && opt->optlen) {
 161                 iph->ihl += opt->optlen>>2;
 162                 ip_options_build(skb, opt, daddr, rt, 0);
 163         }
 164         ip_send_check(iph);
 165
 166         skb->priority = sk->sk_priority;
 167
 168         /* Send it out. */
 169         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 170                        dst_output);
 171 }
 172
 173 static inline int ip_finish_output2(struct sk_buff *skb)
 174 {
 175         struct dst_entry *dst = skb->dst;
 176         struct hh_cache *hh = dst->hh;
 177         struct net_device *dev = dst->dev;
 178         int hh_len = LL_RESERVED_SPACE(dev);
 179
 180         /* Be paranoid, rather than too clever. */
 181         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 182                 struct sk_buff *skb2;
 183
 184                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 185                 if (skb2 == NULL) {
 186                         kfree_skb(skb);
 187                         return -ENOMEM;
 188                 }
 189                 if (skb->sk)
 190                         skb_set_owner_w(skb2, skb->sk);
 191                 kfree_skb(skb);
 192                 skb = skb2;
 193         }
 194
 195 #ifdef CONFIG_NETFILTER_DEBUG
 196         nf_debug_ip_finish_output2(skb);
 197 #endif /*CONFIG_NETFILTER_DEBUG*/
 198
 199         nf_reset(skb);
 200
 201         if (hh) {
 202                 int hh_alen;
 203
 204                 read_lock_bh(&hh->hh_lock);
 205                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 206                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 207                 read_unlock_bh(&hh->hh_lock);
 208                 skb_push(skb, hh->hh_len);
 209                 return hh->hh_output(skb);
 210         } else if (dst->neighbour)
 211                 return dst->neighbour->output(skb);
 212
 213         if (net_ratelimit())
 214                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 215         kfree_skb(skb);
 216         return -EINVAL;
 217 }
 218
 219 int ip_finish_output(struct sk_buff *skb)
 220 {
 221         struct net_device *dev = skb->dst->dev;
 222
 223         skb->dev = dev;
 224         skb->protocol = htons(ETH_P_IP);
 225
 226         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 227                        ip_finish_output2);
 228 }
 229
 230 int ip_mc_output(struct sk_buff *skb)
 231 {
 232         struct sock *sk = skb->sk;
 233         struct rtable *rt = (struct rtable*)skb->dst;
 234         struct net_device *dev = rt->u.dst.dev;
 235
 236         /*
 237          *      If the indicated interface is up and running, send the packet.
 238          */
 239         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 240
 241         skb->dev = dev;
 242         skb->protocol = htons(ETH_P_IP);
 243
 244         /*
 245          *      Multicasts are looped back for other local users
 246          */
 247
 248         if (rt->rt_flags&RTCF_MULTICAST) {
 249                 if ((!sk || inet_sk(sk)->mc_loop)
 250 #ifdef CONFIG_IP_MROUTE
 251                 /* Small optimization: do not loopback not local frames,
 252                    which returned after forwarding; they will be  dropped
 253                    by ip_mr_input in any case.
 254                    Note, that local frames are looped back to be delivered
 255                    to local recipients.
 256
 257                    This check is duplicated in ip_mr_input at the moment.
 258                  */
 259                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 260 #endif
 261                 ) {
 262                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 263                         if (newskb)
 264                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 265                                         newskb->dev,
 266                                         ip_dev_loopback_xmit);
 267                 }
 268
 269                 /* Multicasts with ttl 0 must not go beyond the host */
 270
 271                 if (skb->nh.iph->ttl == 0) {
 272                         kfree_skb(skb);
 273                         return 0;
 274                 }
 275         }
 276
 277         if (rt->rt_flags&RTCF_BROADCAST) {
 278                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 279                 if (newskb)
 280                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 281                                 newskb->dev, ip_dev_loopback_xmit);
 282         }
 283
 284         if (skb->len > dst_mtu(&rt->u.dst))
 285                 return ip_fragment(skb, ip_finish_output);
 286         else
 287                 return ip_finish_output(skb);
 288 }
 289
 290 int ip_output(struct sk_buff *skb)
 291 {
 292         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 293
 294         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
 295                 return ip_fragment(skb, ip_finish_output);
 296         else
 297                 return ip_finish_output(skb);
 298 }
 299
 300 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 301 {
 302         struct sock *sk = skb->sk;
 303         struct inet_sock *inet = inet_sk(sk);
 304         struct ip_options *opt = inet->opt;
 305         struct rtable *rt;
 306         struct iphdr *iph;
 307
 308         /* Skip all of this if the packet is already routed,
 309          * f.e. by something like SCTP.
 310          */
 311         rt = (struct rtable *) skb->dst;
 312         if (rt != NULL)
 313                 goto packet_routed;
 314
 315         /* Make sure we can route this packet. */
 316         rt = (struct rtable *)__sk_dst_check(sk, 0);
 317         if (rt == NULL) {
 318                 u32 daddr;
 319
 320                 /* Use correct destination address if we have options. */
 321                 daddr = inet->daddr;
 322                 if(opt && opt->srr)
 323                         daddr = opt->faddr;
 324
 325                 {
 326                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 327                                             .nl_u = { .ip4_u =
 328                                                       { .daddr = daddr,
 329                                                         .saddr = inet->saddr,
 330                                                         .tos = RT_CONN_FLAGS(sk) } },
 331                                             .proto = sk->sk_protocol,
 332                                             .uli_u = { .ports =
 333                                                        { .sport = inet->sport,
 334                                                          .dport = inet->dport } } };
 335
 336                         /* If this fails, retransmit mechanism of transport layer will
 337                          * keep trying until route appears or the connection times
 338                          * itself out.
 339                          */
 340                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 341                                 goto no_route;
 342                 }
 343                 __sk_dst_set(sk, &rt->u.dst);
 344                 tcp_v4_setup_caps(sk, &rt->u.dst);
 345         }
 346         skb->dst = dst_clone(&rt->u.dst);
 347
 348 packet_routed:
 349         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 350                 goto no_route;
 351
 352         /* OK, we know where to send it, allocate and build IP header. */
 353         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 354         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 355         iph->tot_len = htons(skb->len);
 356         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 357                 iph->frag_off = htons(IP_DF);
 358         else
 359                 iph->frag_off = 0;
 360         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 361         iph->protocol = sk->sk_protocol;
 362         iph->saddr    = rt->rt_src;
 363         iph->daddr    = rt->rt_dst;
 364         skb->nh.iph   = iph;
 365         /* Transport layer set skb->h.foo itself. */
 366
 367         if (opt && opt->optlen) {
 368                 iph->ihl += opt->optlen >> 2;
 369                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 370         }
 371
 372         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
 373
 374         /* Add an IP checksum. */
 375         ip_send_check(iph);
 376
 377         skb->priority = sk->sk_priority;
 378
 379         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 380                        dst_output);
 381
 382 no_route:
 383         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 384         kfree_skb(skb);
 385         return -EHOSTUNREACH;
 386 }
 387
 388
 389 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 390 {
 391         to->pkt_type = from->pkt_type;
 392         to->priority = from->priority;
 393         to->protocol = from->protocol;
 394         to->security = from->security;
 395         dst_release(to->dst);
 396         to->dst = dst_clone(from->dst);
 397         to->dev = from->dev;
 398
 399         /* Copy the flags to each fragment. */
 400         IPCB(to)->flags = IPCB(from)->flags;
 401
 402 #ifdef CONFIG_NET_SCHED
 403         to->tc_index = from->tc_index;
 404 #endif
 405 #ifdef CONFIG_NETFILTER
 406         to->nfmark = from->nfmark;
 407         to->nfcache = from->nfcache;
 408         /* Connection association is same as pre-frag packet */
 409         nf_conntrack_put(to->nfct);
 410         to->nfct = from->nfct;
 411         nf_conntrack_get(to->nfct);
 412         to->nfctinfo = from->nfctinfo;
 413 #ifdef CONFIG_BRIDGE_NETFILTER
 414         nf_bridge_put(to->nf_bridge);
 415         to->nf_bridge = from->nf_bridge;
 416         nf_bridge_get(to->nf_bridge);
 417 #endif
 418 #ifdef CONFIG_NETFILTER_DEBUG
 419         to->nf_debug = from->nf_debug;
 420 #endif
 421 #endif
 422 }
 423
 424 /*
 425  *      This IP datagram is too large to be sent in one piece.  Break it up into
 426  *      smaller pieces (each of size equal to IP header plus
 427  *      a block of the data of the original IP data part) that will yet fit in a
 428  *      single device frame, and queue such a frame for sending.
 429  */
 430
 431 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 432 {
 433         struct iphdr *iph;
 434         int raw = 0;
 435         int ptr;
 436         struct net_device *dev;
 437         struct sk_buff *skb2;
 438         unsigned int mtu, hlen, left, len, ll_rs;
 439         int offset;
 440         int not_last_frag;
 441         struct rtable *rt = (struct rtable*)skb->dst;
 442         int err = 0;
 443
 444         dev = rt->u.dst.dev;
 445
 446         /*
 447          *      Point into the IP datagram header.
 448          */
 449
 450         iph = skb->nh.iph;
 451
 452         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 453                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 454                           htonl(dst_mtu(&rt->u.dst)));
 455                 kfree_skb(skb);
 456                 return -EMSGSIZE;
 457         }
 458
 459         /*
 460          *      Setup starting values.
 461          */
 462
 463         hlen = iph->ihl * 4;
 464         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
 465
 466         /* When frag_list is given, use it. First, check its validity:
 467          * some transformers could create wrong frag_list or break existing
 468          * one, it is not prohibited. In this case fall back to copying.
 469          *
 470          * LATER: this step can be merged to real generation of fragments,
 471          * we can switch to copy when see the first bad fragment.
 472          */
 473         if (skb_shinfo(skb)->frag_list) {
 474                 struct sk_buff *frag;
 475                 int first_len = skb_pagelen(skb);
 476
 477                 if (first_len - hlen > mtu ||
 478                     ((first_len - hlen) & 7) ||
 479                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 480                     skb_cloned(skb))
 481                         goto slow_path;
 482
 483                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 484                         /* Correct geometry. */
 485                         if (frag->len > mtu ||
 486                             ((frag->len & 7) && frag->next) ||
 487                             skb_headroom(frag) < hlen)
 488                             goto slow_path;
 489
 490                         /* Partially cloned skb? */
 491                         if (skb_shared(frag))
 492                                 goto slow_path;
 493
 494                         BUG_ON(frag->sk);
 495                         if (skb->sk) {
 496                                 sock_hold(skb->sk);
 497                                 frag->sk = skb->sk;
 498                                 frag->destructor = sock_wfree;
 499                                 skb->truesize -= frag->truesize;
 500                         }
 501                 }
 502
 503                 /* Everything is OK. Generate! */
 504
 505                 err = 0;
 506                 offset = 0;
 507                 frag = skb_shinfo(skb)->frag_list;
 508                 skb_shinfo(skb)->frag_list = NULL;
 509                 skb->data_len = first_len - skb_headlen(skb);
 510                 skb->len = first_len;
 511                 iph->tot_len = htons(first_len);
 512                 iph->frag_off = htons(IP_MF);
 513                 ip_send_check(iph);
 514
 515                 for (;;) {
 516                         /* Prepare header of the next frame,
 517                          * before previous one went down. */
 518                         if (frag) {
 519                                 frag->ip_summed = CHECKSUM_NONE;
 520                                 frag->h.raw = frag->data;
 521                                 frag->nh.raw = __skb_push(frag, hlen);
 522                                 memcpy(frag->nh.raw, iph, hlen);
 523                                 iph = frag->nh.iph;
 524                                 iph->tot_len = htons(frag->len);
 525                                 ip_copy_metadata(frag, skb);
 526                                 if (offset == 0)
 527                                         ip_options_fragment(frag);
 528                                 offset += skb->len - hlen;
 529                                 iph->frag_off = htons(offset>>3);
 530                                 if (frag->next != NULL)
 531                                         iph->frag_off |= htons(IP_MF);
 532                                 /* Ready, complete checksum */
 533                                 ip_send_check(iph);
 534                         }
 535
 536                         err = output(skb);
 537
 538                         if (err || !frag)
 539                                 break;
 540
 541                         skb = frag;
 542                         frag = skb->next;
 543                         skb->next = NULL;
 544                 }
 545
 546                 if (err == 0) {
 547                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 548                         return 0;
 549                 }
 550
 551                 while (frag) {
 552                         skb = frag->next;
 553                         kfree_skb(frag);
 554                         frag = skb;
 555                 }
 556                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 557                 return err;
 558         }
 559
 560 slow_path:
 561         left = skb->len - hlen;         /* Space per frame */
 562         ptr = raw + hlen;               /* Where to start from */
 563
 564 #ifdef CONFIG_BRIDGE_NETFILTER
 565         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 566          * we need to make room for the encapsulating header */
 567         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 568         mtu -= nf_bridge_pad(skb);
 569 #else
 570         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 571 #endif
 572         /*
 573          *      Fragment the datagram.
 574          */
 575
 576         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 577         not_last_frag = iph->frag_off & htons(IP_MF);
 578
 579         /*
 580          *      Keep copying data until we run out.
 581          */
 582
 583         while(left > 0) {
 584                 len = left;
 585                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 586                 if (len > mtu)
 587                         len = mtu;
 588                 /* IF: we are not sending upto and including the packet end
 589                    then align the next start on an eight byte boundary */
 590                 if (len < left) {
 591                         len &= ~7;
 592                 }
 593                 /*
 594                  *      Allocate buffer.
 595                  */
 596
 597                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 598                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 599                         err = -ENOMEM;
 600                         goto fail;
 601                 }
 602
 603                 /*
 604                  *      Set up data on packet
 605                  */
 606
 607                 ip_copy_metadata(skb2, skb);
 608                 skb_reserve(skb2, ll_rs);
 609                 skb_put(skb2, len + hlen);
 610                 skb2->nh.raw = skb2->data;
 611                 skb2->h.raw = skb2->data + hlen;
 612
 613                 /*
 614                  *      Charge the memory for the fragment to any owner
 615                  *      it might possess
 616                  */
 617
 618                 if (skb->sk)
 619                         skb_set_owner_w(skb2, skb->sk);
 620
 621                 /*
 622                  *      Copy the packet header into the new buffer.
 623                  */
 624
 625                 memcpy(skb2->nh.raw, skb->data, hlen);
 626
 627                 /*
 628                  *      Copy a block of the IP datagram.
 629                  */
 630                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 631                         BUG();
 632                 left -= len;
 633
 634                 /*
 635                  *      Fill in the new header fields.
 636                  */
 637                 iph = skb2->nh.iph;
 638                 iph->frag_off = htons((offset >> 3));
 639
 640                 /* ANK: dirty, but effective trick. Upgrade options only if
 641                  * the segment to be fragmented was THE FIRST (otherwise,
 642                  * options are already fixed) and make it ONCE
 643                  * on the initial skb, so that all the following fragments
 644                  * will inherit fixed options.
 645                  */
 646                 if (offset == 0)
 647                         ip_options_fragment(skb);
 648
 649                 /*
 650                  *      Added AC : If we are fragmenting a fragment that's not the
 651                  *                 last fragment then keep MF on each bit
 652                  */
 653                 if (left > 0 || not_last_frag)
 654                         iph->frag_off |= htons(IP_MF);
 655                 ptr += len;
 656                 offset += len;
 657
 658                 /*
 659                  *      Put this fragment into the sending queue.
 660                  */
 661
 662                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 663
 664                 iph->tot_len = htons(len + hlen);
 665
 666                 ip_send_check(iph);
 667
 668                 err = output(skb2);
 669                 if (err)
 670                         goto fail;
 671         }
 672         kfree_skb(skb);
 673         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 674         return err;
 675
 676 fail:
 677         kfree_skb(skb);
 678         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 679         return err;
 680 }
 681
 682 int
 683 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 684 {
 685         struct iovec *iov = from;
 686
 687         if (skb->ip_summed == CHECKSUM_HW) {
 688                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 689                         return -EFAULT;
 690         } else {
 691                 unsigned int csum = 0;
 692                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 693                         return -EFAULT;
 694                 skb->csum = csum_block_add(skb->csum, csum, odd);
 695         }
 696         return 0;
 697 }
 698
 699 static inline unsigned int
 700 csum_page(struct page *page, int offset, int copy)
 701 {
 702         char *kaddr;
 703         unsigned int csum;
 704         kaddr = kmap(page);
 705         csum = csum_partial(kaddr + offset, copy, 0);
 706         kunmap(page);
 707         return csum;
 708 }
 709
 710 /*
 711  *      ip_append_data() and ip_append_page() can make one large IP datagram
  712  *      from many pieces of data. Each piece will be held on the socket
 713  *      until ip_push_pending_frames() is called. Each piece can be a page
 714  *      or non-page data.
 715  *
 716  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 717  *      this interface potentially.
 718  *
 719  *      LATER: length must be adjusted by pad at tail, when it is required.
 720  */
 721 int ip_append_data(struct sock *sk,
 722                    int getfrag(void *from, char *to, int offset, int len,
 723                                int odd, struct sk_buff *skb),
 724                    void *from, int length, int transhdrlen,
 725                    struct ipcm_cookie *ipc, struct rtable *rt,
 726                    unsigned int flags)
 727 {
 728         struct inet_sock *inet = inet_sk(sk);
 729         struct sk_buff *skb;
 730
 731         struct ip_options *opt = NULL;
 732         int hh_len;
 733         int exthdrlen;
 734         int mtu;
 735         int copy;
 736         int err;
 737         int offset = 0;
 738         unsigned int maxfraglen, fragheaderlen;
 739         int csummode = CHECKSUM_NONE;
 740
 741         if (flags&MSG_PROBE)
 742                 return 0;
 743
 744         if (skb_queue_empty(&sk->sk_write_queue)) {
 745                 /*
 746                  * setup for corking.
 747                  */
 748                 opt = ipc->opt;
 749                 if (opt) {
 750                         if (inet->cork.opt == NULL) {
 751                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 752                                 if (unlikely(inet->cork.opt == NULL))
 753                                         return -ENOBUFS;
 754                         }
 755                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 756                         inet->cork.flags |= IPCORK_OPT;
 757                         inet->cork.addr = ipc->addr;
 758                 }
 759                 dst_hold(&rt->u.dst);
 760                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 761                 inet->cork.rt = rt;
 762                 inet->cork.length = 0;
 763                 sk->sk_sndmsg_page = NULL;
 764                 sk->sk_sndmsg_off = 0;
 765                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 766                         length += exthdrlen;
 767                         transhdrlen += exthdrlen;
 768                 }
 769         } else {
 770                 rt = inet->cork.rt;
 771                 if (inet->cork.flags & IPCORK_OPT)
 772                         opt = inet->cork.opt;
 773
 774                 transhdrlen = 0;
 775                 exthdrlen = 0;
 776                 mtu = inet->cork.fragsize;
 777         }
 778         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 779
 780         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 781         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 782
 783         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 784                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 785                 return -EMSGSIZE;
 786         }
 787
 788         /*
 789          * transhdrlen > 0 means that this is the first fragment and we wish
 790          * it won't be fragmented in the future.
 791          */
 792         if (transhdrlen &&
 793             length + fragheaderlen <= mtu &&
 794             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 795             !exthdrlen)
 796                 csummode = CHECKSUM_HW;
 797
 798         inet->cork.length += length;
 799
 800         /* So, what's going on in the loop below?
 801          *
 802          * We use calculated fragment length to generate chained skb,
 803          * each of segments is IP fragment ready for sending to network after
 804          * adding appropriate IP header.
 805          */
 806
 807         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 808                 goto alloc_new_skb;
 809
 810         while (length > 0) {
 811                 /* Check if the remaining data fits into current packet. */
 812                 copy = mtu - skb->len;
 813                 if (copy < length)
 814                         copy = maxfraglen - skb->len;
 815                 if (copy <= 0) {
 816                         char *data;
 817                         unsigned int datalen;
 818                         unsigned int fraglen;
 819                         unsigned int fraggap;
 820                         unsigned int alloclen;
 821                         struct sk_buff *skb_prev;
 822 alloc_new_skb:
 823                         skb_prev = skb;
 824                         if (skb_prev)
 825                                 fraggap = skb_prev->len - maxfraglen;
 826                         else
 827                                 fraggap = 0;
 828
 829                         /*
 830                          * If remaining data exceeds the mtu,
 831                          * we know we need more fragment(s).
 832                          */
 833                         datalen = length + fraggap;
 834                         if (datalen > mtu - fragheaderlen)
 835                                 datalen = maxfraglen - fragheaderlen;
 836                         fraglen = datalen + fragheaderlen;
 837
 838                         if ((flags & MSG_MORE) &&
 839                             !(rt->u.dst.dev->features&NETIF_F_SG))
 840                                 alloclen = mtu;
 841                         else
 842                                 alloclen = datalen + fragheaderlen;
 843
 844                         /* The last fragment gets additional space at tail.
 845                          * Note, with MSG_MORE we overallocate on fragments,
 846                          * because we have no idea what fragment will be
 847                          * the last.
 848                          */
 849                         if (datalen == length)
 850                                 alloclen += rt->u.dst.trailer_len;
 851
 852                         if (transhdrlen) {
 853                                 skb = sock_alloc_send_skb(sk,
 854                                                 alloclen + hh_len + 15,
 855                                                 (flags & MSG_DONTWAIT), &err);
 856                         } else {
 857                                 skb = NULL;
 858                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 859                                     2 * sk->sk_sndbuf)
 860                                         skb = sock_wmalloc(sk,
 861                                                            alloclen + hh_len + 15, 1,
 862                                                            sk->sk_allocation);
 863                                 if (unlikely(skb == NULL))
 864                                         err = -ENOBUFS;
 865                         }
 866                         if (skb == NULL)
 867                                 goto error;
 868
 869                         /*
 870                          *      Fill in the control structures
 871                          */
 872                         skb->ip_summed = csummode;
 873                         skb->csum = 0;
 874                         skb_reserve(skb, hh_len);
 875
 876                         /*
 877                          *      Find where to start putting bytes.
 878                          */
 879                         data = skb_put(skb, fraglen);
 880                         skb->nh.raw = data + exthdrlen;
 881                         data += fragheaderlen;
 882                         skb->h.raw = data + exthdrlen;
 883
 884                         if (fraggap) {
 885                                 skb->csum = skb_copy_and_csum_bits(
 886                                         skb_prev, maxfraglen,
 887                                         data + transhdrlen, fraggap, 0);
 888                                 skb_prev->csum = csum_sub(skb_prev->csum,
 889                                                           skb->csum);
 890                                 data += fraggap;
 891                                 skb_trim(skb_prev, maxfraglen);
 892                         }
 893
 894                         copy = datalen - transhdrlen - fraggap;
 895                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 896                                 err = -EFAULT;
 897                                 kfree_skb(skb);
 898                                 goto error;
 899                         }
 900
 901                         offset += copy;
 902                         length -= datalen - fraggap;
 903                         transhdrlen = 0;
 904                         exthdrlen = 0;
 905                         csummode = CHECKSUM_NONE;
 906
 907                         /*
 908                          * Put the packet on the pending queue.
 909                          */
 910                         __skb_queue_tail(&sk->sk_write_queue, skb);
 911                         continue;
 912                 }
 913
 914                 if (copy > length)
 915                         copy = length;
 916
 917                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 918                         unsigned int off;
 919
 920                         off = skb->len;
 921                         if (getfrag(from, skb_put(skb, copy),
 922                                         offset, copy, off, skb) < 0) {
 923                                 __skb_trim(skb, off);
 924                                 err = -EFAULT;
 925                                 goto error;
 926                         }
 927                 } else {
 928                         int i = skb_shinfo(skb)->nr_frags;
 929                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 930                         struct page *page = sk->sk_sndmsg_page;
 931                         int off = sk->sk_sndmsg_off;
 932                         unsigned int left;
 933
 934                         if (page && (left = PAGE_SIZE - off) > 0) {
 935                                 if (copy >= left)
 936                                         copy = left;
 937                                 if (page != frag->page) {
 938                                         if (i == MAX_SKB_FRAGS) {
 939                                                 err = -EMSGSIZE;
 940                                                 goto error;
 941                                         }
 942                                         get_page(page);
 943                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 944                                         frag = &skb_shinfo(skb)->frags[i];
 945                                 }
 946                         } else if (i < MAX_SKB_FRAGS) {
 947                                 if (copy > PAGE_SIZE)
 948                                         copy = PAGE_SIZE;
 949                                 page = alloc_pages(sk->sk_allocation, 0);
 950                                 if (page == NULL)  {
 951                                         err = -ENOMEM;
 952                                         goto error;
 953                                 }
 954                                 sk->sk_sndmsg_page = page;
 955                                 sk->sk_sndmsg_off = 0;
 956
 957                                 skb_fill_page_desc(skb, i, page, 0, 0);
 958                                 frag = &skb_shinfo(skb)->frags[i];
 959                                 skb->truesize += PAGE_SIZE;
 960                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
 961                         } else {
 962                                 err = -EMSGSIZE;
 963                                 goto error;
 964                         }
 965                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
 966                                 err = -EFAULT;
 967                                 goto error;
 968                         }
 969                         sk->sk_sndmsg_off += copy;
 970                         frag->size += copy;
 971                         skb->len += copy;
 972                         skb->data_len += copy;
 973                 }
 974                 offset += copy;
 975                 length -= copy;
 976         }
 977
 978         return 0;
 979
 980 error:
 981         inet->cork.length -= length;
 982         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 983         return err;
 984 }
 985
 986 ssize_t ip_append_page(struct sock *sk, struct page *page,
 987                        int offset, size_t size, int flags)
 988 {
 989         struct inet_sock *inet = inet_sk(sk);
 990         struct sk_buff *skb;
 991         struct rtable *rt;
 992         struct ip_options *opt = NULL;
 993         int hh_len;
 994         int mtu;
 995         int len;
 996         int err;
 997         unsigned int maxfraglen, fragheaderlen, fraggap;
 998
 999         if (inet->hdrincl)
1000                 return -EPERM;
1001
1002         if (flags&MSG_PROBE)
1003                 return 0;
1004
1005         if (skb_queue_empty(&sk->sk_write_queue))
1006                 return -EINVAL;
1007
1008         rt = inet->cork.rt;
1009         if (inet->cork.flags & IPCORK_OPT)
1010                 opt = inet->cork.opt;
1011
1012         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1013                 return -EOPNOTSUPP;
1014
1015         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1016         mtu = inet->cork.fragsize;
1017
1018         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1019         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1020
1021         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1022                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1023                 return -EMSGSIZE;
1024         }
1025
1026         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1027                 return -EINVAL;
1028
1029         inet->cork.length += size;
1030
1031         while (size > 0) {
1032                 int i;
1033
1034                 /* Check if the remaining data fits into current packet. */
1035                 len = mtu - skb->len;
1036                 if (len < size)
1037                         len = maxfraglen - skb->len;
1038                 if (len <= 0) {
1039                         struct sk_buff *skb_prev;
1040                         char *data;
1041                         struct iphdr *iph;
1042                         int alloclen;
1043
1044                         skb_prev = skb;
1045                         if (skb_prev)
1046                                 fraggap = skb_prev->len - maxfraglen;
1047                         else
1048                                 fraggap = 0;
1049
1050                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1051                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1052                         if (unlikely(!skb)) {
1053                                 err = -ENOBUFS;
1054                                 goto error;
1055                         }
1056
1057                         /*
1058                          *      Fill in the control structures
1059                          */
1060                         skb->ip_summed = CHECKSUM_NONE;
1061                         skb->csum = 0;
1062                         skb_reserve(skb, hh_len);
1063
1064                         /*
1065                          *      Find where to start putting bytes.
1066                          */
1067                         data = skb_put(skb, fragheaderlen + fraggap);
1068                         skb->nh.iph = iph = (struct iphdr *)data;
1069                         data += fragheaderlen;
1070                         skb->h.raw = data;
1071
1072                         if (fraggap) {
1073                                 skb->csum = skb_copy_and_csum_bits(
1074                                         skb_prev, maxfraglen,
1075                                         data, fraggap, 0);
1076                                 skb_prev->csum = csum_sub(skb_prev->csum,
1077                                                           skb->csum);
1078                                 skb_trim(skb_prev, maxfraglen);
1079                         }
1080
1081                         /*
1082                          * Put the packet on the pending queue.
1083                          */
1084                         __skb_queue_tail(&sk->sk_write_queue, skb);
1085                         continue;
1086                 }
1087
1088                 i = skb_shinfo(skb)->nr_frags;
1089                 if (len > size)
1090                         len = size;
1091                 if (skb_can_coalesce(skb, i, page, offset)) {
1092                         skb_shinfo(skb)->frags[i-1].size += len;
1093                 } else if (i < MAX_SKB_FRAGS) {
1094                         get_page(page);
1095                         skb_fill_page_desc(skb, i, page, offset, len);
1096                 } else {
1097                         err = -EMSGSIZE;
1098                         goto error;
1099                 }
1100
1101                 if (skb->ip_summed == CHECKSUM_NONE) {
1102                         unsigned int csum;
1103                         csum = csum_page(page, offset, len);
1104                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1105                 }
1106
1107                 skb->len += len;
1108                 skb->data_len += len;
1109                 offset += len;
1110                 size -= len;
1111         }
1112         return 0;
1113
1114 error:
1115         inet->cork.length -= size;
1116         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1117         return err;
1118 }
1119
1120 /*
1121  *      Combined all pending IP fragments on the socket as one IP datagram
1122  *      and push them out.
1123  */
1124 int ip_push_pending_frames(struct sock *sk)
1125 {
1126         struct sk_buff *skb, *tmp_skb;
1127         struct sk_buff **tail_skb;
1128         struct inet_sock *inet = inet_sk(sk);
1129         struct ip_options *opt = NULL;
1130         struct rtable *rt = inet->cork.rt;
1131         struct iphdr *iph;
1132         int df = 0;
1133         __u8 ttl;
1134         int err = 0;
1135
1136         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1137                 goto out;
1138         tail_skb = &(skb_shinfo(skb)->frag_list);
1139
1140         /* move skb->data to ip header from ext header */
1141         if (skb->data < skb->nh.raw)
1142                 __skb_pull(skb, skb->nh.raw - skb->data);
1143         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1144                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1145                 *tail_skb = tmp_skb;
1146                 tail_skb = &(tmp_skb->next);
1147                 skb->len += tmp_skb->len;
1148                 skb->data_len += tmp_skb->len;
1149                 skb->truesize += tmp_skb->truesize;
1150                 __sock_put(tmp_skb->sk);
1151                 tmp_skb->destructor = NULL;
1152                 tmp_skb->sk = NULL;
1153         }
1154
1155         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1156          * to fragment the frame generated here. No matter, what transforms
1157          * how transforms change size of the packet, it will come out.
1158          */
1159         if (inet->pmtudisc != IP_PMTUDISC_DO)
1160                 skb->local_df = 1;
1161
1162         /* DF bit is set when we want to see DF on outgoing frames.
1163          * If local_df is set too, we still allow to fragment this frame
1164          * locally. */
1165         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1166             (skb->len <= dst_mtu(&rt->u.dst) &&
1167              ip_dont_fragment(sk, &rt->u.dst)))
1168                 df = htons(IP_DF);
1169
1170         if (inet->cork.flags & IPCORK_OPT)
1171                 opt = inet->cork.opt;
1172
1173         if (rt->rt_type == RTN_MULTICAST)
1174                 ttl = inet->mc_ttl;
1175         else
1176                 ttl = ip_select_ttl(inet, &rt->u.dst);
1177
1178         iph = (struct iphdr *)skb->data;
1179         iph->version = 4;
1180         iph->ihl = 5;
1181         if (opt) {
1182                 iph->ihl += opt->optlen>>2;
1183                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1184         }
1185         iph->tos = inet->tos;
1186         iph->tot_len = htons(skb->len);
1187         iph->frag_off = df;
1188         if (!df) {
1189                 __ip_select_ident(iph, &rt->u.dst, 0);
1190         } else {
1191                 iph->id = htons(inet->id++);
1192         }
1193         iph->ttl = ttl;
1194         iph->protocol = sk->sk_protocol;
1195         iph->saddr = rt->rt_src;
1196         iph->daddr = rt->rt_dst;
1197         ip_send_check(iph);
1198
1199         skb->priority = sk->sk_priority;
1200         skb->dst = dst_clone(&rt->u.dst);
1201
1202         /* Netfilter gets whole the not fragmented skb. */
1203         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1204                       skb->dst->dev, dst_output);
1205         if (err) {
1206                 if (err > 0)
1207                         err = inet->recverr ? net_xmit_errno(err) : 0;
1208                 if (err)
1209                         goto error;
1210         }
1211
1212 out:
1213         inet->cork.flags &= ~IPCORK_OPT;
1214         if (inet->cork.opt) {
1215                 kfree(inet->cork.opt);
1216                 inet->cork.opt = NULL;
1217         }
1218         if (inet->cork.rt) {
1219                 ip_rt_put(inet->cork.rt);
1220                 inet->cork.rt = NULL;
1221         }
1222         return err;
1223
1224 error:
1225         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1226         goto out;
1227 }
1228
1229 /*
1230  *      Throw away all pending data on the socket.
1231  */
1232 void ip_flush_pending_frames(struct sock *sk)
1233 {
1234         struct inet_sock *inet = inet_sk(sk);
1235         struct sk_buff *skb;
1236
1237         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1238                 kfree_skb(skb);
1239
1240         inet->cork.flags &= ~IPCORK_OPT;
1241         if (inet->cork.opt) {
1242                 kfree(inet->cork.opt);
1243                 inet->cork.opt = NULL;
1244         }
1245         if (inet->cork.rt) {
1246                 ip_rt_put(inet->cork.rt);
1247                 inet->cork.rt = NULL;
1248         }
1249 }
1250
1251
1252 /*
1253  *      Fetch data from kernel space and fill in checksum if needed.
1254  */
1255 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1256                               int len, int odd, struct sk_buff *skb)
1257 {
1258         unsigned int csum;
1259
1260         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1261         skb->csum = csum_block_add(skb->csum, csum, odd);
1262         return 0;
1263 }
1264
1265 /*
1266  *      Generic function to send a packet as reply to another packet.
1267  *      Used to send TCP resets so far. ICMP should use this function too.
1268  *
1269  *      Should run single threaded per socket because it uses the sock
1270  *      structure to pass arguments.
1271  *
1272  *      LATER: switch from ip_build_xmit to ip_append_*
1273  */
1274 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1275                    unsigned int len)
1276 {
1277         struct inet_sock *inet = inet_sk(sk);
1278         struct {
1279                 struct ip_options       opt;
1280                 char                    data[40];
1281         } replyopts;
1282         struct ipcm_cookie ipc;
1283         u32 daddr;
1284         struct rtable *rt = (struct rtable*)skb->dst;
1285
1286         if (ip_options_echo(&replyopts.opt, skb))
1287                 return;
1288
1289         daddr = ipc.addr = rt->rt_src;
1290         ipc.opt = NULL;
1291
1292         if (replyopts.opt.optlen) {
1293                 ipc.opt = &replyopts.opt;
1294
1295                 if (ipc.opt->srr)
1296                         daddr = replyopts.opt.faddr;
1297         }
1298
1299         {
1300                 struct flowi fl = { .nl_u = { .ip4_u =
1301                                               { .daddr = daddr,
1302                                                 .saddr = rt->rt_spec_dst,
1303                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1304                                     /* Not quite clean, but right. */
1305                                     .uli_u = { .ports =
1306                                                { .sport = skb->h.th->dest,
1307                                                  .dport = skb->h.th->source } },
1308                                     .proto = sk->sk_protocol };
1309                 if (ip_route_output_key(&rt, &fl))
1310                         return;
1311         }
1312
1313         /* And let IP do all the hard work.
1314
1315            This chunk is not reenterable, hence spinlock.
1316            Note that it uses the fact, that this function is called
1317            with locally disabled BH and that sk cannot be already spinlocked.
1318          */
1319         bh_lock_sock(sk);
1320         inet->tos = skb->nh.iph->tos;
1321         sk->sk_priority = skb->priority;
1322         sk->sk_protocol = skb->nh.iph->protocol;
1323         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1324                        &ipc, rt, MSG_DONTWAIT);
1325         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1326                 if (arg->csumoffset >= 0)
1327                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1328                 skb->ip_summed = CHECKSUM_NONE;
1329                 ip_push_pending_frames(sk);
1330         }
1331
1332         bh_unlock_sock(sk);
1333
1334         ip_rt_put(rt);
1335 }
1336
1337 /*
1338  *      IP protocol layer initialiser
1339  */
1340
1341 static struct packet_type ip_packet_type = {
1342         .type = __constant_htons(ETH_P_IP),
1343         .func = ip_rcv,
1344 };
1345
1346 /*
1347  *      IP registers the packet type and then calls the subprotocol initialisers
1348  */
1349
1350 void __init ip_init(void)
1351 {
1352         dev_add_pack(&ip_packet_type);
1353
1354         ip_rt_init();
1355         inet_initpeers();
1356
1357 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1358         igmp_mc_proc_init();
1359 #endif
1360 }
1361
/* Symbols exported from the IP output module. */
EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);

/* The default TTL is only exported when sysctl support is configured. */
#if defined(CONFIG_SYSCTL)
EXPORT_SYMBOL(sysctl_ip_default_ttl);
#endif