net/ipv4/ip_output.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              The Internet Protocol (IP) output module.
   7  *
   8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Donald Becker, <becker@super.org>
  13  *              Alan Cox, <Alan.Cox@linux.org>
  14  *              Richard Underwood
  15  *              Stefan Becker, <stefanb@yello.ping.de>
  16  *              Jorge Cwik, <jorge@laser.satlink.net>
  17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
  19  *
  20  *      See ip_input.c for original log
  21  *
  22  *      Fixes:
  23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
  24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
  25  *              Bradford Johnson:       Fix faulty handling of some frames when
  26  *                                      no route is found.
  27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
  28  *                                      (in case if packet not accepted by
  29  *                                      output firewall rules)
  30  *              Mike McLagan    :       Routing by source
  31  *              Alexey Kuznetsov:       use new route cache
  32  *              Andi Kleen:             Fix broken PMTU recovery and remove
  33  *                                      some redundant tests.
  34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
  36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
  37  *                                      for decreased register pressure on x86
  38  *                                      and more readability.
  39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
  40  *                                      silently drop skb instead of failing with -EPERM.
  41  *              Detlev Wengorz  :       Copy protocol for fragments.
  42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
  43  *                                      datagrams.
  44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
  45  */
  46
  47 #include <asm/uaccess.h>
  48 #include <asm/system.h>
  49 #include <linux/module.h>
  50 #include <linux/types.h>
  51 #include <linux/kernel.h>
  52 #include <linux/sched.h>
  53 #include <linux/mm.h>
  54 #include <linux/string.h>
  55 #include <linux/errno.h>
  56 #include <linux/config.h>
  57
  58 #include <linux/socket.h>
  59 #include <linux/sockios.h>
  60 #include <linux/in.h>
  61 #include <linux/inet.h>
  62 #include <linux/netdevice.h>
  63 #include <linux/etherdevice.h>
  64 #include <linux/proc_fs.h>
  65 #include <linux/stat.h>
  66 #include <linux/init.h>
  67
  68 #include <net/snmp.h>
  69 #include <net/ip.h>
  70 #include <net/protocol.h>
  71 #include <net/route.h>
  72 #include <net/tcp.h>
  73 #include <net/udp.h>
  74 #include <linux/skbuff.h>
  75 #include <net/sock.h>
  76 #include <net/arp.h>
  77 #include <net/icmp.h>
  78 #include <net/raw.h>
  79 #include <net/checksum.h>
  80 #include <net/inetpeer.h>
  81 #include <net/checksum.h>
  82 #include <linux/igmp.h>
  83 #include <linux/netfilter_ipv4.h>
  84 #include <linux/netfilter_bridge.h>
  85 #include <linux/mroute.h>
  86 #include <linux/netlink.h>
  87
  88 /*
  89  *      Shall we try to damage output packets if routing dev changes?
  90  */
  91
  92 int sysctl_ip_dynaddr;
  93 int sysctl_ip_default_ttl = IPDEFTTL;
  94
  95 /* Generate a checksum for an outgoing IP datagram. */
  96 __inline__ void ip_send_check(struct iphdr *iph)
  97 {
  98         iph->check = 0;
  99         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 100 }
 101
 102 /* dev_loopback_xmit for use with netfilter. */
 103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 104 {
 105         newskb->mac.raw = newskb->data;
 106         __skb_pull(newskb, newskb->nh.raw - newskb->data);
 107         newskb->pkt_type = PACKET_LOOPBACK;
 108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
 109         BUG_TRAP(newskb->dst);
 110
 111 #ifdef CONFIG_NETFILTER_DEBUG
 112         nf_debug_ip_loopback_xmit(newskb);
 113 #endif
 114         netif_rx(newskb);
 115         return 0;
 116 }
 117
 118 static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst)
 119 {
 120         int ttl = inet->uc_ttl;
 121
 122         if (ttl < 0)
 123                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
 124         return ttl;
 125 }
 126
 127 /*
 128  *              Add an ip header to a skbuff and send it out.
 129  *
 130  */
 131 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 132                           u32 saddr, u32 daddr, struct ip_options *opt)
 133 {
 134         struct inet_opt *inet = inet_sk(sk);
 135         struct rtable *rt = (struct rtable *)skb->dst;
 136         struct iphdr *iph;
 137
 138         /* Build the IP header. */
 139         if (opt)
 140                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
 141         else
 142                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
 143
 144         iph->version  = 4;
 145         iph->ihl      = 5;
 146         iph->tos      = inet->tos;
 147         if (ip_dont_fragment(sk, &rt->u.dst))
 148                 iph->frag_off = htons(IP_DF);
 149         else
 150                 iph->frag_off = 0;
 151         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 152         iph->daddr    = rt->rt_dst;
 153         iph->saddr    = rt->rt_src;
 154         iph->protocol = sk->sk_protocol;
 155         iph->tot_len  = htons(skb->len);
 156         ip_select_ident(iph, &rt->u.dst, sk);
 157         skb->nh.iph   = iph;
 158
 159         if (opt && opt->optlen) {
 160                 iph->ihl += opt->optlen>>2;
 161                 ip_options_build(skb, opt, daddr, rt, 0);
 162         }
 163         ip_send_check(iph);
 164
 165         skb->priority = sk->sk_priority;
 166
 167         /* Send it out. */
 168         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 169                        dst_output);
 170 }
 171
 172 static inline int ip_finish_output2(struct sk_buff *skb)
 173 {
 174         struct dst_entry *dst = skb->dst;
 175         struct hh_cache *hh = dst->hh;
 176         struct net_device *dev = dst->dev;
 177         int hh_len = LL_RESERVED_SPACE(dev);
 178
 179         /* Be paranoid, rather than too clever. */
 180         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
 181                 struct sk_buff *skb2;
 182
 183                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 184                 if (skb2 == NULL) {
 185                         kfree_skb(skb);
 186                         return -ENOMEM;
 187                 }
 188                 if (skb->sk)
 189                         skb_set_owner_w(skb2, skb->sk);
 190                 kfree_skb(skb);
 191                 skb = skb2;
 192         }
 193
 194 #ifdef CONFIG_NETFILTER_DEBUG
 195         nf_debug_ip_finish_output2(skb);
 196 #endif /*CONFIG_NETFILTER_DEBUG*/
 197
 198         if (hh) {
 199                 int hh_alen;
 200
 201                 read_lock_bh(&hh->hh_lock);
 202                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
 203                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
 204                 read_unlock_bh(&hh->hh_lock);
 205                 skb_push(skb, hh->hh_len);
 206                 return hh->hh_output(skb);
 207         } else if (dst->neighbour)
 208                 return dst->neighbour->output(skb);
 209
 210         if (net_ratelimit())
 211                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
 212         kfree_skb(skb);
 213         return -EINVAL;
 214 }
 215
 216 int ip_finish_output(struct sk_buff *skb)
 217 {
 218         struct net_device *dev = skb->dst->dev;
 219
 220         skb->dev = dev;
 221         skb->protocol = htons(ETH_P_IP);
 222
 223         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
 224                        ip_finish_output2);
 225 }
 226
 227 int ip_mc_output(struct sk_buff **pskb)
 228 {
 229         struct sk_buff *skb = *pskb;
 230         struct sock *sk = skb->sk;
 231         struct rtable *rt = (struct rtable*)skb->dst;
 232         struct net_device *dev = rt->u.dst.dev;
 233
 234         /*
 235          *      If the indicated interface is up and running, send the packet.
 236          */
 237         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 238
 239         skb->dev = dev;
 240         skb->protocol = htons(ETH_P_IP);
 241
 242         /*
 243          *      Multicasts are looped back for other local users
 244          */
 245
 246         if (rt->rt_flags&RTCF_MULTICAST) {
 247                 if ((!sk || inet_sk(sk)->mc_loop)
 248 #ifdef CONFIG_IP_MROUTE
 249                 /* Small optimization: do not loopback not local frames,
 250                    which returned after forwarding; they will be  dropped
 251                    by ip_mr_input in any case.
 252                    Note, that local frames are looped back to be delivered
 253                    to local recipients.
 254
 255                    This check is duplicated in ip_mr_input at the moment.
 256                  */
 257                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
 258 #endif
 259                 ) {
 260                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 261                         if (newskb)
 262                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 263                                         newskb->dev,
 264                                         ip_dev_loopback_xmit);
 265                 }
 266
 267                 /* Multicasts with ttl 0 must not go beyond the host */
 268
 269                 if (skb->nh.iph->ttl == 0) {
 270                         kfree_skb(skb);
 271                         return 0;
 272                 }
 273         }
 274
 275         if (rt->rt_flags&RTCF_BROADCAST) {
 276                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 277                 if (newskb)
 278                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
 279                                 newskb->dev, ip_dev_loopback_xmit);
 280         }
 281
 282         if (skb->len > dst_pmtu(&rt->u.dst) || skb_shinfo(skb)->frag_list)
 283                 return ip_fragment(skb, ip_finish_output);
 284         else
 285                 return ip_finish_output(skb);
 286 }
 287
 288 int ip_output(struct sk_buff **pskb)
 289 {
 290         struct sk_buff *skb = *pskb;
 291
 292         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 293
 294         if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list) &&
 295             !skb_shinfo(skb)->tso_size)
 296                 return ip_fragment(skb, ip_finish_output);
 297         else
 298                 return ip_finish_output(skb);
 299 }
 300
 301 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 302 {
 303         struct sock *sk = skb->sk;
 304         struct inet_opt *inet = inet_sk(sk);
 305         struct ip_options *opt = inet->opt;
 306         struct rtable *rt;
 307         struct iphdr *iph;
 308
 309         /* Skip all of this if the packet is already routed,
 310          * f.e. by something like SCTP.
 311          */
 312         rt = (struct rtable *) skb->dst;
 313         if (rt != NULL)
 314                 goto packet_routed;
 315
 316         /* Make sure we can route this packet. */
 317         rt = (struct rtable *)__sk_dst_check(sk, 0);
 318         if (rt == NULL) {
 319                 u32 daddr;
 320
 321                 /* Use correct destination address if we have options. */
 322                 daddr = inet->daddr;
 323                 if(opt && opt->srr)
 324                         daddr = opt->faddr;
 325
 326                 {
 327                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
 328                                             .nl_u = { .ip4_u =
 329                                                       { .daddr = daddr,
 330                                                         .saddr = inet->saddr,
 331                                                         .tos = RT_CONN_FLAGS(sk) } },
 332                                             .proto = sk->sk_protocol,
 333                                             .uli_u = { .ports =
 334                                                        { .sport = inet->sport,
 335                                                          .dport = inet->dport } } };
 336
 337                         /* If this fails, retransmit mechanism of transport layer will
 338                          * keep trying until route appears or the connection times
 339                          * itself out.
 340                          */
 341                         if (ip_route_output_flow(&rt, &fl, sk, 0))
 342                                 goto no_route;
 343                 }
 344                 __sk_dst_set(sk, &rt->u.dst);
 345                 tcp_v4_setup_caps(sk, &rt->u.dst);
 346         }
 347         skb->dst = dst_clone(&rt->u.dst);
 348
 349 packet_routed:
 350         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 351                 goto no_route;
 352
 353         /* OK, we know where to send it, allocate and build IP header. */
 354         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
 355         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
 356         iph->tot_len = htons(skb->len);
 357         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 358                 iph->frag_off = htons(IP_DF);
 359         else
 360                 iph->frag_off = 0;
 361         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 362         iph->protocol = sk->sk_protocol;
 363         iph->saddr    = rt->rt_src;
 364         iph->daddr    = rt->rt_dst;
 365         skb->nh.iph   = iph;
 366         /* Transport layer set skb->h.foo itself. */
 367
 368         if (opt && opt->optlen) {
 369                 iph->ihl += opt->optlen >> 2;
 370                 ip_options_build(skb, opt, inet->daddr, rt, 0);
 371         }
 372
 373         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
 374
 375         /* Add an IP checksum. */
 376         ip_send_check(iph);
 377
 378         skb->priority = sk->sk_priority;
 379
 380         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 381                        dst_output);
 382
 383 no_route:
 384         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 385         kfree_skb(skb);
 386         return -EHOSTUNREACH;
 387 }
 388
 389
 390 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 391 {
 392         to->pkt_type = from->pkt_type;
 393         to->priority = from->priority;
 394         to->protocol = from->protocol;
 395         to->security = from->security;
 396         to->dst = dst_clone(from->dst);
 397         to->dev = from->dev;
 398
 399         /* Copy the flags to each fragment. */
 400         IPCB(to)->flags = IPCB(from)->flags;
 401
 402 #ifdef CONFIG_NET_SCHED
 403         to->tc_index = from->tc_index;
 404 #endif
 405 #ifdef CONFIG_NETFILTER
 406         to->nfmark = from->nfmark;
 407         to->nfcache = from->nfcache;
 408         /* Connection association is same as pre-frag packet */
 409         nf_conntrack_put(to->nfct);
 410         to->nfct = from->nfct;
 411         nf_conntrack_get(to->nfct);
 412         to->nfctinfo = from->nfctinfo;
 413 #ifdef CONFIG_BRIDGE_NETFILTER
 414         nf_bridge_put(to->nf_bridge);
 415         to->nf_bridge = from->nf_bridge;
 416         nf_bridge_get(to->nf_bridge);
 417 #endif
 418 #ifdef CONFIG_NETFILTER_DEBUG
 419         to->nf_debug = from->nf_debug;
 420 #endif
 421 #endif
 422 }
 423
 424 /*
 425  *      This IP datagram is too large to be sent in one piece.  Break it up into
 426  *      smaller pieces (each of size equal to IP header plus
 427  *      a block of the data of the original IP data part) that will yet fit in a
 428  *      single device frame, and queue such a frame for sending.
 429  */
 430
 431 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 432 {
 433         struct iphdr *iph;
 434         int raw = 0;
 435         int ptr;
 436         struct net_device *dev;
 437         struct sk_buff *skb2;
 438         unsigned int mtu, hlen, left, len, ll_rs;
 439         int offset;
 440         int not_last_frag;
 441         struct rtable *rt = (struct rtable*)skb->dst;
 442         int err = 0;
 443
 444         dev = rt->u.dst.dev;
 445
 446         /*
 447          *      Point into the IP datagram header.
 448          */
 449
 450         iph = skb->nh.iph;
 451
 452         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 453                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 454                           htonl(dst_pmtu(&rt->u.dst)));
 455                 kfree_skb(skb);
 456                 return -EMSGSIZE;
 457         }
 458
 459         /*
 460          *      Setup starting values.
 461          */
 462
 463         hlen = iph->ihl * 4;
 464         mtu = dst_pmtu(&rt->u.dst) - hlen;      /* Size of data space */
 465
 466         /* When frag_list is given, use it. First, check its validity:
 467          * some transformers could create wrong frag_list or break existing
 468          * one, it is not prohibited. In this case fall back to copying.
 469          *
 470          * LATER: this step can be merged to real generation of fragments,
 471          * we can switch to copy when see the first bad fragment.
 472          */
 473         if (skb_shinfo(skb)->frag_list) {
 474                 struct sk_buff *frag;
 475                 int first_len = skb_pagelen(skb);
 476
 477                 if (first_len - hlen > mtu ||
 478                     ((first_len - hlen) & 7) ||
 479                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
 480                     skb_cloned(skb))
 481                         goto slow_path;
 482
 483                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
 484                         /* Correct geometry. */
 485                         if (frag->len > mtu ||
 486                             ((frag->len & 7) && frag->next) ||
 487                             skb_headroom(frag) < hlen)
 488                             goto slow_path;
 489
 490                         /* Partially cloned skb? */
 491                         if (skb_shared(frag))
 492                                 goto slow_path;
 493                 }
 494
 495                 /* Everything is OK. Generate! */
 496
 497                 err = 0;
 498                 offset = 0;
 499                 frag = skb_shinfo(skb)->frag_list;
 500                 skb_shinfo(skb)->frag_list = NULL;
 501                 skb->data_len = first_len - skb_headlen(skb);
 502                 skb->len = first_len;
 503                 iph->tot_len = htons(first_len);
 504                 iph->frag_off |= htons(IP_MF);
 505                 ip_send_check(iph);
 506
 507                 for (;;) {
 508                         /* Prepare header of the next frame,
 509                          * before previous one went down. */
 510                         if (frag) {
 511                                 frag->h.raw = frag->data;
 512                                 frag->nh.raw = __skb_push(frag, hlen);
 513                                 memcpy(frag->nh.raw, iph, hlen);
 514                                 iph = frag->nh.iph;
 515                                 iph->tot_len = htons(frag->len);
 516                                 ip_copy_metadata(frag, skb);
 517                                 if (offset == 0)
 518                                         ip_options_fragment(frag);
 519                                 offset += skb->len - hlen;
 520                                 iph->frag_off = htons(offset>>3);
 521                                 if (frag->next != NULL)
 522                                         iph->frag_off |= htons(IP_MF);
 523                                 /* Ready, complete checksum */
 524                                 ip_send_check(iph);
 525                         }
 526
 527                         err = output(skb);
 528
 529                         if (err || !frag)
 530                                 break;
 531
 532                         skb = frag;
 533                         frag = skb->next;
 534                         skb->next = NULL;
 535                 }
 536
 537                 if (err == 0) {
 538                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 539                         return 0;
 540                 }
 541
 542                 while (frag) {
 543                         skb = frag->next;
 544                         kfree_skb(frag);
 545                         frag = skb;
 546                 }
 547                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 548                 return err;
 549         }
 550
 551 slow_path:
 552         left = skb->len - hlen;         /* Space per frame */
 553         ptr = raw + hlen;               /* Where to start from */
 554
 555 #ifdef CONFIG_BRIDGE_NETFILTER
 556         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
 557          * we need to make room for the encapsulating header */
 558         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
 559         mtu -= nf_bridge_pad(skb);
 560 #else
 561         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
 562 #endif
 563         /*
 564          *      Fragment the datagram.
 565          */
 566
 567         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 568         not_last_frag = iph->frag_off & htons(IP_MF);
 569
 570         /*
 571          *      Keep copying data until we run out.
 572          */
 573
 574         while(left > 0) {
 575                 len = left;
 576                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 577                 if (len > mtu)
 578                         len = mtu;
 579                 /* IF: we are not sending upto and including the packet end
 580                    then align the next start on an eight byte boundary */
 581                 if (len < left) {
 582                         len &= ~7;
 583                 }
 584                 /*
 585                  *      Allocate buffer.
 586                  */
 587
 588                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
 589                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 590                         err = -ENOMEM;
 591                         goto fail;
 592                 }
 593
 594                 /*
 595                  *      Set up data on packet
 596                  */
 597
 598                 ip_copy_metadata(skb2, skb);
 599                 skb_reserve(skb2, ll_rs);
 600                 skb_put(skb2, len + hlen);
 601                 skb2->nh.raw = skb2->data;
 602                 skb2->h.raw = skb2->data + hlen;
 603
 604                 /*
 605                  *      Charge the memory for the fragment to any owner
 606                  *      it might possess
 607                  */
 608
 609                 if (skb->sk)
 610                         skb_set_owner_w(skb2, skb->sk);
 611
 612                 /*
 613                  *      Copy the packet header into the new buffer.
 614                  */
 615
 616                 memcpy(skb2->nh.raw, skb->data, hlen);
 617
 618                 /*
 619                  *      Copy a block of the IP datagram.
 620                  */
 621                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
 622                         BUG();
 623                 left -= len;
 624
 625                 /*
 626                  *      Fill in the new header fields.
 627                  */
 628                 iph = skb2->nh.iph;
 629                 iph->frag_off = htons((offset >> 3));
 630
 631                 /* ANK: dirty, but effective trick. Upgrade options only if
 632                  * the segment to be fragmented was THE FIRST (otherwise,
 633                  * options are already fixed) and make it ONCE
 634                  * on the initial skb, so that all the following fragments
 635                  * will inherit fixed options.
 636                  */
 637                 if (offset == 0)
 638                         ip_options_fragment(skb);
 639
 640                 /*
 641                  *      Added AC : If we are fragmenting a fragment that's not the
 642                  *                 last fragment then keep MF on each bit
 643                  */
 644                 if (left > 0 || not_last_frag)
 645                         iph->frag_off |= htons(IP_MF);
 646                 ptr += len;
 647                 offset += len;
 648
 649                 /*
 650                  *      Put this fragment into the sending queue.
 651                  */
 652
 653                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
 654
 655                 iph->tot_len = htons(len + hlen);
 656
 657                 ip_send_check(iph);
 658
 659                 err = output(skb2);
 660                 if (err)
 661                         goto fail;
 662         }
 663         kfree_skb(skb);
 664         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
 665         return err;
 666
 667 fail:
 668         kfree_skb(skb);
 669         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 670         return err;
 671 }
 672
 673 int
 674 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
 675 {
 676         struct iovec *iov = from;
 677
 678         if (skb->ip_summed == CHECKSUM_HW) {
 679                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 680                         return -EFAULT;
 681         } else {
 682                 unsigned int csum = 0;
 683                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
 684                         return -EFAULT;
 685                 skb->csum = csum_block_add(skb->csum, csum, odd);
 686         }
 687         return 0;
 688 }
 689
 690 static inline unsigned int
 691 csum_page(struct page *page, int offset, int copy)
 692 {
 693         char *kaddr;
 694         unsigned int csum;
 695         kaddr = kmap(page);
 696         csum = csum_partial(kaddr + offset, copy, 0);
 697         kunmap(page);
 698         return csum;
 699 }
 700
 701 /*
 702  *      ip_append_data() and ip_append_page() can make one large IP datagram
 703  *      from many pieces of data. Each piece will be held on the socket
 704  *      until ip_push_pending_frames() is called. Each piece can be a page
 705  *      or non-page data.
 706  *
 707  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 708  *      this interface potentially.
 709  *
 710  *      LATER: length must be adjusted by pad at tail, when it is required.
 711  */
 712 int ip_append_data(struct sock *sk,
 713                    int getfrag(void *from, char *to, int offset, int len,
 714                                int odd, struct sk_buff *skb),
 715                    void *from, int length, int transhdrlen,
 716                    struct ipcm_cookie *ipc, struct rtable *rt,
 717                    unsigned int flags)
 718 {
 719         struct inet_opt *inet = inet_sk(sk);
 720         struct sk_buff *skb;
 721
 722         struct ip_options *opt = NULL;
 723         int hh_len;
 724         int exthdrlen;
 725         int mtu;
 726         int copy;
 727         int err;
 728         int offset = 0;
 729         unsigned int maxfraglen, fragheaderlen;
 730         int csummode = CHECKSUM_NONE;
 731
 732         if (flags&MSG_PROBE)
 733                 return 0;
 734
 735         if (skb_queue_empty(&sk->sk_write_queue)) {
 736                 /*
 737                  * setup for corking.
 738                  */
 739                 opt = ipc->opt;
 740                 if (opt) {
 741                         if (inet->cork.opt == NULL) {
 742                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
 743                                 if (unlikely(inet->cork.opt == NULL))
 744                                         return -ENOBUFS;
 745                         }
 746                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
 747                         inet->cork.flags |= IPCORK_OPT;
 748                         inet->cork.addr = ipc->addr;
 749                 }
 750                 dst_hold(&rt->u.dst);
 751                 inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
 752                 inet->cork.rt = rt;
 753                 inet->cork.length = 0;
 754                 sk->sk_sndmsg_page = NULL;
 755                 sk->sk_sndmsg_off = 0;
 756                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
 757                         length += exthdrlen;
 758                         transhdrlen += exthdrlen;
 759                 }
 760         } else {
 761                 rt = inet->cork.rt;
 762                 if (inet->cork.flags & IPCORK_OPT)
 763                         opt = inet->cork.opt;
 764
 765                 transhdrlen = 0;
 766                 exthdrlen = 0;
 767                 mtu = inet->cork.fragsize;
 768         }
 769         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 770
 771         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 772         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 773
 774         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 775                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
 776                 return -EMSGSIZE;
 777         }
 778
 779         /*
 780          * transhdrlen > 0 means that this is the first fragment and we wish
 781          * it won't be fragmented in the future.
 782          */
 783         if (transhdrlen &&
 784             length + fragheaderlen <= mtu &&
 785             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 786             !exthdrlen)
 787                 csummode = CHECKSUM_HW;
 788
 789         inet->cork.length += length;
 790
 791         /* So, what's going on in the loop below?
 792          *
 793          * We use calculated fragment length to generate chained skb,
 794          * each of segments is IP fragment ready for sending to network after
 795          * adding appropriate IP header.
 796          */
 797
 798         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 799                 goto alloc_new_skb;
 800
 801         while (length > 0) {
 802                 /* Check if the remaining data fits into current packet. */
 803                 copy = mtu - skb->len;
 804                 if (copy < length)
 805                         copy = maxfraglen - skb->len;
 806                 if (copy <= 0) {
 807                         char *data;
 808                         unsigned int datalen;
 809                         unsigned int fraglen;
 810                         unsigned int fraggap;
 811                         unsigned int alloclen;
 812                         struct sk_buff *skb_prev;
 813 alloc_new_skb:
 814                         skb_prev = skb;
 815                         if (skb_prev)
 816                                 fraggap = skb_prev->len - maxfraglen;
 817                         else
 818                                 fraggap = 0;
 819
 820                         /*
 821                          * If remaining data exceeds the mtu,
 822                          * we know we need more fragment(s).
 823                          */
 824                         datalen = length + fraggap;
 825                         if (datalen > mtu - fragheaderlen)
 826                                 datalen = maxfraglen - fragheaderlen;
 827                         fraglen = datalen + fragheaderlen;
 828
 829                         if ((flags & MSG_MORE) &&
 830                             !(rt->u.dst.dev->features&NETIF_F_SG))
 831                                 alloclen = mtu;
 832                         else
 833                                 alloclen = datalen + fragheaderlen;
 834
 835                         /* The last fragment gets additional space at tail.
 836                          * Note, with MSG_MORE we overallocate on fragments,
 837                          * because we have no idea what fragment will be
 838                          * the last.
 839                          */
 840                         if (datalen == length)
 841                                 alloclen += rt->u.dst.trailer_len;
 842
 843                         if (transhdrlen) {
 844                                 skb = sock_alloc_send_skb(sk,
 845                                                 alloclen + hh_len + 15,
 846                                                 (flags & MSG_DONTWAIT), &err);
 847                         } else {
 848                                 skb = NULL;
 849                                 if (atomic_read(&sk->sk_wmem_alloc) <=
 850                                     2 * sk->sk_sndbuf)
 851                                         skb = sock_wmalloc(sk,
 852                                                            alloclen + hh_len + 15, 1,
 853                                                            sk->sk_allocation);
 854                                 if (unlikely(skb == NULL))
 855                                         err = -ENOBUFS;
 856                         }
 857                         if (skb == NULL)
 858                                 goto error;
 859
 860                         /*
 861                          *      Fill in the control structures
 862                          */
 863                         skb->ip_summed = csummode;
 864                         skb->csum = 0;
 865                         skb_reserve(skb, hh_len);
 866
 867                         /*
 868                          *      Find where to start putting bytes.
 869                          */
 870                         data = skb_put(skb, fraglen);
 871                         skb->nh.raw = data + exthdrlen;
 872                         data += fragheaderlen;
 873                         skb->h.raw = data + exthdrlen;
 874
 875                         if (fraggap) {
 876                                 skb->csum = skb_copy_and_csum_bits(
 877                                         skb_prev, maxfraglen,
 878                                         data + transhdrlen, fraggap, 0);
 879                                 skb_prev->csum = csum_sub(skb_prev->csum,
 880                                                           skb->csum);
 881                                 data += fraggap;
 882                                 skb_trim(skb_prev, maxfraglen);
 883                         }
 884
 885                         copy = datalen - transhdrlen - fraggap;
 886                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 887                                 err = -EFAULT;
 888                                 kfree_skb(skb);
 889                                 goto error;
 890                         }
 891
 892                         offset += copy;
 893                         length -= datalen - fraggap;
 894                         transhdrlen = 0;
 895                         exthdrlen = 0;
 896                         csummode = CHECKSUM_NONE;
 897
 898                         /*
 899                          * Put the packet on the pending queue.
 900                          */
 901                         __skb_queue_tail(&sk->sk_write_queue, skb);
 902                         continue;
 903                 }
 904
 905                 if (copy > length)
 906                         copy = length;
 907
 908                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
 909                         unsigned int off;
 910
 911                         off = skb->len;
 912                         if (getfrag(from, skb_put(skb, copy),
 913                                         offset, copy, off, skb) < 0) {
 914                                 __skb_trim(skb, off);
 915                                 err = -EFAULT;
 916                                 goto error;
 917                         }
 918                 } else {
 919                         int i = skb_shinfo(skb)->nr_frags;
 920                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
 921                         struct page *page = sk->sk_sndmsg_page;
 922                         int off = sk->sk_sndmsg_off;
 923                         unsigned int left;
 924
 925                         if (page && (left = PAGE_SIZE - off) > 0) {
 926                                 if (copy >= left)
 927                                         copy = left;
 928                                 if (page != frag->page) {
 929                                         if (i == MAX_SKB_FRAGS) {
 930                                                 err = -EMSGSIZE;
 931                                                 goto error;
 932                                         }
 933                                         get_page(page);
 934                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
 935                                         frag = &skb_shinfo(skb)->frags[i];
 936                                 }
 937                         } else if (i < MAX_SKB_FRAGS) {
 938                                 if (copy > PAGE_SIZE)
 939                                         copy = PAGE_SIZE;
 940                                 page = alloc_pages(sk->sk_allocation, 0);
 941                                 if (page == NULL)  {
 942                                         err = -ENOMEM;
 943                                         goto error;
 944                                 }
 945                                 sk->sk_sndmsg_page = page;
 946                                 sk->sk_sndmsg_off = 0;
 947
 948                                 skb_fill_page_desc(skb, i, page, 0, 0);
 949                                 frag = &skb_shinfo(skb)->frags[i];
 950                                 skb->truesize += PAGE_SIZE;
 951                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
 952                         } else {
 953                                 err = -EMSGSIZE;
 954                                 goto error;
 955                         }
 956                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
 957                                 err = -EFAULT;
 958                                 goto error;
 959                         }
 960                         sk->sk_sndmsg_off += copy;
 961                         frag->size += copy;
 962                         skb->len += copy;
 963                         skb->data_len += copy;
 964                 }
 965                 offset += copy;
 966                 length -= copy;
 967         }
 968
 969         return 0;
 970
 971 error:
 972         inet->cork.length -= length;
 973         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
 974         return err;
 975 }
 976
 977 ssize_t ip_append_page(struct sock *sk, struct page *page,
 978                        int offset, size_t size, int flags)
 979 {
 980         struct inet_opt *inet = inet_sk(sk);
 981         struct sk_buff *skb;
 982         struct rtable *rt;
 983         struct ip_options *opt = NULL;
 984         int hh_len;
 985         int mtu;
 986         int len;
 987         int err;
 988         unsigned int maxfraglen, fragheaderlen, fraggap;
 989
 990         if (inet->hdrincl)
 991                 return -EPERM;
 992
 993         if (flags&MSG_PROBE)
 994                 return 0;
 995
 996         if (skb_queue_empty(&sk->sk_write_queue))
 997                 return -EINVAL;
 998
 999         rt = inet->cork.rt;
1000         if (inet->cork.flags & IPCORK_OPT)
1001                 opt = inet->cork.opt;
1002
1003         if (!(rt->u.dst.dev->features&NETIF_F_SG))
1004                 return -EOPNOTSUPP;
1005
1006         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1007         mtu = inet->cork.fragsize;
1008
1009         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1010         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1011
1012         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1013                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1014                 return -EMSGSIZE;
1015         }
1016
1017         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1018                 return -EINVAL;
1019
1020         inet->cork.length += size;
1021
1022         while (size > 0) {
1023                 int i;
1024
1025                 /* Check if the remaining data fits into current packet. */
1026                 len = mtu - skb->len;
1027                 if (len < size)
1028                         len = maxfraglen - skb->len;
1029                 if (len <= 0) {
1030                         struct sk_buff *skb_prev;
1031                         char *data;
1032                         struct iphdr *iph;
1033                         int alloclen;
1034
1035                         skb_prev = skb;
1036                         if (skb_prev)
1037                                 fraggap = skb_prev->len - maxfraglen;
1038                         else
1039                                 fraggap = 0;
1040
1041                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1042                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1043                         if (unlikely(!skb)) {
1044                                 err = -ENOBUFS;
1045                                 goto error;
1046                         }
1047
1048                         /*
1049                          *      Fill in the control structures
1050                          */
1051                         skb->ip_summed = CHECKSUM_NONE;
1052                         skb->csum = 0;
1053                         skb_reserve(skb, hh_len);
1054
1055                         /*
1056                          *      Find where to start putting bytes.
1057                          */
1058                         data = skb_put(skb, fragheaderlen + fraggap);
1059                         skb->nh.iph = iph = (struct iphdr *)data;
1060                         data += fragheaderlen;
1061                         skb->h.raw = data;
1062
1063                         if (fraggap) {
1064                                 skb->csum = skb_copy_and_csum_bits(
1065                                         skb_prev, maxfraglen,
1066                                         data, fraggap, 0);
1067                                 skb_prev->csum = csum_sub(skb_prev->csum,
1068                                                           skb->csum);
1069                                 skb_trim(skb_prev, maxfraglen);
1070                         }
1071
1072                         /*
1073                          * Put the packet on the pending queue.
1074                          */
1075                         __skb_queue_tail(&sk->sk_write_queue, skb);
1076                         continue;
1077                 }
1078
1079                 i = skb_shinfo(skb)->nr_frags;
1080                 if (len > size)
1081                         len = size;
1082                 if (skb_can_coalesce(skb, i, page, offset)) {
1083                         skb_shinfo(skb)->frags[i-1].size += len;
1084                 } else if (i < MAX_SKB_FRAGS) {
1085                         get_page(page);
1086                         skb_fill_page_desc(skb, i, page, offset, len);
1087                 } else {
1088                         err = -EMSGSIZE;
1089                         goto error;
1090                 }
1091
1092                 if (skb->ip_summed == CHECKSUM_NONE) {
1093                         unsigned int csum;
1094                         csum = csum_page(page, offset, len);
1095                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1096                 }
1097
1098                 skb->len += len;
1099                 skb->data_len += len;
1100                 offset += len;
1101                 size -= len;
1102         }
1103         return 0;
1104
1105 error:
1106         inet->cork.length -= size;
1107         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1108         return err;
1109 }
1110
1111 /*
1112  *      Combined all pending IP fragments on the socket as one IP datagram
1113  *      and push them out.
1114  */
1115 int ip_push_pending_frames(struct sock *sk)
1116 {
1117         struct sk_buff *skb, *tmp_skb;
1118         struct sk_buff **tail_skb;
1119         struct inet_opt *inet = inet_sk(sk);
1120         struct ip_options *opt = NULL;
1121         struct rtable *rt = inet->cork.rt;
1122         struct iphdr *iph;
1123         int df = 0;
1124         __u8 ttl;
1125         int err = 0;
1126
1127         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1128                 goto out;
1129         tail_skb = &(skb_shinfo(skb)->frag_list);
1130
1131         /* move skb->data to ip header from ext header */
1132         if (skb->data < skb->nh.raw)
1133                 __skb_pull(skb, skb->nh.raw - skb->data);
1134         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1135                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1136                 *tail_skb = tmp_skb;
1137                 tail_skb = &(tmp_skb->next);
1138                 skb->len += tmp_skb->len;
1139                 skb->data_len += tmp_skb->len;
1140                 skb->truesize += tmp_skb->truesize;
1141                 __sock_put(tmp_skb->sk);
1142                 tmp_skb->destructor = NULL;
1143                 tmp_skb->sk = NULL;
1144         }
1145
1146         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1147          * to fragment the frame generated here. No matter, what transforms
1148          * how transforms change size of the packet, it will come out.
1149          */
1150         if (inet->pmtudisc != IP_PMTUDISC_DO)
1151                 skb->local_df = 1;
1152
1153         /* DF bit is set when we want to see DF on outgoing frames.
1154          * If local_df is set too, we still allow to fragment this frame
1155          * locally. */
1156         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1157             (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst)))
1158                 df = htons(IP_DF);
1159
1160         if (inet->cork.flags & IPCORK_OPT)
1161                 opt = inet->cork.opt;
1162
1163         if (rt->rt_type == RTN_MULTICAST)
1164                 ttl = inet->mc_ttl;
1165         else
1166                 ttl = ip_select_ttl(inet, &rt->u.dst);
1167
1168         iph = (struct iphdr *)skb->data;
1169         iph->version = 4;
1170         iph->ihl = 5;
1171         if (opt) {
1172                 iph->ihl += opt->optlen>>2;
1173                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1174         }
1175         iph->tos = inet->tos;
1176         iph->tot_len = htons(skb->len);
1177         iph->frag_off = df;
1178         if (!df) {
1179                 __ip_select_ident(iph, &rt->u.dst, 0);
1180         } else {
1181                 iph->id = htons(inet->id++);
1182         }
1183         iph->ttl = ttl;
1184         iph->protocol = sk->sk_protocol;
1185         iph->saddr = rt->rt_src;
1186         iph->daddr = rt->rt_dst;
1187         ip_send_check(iph);
1188
1189         skb->priority = sk->sk_priority;
1190         skb->dst = dst_clone(&rt->u.dst);
1191
1192         /* Netfilter gets whole the not fragmented skb. */
1193         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1194                       skb->dst->dev, dst_output);
1195         if (err) {
1196                 if (err > 0)
1197                         err = inet->recverr ? net_xmit_errno(err) : 0;
1198                 if (err)
1199                         goto error;
1200         }
1201
1202 out:
1203         inet->cork.flags &= ~IPCORK_OPT;
1204         if (inet->cork.opt) {
1205                 kfree(inet->cork.opt);
1206                 inet->cork.opt = NULL;
1207         }
1208         if (inet->cork.rt) {
1209                 ip_rt_put(inet->cork.rt);
1210                 inet->cork.rt = NULL;
1211         }
1212         return err;
1213
1214 error:
1215         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1216         goto out;
1217 }
1218
1219 /*
1220  *      Throw away all pending data on the socket.
1221  */
1222 void ip_flush_pending_frames(struct sock *sk)
1223 {
1224         struct inet_opt *inet = inet_sk(sk);
1225         struct sk_buff *skb;
1226
1227         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1228                 kfree_skb(skb);
1229
1230         inet->cork.flags &= ~IPCORK_OPT;
1231         if (inet->cork.opt) {
1232                 kfree(inet->cork.opt);
1233                 inet->cork.opt = NULL;
1234         }
1235         if (inet->cork.rt) {
1236                 ip_rt_put(inet->cork.rt);
1237                 inet->cork.rt = NULL;
1238         }
1239 }
1240
1241
1242 /*
1243  *      Fetch data from kernel space and fill in checksum if needed.
1244  */
1245 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1246                               int len, int odd, struct sk_buff *skb)
1247 {
1248         unsigned int csum;
1249
1250         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1251         skb->csum = csum_block_add(skb->csum, csum, odd);
1252         return 0;
1253 }
1254
1255 /*
1256  *      Generic function to send a packet as reply to another packet.
1257  *      Used to send TCP resets so far. ICMP should use this function too.
1258  *
1259  *      Should run single threaded per socket because it uses the sock
1260  *      structure to pass arguments.
1261  *
1262  *      LATER: switch from ip_build_xmit to ip_append_*
1263  */
1264 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1265                    unsigned int len)
1266 {
1267         struct inet_opt *inet = inet_sk(sk);
1268         struct {
1269                 struct ip_options       opt;
1270                 char                    data[40];
1271         } replyopts;
1272         struct ipcm_cookie ipc;
1273         u32 daddr;
1274         struct rtable *rt = (struct rtable*)skb->dst;
1275
1276         if (ip_options_echo(&replyopts.opt, skb))
1277                 return;
1278
1279         daddr = ipc.addr = rt->rt_src;
1280         ipc.opt = NULL;
1281
1282         if (replyopts.opt.optlen) {
1283                 ipc.opt = &replyopts.opt;
1284
1285                 if (ipc.opt->srr)
1286                         daddr = replyopts.opt.faddr;
1287         }
1288
1289         {
1290                 struct flowi fl = { .nl_u = { .ip4_u =
1291                                               { .daddr = daddr,
1292                                                 .saddr = rt->rt_spec_dst,
1293                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1294                                     /* Not quite clean, but right. */
1295                                     .uli_u = { .ports =
1296                                                { .sport = skb->h.th->dest,
1297                                                  .dport = skb->h.th->source } },
1298                                     .proto = sk->sk_protocol };
1299                 if (ip_route_output_key(&rt, &fl))
1300                         return;
1301         }
1302
1303         /* And let IP do all the hard work.
1304
1305            This chunk is not reenterable, hence spinlock.
1306            Note that it uses the fact, that this function is called
1307            with locally disabled BH and that sk cannot be already spinlocked.
1308          */
1309         bh_lock_sock(sk);
1310         inet->tos = skb->nh.iph->tos;
1311         sk->sk_priority = skb->priority;
1312         sk->sk_protocol = skb->nh.iph->protocol;
1313         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1314                        &ipc, rt, MSG_DONTWAIT);
1315         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1316                 if (arg->csumoffset >= 0)
1317                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1318                 skb->ip_summed = CHECKSUM_NONE;
1319                 ip_push_pending_frames(sk);
1320         }
1321
1322         bh_unlock_sock(sk);
1323
1324         ip_rt_put(rt);
1325 }
1326
1327 /*
1328  *      IP protocol layer initialiser
1329  */
1330
1331 static struct packet_type ip_packet_type = {
1332         .type = __constant_htons(ETH_P_IP),
1333         .func = ip_rcv,
1334 };
1335
1336 /*
1337  *      IP registers the packet type and then calls the subprotocol initialisers
1338  */
1339
1340 void __init ip_init(void)
1341 {
1342         dev_add_pack(&ip_packet_type);
1343
1344         ip_rt_init();
1345         inet_initpeers();
1346
1347 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1348         igmp_mc_proc_init();
1349 #endif
1350 }
1351
/* Symbols from this file exported for use outside of it. */
EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);

/* sysctl_ip_default_ttl is exported only when CONFIG_SYSCTL is set. */
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_ip_default_ttl);
#endif