net/ipv4/ip_output.c (Linux 2.3.99pre4-2, davej-history.git)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.83 2000/03/25 01:52:08 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen      :       Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mroute.h>
#include <linux/netlink.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr = 0;
int sysctl_ip_default_ttl = IPDEFTTL;
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
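/*
 * The header checksum covers every header field, so code that rewrites a
 * field after the checksum was computed must recompute it.  A minimal
 * sketch (new_ttl is only an illustrative value, not a field used below):
 *
 *      iph->ttl = new_ttl;
 *      ip_send_check(iph);
 *
 * ip_fragment() below does exactly this after rewriting tot_len and
 * frag_off for each fragment.
 */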
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

#ifdef CONFIG_NETFILTER_DEBUG
        nf_debug_ip_loopback_xmit(newskb);
#endif
        netif_rx(newskb);
        return 0;
}
#ifdef CONFIG_NETFILTER
/* To preserve the cute illusion that a locally-generated packet can
   be mangled before routing, we actually reroute if a hook altered
   the packet. -RR */
static int route_me_harder(struct sk_buff *skb)
{
        struct iphdr *iph = skb->nh.iph;
        struct rtable *rt;

        if (ip_route_output(&rt, iph->daddr, iph->saddr,
                            RT_TOS(iph->tos) | RTO_CONN,
                            skb->sk ? skb->sk->bound_dev_if : 0)) {
                printk("route_me_harder: No more route.\n");
                return -EINVAL;
        }

        /* Drop old route. */
        dst_release(skb->dst);

        skb->dst = &rt->u.dst;
        return 0;
}
#endif
/* Do route recalc if netfilter changes skb. */
static inline int
output_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
        if (skb->nfcache & NFC_ALTERED) {
                if (route_me_harder(skb) != 0) {
                        kfree_skb(skb);
                        return -EINVAL;
                }
        }
#endif
        return skb->dst->output(skb);
}
/*
 *      Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          u32 saddr, u32 daddr, struct ip_options *opt)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version = 4;
        iph->ihl = 5;
        iph->tos = sk->protinfo.af_inet.tos;
        iph->frag_off = 0;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off |= htons(IP_DF);
        iph->ttl = sk->protinfo.af_inet.ttl;
        iph->daddr = rt->rt_dst;
        iph->saddr = rt->rt_src;
        iph->protocol = sk->protocol;
        iph->tot_len = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst);
        skb->nh.iph = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       output_maybe_reroute);
}
static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

#ifdef CONFIG_NETFILTER_DEBUG
        nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/
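        /* If the neighbour has a resolved hardware header cached in hh_data
         * (a 16-byte buffer whose last hh_len bytes hold the link-layer
         * header), copy it in front of the IP header and hand the frame to
         * the cached output method; otherwise fall back to the neighbour
         * output function, which may still have to resolve the address.
         */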
        if (hh) {
                read_lock_bh(&hh->hh_lock);
                memcpy(skb->data - 16, hh->hh_data, 16);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        printk(KERN_DEBUG "khm\n");
        kfree_skb(skb);
        return -EINVAL;
}
__inline__ int ip_finish_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        skb->dev = dev;
        skb->protocol = __constant_htons(ETH_P_IP);

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                       ip_finish_output2);
}
int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IpOutRequests);
#ifdef CONFIG_IP_ROUTE_NAT
        if (rt->rt_flags & RTCF_NAT)
                ip_do_nat(skb);
#endif

        skb->dev = dev;
        skb->protocol = __constant_htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->protinfo.af_inet.mc_loop)) {
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back frames that are not
                   local (i.e. that came back after forwarding); they will be
                   dropped by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        return ip_finish_output(skb);
}
int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
        struct rtable *rt = (struct rtable*)skb->dst;
#endif

        IP_INC_STATS(IpOutRequests);

#ifdef CONFIG_IP_ROUTE_NAT
        if (rt->rt_flags&RTCF_NAT)
                ip_do_nat(skb);
#endif

        return ip_finish_output(skb);
}
/* Queues a packet to be sent, and starts the transmitter if necessary.
 * This routine also needs to put in the total length and compute the
 * checksum.  We used to do this in two stages, ip_build_header() then
 * this, but that scheme created a mess when routes disappeared etc.
 * So we do it all here, and the TCP send engine has been changed to
 * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 * most likely make other reliable transport layers above IP easier
 * to implement under Linux.
 */
static inline int ip_queue_xmit2(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable *)skb->dst;
        struct net_device *dev;
        struct iphdr *iph = skb->nh.iph;

#ifdef CONFIG_NETFILTER
        /* BLUE-PEN-FOR-ALEXEY.  I don't understand; you mean I can't
           hold the route as I pass the packet to userspace? -- RR

           You may hold it, if you really hold it.  E.g. if netfilter
           does not destroy the handed skb with skb->dst attached, it
           will be held.  When it was stored in info->arg, it was
           apparently not held.  Now (without the second arg) it is
           evident that it is clean. --ANK
         */
        if (rt==NULL || (skb->nfcache & NFC_ALTERED)) {
                if (route_me_harder(skb) != 0) {
                        kfree_skb(skb);
                        return -EHOSTUNREACH;
                }
        }
#endif
        dev = rt->u.dst.dev;

        /* This can happen when the transport layer has segments queued
         * with a cached route, and by the time we get here things are
         * re-routed to a device with a different MTU than the original
         * device.  Sick, but we must cover it.
         */
        if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
                kfree_skb(skb);
                if (skb2 == NULL)
                        return -ENOMEM;
                if (sk)
                        skb_set_owner_w(skb2, sk);
                skb = skb2;
                iph = skb->nh.iph;
        }

        if (skb->len > rt->u.dst.pmtu)
                goto fragment;

        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off |= __constant_htons(IP_DF);

        ip_select_ident(iph, &rt->u.dst);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->priority;
        return skb->dst->output(skb);

fragment:
        if (ip_dont_fragment(sk, &rt->u.dst)) {
                /* Reject the packet ONLY if TCP might fragment
                 * it itself, if we were careful enough.
                 */
                iph->frag_off |= __constant_htons(IP_DF);
                NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));

                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(rt->u.dst.pmtu));
                kfree_skb(skb);
                return -EMSGSIZE;
        }
        ip_select_ident(iph, &rt->u.dst);
        return ip_fragment(skb, skb->dst->output);
}
int ip_queue_xmit(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct ip_options *opt = sk->protinfo.af_inet.opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                u32 daddr;

                /* Use correct destination address if we have options. */
                daddr = sk->daddr;
                if(opt && opt->srr)
                        daddr = opt->faddr;

                /* If this fails, the retransmit mechanism of the transport
                 * layer will keep trying until the route appears or the
                 * connection times itself out.
                 */
                if (ip_route_output(&rt, daddr, sk->saddr,
                                    RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
                                    sk->bound_dev_if))
                        goto no_route;
                __sk_dst_set(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
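        /* The next store fills version (4), header length (5 32-bit words)
         * and TOS with a single 16-bit write: after htons() the first octet
         * of the header is 0x45 regardless of host endianness.
         */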
        *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
        iph->tot_len = htons(skb->len);
        iph->frag_off = 0;
        iph->ttl = sk->protinfo.af_inet.ttl;
        iph->protocol = sk->protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        skb->nh.iph = iph;
        /* The transport layer sets skb->h.foo itself. */

        if(opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, sk->daddr, rt, 0);
        }

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       ip_queue_xmit2);

no_route:
        IP_INC_STATS(IpOutNoRoutes);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}
/*
 *      Build and send a packet, with as little as one copy
 *
 *      Doesn't care much about ip options... option length can be
 *      different for fragment at 0 and other fragments.
 *
 *      Note that the fragment at the highest offset is sent first,
 *      so the getfrag routine can fill in the TCP/UDP checksum header
 *      field in the last fragment it sends... actually it also helps
 *      the reassemblers, they can put most packets in at the head of
 *      the fragment queue, and they know the total size in advance. This
 *      last feature will measurably improve the Linux fragment handler one
 *      day.
 *
 *      The callback has four args: an arbitrary pointer (copy of frag),
 *      the destination buffer to copy into (char *), the offset to copy
 *      from, and the length to be copied.
 */
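/*
 * A minimal illustrative getfrag callback (hypothetical; it assumes the
 * payload already sits in one contiguous kernel buffer handed in as "frag").
 * ip_reply_glue_bits() at the end of this file is a real implementation
 * that also folds a checksum while copying.
 */
static int example_getfrag(const void *frag, char *to,
                           unsigned int offset, unsigned int fraglen)
{
        /* Copy fraglen payload bytes starting at offset into the frame. */
        memcpy(to, (const char *)frag + offset, fraglen);
        return 0;       /* a non-zero return makes the caller fail with -EFAULT */
}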
static int ip_build_xmit_slow(struct sock *sk,
                              int getfrag (const void *,
                                           char *,
                                           unsigned int,
                                           unsigned int),
                              const void *frag,
                              unsigned length,
                              struct ipcm_cookie *ipc,
                              struct rtable *rt,
                              int flags)
{
        unsigned int fraglen, maxfraglen, fragheaderlen;
        int err;
        int offset, mf;
        int mtu;
        u16 id = 0;
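        /* Headroom for the link-layer header, rounded up to a multiple of
         * 16 bytes; the allocations below add 15 bytes of slack for
         * alignment.
         */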
        int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
        int nfrags = 0;
        struct ip_options *opt = ipc->opt;
        int df = 0;

        mtu = rt->u.dst.pmtu;
        if (ip_dont_fragment(sk, &rt->u.dst))
                df = htons(IP_DF);

        length -= sizeof(struct iphdr);

        if (opt) {
                fragheaderlen = sizeof(struct iphdr) + opt->optlen;
                maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
        } else {
                fragheaderlen = sizeof(struct iphdr);

                /*
                 *      Fragheaderlen is the size of 'overhead' on each buffer. Now work
                 *      out the size of the frames to send.
                 */

                maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
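                /* In both branches the data part of a full-sized fragment is
                 * kept a multiple of 8 bytes, since the IP fragment offset
                 * field is expressed in 8-byte units.
                 */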
        }

        if (length + fragheaderlen > 0xFFFF) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
                return -EMSGSIZE;
        }

        /*
         *      Start at the end of the frame by handling the remainder.
         */

        offset = length - (length % (maxfraglen - fragheaderlen));

        /*
         *      Amount of memory to allocate for final fragment.
         */

        fraglen = length - offset + fragheaderlen;

        if (length-offset==0) {
                fraglen = maxfraglen;
                offset -= maxfraglen-fragheaderlen;
        }

        /*
         *      The last fragment will not have MF (more fragments) set.
         */

        mf = 0;

        /*
         *      Don't fragment packets for path mtu discovery.
         */

        if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
                return -EMSGSIZE;
        }
        if (flags&MSG_PROBE)
                goto out;
        /*
         *      Begin outputting the bytes.
         */

        do {
                char *data;
                struct sk_buff * skb;

                /*
                 *      Get the memory we require with some space left for alignment.
                 */

                skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
                if (skb == NULL)
                        goto error;

                /*
                 *      Fill in the control structures
                 */

                skb->priority = sk->priority;
                skb->dst = dst_clone(&rt->u.dst);
                skb_reserve(skb, hh_len);

                /*
                 *      Find where to start putting bytes.
                 */

                data = skb_put(skb, fraglen);
                skb->nh.iph = (struct iphdr *)data;

                /*
                 *      Only write IP header onto non-raw packets
                 */

                {
                        struct iphdr *iph = (struct iphdr *)data;

                        iph->version = 4;
                        iph->ihl = 5;
                        if (opt) {
                                iph->ihl += opt->optlen>>2;
                                ip_options_build(skb, opt,
                                                 ipc->addr, rt, offset);
                        }
                        iph->tos = sk->protinfo.af_inet.tos;
                        iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
                        iph->frag_off = htons(offset>>3)|mf|df;
                        iph->id = id;
                        if (!mf) {
                                if (offset || !df) {
                                        /* Select an unpredictable ident only
                                         * for packets without DF or that have
                                         * already been fragmented.
                                         */
                                        __ip_select_ident(iph, &rt->u.dst);
                                        id = iph->id;
                                }

                                /*
                                 *      Any further fragments will have MF set.
                                 */

                                mf = htons(IP_MF);
                        }
                        if (rt->rt_type == RTN_MULTICAST)
                                iph->ttl = sk->protinfo.af_inet.mc_ttl;
                        else
                                iph->ttl = sk->protinfo.af_inet.ttl;
                        iph->protocol = sk->protocol;
                        iph->check = 0;
                        iph->saddr = rt->rt_src;
                        iph->daddr = rt->rt_dst;
                        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
                        data += iph->ihl*4;
                }
                /*
                 *      User data callback
                 */

                if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
                        err = -EFAULT;
                        kfree_skb(skb);
                        goto error;
                }

                offset -= (maxfraglen-fragheaderlen);
                fraglen = maxfraglen;

                nfrags++;

                err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                              skb->dst->dev, output_maybe_reroute);
                if (err) {
                        if (err > 0)
                                err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
                        if (err)
                                goto error;
                }
        } while (offset >= 0);

        if (nfrags>1)
                ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
out:
        return 0;

error:
        IP_INC_STATS(IpOutDiscards);
        if (nfrags>1)
                ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
        return err;
}
/*
 *      Fast path for unfragmented packets.
 */
int ip_build_xmit(struct sock *sk,
                  int getfrag (const void *,
                               char *,
                               unsigned int,
                               unsigned int),
                  const void *frag,
                  unsigned length,
                  struct ipcm_cookie *ipc,
                  struct rtable *rt,
                  int flags)
{
        int err;
        struct sk_buff *skb;
        int df;
        struct iphdr *iph;

        /*
         *      Try the simple case first. This leaves fragmented frames, and by
         *      choice RAW frames within 20 bytes of maximum size (rare) to the long path
         */

        if (!sk->protinfo.af_inet.hdrincl) {
                length += sizeof(struct iphdr);

                /*
                 *      Check for slow path.
                 */
                if (length > rt->u.dst.pmtu || ipc->opt != NULL)
                        return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
        } else {
                if (length > rt->u.dst.dev->mtu) {
                        ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
                        return -EMSGSIZE;
                }
        }
        if (flags&MSG_PROBE)
                goto out;

        /*
         *      Do path mtu discovery if needed.
         */
        df = 0;
        if (ip_dont_fragment(sk, &rt->u.dst))
                df = htons(IP_DF);

        /*
         *      Fast path for unfragmented frames without options.
         */
        {
        int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;

        skb = sock_alloc_send_skb(sk, length+hh_len+15,
                                  0, flags&MSG_DONTWAIT, &err);
        if(skb==NULL)
                goto error;
        skb_reserve(skb, hh_len);
        }

        skb->priority = sk->priority;
        skb->dst = dst_clone(&rt->u.dst);

        skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);

        if(!sk->protinfo.af_inet.hdrincl) {
                iph->version=4;
                iph->ihl=5;
                iph->tos=sk->protinfo.af_inet.tos;
                iph->tot_len = htons(length);
                iph->frag_off = df;
                iph->ttl=sk->protinfo.af_inet.mc_ttl;
                ip_select_ident(iph, &rt->u.dst);
                if (rt->rt_type != RTN_MULTICAST)
                        iph->ttl=sk->protinfo.af_inet.ttl;
                iph->protocol=sk->protocol;
                iph->saddr=rt->rt_src;
                iph->daddr=rt->rt_dst;
                iph->check=0;
                iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
                err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
        }
        else
                err = getfrag(frag, (void *)iph, 0, length);

        if (err)
                goto error_fault;

        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                      output_maybe_reroute);
        if (err > 0)
                err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
        if (err)
                goto error;
out:
        return 0;

error_fault:
        err = -EFAULT;
        kfree_skb(skb);
error:
        IP_INC_STATS(IpOutDiscards);
        return err;
}
/*
 *      This IP datagram is too large to be sent in one piece.  Break it up into
 *      smaller pieces (each of size equal to IP header plus
 *      a block of the data of the original IP data part) that will yet fit in a
 *      single device frame, and queue such a frame for sending.
 *
 *      Yes this is inefficient, feel free to submit a quicker one.
 */
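/*
 * Worked example (illustrative figures): a datagram with a 20-byte header
 * and 3000 bytes of payload leaving a device with a 1500-byte MTU is cut
 * into chunks of at most 1480 data bytes, rounded down to a multiple of 8.
 * That gives fragments carrying 1480, 1480 and 40 bytes at data offsets
 * 0, 1480 and 2960, i.e. frag_off values of 0, 185 and 370 in 8-byte units,
 * with IP_MF set on all but the last.
 */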
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        unsigned char *raw;
        unsigned char *ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len;
        int offset;
        int not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        raw = skb->nh.raw;
        iph = (struct iphdr*)raw;

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        left = ntohs(iph->tot_len) - hlen;      /* Space per frame */
        mtu = rt->u.dst.pmtu - hlen;            /* Size of data space */
        ptr = raw + hlen;                       /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while(left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                skb2->pkt_type = skb->pkt_type;
                skb2->priority = skb->priority;
                skb_reserve(skb2, (dev->hard_header_len+15)&~15);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                skb2->dst = dst_clone(skb->dst);
                skb2->dev = skb->dev;

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, raw, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                memcpy(skb2->h.raw, ptr, len);
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC : If we are fragmenting a fragment that's not the
                 *                 last fragment then keep MF on each fragment
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

#ifdef CONFIG_NETFILTER
                /* Connection association is same as pre-frag packet */
                skb2->nfct = skb->nfct;
                nf_conntrack_get(skb2->nfct);
#ifdef CONFIG_NETFILTER_DEBUG
                skb2->nf_debug = skb->nf_debug;
#endif
#endif

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP_INC_STATS(IpFragCreates);

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP_INC_STATS(IpFragOKs);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IpFragFails);
        return err;
}
/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
                              unsigned int fraglen)
{
        struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
        u16 *pktp = (u16 *)to;
        struct iovec *iov;
        int len;
        int hdrflag = 1;

        iov = &dp->iov[0];
        if (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                iov++;
                hdrflag = 0;
        }
        len = iov->iov_len - offset;
        if (fraglen > len) { /* overlapping. */
                dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
                                                     dp->csum);
                offset = 0;
                fraglen -= len;
                to += len;
                iov++;
        }

        dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
                                             dp->csum);

        if (hdrflag && dp->csumoffset)
                *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
        return 0;
}
/*
 *      Generic function to send a packet as reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct {
                struct ip_options opt;
                char data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        u32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
                return;

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the lock.
           Note that it relies on the fact that this function is called
           with BHs locally disabled and that sk cannot already be locked.
         */
        bh_lock_sock(sk);
        sk->protinfo.af_inet.tos = skb->nh.iph->tos;
        sk->priority = skb->priority;
        sk->protocol = skb->nh.iph->protocol;
        ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
        bh_unlock_sock(sk);

        ip_rt_put(rt);
}
/*
 *      IP protocol layer initialiser
 */

static struct packet_type ip_packet_type =
{
        __constant_htons(ETH_P_IP),
        NULL,   /* All devices */
        ip_rcv,
        (void*)1,
        NULL,
};

/*
 *      IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
        dev_add_pack(&ip_packet_type);

        ip_rt_init();
        inet_initpeers();

#ifdef CONFIG_IP_MULTICAST
        proc_net_create("igmp", 0, ip_mc_procinfo);
#endif
}