Linux 2.2.0: net/ipv4/ip_output.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.64 1999/01/04 20:05:33 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	Use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently abort send instead of failing
 *					with -EPERM.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <linux/igmp.h>
#include <linux/ip_fw.h>
#include <linux/firewall.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
/*
 *	Shall we try to damage output packets if routing dev changes?
 */
int sysctl_ip_dynaddr = 0;

int ip_id_count = 0;
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
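/*
 * Added note (not in the original file): verification is the mirror image
 * of this.  Because iph->check is the one's-complement of the sum of the
 * rest of the header, summing the whole header again with the check field
 * left in place yields zero for an intact header, which is how the receive
 * path in ip_input.c checks incoming datagrams, roughly:
 *
 *	if (ip_fast_csum((unsigned char *)iph, iph->ihl) != 0)
 *		goto drop;		(header is corrupt)
 */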
/*
 *	Add an ip header to a skbuff and send it out.
 */
void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			   u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;
	struct device *dev;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version = 4;
	iph->ihl = 5;
	iph->tos = sk->ip_tos;
	iph->frag_off = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= htons(IP_DF);
	iph->ttl = sk->ip_ttl;
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;
	iph->protocol = sk->protocol;
	iph->tot_len = htons(skb->len);
	iph->id = htons(ip_id_count++);
	skb->nh.iph = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	dev = rt->u.dst.dev;

#ifdef CONFIG_FIREWALL
	if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT)
		goto drop;
#endif

	ip_send_check(iph);

	/* Send it out. */
	skb->dst->output(skb);
	return;

#ifdef CONFIG_FIREWALL
drop:
	kfree_skb(skb);
#endif
}
int __ip_finish_output(struct sk_buff *skb)
{
	return ip_finish_output(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */

	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags & RTCF_NAT)
		ip_do_nat(skb);
#endif

	skb->dev = dev;
	skb->protocol = __constant_htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; ip_mr_input will drop
		   them anyway.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
			dev_loopback_xmit(skb);

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST)
		dev_loopback_xmit(skb);

	return ip_finish_output(skb);
}
int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
	struct rtable *rt = (struct rtable*)skb->dst;
#endif

	ip_statistics.IpOutRequests++;

#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags&RTCF_NAT)
		ip_do_nat(skb);
#endif

	return ip_finish_output(skb);
}
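/*
 * Added note (not in the original file): nothing calls ip_output() or
 * ip_mc_output() directly.  The IPv4 routing code installs one of them as
 * rt->u.dst.output when the route is created (ip_mc_output for multicast
 * and broadcast routes), and senders such as the functions above and below
 * simply invoke skb->dst->output(skb).
 */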
/* Queues a packet to be sent, and starts the transmitter if necessary.
 * This routine also needs to put in the total length and compute the
 * checksum.  We used to do this in two stages, ip_build_header() then
 * this, but that scheme created a mess when routes disappeared etc.
 * So we do it all here, and the TCP send engine has been changed to
 * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 * most likely make other reliable transport layers above IP easier
 * to implement under Linux.
 */
void ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct ip_options *opt = sk->opt;
	struct rtable *rt;
	struct device *dev;
	struct iphdr *iph;
	unsigned int tot_len;

	/* Make sure we can route this packet. */
	rt = (struct rtable *) sk->dst_cache;
	if(rt == NULL || rt->u.dst.obsolete) {
		u32 daddr;

		sk->dst_cache = NULL;
		ip_rt_put(rt);

		/* Use correct destination address if we have options. */
		daddr = sk->daddr;
		if(opt && opt->srr)
			daddr = opt->faddr;

		/* If this fails, the retransmit mechanism of the transport layer
		 * will keep trying until the route appears or the connection times
		 * itself out.
		 */
		if(ip_route_output(&rt, daddr, sk->saddr,
				   RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
				   sk->bound_dev_if))
			goto drop;
		sk->dst_cache = &rt->u.dst;
	}

	if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* We have a route, so grab a reference. */
	skb->dst = dst_clone(sk->dst_cache);

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = sk->ip_tos;
	iph->frag_off = 0;
	iph->ttl = sk->ip_ttl;
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;
	iph->protocol = sk->protocol;
	skb->nh.iph = iph;
	/* Transport layer sets skb->h.foo itself. */

	if(opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, sk->daddr, rt, 0);
	}

	tot_len = skb->len;
	iph->tot_len = htons(tot_len);
	iph->id = htons(ip_id_count++);

	dev = rt->u.dst.dev;

#ifdef CONFIG_FIREWALL
	if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT)
		goto drop;
#endif

	/* This can happen when the transport layer has segments queued
	 * with a cached route, and by the time we get here things are
	 * re-routed to a device with a different MTU than the original
	 * device.  Sick, but we must cover it.
	 */
	if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
		kfree_skb(skb);
		if (skb2 == NULL)
			return;
		if (sk)
			skb_set_owner_w(skb2, sk);
		skb = skb2;
		iph = skb->nh.iph;
	}

	/* Do we need to fragment?  Again this is inefficient.  We
	 * need to somehow lock the original buffer and use bits of it.
	 */
	if (tot_len > rt->u.dst.pmtu)
		goto fragment;

	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= __constant_htons(IP_DF);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->priority;
	skb->dst->output(skb);
	return;

fragment:
	if (ip_dont_fragment(sk, &rt->u.dst) &&
	    tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) {
		/* Reject the packet ONLY if TCP might fragment
		   it itself, if we were careful enough.
		   The test is not precise (f.e. it does not take sacks
		   into account). Actually, tcp should do this. --ANK (980801)
		 */
		iph->frag_off |= __constant_htons(IP_DF);
		printk(KERN_DEBUG "sending pkt_too_big to self\n");
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(rt->u.dst.pmtu));
		goto drop;
	}
	ip_fragment(skb, skb->dst->output);
	return;

no_route:
	sk->dst_cache = NULL;
	ip_rt_put(rt);
	ip_statistics.IpOutNoRoutes++;
	/* Fall through... */
drop:
	kfree_skb(skb);
}
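/*
 * Illustrative sketch (added, not part of the original file): the calling
 * convention ip_queue_xmit() expects.  TCP is its real user; my_proto_xmit()
 * and MY_HDR_LEN below are hypothetical names.  The caller reserves headroom
 * for the link-layer and IP headers, builds its own transport header, and
 * leaves skb->sk pointing at the sending socket, since ip_queue_xmit() takes
 * the route, tos, ttl and IP options from there.
 */
#if 0
static void my_proto_xmit(struct sock *sk, const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(MAX_HEADER + sizeof(struct iphdr) + MY_HDR_LEN + len,
			GFP_ATOMIC);
	if (skb == NULL)
		return;
	skb_set_owner_w(skb, sk);			/* sets skb->sk, charges the socket */
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	skb->h.raw = skb_put(skb, MY_HDR_LEN + len);	/* transport header + payload */
	/* ... fill in the transport header, copy the payload ... */

	ip_queue_xmit(skb);	/* pushes the IP header and hands off to the route */
}
#endif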
/*
 *	Build and send a packet, with as little as one copy
 *
 *	Doesn't care much about ip options... option length can be
 *	different for fragment at 0 and other fragments.
 *
 *	Note that the fragment at the highest offset is sent first,
 *	so the getfrag routine can fill in the TCP/UDP checksum header
 *	field in the last fragment it sends... actually it also helps
 *	the reassemblers, they can put most packets in at the head of
 *	the fragment queue, and they know the total size in advance. This
 *	last feature will measurably improve the Linux fragment handler one
 *	day.
 *
 *	The callback has four args: an arbitrary pointer (copy of frag),
 *	the destination buffer to copy into (char *), the offset to copy
 *	from, and the length to be copied.  (An illustrative sketch of such
 *	a callback appears after ip_build_xmit() below.)
 */
int ip_build_xmit_slow(struct sock *sk,
		       int getfrag (const void *,
				    char *,
				    unsigned int,
				    unsigned int),
		       const void *frag,
		       unsigned length,
		       struct ipcm_cookie *ipc,
		       struct rtable *rt,
		       int flags)
{
	unsigned int fraglen, maxfraglen, fragheaderlen;
	int err;
	int offset, mf;
	int mtu;
	unsigned short id;

	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
	int nfrags=0;
	struct ip_options *opt = ipc->opt;
	int df = 0;

	mtu = rt->u.dst.pmtu;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	if (!sk->ip_hdrincl)
		length -= sizeof(struct iphdr);

	if (opt) {
		fragheaderlen = sizeof(struct iphdr) + opt->optlen;
		maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
	} else {
		fragheaderlen = sk->ip_hdrincl ? 0 : sizeof(struct iphdr);

		/*
		 *	Fragheaderlen is the size of 'overhead' on each buffer. Now work
		 *	out the size of the frames to send.
		 */

		maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
	}

	if (length + fragheaderlen > 0xFFFF) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}

	/*
	 *	Start at the end of the frame by handling the remainder.
	 */

	offset = length - (length % (maxfraglen - fragheaderlen));

	/*
	 *	Amount of memory to allocate for final fragment.
	 */

	fraglen = length - offset + fragheaderlen;

	if (length-offset==0) {
		fraglen = maxfraglen;
		offset -= maxfraglen-fragheaderlen;
	}

	/*
	 *	The last fragment will not have MF (more fragments) set.
	 */

	mf = 0;

	/*
	 *	Don't fragment packets for path mtu discovery.
	 */

	if (offset > 0 && df) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return(-EMSGSIZE);
	}

	/*
	 *	Lock the device lists.
	 */

	dev_lock_list();

	/*
	 *	Get an identifier
	 */

	id = htons(ip_id_count++);

	/*
	 *	Begin outputting the bytes.
	 */

	do {
		int error;
		char *data;
		struct sk_buff * skb;

		/*
		 *	Get the memory we require with some space left for alignment.
		 */

		skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &error);
		if (skb == NULL) {
			ip_statistics.IpOutDiscards++;
			if(nfrags>1)
				ip_statistics.IpFragCreates++;
			dev_unlock_list();
			return(error);
		}

		/*
		 *	Fill in the control structures
		 */

		skb->priority = sk->priority;
		skb->dst = dst_clone(&rt->u.dst);
		skb_reserve(skb, hh_len);

		/*
		 *	Find where to start putting bytes.
		 */

		data = skb_put(skb, fraglen);
		skb->nh.iph = (struct iphdr *)data;

		/*
		 *	Only write IP header onto non-raw packets
		 */

		if(!sk->ip_hdrincl) {
			struct iphdr *iph = (struct iphdr *)data;

			iph->version = 4;
			iph->ihl = 5;
			if (opt) {
				iph->ihl += opt->optlen>>2;
				ip_options_build(skb, opt,
						 ipc->addr, rt, offset);
			}
			iph->tos = sk->ip_tos;
			iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
			iph->id = id;
			iph->frag_off = htons(offset>>3);
			iph->frag_off |= mf|df;
			if (rt->rt_type == RTN_MULTICAST)
				iph->ttl = sk->ip_mc_ttl;
			else
				iph->ttl = sk->ip_ttl;
			iph->protocol = sk->protocol;
			iph->check = 0;
			iph->saddr = rt->rt_src;
			iph->daddr = rt->rt_dst;
			iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
			data += iph->ihl*4;

			/*
			 *	Any further fragments will have MF set.
			 */

			mf = htons(IP_MF);
		}

		/*
		 *	User data callback
		 */

		err = 0;
		if (getfrag(frag, data, offset, fraglen-fragheaderlen))
			err = -EFAULT;

		/*
		 *	Account for the fragment.
		 */

#ifdef CONFIG_FIREWALL
		if(!err) {
			int fw_res;

			fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb);
			if(fw_res == FW_QUEUE) {
				kfree_skb(skb);
				skb = NULL;
			} else if(fw_res < FW_ACCEPT) {
				err = -EPERM;
			}
		}
#endif

		if (err) {
			ip_statistics.IpOutDiscards++;
			kfree_skb(skb);
			dev_unlock_list();
			return err;
		}

		offset -= (maxfraglen-fragheaderlen);
		fraglen = maxfraglen;

		nfrags++;

		err = 0;
		if (skb && rt->u.dst.output(skb)) {
			err = -ENETDOWN;
			ip_statistics.IpOutDiscards++;
			break;
		}
	} while (offset >= 0);

	if (nfrags>1)
		ip_statistics.IpFragCreates += nfrags;

	dev_unlock_list();
	return err;
}
/*
 *	Fast path for unfragmented packets.
 */
int ip_build_xmit(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	int err;
	struct sk_buff *skb;
	int df;
	struct iphdr *iph;

	/*
	 *	Try the simple case first. This leaves fragmented frames, and by
	 *	choice RAW frames within 20 bytes of maximum size (rare) to the long path
	 */

	if (!sk->ip_hdrincl)
		length += sizeof(struct iphdr);

	/*
	 *	Check for slow path.
	 */
	if (length > rt->u.dst.pmtu || ipc->opt != NULL)
		return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);

	/*
	 *	Do path mtu discovery if needed.
	 */
	df = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	/*
	 *	Fast path for unfragmented frames without options.
	 */
	{
		int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;

		skb = sock_alloc_send_skb(sk, length+hh_len+15,
					  0, flags&MSG_DONTWAIT, &err);
		if(skb==NULL)
			goto error;
		skb_reserve(skb, hh_len);
	}

	skb->priority = sk->priority;
	skb->dst = dst_clone(&rt->u.dst);

	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);

	dev_lock_list();

	if(!sk->ip_hdrincl) {
		iph->version=4;
		iph->ihl=5;
		iph->tos=sk->ip_tos;
		iph->tot_len = htons(length);
		iph->id=htons(ip_id_count++);
		iph->frag_off = df;
		iph->ttl=sk->ip_mc_ttl;
		if (rt->rt_type != RTN_MULTICAST)
			iph->ttl=sk->ip_ttl;
		iph->protocol=sk->protocol;
		iph->saddr=rt->rt_src;
		iph->daddr=rt->rt_dst;
		iph->check=0;
		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
		err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
	}
	else
		err = getfrag(frag, (void *)iph, 0, length);

	dev_unlock_list();

	if (err)
		err = -EFAULT;

#ifdef CONFIG_FIREWALL
	if(!err) {
		int fw_res;

		fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb);
		if(fw_res == FW_QUEUE) {
			/* re-queued elsewhere; silently abort this send */
			kfree_skb(skb);
			return 0;
		}
		if(fw_res < FW_ACCEPT)
			err = -EPERM;
	}
#endif

	if (err) {
		kfree_skb(skb);
		goto error;
	}

	return rt->u.dst.output(skb);

error:
	ip_statistics.IpOutDiscards++;
	return err;
}
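/*
 * Illustrative sketch (added, not part of the original file): the shape of
 * a getfrag callback as used by ip_build_xmit()/ip_build_xmit_slow().  This
 * hypothetical version just copies out of a flat kernel buffer; the real
 * callers (UDP, raw sockets, ip_reply_glue_bits below) copy from iovecs and
 * may fold a checksum in as they go.  A non-zero return is reported to the
 * caller as -EFAULT.
 */
#if 0
static int example_getfrag(const void *p, char *to,
			   unsigned int offset, unsigned int fraglen)
{
	memcpy(to, (const char *)p + offset, fraglen);
	return 0;
}
#endif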
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each of size equal to the IP header plus a block
 *	of the data of the original IP data part) that will still fit in a
 *	single device frame, and queue such frames for sending.
 *
 *	Yes this is inefficient, feel free to submit a quicker one.
 */
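/*
 * Worked example (added for illustration, not in the original file):
 * a 4000 byte datagram with a 20 byte header carries 3980 bytes of data.
 * With a path MTU of 1500 each fragment may hold at most 1480 data bytes,
 * and every fragment but the last is trimmed to a multiple of 8, so the
 * loop below produces
 *
 *	fragment 0: offset    0, 1480 data bytes, MF set
 *	fragment 1: offset 1480, 1480 data bytes, MF set
 *	fragment 2: offset 2960, 1020 data bytes, MF clear
 *
 * each carrying its own copy of the 20 byte header, with iph->frag_off
 * holding the offset divided by 8.
 */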
void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	unsigned char *raw;
	unsigned char *ptr;
	struct device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	raw = skb->nh.raw;
	iph = (struct iphdr*)raw;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	left = ntohs(iph->tot_len) - hlen;	/* Space per frame */
	mtu = rt->u.dst.pmtu - hlen;		/* Size of data space */
	ptr = raw + hlen;			/* Where to start from */

	/*
	 *	The protocol doesn't seem to say what to do in the case that the
	 *	frame + options doesn't fit the mtu. As it used to fall down dead
	 *	in this case, we were fortunate it didn't happen.
	 *
	 *	It is impossible, because mtu>=68. --ANK (980801)
	 */

#ifdef CONFIG_NET_PARANOIA
	if (mtu<8)
		goto fail;
#endif

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while(left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		skb2->pkt_type = skb->pkt_type;
		skb2->priority = skb->priority;
		skb_reserve(skb2, (dev->hard_header_len+15)&~15);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		skb2->dst = dst_clone(skb->dst);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, raw, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		memcpy(skb2->h.raw, ptr, len);
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *	last fragment then keep the MF bit set on each piece
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		ip_statistics.IpFragCreates++;

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		output(skb2);
	}

	kfree_skb(skb);
	ip_statistics.IpFragOKs++;
	return;

fail:
	kfree_skb(skb);
	ip_statistics.IpFragFails++;
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
			      unsigned int fraglen)
{
	struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
	u16 *pktp = (u16 *)to;
	struct iovec *iov;
	int len;
	int hdrflag = 1;

	iov = &dp->iov[0];
	if (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
		hdrflag = 0;
	}
	len = iov->iov_len - offset;
	if (fraglen > len) { /* overlapping. */
		dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
						     dp->csum);
		offset = 0;
		fraglen -= len;
		to += len;
		iov++;
	}

	dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
					     dp->csum);

	if (hdrflag && dp->csumoffset)
		*(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct {
		struct ip_options opt;
		char data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	sk->ip_tos = skb->nh.iph->tos;
	sk->priority = skb->priority;
	sk->protocol = skb->nh.iph->protocol;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = &replyopts.opt;

	if (ipc.opt->srr)
		daddr = replyopts.opt.faddr;
	if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
		return;

	/* And let IP do all the hard work. */
	ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
	ip_rt_put(rt);
}
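/*
 * Illustrative sketch (added, not part of the original file): roughly how a
 * caller such as the TCP reset code is expected to drive ip_send_reply().
 * It builds the reply transport header in a local buffer, primes arg.csum
 * with a partial checksum over that header, and tells ip_reply_glue_bits()
 * where (in 16-bit words) to fold the finished checksum back in.
 */
#if 0
static void example_send_reply(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr th;
	struct ip_reply_arg arg;

	/* ... fill in th with the reply TCP header ... */

	arg.iov[0].iov_base = (unsigned char *)&th;
	arg.iov[0].iov_len  = sizeof(th);
	arg.csum = csum_partial((char *)&th, sizeof(th), 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(sk, skb, &arg, sizeof(th));
}
#endif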
/*
 *	IP protocol layer initialiser
 */

static struct packet_type ip_packet_type =
{
	__constant_htons(ETH_P_IP),
	NULL,	/* All devices */
	ip_rcv,
	NULL,
	NULL,
};

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
static struct proc_dir_entry proc_net_igmp = {
	PROC_NET_IGMP, 4, "igmp",
	S_IFREG | S_IRUGO, 1, 0, 0,
	0, &proc_net_inode_operations,
	ip_mc_procinfo
};
#endif
#endif
/*
 *	IP registers the packet type and then calls the subprotocol initialisers
 */

__initfunc(void ip_init(void))
{
	dev_add_pack(&ip_packet_type);

	ip_rt_init();

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
	proc_net_register(&proc_net_igmp);
#endif
#endif
}