/* net/ipv4/ip_output.c -- from davej-history.git, import of 2.2.5pre2 */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.67 1999/03/25 00:43:00 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					the output firewall rules).
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <linux/igmp.h>
#include <linux/ip_fw.h>
#include <linux/firewall.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
/*
 *	Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr = 0;

int ip_id_count = 0;
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
/*
 *	Add an ip header to a skbuff and send it out.
 */
void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			   u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;
	struct device *dev;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version = 4;
	iph->ihl = 5;
	iph->tos = sk->ip_tos;
	iph->frag_off = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= htons(IP_DF);
	iph->ttl = sk->ip_ttl;
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;
	iph->protocol = sk->protocol;
	iph->tot_len = htons(skb->len);
	iph->id = htons(ip_id_count++);
	skb->nh.iph = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	dev = rt->u.dst.dev;

#ifdef CONFIG_FIREWALL
	/* Now we have no better mechanism to notify about error. */
	switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
	case FW_REJECT:
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
		/* Fall thru... */
	case FW_BLOCK:
	case FW_QUEUE:
		kfree_skb(skb);
		return;
	}
#endif

	ip_send_check(iph);

	/* Send it out. */
	skb->dst->output(skb);
	return;
}
int __ip_finish_output(struct sk_buff *skb)
{
	return ip_finish_output(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */

	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags & RTCF_NAT)
		ip_do_nat(skb);
#endif

	skb->dev = dev;
	skb->protocol = __constant_htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
			dev_loopback_xmit(skb);

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST)
		dev_loopback_xmit(skb);

	return ip_finish_output(skb);
}
int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
	struct rtable *rt = (struct rtable*)skb->dst;
#endif

	ip_statistics.IpOutRequests++;

#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags&RTCF_NAT)
		ip_do_nat(skb);
#endif

	return ip_finish_output(skb);
}
/* Queues a packet to be sent, and starts the transmitter if necessary.
 * This routine also needs to put in the total length and compute the
 * checksum.  We used to do this in two stages, ip_build_header() then
 * this, but that scheme created a mess when routes disappeared etc.
 * So we do it all here, and the TCP send engine has been changed to
 * match.  (No more unroutable FIN disasters, etc. wheee...)  This will
 * most likely make other reliable transport layers above IP easier
 * to implement under Linux.
 */
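/*
 * Editor's note: an illustrative sketch (not part of the original file) of
 * how a transport protocol might hand a fully built segment to
 * ip_queue_xmit().  The names my_proto_xmit and MY_HDR_LEN are hypothetical;
 * in this tree the real caller is the TCP output path.
 */
#if 0
static void my_proto_xmit(struct sock *sk, const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	/* Leave headroom for the IP and link-layer headers that will be
	 * pushed in front of the transport header later on. */
	skb = alloc_skb(MAX_HEADER + MY_HDR_LEN + len, GFP_KERNEL);
	if (skb == NULL)
		return;
	skb_set_owner_w(skb, sk);	/* ip_queue_xmit() reads skb->sk */
	skb_reserve(skb, MAX_HEADER);

	/* Transport header plus payload; the transport layer sets skb->h.foo
	 * itself, as noted in ip_queue_xmit() below. */
	skb->h.raw = skb_put(skb, MY_HDR_LEN + len);
	memcpy(skb->h.raw + MY_HDR_LEN, payload, len);

	ip_queue_xmit(skb);
}
#endif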
void ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct ip_options *opt = sk->opt;
	struct rtable *rt;
	struct device *dev;
	struct iphdr *iph;
	unsigned int tot_len;

	/* Make sure we can route this packet. */
	rt = (struct rtable *) sk->dst_cache;
	if(rt == NULL || rt->u.dst.obsolete) {
		u32 daddr;

		sk->dst_cache = NULL;
		ip_rt_put(rt);

		/* Use correct destination address if we have options. */
		daddr = sk->daddr;
		if(opt && opt->srr)
			daddr = opt->faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times itself
		 * out.
		 */
		if(ip_route_output(&rt, daddr, sk->saddr,
				   RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
				   sk->bound_dev_if))
			goto drop;
		sk->dst_cache = &rt->u.dst;
	}

	if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* We have a route, so grab a reference. */
	skb->dst = dst_clone(sk->dst_cache);

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = sk->ip_tos;
	iph->frag_off = 0;
	iph->ttl = sk->ip_ttl;
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;
	iph->protocol = sk->protocol;
	skb->nh.iph = iph;
	/* Transport layer sets skb->h.foo itself. */

	if(opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, sk->daddr, rt, 0);
	}

	tot_len = skb->len;
	iph->tot_len = htons(tot_len);
	iph->id = htons(ip_id_count++);

	dev = rt->u.dst.dev;

#ifdef CONFIG_FIREWALL
	/* Now we have no better mechanism to notify about error. */
	switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
	case FW_REJECT:
		start_bh_atomic();
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
		end_bh_atomic();
		/* Fall thru... */
	case FW_BLOCK:
	case FW_QUEUE:
		goto drop;
	}
#endif

	/* This can happen when the transport layer has segments queued
	 * with a cached route, and by the time we get here things are
	 * re-routed to a device with a different MTU than the original
	 * device.  Sick, but we must cover it.
	 */
	if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
		kfree_skb(skb);
		if (skb2 == NULL)
			return;
		if (sk)
			skb_set_owner_w(skb2, sk);
		skb = skb2;
		iph = skb->nh.iph;
	}

	/* Do we need to fragment.  Again this is inefficient.  We
	 * need to somehow lock the original buffer and use bits of it.
	 */
	if (tot_len > rt->u.dst.pmtu)
		goto fragment;

	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= __constant_htons(IP_DF);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->priority;
	skb->dst->output(skb);
	return;

fragment:
	if (ip_dont_fragment(sk, &rt->u.dst) &&
	    tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) {
		/* Reject packet ONLY if TCP might fragment
		   it itself, if we were careful enough.
		   Test is not precise (f.e. it does not take sacks
		   into account). Actually, tcp should make it. --ANK (980801)
		 */
		iph->frag_off |= __constant_htons(IP_DF);
		NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));

		/* icmp_send is not reentrant, so that bh_atomic... --ANK */
		start_bh_atomic();
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(rt->u.dst.pmtu));
		end_bh_atomic();
		goto drop;
	}
	ip_fragment(skb, skb->dst->output);
	return;

no_route:
	sk->dst_cache = NULL;
	ip_rt_put(rt);
	ip_statistics.IpOutNoRoutes++;
	/* Fall through... */
drop:
	kfree_skb(skb);
}
/*
 *	Build and send a packet, with as little as one copy
 *
 *	Doesn't care much about ip options... option length can be
 *	different for fragment at 0 and other fragments.
 *
 *	Note that the fragment at the highest offset is sent first,
 *	so the getfrag routine can fill in the TCP/UDP checksum header
 *	field in the last fragment it sends... actually it also helps
 *	the reassemblers, they can put most packets in at the head of
 *	the fragment queue, and they know the total size in advance. This
 *	last feature will measurably improve the Linux fragment handler one
 *	day.
 *
 *	The callback has four args: an arbitrary pointer (copy of frag),
 *	the destination buffer to copy into (char *), the offset to copy
 *	from, and the length to be copied.
 */
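/*
 * Editor's note: an illustrative sketch (not part of the original file) of a
 * minimal getfrag callback, assuming 'frag' points at a flat kernel buffer.
 * The struct and function names are hypothetical; the real callbacks in this
 * tree copy from user iovecs and/or fold in a checksum as they go (see
 * ip_reply_glue_bits() later in this file).
 */
#if 0
struct flat_frag {
	const char *buf;	/* start of the complete payload */
};

static int example_getfrag(const void *p, char *to,
			   unsigned int offset, unsigned int fraglen)
{
	const struct flat_frag *ff = (const struct flat_frag *)p;

	/* Copy 'fraglen' bytes of payload, starting 'offset' bytes into the
	 * datagram, into the fragment being assembled; a non-zero return
	 * makes ip_build_xmit_slow() fail the send with -EFAULT. */
	memcpy(to, ff->buf + offset, fraglen);
	return 0;
}
#endif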
int ip_build_xmit_slow(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	unsigned int fraglen, maxfraglen, fragheaderlen;
	int err;
	int offset, mf;
	int mtu;
	unsigned short id;

	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
	int nfrags=0;
	struct ip_options *opt = ipc->opt;
	int df = 0;

	mtu = rt->u.dst.pmtu;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	length -= sizeof(struct iphdr);

	if (opt) {
		fragheaderlen = sizeof(struct iphdr) + opt->optlen;
		maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
	} else {
		fragheaderlen = sizeof(struct iphdr);

		/*
		 *	Fragheaderlen is the size of 'overhead' on each buffer. Now work
		 *	out the size of the frames to send.
		 */

		maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
	}

	if (length + fragheaderlen > 0xFFFF) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}

	/*
	 *	Start at the end of the frame by handling the remainder.
	 */

	offset = length - (length % (maxfraglen - fragheaderlen));

	/*
	 *	Amount of memory to allocate for final fragment.
	 */

	fraglen = length - offset + fragheaderlen;

	if (length-offset==0) {
		fraglen = maxfraglen;
		offset -= maxfraglen-fragheaderlen;
	}

	/*
	 *	The last fragment will not have MF (more fragments) set.
	 */

	mf = 0;

	/*
	 *	Don't fragment packets for path mtu discovery.
	 */

	if (offset > 0 && df) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return(-EMSGSIZE);
	}
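	/*
	 * Editor's note: worked example (not part of the original file).
	 * With no options and a path MTU of 1500: fragheaderlen = 20 and
	 * maxfraglen = ((1500-20) & ~7) + 20 = 1500, i.e. 1480 data bytes
	 * per fragment.  For a 4008-byte payload (length = 4008 after the
	 * header subtraction above): offset = 4008 - (4008 % 1480) = 2960
	 * and fraglen = 4008 - 2960 + 20 = 1068.  The loop below then emits
	 * the fragments from the tail forward: (offset 2960, 1048 data
	 * bytes, MF clear), (1480, 1480, MF set), (0, 1480, MF set), and
	 * stops once offset goes negative.
	 */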
	/*
	 *	Lock the device lists.
	 */

	dev_lock_list();

	/*
	 *	Get an identifier
	 */

	id = htons(ip_id_count++);

	/*
	 *	Begin outputting the bytes.
	 */

	do {
		char *data;
		struct sk_buff * skb;

		/*
		 *	Get the memory we require with some space left for alignment.
		 */

		skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
		if (skb == NULL)
			goto error;

		/*
		 *	Fill in the control structures
		 */

		skb->priority = sk->priority;
		skb->dst = dst_clone(&rt->u.dst);
		skb_reserve(skb, hh_len);

		/*
		 *	Find where to start putting bytes.
		 */

		data = skb_put(skb, fraglen);
		skb->nh.iph = (struct iphdr *)data;

		/*
		 *	Only write IP header onto non-raw packets
		 */

		{
			struct iphdr *iph = (struct iphdr *)data;

			iph->version = 4;
			iph->ihl = 5;
			if (opt) {
				iph->ihl += opt->optlen>>2;
				ip_options_build(skb, opt,
						 ipc->addr, rt, offset);
			}
			iph->tos = sk->ip_tos;
			iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
			iph->id = id;
			iph->frag_off = htons(offset>>3);
			iph->frag_off |= mf|df;
			if (rt->rt_type == RTN_MULTICAST)
				iph->ttl = sk->ip_mc_ttl;
			else
				iph->ttl = sk->ip_ttl;
			iph->protocol = sk->protocol;
			iph->check = 0;
			iph->saddr = rt->rt_src;
			iph->daddr = rt->rt_dst;
			iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
			data += iph->ihl*4;

			/*
			 *	Any further fragments will have MF set.
			 */

			mf = htons(IP_MF);
		}

		/*
		 *	User data callback
		 */

		if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
			err = -EFAULT;
			kfree_skb(skb);
			goto error;
		}

		offset -= (maxfraglen-fragheaderlen);
		fraglen = maxfraglen;

		nfrags++;

#ifdef CONFIG_FIREWALL
		switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) {
		case FW_QUEUE:
			kfree_skb(skb);
			continue;
		case FW_BLOCK:
		case FW_REJECT:
			kfree_skb(skb);
			err = -EPERM;
			goto error;
		}
#endif

		err = -ENETDOWN;
		if (rt->u.dst.output(skb))
			goto error;
	} while (offset >= 0);

	if (nfrags>1)
		ip_statistics.IpFragCreates += nfrags;

	dev_unlock_list();
	return 0;

error:
	ip_statistics.IpOutDiscards++;
	if (nfrags>1)
		ip_statistics.IpFragCreates += nfrags;
	dev_unlock_list();
	return err;
}
/*
 *	Fast path for unfragmented packets.
 */
int ip_build_xmit(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	int err;
	struct sk_buff *skb;
	int df;
	struct iphdr *iph;

	/*
	 *	Try the simple case first. This leaves fragmented frames, and by
	 *	choice RAW frames within 20 bytes of maximum size (rare) to the long path
	 */

	if (!sk->ip_hdrincl) {
		length += sizeof(struct iphdr);

		/*
		 *	Check for slow path.
		 */
		if (length > rt->u.dst.pmtu || ipc->opt != NULL)
			return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
	} else {
		if (length > rt->u.dst.dev->mtu) {
			ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
			return -EMSGSIZE;
		}
	}

	/*
	 *	Do path mtu discovery if needed.
	 */
	df = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	/*
	 *	Fast path for unfragmented frames without options.
	 */
	{
	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;

	skb = sock_alloc_send_skb(sk, length+hh_len+15,
				  0, flags&MSG_DONTWAIT, &err);
	if(skb==NULL)
		goto error;
	skb_reserve(skb, hh_len);
	}

	skb->priority = sk->priority;
	skb->dst = dst_clone(&rt->u.dst);

	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);

	dev_lock_list();

	if(!sk->ip_hdrincl) {
		iph->version=4;
		iph->ihl=5;
		iph->tos=sk->ip_tos;
		iph->tot_len = htons(length);
		iph->id=htons(ip_id_count++);
		iph->frag_off = df;
		iph->ttl=sk->ip_mc_ttl;
		if (rt->rt_type != RTN_MULTICAST)
			iph->ttl=sk->ip_ttl;
		iph->protocol=sk->protocol;
		iph->saddr=rt->rt_src;
		iph->daddr=rt->rt_dst;
		iph->check=0;
		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
		err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
	}
	else
		err = getfrag(frag, (void *)iph, 0, length);

	dev_unlock_list();

	if (err)
		goto error_fault;

#ifdef CONFIG_FIREWALL
	switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) {
	case FW_QUEUE:
		kfree_skb(skb);
		return 0;
	case FW_BLOCK:
	case FW_REJECT:
		kfree_skb(skb);
		err = -EPERM;
		goto error;
	}
#endif

	return rt->u.dst.output(skb);

error_fault:
	err = -EFAULT;
	kfree_skb(skb);
error:
	ip_statistics.IpOutDiscards++;
	return err;
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 *
 *	Yes this is inefficient, feel free to submit a quicker one.
 */
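/*
 * Editor's note: worked example (not part of the original file).  For a
 * 4028-byte datagram (20-byte header plus 4008 bytes of data) and a path MTU
 * of 1500, hlen = 20 and mtu = 1480 below, so the loop produces fragments
 * carrying 1480, 1480 and 1048 data bytes with frag_off 0, 185 and 370
 * (in 8-byte units); MF is set on all but the last.
 */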
void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	unsigned char *raw;
	unsigned char *ptr;
	struct device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	raw = skb->nh.raw;
	iph = (struct iphdr*)raw;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	left = ntohs(iph->tot_len) - hlen;	/* Space per frame */
	mtu = rt->u.dst.pmtu - hlen;		/* Size of data space */
	ptr = raw + hlen;			/* Where to start from */

	/*
	 *	The protocol doesn't seem to say what to do in the case that the
	 *	frame + options doesn't fit the mtu. As it used to fall down dead
	 *	in this case, we were fortunate it didn't happen.
	 *
	 *	It is impossible, because mtu>=68. --ANK (980801)
	 */

#ifdef CONFIG_NET_PARANOIA
	if (mtu<8)
		goto fail;
#endif

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while(left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		skb2->pkt_type = skb->pkt_type;
		skb2->priority = skb->priority;
		skb_reserve(skb2, (dev->hard_header_len+15)&~15);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		skb2->dst = dst_clone(skb->dst);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, raw, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		memcpy(skb2->h.raw, ptr, len);
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF set on each fragment produced.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		ip_statistics.IpFragCreates++;

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		output(skb2);
	}
	kfree_skb(skb);
	ip_statistics.IpFragOKs++;
	return;

fail:
	kfree_skb(skb);
	ip_statistics.IpFragFails++;
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
			      unsigned int fraglen)
{
	struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
	u16 *pktp = (u16 *)to;
	struct iovec *iov;
	int len;
	int hdrflag = 1;

	iov = &dp->iov[0];
	if (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
		hdrflag = 0;
	}
	len = iov->iov_len - offset;
	if (fraglen > len) { /* overlapping. */
		dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
						     dp->csum);
		offset = 0;
		fraglen -= len;
		to += len;
		iov++;
	}

	dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
					     dp->csum);

	if (hdrflag && dp->csumoffset)
		*(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct {
		struct ip_options opt;
		char data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	sk->ip_tos = skb->nh.iph->tos;
	sk->priority = skb->priority;
	sk->protocol = skb->nh.iph->protocol;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = &replyopts.opt;

	if (ipc.opt->srr)
		daddr = replyopts.opt.faddr;
	if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
		return;

	/* And let IP do all the hard work. */
	ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
	ip_rt_put(rt);
}
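/*
 * Editor's note: an illustrative fragment (not part of the original file)
 * showing how a caller might fill in struct ip_reply_arg before calling
 * ip_send_reply(), e.g. with a pre-built TCP header 'rep' ('sk', 'skb' and
 * 'rep' are assumed from the caller's context).  Only the fields actually
 * read above are shown: iov[0], csum and csumoffset; csumoffset is counted
 * in 16-bit words, since ip_reply_glue_bits() indexes a u16 pointer with it.
 */
#if 0
	struct ip_reply_arg arg;

	arg.iov[0].iov_base = &rep;
	arg.iov[0].iov_len = sizeof(rep);
	/* Partial checksum over the reply; ip_reply_glue_bits() folds it
	 * into the checksum field at csumoffset. */
	arg.csum = csum_partial((char *)&rep, sizeof(rep), 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(sk, skb, &arg, sizeof(rep));
#endif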
/*
 *	IP protocol layer initialiser
 */

static struct packet_type ip_packet_type =
{
	__constant_htons(ETH_P_IP),
	NULL,	/* All devices */
	ip_rcv,
	NULL,
	NULL,
};

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
static struct proc_dir_entry proc_net_igmp = {
	PROC_NET_IGMP, 4, "igmp",
	S_IFREG | S_IRUGO, 1, 0, 0,
	0, &proc_net_inode_operations,
	ip_mc_procinfo
};
#endif
#endif
/*
 *	IP registers the packet type and then calls the subprotocol initialisers
 */

__initfunc(void ip_init(void))
{
	dev_add_pack(&ip_packet_type);

	ip_rt_init();

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
	proc_net_register(&proc_net_igmp);
#endif
#endif
}