/* net/ipv4/ip_output.c (davej-history.git, import of Linux 2.1.99pre2) */

/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.56 1998/04/17 02:36:46 davem Exp $
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case a packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen      :       Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <linux/igmp.h>
#include <linux/ip_fw.h>
#include <linux/firewall.h>
#include <linux/mroute.h>
#include <linux/netlink.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr = 0;

int ip_id_count = 0;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

/*
 *      Add an ip header to a skbuff and send it out.
 */
void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                           u32 saddr, u32 daddr, struct ip_options *opt)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = sk->ip_tos;
        iph->frag_off = 0;
        if (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
            !(rt->u.dst.mxlock&(1<<RTAX_MTU)))
                iph->frag_off |= htons(IP_DF);
        iph->ttl      = sk->ip_ttl;
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->protocol;
        iph->tot_len  = htons(skb->len);
        iph->id       = htons(ip_id_count++);
        skb->nh.iph   = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }

        ip_send_check(iph);

        /* Send it out. */
        skb->dst->output(skb);
}

int __ip_finish_output(struct sk_buff *skb)
{
        return ip_finish_output(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */

        ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ROUTE_NAT
        if (rt->rt_flags & RTCF_NAT)
                ip_do_nat(skb);
#endif

        skb->dev = dev;
        skb->protocol = __constant_htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
#ifndef CONFIG_IP_MROUTE
#if 1
                /* It should never occur. Delete it eventually. --ANK */
                if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK))
                        printk(KERN_DEBUG "ip_mc_output (mc): it should never occur\n");
                else
#endif
#else
                /* Small optimization: do not loopback not local frames,
                   which returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note, that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                        dev_loopback_xmit(skb);

                /* Multicasts with ttl 0 must not go beyond the host */

                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
#if 1
                /* It should never occur. Delete it eventually. --ANK */
                if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK))
                        printk(KERN_DEBUG "ip_mc_output (brd): it should never occur!\n");
                else
#endif
                        dev_loopback_xmit(skb);
        }

        return ip_finish_output(skb);
}

int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
        struct rtable *rt = (struct rtable*)skb->dst;
#endif

        ip_statistics.IpOutRequests++;

#ifdef CONFIG_IP_ROUTE_NAT
        if (rt->rt_flags&RTCF_NAT)
                ip_do_nat(skb);
#endif

        return ip_finish_output(skb);
}

#ifdef CONFIG_IP_ACCT
int ip_acct_output(struct sk_buff *skb)
{
        /*
         *      Count mapping we shortcut
         */

        ip_fw_chk(skb->nh.iph, skb->dev, NULL, ip_acct_chain, 0, IP_FW_MODE_ACCT_OUT);

        dev_queue_xmit(skb);

        return 0;
}
#endif

/* Queues a packet to be sent, and starts the transmitter if necessary.
 * This routine also needs to put in the total length and compute the
 * checksum.  We used to do this in two stages, ip_build_header() then
 * this, but that scheme created a mess when routes disappeared etc.
 * So we do it all here, and the TCP send engine has been changed to
 * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 * most likely make other reliable transport layers above IP easier
 * to implement under Linux.
 */

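/* A minimal sketch of the calling convention a transport protocol is
 * assumed to follow (illustrative only, not part of the original file):
 * allocate an skb with headroom for the IP and link-layer headers,
 * build the transport header, attach the socket, then hand it off:
 *
 *      skb = sock_wmalloc(sk, size + MAX_HEADER, 0, GFP_KERNEL);
 *      skb_reserve(skb, MAX_HEADER);
 *      skb->h.th = (struct tcphdr *)skb_put(skb, sizeof(struct tcphdr));
 *      ... fill in the transport header and payload, set skb->sk = sk ...
 *      ip_queue_xmit(skb);
 */
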
void ip_queue_xmit(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct ip_options *opt = sk->opt;
        struct rtable *rt;
        struct device *dev;
        struct iphdr *iph;
        unsigned int tot_len;

        /* Make sure we can route this packet. */
        rt = (struct rtable *) sk->dst_cache;
        if (rt == NULL || rt->u.dst.obsolete) {
                u32 daddr;

                sk->dst_cache = NULL;
                ip_rt_put(rt);

                /* Use correct destination address if we have options. */
                daddr = sk->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                /* If this fails, retransmit mechanism of transport layer will
                 * keep trying until route appears or the connection times itself
                 * out.
                 */
                if (ip_route_output(&rt, daddr, sk->saddr,
                                    RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
                                    sk->bound_dev_if))
                        goto drop;
                sk->dst_cache = &rt->u.dst;
        }
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* We have a route, so grab a reference. */
        skb->dst = dst_clone(sk->dst_cache);

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = sk->ip_tos;
        iph->frag_off = 0;
        if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU)))
                iph->frag_off |= __constant_htons(IP_DF);
        iph->ttl      = sk->ip_ttl;
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->protocol;
        skb->nh.iph   = iph;
        /* Transport layer set skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, sk->daddr, rt, 0);
        }

        tot_len = skb->len;
        iph->tot_len = htons(tot_len);
        iph->id = htons(ip_id_count++);

        dev = rt->u.dst.dev;

        if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT)
                goto drop;

        /* This can happen when the transport layer has segments queued
         * with a cached route, and by the time we get here things are
         * re-routed to a device with a different MTU than the original
         * device.  Sick, but we must cover it.
         */
        if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
                kfree_skb(skb);
                if (skb2 == NULL)
                        return;
                skb = skb2;
                iph = skb->nh.iph;
        }

        /* Do we need to fragment.  Again this is inefficient.  We
         * need to somehow lock the original buffer and use bits of it.
         */
        if (tot_len > rt->u.dst.pmtu)
                goto fragment;

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->priority;
        skb->dst->output(skb);
        return;

fragment:
        if ((iph->frag_off & htons(IP_DF)) != 0) {
                printk(KERN_DEBUG "sending pkt_too_big to self\n");
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(rt->u.dst.pmtu));
                goto drop;
        }

        ip_fragment(skb, skb->dst->output);
        return;

no_route:
        sk->dst_cache = NULL;
        ip_rt_put(rt);
        ip_statistics.IpOutNoRoutes++;
        /* Fall through... */
drop:
        kfree_skb(skb);
}

/*
 *      Build and send a packet, with as little as one copy
 *
 *      Doesn't care much about ip options... option length can be
 *      different for fragment at 0 and other fragments.
 *
 *      Note that the fragment at the highest offset is sent first,
 *      so the getfrag routine can fill in the TCP/UDP checksum header
 *      field in the last fragment it sends... actually it also helps
 *      the reassemblers, they can put most packets in at the head of
 *      the fragment queue, and they know the total size in advance. This
 *      last feature will measurably improve the Linux fragment handler one
 *      day.
 *
 *      The callback has four args: an arbitrary pointer (a copy of frag),
 *      the destination buffer to copy into (char *), the offset to copy
 *      from, and the length to be copied.
 */

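/* For illustration only (not from the original source): a getfrag
 * callback copying from a flat kernel buffer could look like the
 * hypothetical helper below.  The real callers (UDP, raw sockets,
 * ICMP) copy from user iovecs and may accumulate a checksum while
 * copying.
 *
 *      static int example_getfrag(const void *p, char *to,
 *                                 unsigned int offset, unsigned int fraglen)
 *      {
 *              memcpy(to, (const char *)p + offset, fraglen);
 *              return 0;       (a nonzero return is reported as -EFAULT)
 *      }
 */
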
int ip_build_xmit_slow(struct sock *sk,
                       int getfrag (const void *,
                                    char *,
                                    unsigned int,
                                    unsigned int),
                       const void *frag,
                       unsigned length,
                       struct ipcm_cookie *ipc,
                       struct rtable *rt,
                       int flags)
{
        unsigned int fraglen, maxfraglen, fragheaderlen;
        int err;
        int offset, mf;
        unsigned short id;

        int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
        int nfrags = 0;
        struct ip_options *opt = ipc->opt;
        int df = htons(IP_DF);

        if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
            (rt->u.dst.mxlock&(1<<RTAX_MTU)))
                df = 0;

        if (!sk->ip_hdrincl)
                length -= sizeof(struct iphdr);

        if (opt) {
                fragheaderlen = sizeof(struct iphdr) + opt->optlen;
                maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
        } else {
                fragheaderlen = sk->ip_hdrincl ? 0 : sizeof(struct iphdr);

                /*
                 *      Fragheaderlen is the size of 'overhead' on each buffer. Now work
                 *      out the size of the frames to send.
                 */

                maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
        }

        if (length + fragheaderlen > 0xFFFF)
                return -EMSGSIZE;

        /*
         *      Start at the end of the frame by handling the remainder.
         */

        offset = length - (length % (maxfraglen - fragheaderlen));

        /*
         *      Amount of memory to allocate for final fragment.
         */

        fraglen = length - offset + fragheaderlen;

        if (length-offset == 0) {
                fraglen = maxfraglen;
                offset -= maxfraglen-fragheaderlen;
        }

        /*
         *      The last fragment will not have MF (more fragments) set.
         */

        mf = 0;

        /*
         *      Don't fragment packets for path mtu discovery.
         */

        if (offset > 0 && df) {
                return(-EMSGSIZE);
        }

        /*
         *      Lock the device lists.
         */

        dev_lock_list();

        /*
         *      Get an identifier
         */

        id = htons(ip_id_count++);

        /*
         *      Begin outputting the bytes.
         */

        do {
                int error;
                char *data;
                struct sk_buff * skb;

                /*
                 *      Get the memory we require with some space left for alignment.
                 */

                skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &error);
                if (skb == NULL) {
                        ip_statistics.IpOutDiscards++;
                        if (nfrags > 1)
                                ip_statistics.IpFragCreates++;
                        dev_unlock_list();
                        return(error);
                }

                /*
                 *      Fill in the control structures
                 */

                skb->priority = sk->priority;
                skb->dst = dst_clone(&rt->u.dst);
                skb_reserve(skb, hh_len);

                /*
                 *      Find where to start putting bytes.
                 */

                data = skb_put(skb, fraglen);
                skb->nh.iph = (struct iphdr *)data;

                /*
                 *      Only write IP header onto non-raw packets
                 */

                if (!sk->ip_hdrincl) {
                        struct iphdr *iph = (struct iphdr *)data;

                        iph->version = 4;
                        iph->ihl = 5;
                        if (opt) {
                                iph->ihl += opt->optlen>>2;
                                ip_options_build(skb, opt,
                                                 ipc->addr, rt, offset);
                        }
                        iph->tos = sk->ip_tos;
                        iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
                        iph->id = id;
                        iph->frag_off = htons(offset>>3);
                        iph->frag_off |= mf|df;
                        if (rt->rt_type == RTN_MULTICAST)
                                iph->ttl = sk->ip_mc_ttl;
                        else
                                iph->ttl = sk->ip_ttl;
                        iph->protocol = sk->protocol;
                        iph->check = 0;
                        iph->saddr = rt->rt_src;
                        iph->daddr = rt->rt_dst;
                        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
                        data += iph->ihl*4;

                        /*
                         *      Any further fragments will have MF set.
                         */

                        mf = htons(IP_MF);
                }

                /*
                 *      User data callback
                 */

                err = 0;
                if (getfrag(frag, data, offset, fraglen-fragheaderlen))
                        err = -EFAULT;

                /*
                 *      Account for the fragment.
                 */

                if (!err && offset == 0 &&
                    call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb) < FW_ACCEPT)
                        err = -EPERM;

                if (err) {
                        ip_statistics.IpOutDiscards++;
                        kfree_skb(skb);
                        dev_unlock_list();
                        return err;
                }

                offset -= (maxfraglen-fragheaderlen);
                fraglen = maxfraglen;

                nfrags++;

                err = 0;
                if (rt->u.dst.output(skb)) {
                        err = -ENETDOWN;
                        ip_statistics.IpOutDiscards++;
                        break;
                }
        } while (offset >= 0);

        if (nfrags > 1)
                ip_statistics.IpFragCreates += nfrags;
        dev_unlock_list();
        return err;
}

/*
 *      Fast path for unfragmented packets.
 */
int ip_build_xmit(struct sock *sk,
                  int getfrag (const void *,
                               char *,
                               unsigned int,
                               unsigned int),
                  const void *frag,
                  unsigned length,
                  struct ipcm_cookie *ipc,
                  struct rtable *rt,
                  int flags)
{
        int err;
        struct sk_buff *skb;
        int df;
        struct iphdr *iph;

        /*
         *      Try the simple case first. This leaves fragmented frames, and by
         *      choice RAW frames within 20 bytes of maximum size (rare) to the long path
         */

        if (!sk->ip_hdrincl)
                length += sizeof(struct iphdr);

        /*
         *      Check for slow path.
         */
        if (length > rt->u.dst.pmtu || ipc->opt != NULL)
                return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);

        /*
         *      Do path mtu discovery if needed.
         */
        df = htons(IP_DF);
        if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
            (rt->u.dst.mxlock&(1<<RTAX_MTU)))
                df = 0;

        /*
         *      Fast path for unfragmented frames without options.
         */
        {
        int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;

        skb = sock_alloc_send_skb(sk, length+hh_len+15,
                                  0, flags&MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto error;
        skb_reserve(skb, hh_len);
        }

        skb->priority = sk->priority;
        skb->dst = dst_clone(&rt->u.dst);

        skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);

        dev_lock_list();

        if (!sk->ip_hdrincl) {
                iph->version = 4;
                iph->ihl = 5;
                iph->tos = sk->ip_tos;
                iph->tot_len = htons(length);
                iph->id = htons(ip_id_count++);
                iph->frag_off = df;
                iph->ttl = sk->ip_mc_ttl;
                if (rt->rt_type != RTN_MULTICAST)
                        iph->ttl = sk->ip_ttl;
                iph->protocol = sk->protocol;
                iph->saddr = rt->rt_src;
                iph->daddr = rt->rt_dst;
                iph->check = 0;
                iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
                err = getfrag(frag, ((char *)iph)+iph->ihl*4, 0, length-iph->ihl*4);
        }
        else
                err = getfrag(frag, (void *)iph, 0, length);

        dev_unlock_list();

        if (err)
                err = -EFAULT;

        if (!err && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT)
                err = -EPERM;

        if (err) {
                kfree_skb(skb);
                goto error;
        }

        return rt->u.dst.output(skb);

error:
        ip_statistics.IpOutDiscards++;
        return err;
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up into
 *      smaller pieces (each of size equal to IP header plus
 *      a block of the data of the original IP data part) that will yet fit in a
 *      single device frame, and queue such a frame for sending.
 *
 *      Yes this is inefficient, feel free to submit a quicker one.
 */

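/* Worked example (illustrative, not from the original source): a
 * datagram carrying 4000 bytes of data behind a 20-byte header, sent
 * over a 1500-byte MTU, leaves mtu = 1480 bytes of data space per
 * fragment.  The loop below then emits fragments carrying 1480, 1480
 * and 1040 bytes of data at byte offsets 0, 1480 and 2960; since
 * frag_off counts 8-byte units, the values written are 0, 185 and
 * 370, and IP_MF is set on every fragment except the last.
 */
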
void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        unsigned char *raw;
        unsigned char *ptr;
        struct device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len;
        int offset;
        int not_last_frag;
        u16 dont_fragment;
        struct rtable *rt = (struct rtable*)skb->dst;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        raw = skb->nh.raw;
        iph = (struct iphdr*)raw;

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        left = ntohs(iph->tot_len) - hlen;      /* Space per frame */
        mtu = rt->u.dst.pmtu - hlen;            /* Size of data space */
        ptr = raw + hlen;                       /* Where to start from */

        /*
         *      The protocol doesn't seem to say what to do in the case that the
         *      frame + options doesn't fit the mtu. As it used to fall down dead
         *      in this case we were fortunate it didn't happen
         */

        if (mtu < 8)
                goto fail;

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Nice moment: if DF is set and we are here,
         *      it means that packet should be fragmented and
         *      DF is set on fragments. If it works,
         *      path MTU discovery can be done by ONE segment(!). --ANK
         */
        dont_fragment = iph->frag_off & htons(IP_DF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending upto and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                skb2->pkt_type = skb->pkt_type;
                skb2->priority = skb->priority;
                skb_reserve(skb2, (dev->hard_header_len+15)&~15);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                skb2->dst = dst_clone(skb->dst);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, raw, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                memcpy(skb2->h.raw, ptr, len);
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3))|dont_fragment;

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb2);

                /*
                 *      Added AC : If we are fragmenting a fragment that's not the
                 *                 last fragment then keep MF on each bit
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                ip_statistics.IpFragCreates++;

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                output(skb2);
        }

        kfree_skb(skb);
        ip_statistics.IpFragOKs++;
        return;

fail:
        kfree_skb(skb);
        ip_statistics.IpFragFails++;
}

/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
                              unsigned int fraglen)
{
        struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
        u16 *pktp = (u16 *)to;
        struct iovec *iov;
        int len;
        int hdrflag = 1;

#if 0
        printk("ip_reply_glue_bits: offset=%u,flen=%u iov[0].l=%u,iov[1].len=%u\n",
               offset,fraglen,dp->iov[0].iov_len,dp->iov[1].iov_len);
#endif

        iov = &dp->iov[0];
        if (offset >= iov->iov_len) {
                offset -= iov->iov_len;
                iov++;
                hdrflag = 0;
        }
        len = iov->iov_len - offset;
        if (fraglen > len) { /* overlapping. */
#if 1
                if (iov > &dp->iov[0]) {
                        printk("frag too long! (o=%u,fl=%u)\n",offset,fraglen);
                        return -1;
                }
#endif
                dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
                                                     dp->csum);
                offset = 0;
                fraglen -= len;
                to += len;
                iov++;
        }

        dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
                                             dp->csum);

        if (hdrflag && dp->csumoffset)
                *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
        return 0;
}

/*
 *      Generic function to send a packet as reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 */

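/* Illustrative sketch, based on assumptions about the TCP reset path
 * rather than on this file: the caller is expected to point iov[0] at
 * a prebuilt transport header, seed csum with a partial checksum over
 * it, and set csumoffset to the 16-bit word offset of the checksum
 * field so ip_reply_glue_bits() can fold it in:
 *
 *      struct ip_reply_arg arg;
 *
 *      arg.iov[0].iov_base = (unsigned char *)&th;
 *      arg.iov[0].iov_len  = sizeof(th);
 *      arg.csum = csum_partial((char *)&th, sizeof(th), 0);
 *      arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *      ip_send_reply(sk, skb, &arg, sizeof(th));
 */
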
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        u32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        sk->ip_tos = skb->nh.iph->tos;
        sk->priority = skb->priority;
        sk->protocol = skb->nh.iph->protocol;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = &replyopts.opt;

        if (ipc.opt->srr)
                daddr = replyopts.opt.faddr;
        if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
                return;

        /* And let IP do all the hard work. */
        ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
        ip_rt_put(rt);
}

/*
 *      IP protocol layer initialiser
 */

static struct packet_type ip_packet_type =
{
        __constant_htons(ETH_P_IP),
        NULL,           /* All devices */
        ip_rcv,
        NULL,
        NULL,
};

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
static struct proc_dir_entry proc_net_igmp = {
        PROC_NET_IGMP, 4, "igmp",
        S_IFREG | S_IRUGO, 1, 0, 0,
        0, &proc_net_inode_operations,
        ip_mc_procinfo
};
#endif
#endif

/*
 *      IP registers the packet type and then calls the subprotocol initialisers
 */

__initfunc(void ip_init(void))
{
        dev_add_pack(&ip_packet_type);

        ip_rt_init();

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
        proc_net_register(&proc_net_igmp);
#endif
#endif
}