/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.72 1999/09/07 02:31:15 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
/*
 *	Shall we try to damage output packets if routing dev changes?
 */
int sysctl_ip_dynaddr = 0;

int ip_id_count = 0;
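/* ip_id_count supplies the 16-bit Identification field stamped into every
 * outgoing datagram (see the htons(ip_id_count++) users below); fragments
 * of one datagram must all carry the same id so the receiver can
 * reassemble them. */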
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
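/* Note: the check field is cleared above before ip_fast_csum() runs,
 * because the one's-complement sum is taken over the whole header,
 * including the checksum field itself. */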
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_loopback_xmit(newskb);
#endif
	netif_rx(newskb);
	return 0;
}
#ifdef CONFIG_NETFILTER
/* To preserve the cute illusion that a locally-generated packet can
   be mangled before routing, we actually reroute if a hook altered
   the packet. -RR */
static int route_me_harder(struct sk_buff *skb)
{
	struct iphdr *iph = skb->nh.iph;
	struct rtable *rt;

	if (ip_route_output(&rt, iph->daddr, iph->saddr,
			    RT_TOS(iph->tos) | RTO_CONN,
			    skb->sk ? skb->sk->bound_dev_if : 0)) {
		printk("route_me_harder: No more route.\n");
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = &rt->u.dst;
	return 0;
}
#endif
/* Do route recalc if netfilter changes skb. */
static inline int
output_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
	if (skb->nfcache & NFC_ALTERED) {
		if (route_me_harder(skb) != 0) {
			kfree_skb(skb);
			return -EINVAL;
		}
	}
#endif
	return skb->dst->output(skb);
}
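/* NFC_ALTERED is set in skb->nfcache when a netfilter hook has modified
 * the packet, so the route that was looked up against the original
 * addresses may no longer be valid and must be recomputed by
 * route_me_harder() before the packet is handed to dst->output(). */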
/*
 *	Add an ip header to a skbuff and send it out.
 */
void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			   u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = sk->protinfo.af_inet.tos;
	iph->frag_off = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= htons(IP_DF);
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->protocol;
	iph->tot_len  = htons(skb->len);
	iph->id       = htons(ip_id_count++);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	/* Send it out. */
	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, NULL,
		output_maybe_reroute);
}
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

#ifdef CONFIG_NETFILTER_DEBUG
	nf_debug_ip_finish_output2(skb);
#endif /*CONFIG_NETFILTER_DEBUG*/

	if (hh) {
		read_lock_bh(&hh->hh_lock);
		memcpy(skb->data - 16, hh->hh_data, 16);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	printk(KERN_DEBUG "khm\n");
	kfree_skb(skb);
	return -EINVAL;
}
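/* dst->hh is the cached, pre-built hard header (e.g. the Ethernet header)
 * for this destination: when it is present it is simply copied in front
 * of the IP header under hh_lock and the frame is handed straight to the
 * device via hh_output().  Otherwise the neighbour layer's output routine
 * has to resolve the link-layer address first, possibly queueing the
 * packet until resolution completes. */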
__inline__ int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = __constant_htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags & RTCF_NAT)
		ip_do_nat(skb);
#endif

	skb->dev = dev;
	skb->protocol = __constant_htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->protinfo.af_inet.mc_loop)) {
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
		{
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	return ip_finish_output(skb);
}
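/* Multicast and broadcast frames are cloned and fed back through
 * ip_dev_loopback_xmit() so that local listeners receive a copy; the
 * clone shares the data buffer, so this loopback is cheap.  A multicast
 * TTL of 0 means the datagram must never leave the host at all. */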
int ip_output(struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_NAT
	struct rtable *rt = (struct rtable*)skb->dst;
#endif

	ip_statistics.IpOutRequests++;

#ifdef CONFIG_IP_ROUTE_NAT
	if (rt->rt_flags&RTCF_NAT)
		ip_do_nat(skb);
#endif

	return ip_finish_output(skb);
}
/* Queues a packet to be sent, and starts the transmitter if necessary.
 * This routine also needs to put in the total length and compute the
 * checksum.  We used to do this in two stages, ip_build_header() then
 * this, but that scheme created a mess when routes disappeared etc.
 * So we do it all here, and the TCP send engine has been changed to
 * match. (No more unroutable FIN disasters, etc. wheee...)  This will
 * most likely make other reliable transport layers above IP easier
 * to implement under Linux.
 */
static inline int ip_queue_xmit2(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev;
	struct iphdr *iph = skb->nh.iph;

#ifdef CONFIG_NETFILTER
	/* BLUE-PEN-FOR-ALEXEY.  I don't understand; you mean I can't
	   hold the route as I pass the packet to userspace? -- RR

	   You may hold it, if you really hold it.  F.e. if netfilter
	   does not destroy handed skb with skb->dst attached, it
	   will be held.  When it was stored in info->arg, then
	   it was not held apparently.  Now (without second arg) it is
	   evident that it is clean. --ANK
	 */
	if (rt == NULL || (skb->nfcache & NFC_ALTERED)) {
		if (route_me_harder(skb) != 0) {
			kfree_skb(skb);
			return -EHOSTUNREACH;
		}
	}
#endif

	dev = rt->u.dst.dev;

	/* This can happen when the transport layer has segments queued
	 * with a cached route, and by the time we get here things are
	 * re-routed to a device with a different MTU than the original
	 * device.  Sick, but we must cover it.
	 */
	if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
		kfree_skb(skb);
		if (skb2 == NULL)
			return -ENOMEM;
		if (sk)
			skb_set_owner_w(skb2, sk);
		skb = skb2;
		iph = skb->nh.iph;
	}

	if (skb->len > rt->u.dst.pmtu)
		goto fragment;

	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= __constant_htons(IP_DF);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->priority;
	return skb->dst->output(skb);

fragment:
	if (ip_dont_fragment(sk, &rt->u.dst)) {
		/* Reject packet ONLY if TCP might fragment
		 * it itself, if we were careful enough.
		 */
		iph->frag_off |= __constant_htons(IP_DF);
		NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));

		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(rt->u.dst.pmtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	return ip_fragment(skb, skb->dst->output);
}
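/* When the packet exceeds the path MTU and the socket asked for path MTU
 * discovery (DF set), no fragmentation is done: an ICMP FRAG_NEEDED error
 * is generated locally ("to self") so the transport layer can shrink its
 * segment size; otherwise the datagram is split by ip_fragment(). */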
int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct ip_options *opt = sk->protinfo.af_inet.opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = sk->daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		/* If this fails, the retransmit mechanism of the transport
		 * layer will keep trying until the route appears or the
		 * connection times out.
		 */
		if (ip_route_output(&rt, daddr, sk->saddr,
				    RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
				    sk->bound_dev_if))
			goto no_route;
		__sk_dst_set(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = sk->protinfo.af_inet.tos;
	iph->frag_off = 0;
	iph->ttl      = sk->protinfo.af_inet.ttl;
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->protocol;
	skb->nh.iph   = iph;
	/* Transport layer set skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, sk->daddr, rt, 0);
	}

	iph->tot_len = htons(skb->len);
	iph->id = htons(ip_id_count++);

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       ip_queue_xmit2);

no_route:
	ip_statistics.IpOutNoRoutes++;
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
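/* ip_queue_xmit() is the transmit entry point for connected sockets (TCP
 * goes through it for every segment): it revalidates the route cached on
 * the socket, rebuilds the route if it has been invalidated, prepends the
 * IP header, and leaves DF/fragmentation handling to ip_queue_xmit2()
 * after the LOCAL_OUT netfilter hook. */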
/*
 *	Build and send a packet, with as little as one copy.
 *
 *	Doesn't care much about ip options... option length can be
 *	different for fragment at 0 and other fragments.
 *
 *	Note that the fragment at the highest offset is sent first,
 *	so the getfrag routine can fill in the TCP/UDP checksum header
 *	field in the last fragment it sends... it also helps the
 *	reassemblers: they can put most packets in at the head of the
 *	fragment queue, and they know the total size in advance.  This
 *	last feature will measurably improve the Linux fragment handler
 *	one day.
 *
 *	The getfrag callback takes four args: an arbitrary pointer (a copy
 *	of frag), the destination to copy to within the packet (char *),
 *	the offset to copy from, and the length to be copied.
 */
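/*
 * A minimal sketch of such a getfrag callback (hypothetical, for
 * illustration only -- not part of this file): it just copies bytes from
 * a kernel buffer passed via 'frag' into the packet at the given offset.
 *
 *	static int example_getfrag(const void *p, char *to,
 *				   unsigned int offset, unsigned int len)
 *	{
 *		memcpy(to, (const char *)p + offset, len);
 *		return 0;
 *	}
 *
 * Real users (the UDP/raw sendmsg paths and ip_reply_glue_bits below)
 * additionally copy from user space and/or accumulate a checksum while
 * copying, and return non-zero on fault.
 */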
static int ip_build_xmit_slow(struct sock *sk,
			      int getfrag (const void *,
					   char *,
					   unsigned int,
					   unsigned int),
			      const void *frag,
			      unsigned length,
			      struct ipcm_cookie *ipc,
			      struct rtable *rt,
			      int flags)
{
	unsigned int fraglen, maxfraglen, fragheaderlen;
	int err;
	int offset, mf;
	int mtu;
	unsigned short id;

	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
	int nfrags = 0;
	struct ip_options *opt = ipc->opt;
	int df = 0;

	mtu = rt->u.dst.pmtu;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	length -= sizeof(struct iphdr);

	if (opt) {
		fragheaderlen = sizeof(struct iphdr) + opt->optlen;
		maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
	} else {
		fragheaderlen = sizeof(struct iphdr);

		/*
		 *	Fragheaderlen is the size of 'overhead' on each buffer. Now work
		 *	out the size of the frames to send.
		 */

		maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
	}

	if (length + fragheaderlen > 0xFFFF) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}

	/*
	 *	Start at the end of the frame by handling the remainder.
	 */

	offset = length - (length % (maxfraglen - fragheaderlen));

	/*
	 *	Amount of memory to allocate for final fragment.
	 */

	fraglen = length - offset + fragheaderlen;

	if (length-offset==0) {
		fraglen = maxfraglen;
		offset -= maxfraglen-fragheaderlen;
	}

	/*
	 *	The last fragment will not have MF (more fragments) set.
	 */

	mf = 0;

	/*
	 *	Don't fragment packets for path mtu discovery.
	 */

	if (offset > 0 && df) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
		return -EMSGSIZE;
	}
	if (flags&MSG_PROBE)
		goto out;

	/*
	 *	Get an identifier
	 */

	id = htons(ip_id_count++);

	/*
	 *	Begin outputting the bytes.
	 */

	do {
		char *data;
		struct sk_buff * skb;

		/*
		 *	Get the memory we require with some space left for alignment.
		 */

		skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
		if (skb == NULL)
			goto error;

		/*
		 *	Fill in the control structures
		 */

		skb->priority = sk->priority;
		skb->dst = dst_clone(&rt->u.dst);
		skb_reserve(skb, hh_len);

		/*
		 *	Find where to start putting bytes.
		 */

		data = skb_put(skb, fraglen);
		skb->nh.iph = (struct iphdr *)data;

		/*
		 *	Only write IP header onto non-raw packets
		 */

		{
			struct iphdr *iph = (struct iphdr *)data;

			iph->version = 4;
			iph->ihl = 5;
			if (opt) {
				iph->ihl += opt->optlen>>2;
				ip_options_build(skb, opt,
						 ipc->addr, rt, offset);
			}
			iph->tos = sk->protinfo.af_inet.tos;
			iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
			iph->id = id;
			iph->frag_off = htons(offset>>3);
			iph->frag_off |= mf|df;
			if (rt->rt_type == RTN_MULTICAST)
				iph->ttl = sk->protinfo.af_inet.mc_ttl;
			else
				iph->ttl = sk->protinfo.af_inet.ttl;
			iph->protocol = sk->protocol;
			iph->check = 0;
			iph->saddr = rt->rt_src;
			iph->daddr = rt->rt_dst;
			iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
			data += iph->ihl*4;

			/*
			 *	Any further fragments will have MF set.
			 */

			mf = htons(IP_MF);
		}

		/*
		 *	User data callback
		 */

		if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
			err = -EFAULT;
			kfree_skb(skb);
			goto error;
		}

		offset -= (maxfraglen-fragheaderlen);
		fraglen = maxfraglen;

		nfrags++;

		err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
			      skb->dst->dev, output_maybe_reroute);
		if (err) {
			if (err > 0)
				err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
			if (err)
				goto error;
		}
	} while (offset >= 0);

	if (nfrags > 1)
		ip_statistics.IpFragCreates += nfrags;
out:
	return 0;

error:
	ip_statistics.IpOutDiscards++;
	if (nfrags > 1)
		ip_statistics.IpFragCreates += nfrags;
	return err;
}
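/* Each fragment gets its own skb from sock_alloc_send_skb(), so every
 * allocation is charged against the socket's send buffer and may block
 * (or fail with EAGAIN under MSG_DONTWAIT) just like any other send. */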
/*
 *	Fast path for unfragmented packets.
 */
int ip_build_xmit(struct sock *sk,
		  int getfrag (const void *,
			       char *,
			       unsigned int,
			       unsigned int),
		  const void *frag,
		  unsigned length,
		  struct ipcm_cookie *ipc,
		  struct rtable *rt,
		  int flags)
{
	int err;
	struct sk_buff *skb;
	int df;
	struct iphdr *iph;

	/*
	 *	Try the simple case first. This leaves fragmented frames, and by
	 *	choice RAW frames within 20 bytes of maximum size (rare) to the long path
	 */

	if (!sk->protinfo.af_inet.hdrincl) {
		length += sizeof(struct iphdr);

		/*
		 *	Check for slow path.
		 */
		if (length > rt->u.dst.pmtu || ipc->opt != NULL)
			return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
	} else {
		if (length > rt->u.dst.dev->mtu) {
			ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
			return -EMSGSIZE;
		}
	}
	if (flags&MSG_PROBE)
		goto out;

	/*
	 *	Do path mtu discovery if needed.
	 */
	df = 0;
	if (ip_dont_fragment(sk, &rt->u.dst))
		df = htons(IP_DF);

	/*
	 *	Fast path for unfragmented frames without options.
	 */
	{
	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;

	skb = sock_alloc_send_skb(sk, length+hh_len+15,
				  0, flags&MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto error;
	skb_reserve(skb, hh_len);
	}

	skb->priority = sk->priority;
	skb->dst = dst_clone(&rt->u.dst);

	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);

	if (!sk->protinfo.af_inet.hdrincl) {
		iph->version = 4;
		iph->ihl = 5;
		iph->tos = sk->protinfo.af_inet.tos;
		iph->tot_len = htons(length);
		iph->id = htons(ip_id_count++);
		iph->frag_off = df;
		iph->ttl = sk->protinfo.af_inet.mc_ttl;
		if (rt->rt_type != RTN_MULTICAST)
			iph->ttl = sk->protinfo.af_inet.ttl;
		iph->protocol = sk->protocol;
		iph->saddr = rt->rt_src;
		iph->daddr = rt->rt_dst;
		iph->check = 0;
		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
		err = getfrag(frag, ((char *)iph)+iph->ihl*4, 0, length-iph->ihl*4);
	}
	else
		err = getfrag(frag, (void *)iph, 0, length);

	if (err)
		goto error_fault;

	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		      output_maybe_reroute);
	if (err > 0)
		err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
	if (err)
		goto error;
out:
	return 0;

error_fault:
	err = -EFAULT;
	kfree_skb(skb);
error:
	ip_statistics.IpOutDiscards++;
	return err;
}
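/* For raw sockets with IP_HDRINCL the caller supplies the complete IP
 * header itself, so the fast path above only checks the length against
 * the device MTU and copies the caller's buffer verbatim via getfrag();
 * no header fields are filled in by the kernel. */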
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each consisting of an IP header plus a block of
 *	the original data) that will still fit in a single device frame, and
 *	queue each such frame for sending.
 *
 *	Yes this is inefficient, feel free to submit a quicker one.
 */
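/* The IP fragment offset field counts 8-byte blocks, which is why offsets
 * are shifted right by 3 before being stored in frag_off and why every
 * fragment length except the last is rounded down to a multiple of 8
 * (the "len &= ~7" below). */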
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	unsigned char *raw;
	unsigned char *ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	raw = skb->nh.raw;
	iph = (struct iphdr*)raw;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	left = ntohs(iph->tot_len) - hlen;	/* Space per frame */
	mtu = rt->u.dst.pmtu - hlen;		/* Size of data space */
	ptr = raw + hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15, GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		skb2->pkt_type = skb->pkt_type;
		skb2->priority = skb->priority;
		skb_reserve(skb2, (dev->hard_header_len+15)&~15);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		skb2->dst = dst_clone(skb->dst);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, raw, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		memcpy(skb2->h.raw, ptr, len);
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each piece.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		ip_statistics.IpFragCreates++;

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	ip_statistics.IpFragOKs++;
	return err;

fail:
	kfree_skb(skb);
	ip_statistics.IpFragFails++;
	return err;
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
			      unsigned int fraglen)
{
	struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
	u16 *pktp = (u16 *)to;
	struct iovec *iov;
	int len;
	int hdrflag = 1;

	iov = &dp->iov[0];
	if (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
		hdrflag = 0;
	}
	len = iov->iov_len - offset;
	if (fraglen > len) { /* overlapping. */
		dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
						     dp->csum);
		offset = 0;
		fraglen -= len;
		to += len;
		iov++;
	}

	dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
					     dp->csum);

	if (hdrflag && dp->csumoffset)
		*(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
	return 0;
}
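/* dp->csumoffset is expressed in 16-bit words from the start of the
 * transport header, so "pktp + dp->csumoffset" lands on the TCP/UDP
 * checksum field; csum_fold() collapses the accumulated partial sum into
 * the final 16-bit one's-complement value stored there. */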
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct {
		struct ip_options opt;
		char data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = &replyopts.opt;

	if (ipc.opt->srr)
		daddr = replyopts.opt.faddr;
	if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BHs locally disabled and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	sk->protinfo.af_inet.tos = skb->nh.iph->tos;
	sk->priority = skb->priority;
	sk->protocol = skb->nh.iph->protocol;
	ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
/*
 *	IP protocol layer initialiser
 */

static struct packet_type ip_packet_type =
{
	__constant_htons(ETH_P_IP),
	NULL,	/* All devices */
	ip_rcv,
	(void*)1,
	NULL,
};

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
static struct proc_dir_entry proc_net_igmp = {
	PROC_NET_IGMP, 4, "igmp",
	S_IFREG | S_IRUGO, 1, 0, 0,
	0, &proc_net_inode_operations,
	ip_mc_procinfo
};
#endif
#endif
/*
 *	IP registers the packet type and then calls the subprotocol initialisers
 */

void __init ip_init(void)
{
	dev_add_pack(&ip_packet_type);

	ip_rt_init();

#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_MULTICAST
	proc_net_register(&proc_net_igmp);
#endif
#endif
}