/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.69 1999/06/09 10:11:02 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split into fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year in a coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <linux/sysctl.h>
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

int ip_rt_min_delay = 2*HZ;
int ip_rt_max_delay = 10*HZ;
int ip_rt_gc_thresh = RT_HASH_DIVISOR;
int ip_rt_max_size = RT_HASH_DIVISOR*16;
int ip_rt_gc_timeout = RT_GC_TIMEOUT;
int ip_rt_gc_interval = 60*HZ;
int ip_rt_gc_min_interval = 5*HZ;
int ip_rt_redirect_number = 9;
int ip_rt_redirect_load = HZ/50;
int ip_rt_redirect_silence = ((HZ/50) << (9+1));
int ip_rt_error_cost = HZ;
int ip_rt_error_burst = 5*HZ;
int ip_rt_gc_elasticity = 8;
int ip_rt_mtu_expires = 10*60*HZ;

static unsigned long rt_deadline = 0;
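/*
 * Illustrative arithmetic for the defaults above (assuming the usual
 * HZ of 100): ip_rt_gc_thresh starts at one entry per hash bucket
 * (RT_HASH_DIVISOR), and ip_rt_max_size allows sixteen entries per
 * bucket before the cache is considered full.  ip_rt_redirect_silence
 * is the base redirect interval (HZ/50 = 20ms) shifted left by
 * ip_rt_redirect_number+1 = 10 doublings, i.e. roughly 20.5 seconds
 * of quiet before the redirect backoff state is forgotten.
 */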
#define RTprint(a...)	printk(KERN_DEBUG a)

static void rt_run_flush(unsigned long dummy);

static struct timer_list rt_flush_timer =
	{ NULL, NULL, 0, 0L, rt_run_flush };
static struct timer_list rt_periodic_timer =
	{ NULL, NULL, 0, 0L, NULL };
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry * dst, u32);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry * dst,
					  struct sk_buff *skb);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *);
static void		 ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);

struct dst_ops ipv4_dst_ops =
{
	AF_INET,
	__constant_htons(ETH_P_IP),
	RT_HASH_DIVISOR,

	rt_garbage_collect,
	ipv4_dst_check,
	ipv4_dst_reroute,
	NULL,
	ipv4_negative_advice,
	ipv4_link_failure,
};

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	TC_PRIO_FILLER,
	TC_PRIO_BESTEFFORT,
	TC_PRIO_FILLER,
	TC_PRIO_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_FILLER,
	TC_PRIO_INTERACTIVE_BULK,
	TC_PRIO_FILLER
};
/* The locking scheme is rather straightforward:
 *
 * 1) A BH protected rwlock protects the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

static struct rtable	*rt_hash_table[RT_HASH_DIVISOR];
static rwlock_t		 rt_hash_lock = RW_LOCK_UNLOCKED;

static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
	hash = hash^saddr^tos;
	hash = hash^(hash>>16);
	return (hash^(hash>>8)) & 0xFF;
}
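/*
 * Worked example (illustration only): for daddr 10.0.0.1 (0x0A000001),
 * saddr 192.168.1.1 (0xC0A80101) and tos 0, the nibble swap of daddr
 * gives 0xA0000010; xor with saddr yields 0x60A80111; folding in
 * (hash>>16) gives 0x60A861B9; and the final byte fold returns
 * 0xB9^0x61 = 0xD8, i.e. the route lands in bucket 216 of the 256
 * buckets this hash can address.
 */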
#ifdef CONFIG_PROC_FS

static int rt_cache_get_info(char *buffer, char **start, off_t offset,
			     int length, int dummy)
{
	int len = 0;
	off_t pos = 0;
	char temp[129];
	struct rtable *r;
	int i;

	pos = 128;

	if (offset < 128) {
		sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
		len = 128;
	}

	read_lock_bh(&rt_hash_lock);

	for (i = 0; i < RT_HASH_DIVISOR; i++) {
		for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
			/*
			 *	Spin through entries until we are ready
			 */
			pos += 128;

			if (pos <= offset) {
				len = 0;
				continue;
			}
			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
				r->u.dst.dev ? r->u.dst.dev->name : "*",
				(unsigned long)r->rt_dst,
				(unsigned long)r->rt_gateway,
				r->rt_flags,
				atomic_read(&r->u.dst.use),
				atomic_read(&r->u.dst.refcnt),
				0,
				(unsigned long)r->rt_src, (int)r->u.dst.pmtu,
				r->u.dst.window,
				(int)r->u.dst.rtt, r->key.tos,
				r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
				r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
				r->rt_spec_dst);
			sprintf(buffer+len,"%-127s\n",temp);
			len += 128;
			if (pos >= offset+length)
				goto done;
		}
	}

done:
	read_unlock_bh(&rt_hash_lock);

	*start = buffer+len-(pos-offset);
	len = pos-offset;
	if (len > length)
		len = length;
	return len;
}
#endif
static __inline__ void rt_free(struct rtable *rt)
{
	dst_free(&rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	dst_free(&rt->u.dst);
}
static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
		&& rth->key.iif && rth->u.rt_next);
}
static __inline__ int rt_valuable(struct rtable *rth)
{
	return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
		|| rth->u.dst.expires);
}
static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
{
	int age;

	if (atomic_read(&rth->u.dst.use))
		return 0;

	if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
		return 1;

	age = jiffies - rth->u.dst.lastuse;
	if (age <= tmo1 && !rt_fast_clean(rth))
		return 0;
	if (age <= tmo2 && rt_valuable(rth))
		return 0;
	return 1;
}
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	int i;
	static int rover;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (i = 0; i < RT_HASH_DIVISOR/5; i++) {
		unsigned tmo = ip_rt_gc_timeout;

		rover = (rover + 1) & (RT_HASH_DIVISOR-1);
		rthp = &rt_hash_table[rover];

		write_lock(&rt_hash_lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if ((long)(now - rth->u.dst.expires) <= 0) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/*
			 * Cleanup aged off entries.
			 */
			*rthp = rth->u.rt_next;
			rt_free(rth);
		}
		write_unlock(&rt_hash_lock);

		/* Fallback loop breaker. */
		if ((jiffies - now) > 0)
			break;
	}
	rt_periodic_timer.expires = now + ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);
}
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	for (i = 0; i < RT_HASH_DIVISOR; i++) {
		write_lock_bh(&rt_hash_lock);
		rth = rt_hash_table[i];
		if (rth)
			rt_hash_table[i] = NULL;
		write_unlock_bh(&rt_hash_lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rth->u.rt_next = NULL;
			rt_free(rth);
		}
	}
}
static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_interrupt();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolong timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	rt_flush_timer.expires = now + delay;
	add_timer(&rt_flush_timer);
	spin_unlock_bh(&rt_flush_lock);
}
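/*
 * Illustrative timeline (with the default min_delay=2s, max_delay=10s):
 * the first rt_cache_flush(2*HZ) arms the timer and sets
 * rt_deadline = now+10s.  Later requests re-arm the timer, but "delay"
 * is clipped to the time remaining before rt_deadline, so a steady
 * stream of requests cannot postpone the real flush past the deadline.
 * From process context the clipping is stricter: once less than
 * max_delay-min_delay (8s) remains, tmo is zeroed and the flush runs
 * immediately.
 */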
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
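/*
 * Rough numbers for the feedback loop below (assuming the stock
 * RT_HASH_DIVISOR of 256 and the default ip_rt_gc_elasticity of 8):
 * hard GC pressure starts once the cache exceeds 256*8 = 2048 entries;
 * below that, "goal" is computed against the slowly adapting
 * "equilibrium", so an idle machine keeps its warm entries while a
 * busy one ratchets equilibrium upward and expires more aggressively.
 */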
static int rt_garbage_collect(void)
{
	static unsigned expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		return 0;

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min(goal/2, RT_HASH_DIVISOR);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max(goal/2, RT_HASH_DIVISOR);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	do {
		int i, k;

		/* The write lock is held during the entire hash
		 * traversal to ensure consistent state of the rover.
		 */
		write_lock_bh(&rt_hash_lock);
		for (i = 0, k = rover; i < RT_HASH_DIVISOR; i++) {
			unsigned tmo = expire;

			k = (k + 1) & (RT_HASH_DIVISOR-1);
			rthp = &rt_hash_table[k];
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
				*rthp = rth->u.rt_next;
				rth->u.rt_next = NULL;
				rt_free(rth);
				goal--;
			}
			if (goal <= 0)
				break;
		}
		rover = k;
		write_unlock_bh(&rt_hash_lock);

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif
		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			return 0;
	} while (!in_interrupt() && jiffies - now < 1);

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		return 0;
	if (net_ratelimit())
		printk("dst cache overflow\n");
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int attempts = !in_interrupt();

restart:
	rthp = &rt_hash_table[hash];

	write_lock_bh(&rt_hash_lock);
	while ((rth = *rthp) != NULL) {
		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
			/* Put it first */
			*rthp = rth->u.rt_next;
			rth->u.rt_next = rt_hash_table[hash];
			rt_hash_table[hash] = rth;

			atomic_inc(&rth->u.dst.refcnt);
			atomic_inc(&rth->u.dst.use);
			rth->u.dst.lastuse = now;
			write_unlock_bh(&rt_hash_lock);

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		rthp = &rth->u.rt_next;
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
		if (!arp_bind_neighbour(&rt->u.dst)) {
			write_unlock_bh(&rt_hash_lock);

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk("neighbour table overflow\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %08x", trt->rt_dst);
		printk("\n");
	}
#endif
	rt_hash_table[hash] = rt;
	write_unlock_bh(&rt_hash_lock);
	*rp = rt;
	return 0;
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct device *dev)
{
	int i, k;
	struct in_device *in_dev = dev->ip_ptr;
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_TOS_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);

			rthp = &rt_hash_table[hash];

			write_lock_bh(&rt_hash_lock);
			while ((rth = *rthp) != NULL) {
				struct rtable *rt;

				if (rth->key.dst != daddr ||
				    rth->key.src != skeys[i] ||
				    rth->key.tos != tos ||
				    rth->key.oif != ikeys[k] ||
				    rth->key.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_clone(&rth->u.dst);

				rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					write_unlock_bh(&rt_hash_lock);
					return;
				}

				/*
				 * Copy all the information.
				 */
				*rt = *rth;
				atomic_set(&rt->u.dst.refcnt, 1);
				atomic_set(&rt->u.dst.use, 1);
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;

				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (!arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					break;
				}

				*rthp = rth->u.rt_next;
				write_unlock_bh(&rt_hash_lock);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				rt_drop(rth);
				goto do_next;
			}
			write_unlock_bh(&rt_hash_lock);
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %lX/%s to %lX ignored."
		       "Path = %lX -> %lX, tos %02x\n",
		       ntohl(old_gw), dev->name, ntohl(new_gw),
		       ntohl(saddr), ntohl(daddr), tos);
#endif
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;

	if (rt != NULL) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			return NULL;
		}
		if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
			struct rtable **rthp;
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
			ip_rt_put(rt);
			write_lock_bh(&rt_hash_lock);
			for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
				if (*rthp == rt) {
					*rthp = rt->u.rt_next;
					rt_free(rt);
					break;
				}
			}
			write_unlock_bh(&rt_hash_lock);
			return NULL;
		}
	}
	return dst;
}
/* Algorithm:

 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
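/*
 * Worked schedule (illustration, assuming HZ=100): ip_rt_redirect_load
 * is HZ/50 = 20ms, so successive redirects to one destination are
 * spaced 20ms, 40ms, 80ms, ... up to 20ms<<8 ~= 5.1s apart; after
 * ip_rt_redirect_number (9) unanswered redirects we go silent, and
 * only ip_rt_redirect_silence (20ms<<10 ~= 20.5s) without triggering
 * packets resets rate_tokens and re-enables sending.
 */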
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;

	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
		return;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
			printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
			       rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
	case EINVAL:
	default:
		kfree_skb(skb);
		return 0;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

	kfree_skb(skb);
	return 0;
}
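/*
 * Note on the token bucket above (default values, HZ=100): rate_tokens
 * accumulates elapsed jiffies, capped at ip_rt_error_burst (5*HZ), and
 * each ICMP error costs ip_rt_error_cost (HZ) tokens.  That allows a
 * burst of five ICMP errors followed by a sustained rate of at most
 * one per second for each cached route.
 */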
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
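/*
 * Example: this is the RFC 1191 plateau search.  guess_mtu() returns
 * the first plateau strictly below old_mtu: old_mtu 1500 -> 1492,
 * 4352 -> 2002, 576 -> 296; anything at or below the last plateau
 * falls through to the IPv4 minimum of 68.
 */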
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_TOS_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		read_lock_bh(&rt_hash_lock);
		for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
			if (rth->key.dst == daddr &&
			    rth->key.src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->key.tos == tos &&
			    rth->key.iif == 0 &&
			    !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
					    old_mtu >= 68 + (iph->ihl<<2))
						old_mtu -= iph->ihl<<2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.pmtu) {
					if (mtu < rth->u.dst.pmtu) {
						dst_confirm(&rth->u.dst);
						rth->u.dst.pmtu = mtu;
						dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		read_unlock_bh(&rt_hash_lock);
	}
	return est_mtu ? : new_mtu;
}
void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
{
	if (dst->pmtu > mtu && mtu >= 68 &&
	    !(dst->mxlock&(1<<RTAX_MTU))) {
		dst->pmtu = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry * dst, u32 cookie)
{
	return NULL;
}

static struct dst_entry *ipv4_dst_reroute(struct dst_entry * dst,
					  struct sk_buff *skb)
{
	return NULL;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
	       skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned.
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->key.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->key, &res) == 0)
		src = FIB_RES_PREFSRC(res);
	else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid&0xFFFF))
		rt->u.dst.tclassid |= tag&0xFFFF;
	if (!(rt->u.dst.tclassid&0xFFFF0000))
		rt->u.dst.tclassid |= tag&0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
		rt->u.dst.pmtu = fi->fib_mtu;
		if (fi->fib_mtu == 0) {
			rt->u.dst.pmtu = rt->u.dst.dev->mtu;
			if (rt->u.dst.pmtu > IP_MAX_MTU)
				rt->u.dst.pmtu = IP_MAX_MTU;
			if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.pmtu > 576)
				rt->u.dst.pmtu = 576;
		}
		rt->u.dst.window = fi->fib_window ? : 0;
		rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else {
		rt->u.dst.pmtu = rt->u.dst.dev->mtu;
		if (rt->u.dst.pmtu > IP_MAX_MTU)
			rt->u.dst.pmtu = IP_MAX_MTU;
		rt->u.dst.window = 0;
		rt->u.dst.rtt = TCP_TIMEOUT_INIT;
	}
#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
static int
ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
		  u8 tos, struct device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = dev->ip_ptr;
	u32 itag = 0;

	/* Primary sanity checks. */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
		return -EINVAL;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			return -EINVAL;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
		return -EINVAL;

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->fwmark;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= daddr;
	rth->rt_src_map	= saddr;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	rth->key.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif

	hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			u8 tos, struct device *dev)
{
	struct rt_key	key;
	struct fib_result res;
	struct in_device *in_dev = dev->ip_ptr;
	struct in_device *out_dev;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err;

	/*
	 *	IP on this device is disabled.
	 */

	if (!in_dev)
		return -EINVAL;

	key.dst		= daddr;
	key.src		= saddr;
	key.tos		= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	key.fwmark	= skb->fwmark;
#endif
	key.iif		= dev->ifindex;
	key.oif		= 0;
	key.scope	= RT_SCOPE_UNIVERSE;

	hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&key, &res))) {
		if (!IN_DEV_FORWARD(in_dev))
			return -EINVAL;
		goto no_route;
	}

#ifdef CONFIG_IP_ROUTE_NAT
	/* Policy is applied before mapping destination,
	   but rerouting after map should be made with old source.
	 */

	if (1) {
		u32 src_map = saddr;
		if (res.r)
			src_map = fib_rules_policy(saddr, &res, &flags);

		if (res.type == RTN_NAT) {
			key.dst = fib_rules_map_destination(daddr, &res);
			if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
				return -EINVAL;
		}
		key.src = src_map;
	}
#endif

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		return -EINVAL;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
#endif
	out_dev = FIB_RES_DEV(res)->ip_ptr;
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
	if (err < 0)
		goto martian_source;

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev)
	     || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != __constant_htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags&RTCF_DNAT))
			return -EINVAL;
	}

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->fwmark;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_src_map	= key.src;
	rth->rt_dst_map	= key.dst;
	if (flags&RTCF_DNAT)
		rth->rt_gateway	= key.dst;
#endif
	rth->rt_iif	=
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= out_dev->dev;
	rth->key.oif	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, &res, itag);

	rth->rt_flags = flags;

#ifdef CONFIG_NET_FASTROUTE
	if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
		struct device *odev = rth->u.dst.dev;
		if (odev != dev &&
		    dev->accept_fastpath &&
		    odev->mtu >= dev->mtu &&
		    dev->accept_fastpath(dev, &rth->u.dst) == 0)
			rth->rt_flags |= RTCF_FAST;
	}
#endif

	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

brd_input:
	if (skb->protocol != __constant_htons(ETH_P_IP))
		return -EINVAL;

	if (ZERONET(saddr)) {
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;

local_input:
	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->rt_dst	= daddr;
	rth->key.tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->key.fwmark	= skb->fwmark;
#endif
	rth->key.src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
#endif
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->key.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	rth->key.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);

no_route:
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
#endif
	return -EINVAL;

martian_source:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header:");
			for (i = 0; i < dev->hard_header_len; i++, p++)
				printk(" %02x", *p);
			printk("\n");
		}
	}
#endif
	return -EINVAL;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct device *dev)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_TOS_MASK;
	hash = rt_hash_code(daddr, saddr^(iif<<5), tos);

	read_lock_bh(&rt_hash_lock);
	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == iif &&
		    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->key.fwmark == skb->fwmark &&
#endif
		    rth->key.tos == tos) {
			rth->u.dst.lastuse = jiffies;
			atomic_inc(&rth->u.dst.use);
			atomic_inc(&rth->u.dst.refcnt);
			read_unlock_bh(&rt_hash_lock);
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
	}
	read_unlock_bh(&rt_hash_lock);

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicasting
	   network acquires a lot of useless route cache entries, e.g.
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		int our = ip_check_mc(dev, daddr);
		if (!our
#ifdef CONFIG_IP_MROUTE
		    && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
			!IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
#endif
		    )
			return -EINVAL;
		return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
/*
 * Major route resolver routine.
 */

int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
{
	struct rt_key key;
	struct fib_result res;
	unsigned flags = 0;
	struct rtable *rth;
	struct device *dev_out = NULL;
	unsigned hash;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	u32 nochecksrc = (tos & RTO_TPROXY);
#endif

	tos &= IPTOS_TOS_MASK|RTO_ONLINK;
	key.dst = daddr;
	key.src = saddr;
	key.tos = tos&IPTOS_TOS_MASK;
	key.iif = loopback_dev.ifindex;
	key.oif = oif;
	key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (saddr) {
		if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
			return -EINVAL;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(saddr);
#ifdef CONFIG_IP_TRANSPARENT_PROXY
		/* If address is not local, test for transparent proxy flag;
		   if address is local --- clear the flag.
		 */
		if (dev_out == NULL) {
			if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
				return -EINVAL;
			flags |= RTCF_TPROXY;
		}
#else
		if (dev_out == NULL)
			return -EINVAL;
#endif

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for several reasons:
		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
		      assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oif == 0 &&
#ifdef CONFIG_IP_TRANSPARENT_PROXY
		    dev_out &&
#endif
		    (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is a good workaround.
			 */

			key.oif = dev_out->ifindex;
			goto make_route;
		}
		dev_out = NULL;
	}
	if (oif) {
		dev_out = dev_get_by_index(oif);
		if (dev_out == NULL)
			return -ENODEV;
		if (dev_out->ip_ptr == NULL)
			return -ENODEV;	/* Wrong error code */

		if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
			if (!key.src)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
			goto make_route;
		}
		if (!key.src) {
			if (MULTICAST(daddr))
				key.src = inet_select_addr(dev_out, 0, key.scope);
			else if (!daddr)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
		}
	}

	if (!key.dst) {
		key.dst = key.src;
		if (!key.dst)
			key.dst = key.src = htonl(INADDR_LOOPBACK);
		dev_out = &loopback_dev;
		key.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&key, &res)) {
		res.fi = NULL;
		if (oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (!key.src)
				key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		return -ENETUNREACH;
	}

#ifdef CONFIG_IP_ROUTE_NAT
	if (res.type == RTN_NAT)
		return -EINVAL;
#endif

	if (res.type == RTN_LOCAL) {
		if (!key.src)
			key.src = key.dst;
		dev_out = &loopback_dev;
		key.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
	else
#endif
	if (res.prefixlen == 0 && res.type == RTN_UNICAST && key.oif == 0)
		fib_select_default(&key, &res);

	if (!key.src)
		key.src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	key.oif = dev_out->ifindex;

make_route:
	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (key.dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(key.dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(key.dst) || ZERONET(key.dst))
		return -EINVAL;

	if (dev_out->flags&IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST|RTCF_LOCAL;
		res.fi = NULL;
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(dev_out, daddr))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use
		   the default one, but do not gateway in this case.
		 */
		if (res.fi && res.prefixlen < 4)
			res.fi = NULL;
	}

	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst	= daddr;
	rth->key.tos	= tos;
	rth->key.src	= saddr;
	rth->key.iif	= 0;
	rth->key.oif	= oif;
	rth->rt_dst	= key.dst;
	rth->rt_src	= key.src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map	= key.dst;
	rth->rt_src_map	= key.src;
#endif
	rth->rt_iif	= oif ? : dev_out->ifindex;
	rth->u.dst.dev	= dev_out;
	rth->rt_gateway = key.dst;
	rth->rt_spec_dst= key.src;

	rth->u.dst.output=ip_output;

	if (flags&RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = key.dst;
	}
	if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
		rth->rt_spec_dst = key.src;
		if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
			rth->u.dst.output = ip_mc_output;
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
			struct in_device *in_dev = dev_out->ip_ptr;
			if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, &res, 0);

	rth->rt_flags = flags;

	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
	return rt_intern_hash(hash, rth, rp);
}
int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);

	read_lock_bh(&rt_hash_lock);
	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == 0 &&
		    rth->key.oif == oif &&
#ifndef CONFIG_IP_TRANSPARENT_PROXY
		    rth->key.tos == tos
#else
		    !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
		    ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
#endif
		) {
			rth->u.dst.lastuse = jiffies;
			atomic_inc(&rth->u.dst.use);
			atomic_inc(&rth->u.dst.refcnt);
			read_unlock_bh(&rt_hash_lock);
			*rp = rth;
			return 0;
		}
	}
	read_unlock_bh(&rt_hash_lock);

	return ip_route_output_slow(rp, daddr, saddr, tos, oif);
}
#ifdef CONFIG_RTNETLINK

static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif
	struct rtattr *mx;

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->key.tos;
	r->rtm_table = RT_TABLE_MAIN;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->key.src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
	if (rt->key.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->key.src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	mx = (struct rtattr*)skb->tail;
	RTA_PUT(skb, RTA_METRICS, 0, NULL);
	if (rt->u.dst.mxlock)
		RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
	if (rt->u.dst.pmtu)
		RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
	if (rt->u.dst.window)
		RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
	if (rt->u.dst.rtt)
		RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
	mx->rta_len = skb->tail - (u8*)mx;
	if (mx->rta_len == RTA_LENGTH(0))
		skb_trim(skb, (u8*)mx - skb->data);
	ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
	ci.rta_used = atomic_read(&rt->u.dst.refcnt);
	ci.rta_clntref = atomic_read(&rt->u.dst.use);
	if (rt->u.dst.expires)
		ci.rta_expires = rt->u.dst.expires - jiffies;
	else
		ci.rta_expires = 0;
	ci.rta_error = rt->u.dst.error;
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->key.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nlmsg_failure;
					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
				}
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC-1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
	if (rta[RTA_DST-1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
	if (rta[RTA_IIF-1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));

	if (iif) {
		struct device *dev;
		dev = dev_get_by_index(iif);
		if (!dev)
			return -ENODEV;
		skb->protocol = __constant_htons(ETH_P_IP);
		skb->dev = dev;
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		rt = (struct rtable*)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		int oif = 0;
		if (rta[RTA_OIF-1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
		err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
	}
	if (err) {
		kfree_skb(skb);
		return err;
	}

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
	if (err < 0) {
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err < 0)
		return err;
	return 0;
}
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h < RT_HASH_DIVISOR; h++) {
		if (h < s_h) continue;
		if (h > s_h)
			s_idx = 0;
		read_lock_bh(&rt_hash_lock);
		for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				read_unlock_bh(&rt_hash_lock);
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		read_unlock_bh(&rt_hash_lock);
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

#endif /* CONFIG_RTNETLINK */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
#ifdef CONFIG_SYSCTL

static int flush_delay;

static
int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
			      void *buffer, size_t *lenp)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp);
		rt_cache_flush(flush_delay);
		return 0;
	} else
		return -EINVAL;
}
ctl_table ipv4_route_table[] = {
	{NET_IPV4_ROUTE_FLUSH, "flush",
	 &flush_delay, sizeof(int), 0200, NULL,
	 &ipv4_sysctl_rtcache_flush},
	{NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
	 &ip_rt_min_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
	 &ip_rt_max_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
	 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_MAX_SIZE, "max_size",
	 &ip_rt_max_size, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
	 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
	 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
	 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
	 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
	 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
	 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_ERROR_COST, "error_cost",
	 &ip_rt_error_cost, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
	 &ip_rt_error_burst, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
	 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
	 &proc_dointvec},
	{NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
	 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{0}
};
#endif
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct ip_rt_acct[256];
rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED;

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	*start = buffer;

	if (offset + length > sizeof(ip_rt_acct)) {
		length = sizeof(ip_rt_acct) - offset;
		*eof = 1;
	}
	if (length > 0) {
		read_lock_bh(&ip_rt_acct_lock);
		memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
		read_unlock_bh(&ip_rt_acct_lock);
		return length;
	}
	return 0;
}
#endif
#endif
__initfunc(void ip_rt_init(void))
{
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NET_CLS_ROUTE
	struct proc_dir_entry *ent;
#endif
#endif
	devinet_init();
	ip_fib_init();

	rt_periodic_timer.function = rt_check_expire;
	/* All the timers started at system startup tend
	   to synchronize.  Perturb it a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
				    + ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

#ifdef CONFIG_PROC_FS
	proc_net_register(&(struct proc_dir_entry) {
		PROC_NET_RTCACHE, 8, "rt_cache",
		S_IFREG | S_IRUGO, 1, 0, 0,
		0, &proc_net_inode_operations,
		rt_cache_get_info
	});
#ifdef CONFIG_NET_CLS_ROUTE
	ent = create_proc_entry("net/rt_acct", 0, 0);
	ent->read_proc = ip_rt_acct_read;
#endif
#endif
}