2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.67 1999/05/08 20:00:20 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
56 * This program is free software; you can redistribute it and/or
57 * modify it under the terms of the GNU General Public License
58 * as published by the Free Software Foundation; either version
59 * 2 of the License, or (at your option) any later version.
62 #include <linux/config.h>
63 #include <asm/uaccess.h>
64 #include <asm/system.h>
65 #include <asm/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/sched.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/rtnetlink.h>
81 #include <linux/inetdevice.h>
82 #include <linux/igmp.h>
83 #include <linux/pkt_sched.h>
84 #include <linux/mroute.h>
85 #include <net/protocol.h>
87 #include <net/route.h>
89 #include <net/ip_fib.h>
94 #include <linux/sysctl.h>
/* Hard ceiling for any path MTU stored in a route; rt_set_nexthop()
 * clamps dst.pmtu to this value. */
97 #define IP_MAX_MTU 0xFFF0
/* Default expiry period for unused route cache entries. */
99 #define RT_GC_TIMEOUT (300*HZ)
/* Minimum/maximum delay (jiffies) before a requested cache flush
 * actually fires; consumed by rt_cache_flush(). */
101 int ip_rt_min_delay
= 2*HZ
;
102 int ip_rt_max_delay
= 10*HZ
;
/* GC pressure threshold; rt_garbage_collect() compares the dst cache
 * entry count against ipv4_dst_ops.gc_thresh. */
103 int ip_rt_gc_thresh
= RT_HASH_DIVISOR
;
/* Cache considered dangerously full beyond this many entries. */
104 int ip_rt_max_size
= RT_HASH_DIVISOR
*16;
/* Base expiry age for cache entries (see rt_check_expire()). */
105 int ip_rt_gc_timeout
= RT_GC_TIMEOUT
;
/* Period of the rt_periodic_timer expiry scan. */
106 int ip_rt_gc_interval
= 60*HZ
;
/* Minimum spacing between two full garbage-collection passes. */
107 int ip_rt_gc_min_interval
= 5*HZ
;
/* ICMP redirect rate limiting (ip_rt_send_redirect()): stop after
 * this many ignored redirects ... */
108 int ip_rt_redirect_number
= 9;
/* ... back off exponentially starting from this load ... */
109 int ip_rt_redirect_load
= HZ
/50;
/* ... and reset the backoff after this long without redirected
 * traffic. */
110 int ip_rt_redirect_silence
= ((HZ
/50) << (9+1));
/* Token cost/bucket size for rate-limiting ICMP errors (ip_error()). */
111 int ip_rt_error_cost
= HZ
;
112 int ip_rt_error_burst
= 5*HZ
;
/* Average permitted chain length before GC treats the cache as
 * oversized (rt_garbage_collect()). */
113 int ip_rt_gc_elasticity
= 8;
/* Lifetime of a learned (lowered) path MTU before it is re-probed. */
114 int ip_rt_mtu_expires
= 10*60*HZ
;
/* Deadline (jiffies) by which a pending delayed flush must run;
 * protected by rt_flush_lock in rt_cache_flush(). */
116 static unsigned long rt_deadline
= 0;
/* Debug helper: emit a routing message at KERN_DEBUG level. */
118 #define RTprint(a...) printk(KERN_DEBUG a)
120 static void rt_run_flush(unsigned long dummy
);
/* One-shot timer armed by rt_cache_flush(); fires rt_run_flush(). */
122 static struct timer_list rt_flush_timer
=
123 { NULL
, NULL
, 0, 0L, rt_run_flush
};
/* Periodic expiry timer, re-armed from rt_check_expire().  Its handler
 * slot is NULL here -- presumably installed during init, which is not
 * visible in this extraction; confirm against the full source. */
124 static struct timer_list rt_periodic_timer
=
125 { NULL
, NULL
, 0, 0L, NULL
};
128 * Interface to generic destination cache.
/* Forward declarations for the ipv4 dst_ops callbacks defined below. */
131 static struct dst_entry
* ipv4_dst_check(struct dst_entry
* dst
, u32
);
132 static struct dst_entry
* ipv4_dst_reroute(struct dst_entry
* dst
,
134 static struct dst_entry
* ipv4_negative_advice(struct dst_entry
*);
135 static void ipv4_link_failure(struct sk_buff
*skb
);
136 static int rt_garbage_collect(void);
/* dst_ops vtable for IPv4 routes.  NOTE(review): most initializer
 * fields (original lines 140-151) are missing from this extraction;
 * only the protocol tag and the negative_advice slot survive here. */
139 struct dst_ops ipv4_dst_ops
=
142 __constant_htons(ETH_P_IP
),
149 ipv4_negative_advice
,
/* IP TOS nibble -> packet scheduler priority map.  NOTE(review): most
 * of the 16 entries (original lines 154-175) are missing from this
 * extraction. */
153 __u8 ip_tos2prio
[16] = {
166 TC_PRIO_INTERACTIVE_BULK
,
168 TC_PRIO_INTERACTIVE_BULK
,
/* The route cache itself: hash buckets of rtable chains, guarded by
 * rt_hash_lock (read side in the lookup fast path, write side for
 * insertion and expiry). */
177 static struct rtable
*rt_hash_table
[RT_HASH_DIVISOR
];
178 static rwlock_t rt_hash_lock
= RW_LOCK_UNLOCKED
;
180 static int rt_intern_hash(unsigned hash
, struct rtable
* rth
, struct rtable
** res
);
182 static __inline__
unsigned rt_hash_code(u32 daddr
, u32 saddr
, u8 tos
)
184 unsigned hash
= ((daddr
&0xF0F0F0F0)>>4)|((daddr
&0x0F0F0F0F)<<4);
185 hash
= hash
^saddr
^tos
;
186 hash
= hash
^(hash
>>16);
187 return (hash
^(hash
>>8)) & 0xFF;
190 #ifdef CONFIG_PROC_FS
192 static int rt_cache_get_info(char *buffer
, char **start
, off_t offset
, int length
, int dummy
)
203 sprintf(buffer
,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
208 read_lock_bh(&rt_hash_lock
);
210 for (i
= 0; i
<RT_HASH_DIVISOR
; i
++) {
211 for (r
= rt_hash_table
[i
]; r
; r
= r
->u
.rt_next
) {
213 * Spin through entries until we are ready
221 sprintf(temp
, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
222 r
->u
.dst
.dev
? r
->u
.dst
.dev
->name
: "*",
223 (unsigned long)r
->rt_dst
,
224 (unsigned long)r
->rt_gateway
,
226 atomic_read(&r
->u
.dst
.use
),
227 atomic_read(&r
->u
.dst
.refcnt
),
229 (unsigned long)r
->rt_src
, (int)r
->u
.dst
.pmtu
,
231 (int)r
->u
.dst
.rtt
, r
->key
.tos
,
232 r
->u
.dst
.hh
? atomic_read(&r
->u
.dst
.hh
->hh_refcnt
) : -1,
233 r
->u
.dst
.hh
? (r
->u
.dst
.hh
->hh_output
== dev_queue_xmit
) : 0,
235 sprintf(buffer
+len
,"%-127s\n",temp
);
237 if (pos
>= offset
+length
)
243 read_unlock_bh(&rt_hash_lock
);
245 *start
= buffer
+len
-(pos
-offset
);
253 static __inline__
void rt_free(struct rtable
*rt
)
255 dst_free(&rt
->u
.dst
);
/* Drop a cache entry and free it.  NOTE(review): original lines
 * 259-260 are missing from this extraction; upstream kernels also
 * release the entry's reference (ip_rt_put) before dst_free() here --
 * confirm against the full source before relying on this text. */
258 static __inline__
void rt_drop(struct rtable
*rt
)
261 dst_free(&rt
->u
.dst
);
264 static __inline__
int rt_fast_clean(struct rtable
*rth
)
266 /* Kill broadcast/multicast entries very aggresively, if they
267 collide in hash table with more useful entries */
268 return ((rth
->rt_flags
&(RTCF_BROADCAST
|RTCF_MULTICAST
))
269 && rth
->key
.iif
&& rth
->u
.rt_next
);
272 static __inline__
int rt_valuable(struct rtable
*rth
)
274 return ((rth
->rt_flags
&(RTCF_REDIRECTED
|RTCF_NOTIFY
))
275 || rth
->u
.dst
.expires
);
/*
 * May this cache entry be expired?  tmo1 is the short age threshold
 * applied to "fast clean" (broadcast/multicast) entries, tmo2 the
 * longer one shielding valuable entries (redirects, pending expiry).
 * NOTE(review): the per-branch return statements (original lines
 * 279-295) are missing from this extraction; confirm the exact
 * results against the full source.
 */
278 static __inline__
int rt_may_expire(struct rtable
*rth
, int tmo1
, int tmo2
)
/* Entry still referenced: must not be expired. */
282 if (atomic_read(&rth
->u
.dst
.use
))
/* Hard expiry already passed: entry may always go. */
285 if (rth
->u
.dst
.expires
&& (long)(rth
->u
.dst
.expires
- jiffies
) <= 0)
/* Otherwise the age since last use decides. */
288 age
= jiffies
- rth
->u
.dst
.lastuse
;
289 if (age
<= tmo1
&& !rt_fast_clean(rth
))
291 if (age
<= tmo2
&& rt_valuable(rth
))
296 static void rt_check_expire(unsigned long dummy
)
300 struct rtable
*rth
, **rthp
;
301 unsigned long now
= jiffies
;
303 for (i
=0; i
<RT_HASH_DIVISOR
/5; i
++) {
304 unsigned tmo
= ip_rt_gc_timeout
;
306 rover
= (rover
+ 1) & (RT_HASH_DIVISOR
-1);
307 rthp
= &rt_hash_table
[rover
];
309 write_lock_bh(&rt_hash_lock
);
310 while ((rth
= *rthp
) != NULL
) {
311 if (rth
->u
.dst
.expires
) {
312 /* Entrie is expired even if it is in use */
313 if ((long)(now
- rth
->u
.dst
.expires
) <= 0) {
315 rthp
= &rth
->u
.rt_next
;
318 } else if (!rt_may_expire(rth
, tmo
, ip_rt_gc_timeout
)) {
320 rthp
= &rth
->u
.rt_next
;
325 * Cleanup aged off entries.
327 *rthp
= rth
->u
.rt_next
;
330 write_unlock_bh(&rt_hash_lock
);
332 /* Fallback loop breaker. */
333 if ((jiffies
- now
) > 0)
336 rt_periodic_timer
.expires
= now
+ ip_rt_gc_interval
;
337 add_timer(&rt_periodic_timer
);
/*
 * rt_flush_timer handler: empty the entire routing cache.  Each bucket
 * chain is detached under rt_hash_lock, then freed with the lock
 * dropped so freeing does not stall other CPUs.  NOTE(review): several
 * interior lines (locals, the per-entry free call, original lines
 * 341-346, 350-351, 354, 358-360, 362) are missing from this
 * extraction.
 */
340 static void rt_run_flush(unsigned long dummy
)
343 struct rtable
* rth
, * next
;
347 write_lock_bh(&rt_hash_lock
);
348 for (i
=0; i
<RT_HASH_DIVISOR
; i
++) {
/* Detach the whole chain, then release the lock while freeing it. */
349 rth
= rt_hash_table
[i
];
352 rt_hash_table
[i
] = NULL
;
353 write_unlock_bh(&rt_hash_lock
);
355 for (; rth
; rth
=next
) {
356 next
= rth
->u
.rt_next
;
357 rth
->u
.rt_next
= NULL
;
361 write_lock_bh(&rt_hash_lock
);
363 write_unlock_bh(&rt_hash_lock
);
/* Serializes rt_flush_timer re-arming and rt_deadline updates. */
366 static spinlock_t rt_flush_lock
= SPIN_LOCK_UNLOCKED
;
/*
 * Request a flush of the routing cache "delay" jiffies from now.
 * The delay is normalized to ip_rt_min_delay (the guarding condition,
 * original line 373, is missing from this extraction), and rt_deadline
 * together with ip_rt_max_delay bounds how far repeated requests may
 * keep postponing the flush.  NOTE(review): the immediate-flush path
 * and parts of the deadline arithmetic are missing here; confirm
 * against the full source.
 */
368 void rt_cache_flush(int delay
)
370 unsigned long now
= jiffies
;
/* Remember whether we were called from process context. */
371 int user_mode
= !in_interrupt();
374 delay
= ip_rt_min_delay
;
376 spin_lock_bh(&rt_flush_lock
);
378 if (del_timer(&rt_flush_timer
) && delay
> 0 && rt_deadline
) {
379 long tmo
= (long)(rt_deadline
- now
);
381 /* If flush timer is already running
382 and flush request is not immediate (delay > 0):
384 if deadline is not achieved, prolong timer to "delay",
385 otherwise fire it at deadline time.
388 if (user_mode
&& tmo
< ip_rt_max_delay
-ip_rt_min_delay
)
396 spin_unlock_bh(&rt_flush_lock
);
/* First delayed request: fix the latest time the flush may run. */
401 if (rt_deadline
== 0)
402 rt_deadline
= now
+ ip_rt_max_delay
;
404 rt_flush_timer
.expires
= now
+ delay
;
405 add_timer(&rt_flush_timer
);
406 spin_unlock_bh(&rt_flush_lock
);
410 Short description of GC goals.
412 We want to build algorithm, which will keep routing cache
413 at some equilibrium point, when number of aged off entries
414 is kept approximately equal to newly generated ones.
416 Current expiration strength is variable "expire".
417 We try to adjust it dynamically, so that if networking
418 is idle expires is large enough to keep enough of warm entries,
419 and when load increases it reduces to limit cache size.
422 static int rt_garbage_collect(void)
424 static unsigned expire
= RT_GC_TIMEOUT
;
425 static unsigned long last_gc
;
427 static int equilibrium
;
428 struct rtable
*rth
, **rthp
;
429 unsigned long now
= jiffies
;
433 * Garbage collection is pretty expensive,
434 * do not make it too frequently.
436 if (now
- last_gc
< ip_rt_gc_min_interval
&&
437 atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
440 /* Calculate number of entries, which we want to expire now. */
441 goal
= atomic_read(&ipv4_dst_ops
.entries
) - RT_HASH_DIVISOR
*ip_rt_gc_elasticity
;
443 if (equilibrium
< ipv4_dst_ops
.gc_thresh
)
444 equilibrium
= ipv4_dst_ops
.gc_thresh
;
445 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
447 equilibrium
+= min(goal
/2, RT_HASH_DIVISOR
);
448 goal
= atomic_read(&ipv4_dst_ops
.entries
) - equilibrium
;
451 /* We are in dangerous area. Try to reduce cache really
454 goal
= max(goal
/2, RT_HASH_DIVISOR
);
455 equilibrium
= atomic_read(&ipv4_dst_ops
.entries
) - goal
;
458 if (now
- last_gc
>= ip_rt_gc_min_interval
)
469 write_lock_bh(&rt_hash_lock
);
470 for (i
=0, k
=rover
; i
<RT_HASH_DIVISOR
; i
++) {
471 unsigned tmo
= expire
;
473 k
= (k
+ 1) & (RT_HASH_DIVISOR
-1);
474 rthp
= &rt_hash_table
[k
];
475 while ((rth
= *rthp
) != NULL
) {
476 if (!rt_may_expire(rth
, tmo
, expire
)) {
478 rthp
= &rth
->u
.rt_next
;
481 *rthp
= rth
->u
.rt_next
;
482 rth
->u
.rt_next
= NULL
;
490 write_unlock_bh(&rt_hash_lock
);
495 /* Goal is not achieved. We stop process if:
497 - if expire reduced to zero. Otherwise, expire is halfed.
498 - if table is not full.
499 - if we are called from interrupt.
500 - jiffies check is just fallback/debug loop breaker.
501 We will not spin here for long time in any case.
508 #if RT_CACHE_DEBUG >= 2
509 printk(KERN_DEBUG
"expire>> %u %d %d %d\n", expire
, atomic_read(&ipv4_dst_ops
.entries
), goal
, i
);
512 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
514 } while (!in_interrupt() && jiffies
- now
< 1);
516 if (atomic_read(&ipv4_dst_ops
.entries
) < ip_rt_max_size
)
519 printk("dst cache overflow\n");
523 expire
+= ip_rt_gc_min_interval
;
524 if (expire
> ip_rt_gc_timeout
||
525 atomic_read(&ipv4_dst_ops
.entries
) < ipv4_dst_ops
.gc_thresh
)
526 expire
= ip_rt_gc_timeout
;
527 #if RT_CACHE_DEBUG >= 2
528 printk(KERN_DEBUG
"expire++ %u %d %d %d\n", expire
, atomic_read(&ipv4_dst_ops
.entries
), goal
, rover
);
533 static int rt_intern_hash(unsigned hash
, struct rtable
* rt
, struct rtable
** rp
)
535 struct rtable
*rth
, **rthp
;
536 unsigned long now
= jiffies
;
537 int attempts
= !in_interrupt();
540 rthp
= &rt_hash_table
[hash
];
542 write_lock_bh(&rt_hash_lock
);
543 while ((rth
= *rthp
) != NULL
) {
544 if (memcmp(&rth
->key
, &rt
->key
, sizeof(rt
->key
)) == 0) {
546 *rthp
= rth
->u
.rt_next
;
547 rth
->u
.rt_next
= rt_hash_table
[hash
];
548 rt_hash_table
[hash
] = rth
;
550 atomic_inc(&rth
->u
.dst
.refcnt
);
551 atomic_inc(&rth
->u
.dst
.use
);
552 rth
->u
.dst
.lastuse
= now
;
553 write_unlock_bh(&rt_hash_lock
);
560 rthp
= &rth
->u
.rt_next
;
563 /* Try to bind route to arp only if it is output
564 route or unicast forwarding path.
566 if (rt
->rt_type
== RTN_UNICAST
|| rt
->key
.iif
== 0) {
567 if (!arp_bind_neighbour(&rt
->u
.dst
)) {
568 write_unlock_bh(&rt_hash_lock
);
570 /* Neighbour tables are full and nothing
571 can be released. Try to shrink route cache,
572 it is most likely it holds some neighbour records.
574 if (attempts
-- > 0) {
575 int saved_elasticity
= ip_rt_gc_elasticity
;
576 int saved_int
= ip_rt_gc_min_interval
;
577 ip_rt_gc_elasticity
= 1;
578 ip_rt_gc_min_interval
= 0;
579 rt_garbage_collect();
580 ip_rt_gc_min_interval
= saved_int
;
581 ip_rt_gc_elasticity
= saved_elasticity
;
587 printk("neighbour table overflow\n");
592 rt
->u
.rt_next
= rt_hash_table
[hash
];
593 #if RT_CACHE_DEBUG >= 2
596 printk("rt_cache @%02x: %08x", hash
, rt
->rt_dst
);
597 for (trt
=rt
->u
.rt_next
; trt
; trt
=trt
->u
.rt_next
)
598 printk(" . %08x", trt
->rt_dst
);
602 rt_hash_table
[hash
] = rt
;
604 write_unlock_bh(&rt_hash_lock
);
608 void ip_rt_redirect(u32 old_gw
, u32 daddr
, u32 new_gw
,
609 u32 saddr
, u8 tos
, struct device
*dev
)
612 struct in_device
*in_dev
= dev
->ip_ptr
;
613 struct rtable
*rth
, **rthp
;
614 u32 skeys
[2] = { saddr
, 0 };
615 int ikeys
[2] = { dev
->ifindex
, 0 };
617 tos
&= IPTOS_TOS_MASK
;
622 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
)
623 || MULTICAST(new_gw
) || BADCLASS(new_gw
) || ZERONET(new_gw
))
624 goto reject_redirect
;
626 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
627 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
628 goto reject_redirect
;
629 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
630 goto reject_redirect
;
632 if (inet_addr_type(new_gw
) != RTN_UNICAST
)
633 goto reject_redirect
;
636 for (i
=0; i
<2; i
++) {
637 for (k
=0; k
<2; k
++) {
638 unsigned hash
= rt_hash_code(daddr
, skeys
[i
]^(ikeys
[k
]<<5), tos
);
640 rthp
=&rt_hash_table
[hash
];
642 write_lock_bh(&rt_hash_lock
);
643 while ( (rth
= *rthp
) != NULL
) {
646 if (rth
->key
.dst
!= daddr
||
647 rth
->key
.src
!= skeys
[i
] ||
648 rth
->key
.tos
!= tos
||
649 rth
->key
.oif
!= ikeys
[k
] ||
651 rthp
= &rth
->u
.rt_next
;
655 if (rth
->rt_dst
!= daddr
||
656 rth
->rt_src
!= saddr
||
658 rth
->rt_gateway
!= old_gw
||
659 rth
->u
.dst
.dev
!= dev
)
662 dst_clone(&rth
->u
.dst
);
664 rt
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
667 write_unlock_bh(&rt_hash_lock
);
672 * Copy all the information.
675 atomic_set(&rt
->u
.dst
.refcnt
, 1);
676 atomic_set(&rt
->u
.dst
.use
, 1);
677 rt
->u
.dst
.lastuse
= jiffies
;
678 rt
->u
.dst
.neighbour
= NULL
;
681 rt
->rt_flags
|= RTCF_REDIRECTED
;
683 /* Gateway is different ... */
684 rt
->rt_gateway
= new_gw
;
686 /* Redirect received -> path was valid */
687 dst_confirm(&rth
->u
.dst
);
689 if (!arp_bind_neighbour(&rt
->u
.dst
) ||
690 !(rt
->u
.dst
.neighbour
->nud_state
&NUD_VALID
)) {
691 if (rt
->u
.dst
.neighbour
)
692 neigh_event_send(rt
->u
.dst
.neighbour
, NULL
);
698 *rthp
= rth
->u
.rt_next
;
699 if (!rt_intern_hash(hash
, rt
, &rt
))
704 write_unlock_bh(&rt_hash_lock
);
710 #ifdef CONFIG_IP_ROUTE_VERBOSE
711 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
712 printk(KERN_INFO
"Redirect from %lX/%s to %lX ignored."
713 "Path = %lX -> %lX, tos %02x\n",
714 ntohl(old_gw
), dev
->name
, ntohl(new_gw
),
715 ntohl(saddr
), ntohl(daddr
), tos
);
719 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
721 struct rtable
*rt
= (struct rtable
*)dst
;
728 if ((rt
->rt_flags
&RTCF_REDIRECTED
) || rt
->u
.dst
.expires
) {
729 unsigned hash
= rt_hash_code(rt
->key
.dst
, rt
->key
.src
^(rt
->key
.oif
<<5), rt
->key
.tos
);
730 struct rtable
**rthp
;
731 #if RT_CACHE_DEBUG >= 1
732 printk(KERN_DEBUG
"ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt
->rt_dst
), rt
->key
.tos
);
735 write_lock_bh(&rt_hash_lock
);
736 for (rthp
= &rt_hash_table
[hash
]; *rthp
; rthp
= &(*rthp
)->u
.rt_next
) {
738 *rthp
= rt
->u
.rt_next
;
743 write_unlock_bh(&rt_hash_lock
);
752 * 1. The first ip_rt_redirect_number redirects are sent
753 * with exponential backoff, then we stop sending them at all,
754 * assuming that the host ignores our redirects.
755 * 2. If we did not see packets requiring redirects
756 * during ip_rt_redirect_silence, we assume that the host
757 * forgot redirected route and start to send redirects again.
759 * This algorithm is much cheaper and more intelligent than dumb load limiting
762 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
763 * and "frag. need" (breaks PMTU discovery) in icmp.c.
766 void ip_rt_send_redirect(struct sk_buff
*skb
)
768 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
769 struct in_device
*in_dev
= (struct in_device
*)rt
->u
.dst
.dev
->ip_ptr
;
771 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
))
774 /* No redirected packets during ip_rt_redirect_silence;
775 * reset the algorithm.
777 if (jiffies
- rt
->u
.dst
.rate_last
> ip_rt_redirect_silence
)
778 rt
->u
.dst
.rate_tokens
= 0;
780 /* Too many ignored redirects; do not send anything
781 * set u.dst.rate_last to the last seen redirected packet.
783 if (rt
->u
.dst
.rate_tokens
>= ip_rt_redirect_number
) {
784 rt
->u
.dst
.rate_last
= jiffies
;
788 /* Check for load limit; set rate_last to the latest sent
791 if (jiffies
- rt
->u
.dst
.rate_last
> (ip_rt_redirect_load
<<rt
->u
.dst
.rate_tokens
)) {
792 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, rt
->rt_gateway
);
793 rt
->u
.dst
.rate_last
= jiffies
;
794 ++rt
->u
.dst
.rate_tokens
;
795 #ifdef CONFIG_IP_ROUTE_VERBOSE
796 if (IN_DEV_LOG_MARTIANS(in_dev
) &&
797 rt
->u
.dst
.rate_tokens
== ip_rt_redirect_number
&& net_ratelimit())
798 printk(KERN_WARNING
"host %08x/if%d ignores redirects for %08x to %08x.\n",
799 rt
->rt_src
, rt
->rt_iif
, rt
->rt_dst
, rt
->rt_gateway
);
804 static int ip_error(struct sk_buff
*skb
)
806 struct rtable
*rt
= (struct rtable
*)skb
->dst
;
810 switch (rt
->u
.dst
.error
) {
816 code
= ICMP_HOST_UNREACH
;
819 code
= ICMP_NET_UNREACH
;
822 code
= ICMP_PKT_FILTERED
;
827 if ((rt
->u
.dst
.rate_tokens
+= (now
- rt
->u
.dst
.rate_last
)) > ip_rt_error_burst
)
828 rt
->u
.dst
.rate_tokens
= ip_rt_error_burst
;
829 rt
->u
.dst
.rate_last
= now
;
830 if (rt
->u
.dst
.rate_tokens
>= ip_rt_error_cost
) {
831 rt
->u
.dst
.rate_tokens
-= ip_rt_error_cost
;
832 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
840 * The last two values are not from the RFC but
841 * are needed for AMPRnet AX.25 paths.
/* RFC 1191 MTU plateau table, in decreasing order. */
844 static unsigned short mtu_plateau
[] =
845 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
/*
 * Return the largest plateau value strictly below old_mtu -- the next
 * path MTU guess when a Frag-Needed ICMP carries no MTU value.
 * NOTE(review): the declaration of "i" and the final fallback return
 * for old_mtu <= 128 (original lines 848-850, 854-856) are missing
 * from this extraction.
 */
847 static __inline__
unsigned short guess_mtu(unsigned short old_mtu
)
851 for (i
= 0; i
< sizeof(mtu_plateau
)/sizeof(mtu_plateau
[0]); i
++)
852 if (old_mtu
> mtu_plateau
[i
])
853 return mtu_plateau
[i
];
857 unsigned short ip_rt_frag_needed(struct iphdr
*iph
, unsigned short new_mtu
)
860 unsigned short old_mtu
= ntohs(iph
->tot_len
);
862 u32 skeys
[2] = { iph
->saddr
, 0, };
863 u32 daddr
= iph
->daddr
;
864 u8 tos
= iph
->tos
& IPTOS_TOS_MASK
;
865 unsigned short est_mtu
= 0;
867 if (ipv4_config
.no_pmtu_disc
)
870 for (i
=0; i
<2; i
++) {
871 unsigned hash
= rt_hash_code(daddr
, skeys
[i
], tos
);
873 read_lock_bh(&rt_hash_lock
);
874 for (rth
= rt_hash_table
[hash
]; rth
; rth
= rth
->u
.rt_next
) {
875 if (rth
->key
.dst
== daddr
&&
876 rth
->key
.src
== skeys
[i
] &&
877 rth
->rt_dst
== daddr
&&
878 rth
->rt_src
== iph
->saddr
&&
879 rth
->key
.tos
== tos
&&
881 !(rth
->u
.dst
.mxlock
&(1<<RTAX_MTU
))) {
882 unsigned short mtu
= new_mtu
;
884 if (new_mtu
< 68 || new_mtu
>= old_mtu
) {
886 /* BSD 4.2 compatibility hack :-( */
887 if (mtu
== 0 && old_mtu
>= rth
->u
.dst
.pmtu
&&
888 old_mtu
>= 68 + (iph
->ihl
<<2))
889 old_mtu
-= iph
->ihl
<<2;
891 mtu
= guess_mtu(old_mtu
);
893 if (mtu
<= rth
->u
.dst
.pmtu
) {
894 if (mtu
< rth
->u
.dst
.pmtu
) {
895 dst_confirm(&rth
->u
.dst
);
896 rth
->u
.dst
.pmtu
= mtu
;
897 dst_set_expires(&rth
->u
.dst
, ip_rt_mtu_expires
);
903 read_unlock_bh(&rt_hash_lock
);
905 return est_mtu
? : new_mtu
;
/*
 * dst_ops hook: update the cached path MTU.  Only ever lowers the MTU,
 * rejects values below 68 (IPv4 minimum per RFC 791), and respects a
 * metrics lock on RTAX_MTU.  NOTE(review): the assignment of the new
 * mtu (original line 912) is missing from this extraction;
 * dst_set_expires() schedules the entry to re-probe after
 * ip_rt_mtu_expires jiffies.
 */
908 void ip_rt_update_pmtu(struct dst_entry
*dst
, unsigned mtu
)
910 if (dst
->pmtu
> mtu
&& mtu
>= 68 &&
911 !(dst
->mxlock
&(1<<RTAX_MTU
))) {
913 dst_set_expires(dst
, ip_rt_mtu_expires
);
917 static struct dst_entry
* ipv4_dst_check(struct dst_entry
* dst
, u32 cookie
)
923 static struct dst_entry
* ipv4_dst_reroute(struct dst_entry
* dst
,
/*
 * dst_ops link-failure hook: report host-unreachable to the sender and
 * expire the attached route immediately (dst_set_expires(..., 0)).
 * NOTE(review): the declaration of "rt" and the NULL check before the
 * dereference (original lines 930-932, 936) are missing from this
 * extraction.
 */
929 static void ipv4_link_failure(struct sk_buff
*skb
)
933 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
935 rt
= (struct rtable
*) skb
->dst
;
937 dst_set_expires(&rt
->u
.dst
, 0);
/*
 * Output handler installed on routes that must never be used for
 * transmission (e.g. multicast input routes, see ip_route_input_mc);
 * logs the offending addresses at KERN_DEBUG.  NOTE(review): the
 * kfree_skb()/return tail (original lines 944-946) is missing from
 * this extraction.
 */
940 static int ip_rt_bug(struct sk_buff
*skb
)
942 printk(KERN_DEBUG
"ip_rt_bug: %08x -> %08x, %s\n", skb
->nh
.iph
->saddr
,
943 skb
->nh
.iph
->daddr
, skb
->dev
? skb
->dev
->name
: "?");
949 We do not cache source address of outgoing interface,
950 because it is used only by IP RR, TS and SRR options,
951 so that it is out of the fast path.
953 BTW remember: "addr" is allowed to be not aligned
/*
 * Copy the preferred source address for this route into *addr (which
 * may be unaligned, hence the memcpy rather than a direct store).
 * For input routes, the FIB lookup's preferred source is used, with a
 * fallback to an address selected on the route's output device.
 * NOTE(review): the branch taken when iif == 0 (original line 963) and
 * the declaration of "src" are missing from this extraction.
 */
957 void ip_rt_get_source(u8
*addr
, struct rtable
*rt
)
960 struct fib_result res
;
962 if (rt
->key
.iif
== 0)
964 else if (fib_lookup(&rt
->key
, &res
) == 0)
965 src
= FIB_RES_PREFSRC(res
);
967 src
= inet_select_addr(rt
->u
.dst
.dev
, rt
->rt_gateway
, RT_SCOPE_UNIVERSE
);
968 memcpy(addr
, &src
, 4);
971 #ifdef CONFIG_NET_CLS_ROUTE
972 static void set_class_tag(struct rtable
*rt
, u32 tag
)
974 if (!(rt
->u
.dst
.tclassid
&0xFFFF))
975 rt
->u
.dst
.tclassid
|= tag
&0xFFFF;
976 if (!(rt
->u
.dst
.tclassid
&0xFFFF0000))
977 rt
->u
.dst
.tclassid
|= tag
&0xFFFF0000;
981 static void rt_set_nexthop(struct rtable
*rt
, struct fib_result
*res
, u32 itag
)
983 struct fib_info
*fi
= res
->fi
;
986 if (FIB_RES_GW(*res
) && FIB_RES_NH(*res
).nh_scope
== RT_SCOPE_LINK
)
987 rt
->rt_gateway
= FIB_RES_GW(*res
);
988 rt
->u
.dst
.mxlock
= fi
->fib_metrics
[RTAX_LOCK
-1];
989 rt
->u
.dst
.pmtu
= fi
->fib_mtu
;
990 if (fi
->fib_mtu
== 0) {
991 rt
->u
.dst
.pmtu
= rt
->u
.dst
.dev
->mtu
;
992 if (rt
->u
.dst
.pmtu
> IP_MAX_MTU
)
993 rt
->u
.dst
.pmtu
= IP_MAX_MTU
;
994 if (rt
->u
.dst
.mxlock
&(1<<RTAX_MTU
) &&
995 rt
->rt_gateway
!= rt
->rt_dst
&&
996 rt
->u
.dst
.pmtu
> 576)
997 rt
->u
.dst
.pmtu
= 576;
999 rt
->u
.dst
.window
= fi
->fib_window
? : 0;
1000 rt
->u
.dst
.rtt
= fi
->fib_rtt
? : TCP_TIMEOUT_INIT
;
1001 #ifdef CONFIG_NET_CLS_ROUTE
1002 rt
->u
.dst
.tclassid
= FIB_RES_NH(*res
).nh_tclassid
;
1005 rt
->u
.dst
.pmtu
= rt
->u
.dst
.dev
->mtu
;
1006 if (rt
->u
.dst
.pmtu
> IP_MAX_MTU
)
1007 rt
->u
.dst
.pmtu
= IP_MAX_MTU
;
1008 rt
->u
.dst
.window
= 0;
1009 rt
->u
.dst
.rtt
= TCP_TIMEOUT_INIT
;
1011 #ifdef CONFIG_NET_CLS_ROUTE
1012 #ifdef CONFIG_IP_MULTIPLE_TABLES
1013 set_class_tag(rt
, fib_rules_tclass(res
));
1015 set_class_tag(rt
, itag
);
1017 rt
->rt_type
= res
->type
;
1021 ip_route_input_mc(struct sk_buff
*skb
, u32 daddr
, u32 saddr
,
1022 u8 tos
, struct device
*dev
, int our
)
1027 struct in_device
*in_dev
= dev
->ip_ptr
;
1030 /* Primary sanity checks. */
1032 if (MULTICAST(saddr
) || BADCLASS(saddr
) || LOOPBACK(saddr
) ||
1033 in_dev
== NULL
|| skb
->protocol
!= __constant_htons(ETH_P_IP
))
1036 if (ZERONET(saddr
)) {
1037 if (!LOCAL_MCAST(daddr
))
1039 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1040 } else if (fib_validate_source(saddr
, 0, tos
, 0, dev
, &spec_dst
, &itag
) < 0)
1043 rth
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
1047 rth
->u
.dst
.output
= ip_rt_bug
;
1049 atomic_set(&rth
->u
.dst
.use
, 1);
1050 rth
->key
.dst
= daddr
;
1051 rth
->rt_dst
= daddr
;
1053 #ifdef CONFIG_IP_ROUTE_FWMARK
1054 rth
->key
.fwmark
= skb
->fwmark
;
1056 rth
->key
.src
= saddr
;
1057 rth
->rt_src
= saddr
;
1058 #ifdef CONFIG_IP_ROUTE_NAT
1059 rth
->rt_dst_map
= daddr
;
1060 rth
->rt_src_map
= saddr
;
1062 #ifdef CONFIG_NET_CLS_ROUTE
1063 rth
->u
.dst
.tclassid
= itag
;
1066 rth
->key
.iif
= dev
->ifindex
;
1067 rth
->u
.dst
.dev
= &loopback_dev
;
1069 rth
->rt_gateway
= daddr
;
1070 rth
->rt_spec_dst
= spec_dst
;
1071 rth
->rt_type
= RTN_MULTICAST
;
1072 rth
->rt_flags
= RTCF_MULTICAST
;
1074 rth
->u
.dst
.input
= ip_local_deliver
;
1075 rth
->rt_flags
|= RTCF_LOCAL
;
1078 #ifdef CONFIG_IP_MROUTE
1079 if (!LOCAL_MCAST(daddr
) && IN_DEV_MFORWARD(in_dev
))
1080 rth
->u
.dst
.input
= ip_mr_input
;
1083 hash
= rt_hash_code(daddr
, saddr
^(dev
->ifindex
<<5), tos
);
1084 return rt_intern_hash(hash
, rth
, (struct rtable
**)&skb
->dst
);
1088 * NOTE. We drop all the packets that has local source
1089 * addresses, because every properly looped back packet
1090 * must have correct destination already attached by output routine.
1092 * Such approach solves two big problems:
1093 * 1. Not simplex devices are handled properly.
1094 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1097 int ip_route_input_slow(struct sk_buff
*skb
, u32 daddr
, u32 saddr
,
1098 u8 tos
, struct device
*dev
)
1101 struct fib_result res
;
1102 struct in_device
*in_dev
= dev
->ip_ptr
;
1103 struct in_device
*out_dev
;
1106 struct rtable
* rth
;
1112 * IP on this device is disabled.
1121 #ifdef CONFIG_IP_ROUTE_FWMARK
1122 key
.fwmark
= skb
->fwmark
;
1124 key
.iif
= dev
->ifindex
;
1126 key
.scope
= RT_SCOPE_UNIVERSE
;
1128 hash
= rt_hash_code(daddr
, saddr
^(key
.iif
<<5), tos
);
1130 /* Check for the most weird martians, which can be not detected
1134 if (MULTICAST(saddr
) || BADCLASS(saddr
) || LOOPBACK(saddr
))
1135 goto martian_source
;
1137 if (daddr
== 0xFFFFFFFF || (saddr
== 0 && daddr
== 0))
1140 /* Accept zero addresses only to limited broadcast;
1141 * I even do not know to fix it or not. Waiting for complains :-)
1144 goto martian_source
;
1146 if (BADCLASS(daddr
) || ZERONET(daddr
) || LOOPBACK(daddr
))
1147 goto martian_destination
;
1150 * Now we are ready to route packet.
1152 if ((err
= fib_lookup(&key
, &res
))) {
1153 if (!IN_DEV_FORWARD(in_dev
))
1158 #ifdef CONFIG_IP_ROUTE_NAT
1159 /* Policy is applied before mapping destination,
1160 but rerouting after map should be made with old source.
1164 u32 src_map
= saddr
;
1166 src_map
= fib_rules_policy(saddr
, &res
, &flags
);
1168 if (res
.type
== RTN_NAT
) {
1169 key
.dst
= fib_rules_map_destination(daddr
, &res
);
1170 if (fib_lookup(&key
, &res
) || res
.type
!= RTN_UNICAST
)
1178 if (res
.type
== RTN_BROADCAST
)
1181 if (res
.type
== RTN_LOCAL
) {
1183 result
= fib_validate_source(saddr
, daddr
, tos
, loopback_dev
.ifindex
,
1184 dev
, &spec_dst
, &itag
);
1186 goto martian_source
;
1188 flags
|= RTCF_DIRECTSRC
;
1193 if (!IN_DEV_FORWARD(in_dev
))
1195 if (res
.type
!= RTN_UNICAST
)
1196 goto martian_destination
;
1198 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1199 if (res
.fi
->fib_nhs
> 1 && key
.oif
== 0)
1200 fib_select_multipath(&key
, &res
);
1202 out_dev
= FIB_RES_DEV(res
)->ip_ptr
;
1203 if (out_dev
== NULL
) {
1204 if (net_ratelimit())
1205 printk(KERN_CRIT
"Bug in ip_route_input_slow(). Please, report\n");
1209 err
= fib_validate_source(saddr
, daddr
, tos
, FIB_RES_OIF(res
), dev
, &spec_dst
, &itag
);
1211 goto martian_source
;
1214 flags
|= RTCF_DIRECTSRC
;
1216 if (out_dev
== in_dev
&& err
&& !(flags
&(RTCF_NAT
|RTCF_MASQ
)) &&
1217 (IN_DEV_SHARED_MEDIA(out_dev
)
1218 || inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(res
))))
1219 flags
|= RTCF_DOREDIRECT
;
1221 if (skb
->protocol
!= __constant_htons(ETH_P_IP
)) {
1222 /* Not IP (i.e. ARP). Do not create route, if it is
1223 * invalid for proxy arp. DNAT routes are always valid.
1225 if (out_dev
== in_dev
&& !(flags
&RTCF_DNAT
))
1229 rth
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
1233 atomic_set(&rth
->u
.dst
.use
, 1);
1234 rth
->key
.dst
= daddr
;
1235 rth
->rt_dst
= daddr
;
1237 #ifdef CONFIG_IP_ROUTE_FWMARK
1238 rth
->key
.fwmark
= skb
->fwmark
;
1240 rth
->key
.src
= saddr
;
1241 rth
->rt_src
= saddr
;
1242 rth
->rt_gateway
= daddr
;
1243 #ifdef CONFIG_IP_ROUTE_NAT
1244 rth
->rt_src_map
= key
.src
;
1245 rth
->rt_dst_map
= key
.dst
;
1246 if (flags
&RTCF_DNAT
)
1247 rth
->rt_gateway
= key
.dst
;
1250 rth
->key
.iif
= dev
->ifindex
;
1251 rth
->u
.dst
.dev
= out_dev
->dev
;
1253 rth
->rt_spec_dst
= spec_dst
;
1255 rth
->u
.dst
.input
= ip_forward
;
1256 rth
->u
.dst
.output
= ip_output
;
1258 rt_set_nexthop(rth
, &res
, itag
);
1260 rth
->rt_flags
= flags
;
1262 #ifdef CONFIG_NET_FASTROUTE
1263 if (netdev_fastroute
&& !(flags
&(RTCF_NAT
|RTCF_MASQ
|RTCF_DOREDIRECT
))) {
1264 struct device
*odev
= rth
->u
.dst
.dev
;
1266 dev
->accept_fastpath
&&
1267 odev
->mtu
>= dev
->mtu
&&
1268 dev
->accept_fastpath(dev
, &rth
->u
.dst
) == 0)
1269 rth
->rt_flags
|= RTCF_FAST
;
1273 return rt_intern_hash(hash
, rth
, (struct rtable
**)&skb
->dst
);
1276 if (skb
->protocol
!= __constant_htons(ETH_P_IP
))
1279 if (ZERONET(saddr
)) {
1280 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_LINK
);
1282 err
= fib_validate_source(saddr
, 0, tos
, 0, dev
, &spec_dst
, &itag
);
1284 goto martian_source
;
1286 flags
|= RTCF_DIRECTSRC
;
1288 flags
|= RTCF_BROADCAST
;
1289 res
.type
= RTN_BROADCAST
;
1292 rth
= dst_alloc(sizeof(struct rtable
), &ipv4_dst_ops
);
1296 rth
->u
.dst
.output
= ip_rt_bug
;
1298 atomic_set(&rth
->u
.dst
.use
, 1);
1299 rth
->key
.dst
= daddr
;
1300 rth
->rt_dst
= daddr
;
1302 #ifdef CONFIG_IP_ROUTE_FWMARK
1303 rth
->key
.fwmark
= skb
->fwmark
;
1305 rth
->key
.src
= saddr
;
1306 rth
->rt_src
= saddr
;
1307 #ifdef CONFIG_IP_ROUTE_NAT
1308 rth
->rt_dst_map
= key
.dst
;
1309 rth
->rt_src_map
= key
.src
;
1311 #ifdef CONFIG_NET_CLS_ROUTE
1312 rth
->u
.dst
.tclassid
= itag
;
1315 rth
->key
.iif
= dev
->ifindex
;
1316 rth
->u
.dst
.dev
= &loopback_dev
;
1318 rth
->rt_gateway
= daddr
;
1319 rth
->rt_spec_dst
= spec_dst
;
1320 rth
->u
.dst
.input
= ip_local_deliver
;
1321 rth
->rt_flags
= flags
|RTCF_LOCAL
;
1322 if (res
.type
== RTN_UNREACHABLE
) {
1323 rth
->u
.dst
.input
= ip_error
;
1324 rth
->u
.dst
.error
= -err
;
1325 rth
->rt_flags
&= ~RTCF_LOCAL
;
1327 rth
->rt_type
= res
.type
;
1328 return rt_intern_hash(hash
, rth
, (struct rtable
**)&skb
->dst
);
1331 spec_dst
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
1332 res
.type
= RTN_UNREACHABLE
;
1336 * Do not cache martian addresses: they should be logged (RFC1812)
1338 martian_destination
:
1339 #ifdef CONFIG_IP_ROUTE_VERBOSE
1340 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit())
1341 printk(KERN_WARNING
"martian destination %08x from %08x, dev %s\n", daddr
, saddr
, dev
->name
);
1346 #ifdef CONFIG_IP_ROUTE_VERBOSE
1347 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1349 * RFC1812 recommenadtion, if source is martian,
1350 * the only hint is MAC header.
1352 printk(KERN_WARNING
"martian source %08x for %08x, dev %s\n", saddr
, daddr
, dev
->name
);
1353 if (dev
->hard_header_len
) {
1355 unsigned char *p
= skb
->mac
.raw
;
1356 printk(KERN_WARNING
"ll header:");
1357 for (i
=0; i
<dev
->hard_header_len
; i
++, p
++)
1358 printk(" %02x", *p
);
1366 int ip_route_input(struct sk_buff
*skb
, u32 daddr
, u32 saddr
,
1367 u8 tos
, struct device
*dev
)
1369 struct rtable
* rth
;
1371 int iif
= dev
->ifindex
;
1373 tos
&= IPTOS_TOS_MASK
;
1374 hash
= rt_hash_code(daddr
, saddr
^(iif
<<5), tos
);
1376 read_lock_bh(&rt_hash_lock
);
1377 for (rth
=rt_hash_table
[hash
]; rth
; rth
=rth
->u
.rt_next
) {
1378 if (rth
->key
.dst
== daddr
&&
1379 rth
->key
.src
== saddr
&&
1380 rth
->key
.iif
== iif
&&
1381 rth
->key
.oif
== 0 &&
1382 #ifdef CONFIG_IP_ROUTE_FWMARK
1383 rth
->key
.fwmark
== skb
->fwmark
&&
1385 rth
->key
.tos
== tos
) {
1386 rth
->u
.dst
.lastuse
= jiffies
;
1387 atomic_inc(&rth
->u
.dst
.use
);
1388 atomic_inc(&rth
->u
.dst
.refcnt
);
1389 read_unlock_bh(&rt_hash_lock
);
1390 skb
->dst
= (struct dst_entry
*)rth
;
1394 read_unlock_bh(&rt_hash_lock
);
1396 /* Multicast recognition logic is moved from route cache to here.
1397 The problem was that too many Ethernet cards have broken/missing
1398 hardware multicast filters :-( As result the host on multicasting
1399 network acquires a lot of useless route cache entries, sort of
1400 SDR messages from all the world. Now we try to get rid of them.
1401 Really, provided software IP multicast filter is organized
1402 reasonably (at least, hashed), it does not result in a slowdown
1403 comparing with route cache reject entries.
1404 Note, that multicast routers are not affected, because
1405 route cache entry is created eventually.
1407 if (MULTICAST(daddr
)) {
1408 int our
= ip_check_mc(dev
, daddr
);
1410 #ifdef CONFIG_IP_MROUTE
1411 && (LOCAL_MCAST(daddr
) || !dev
->ip_ptr
||
1412 !IN_DEV_MFORWARD((struct in_device
*)dev
->ip_ptr
))
1415 return ip_route_input_mc(skb
, daddr
, saddr
, tos
, dev
, our
);
1417 return ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
);
/*
 * Major route resolver routine.
 *
 * Builds a route for a locally generated packet (source/destination
 * selection, FIB lookup, flag classification) and inserts the result
 * into the route cache via rt_intern_hash().
 *
 * NOTE(review): this extract is lossy -- declarations (key, rth, flags,
 * err, hash), several closing braces, labels and goto targets were
 * dropped between the surviving lines; only comments are added here.
 */
int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
	struct fib_result res;
	struct device *dev_out = NULL;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	/* RTO_TPROXY bit in tos asks us to skip source validation. */
	u32 nochecksrc = (tos & RTO_TPROXY);
	/* Keep only real TOS bits plus the on-link pseudo flag. */
	tos &= IPTOS_TOS_MASK|RTO_ONLINK;
	key.tos = tos&IPTOS_TOS_MASK;
	key.iif = loopback_dev.ifindex;
	/* RTO_ONLINK restricts the lookup to directly connected scope. */
	key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	/* A caller-supplied source must be plain unicast. */
	if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
	/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
	dev_out = ip_dev_find(saddr);
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	/* If address is not local, test for transparent proxy flag;
	   if address is local --- clear the flag.
	 */
	if (dev_out == NULL) {
		if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
		flags |= RTCF_TPROXY;
	if (dev_out == NULL)
	/* I removed check for oif == dev_out->oif here.
	   It was wrong by three reasons:
	   1. ip_dev_find(saddr) can return wrong iface, if saddr is
	      assigned to multiple interfaces.
	   2. Moreover, we are allowed to send packets with saddr
	      of another iface. --ANK
	 */
#ifdef CONFIG_IP_TRANSPARENT_PROXY
	    (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
		/* Special hack: user can direct multicasts
		   and limited broadcast via necessary interface
		   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
		   This hack is not just for fun, it allows
		   vic,vat and friends to work.
		   They bind socket to loopback, set ttl to zero
		   and expect that it will work.
		   From the viewpoint of routing cache they are broken,
		   because we are not allowed to build multicast path
		   with loopback source addr (look, routing cache
		   cannot know, that ttl is zero, so that packet
		   will not leave this host and route is valid).
		   Luckily, this hack is good workaround.
		 */
		key.oif = dev_out->ifindex;
	/* oif was specified: pin the output device. */
	dev_out = dev_get_by_index(oif);
	if (dev_out == NULL)
	if (dev_out->ip_ptr == NULL)
		return -ENODEV; /* Wrong error code */
	if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
		key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
	if (MULTICAST(daddr))
		key.src = inet_select_addr(dev_out, 0, key.scope);
		key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
	/* Loopback fallback (surrounding condition elided in this
	 * extract -- presumably the "no destination given" case;
	 * TODO confirm against full source).
	 */
	key.dst = key.src = htonl(INADDR_LOOPBACK);
	dev_out = &loopback_dev;
	key.oif = loopback_dev.ifindex;
	res.type = RTN_LOCAL;
	flags |= RTCF_LOCAL;
	if (fib_lookup(&key, &res)) {
		/* Apparently, routing tables are wrong. Assume,
		   that the destination is on link.

		   Because we are allowed to send to iface
		   even if it has NO routes and NO assigned
		   addresses. When oif is specified, routing
		   tables are looked up with only one purpose:
		   to catch if destination is gatewayed, rather than
		   direct. Moreover, if MSG_DONTROUTE is set,
		   we send packet, ignoring both routing tables
		   and ifaddr state. --ANK

		   We could make it even if oif is unknown,
		   likely IPv6, but we do not.
		 */
		key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
		res.type = RTN_UNICAST;
	return -ENETUNREACH;
	if (res.type == RTN_NAT)
	if (res.type == RTN_LOCAL) {
		/* Local delivery: route through loopback. */
		dev_out = &loopback_dev;
		key.oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && key.oif == 0)
		fib_select_multipath(&key, &res);
	/* Default route without forced oif: pick among defaults. */
	if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
		fib_select_default(&key, &res);
	key.src = FIB_RES_PREFSRC(res);
	dev_out = FIB_RES_DEV(res);
	key.oif = dev_out->ifindex;
	/* Loopback source is only valid on the loopback device. */
	if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
	/* Classify destination to choose flags and handlers. */
	if (key.dst == 0xFFFFFFFF)
		res.type = RTN_BROADCAST;
	else if (MULTICAST(key.dst))
		res.type = RTN_MULTICAST;
	else if (BADCLASS(key.dst) || ZERONET(key.dst))
	if (dev_out->flags&IFF_LOOPBACK)
		flags |= RTCF_LOCAL;
	if (res.type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST|RTCF_LOCAL;
	} else if (res.type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(dev_out, daddr))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		   default one, but do not gateway in this case.
		 */
		if (res.fi && res.prefixlen < 4)
	/* Allocate and populate the cache entry. */
	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
	atomic_set(&rth->u.dst.use, 1);
	rth->key.dst = daddr;
	rth->key.src = saddr;
	rth->rt_dst = key.dst;
	rth->rt_src = key.src;
#ifdef CONFIG_IP_ROUTE_NAT
	rth->rt_dst_map = key.dst;
	rth->rt_src_map = key.src;
	rth->rt_iif = oif ? : dev_out->ifindex;
	rth->u.dst.dev = dev_out;
	rth->rt_gateway = key.dst;
	rth->rt_spec_dst = key.src;
	rth->u.dst.output=ip_output;
	if (flags&RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = key.dst;
	if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
		rth->rt_spec_dst = key.src;
		if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
			rth->u.dst.output = ip_mc_output;
#ifdef CONFIG_IP_MROUTE
		if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
			struct in_device *in_dev = dev_out->ip_ptr;
			if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
	rt_set_nexthop(rth, &res, 0);
	rth->rt_flags = flags;
	hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
	return rt_intern_hash(hash, rth, rp);
1667 int ip_route_output(struct rtable
**rp
, u32 daddr
, u32 saddr
, u32 tos
, int oif
)
1672 hash
= rt_hash_code(daddr
, saddr
^(oif
<<5), tos
);
1674 read_lock_bh(&rt_hash_lock
);
1675 for (rth
=rt_hash_table
[hash
]; rth
; rth
=rth
->u
.rt_next
) {
1676 if (rth
->key
.dst
== daddr
&&
1677 rth
->key
.src
== saddr
&&
1678 rth
->key
.iif
== 0 &&
1679 rth
->key
.oif
== oif
&&
1680 #ifndef CONFIG_IP_TRANSPARENT_PROXY
1683 !((rth
->key
.tos
^tos
)&(IPTOS_TOS_MASK
|RTO_ONLINK
)) &&
1684 ((tos
&RTO_TPROXY
) || !(rth
->rt_flags
&RTCF_TPROXY
))
1687 rth
->u
.dst
.lastuse
= jiffies
;
1688 atomic_inc(&rth
->u
.dst
.use
);
1689 atomic_inc(&rth
->u
.dst
.refcnt
);
1690 read_unlock_bh(&rt_hash_lock
);
1695 read_unlock_bh(&rt_hash_lock
);
1697 return ip_route_output_slow(rp
, daddr
, saddr
, tos
, oif
);
1700 #ifdef CONFIG_RTNETLINK
/*
 * Fill one RTM_NEWROUTE netlink message describing the route cache
 * entry attached to skb->dst.
 *
 * NOTE(review): lossy extract -- the `struct rtmsg *r;` and
 * `struct rtattr *mx;` declarations, several conditional guards,
 * and the nlmsg_failure/rtattr_failure labels were dropped.
 */
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
	struct rtable *rt = (struct rtable*)skb->dst;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;	/* start of message, for length/trim */
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
	r = NLMSG_DATA(nlh);
	/* NLM_F_MULTI only for dump-style replies (nowait with real pid). */
	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;	/* cache entries are host routes */
	r->rtm_tos = rt->key.tos;
	r->rtm_table = RT_TABLE_MAIN;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	/* Keep only the upper flag bits and mark the route as cloned. */
	r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	r->rtm_src_len = 32;
	RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
	RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
	RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->key.src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	/* Nested RTA_METRICS attribute; patch its length afterwards. */
	mx = (struct rtattr*)skb->tail;
	RTA_PUT(skb, RTA_METRICS, 0, NULL);
	if (rt->u.dst.mxlock)
		RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
	RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
	if (rt->u.dst.window)
		RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
	RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
	mx->rta_len = skb->tail - (u8*)mx;
	/* Drop the metrics attribute again if it stayed empty. */
	if (mx->rta_len == RTA_LENGTH(0))
		skb_trim(skb, (u8*)mx - skb->data);
	ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
	ci.rta_used = atomic_read(&rt->u.dst.refcnt);
	ci.rta_clntref = atomic_read(&rt->u.dst.use);
	if (rt->u.dst.expires)
		ci.rta_expires = rt->u.dst.expires - jiffies;
	ci.rta_error = rt->u.dst.error;
#ifdef CONFIG_IP_MROUTE
	/* Remember where cacheinfo lands so mroute can patch rta_error. */
	eptr = (struct rtattr*)skb->tail;
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
#ifdef CONFIG_IP_MROUTE
	u32 dst = rt->rt_dst;
	/* Non-local multicast with an active mrouter: let ipmr report
	 * the forwarding state instead of the plain iif.
	 */
	if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
		int err = ipmr_get_route(skb, r, nowait);
		if (err == -EMSGSIZE)
		((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
	RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
	nlh->nlmsg_len = skb->tail - b;
	/* Failure path: undo everything written since `b`. */
	skb_trim(skb, b - skb->data);
/*
 * RTM_GETROUTE handler: resolve the route described by the netlink
 * request (input path when RTA_IIF is given, output path otherwise)
 * and answer with a single RTM_NEWROUTE message.
 *
 * NOTE(review): lossy extract -- the dst/src/iif/oif/err/dev
 * declarations, NULL checks, branch structure and gotos were dropped.
 */
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
	/* Pull addresses/interfaces out of the request attributes. */
	memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
	memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
	memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
	dev = dev_get_by_index(iif);
	skb->protocol = __constant_htons(ETH_P_IP);
	/* Input-path query: emulate a packet arriving on `dev`. */
	err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
	rt = (struct rtable*)skb->dst;
	if (!err && rt->u.dst.error)
		err = -rt->u.dst.error;
	memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
	/* Output-path query. */
	err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;
	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
	/* Unicast the answer back to the requesting socket. */
	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
/*
 * Dump the whole route cache over netlink (NLM_F_DUMP).  cb->args[]
 * carries resume state between invocations; args[1] is the index
 * within a bucket (args[0] presumably the bucket itself -- the line
 * reading it was dropped by the extract; TODO confirm).
 *
 * NOTE(review): lossy extract -- s_h/h/idx/rt declarations, the code
 * saving resume state back into cb->args[], and the return were dropped.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
	s_idx = idx = cb->args[1];
	for (h=0; h < RT_HASH_DIVISOR; h++) {
		if (h < s_h) continue;	/* skip buckets already dumped */
		read_lock_bh(&rt_hash_lock);
		for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
			/* Hold the entry while rt_fill_info runs. */
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
				/* Answer skb full: drop our ref and stop. */
				dst_release(xchg(&skb->dst, NULL));
				read_unlock_bh(&rt_hash_lock);
			dst_release(xchg(&skb->dst, NULL));
		read_unlock_bh(&rt_hash_lock);
1908 #endif /* CONFIG_RTNETLINK */
/*
 * Hook invoked when a device's IP multicast state changes.
 * NOTE(review): the function body was elided by this extract --
 * cannot tell from here what it does; confirm against full source.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
#ifdef CONFIG_SYSCTL

/* Delay value written via the "flush" sysctl; consumed below. */
static int flush_delay;

/*
 * Handler for /proc/sys/net/ipv4/route/flush: parse the written
 * integer into flush_delay and flush the route cache with that delay.
 * NOTE(review): lossy extract -- the write-only guard and return
 * statements were dropped.
 */
int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
			      void *buffer, size_t *lenp)
	proc_dointvec(ctl, write, filp, buffer, lenp);
	rt_cache_flush(flush_delay);
/*
 * Sysctl table for /proc/sys/net/ipv4/route/*.
 * Entry layout: binary id, name, data pointer, size, mode, child,
 * proc handler.  Jiffies-valued knobs use proc_dointvec_jiffies.
 * NOTE(review): lossy extract -- several entries lost their handler
 * field/closing brace, and the {0} terminator is not visible.
 */
ctl_table ipv4_route_table[] = {
	/* write-only trigger: flushes the cache via the handler above */
	{NET_IPV4_ROUTE_FLUSH, "flush",
	 &flush_delay, sizeof(int), 0200, NULL,
	 &ipv4_sysctl_rtcache_flush},
	{NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
	 &ip_rt_min_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
	 &ip_rt_max_delay, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
	 &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_MAX_SIZE, "max_size",
	 &ip_rt_max_size, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
	 &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
	 &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
	 &ip_rt_gc_interval, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
	{NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
	 &ip_rt_redirect_load, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
	 &ip_rt_redirect_number, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
	 &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_ERROR_COST, "error_cost",
	 &ip_rt_error_cost, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
	 &ip_rt_error_burst, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
	 &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
	{NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
	 &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
	 &proc_dointvec_jiffies},
#ifdef CONFIG_NET_CLS_ROUTE
/* Route-classifier accounting table, indexed by classid (0..255). */
struct ip_rt_acct ip_rt_acct[256];

#ifdef CONFIG_PROC_FS
/*
 * /proc/net/rt_acct read handler: copy the raw accounting array
 * into the caller's buffer.
 * NOTE(review): lossy extract -- the *eof/*start bookkeeping and the
 * returned length were dropped.
 */
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
	/* Clamp the request so it does not run past the table. */
	if (offset + length > sizeof(ip_rt_acct)) {
		length = sizeof(ip_rt_acct) - offset;
	memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
2008 __initfunc(void ip_rt_init(void))
2010 #ifdef CONFIG_PROC_FS
2011 #ifdef CONFIG_NET_CLS_ROUTE
2012 struct proc_dir_entry
*ent
;
2017 rt_periodic_timer
.function
= rt_check_expire
;
2018 /* All the timers, started at system startup tend
2019 to synchronize. Perturb it a bit.
2021 rt_periodic_timer
.expires
= jiffies
+ net_random()%ip_rt_gc_interval
2022 + ip_rt_gc_interval
;
2023 add_timer(&rt_periodic_timer
);
2025 #ifdef CONFIG_PROC_FS
2026 proc_net_register(&(struct proc_dir_entry
) {
2027 PROC_NET_RTCACHE
, 8, "rt_cache",
2028 S_IFREG
| S_IRUGO
, 1, 0, 0,
2029 0, &proc_net_inode_operations
,
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033 ent
= create_proc_entry("net/rt_acct", 0, 0);
2034 ent
->read_proc
= ip_rt_acct_read
;