net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/mm.h>
  74 #include <linux/bootmem.h>
  75 #include <linux/string.h>
  76 #include <linux/socket.h>
  77 #include <linux/sockios.h>
  78 #include <linux/errno.h>
  79 #include <linux/in.h>
  80 #include <linux/inet.h>
  81 #include <linux/netdevice.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/init.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <net/protocol.h>
  95 #include <net/ip.h>
  96 #include <net/route.h>
  97 #include <net/inetpeer.h>
  98 #include <net/sock.h>
  99 #include <net/ip_fib.h>
 100 #include <net/arp.h>
 101 #include <net/tcp.h>
 102 #include <net/icmp.h>
 103 #include <net/xfrm.h>
 104 #include <net/ip_mp_alg.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
  /*
   * Mask a flow key's fl4_tos down to the IPTOS_RT_MASK and RTO_ONLINK
   * bits and cast the result to u32.  The argument is not parenthesized
   * in the expansion, so pass a plain pointer expression.
   */
  111 #define RT_FL_TOS(oldflp) \
  112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
  113
  114 #define IP_MAX_MTU      0xFFF0
  115
  /* Default garbage-collection timeout: 300 * HZ jiffies. */
  116 #define RT_GC_TIMEOUT (300*HZ)
  117
  /*
   * Route cache tunables.  Values built from HZ are in jiffies.
   *
   * ip_rt_min_delay / ip_rt_max_delay: default flush delay and the bound
   *      used to compute the flush deadline in rt_cache_flush().
   * ip_rt_gc_timeout / ip_rt_gc_interval: idle timeout applied by
   *      rt_check_expire() and the period at which it re-arms its timer.
   * ip_rt_gc_min_interval, ip_rt_max_size, ip_rt_gc_elasticity: inputs to
   *      rt_garbage_collect().
   * ip_rt_secret_interval: period at which rt_secret_rebuild() re-arms.
   */
  118 static int ip_rt_min_delay              = 2 * HZ;
  119 static int ip_rt_max_delay              = 10 * HZ;
  120 static int ip_rt_max_size;
  121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
  122 static int ip_rt_gc_interval            = 60 * HZ;
  123 static int ip_rt_gc_min_interval        = HZ / 2;
  124 static int ip_rt_redirect_number        = 9;
  125 static int ip_rt_redirect_load          = HZ / 50;
  126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
  127 static int ip_rt_error_cost             = HZ;
  128 static int ip_rt_error_burst            = 5 * HZ;
  129 static int ip_rt_gc_elasticity          = 8;
  130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
  131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
  132 static int ip_rt_min_advmss             = 256;
  133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
  /*
   * Jiffies value set by rt_cache_flush() when a delayed flush is first
   * scheduled (now + ip_rt_max_delay); reset to 0 by rt_run_flush().
   */
  134 static unsigned long rt_deadline;
  135
  /* printk() wrapper that prefixes the message with KERN_DEBUG. */
  136 #define RTprint(a...)   printk(KERN_DEBUG a)
  137
  /*
   * rt_flush_timer is armed by rt_cache_flush(), rt_periodic_timer is
   * re-armed by rt_check_expire() and rt_secret_timer by
   * rt_secret_rebuild().
   */
  138 static struct timer_list rt_flush_timer;
  139 static struct timer_list rt_periodic_timer;
  140 static struct timer_list rt_secret_timer;
 141
 142 /*
 143  *      Interface to generic destination cache.
 144  */
 145
  /*
   * Forward declarations of the callbacks installed in ipv4_dst_ops
   * below; their definitions follow later in the file.
   */
  146 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
  147 static void              ipv4_dst_destroy(struct dst_entry *dst);
  148 static void              ipv4_dst_ifdown(struct dst_entry *dst,
  149                                          struct net_device *dev, int how);
  150 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
  151 static void              ipv4_link_failure(struct sk_buff *skb);
  152 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  153 static int rt_garbage_collect(void);
  154
  155
  /*
   * dst_ops descriptor for IPv4 routes: address family AF_INET, protocol
   * ETH_P_IP, entries sized as struct rtable, garbage collected through
   * rt_garbage_collect().  Its .entries counter is what the GC code and
   * the per-CPU statistics output read as the current cache size.
   */
  156 static struct dst_ops ipv4_dst_ops = {
  157         .family =               AF_INET,
  158         .protocol =             __constant_htons(ETH_P_IP),
  159         .gc =                   rt_garbage_collect,
  160         .check =                ipv4_dst_check,
  161         .destroy =              ipv4_dst_destroy,
  162         .ifdown =               ipv4_dst_ifdown,
  163         .negative_advice =      ipv4_negative_advice,
  164         .link_failure =         ipv4_link_failure,
  165         .update_pmtu =          ip_rt_update_pmtu,
  166         .entry_size =           sizeof(struct rtable),
  167 };
 168
  /* Expand a class name to the matching TC_PRIO_<class> constant. */
  169 #define ECN_OR_COST(class)      TC_PRIO_##class
  170
  /*
   * 16-entry table of TC_PRIO_* packet priorities.
   * NOTE(review): presumably indexed by the IP TOS bits; the lookup is
   * not visible here -- confirm against the users of ip_tos2prio.
   */
  171 __u8 ip_tos2prio[16] = {
  172         TC_PRIO_BESTEFFORT,
  173         ECN_OR_COST(FILLER),
  174         TC_PRIO_BESTEFFORT,
  175         ECN_OR_COST(BESTEFFORT),
  176         TC_PRIO_BULK,
  177         ECN_OR_COST(BULK),
  178         TC_PRIO_BULK,
  179         ECN_OR_COST(BULK),
  180         TC_PRIO_INTERACTIVE,
  181         ECN_OR_COST(INTERACTIVE),
  182         TC_PRIO_INTERACTIVE,
  183         ECN_OR_COST(INTERACTIVE),
  184         TC_PRIO_INTERACTIVE_BULK,
  185         ECN_OR_COST(INTERACTIVE_BULK),
  186         TC_PRIO_INTERACTIVE_BULK,
  187         ECN_OR_COST(INTERACTIVE_BULK)
  188 };
 189
 190
 191 /*
 192  * Route cache.
 193  */
 194
  195 /* The locking scheme is rather straightforward:
 196  *
 197  * 1) Read-Copy Update protects the buckets of the central route hash.
 198  * 2) Only writers remove entries, and they hold the lock
 199  *    as they look at rtable reference counts.
 200  * 3) Only readers acquire references to rtable entries,
 201  *    they do so with atomic increments and with the
 202  *    lock held.
 203  */
 204
  /*
   * One bucket of the route cache hash table: the head of a chain of
   * struct rtable entries linked through u.dst.rt_next.
   */
  205 struct rt_hash_bucket {
  206         struct rtable   *chain;
  207 };
  208 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
  209         defined(CONFIG_PROVE_LOCKING)
  210 /*
  211  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
  212  * The size of this table is a power of two and depends on the number of CPUS.
  213  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
  214  */
  215 #ifdef CONFIG_LOCKDEP
  216 # define RT_HASH_LOCK_SZ        256
  217 #else
  218 # if NR_CPUS >= 32
  219 #  define RT_HASH_LOCK_SZ       4096
  220 # elif NR_CPUS >= 16
  221 #  define RT_HASH_LOCK_SZ       2048
  222 # elif NR_CPUS >= 8
  223 #  define RT_HASH_LOCK_SZ       1024
  224 # elif NR_CPUS >= 4
  225 #  define RT_HASH_LOCK_SZ       512
  226 # else
  227 #  define RT_HASH_LOCK_SZ       256
  228 # endif
  229 #endif
  230
  /*
   * rt_hash_lock_addr(slot) maps a bucket index onto one of the
   * RT_HASH_LOCK_SZ shared locks by masking the low bits, so several
   * buckets share a lock.  Its expansion is not wrapped in parentheses.
   *
   * rt_hash_lock_init() allocates the lock table with GFP_KERNEL,
   * panics if the allocation fails and initializes every lock.  It
   * expands to a bare { } block, not a do { } while (0) statement.
   */
  231 static spinlock_t       *rt_hash_locks;
  232 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
  233 # define rt_hash_lock_init()    { \
  234                 int i; \
  235                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
  236                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
  237                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
  238                         spin_lock_init(&rt_hash_locks[i]); \
  239                 }
  240 #else
  /*
   * Without SMP or spinlock debugging no lock table exists: the lock
   * address is NULL and initialization expands to nothing.
   */
  241 # define rt_hash_lock_addr(slot) NULL
  242 # define rt_hash_lock_init()
  243 #endif
 244
  /*
   * Route cache hash table state.
   *
   * rt_hash_table: array of buckets, indexed 0..rt_hash_mask.
   * rt_hash_mask:  mask applied to the hash to obtain a bucket index.
   * rt_hash_log:   shift count used to scale values by the table size
   *                (presumably log2 of the bucket count -- confirm at
   *                the initialization site).
   * rt_hash_rnd:   seed passed to jhash; re-randomized by rt_run_flush().
   */
  245 static struct rt_hash_bucket    *rt_hash_table;
  246 static unsigned                 rt_hash_mask;
  247 static int                      rt_hash_log;
  248 static unsigned int             rt_hash_rnd;
  249
  /*
   * Per-CPU route cache counters.  RT_CACHE_STAT_INC() bumps one field
   * of the current CPU's copy through __raw_get_cpu_var() with a plain,
   * non-atomic increment.
   */
  250 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
  251 #define RT_CACHE_STAT_INC(field) \
  252         (__raw_get_cpu_var(rt_cache_stat).field++)
  253
  /* Forward declaration; defined later in the file. */
  254 static int rt_intern_hash(unsigned hash, struct rtable *rth,
  255                                 struct rtable **res);
 256
  /*
   * Compute the bucket index for an address pair: jhash_2words() over
   * daddr and saddr, seeded with rt_hash_rnd, reduced with rt_hash_mask.
   */
  257 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
  258 {
  259         return (jhash_2words(daddr, saddr, rt_hash_rnd)
  260                 & rt_hash_mask);
  261 }
 262
  /*
   * Bucket index for (daddr, saddr, idx): the __be32 addresses are
   * force-cast to u32 and idx, shifted left by 5, is XORed into the
   * source address before calling rt_hash_code().
   */
  263 #define rt_hash(daddr, saddr, idx) \
  264         rt_hash_code((__force u32)(__be32)(daddr),\
  265                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 266
 267 #ifdef CONFIG_PROC_FS
  /*
   * Per-open iterator state for the route cache seq_file: the hash
   * bucket currently being walked (buckets are visited from
   * rt_hash_mask down to 0).
   */
  268 struct rt_cache_iter_state {
  269         int bucket;
  270 };
 271
 272 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 273 {
 274         struct rtable *r = NULL;
 275         struct rt_cache_iter_state *st = seq->private;
 276
 277         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 278                 rcu_read_lock_bh();
 279                 r = rt_hash_table[st->bucket].chain;
 280                 if (r)
 281                         break;
 282                 rcu_read_unlock_bh();
 283         }
 284         return r;
 285 }
 286
 287 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 288 {
 289         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 290
 291         r = r->u.dst.rt_next;
 292         while (!r) {
 293                 rcu_read_unlock_bh();
 294                 if (--st->bucket < 0)
 295                         break;
 296                 rcu_read_lock_bh();
 297                 r = rt_hash_table[st->bucket].chain;
 298         }
 299         return r;
 300 }
 301
  /*
   * Return the entry at zero-based position pos, found by starting at
   * the first entry and stepping forward pos times.  Returns NULL when
   * the cache holds fewer than pos + 1 entries.
   */
  302 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
  303 {
  304         struct rtable *r = rt_cache_get_first(seq);
  305
  306         if (r)
  307                 while (pos && (r = rt_cache_get_next(seq, r)))
  308                         --pos;
  309         return pos ? NULL : r;
  310 }
 311
  /*
   * seq_file .start: position 0 yields SEQ_START_TOKEN (the header
   * line); position N > 0 yields cache entry N - 1, or NULL past the
   * end.
   */
  312 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
  313 {
  314         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
  315 }
 316
  /*
   * seq_file .next: after the header token return the first entry,
   * otherwise the entry following v.  *pos is always incremented.
   */
  317 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  318 {
  319         struct rtable *r = NULL;
  320
  321         if (v == SEQ_START_TOKEN)
  322                 r = rt_cache_get_first(seq);
  323         else
  324                 r = rt_cache_get_next(seq, v);
  325         ++*pos;
  326         return r;
  327 }
 328
  /*
   * seq_file .stop: release the RCU-bh read lock that the iterator
   * helpers leave held while v points at a real cache entry.  Nothing
   * is held for NULL or the header token.
   */
  329 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
  330 {
  331         if (v && v != SEQ_START_TOKEN)
  332                 rcu_read_unlock_bh();
  333 }
 334
 335 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 336 {
 337         if (v == SEQ_START_TOKEN)
 338                 seq_printf(seq, "%-127s\n",
 339                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 340                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 341                            "HHUptod\tSpecDst");
 342         else {
 343                 struct rtable *r = v;
 344                 char temp[256];
 345
 346                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 347                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 348                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 349                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 350                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 351                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 352                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 353                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 354                         dst_metric(&r->u.dst, RTAX_WINDOW),
 355                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 356                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 357                         r->fl.fl4_tos,
 358                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 359                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 360                                        dev_queue_xmit) : 0,
 361                         r->rt_spec_dst);
 362                 seq_printf(seq, "%-127s\n", temp);
 363         }
 364         return 0;
 365 }
 366
  /* seq_file iterator callbacks for the route cache listing. */
  367 static const struct seq_operations rt_cache_seq_ops = {
  368         .start  = rt_cache_seq_start,
  369         .next   = rt_cache_seq_next,
  370         .stop   = rt_cache_seq_stop,
  371         .show   = rt_cache_seq_show,
  372 };
 373
 374 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 375 {
 376         struct seq_file *seq;
 377         int rc = -ENOMEM;
 378         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 379
 380         if (!s)
 381                 goto out;
 382         rc = seq_open(file, &rt_cache_seq_ops);
 383         if (rc)
 384                 goto out_kfree;
 385         seq          = file->private_data;
 386         seq->private = s;
 387         memset(s, 0, sizeof(*s));
 388 out:
 389         return rc;
 390 out_kfree:
 391         kfree(s);
 392         goto out;
 393 }
 394
  /*
   * File operations for the route cache listing.  Release goes through
   * seq_release_private, matching the private iterator state allocated
   * in rt_cache_seq_open().
   */
  395 static const struct file_operations rt_cache_seq_fops = {
  396         .owner   = THIS_MODULE,
  397         .open    = rt_cache_seq_open,
  398         .read    = seq_read,
  399         .llseek  = seq_lseek,
  400         .release = seq_release_private,
  401 };
 402
 403
  /*
   * seq_file .start for the per-CPU statistics listing.
   *
   * Position 0 yields SEQ_START_TOKEN (the header).  Position N > 0
   * starts scanning at CPU N - 1, skips CPUs that are not possible,
   * stores the found CPU number + 1 in *pos and returns that CPU's
   * rt_cache_stat.  Returns NULL once no possible CPU remains.
   */
  404 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
  405 {
  406         int cpu;
  407
  408         if (*pos == 0)
  409                 return SEQ_START_TOKEN;
  410
  411         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
  412                 if (!cpu_possible(cpu))
  413                         continue;
  414                 *pos = cpu+1;
  415                 return &per_cpu(rt_cache_stat, cpu);
  416         }
  417         return NULL;
  418 }
 419
  /*
   * seq_file .next for the per-CPU statistics listing: scan from CPU
   * number *pos for the next possible CPU, store its number + 1 in *pos
   * and return its rt_cache_stat, or NULL when none remains.
   */
  420 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  421 {
  422         int cpu;
  423
  424         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
  425                 if (!cpu_possible(cpu))
  426                         continue;
  427                 *pos = cpu+1;
  428                 return &per_cpu(rt_cache_stat, cpu);
  429         }
  430         return NULL;
  431
  432 }
 433
  /* seq_file .stop: the per-CPU iterator holds nothing, so this is empty. */
  434 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
  435 {
  436
  437 }
 438
  /*
   * seq_file .show for the per-CPU statistics listing.
   *
   * Prints the column header for SEQ_START_TOKEN, otherwise one row of
   * 17 zero-padded hexadecimal fields.  The first field is the global
   * ipv4_dst_ops.entries count and is therefore the same on every row;
   * the others are the counters of the CPU that v points at.
   * Always returns 0.
   */
  439 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
  440 {
  441         struct rt_cache_stat *st = v;
  442
  443         if (v == SEQ_START_TOKEN) {
  444                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
  445                 return 0;
  446         }
  447
  448         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
  449                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
  450                    atomic_read(&ipv4_dst_ops.entries),
  451                    st->in_hit,
  452                    st->in_slow_tot,
  453                    st->in_slow_mc,
  454                    st->in_no_route,
  455                    st->in_brd,
  456                    st->in_martian_dst,
  457                    st->in_martian_src,
  458
  459                    st->out_hit,
  460                    st->out_slow_tot,
  461                    st->out_slow_mc,
  462
  463                    st->gc_total,
  464                    st->gc_ignored,
  465                    st->gc_goal_miss,
  466                    st->gc_dst_overflow,
  467                    st->in_hlist_search,
  468                    st->out_hlist_search
  469                 );
  470         return 0;
  471 }
 472
  /* seq_file iterator callbacks for the per-CPU statistics listing. */
  473 static const struct seq_operations rt_cpu_seq_ops = {
  474         .start  = rt_cpu_seq_start,
  475         .next   = rt_cpu_seq_next,
  476         .stop   = rt_cpu_seq_stop,
  477         .show   = rt_cpu_seq_show,
  478 };
 479
 480
  /*
   * open() handler for the per-CPU statistics listing; returns the
   * result of seq_open().  No private state is allocated.
   */
  481 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
  482 {
  483         return seq_open(file, &rt_cpu_seq_ops);
  484 }
 485
  /* File operations for the per-CPU statistics listing. */
  486 static const struct file_operations rt_cpu_seq_fops = {
  487         .owner   = THIS_MODULE,
  488         .open    = rt_cpu_seq_open,
  489         .read    = seq_read,
  490         .llseek  = seq_lseek,
  491         .release = seq_release,
  492 };
 493
 494 #endif /* CONFIG_PROC_FS */
 495
  /*
   * Dispose of a route that has been unlinked from the cache: remove its
   * multipath state, then queue dst_rcu_free through call_rcu_bh()
   * instead of freeing the entry directly.
   */
  496 static __inline__ void rt_free(struct rtable *rt)
  497 {
  498         multipath_remove(rt);
  499         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
  500 }
 501
  /*
   * Same as rt_free(), but additionally calls ip_rt_put() on the route
   * before queuing the RCU-deferred free.
   */
  502 static __inline__ void rt_drop(struct rtable *rt)
  503 {
  504         multipath_remove(rt);
  505         ip_rt_put(rt);
  506         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
  507 }
 508
  /*
   * Return nonzero when rth qualifies for accelerated expiry: it is a
   * broadcast or multicast route, its flow has a nonzero input interface
   * (fl.iif), and another entry follows it in the same hash chain.
   */
  509 static __inline__ int rt_fast_clean(struct rtable *rth)
  510 {
  511         /* Kill broadcast/multicast entries very aggressively, if they
  512            collide in hash table with more useful entries */
  513         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
  514                 rth->fl.iif && rth->u.dst.rt_next;
  515 }
 516
  /*
   * Return nonzero when rth has RTCF_REDIRECTED or RTCF_NOTIFY set, or
   * carries a nonzero expiry time (u.dst.expires).
   */
  517 static __inline__ int rt_valuable(struct rtable *rth)
  518 {
  519         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
  520                 rth->u.dst.expires;
  521 }
 522
  /*
   * Decide whether a cache entry may be removed.
   *
   * @rth:  entry under consideration
   * @tmo1: idle time (jiffies) up to which an ordinary entry is kept
   * @tmo2: idle time (jiffies) up to which a "valuable" entry is kept
   *
   * Returns 0 (keep) when the entry is still referenced.  Returns 1 when
   * its expiry time is set and has been reached.  Otherwise the entry is
   * kept (0) if its idle age is at most tmo1 and it is not a fast-clean
   * candidate, or if its age is at most tmo2 and it is valuable; in all
   * remaining cases it may expire (1).
   */
  523 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
  524 {
  525         unsigned long age;
  526         int ret = 0;
  527
  528         if (atomic_read(&rth->u.dst.__refcnt))
  529                 goto out;
  530
  531         ret = 1;
  532         if (rth->u.dst.expires &&
  533             time_after_eq(jiffies, rth->u.dst.expires))
  534                 goto out;
  535
  536         age = jiffies - rth->u.dst.lastuse;
  537         ret = 0;
  538         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
  539             (age <= tmo2 && rt_valuable(rth)))
  540                 goto out;
  541         ret = 1;
  542 out:    return ret;
  543 }
 544
 545 /* Bits of score are:
 546  * 31: very valuable
 547  * 30: not quite useless
 548  * 29..0: usage counter
 549  */
 550 static inline u32 rt_score(struct rtable *rt)
 551 {
 552         u32 score = jiffies - rt->u.dst.lastuse;
 553
 554         score = ~score & ~(3<<30);
 555
 556         if (rt_valuable(rt))
 557                 score |= (1<<31);
 558
 559         if (!rt->fl.iif ||
 560             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 561                 score |= (1<<30);
 562
 563         return score;
 564 }
 565
  /*
   * Return nonzero when two IPv4 flow keys are equal.
   *
   * Destination, source, mark, oif and iif are XORed pairwise and the
   * differences ORed together, so the keys match only if every field
   * matches.  The TOS comparison loads 16 bits starting at the tos
   * member.
   * NOTE(review): that load presumably also covers the member adjacent
   * to tos in struct flowi -- confirm against the structure layout.
   */
  566 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
  567 {
  568         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
  569                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
  570                 (fl1->mark ^ fl2->mark) |
  571                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
  572                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
  573                 (fl1->oif ^ fl2->oif) |
  574                 (fl1->iif ^ fl2->iif)) == 0;
  575 }
 576
 577 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
  /*
   * Remove an expired balanced (multipath) route together with every
   * other DST_BALANCED entry in the same chain whose flow key matches.
   *
   * @chain_head:    address of the bucket's chain pointer
   * @expentry:      the expired entry that triggered the removal
   * @removed_count: if non-NULL, receives the number of entries freed,
   *                 expentry included
   *
   * expentry is only unlinked inside the loop and is freed once after
   * it, because its flow key is needed for the comparisons.
   *
   * Returns the address of the rt_next link of the first surviving
   * non-balanced entry visited after expentry was passed, or NULL if
   * there is none.  The callers visible in this file invoke it with the
   * bucket's hash lock held and use the result to resume their scan.
   */
  578 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
  579                                                 struct rtable *expentry,
  580                                                 int *removed_count)
  581 {
  582         int passedexpired = 0;
  583         struct rtable **nextstep = NULL;
  584         struct rtable **rthp = chain_head;
  585         struct rtable *rth;
  586
  587         if (removed_count)
  588                 *removed_count = 0;
  589
  590         while ((rth = *rthp) != NULL) {
  591                 if (rth == expentry)
  592                         passedexpired = 1;
  593
  594                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
  595                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
  596                         if (*rthp == expentry) {
                              /* unlink only; freed after the loop */
  597                                 *rthp = rth->u.dst.rt_next;
  598                                 continue;
  599                         } else {
  600                                 *rthp = rth->u.dst.rt_next;
  601                                 rt_free(rth);
  602                                 if (removed_count)
  603                                         ++(*removed_count);
  604                         }
  605                 } else {
  606                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
  607                             passedexpired && !nextstep)
  608                                 nextstep = &rth->u.dst.rt_next;
  609
  610                         rthp = &rth->u.dst.rt_next;
  611                 }
  612         }
  613
  614         rt_free(expentry);
  615         if (removed_count)
  616                 ++(*removed_count);
  617
  618         return nextstep;
  619 }
 620 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 621
 622
  /*
   * Periodic expiry scan of the route cache (timer callback; the
   * argument is unused).
   *
   * Each run visits "goal" buckets, continuing after the bucket where
   * the previous run stopped (static rover).  goal is
   * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout, limited to
   * the table size.  Within a chain the idle timeout passed to
   * rt_may_expire() is halved after every entry that is kept, so
   * entries further down a long chain must have been used more
   * recently to survive.  Entries with an expiry time set are kept
   * until that time passes and are then removed even if referenced.
   *
   * The scan stops early once jiffies has advanced past the value
   * sampled on entry, and the timer is re-armed ip_rt_gc_interval
   * jiffies ahead.
   */
  623 /* This runs via a timer and thus is always in BH context. */
  624 static void rt_check_expire(unsigned long dummy)
  625 {
  626         static unsigned int rover;
  627         unsigned int i = rover, goal;
  628         struct rtable *rth, **rthp;
  629         unsigned long now = jiffies;
  630         u64 mult;
  631
  632         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
  633         if (ip_rt_gc_timeout > 1)
  634                 do_div(mult, ip_rt_gc_timeout);
  635         goal = (unsigned int)mult;
  636         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
  637         for (; goal > 0; goal--) {
  638                 unsigned long tmo = ip_rt_gc_timeout;
  639
  640                 i = (i + 1) & rt_hash_mask;
  641                 rthp = &rt_hash_table[i].chain;
  642
                      /* unlocked peek: skip empty buckets without locking */
  643                 if (*rthp == 0)
  644                         continue;
  645                 spin_lock(rt_hash_lock_addr(i));
  646                 while ((rth = *rthp) != NULL) {
  647                         if (rth->u.dst.expires) {
  648                                 /* Entry is expired even if it is in use */
  649                                 if (time_before_eq(now, rth->u.dst.expires)) {
  650                                         tmo >>= 1;
  651                                         rthp = &rth->u.dst.rt_next;
  652                                         continue;
  653                                 }
  654                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
  655                                 tmo >>= 1;
  656                                 rthp = &rth->u.dst.rt_next;
  657                                 continue;
  658                         }
  659
  660                         /* Cleanup aged off entries. */
  661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
  662                         /* remove all related balanced entries if necessary */
  663                         if (rth->u.dst.flags & DST_BALANCED) {
  664                                 rthp = rt_remove_balanced_route(
  665                                         &rt_hash_table[i].chain,
  666                                         rth, NULL);
                              /* NULL: nowhere to resume in this chain */
  667                                 if (!rthp)
  668                                         break;
  669                         } else {
  670                                 *rthp = rth->u.dst.rt_next;
  671                                 rt_free(rth);
  672                         }
  673 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
  674                         *rthp = rth->u.dst.rt_next;
  675                         rt_free(rth);
  676 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
  677                 }
  678                 spin_unlock(rt_hash_lock_addr(i));
  679
  680                 /* Fallback loop breaker. */
  681                 if (time_after(jiffies, now))
  682                         break;
  683         }
  684         rover = i;
  685         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
  686 }
 687
  /*
   * Flush the whole route cache (the argument is unused).
   *
   * Clears rt_deadline and draws a fresh random rt_hash_rnd.  Then,
   * for every bucket from rt_hash_mask down to 0, the chain is detached
   * under the bucket's lock with bottom halves disabled, and its
   * entries are handed to rt_free() after the lock has been dropped.
   */
  688 /* This can run from both BH and non-BH contexts, the latter
  689  * in the case of a forced flush event.
  690  */
  691 static void rt_run_flush(unsigned long dummy)
  692 {
  693         int i;
  694         struct rtable *rth, *next;
  695
  696         rt_deadline = 0;
  697
  698         get_random_bytes(&rt_hash_rnd, 4);
  699
  700         for (i = rt_hash_mask; i >= 0; i--) {
  701                 spin_lock_bh(rt_hash_lock_addr(i));
  702                 rth = rt_hash_table[i].chain;
  703                 if (rth)
  704                         rt_hash_table[i].chain = NULL;
  705                 spin_unlock_bh(rt_hash_lock_addr(i));
  706
  707                 for (; rth; rth = next) {
  708                         next = rth->u.dst.rt_next;
  709                         rt_free(rth);
  710                 }
  711         }
  712 }
 713
  /* Serializes manipulation of rt_flush_timer and rt_deadline. */
  714 static DEFINE_SPINLOCK(rt_flush_lock);
  715
  /*
   * Request a flush of the route cache.
   *
   * @delay: jiffies to wait before flushing; a negative value selects
   *         ip_rt_min_delay, and 0 flushes immediately.
   *
   * Multipath state is flushed right away.  If a flush timer was already
   * pending and the request is not immediate, the delay is capped at the
   * time left until rt_deadline; for callers outside softirq context the
   * remaining time is treated as 0 when it is less than
   * ip_rt_max_delay - ip_rt_min_delay.  A resulting delay <= 0 runs
   * rt_run_flush() synchronously after dropping the lock; otherwise the
   * deadline is set (if not already) to now + ip_rt_max_delay and the
   * timer is armed for now + delay.
   */
  716 void rt_cache_flush(int delay)
  717 {
  718         unsigned long now = jiffies;
  719         int user_mode = !in_softirq();
  720
  721         if (delay < 0)
  722                 delay = ip_rt_min_delay;
  723
  724         /* flush existing multipath state*/
  725         multipath_flush();
  726
  727         spin_lock_bh(&rt_flush_lock);
  728
  729         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
  730                 long tmo = (long)(rt_deadline - now);
  731
  732                 /* If flush timer is already running
  733                    and flush request is not immediate (delay > 0):
  734
  735                    if the deadline has not been reached, extend the timer
  736                    to "delay", otherwise fire it at deadline time.
  737                  */
  738
  739                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
  740                         tmo = 0;
  741
  742                 if (delay > tmo)
  743                         delay = tmo;
  744         }
  745
  746         if (delay <= 0) {
  747                 spin_unlock_bh(&rt_flush_lock);
  748                 rt_run_flush(0);
  749                 return;
  750         }
  751
  752         if (rt_deadline == 0)
  753                 rt_deadline = now + ip_rt_max_delay;
  754
  755         mod_timer(&rt_flush_timer, now+delay);
  756         spin_unlock_bh(&rt_flush_lock);
  757 }
 758
  /*
   * Timer callback (the argument is unused): request an immediate cache
   * flush -- which, through rt_run_flush(), also draws a new
   * rt_hash_rnd -- and re-arm rt_secret_timer ip_rt_secret_interval
   * jiffies after the time sampled on entry.
   */
  759 static void rt_secret_rebuild(unsigned long dummy)
  760 {
  761         unsigned long now = jiffies;
  762
  763         rt_cache_flush(0);
  764         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
  765 }
 766
  767 /*
  768    Short description of GC goals.
  769
  770    We want to build an algorithm that keeps the routing cache
  771    at an equilibrium point, where the number of aged-off entries
  772    is kept approximately equal to the number of newly generated ones.
  773
  774    The current expiration strength is the variable "expire".
  775    We try to adjust it dynamically, so that when networking
  776    is idle "expire" is large enough to keep enough warm entries,
  777    and when load increases it shrinks to limit the cache size.
  778  */
 779
 780 static int rt_garbage_collect(void)
 781 {
 782         static unsigned long expire = RT_GC_TIMEOUT;
 783         static unsigned long last_gc;
 784         static int rover;
 785         static int equilibrium;
 786         struct rtable *rth, **rthp;
 787         unsigned long now = jiffies;
 788         int goal;
 789
 790         /*
 791          * Garbage collection is pretty expensive,
 792          * do not make it too frequently.
 793          */
 794
 795         RT_CACHE_STAT_INC(gc_total);
 796
 797         if (now - last_gc < ip_rt_gc_min_interval &&
 798             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 799                 RT_CACHE_STAT_INC(gc_ignored);
 800                 goto out;
 801         }
 802
 803         /* Calculate number of entries, which we want to expire now. */
 804         goal = atomic_read(&ipv4_dst_ops.entries) -
 805                 (ip_rt_gc_elasticity << rt_hash_log);
 806         if (goal <= 0) {
 807                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 808                         equilibrium = ipv4_dst_ops.gc_thresh;
 809                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 810                 if (goal > 0) {
 811                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 812                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 813                 }
 814         } else {
 815                 /* We are in dangerous area. Try to reduce cache really
 816                  * aggressively.
 817                  */
 818                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 819                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 820         }
 821
 822         if (now - last_gc >= ip_rt_gc_min_interval)
 823                 last_gc = now;
 824
 825         if (goal <= 0) {
 826                 equilibrium += goal;
 827                 goto work_done;
 828         }
 829
 830         do {
 831                 int i, k;
 832
 833                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 834                         unsigned long tmo = expire;
 835
 836                         k = (k + 1) & rt_hash_mask;
 837                         rthp = &rt_hash_table[k].chain;
 838                         spin_lock_bh(rt_hash_lock_addr(k));
 839                         while ((rth = *rthp) != NULL) {
 840                                 if (!rt_may_expire(rth, tmo, expire)) {
 841                                         tmo >>= 1;
 842                                         rthp = &rth->u.dst.rt_next;
 843                                         continue;
 844                                 }
 845 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 846                                 /* remove all related balanced entries
 847                                  * if necessary
 848                                  */
 849                                 if (rth->u.dst.flags & DST_BALANCED) {
 850                                         int r;
 851
 852                                         rthp = rt_remove_balanced_route(
 853                                                 &rt_hash_table[k].chain,
 854                                                 rth,
 855                                                 &r);
 856                                         goal -= r;
 857                                         if (!rthp)
 858                                                 break;
 859                                 } else {
 860                                         *rthp = rth->u.dst.rt_next;
 861                                         rt_free(rth);
 862                                         goal--;
 863                                 }
 864 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 865                                 *rthp = rth->u.dst.rt_next;
 866                                 rt_free(rth);
 867                                 goal--;
 868 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 869                         }
 870                         spin_unlock_bh(rt_hash_lock_addr(k));
 871                         if (goal <= 0)
 872                                 break;
 873                 }
 874                 rover = k;
 875
 876                 if (goal <= 0)
 877                         goto work_done;
 878
 879                 /* Goal is not achieved. We stop process if:
 880
 881                    - if expire reduced to zero. Otherwise, expire is halfed.
 882                    - if table is not full.
 883                    - if we are called from interrupt.
 884                    - jiffies check is just fallback/debug loop breaker.
 885                      We will not spin here for long time in any case.
 886                  */
 887
 888                 RT_CACHE_STAT_INC(gc_goal_miss);
 889
 890                 if (expire == 0)
 891                         break;
 892
 893                 expire >>= 1;
 894 #if RT_CACHE_DEBUG >= 2
 895                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 896                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 897 #endif
 898
 899                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 900                         goto out;
 901         } while (!in_softirq() && time_before_eq(jiffies, now));
 902
 903         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 904                 goto out;
 905         if (net_ratelimit())
 906                 printk(KERN_WARNING "dst cache overflow\n");
 907         RT_CACHE_STAT_INC(gc_dst_overflow);
 908         return 1;
 909
 910 work_done:
 911         expire += ip_rt_gc_min_interval;
 912         if (expire > ip_rt_gc_timeout ||
 913             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 914                 expire = ip_rt_gc_timeout;
 915 #if RT_CACHE_DEBUG >= 2
 916         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 917                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 918 #endif
 919 out:    return 0;
 920 }
 921
 922 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 923 {
 924         struct rtable   *rth, **rthp;
 925         unsigned long   now;
 926         struct rtable *cand, **candp;
 927         u32             min_score;
 928         int             chain_length;
 929         int attempts = !in_softirq();
 930
 931 restart:
 932         chain_length = 0;
 933         min_score = ~(u32)0;
 934         cand = NULL;
 935         candp = NULL;
 936         now = jiffies;
 937
 938         rthp = &rt_hash_table[hash].chain;
 939
 940         spin_lock_bh(rt_hash_lock_addr(hash));
 941         while ((rth = *rthp) != NULL) {
 942 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 943                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 944                     compare_keys(&rth->fl, &rt->fl)) {
 945 #else
 946                 if (compare_keys(&rth->fl, &rt->fl)) {
 947 #endif
 948                         /* Put it first */
 949                         *rthp = rth->u.dst.rt_next;
 950                         /*
 951                          * Since lookup is lockfree, the deletion
 952                          * must be visible to another weakly ordered CPU before
 953                          * the insertion at the start of the hash chain.
 954                          */
 955                         rcu_assign_pointer(rth->u.dst.rt_next,
 956                                            rt_hash_table[hash].chain);
 957                         /*
 958                          * Since lookup is lockfree, the update writes
 959                          * must be ordered for consistency on SMP.
 960                          */
 961                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 962
 963                         rth->u.dst.__use++;
 964                         dst_hold(&rth->u.dst);
 965                         rth->u.dst.lastuse = now;
 966                         spin_unlock_bh(rt_hash_lock_addr(hash));
 967
 968                         rt_drop(rt);
 969                         *rp = rth;
 970                         return 0;
 971                 }
 972
 973                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 974                         u32 score = rt_score(rth);
 975
 976                         if (score <= min_score) {
 977                                 cand = rth;
 978                                 candp = rthp;
 979                                 min_score = score;
 980                         }
 981                 }
 982
 983                 chain_length++;
 984
 985                 rthp = &rth->u.dst.rt_next;
 986         }
 987
 988         if (cand) {
 989                 /* ip_rt_gc_elasticity used to be average length of chain
 990                  * length, when exceeded gc becomes really aggressive.
 991                  *
 992                  * The second limit is less certain. At the moment it allows
 993                  * only 2 entries per bucket. We will see.
 994                  */
 995                 if (chain_length > ip_rt_gc_elasticity) {
 996                         *candp = cand->u.dst.rt_next;
 997                         rt_free(cand);
 998                 }
 999         }
1000
1001         /* Try to bind route to arp only if it is output
1002            route or unicast forwarding path.
1003          */
1004         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1005                 int err = arp_bind_neighbour(&rt->u.dst);
1006                 if (err) {
1007                         spin_unlock_bh(rt_hash_lock_addr(hash));
1008
1009                         if (err != -ENOBUFS) {
1010                                 rt_drop(rt);
1011                                 return err;
1012                         }
1013
1014                         /* Neighbour tables are full and nothing
1015                            can be released. Try to shrink route cache,
1016                            it is most likely it holds some neighbour records.
1017                          */
1018                         if (attempts-- > 0) {
1019                                 int saved_elasticity = ip_rt_gc_elasticity;
1020                                 int saved_int = ip_rt_gc_min_interval;
1021                                 ip_rt_gc_elasticity     = 1;
1022                                 ip_rt_gc_min_interval   = 0;
1023                                 rt_garbage_collect();
1024                                 ip_rt_gc_min_interval   = saved_int;
1025                                 ip_rt_gc_elasticity     = saved_elasticity;
1026                                 goto restart;
1027                         }
1028
1029                         if (net_ratelimit())
1030                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1031                         rt_drop(rt);
1032                         return -ENOBUFS;
1033                 }
1034         }
1035
1036         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1037 #if RT_CACHE_DEBUG >= 2
1038         if (rt->u.dst.rt_next) {
1039                 struct rtable *trt;
1040                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1041                        NIPQUAD(rt->rt_dst));
1042                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1043                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1044                 printk("\n");
1045         }
1046 #endif
1047         rt_hash_table[hash].chain = rt;
1048         spin_unlock_bh(rt_hash_lock_addr(hash));
1049         *rp = rt;
1050         return 0;
1051 }
1052
1053 void rt_bind_peer(struct rtable *rt, int create)
1054 {
1055         static DEFINE_SPINLOCK(rt_peer_lock);
1056         struct inet_peer *peer;
1057
1058         peer = inet_getpeer(rt->rt_dst, create);
1059
1060         spin_lock_bh(&rt_peer_lock);
1061         if (rt->peer == NULL) {
1062                 rt->peer = peer;
1063                 peer = NULL;
1064         }
1065         spin_unlock_bh(&rt_peer_lock);
1066         if (peer)
1067                 inet_putpeer(peer);
1068 }
1069
1070 /*
1071  * Peer allocation may fail only in serious out-of-memory conditions.  However
1072  * we still can generate some output.
1073  * Random ID selection looks a bit dangerous because we have no chances to
1074  * select ID being unique in a reasonable period of time.
1075  * But broken packet identifier may be better than no packet at all.
1076  */
1077 static void ip_select_fb_ident(struct iphdr *iph)
1078 {
1079         static DEFINE_SPINLOCK(ip_fb_id_lock);
1080         static u32 ip_fallback_id;
1081         u32 salt;
1082
1083         spin_lock_bh(&ip_fb_id_lock);
1084         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1085         iph->id = htons(salt & 0xFFFF);
1086         ip_fallback_id = salt;
1087         spin_unlock_bh(&ip_fb_id_lock);
1088 }
1089
1090 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1091 {
1092         struct rtable *rt = (struct rtable *) dst;
1093
1094         if (rt) {
1095                 if (rt->peer == NULL)
1096                         rt_bind_peer(rt, 1);
1097
1098                 /* If peer is attached to destination, it is never detached,
1099                    so that we need not to grab a lock to dereference it.
1100                  */
1101                 if (rt->peer) {
1102                         iph->id = htons(inet_getid(rt->peer, more));
1103                         return;
1104                 }
1105         } else
1106                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1107                        __builtin_return_address(0));
1108
1109         ip_select_fb_ident(iph);
1110 }
1111
1112 static void rt_del(unsigned hash, struct rtable *rt)
1113 {
1114         struct rtable **rthp;
1115
1116         spin_lock_bh(rt_hash_lock_addr(hash));
1117         ip_rt_put(rt);
1118         for (rthp = &rt_hash_table[hash].chain; *rthp;
1119              rthp = &(*rthp)->u.dst.rt_next)
1120                 if (*rthp == rt) {
1121                         *rthp = rt->u.dst.rt_next;
1122                         rt_free(rt);
1123                         break;
1124                 }
1125         spin_unlock_bh(rt_hash_lock_addr(hash));
1126 }
1127
1128 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1129                     __be32 saddr, struct net_device *dev)
1130 {
1131         int i, k;
1132         struct in_device *in_dev = in_dev_get(dev);
1133         struct rtable *rth, **rthp;
1134         __be32  skeys[2] = { saddr, 0 };
1135         int  ikeys[2] = { dev->ifindex, 0 };
1136         struct netevent_redirect netevent;
1137
1138         if (!in_dev)
1139                 return;
1140
1141         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1142             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1143                 goto reject_redirect;
1144
1145         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1146                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1147                         goto reject_redirect;
1148                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1149                         goto reject_redirect;
1150         } else {
1151                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1152                         goto reject_redirect;
1153         }
1154
1155         for (i = 0; i < 2; i++) {
1156                 for (k = 0; k < 2; k++) {
1157                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1158
1159                         rthp=&rt_hash_table[hash].chain;
1160
1161                         rcu_read_lock();
1162                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1163                                 struct rtable *rt;
1164
1165                                 if (rth->fl.fl4_dst != daddr ||
1166                                     rth->fl.fl4_src != skeys[i] ||
1167                                     rth->fl.oif != ikeys[k] ||
1168                                     rth->fl.iif != 0) {
1169                                         rthp = &rth->u.dst.rt_next;
1170                                         continue;
1171                                 }
1172
1173                                 if (rth->rt_dst != daddr ||
1174                                     rth->rt_src != saddr ||
1175                                     rth->u.dst.error ||
1176                                     rth->rt_gateway != old_gw ||
1177                                     rth->u.dst.dev != dev)
1178                                         break;
1179
1180                                 dst_hold(&rth->u.dst);
1181                                 rcu_read_unlock();
1182
1183                                 rt = dst_alloc(&ipv4_dst_ops);
1184                                 if (rt == NULL) {
1185                                         ip_rt_put(rth);
1186                                         in_dev_put(in_dev);
1187                                         return;
1188                                 }
1189
1190                                 /* Copy all the information. */
1191                                 *rt = *rth;
1192                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1193                                 rt->u.dst.__use         = 1;
1194                                 atomic_set(&rt->u.dst.__refcnt, 1);
1195                                 rt->u.dst.child         = NULL;
1196                                 if (rt->u.dst.dev)
1197                                         dev_hold(rt->u.dst.dev);
1198                                 if (rt->idev)
1199                                         in_dev_hold(rt->idev);
1200                                 rt->u.dst.obsolete      = 0;
1201                                 rt->u.dst.lastuse       = jiffies;
1202                                 rt->u.dst.path          = &rt->u.dst;
1203                                 rt->u.dst.neighbour     = NULL;
1204                                 rt->u.dst.hh            = NULL;
1205                                 rt->u.dst.xfrm          = NULL;
1206
1207                                 rt->rt_flags            |= RTCF_REDIRECTED;
1208
1209                                 /* Gateway is different ... */
1210                                 rt->rt_gateway          = new_gw;
1211
1212                                 /* Redirect received -> path was valid */
1213                                 dst_confirm(&rth->u.dst);
1214
1215                                 if (rt->peer)
1216                                         atomic_inc(&rt->peer->refcnt);
1217
1218                                 if (arp_bind_neighbour(&rt->u.dst) ||
1219                                     !(rt->u.dst.neighbour->nud_state &
1220                                             NUD_VALID)) {
1221                                         if (rt->u.dst.neighbour)
1222                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1223                                         ip_rt_put(rth);
1224                                         rt_drop(rt);
1225                                         goto do_next;
1226                                 }
1227
1228                                 netevent.old = &rth->u.dst;
1229                                 netevent.new = &rt->u.dst;
1230                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1231                                                         &netevent);
1232
1233                                 rt_del(hash, rth);
1234                                 if (!rt_intern_hash(hash, rt, &rt))
1235                                         ip_rt_put(rt);
1236                                 goto do_next;
1237                         }
1238                         rcu_read_unlock();
1239                 do_next:
1240                         ;
1241                 }
1242         }
1243         in_dev_put(in_dev);
1244         return;
1245
1246 reject_redirect:
1247 #ifdef CONFIG_IP_ROUTE_VERBOSE
1248         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1249                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1250                         "%u.%u.%u.%u ignored.\n"
1251                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1252                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1253                        NIPQUAD(saddr), NIPQUAD(daddr));
1254 #endif
1255         in_dev_put(in_dev);
1256 }
1257
1258 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1259 {
1260         struct rtable *rt = (struct rtable*)dst;
1261         struct dst_entry *ret = dst;
1262
1263         if (rt) {
1264                 if (dst->obsolete) {
1265                         ip_rt_put(rt);
1266                         ret = NULL;
1267                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1268                            rt->u.dst.expires) {
1269                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1270                                                 rt->fl.oif);
1271 #if RT_CACHE_DEBUG >= 1
1272                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1273                                           "%u.%u.%u.%u/%02x dropped\n",
1274                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1275 #endif
1276                         rt_del(hash, rt);
1277                         ret = NULL;
1278                 }
1279         }
1280         return ret;
1281 }
1282
1283 /*
1284  * Algorithm:
1285  *      1. The first ip_rt_redirect_number redirects are sent
1286  *         with exponential backoff, then we stop sending them at all,
1287  *         assuming that the host ignores our redirects.
1288  *      2. If we did not see packets requiring redirects
1289  *         during ip_rt_redirect_silence, we assume that the host
1290  *         forgot redirected route and start to send redirects again.
1291  *
1292  * This algorithm is much cheaper and more intelligent than dumb load limiting
1293  * in icmp.c.
1294  *
1295  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1296  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1297  */
1298
1299 void ip_rt_send_redirect(struct sk_buff *skb)
1300 {
1301         struct rtable *rt = (struct rtable*)skb->dst;
1302         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1303
1304         if (!in_dev)
1305                 return;
1306
1307         if (!IN_DEV_TX_REDIRECTS(in_dev))
1308                 goto out;
1309
1310         /* No redirected packets during ip_rt_redirect_silence;
1311          * reset the algorithm.
1312          */
1313         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1314                 rt->u.dst.rate_tokens = 0;
1315
1316         /* Too many ignored redirects; do not send anything
1317          * set u.dst.rate_last to the last seen redirected packet.
1318          */
1319         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1320                 rt->u.dst.rate_last = jiffies;
1321                 goto out;
1322         }
1323
1324         /* Check for load limit; set rate_last to the latest sent
1325          * redirect.
1326          */
1327         if (rt->u.dst.rate_tokens == 0 ||
1328             time_after(jiffies,
1329                        (rt->u.dst.rate_last +
1330                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332                 rt->u.dst.rate_last = jiffies;
1333                 ++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337                     net_ratelimit())
1338                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1341                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342 #endif
1343         }
1344 out:
1345         in_dev_put(in_dev);
1346 }
1347
1348 static int ip_error(struct sk_buff *skb)
1349 {
1350         struct rtable *rt = (struct rtable*)skb->dst;
1351         unsigned long now;
1352         int code;
1353
1354         switch (rt->u.dst.error) {
1355                 case EINVAL:
1356                 default:
1357                         goto out;
1358                 case EHOSTUNREACH:
1359                         code = ICMP_HOST_UNREACH;
1360                         break;
1361                 case ENETUNREACH:
1362                         code = ICMP_NET_UNREACH;
1363                         break;
1364                 case EACCES:
1365                         code = ICMP_PKT_FILTERED;
1366                         break;
1367         }
1368
1369         now = jiffies;
1370         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1373         rt->u.dst.rate_last = now;
1374         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377         }
1378
1379 out:    kfree_skb(skb);
1380         return 0;
1381 }
1382
/*
 *      MTU plateau table, in descending order.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 if
 * @old_mtu is not above any table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        for (plateau = mtu_plateau; plateau < end; plateau++) {
                if (*plateau < old_mtu)
                        return *plateau;
        }
        return 68;
}
1400
1401 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402 {
1403         int i;
1404         unsigned short old_mtu = ntohs(iph->tot_len);
1405         struct rtable *rth;
1406         __be32  skeys[2] = { iph->saddr, 0, };
1407         __be32  daddr = iph->daddr;
1408         unsigned short est_mtu = 0;
1409
1410         if (ipv4_config.no_pmtu_disc)
1411                 return 0;
1412
1413         for (i = 0; i < 2; i++) {
1414                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1415
1416                 rcu_read_lock();
1417                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1419                         if (rth->fl.fl4_dst == daddr &&
1420                             rth->fl.fl4_src == skeys[i] &&
1421                             rth->rt_dst  == daddr &&
1422                             rth->rt_src  == iph->saddr &&
1423                             rth->fl.iif == 0 &&
1424                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425                                 unsigned short mtu = new_mtu;
1426
1427                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1428
1429                                         /* BSD 4.2 compatibility hack :-( */
1430                                         if (mtu == 0 &&
1431                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432                                             old_mtu >= 68 + (iph->ihl << 2))
1433                                                 old_mtu -= iph->ihl << 2;
1434
1435                                         mtu = guess_mtu(old_mtu);
1436                                 }
1437                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1439                                                 dst_confirm(&rth->u.dst);
1440                                                 if (mtu < ip_rt_min_pmtu) {
1441                                                         mtu = ip_rt_min_pmtu;
1442                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1443                                                                 (1 << RTAX_MTU);
1444                                                 }
1445                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446                                                 dst_set_expires(&rth->u.dst,
1447                                                         ip_rt_mtu_expires);
1448                                         }
1449                                         est_mtu = mtu;
1450                                 }
1451                         }
1452                 }
1453                 rcu_read_unlock();
1454         }
1455         return est_mtu ? : new_mtu;
1456 }
1457
1458 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459 {
1460         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461             !(dst_metric_locked(dst, RTAX_MTU))) {
1462                 if (mtu < ip_rt_min_pmtu) {
1463                         mtu = ip_rt_min_pmtu;
1464                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465                 }
1466                 dst->metrics[RTAX_MTU-1] = mtu;
1467                 dst_set_expires(dst, ip_rt_mtu_expires);
1468                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1469         }
1470 }
1471
1472 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1473 {
1474         return NULL;
1475 }
1476
1477 static void ipv4_dst_destroy(struct dst_entry *dst)
1478 {
1479         struct rtable *rt = (struct rtable *) dst;
1480         struct inet_peer *peer = rt->peer;
1481         struct in_device *idev = rt->idev;
1482
1483         if (peer) {
1484                 rt->peer = NULL;
1485                 inet_putpeer(peer);
1486         }
1487
1488         if (idev) {
1489                 rt->idev = NULL;
1490                 in_dev_put(idev);
1491         }
1492 }
1493
1494 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495                             int how)
1496 {
1497         struct rtable *rt = (struct rtable *) dst;
1498         struct in_device *idev = rt->idev;
1499         if (dev != &loopback_dev && idev && idev->dev == dev) {
1500                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501                 if (loopback_idev) {
1502                         rt->idev = loopback_idev;
1503                         in_dev_put(idev);
1504                 }
1505         }
1506 }
1507
1508 static void ipv4_link_failure(struct sk_buff *skb)
1509 {
1510         struct rtable *rt;
1511
1512         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513
1514         rt = (struct rtable *) skb->dst;
1515         if (rt)
1516                 dst_set_expires(&rt->u.dst, 0);
1517 }
1518
1519 static int ip_rt_bug(struct sk_buff *skb)
1520 {
1521         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1523                 skb->dev ? skb->dev->name : "?");
1524         kfree_skb(skb);
1525         return 0;
1526 }
1527
1528 /*
1529    We do not cache source address of outgoing interface,
1530    because it is used only by IP RR, TS and SRR options,
1531    so that it out of fast path.
1532
1533    BTW remember: "addr" is allowed to be not aligned
1534    in IP options!
1535  */
1536
1537 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538 {
1539         __be32 src;
1540         struct fib_result res;
1541
1542         if (rt->fl.iif == 0)
1543                 src = rt->rt_src;
1544         else if (fib_lookup(&rt->fl, &res) == 0) {
1545                 src = FIB_RES_PREFSRC(res);
1546                 fib_res_put(&res);
1547         } else
1548                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549                                         RT_SCOPE_UNIVERSE);
1550         memcpy(addr, &src, 4);
1551 }
1552
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled separately, and a half is only filled in from @tag
 * when it is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        struct dst_entry *dst = &rt->u.dst;

        if ((dst->tclassid & 0xFFFF) == 0)
                dst->tclassid |= tag & 0xFFFF;

        if ((dst->tclassid & 0xFFFF0000) == 0)
                dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1562
1563 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564 {
1565         struct fib_info *fi = res->fi;
1566
1567         if (fi) {
1568                 if (FIB_RES_GW(*res) &&
1569                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570                         rt->rt_gateway = FIB_RES_GW(*res);
1571                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572                        sizeof(rt->u.dst.metrics));
1573                 if (fi->fib_mtu == 0) {
1574                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576                             rt->rt_gateway != rt->rt_dst &&
1577                             rt->u.dst.dev->mtu > 576)
1578                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579                 }
1580 #ifdef CONFIG_NET_CLS_ROUTE
1581                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582 #endif
1583         } else
1584                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585
1586         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592                                        ip_rt_min_advmss);
1593         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597 #ifdef CONFIG_IP_MULTIPLE_TABLES
1598         set_class_tag(rt, fib_rules_tclass(res));
1599 #endif
1600         set_class_tag(rt, itag);
1601 #endif
1602         rt->rt_type = res->type;
1603 }
1604
1605 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606                                 u8 tos, struct net_device *dev, int our)
1607 {
1608         unsigned hash;
1609         struct rtable *rth;
1610         __be32 spec_dst;
1611         struct in_device *in_dev = in_dev_get(dev);
1612         u32 itag = 0;
1613
1614         /* Primary sanity checks. */
1615
1616         if (in_dev == NULL)
1617                 return -EINVAL;
1618
1619         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620             skb->protocol != htons(ETH_P_IP))
1621                 goto e_inval;
1622
1623         if (ZERONET(saddr)) {
1624                 if (!LOCAL_MCAST(daddr))
1625                         goto e_inval;
1626                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627         } else if (fib_validate_source(saddr, 0, tos, 0,
1628                                         dev, &spec_dst, &itag) < 0)
1629                 goto e_inval;
1630
1631         rth = dst_alloc(&ipv4_dst_ops);
1632         if (!rth)
1633                 goto e_nobufs;
1634
1635         rth->u.dst.output= ip_rt_bug;
1636
1637         atomic_set(&rth->u.dst.__refcnt, 1);
1638         rth->u.dst.flags= DST_HOST;
1639         if (in_dev->cnf.no_policy)
1640                 rth->u.dst.flags |= DST_NOPOLICY;
1641         rth->fl.fl4_dst = daddr;
1642         rth->rt_dst     = daddr;
1643         rth->fl.fl4_tos = tos;
1644         rth->fl.mark    = skb->mark;
1645         rth->fl.fl4_src = saddr;
1646         rth->rt_src     = saddr;
1647 #ifdef CONFIG_NET_CLS_ROUTE
1648         rth->u.dst.tclassid = itag;
1649 #endif
1650         rth->rt_iif     =
1651         rth->fl.iif     = dev->ifindex;
1652         rth->u.dst.dev  = &loopback_dev;
1653         dev_hold(rth->u.dst.dev);
1654         rth->idev       = in_dev_get(rth->u.dst.dev);
1655         rth->fl.oif     = 0;
1656         rth->rt_gateway = daddr;
1657         rth->rt_spec_dst= spec_dst;
1658         rth->rt_type    = RTN_MULTICAST;
1659         rth->rt_flags   = RTCF_MULTICAST;
1660         if (our) {
1661                 rth->u.dst.input= ip_local_deliver;
1662                 rth->rt_flags |= RTCF_LOCAL;
1663         }
1664
1665 #ifdef CONFIG_IP_MROUTE
1666         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667                 rth->u.dst.input = ip_mr_input;
1668 #endif
1669         RT_CACHE_STAT_INC(in_slow_mc);
1670
1671         in_dev_put(in_dev);
1672         hash = rt_hash(daddr, saddr, dev->ifindex);
1673         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674
1675 e_nobufs:
1676         in_dev_put(in_dev);
1677         return -ENOBUFS;
1678
1679 e_inval:
1680         in_dev_put(in_dev);
1681         return -EINVAL;
1682 }
1683
1684
1685 static void ip_handle_martian_source(struct net_device *dev,
1686                                      struct in_device *in_dev,
1687                                      struct sk_buff *skb,
1688                                      __be32 daddr,
1689                                      __be32 saddr)
1690 {
1691         RT_CACHE_STAT_INC(in_martian_src);
1692 #ifdef CONFIG_IP_ROUTE_VERBOSE
1693         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694                 /*
1695                  *      RFC1812 recommendation, if source is martian,
1696                  *      the only hint is MAC header.
1697                  */
1698                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699                         "%u.%u.%u.%u, on dev %s\n",
1700                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1702                         int i;
1703                         const unsigned char *p = skb_mac_header(skb);
1704                         printk(KERN_WARNING "ll header: ");
1705                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1706                                 printk("%02x", *p);
1707                                 if (i < (dev->hard_header_len - 1))
1708                                         printk(":");
1709                         }
1710                         printk("\n");
1711                 }
1712         }
1713 #endif
1714 }
1715
1716 static inline int __mkroute_input(struct sk_buff *skb,
1717                                   struct fib_result* res,
1718                                   struct in_device *in_dev,
1719                                   __be32 daddr, __be32 saddr, u32 tos,
1720                                   struct rtable **result)
1721 {
1722
1723         struct rtable *rth;
1724         int err;
1725         struct in_device *out_dev;
1726         unsigned flags = 0;
1727         __be32 spec_dst;
1728         u32 itag;
1729
1730         /* get a working reference to the output device */
1731         out_dev = in_dev_get(FIB_RES_DEV(*res));
1732         if (out_dev == NULL) {
1733                 if (net_ratelimit())
1734                         printk(KERN_CRIT "Bug in ip_route_input" \
1735                                "_slow(). Please, report\n");
1736                 return -EINVAL;
1737         }
1738
1739
1740         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1741                                   in_dev->dev, &spec_dst, &itag);
1742         if (err < 0) {
1743                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1744                                          saddr);
1745
1746                 err = -EINVAL;
1747                 goto cleanup;
1748         }
1749
1750         if (err)
1751                 flags |= RTCF_DIRECTSRC;
1752
1753         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754             (IN_DEV_SHARED_MEDIA(out_dev) ||
1755              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756                 flags |= RTCF_DOREDIRECT;
1757
1758         if (skb->protocol != htons(ETH_P_IP)) {
1759                 /* Not IP (i.e. ARP). Do not create route, if it is
1760                  * invalid for proxy arp. DNAT routes are always valid.
1761                  */
1762                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1763                         err = -EINVAL;
1764                         goto cleanup;
1765                 }
1766         }
1767
1768
1769         rth = dst_alloc(&ipv4_dst_ops);
1770         if (!rth) {
1771                 err = -ENOBUFS;
1772                 goto cleanup;
1773         }
1774
1775         atomic_set(&rth->u.dst.__refcnt, 1);
1776         rth->u.dst.flags= DST_HOST;
1777 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778         if (res->fi->fib_nhs > 1)
1779                 rth->u.dst.flags |= DST_BALANCED;
1780 #endif
1781         if (in_dev->cnf.no_policy)
1782                 rth->u.dst.flags |= DST_NOPOLICY;
1783         if (out_dev->cnf.no_xfrm)
1784                 rth->u.dst.flags |= DST_NOXFRM;
1785         rth->fl.fl4_dst = daddr;
1786         rth->rt_dst     = daddr;
1787         rth->fl.fl4_tos = tos;
1788         rth->fl.mark    = skb->mark;
1789         rth->fl.fl4_src = saddr;
1790         rth->rt_src     = saddr;
1791         rth->rt_gateway = daddr;
1792         rth->rt_iif     =
1793                 rth->fl.iif     = in_dev->dev->ifindex;
1794         rth->u.dst.dev  = (out_dev)->dev;
1795         dev_hold(rth->u.dst.dev);
1796         rth->idev       = in_dev_get(rth->u.dst.dev);
1797         rth->fl.oif     = 0;
1798         rth->rt_spec_dst= spec_dst;
1799
1800         rth->u.dst.input = ip_forward;
1801         rth->u.dst.output = ip_output;
1802
1803         rt_set_nexthop(rth, res, itag);
1804
1805         rth->rt_flags = flags;
1806
1807         *result = rth;
1808         err = 0;
1809  cleanup:
1810         /* release the working reference to the output device */
1811         in_dev_put(out_dev);
1812         return err;
1813 }
1814
1815 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816                                        struct fib_result* res,
1817                                        const struct flowi *fl,
1818                                        struct in_device *in_dev,
1819                                        __be32 daddr, __be32 saddr, u32 tos)
1820 {
1821         struct rtable* rth = NULL;
1822         int err;
1823         unsigned hash;
1824
1825 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1826         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827                 fib_select_multipath(fl, res);
1828 #endif
1829
1830         /* create a routing cache entry */
1831         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832         if (err)
1833                 return err;
1834
1835         /* put it into the cache */
1836         hash = rt_hash(daddr, saddr, fl->iif);
1837         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1838 }
1839
/*
 * Create the input route cache entry (or entries) for a packet.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this is just a call to
 * ip_mkroute_input_def().  With it, a result with fewer than two next
 * hops takes the same single-path route; otherwise one cache entry per
 * next hop is built with __mkroute_input(), interned with
 * rt_intern_hash() and reported to multipath_set_nhinfo().  The entry
 * handed back by the last rt_intern_hash() call is stored in skb->dst.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * __mkroute_input() / rt_intern_hash().
 */
1840 static inline int ip_mkroute_input(struct sk_buff *skb,
1841                                    struct fib_result* res,
1842                                    const struct flowi *fl,
1843                                    struct in_device *in_dev,
1844                                    __be32 daddr, __be32 saddr, u32 tos)
1845 {
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847         struct rtable* rth = NULL, *rtres;
1848         unsigned char hop, hopcount;
1849         int err = -EINVAL;
1850         unsigned int hash;
1851
             /* a result without fib_info is treated as a single next hop */
1852         if (res->fi)
1853                 hopcount = res->fi->fib_nhs;
1854         else
1855                 hopcount = 1;
1856
1857         /* distinguish between multipath and singlepath */
1858         if (hopcount < 2)
1859                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860                                             saddr, tos);
1861
1862         /* add all alternatives to the routing cache */
1863         for (hop = 0; hop < hopcount; hop++) {
1864                 res->nh_sel = hop;
1865
1866                 /* put reference to previous result */
                     /* rtres is first written by rt_intern_hash() below, hence the hop != 0 guard */
1867                 if (hop)
1868                         ip_rt_put(rtres);
1869
1870                 /* create a routing cache entry */
1871                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872                                       &rth);
1873                 if (err)
1874                         return err;
1875
1876                 /* put it into the cache */
1877                 hash = rt_hash(daddr, saddr, fl->iif);
1878                 err = rt_intern_hash(hash, rth, &rtres);
1879                 if (err)
1880                         return err;
1881
1882                 /* forward hop information to multipath impl. */
1883                 multipath_set_nhinfo(rth,
1884                                      FIB_RES_NETWORK(*res),
1885                                      FIB_RES_NETMASK(*res),
1886                                      res->prefixlen,
1887                                      &FIB_RES_NH(*res));
1888         }
             /* only the entry interned for the last hop stays attached to the skb */
1889         skb->dst = &rtres->u.dst;
1890         return err;
1891 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1892         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1894 }
1895
1896
1897 /*
1898  *      NOTE. We drop all the packets that have local source
1899  *      addresses, because every properly looped back packet
1900  *      must have the correct destination already attached by the output routine.
1901  *
1902  *      This approach solves two big problems:
1903  *      1. Non-simplex devices are handled properly.
1904  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1905  */
1906
/*
 * Slow path of input routing, used by ip_route_input() for non-multicast
 * destinations that were not found in the route cache.
 *
 * Filters martian source/destination addresses, looks the flow up with
 * fib_lookup() and then either
 *   - hands RTN_UNICAST results to ip_mkroute_input(), or
 *   - builds a cache entry bound to loopback_dev right here (label
 *     local_input) for local delivery, broadcast, or - via no_route -
 *     an unreachable destination served by ip_error.
 *
 * Returns 0 on success, otherwise a negative errno: -EINVAL,
 * -EHOSTUNREACH, -ENOBUFS, or whatever ip_mkroute_input() /
 * rt_intern_hash() reported.
 */
1907 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908                                u8 tos, struct net_device *dev)
1909 {
1910         struct fib_result res;
1911         struct in_device *in_dev = in_dev_get(dev);
1912         struct flowi fl = { .nl_u = { .ip4_u =
1913                                       { .daddr = daddr,
1914                                         .saddr = saddr,
1915                                         .tos = tos,
1916                                         .scope = RT_SCOPE_UNIVERSE,
1917                                       } },
1918                             .mark = skb->mark,
1919                             .iif = dev->ifindex };
1920         unsigned        flags = 0;
1921         u32             itag = 0;
1922         struct rtable * rth;
1923         unsigned        hash;
1924         __be32          spec_dst;
1925         int             err = -EINVAL;
             /* set to 1 once fib_lookup() succeeded, so that "done" puts res */
1926         int             free_res = 0;
1927
1928         /* IP on this device is disabled. */
1929
             /* no in_device reference was obtained, so skip the in_dev_put() at "done" */
1930         if (!in_dev)
1931                 goto out;
1932
1933         /* Check for the most weird martians, which can be not detected
1934            by fib_lookup.
1935          */
1936
1937         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938                 goto martian_source;
1939
             /* limited broadcast, or both addresses zero: handled without a FIB lookup */
1940         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1941                 goto brd_input;
1942
1943         /* Accept zero addresses only to limited broadcast;
1944          * I even do not know to fix it or not. Waiting for complains :-)
1945          */
1946         if (ZERONET(saddr))
1947                 goto martian_source;
1948
1949         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950                 goto martian_destination;
1951
1952         /*
1953          *      Now we are ready to route packet.
1954          */
1955         if ((err = fib_lookup(&fl, &res)) != 0) {
1956                 if (!IN_DEV_FORWARD(in_dev))
1957                         goto e_hostunreach;
1958                 goto no_route;
1959         }
1960         free_res = 1;
1961
1962         RT_CACHE_STAT_INC(in_slow_tot);
1963
1964         if (res.type == RTN_BROADCAST)
1965                 goto brd_input;
1966
1967         if (res.type == RTN_LOCAL) {
1968                 int result;
1969                 result = fib_validate_source(saddr, daddr, tos,
1970                                              loopback_dev.ifindex,
1971                                              dev, &spec_dst, &itag);
1972                 if (result < 0)
1973                         goto martian_source;
1974                 if (result)
1975                         flags |= RTCF_DIRECTSRC;
                     /* replaces the spec_dst written by fib_validate_source() */
1976                 spec_dst = daddr;
1977                 goto local_input;
1978         }
1979
1980         if (!IN_DEV_FORWARD(in_dev))
1981                 goto e_hostunreach;
1982         if (res.type != RTN_UNICAST)
1983                 goto martian_destination;
1984
1985         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986         if (err == -ENOBUFS)
1987                 goto e_nobufs;
1988         if (err == -EINVAL)
1989                 goto e_inval;
1990
             /* common exit: drop the in_device reference and, if taken, the FIB result */
1991 done:
1992         in_dev_put(in_dev);
1993         if (free_res)
1994                 fib_res_put(&res);
1995 out:    return err;
1996
             /*
              * Reached either from the address checks above (no FIB lookup was
              * done, free_res is still 0) or for an RTN_BROADCAST lookup result.
              */
1997 brd_input:
1998         if (skb->protocol != htons(ETH_P_IP))
1999                 goto e_inval;
2000
2001         if (ZERONET(saddr))
2002                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2003         else {
2004                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2005                                           &itag);
2006                 if (err < 0)
2007                         goto martian_source;
2008                 if (err)
2009                         flags |= RTCF_DIRECTSRC;
2010         }
2011         flags |= RTCF_BROADCAST;
2012         res.type = RTN_BROADCAST;
2013         RT_CACHE_STAT_INC(in_brd);
2014
             /*
              * Shared tail for local, broadcast and unreachable results: build a
              * cache entry bound to loopback_dev and intern it into skb->dst.
              */
2015 local_input:
2016         rth = dst_alloc(&ipv4_dst_ops);
2017         if (!rth)
2018                 goto e_nobufs;
2019
2020         rth->u.dst.output= ip_rt_bug;
2021
2022         atomic_set(&rth->u.dst.__refcnt, 1);
2023         rth->u.dst.flags= DST_HOST;
2024         if (in_dev->cnf.no_policy)
2025                 rth->u.dst.flags |= DST_NOPOLICY;
2026         rth->fl.fl4_dst = daddr;
2027         rth->rt_dst     = daddr;
2028         rth->fl.fl4_tos = tos;
2029         rth->fl.mark    = skb->mark;
2030         rth->fl.fl4_src = saddr;
2031         rth->rt_src     = saddr;
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033         rth->u.dst.tclassid = itag;
2034 #endif
2035         rth->rt_iif     =
2036         rth->fl.iif     = dev->ifindex;
2037         rth->u.dst.dev  = &loopback_dev;
2038         dev_hold(rth->u.dst.dev);
2039         rth->idev       = in_dev_get(rth->u.dst.dev);
2040         rth->rt_gateway = daddr;
2041         rth->rt_spec_dst= spec_dst;
2042         rth->u.dst.input= ip_local_deliver;
2043         rth->rt_flags   = flags|RTCF_LOCAL;
             /* no_route case: deliver to ip_error instead, with the lookup error negated */
2044         if (res.type == RTN_UNREACHABLE) {
2045                 rth->u.dst.input= ip_error;
2046                 rth->u.dst.error= -err;
2047                 rth->rt_flags   &= ~RTCF_LOCAL;
2048         }
2049         rth->rt_type    = res.type;
2050         hash = rt_hash(daddr, saddr, fl.iif);
2051         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052         goto done;
2053
             /*
              * fib_lookup() failed on a forwarding-enabled device; err still holds
              * its non-zero result when local_input stores -err in dst.error.
              */
2054 no_route:
2055         RT_CACHE_STAT_INC(in_no_route);
2056         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057         res.type = RTN_UNREACHABLE;
2058         goto local_input;
2059
2060         /*
2061          *      Do not cache martian addresses: they should be logged (RFC1812)
2062          */
2063 martian_destination:
2064         RT_CACHE_STAT_INC(in_martian_dst);
2065 #ifdef CONFIG_IP_ROUTE_VERBOSE
2066         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068                         "%u.%u.%u.%u, dev %s\n",
2069                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070 #endif
2071
             /* martian_destination falls through to here */
2072 e_hostunreach:
2073         err = -EHOSTUNREACH;
2074         goto done;
2075
2076 e_inval:
2077         err = -EINVAL;
2078         goto done;
2079
2080 e_nobufs:
2081         err = -ENOBUFS;
2082         goto done;
2083
2084 martian_source:
2085         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086         goto e_inval;
2087 }
2088
2089 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090                    u8 tos, struct net_device *dev)
2091 {
2092         struct rtable * rth;
2093         unsigned        hash;
2094         int iif = dev->ifindex;
2095
2096         tos &= IPTOS_RT_MASK;
2097         hash = rt_hash(daddr, saddr, iif);
2098
2099         rcu_read_lock();
2100         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101              rth = rcu_dereference(rth->u.dst.rt_next)) {
2102                 if (rth->fl.fl4_dst == daddr &&
2103                     rth->fl.fl4_src == saddr &&
2104                     rth->fl.iif == iif &&
2105                     rth->fl.oif == 0 &&
2106                     rth->fl.mark == skb->mark &&
2107                     rth->fl.fl4_tos == tos) {
2108                         rth->u.dst.lastuse = jiffies;
2109                         dst_hold(&rth->u.dst);
2110                         rth->u.dst.__use++;
2111                         RT_CACHE_STAT_INC(in_hit);
2112                         rcu_read_unlock();
2113                         skb->dst = (struct dst_entry*)rth;
2114                         return 0;
2115                 }
2116                 RT_CACHE_STAT_INC(in_hlist_search);
2117         }
2118         rcu_read_unlock();
2119
2120         /* Multicast recognition logic is moved from route cache to here.
2121            The problem was that too many Ethernet cards have broken/missing
2122            hardware multicast filters :-( As result the host on multicasting
2123            network acquires a lot of useless route cache entries, sort of
2124            SDR messages from all the world. Now we try to get rid of them.
2125            Really, provided software IP multicast filter is organized
2126            reasonably (at least, hashed), it does not result in a slowdown
2127            comparing with route cache reject entries.
2128            Note, that multicast routers are not affected, because
2129            route cache entry is created eventually.
2130          */
2131         if (MULTICAST(daddr)) {
2132                 struct in_device *in_dev;
2133
2134                 rcu_read_lock();
2135                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136                         int our = ip_check_mc(in_dev, daddr, saddr,
2137                                 ip_hdr(skb)->protocol);
2138                         if (our
2139 #ifdef CONFIG_IP_MROUTE
2140                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141 #endif
2142                             ) {
2143                                 rcu_read_unlock();
2144                                 return ip_route_input_mc(skb, daddr, saddr,
2145                                                          tos, dev, our);
2146                         }
2147                 }
2148                 rcu_read_unlock();
2149                 return -EINVAL;
2150         }
2151         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152 }
2153
/*
 * Allocate and fill one output route cache entry; the entry is returned
 * through @result and is not hashed here.
 *
 * @result:  receives the new entry on success
 * @res:     FIB lookup result; res->type may be rewritten and res->fi
 *           put and cleared for broadcast/multicast destinations
 * @fl:      flow that supplies rt_dst, rt_src, rt_gateway and rt_spec_dst
 * @oldflp:  flow that supplies the cache key (fl4_dst, fl4_src, oif,
 *           mark) and the TOS
 * @dev_out: output device
 * @flags:   initial RTCF_* flags, extended below
 *
 * Returns 0 on success, -EINVAL for an unacceptable source/destination
 * or a device without in_device, -ENOBUFS when dst_alloc() fails.
 */
2154 static inline int __mkroute_output(struct rtable **result,
2155                                    struct fib_result* res,
2156                                    const struct flowi *fl,
2157                                    const struct flowi *oldflp,
2158                                    struct net_device *dev_out,
2159                                    unsigned flags)
2160 {
2161         struct rtable *rth;
2162         struct in_device *in_dev;
2163         u32 tos = RT_FL_TOS(oldflp);
2164         int err = 0;
2165
             /* a loopback source address is only accepted on a loopback device */
2166         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167                 return -EINVAL;
2168
             /* the destination address overrides the type found by the FIB lookup */
2169         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2170                 res->type = RTN_BROADCAST;
2171         else if (MULTICAST(fl->fl4_dst))
2172                 res->type = RTN_MULTICAST;
2173         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174                 return -EINVAL;
2175
2176         if (dev_out->flags & IFF_LOOPBACK)
2177                 flags |= RTCF_LOCAL;
2178
2179         /* get work reference to inet device */
2180         in_dev = in_dev_get(dev_out);
2181         if (!in_dev)
2182                 return -EINVAL;
2183
2184         if (res->type == RTN_BROADCAST) {
2185                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
                     /* with res->fi cleared, rt_set_nexthop() below uses only the device MTU */
2186                 if (res->fi) {
2187                         fib_info_put(res->fi);
2188                         res->fi = NULL;
2189                 }
2190         } else if (res->type == RTN_MULTICAST) {
2191                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only if ip_check_mc() reports a match on this device */
2192                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2193                                  oldflp->proto))
2194                         flags &= ~RTCF_LOCAL;
2195                 /* If multicast route do not exist use
2196                    default one, but do not gateway in this case.
2197                    Yes, it is hack.
2198                  */
2199                 if (res->fi && res->prefixlen < 4) {
2200                         fib_info_put(res->fi);
2201                         res->fi = NULL;
2202                 }
2203         }
2204
2205
2206         rth = dst_alloc(&ipv4_dst_ops);
2207         if (!rth) {
2208                 err = -ENOBUFS;
2209                 goto cleanup;
2210         }
2211
2212         atomic_set(&rth->u.dst.__refcnt, 1);
2213         rth->u.dst.flags= DST_HOST;
2214 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215         if (res->fi) {
2216                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217                 if (res->fi->fib_nhs > 1)
2218                         rth->u.dst.flags |= DST_BALANCED;
2219         }
2220 #endif
2221         if (in_dev->cnf.no_xfrm)
2222                 rth->u.dst.flags |= DST_NOXFRM;
2223         if (in_dev->cnf.no_policy)
2224                 rth->u.dst.flags |= DST_NOPOLICY;
2225
             /* cache key comes from the original request (oldflp) ... */
2226         rth->fl.fl4_dst = oldflp->fl4_dst;
2227         rth->fl.fl4_tos = tos;
2228         rth->fl.fl4_src = oldflp->fl4_src;
2229         rth->fl.oif     = oldflp->oif;
2230         rth->fl.mark    = oldflp->mark;
             /* ... while the route addresses come from fl */
2231         rth->rt_dst     = fl->fl4_dst;
2232         rth->rt_src     = fl->fl4_src;
2233         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2234         /* get references to the devices that are to be hold by the routing
2235            cache entry */
2236         rth->u.dst.dev  = dev_out;
2237         dev_hold(dev_out);
2238         rth->idev       = in_dev_get(dev_out);
2239         rth->rt_gateway = fl->fl4_dst;
2240         rth->rt_spec_dst= fl->fl4_src;
2241
2242         rth->u.dst.output=ip_output;
2243
2244         RT_CACHE_STAT_INC(out_slow_tot);
2245
2246         if (flags & RTCF_LOCAL) {
2247                 rth->u.dst.input = ip_local_deliver;
2248                 rth->rt_spec_dst = fl->fl4_dst;
2249         }
2250         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* for broadcast/multicast spec_dst goes back to the source address */
2251                 rth->rt_spec_dst = fl->fl4_src;
2252                 if (flags & RTCF_LOCAL &&
2253                     !(dev_out->flags & IFF_LOOPBACK)) {
2254                         rth->u.dst.output = ip_mc_output;
2255                         RT_CACHE_STAT_INC(out_slow_mc);
2256                 }
2257 #ifdef CONFIG_IP_MROUTE
2258                 if (res->type == RTN_MULTICAST) {
2259                         if (IN_DEV_MFORWARD(in_dev) &&
2260                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2261                                 rth->u.dst.input = ip_mr_input;
2262                                 rth->u.dst.output = ip_mc_output;
2263                         }
2264                 }
2265 #endif
2266         }
2267
2268         rt_set_nexthop(rth, res, 0);
2269
2270         rth->rt_flags = flags;
2271
2272         *result = rth;
2273  cleanup:
2274         /* release work reference to inet device */
2275         in_dev_put(in_dev);
2276
2277         return err;
2278 }
2279
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281                                         struct fib_result* res,
2282                                         const struct flowi *fl,
2283                                         const struct flowi *oldflp,
2284                                         struct net_device *dev_out,
2285                                         unsigned flags)
2286 {
2287         struct rtable *rth = NULL;
2288         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289         unsigned hash;
2290         if (err == 0) {
2291                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2292                 err = rt_intern_hash(hash, rth, rp);
2293         }
2294
2295         return err;
2296 }
2297
/*
 * Create the output route cache entry (or entries) for a flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has at
 * most one next hop, this is just ip_mkroute_output_def().  Otherwise one
 * cache entry per next hop is built with __mkroute_output(), interned
 * with rt_intern_hash() and reported to multipath_set_nhinfo(); *rp is
 * left pointing at the entry interned for the last hop.
 *
 * Returns 0 on success or the first non-zero error from
 * __mkroute_output() / rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
                                    struct fib_result* res,
                                    const struct flowi *fl,
                                    const struct flowi *oldflp,
                                    struct net_device *dev_out,
                                    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
        unsigned char hop;
        unsigned hash;
        int err = -EINVAL;
        struct rtable *rth = NULL;

        if (res->fi && res->fi->fib_nhs > 1) {
                unsigned char hopcount = res->fi->fib_nhs;

                for (hop = 0; hop < hopcount; hop++) {
                        struct net_device *dev2nexthop;

                        res->nh_sel = hop;

                        /* hold a work reference to the output device */
                        dev2nexthop = FIB_RES_DEV(*res);
                        dev_hold(dev2nexthop);

                        /* put reference to previous result */
                        if (hop)
                                ip_rt_put(*rp);

                        err = __mkroute_output(&rth, res, fl, oldflp,
                                               dev2nexthop, flags);

                        if (err == 0) {
                                hash = rt_hash(oldflp->fl4_dst,
                                               oldflp->fl4_src,
                                               oldflp->oif);
                                err = rt_intern_hash(hash, rth, rp);
                        }

                        /*
                         * Forward hop information to the multipath
                         * implementation only when the entry was really
                         * interned.  After a failed rt_intern_hash() rth
                         * must not be used any more; this matches the
                         * error check in ip_mkroute_input().
                         */
                        if (err == 0)
                                multipath_set_nhinfo(rth,
                                                     FIB_RES_NETWORK(*res),
                                                     FIB_RES_NETMASK(*res),
                                                     res->prefixlen,
                                                     &FIB_RES_NH(*res));

                        /* release work reference to output device */
                        dev_put(dev2nexthop);

                        if (err != 0)
                                return err;
                }
                return err;
        } else {
                return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
                                             flags);
        }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
        return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2359
2360 /*
2361  * Major route resolver routine.
2362  */
2363
/*
 * ip_route_output_slow - resolve an output route through the FIB.
 * @rp:     handed unchanged to ip_mkroute_output(), which produces the route
 * @oldflp: the caller's flow key; it is only read here
 *
 * A private key 'fl' is built from @oldflp (TOS masked with IPTOS_RT_MASK,
 * scope RT_SCOPE_LINK when RTO_ONLINK is set in the TOS, iif set to the
 * loopback ifindex) and then refined:
 *
 *  - a given source address must not be multicast, bad-class or zeronet and
 *    must be found by ip_dev_find(), otherwise -EINVAL; with no oif and a
 *    multicast or limited-broadcast destination that device becomes the
 *    output device;
 *  - a given oif must name a device that has an in_device, otherwise
 *    -ENODEV; local multicast and limited broadcast are routed straight out
 *    of that device;
 *  - an empty destination is routed via loopback as RTN_LOCAL;
 *  - otherwise fib_lookup() decides; if it fails, the destination is assumed
 *    to be on-link when an oif was given, else -ENETUNREACH is returned.
 *
 * Every path that reaches make_route holds one reference on dev_out, which
 * is dropped after ip_mkroute_output() returns; the FIB result is released
 * there too when free_res is set.
 *
 * Returns the result of ip_mkroute_output() or one of the errors above.
 */
2364 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365 {
2366         u32 tos = RT_FL_TOS(oldflp);
2367         struct flowi fl = { .nl_u = { .ip4_u =
2368                                       { .daddr = oldflp->fl4_dst,
2369                                         .saddr = oldflp->fl4_src,
2370                                         .tos = tos & IPTOS_RT_MASK,
2371                                         .scope = ((tos & RTO_ONLINK) ?
2372                                                   RT_SCOPE_LINK :
2373                                                   RT_SCOPE_UNIVERSE),
2374                                       } },
2375                             .mark = oldflp->mark,
2376                             .iif = loopback_dev.ifindex,
2377                             .oif = oldflp->oif };
2378         struct fib_result res;
2379         unsigned flags = 0;
2380         struct net_device *dev_out = NULL;
2381         int free_res = 0;
2382         int err;
2383
2384
2385         res.fi          = NULL;
2386 #ifdef CONFIG_IP_MULTIPLE_TABLES
2387         res.r           = NULL;
2388 #endif
2389
2390         if (oldflp->fl4_src) {
2391                 err = -EINVAL;
2392                 if (MULTICAST(oldflp->fl4_src) ||
2393                     BADCLASS(oldflp->fl4_src) ||
2394                     ZERONET(oldflp->fl4_src))
2395                         goto out;
2396
2397                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398                 dev_out = ip_dev_find(oldflp->fl4_src);
2399                 if (dev_out == NULL)
2400                         goto out;
2401
2402                 /* I removed check for oif == dev_out->oif here.
2403                    It was wrong for two reasons:
2404                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2405                       assigned to multiple interfaces.
2406                    2. Moreover, we are allowed to send packets with saddr
2407                       of another iface. --ANK
2408                  */
2409
2410                 if (oldflp->oif == 0
2411                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412                         /* Special hack: user can direct multicasts
2413                            and limited broadcast via necessary interface
2414                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415                            This hack is not just for fun, it allows
2416                            vic,vat and friends to work.
2417                            They bind socket to loopback, set ttl to zero
2418                            and expect that it will work.
2419                            From the viewpoint of routing cache they are broken,
2420                            because we are not allowed to build multicast path
2421                            with loopback source addr (look, routing cache
2422                            cannot know, that ttl is zero, so that packet
2423                            will not leave this host and route is valid).
2424                            Luckily, this hack is good workaround.
2425                          */
2426
2427                         fl.oif = dev_out->ifindex;
2428                         goto make_route;
2429                 }
                     /* The device was only needed to validate the source
                        address; drop the reference taken by ip_dev_find(). */
2430                 if (dev_out)
2431                         dev_put(dev_out);
2432                 dev_out = NULL;
2433         }
2434
2435
2436         if (oldflp->oif) {
2437                 dev_out = dev_get_by_index(oldflp->oif);
2438                 err = -ENODEV;
2439                 if (dev_out == NULL)
2440                         goto out;
2441
2442                 /* RACE: Check return value of inet_select_addr instead. */
2443                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2444                         dev_put(dev_out);
2445                         goto out;       /* Wrong error code */
2446                 }
2447
2448                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2449                         if (!fl.fl4_src)
2450                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2451                                                               RT_SCOPE_LINK);
2452                         goto make_route;
2453                 }
2454                 if (!fl.fl4_src) {
2455                         if (MULTICAST(oldflp->fl4_dst))
2456                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2457                                                               fl.fl4_scope);
2458                         else if (!oldflp->fl4_dst)
2459                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2460                                                               RT_SCOPE_HOST);
2461                 }
2462         }
2463
             /* No destination: answer with a local route over loopback, using
                the source address (or INADDR_LOOPBACK) as the destination. */
2464         if (!fl.fl4_dst) {
2465                 fl.fl4_dst = fl.fl4_src;
2466                 if (!fl.fl4_dst)
2467                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2468                 if (dev_out)
2469                         dev_put(dev_out);
2470                 dev_out = &loopback_dev;
2471                 dev_hold(dev_out);
2472                 fl.oif = loopback_dev.ifindex;
2473                 res.type = RTN_LOCAL;
2474                 flags |= RTCF_LOCAL;
2475                 goto make_route;
2476         }
2477
2478         if (fib_lookup(&fl, &res)) {
2479                 res.fi = NULL;
2480                 if (oldflp->oif) {
2481                         /* Apparently, routing tables are wrong. Assume,
2482                            that the destination is on link.
2483
2484                            WHY? DW.
2485                            Because we are allowed to send to iface
2486                            even if it has NO routes and NO assigned
2487                            addresses. When oif is specified, routing
2488                            tables are looked up with only one purpose:
2489                            to catch if destination is gatewayed, rather than
2490                            direct. Moreover, if MSG_DONTROUTE is set,
2491                            we send packet, ignoring both routing tables
2492                            and ifaddr state. --ANK
2493
2494
2495                            We could make it even if oif is unknown,
2496                            likely IPv6, but we do not.
2497                          */
2498
2499                         if (fl.fl4_src == 0)
2500                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2501                                                               RT_SCOPE_LINK);
2502                         res.type = RTN_UNICAST;
2503                         goto make_route;
2504                 }
2505                 if (dev_out)
2506                         dev_put(dev_out);
2507                 err = -ENETUNREACH;
2508                 goto out;
2509         }
             /* From here on 'res' comes from a successful fib_lookup() and is
                released with fib_res_put() after make_route. */
2510         free_res = 1;
2511
2512         if (res.type == RTN_LOCAL) {
2513                 if (!fl.fl4_src)
2514                         fl.fl4_src = fl.fl4_dst;
2515                 if (dev_out)
2516                         dev_put(dev_out);
2517                 dev_out = &loopback_dev;
2518                 dev_hold(dev_out);
2519                 fl.oif = dev_out->ifindex;
2520                 if (res.fi)
2521                         fib_info_put(res.fi);
2522                 res.fi = NULL;
2523                 flags |= RTCF_LOCAL;
2524                 goto make_route;
2525         }
2526
2527 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2528         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529                 fib_select_multipath(&fl, &res);
2530         else
2531 #endif
2532         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533                 fib_select_default(&fl, &res);
2534
2535         if (!fl.fl4_src)
2536                 fl.fl4_src = FIB_RES_PREFSRC(res);
2537
             /* Switch to the device named by the FIB result, replacing any
                device reference taken earlier. */
2538         if (dev_out)
2539                 dev_put(dev_out);
2540         dev_out = FIB_RES_DEV(res);
2541         dev_hold(dev_out);
2542         fl.oif = dev_out->ifindex;
2543
2544
2545 make_route:
2546         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2547
2548
2549         if (free_res)
2550                 fib_res_put(&res);
2551         if (dev_out)
2552                 dev_put(dev_out);
2553 out:    return err;
2554 }
2555
2556 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2557 {
2558         unsigned hash;
2559         struct rtable *rth;
2560
2561         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2562
2563         rcu_read_lock_bh();
2564         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2565                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2566                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2567                     rth->fl.fl4_src == flp->fl4_src &&
2568                     rth->fl.iif == 0 &&
2569                     rth->fl.oif == flp->oif &&
2570                     rth->fl.mark == flp->mark &&
2571                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2573
2574                         /* check for multipath routes and choose one if
2575                          * necessary
2576                          */
2577                         if (multipath_select_route(flp, rth, rp)) {
2578                                 dst_hold(&(*rp)->u.dst);
2579                                 RT_CACHE_STAT_INC(out_hit);
2580                                 rcu_read_unlock_bh();
2581                                 return 0;
2582                         }
2583
2584                         rth->u.dst.lastuse = jiffies;
2585                         dst_hold(&rth->u.dst);
2586                         rth->u.dst.__use++;
2587                         RT_CACHE_STAT_INC(out_hit);
2588                         rcu_read_unlock_bh();
2589                         *rp = rth;
2590                         return 0;
2591                 }
2592                 RT_CACHE_STAT_INC(out_hlist_search);
2593         }
2594         rcu_read_unlock_bh();
2595
2596         return ip_route_output_slow(rp, flp);
2597 }
2598
2599 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2600
2601 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2602 {
2603 }
2604
/*
 * dst_ops passed to dst_alloc() by ipv4_dst_blackhole().  It shares the
 * destroy and check hooks of the ordinary IPv4 routes, and its update_pmtu
 * hook is a no-op.
 */
2605 static struct dst_ops ipv4_dst_blackhole_ops = {
2606         .family                 =       AF_INET,
2607         .protocol               =       __constant_htons(ETH_P_IP),
2608         .destroy                =       ipv4_dst_destroy,
2609         .check                  =       ipv4_dst_check,
2610         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2611         .entry_size             =       sizeof(struct rtable),
2612 };
2613
2614
/*
 * Packet handler installed as both ->input and ->output of a blackhole dst:
 * the buffer is freed and 0 is returned.
 */
static int ipv4_blackhole_output(struct sk_buff *skb)
{
        kfree_skb(skb);

        return 0;
}
2620
2621 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2622 {
2623         struct rtable *ort = *rp;
2624         struct rtable *rt = (struct rtable *)
2625                 dst_alloc(&ipv4_dst_blackhole_ops);
2626
2627         if (rt) {
2628                 struct dst_entry *new = &rt->u.dst;
2629
2630                 atomic_set(&new->__refcnt, 1);
2631                 new->__use = 1;
2632                 new->input = ipv4_blackhole_output;
2633                 new->output = ipv4_blackhole_output;
2634                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2635
2636                 new->dev = ort->u.dst.dev;
2637                 if (new->dev)
2638                         dev_hold(new->dev);
2639
2640                 rt->fl = ort->fl;
2641
2642                 rt->idev = ort->idev;
2643                 if (rt->idev)
2644                         in_dev_hold(rt->idev);
2645                 rt->rt_flags = ort->rt_flags;
2646                 rt->rt_type = ort->rt_type;
2647                 rt->rt_dst = ort->rt_dst;
2648                 rt->rt_src = ort->rt_src;
2649                 rt->rt_iif = ort->rt_iif;
2650                 rt->rt_gateway = ort->rt_gateway;
2651                 rt->rt_spec_dst = ort->rt_spec_dst;
2652                 rt->peer = ort->peer;
2653                 if (rt->peer)
2654                         atomic_inc(&rt->peer->refcnt);
2655
2656                 dst_free(new);
2657         }
2658
2659         dst_release(&(*rp)->u.dst);
2660         *rp = rt;
2661         return (rt ? 0 : -ENOMEM);
2662 }
2663
2664 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2665 {
2666         int err;
2667
2668         if ((err = __ip_route_output_key(rp, flp)) != 0)
2669                 return err;
2670
2671         if (flp->proto) {
2672                 if (!flp->fl4_src)
2673                         flp->fl4_src = (*rp)->rt_src;
2674                 if (!flp->fl4_dst)
2675                         flp->fl4_dst = (*rp)->rt_dst;
2676                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2677                 if (err == -EREMOTE)
2678                         err = ipv4_dst_blackhole(rp, flp, sk);
2679
2680                 return err;
2681         }
2682
2683         return 0;
2684 }
2685
2686 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2687
2688 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2689 {
2690         return ip_route_output_flow(rp, flp, NULL, 0);
2691 }
2692
/*
 * rt_fill_info - append a netlink route message describing skb->dst.
 * @skb:    message buffer; skb->dst must point at the rtable to describe
 * @pid:    passed to nlmsg_put() for the message header
 * @seq:    passed to nlmsg_put() for the message header
 * @event:  message type passed to nlmsg_put()
 * @nowait: forwarded to ipmr_get_route(); also selects how its errors are
 *          handled (see below)
 * @flags:  netlink header flags passed to nlmsg_put()
 *
 * The rtmsg header is filled from the cached route, followed by the
 * attributes RTA_TABLE, RTA_DST, optionally RTA_SRC, RTA_OIF, RTA_FLOW,
 * RTA_MP_ALGO, RTA_PREFSRC and RTA_GATEWAY, the metrics, optionally RTA_IIF
 * (or the multicast-routing data from ipmr_get_route()), and the cache info.
 *
 * NOTE(review): the NLA_PUT* macros are assumed to jump to nla_put_failure
 * when the attribute does not fit -- confirm against the netlink headers.
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 and @nowait is not set, and -EMSGSIZE (after cancelling the
 * partially built message, if one was started) on failure.
 */
2693 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2694                         int nowait, unsigned int flags)
2695 {
2696         struct rtable *rt = (struct rtable*)skb->dst;
2697         struct rtmsg *r;
2698         struct nlmsghdr *nlh;
2699         long expires;
2700         u32 id = 0, ts = 0, tsage = 0, error;
2701
2702         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2703         if (nlh == NULL)
2704                 return -EMSGSIZE;
2705
2706         r = nlmsg_data(nlh);
2707         r->rtm_family    = AF_INET;
2708         r->rtm_dst_len  = 32;
2709         r->rtm_src_len  = 0;
2710         r->rtm_tos      = rt->fl.fl4_tos;
2711         r->rtm_table    = RT_TABLE_MAIN;
2712         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2713         r->rtm_type     = rt->rt_type;
2714         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2715         r->rtm_protocol = RTPROT_UNSPEC;
2716         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2717         if (rt->rt_flags & RTCF_NOTIFY)
2718                 r->rtm_flags |= RTM_F_NOTIFY;
2719
2720         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2721
2722         if (rt->fl.fl4_src) {
2723                 r->rtm_src_len = 32;
2724                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2725         }
2726         if (rt->u.dst.dev)
2727                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2728 #ifdef CONFIG_NET_CLS_ROUTE
2729         if (rt->u.dst.tclassid)
2730                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2731 #endif
2732 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2733         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2734                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2735 #endif
             /* Input routes (fl.iif set) report rt_spec_dst; output routes
                report rt_src, but only when it differs from the key's source. */
2736         if (rt->fl.iif)
2737                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2738         else if (rt->rt_src != rt->fl.fl4_src)
2739                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2740
2741         if (rt->rt_dst != rt->rt_gateway)
2742                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2743
2744         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2745                 goto nla_put_failure;
2746
2747         error = rt->u.dst.error;
2748         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2749         if (rt->peer) {
2750                 id = rt->peer->ip_id_count;
2751                 if (rt->peer->tcp_ts_stamp) {
2752                         ts = rt->peer->tcp_ts;
2753                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2754                 }
2755         }
2756
2757         if (rt->fl.iif) {
2758 #ifdef CONFIG_IP_MROUTE
2759                 __be32 dst = rt->rt_dst;
2760
2761                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2762                     ipv4_devconf.mc_forwarding) {
2763                         int err = ipmr_get_route(skb, r, nowait);
                             /* err <= 0: without nowait, 0 ends the fill with
                                return 0 and any error cancels the message; with
                                nowait only -EMSGSIZE cancels, other values are
                                reported through the cacheinfo error field. */
2764                         if (err <= 0) {
2765                                 if (!nowait) {
2766                                         if (err == 0)
2767                                                 return 0;
2768                                         goto nla_put_failure;
2769                                 } else {
2770                                         if (err == -EMSGSIZE)
2771                                                 goto nla_put_failure;
2772                                         error = err;
2773                                 }
2774                         }
2775                 } else
2776 #endif
2777                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2778         }
2779
2780         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2781                                expires, error) < 0)
2782                 goto nla_put_failure;
2783
2784         return nlmsg_end(skb, nlh);
2785
2786 nla_put_failure:
2787         nlmsg_cancel(skb, nlh);
2788         return -EMSGSIZE;
2789 }
2790
/*
 * inet_rtm_getroute - answer a netlink "get route" request.
 * @in_skb: the request buffer; its NETLINK_CB pid addresses the reply
 * @nlh:    the request header, parsed against rtm_ipv4_policy
 * @arg:    not used
 *
 * A reply skb is allocated and used as a dummy packet.  If the request
 * carries RTA_IIF, the packet is routed as if it had been received on that
 * device (ip_route_input() with bottom halves disabled); otherwise an output
 * lookup is done with ip_route_output_key() using RTA_SRC, RTA_DST, RTA_OIF
 * and the request's TOS.  The resulting route is attached to the skb,
 * described with rt_fill_info() and sent back with rtnl_unicast().
 *
 * Returns the result of rtnl_unicast(), or a negative error; on every error
 * after the allocation the reply skb is freed.
 */
2791 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2792 {
2793         struct rtmsg *rtm;
2794         struct nlattr *tb[RTA_MAX+1];
2795         struct rtable *rt = NULL;
2796         __be32 dst = 0;
2797         __be32 src = 0;
2798         u32 iif;
2799         int err;
2800         struct sk_buff *skb;
2801
2802         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2803         if (err < 0)
2804                 goto errout;
2805
2806         rtm = nlmsg_data(nlh);
2807
2808         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2809         if (skb == NULL) {
2810                 err = -ENOBUFS;
2811                 goto errout;
2812         }
2813
2814         /* Reserve room for dummy headers, this skb can pass
2815            through good chunk of routing engine.
2816          */
2817         skb_reset_mac_header(skb);
2818         skb_reset_network_header(skb);
2819
2820         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2821         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2822         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2823
2824         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2825         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2826         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2827
2828         if (iif) {
2829                 struct net_device *dev;
2830
2831                 dev = __dev_get_by_index(iif);
2832                 if (dev == NULL) {
2833                         err = -ENODEV;
2834                         goto errout_free;
2835                 }
2836
2837                 skb->protocol   = htons(ETH_P_IP);
2838                 skb->dev        = dev;
2839                 local_bh_disable();
2840                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2841                 local_bh_enable();
2842
                     /* A successful lookup may still yield an error route;
                        report its dst error as the result. */
2843                 rt = (struct rtable*) skb->dst;
2844                 if (err == 0 && rt->u.dst.error)
2845                         err = -rt->u.dst.error;
2846         } else {
2847                 struct flowi fl = {
2848                         .nl_u = {
2849                                 .ip4_u = {
2850                                         .daddr = dst,
2851                                         .saddr = src,
2852                                         .tos = rtm->rtm_tos,
2853                                 },
2854                         },
2855                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2856                 };
2857                 err = ip_route_output_key(&rt, &fl);
2858         }
2859
2860         if (err)
2861                 goto errout_free;
2862
2863         skb->dst = &rt->u.dst;
2864         if (rtm->rtm_flags & RTM_F_NOTIFY)
2865                 rt->rt_flags |= RTCF_NOTIFY;
2866
2867         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2868                                 RTM_NEWROUTE, 0, 0);
2869         if (err <= 0)
2870                 goto errout_free;
2871
2872         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2873 errout:
2874         return err;
2875
2876 errout_free:
2877         kfree_skb(skb);
2878         goto errout;
2879 }
2880
2881 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2882 {
2883         struct rtable *rt;
2884         int h, s_h;
2885         int idx, s_idx;
2886
2887         s_h = cb->args[0];
2888         s_idx = idx = cb->args[1];
2889         for (h = 0; h <= rt_hash_mask; h++) {
2890                 if (h < s_h) continue;
2891                 if (h > s_h)
2892                         s_idx = 0;
2893                 rcu_read_lock_bh();
2894                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2895                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2896                         if (idx < s_idx)
2897                                 continue;
2898                         skb->dst = dst_clone(&rt->u.dst);
2899                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2900                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2901                                          1, NLM_F_MULTI) <= 0) {
2902                                 dst_release(xchg(&skb->dst, NULL));
2903                                 rcu_read_unlock_bh();
2904                                 goto done;
2905                         }
2906                         dst_release(xchg(&skb->dst, NULL));
2907                 }
2908                 rcu_read_unlock_bh();
2909         }
2910
2911 done:
2912         cb->args[0] = h;
2913         cb->args[1] = idx;
2914         return skb->len;
2915 }
2916
/*
 * Multicast event hook: flushes the route cache with a delay argument of 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        const int delay = 0;

        rt_cache_flush(delay);
}
2921
2922 #ifdef CONFIG_SYSCTL
2923 static int flush_delay;
2924
2925 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2926                                         struct file *filp, void __user *buffer,
2927                                         size_t *lenp, loff_t *ppos)
2928 {
2929         if (write) {
2930                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2931                 rt_cache_flush(flush_delay);
2932                 return 0;
2933         }
2934
2935         return -EINVAL;
2936 }
2937
2938 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2939                                                 int __user *name,
2940                                                 int nlen,
2941                                                 void __user *oldval,
2942                                                 size_t __user *oldlenp,
2943                                                 void __user *newval,
2944                                                 size_t newlen)
2945 {
2946         int delay;
2947         if (newlen != sizeof(int))
2948                 return -EINVAL;
2949         if (get_user(delay, (int __user *)newval))
2950                 return -EFAULT;
2951         rt_cache_flush(delay);
2952         return 0;
2953 }
2954
/*
 * sysctl table for the IPv4 routing cache tunables.
 *
 * "flush" is write-only (mode 0200) and uses the dedicated flush handlers;
 * every other entry is a plain int (mode 0644).  Entries handled by
 * proc_dointvec_jiffies or proc_dointvec_ms_jiffies use the matching
 * sysctl_jiffies / sysctl_ms_jiffies strategy routine.  Note that
 * "gc_min_interval" and "gc_min_interval_ms" are two views of the same
 * variable, ip_rt_gc_min_interval.  The table ends with an entry whose
 * ctl_name is 0.
 */
2955 ctl_table ipv4_route_table[] = {
2956         {
2957                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2958                 .procname       = "flush",
2959                 .data           = &flush_delay,
2960                 .maxlen         = sizeof(int),
2961                 .mode           = 0200,
2962                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2963                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2964         },
2965         {
2966                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2967                 .procname       = "min_delay",
2968                 .data           = &ip_rt_min_delay,
2969                 .maxlen         = sizeof(int),
2970                 .mode           = 0644,
2971                 .proc_handler   = &proc_dointvec_jiffies,
2972                 .strategy       = &sysctl_jiffies,
2973         },
2974         {
2975                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2976                 .procname       = "max_delay",
2977                 .data           = &ip_rt_max_delay,
2978                 .maxlen         = sizeof(int),
2979                 .mode           = 0644,
2980                 .proc_handler   = &proc_dointvec_jiffies,
2981                 .strategy       = &sysctl_jiffies,
2982         },
2983         {
2984                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2985                 .procname       = "gc_thresh",
2986                 .data           = &ipv4_dst_ops.gc_thresh,
2987                 .maxlen         = sizeof(int),
2988                 .mode           = 0644,
2989                 .proc_handler   = &proc_dointvec,
2990         },
2991         {
2992                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2993                 .procname       = "max_size",
2994                 .data           = &ip_rt_max_size,
2995                 .maxlen         = sizeof(int),
2996                 .mode           = 0644,
2997                 .proc_handler   = &proc_dointvec,
2998         },
2999         {
3000                 /*  Deprecated. Use gc_min_interval_ms */
3001
3002                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3003                 .procname       = "gc_min_interval",
3004                 .data           = &ip_rt_gc_min_interval,
3005                 .maxlen         = sizeof(int),
3006                 .mode           = 0644,
3007                 .proc_handler   = &proc_dointvec_jiffies,
3008                 .strategy       = &sysctl_jiffies,
3009         },
3010         {
3011                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3012                 .procname       = "gc_min_interval_ms",
3013                 .data           = &ip_rt_gc_min_interval,
3014                 .maxlen         = sizeof(int),
3015                 .mode           = 0644,
3016                 .proc_handler   = &proc_dointvec_ms_jiffies,
3017                 .strategy       = &sysctl_ms_jiffies,
3018         },
3019         {
3020                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3021                 .procname       = "gc_timeout",
3022                 .data           = &ip_rt_gc_timeout,
3023                 .maxlen         = sizeof(int),
3024                 .mode           = 0644,
3025                 .proc_handler   = &proc_dointvec_jiffies,
3026                 .strategy       = &sysctl_jiffies,
3027         },
3028         {
3029                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3030                 .procname       = "gc_interval",
3031                 .data           = &ip_rt_gc_interval,
3032                 .maxlen         = sizeof(int),
3033                 .mode           = 0644,
3034                 .proc_handler   = &proc_dointvec_jiffies,
3035                 .strategy       = &sysctl_jiffies,
3036         },
3037         {
3038                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3039                 .procname       = "redirect_load",
3040                 .data           = &ip_rt_redirect_load,
3041                 .maxlen         = sizeof(int),
3042                 .mode           = 0644,
3043                 .proc_handler   = &proc_dointvec,
3044         },
3045         {
3046                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3047                 .procname       = "redirect_number",
3048                 .data           = &ip_rt_redirect_number,
3049                 .maxlen         = sizeof(int),
3050                 .mode           = 0644,
3051                 .proc_handler   = &proc_dointvec,
3052         },
3053         {
3054                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3055                 .procname       = "redirect_silence",
3056                 .data           = &ip_rt_redirect_silence,
3057                 .maxlen         = sizeof(int),
3058                 .mode           = 0644,
3059                 .proc_handler   = &proc_dointvec,
3060         },
3061         {
3062                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3063                 .procname       = "error_cost",
3064                 .data           = &ip_rt_error_cost,
3065                 .maxlen         = sizeof(int),
3066                 .mode           = 0644,
3067                 .proc_handler   = &proc_dointvec,
3068         },
3069         {
3070                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3071                 .procname       = "error_burst",
3072                 .data           = &ip_rt_error_burst,
3073                 .maxlen         = sizeof(int),
3074                 .mode           = 0644,
3075                 .proc_handler   = &proc_dointvec,
3076         },
3077         {
3078                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3079                 .procname       = "gc_elasticity",
3080                 .data           = &ip_rt_gc_elasticity,
3081                 .maxlen         = sizeof(int),
3082                 .mode           = 0644,
3083                 .proc_handler   = &proc_dointvec,
3084         },
3085         {
3086                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3087                 .procname       = "mtu_expires",
3088                 .data           = &ip_rt_mtu_expires,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = &proc_dointvec_jiffies,
3092                 .strategy       = &sysctl_jiffies,
3093         },
3094         {
3095                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3096                 .procname       = "min_pmtu",
3097                 .data           = &ip_rt_min_pmtu,
3098                 .maxlen         = sizeof(int),
3099                 .mode           = 0644,
3100                 .proc_handler   = &proc_dointvec,
3101         },
3102         {
3103                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3104                 .procname       = "min_adv_mss",
3105                 .data           = &ip_rt_min_advmss,
3106                 .maxlen         = sizeof(int),
3107                 .mode           = 0644,
3108                 .proc_handler   = &proc_dointvec,
3109         },
3110         {
3111                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3112                 .procname       = "secret_interval",
3113                 .data           = &ip_rt_secret_interval,
3114                 .maxlen         = sizeof(int),
3115                 .mode           = 0644,
3116                 .proc_handler   = &proc_dointvec_jiffies,
3117                 .strategy       = &sysctl_jiffies,
3118         },
3119         { .ctl_name = 0 }
3120 };
3121 #endif
3122
3123 #ifdef CONFIG_NET_CLS_ROUTE
3124 struct ip_rt_acct *ip_rt_acct;
3125
3126 /* This code sucks.  But you should have seen it before! --RR */
3127
/*
 * IP route accounting ptr for this logical cpu number: each cpu owns a
 * consecutive run of 256 struct ip_rt_acct entries inside ip_rt_acct.
 * The argument is parenthesized so that an expression such as "cpu + 1"
 * is scaled as a whole.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3130
3131 #ifdef CONFIG_PROC_FS
3132 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3133                            int length, int *eof, void *data)
3134 {
3135         unsigned int i;
3136
3137         if ((offset & 3) || (length & 3))
3138                 return -EIO;
3139
3140         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3141                 *eof = 1;
3142                 return 0;
3143         }
3144
3145         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3146                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3147                 *eof = 1;
3148         }
3149
3150         offset /= sizeof(u32);
3151
3152         if (length > 0) {
3153                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3154                 u32 *dst = (u32 *) buffer;
3155
3156                 /* Copy first cpu. */
3157                 *start = buffer;
3158                 memcpy(dst, src, length);
3159
3160                 /* Add the other cpus in, one int at a time */
3161                 for_each_possible_cpu(i) {
3162                         unsigned int j;
3163
3164                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3165
3166                         for (j = 0; j < length/4; j++)
3167                                 dst[j] += src[j];
3168                 }
3169         }
3170         return length;
3171 }
3172 #endif /* CONFIG_PROC_FS */
3173 #endif /* CONFIG_NET_CLS_ROUTE */
3174
3175 static __initdata unsigned long rhash_entries;
3176 static int __init set_rhash_entries(char *str)
3177 {
3178         if (!str)
3179                 return 0;
3180         rhash_entries = simple_strtoul(str, &str, 0);
3181         return 1;
3182 }
3183 __setup("rhash_entries=", set_rhash_entries);
3184
3185 int __init ip_rt_init(void)
3186 {
3187         int rc = 0;
3188
3189         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3190                              (jiffies ^ (jiffies >> 7)));
3191
3192 #ifdef CONFIG_NET_CLS_ROUTE
3193         {
3194         int order;
3195         for (order = 0;
3196              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3197                 /* NOTHING */;
3198         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3199         if (!ip_rt_acct)
3200                 panic("IP: failed to allocate ip_rt_acct\n");
3201         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3202         }
3203 #endif
3204
3205         ipv4_dst_ops.kmem_cachep =
3206                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3207                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3208
3209         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3210
3211         rt_hash_table = (struct rt_hash_bucket *)
3212                 alloc_large_system_hash("IP route cache",
3213                                         sizeof(struct rt_hash_bucket),
3214                                         rhash_entries,
3215                                         (num_physpages >= 128 * 1024) ?
3216                                         15 : 17,
3217                                         0,
3218                                         &rt_hash_log,
3219                                         &rt_hash_mask,
3220                                         0);
3221         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3222         rt_hash_lock_init();
3223
3224         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3225         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3226
3227         devinet_init();
3228         ip_fib_init();
3229
3230         init_timer(&rt_flush_timer);
3231         rt_flush_timer.function = rt_run_flush;
3232         init_timer(&rt_periodic_timer);
3233         rt_periodic_timer.function = rt_check_expire;
3234         init_timer(&rt_secret_timer);
3235         rt_secret_timer.function = rt_secret_rebuild;
3236
3237         /* All the timers, started at system startup tend
3238            to synchronize. Perturb it a bit.
3239          */
3240         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3241                                         ip_rt_gc_interval;
3242         add_timer(&rt_periodic_timer);
3243
3244         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3245                 ip_rt_secret_interval;
3246         add_timer(&rt_secret_timer);
3247
3248 #ifdef CONFIG_PROC_FS
3249         {
3250         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3251         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3252             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3253                                              proc_net_stat))) {
3254                 return -ENOMEM;
3255         }
3256         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3257         }
3258 #ifdef CONFIG_NET_CLS_ROUTE
3259         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3260 #endif
3261 #endif
3262 #ifdef CONFIG_XFRM
3263         xfrm_init();
3264         xfrm4_init();
3265 #endif
3266         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3267
3268         return rc;
3269 }
3270
/* Routing symbols made available to other kernel code via EXPORT_SYMBOL. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);