net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/mm.h>
  75 #include <linux/bootmem.h>
  76 #include <linux/string.h>
  77 #include <linux/socket.h>
  78 #include <linux/sockios.h>
  79 #include <linux/errno.h>
  80 #include <linux/in.h>
  81 #include <linux/inet.h>
  82 #include <linux/netdevice.h>
  83 #include <linux/proc_fs.h>
  84 #include <linux/init.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/rtnetlink.h>
  87 #include <linux/inetdevice.h>
  88 #include <linux/igmp.h>
  89 #include <linux/pkt_sched.h>
  90 #include <linux/mroute.h>
  91 #include <linux/netfilter_ipv4.h>
  92 #include <linux/random.h>
  93 #include <linux/jhash.h>
  94 #include <linux/rcupdate.h>
  95 #include <linux/times.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/ip_mp_alg.h>
 107 #include <net/netevent.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_min_delay              = 2 * HZ;
 120 static int ip_rt_max_delay              = 10 * HZ;
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval            = 60 * HZ;
 124 static int ip_rt_gc_min_interval        = HZ / 2;
 125 static int ip_rt_redirect_number        = 9;
 126 static int ip_rt_redirect_load          = HZ / 50;
 127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost             = HZ;
 129 static int ip_rt_error_burst            = 5 * HZ;
 130 static int ip_rt_gc_elasticity          = 8;
 131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 133 static int ip_rt_min_advmss             = 256;
 134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 135 static unsigned long rt_deadline;
 136
 137 #define RTprint(a...)   printk(KERN_DEBUG a)
 138
 139 static struct timer_list rt_flush_timer;
 140 static struct timer_list rt_periodic_timer;
 141 static struct timer_list rt_secret_timer;
 142
 143 /*
 144  *      Interface to generic destination cache.
 145  */
 146
 147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 150                                          struct net_device *dev, int how);
 151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 152 static void              ipv4_link_failure(struct sk_buff *skb);
 153 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 154 static int rt_garbage_collect(void);
 155
 156
 157 static struct dst_ops ipv4_dst_ops = {
 158         .family =               AF_INET,
 159         .protocol =             __constant_htons(ETH_P_IP),
 160         .gc =                   rt_garbage_collect,
 161         .check =                ipv4_dst_check,
 162         .destroy =              ipv4_dst_destroy,
 163         .ifdown =               ipv4_dst_ifdown,
 164         .negative_advice =      ipv4_negative_advice,
 165         .link_failure =         ipv4_link_failure,
 166         .update_pmtu =          ip_rt_update_pmtu,
 167         .entry_size =           sizeof(struct rtable),
 168 };
 169
 170 #define ECN_OR_COST(class)      TC_PRIO_##class
 171
 172 __u8 ip_tos2prio[16] = {
 173         TC_PRIO_BESTEFFORT,
 174         ECN_OR_COST(FILLER),
 175         TC_PRIO_BESTEFFORT,
 176         ECN_OR_COST(BESTEFFORT),
 177         TC_PRIO_BULK,
 178         ECN_OR_COST(BULK),
 179         TC_PRIO_BULK,
 180         ECN_OR_COST(BULK),
 181         TC_PRIO_INTERACTIVE,
 182         ECN_OR_COST(INTERACTIVE),
 183         TC_PRIO_INTERACTIVE,
 184         ECN_OR_COST(INTERACTIVE),
 185         TC_PRIO_INTERACTIVE_BULK,
 186         ECN_OR_COST(INTERACTIVE_BULK),
 187         TC_PRIO_INTERACTIVE_BULK,
 188         ECN_OR_COST(INTERACTIVE_BULK)
 189 };
 190
 191
 192 /*
 193  * Route cache.
 194  */
 195
 196 /* The locking scheme is rather straight forward:
 197  *
 198  * 1) Read-Copy Update protects the buckets of the central route hash.
 199  * 2) Only writers remove entries, and they hold the lock
 200  *    as they look at rtable reference counts.
 201  * 3) Only readers acquire references to rtable entries,
 202  *    they do so with atomic increments and with the
 203  *    lock held.
 204  */
 205
 206 struct rt_hash_bucket {
 207         struct rtable   *chain;
 208 };
 209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 210         defined(CONFIG_PROVE_LOCKING)
 211 /*
 212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 213  * The size of this table is a power of two and depends on the number of CPUS.
 214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 215  */
 216 #ifdef CONFIG_LOCKDEP
 217 # define RT_HASH_LOCK_SZ        256
 218 #else
 219 # if NR_CPUS >= 32
 220 #  define RT_HASH_LOCK_SZ       4096
 221 # elif NR_CPUS >= 16
 222 #  define RT_HASH_LOCK_SZ       2048
 223 # elif NR_CPUS >= 8
 224 #  define RT_HASH_LOCK_SZ       1024
 225 # elif NR_CPUS >= 4
 226 #  define RT_HASH_LOCK_SZ       512
 227 # else
 228 #  define RT_HASH_LOCK_SZ       256
 229 # endif
 230 #endif
 231
 232 static spinlock_t       *rt_hash_locks;
 233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 234 # define rt_hash_lock_init()    { \
 235                 int i; \
 236                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 237                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 238                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 239                         spin_lock_init(&rt_hash_locks[i]); \
 240                 }
 241 #else
 242 # define rt_hash_lock_addr(slot) NULL
 243 # define rt_hash_lock_init()
 244 #endif
 245
 246 static struct rt_hash_bucket    *rt_hash_table;
 247 static unsigned                 rt_hash_mask;
 248 static int                      rt_hash_log;
 249 static unsigned int             rt_hash_rnd;
 250
 251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 252 #define RT_CACHE_STAT_INC(field) \
 253         (__raw_get_cpu_var(rt_cache_stat).field++)
 254
 255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 256                                 struct rtable **res);
 257
 258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 259 {
 260         return (jhash_2words(daddr, saddr, rt_hash_rnd)
 261                 & rt_hash_mask);
 262 }
 263
 264 #define rt_hash(daddr, saddr, idx) \
 265         rt_hash_code((__force u32)(__be32)(daddr),\
 266                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 267
 268 #ifdef CONFIG_PROC_FS
 269 struct rt_cache_iter_state {
 270         int bucket;
 271 };
 272
 273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 274 {
 275         struct rtable *r = NULL;
 276         struct rt_cache_iter_state *st = seq->private;
 277
 278         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 279                 rcu_read_lock_bh();
 280                 r = rt_hash_table[st->bucket].chain;
 281                 if (r)
 282                         break;
 283                 rcu_read_unlock_bh();
 284         }
 285         return r;
 286 }
 287
 288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 289 {
 290         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 291
 292         r = r->u.rt_next;
 293         while (!r) {
 294                 rcu_read_unlock_bh();
 295                 if (--st->bucket < 0)
 296                         break;
 297                 rcu_read_lock_bh();
 298                 r = rt_hash_table[st->bucket].chain;
 299         }
 300         return r;
 301 }
 302
 303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 304 {
 305         struct rtable *r = rt_cache_get_first(seq);
 306
 307         if (r)
 308                 while (pos && (r = rt_cache_get_next(seq, r)))
 309                         --pos;
 310         return pos ? NULL : r;
 311 }
 312
 313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 314 {
 315         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 316 }
 317
 318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 319 {
 320         struct rtable *r = NULL;
 321
 322         if (v == SEQ_START_TOKEN)
 323                 r = rt_cache_get_first(seq);
 324         else
 325                 r = rt_cache_get_next(seq, v);
 326         ++*pos;
 327         return r;
 328 }
 329
 330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 331 {
 332         if (v && v != SEQ_START_TOKEN)
 333                 rcu_read_unlock_bh();
 334 }
 335
 336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 337 {
 338         if (v == SEQ_START_TOKEN)
 339                 seq_printf(seq, "%-127s\n",
 340                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 341                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 342                            "HHUptod\tSpecDst");
 343         else {
 344                 struct rtable *r = v;
 345                 char temp[256];
 346
 347                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 348                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 349                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 350                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 351                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 352                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 353                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 354                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 355                         dst_metric(&r->u.dst, RTAX_WINDOW),
 356                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 357                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 358                         r->fl.fl4_tos,
 359                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 360                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 361                                        dev_queue_xmit) : 0,
 362                         r->rt_spec_dst);
 363                 seq_printf(seq, "%-127s\n", temp);
 364         }
 365         return 0;
 366 }
 367
 368 static struct seq_operations rt_cache_seq_ops = {
 369         .start  = rt_cache_seq_start,
 370         .next   = rt_cache_seq_next,
 371         .stop   = rt_cache_seq_stop,
 372         .show   = rt_cache_seq_show,
 373 };
 374
 375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 376 {
 377         struct seq_file *seq;
 378         int rc = -ENOMEM;
 379         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 380
 381         if (!s)
 382                 goto out;
 383         rc = seq_open(file, &rt_cache_seq_ops);
 384         if (rc)
 385                 goto out_kfree;
 386         seq          = file->private_data;
 387         seq->private = s;
 388         memset(s, 0, sizeof(*s));
 389 out:
 390         return rc;
 391 out_kfree:
 392         kfree(s);
 393         goto out;
 394 }
 395
 396 static struct file_operations rt_cache_seq_fops = {
 397         .owner   = THIS_MODULE,
 398         .open    = rt_cache_seq_open,
 399         .read    = seq_read,
 400         .llseek  = seq_lseek,
 401         .release = seq_release_private,
 402 };
 403
 404
 405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 406 {
 407         int cpu;
 408
 409         if (*pos == 0)
 410                 return SEQ_START_TOKEN;
 411
 412         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 413                 if (!cpu_possible(cpu))
 414                         continue;
 415                 *pos = cpu+1;
 416                 return &per_cpu(rt_cache_stat, cpu);
 417         }
 418         return NULL;
 419 }
 420
 421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 422 {
 423         int cpu;
 424
 425         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 426                 if (!cpu_possible(cpu))
 427                         continue;
 428                 *pos = cpu+1;
 429                 return &per_cpu(rt_cache_stat, cpu);
 430         }
 431         return NULL;
 432
 433 }
 434
 435 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 436 {
 437
 438 }
 439
 440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 441 {
 442         struct rt_cache_stat *st = v;
 443
 444         if (v == SEQ_START_TOKEN) {
 445                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 446                 return 0;
 447         }
 448
 449         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 450                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 451                    atomic_read(&ipv4_dst_ops.entries),
 452                    st->in_hit,
 453                    st->in_slow_tot,
 454                    st->in_slow_mc,
 455                    st->in_no_route,
 456                    st->in_brd,
 457                    st->in_martian_dst,
 458                    st->in_martian_src,
 459
 460                    st->out_hit,
 461                    st->out_slow_tot,
 462                    st->out_slow_mc,
 463
 464                    st->gc_total,
 465                    st->gc_ignored,
 466                    st->gc_goal_miss,
 467                    st->gc_dst_overflow,
 468                    st->in_hlist_search,
 469                    st->out_hlist_search
 470                 );
 471         return 0;
 472 }
 473
 474 static struct seq_operations rt_cpu_seq_ops = {
 475         .start  = rt_cpu_seq_start,
 476         .next   = rt_cpu_seq_next,
 477         .stop   = rt_cpu_seq_stop,
 478         .show   = rt_cpu_seq_show,
 479 };
 480
 481
 482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 483 {
 484         return seq_open(file, &rt_cpu_seq_ops);
 485 }
 486
 487 static struct file_operations rt_cpu_seq_fops = {
 488         .owner   = THIS_MODULE,
 489         .open    = rt_cpu_seq_open,
 490         .read    = seq_read,
 491         .llseek  = seq_lseek,
 492         .release = seq_release,
 493 };
 494
 495 #endif /* CONFIG_PROC_FS */
 496
 497 static __inline__ void rt_free(struct rtable *rt)
 498 {
 499         multipath_remove(rt);
 500         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 501 }
 502
 503 static __inline__ void rt_drop(struct rtable *rt)
 504 {
 505         multipath_remove(rt);
 506         ip_rt_put(rt);
 507         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 508 }
 509
 510 static __inline__ int rt_fast_clean(struct rtable *rth)
 511 {
 512         /* Kill broadcast/multicast entries very aggresively, if they
 513            collide in hash table with more useful entries */
 514         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 515                 rth->fl.iif && rth->u.rt_next;
 516 }
 517
 518 static __inline__ int rt_valuable(struct rtable *rth)
 519 {
 520         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 521                 rth->u.dst.expires;
 522 }
 523
 524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 525 {
 526         unsigned long age;
 527         int ret = 0;
 528
 529         if (atomic_read(&rth->u.dst.__refcnt))
 530                 goto out;
 531
 532         ret = 1;
 533         if (rth->u.dst.expires &&
 534             time_after_eq(jiffies, rth->u.dst.expires))
 535                 goto out;
 536
 537         age = jiffies - rth->u.dst.lastuse;
 538         ret = 0;
 539         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 540             (age <= tmo2 && rt_valuable(rth)))
 541                 goto out;
 542         ret = 1;
 543 out:    return ret;
 544 }
 545
 546 /* Bits of score are:
 547  * 31: very valuable
 548  * 30: not quite useless
 549  * 29..0: usage counter
 550  */
 551 static inline u32 rt_score(struct rtable *rt)
 552 {
 553         u32 score = jiffies - rt->u.dst.lastuse;
 554
 555         score = ~score & ~(3<<30);
 556
 557         if (rt_valuable(rt))
 558                 score |= (1<<31);
 559
 560         if (!rt->fl.iif ||
 561             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 562                 score |= (1<<30);
 563
 564         return score;
 565 }
 566
 567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 568 {
 569         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 570                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 571                 (fl1->mark ^ fl2->mark) |
 572                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 573                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 574                 (fl1->oif ^ fl2->oif) |
 575                 (fl1->iif ^ fl2->iif)) == 0;
 576 }
 577
 578 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 579 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 580                                                 struct rtable *expentry,
 581                                                 int *removed_count)
 582 {
 583         int passedexpired = 0;
 584         struct rtable **nextstep = NULL;
 585         struct rtable **rthp = chain_head;
 586         struct rtable *rth;
 587
 588         if (removed_count)
 589                 *removed_count = 0;
 590
 591         while ((rth = *rthp) != NULL) {
 592                 if (rth == expentry)
 593                         passedexpired = 1;
 594
 595                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 596                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 597                         if (*rthp == expentry) {
 598                                 *rthp = rth->u.rt_next;
 599                                 continue;
 600                         } else {
 601                                 *rthp = rth->u.rt_next;
 602                                 rt_free(rth);
 603                                 if (removed_count)
 604                                         ++(*removed_count);
 605                         }
 606                 } else {
 607                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 608                             passedexpired && !nextstep)
 609                                 nextstep = &rth->u.rt_next;
 610
 611                         rthp = &rth->u.rt_next;
 612                 }
 613         }
 614
 615         rt_free(expentry);
 616         if (removed_count)
 617                 ++(*removed_count);
 618
 619         return nextstep;
 620 }
 621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 622
 623
 624 /* This runs via a timer and thus is always in BH context. */
 625 static void rt_check_expire(unsigned long dummy)
 626 {
 627         static unsigned int rover;
 628         unsigned int i = rover, goal;
 629         struct rtable *rth, **rthp;
 630         unsigned long now = jiffies;
 631         u64 mult;
 632
 633         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 634         if (ip_rt_gc_timeout > 1)
 635                 do_div(mult, ip_rt_gc_timeout);
 636         goal = (unsigned int)mult;
 637         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
 638         for (; goal > 0; goal--) {
 639                 unsigned long tmo = ip_rt_gc_timeout;
 640
 641                 i = (i + 1) & rt_hash_mask;
 642                 rthp = &rt_hash_table[i].chain;
 643
 644                 if (*rthp == 0)
 645                         continue;
 646                 spin_lock(rt_hash_lock_addr(i));
 647                 while ((rth = *rthp) != NULL) {
 648                         if (rth->u.dst.expires) {
 649                                 /* Entry is expired even if it is in use */
 650                                 if (time_before_eq(now, rth->u.dst.expires)) {
 651                                         tmo >>= 1;
 652                                         rthp = &rth->u.rt_next;
 653                                         continue;
 654                                 }
 655                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 656                                 tmo >>= 1;
 657                                 rthp = &rth->u.rt_next;
 658                                 continue;
 659                         }
 660
 661                         /* Cleanup aged off entries. */
 662 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 663                         /* remove all related balanced entries if necessary */
 664                         if (rth->u.dst.flags & DST_BALANCED) {
 665                                 rthp = rt_remove_balanced_route(
 666                                         &rt_hash_table[i].chain,
 667                                         rth, NULL);
 668                                 if (!rthp)
 669                                         break;
 670                         } else {
 671                                 *rthp = rth->u.rt_next;
 672                                 rt_free(rth);
 673                         }
 674 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 675                         *rthp = rth->u.rt_next;
 676                         rt_free(rth);
 677 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 678                 }
 679                 spin_unlock(rt_hash_lock_addr(i));
 680
 681                 /* Fallback loop breaker. */
 682                 if (time_after(jiffies, now))
 683                         break;
 684         }
 685         rover = i;
 686         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 687 }
 688
 689 /* This can run from both BH and non-BH contexts, the latter
 690  * in the case of a forced flush event.
 691  */
 692 static void rt_run_flush(unsigned long dummy)
 693 {
 694         int i;
 695         struct rtable *rth, *next;
 696
 697         rt_deadline = 0;
 698
 699         get_random_bytes(&rt_hash_rnd, 4);
 700
 701         for (i = rt_hash_mask; i >= 0; i--) {
 702                 spin_lock_bh(rt_hash_lock_addr(i));
 703                 rth = rt_hash_table[i].chain;
 704                 if (rth)
 705                         rt_hash_table[i].chain = NULL;
 706                 spin_unlock_bh(rt_hash_lock_addr(i));
 707
 708                 for (; rth; rth = next) {
 709                         next = rth->u.rt_next;
 710                         rt_free(rth);
 711                 }
 712         }
 713 }
 714
 715 static DEFINE_SPINLOCK(rt_flush_lock);
 716
 717 void rt_cache_flush(int delay)
 718 {
 719         unsigned long now = jiffies;
 720         int user_mode = !in_softirq();
 721
 722         if (delay < 0)
 723                 delay = ip_rt_min_delay;
 724
 725         /* flush existing multipath state*/
 726         multipath_flush();
 727
 728         spin_lock_bh(&rt_flush_lock);
 729
 730         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 731                 long tmo = (long)(rt_deadline - now);
 732
 733                 /* If flush timer is already running
 734                    and flush request is not immediate (delay > 0):
 735
 736                    if deadline is not achieved, prolongate timer to "delay",
 737                    otherwise fire it at deadline time.
 738                  */
 739
 740                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 741                         tmo = 0;
 742
 743                 if (delay > tmo)
 744                         delay = tmo;
 745         }
 746
 747         if (delay <= 0) {
 748                 spin_unlock_bh(&rt_flush_lock);
 749                 rt_run_flush(0);
 750                 return;
 751         }
 752
 753         if (rt_deadline == 0)
 754                 rt_deadline = now + ip_rt_max_delay;
 755
 756         mod_timer(&rt_flush_timer, now+delay);
 757         spin_unlock_bh(&rt_flush_lock);
 758 }
 759
 760 static void rt_secret_rebuild(unsigned long dummy)
 761 {
 762         unsigned long now = jiffies;
 763
 764         rt_cache_flush(0);
 765         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 766 }
 767
  /*
     Short description of GC goals.

     We want an algorithm that keeps the routing cache at an
     equilibrium point, where the number of aged-off entries stays
     approximately equal to the number of newly generated ones.

     The current expiration strength is the variable "expire".
     We try to adjust it dynamically, so that when networking is
     idle "expire" is large enough to keep enough warm entries,
     and when load increases it is reduced to limit the cache size.
   */
 780
 781 static int rt_garbage_collect(void)
 782 {
 783         static unsigned long expire = RT_GC_TIMEOUT;
 784         static unsigned long last_gc;
 785         static int rover;
 786         static int equilibrium;
 787         struct rtable *rth, **rthp;
 788         unsigned long now = jiffies;
 789         int goal;
 790
 791         /*
 792          * Garbage collection is pretty expensive,
 793          * do not make it too frequently.
 794          */
 795
 796         RT_CACHE_STAT_INC(gc_total);
 797
 798         if (now - last_gc < ip_rt_gc_min_interval &&
 799             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 800                 RT_CACHE_STAT_INC(gc_ignored);
 801                 goto out;
 802         }
 803
 804         /* Calculate number of entries, which we want to expire now. */
 805         goal = atomic_read(&ipv4_dst_ops.entries) -
 806                 (ip_rt_gc_elasticity << rt_hash_log);
 807         if (goal <= 0) {
 808                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 809                         equilibrium = ipv4_dst_ops.gc_thresh;
 810                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 811                 if (goal > 0) {
 812                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 813                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 814                 }
 815         } else {
 816                 /* We are in dangerous area. Try to reduce cache really
 817                  * aggressively.
 818                  */
 819                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 820                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 821         }
 822
 823         if (now - last_gc >= ip_rt_gc_min_interval)
 824                 last_gc = now;
 825
 826         if (goal <= 0) {
 827                 equilibrium += goal;
 828                 goto work_done;
 829         }
 830
 831         do {
 832                 int i, k;
 833
 834                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 835                         unsigned long tmo = expire;
 836
 837                         k = (k + 1) & rt_hash_mask;
 838                         rthp = &rt_hash_table[k].chain;
 839                         spin_lock_bh(rt_hash_lock_addr(k));
 840                         while ((rth = *rthp) != NULL) {
 841                                 if (!rt_may_expire(rth, tmo, expire)) {
 842                                         tmo >>= 1;
 843                                         rthp = &rth->u.rt_next;
 844                                         continue;
 845                                 }
 846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 847                                 /* remove all related balanced entries
 848                                  * if necessary
 849                                  */
 850                                 if (rth->u.dst.flags & DST_BALANCED) {
 851                                         int r;
 852
 853                                         rthp = rt_remove_balanced_route(
 854                                                 &rt_hash_table[k].chain,
 855                                                 rth,
 856                                                 &r);
 857                                         goal -= r;
 858                                         if (!rthp)
 859                                                 break;
 860                                 } else {
 861                                         *rthp = rth->u.rt_next;
 862                                         rt_free(rth);
 863                                         goal--;
 864                                 }
 865 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 866                                 *rthp = rth->u.rt_next;
 867                                 rt_free(rth);
 868                                 goal--;
 869 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 870                         }
 871                         spin_unlock_bh(rt_hash_lock_addr(k));
 872                         if (goal <= 0)
 873                                 break;
 874                 }
 875                 rover = k;
 876
 877                 if (goal <= 0)
 878                         goto work_done;
 879
 880                 /* Goal is not achieved. We stop process if:
 881
 882                    - if expire reduced to zero. Otherwise, expire is halfed.
 883                    - if table is not full.
 884                    - if we are called from interrupt.
 885                    - jiffies check is just fallback/debug loop breaker.
 886                      We will not spin here for long time in any case.
 887                  */
 888
 889                 RT_CACHE_STAT_INC(gc_goal_miss);
 890
 891                 if (expire == 0)
 892                         break;
 893
 894                 expire >>= 1;
 895 #if RT_CACHE_DEBUG >= 2
 896                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 897                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 898 #endif
 899
 900                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 901                         goto out;
 902         } while (!in_softirq() && time_before_eq(jiffies, now));
 903
 904         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 905                 goto out;
 906         if (net_ratelimit())
 907                 printk(KERN_WARNING "dst cache overflow\n");
 908         RT_CACHE_STAT_INC(gc_dst_overflow);
 909         return 1;
 910
 911 work_done:
 912         expire += ip_rt_gc_min_interval;
 913         if (expire > ip_rt_gc_timeout ||
 914             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 915                 expire = ip_rt_gc_timeout;
 916 #if RT_CACHE_DEBUG >= 2
 917         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 918                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 919 #endif
 920 out:    return 0;
 921 }
 922
 923 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 924 {
 925         struct rtable   *rth, **rthp;
 926         unsigned long   now;
 927         struct rtable *cand, **candp;
 928         u32             min_score;
 929         int             chain_length;
 930         int attempts = !in_softirq();
 931
 932 restart:
 933         chain_length = 0;
 934         min_score = ~(u32)0;
 935         cand = NULL;
 936         candp = NULL;
 937         now = jiffies;
 938
 939         rthp = &rt_hash_table[hash].chain;
 940
 941         spin_lock_bh(rt_hash_lock_addr(hash));
 942         while ((rth = *rthp) != NULL) {
 943 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 944                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 945                     compare_keys(&rth->fl, &rt->fl)) {
 946 #else
 947                 if (compare_keys(&rth->fl, &rt->fl)) {
 948 #endif
 949                         /* Put it first */
 950                         *rthp = rth->u.rt_next;
 951                         /*
 952                          * Since lookup is lockfree, the deletion
 953                          * must be visible to another weakly ordered CPU before
 954                          * the insertion at the start of the hash chain.
 955                          */
 956                         rcu_assign_pointer(rth->u.rt_next,
 957                                            rt_hash_table[hash].chain);
 958                         /*
 959                          * Since lookup is lockfree, the update writes
 960                          * must be ordered for consistency on SMP.
 961                          */
 962                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 963
 964                         rth->u.dst.__use++;
 965                         dst_hold(&rth->u.dst);
 966                         rth->u.dst.lastuse = now;
 967                         spin_unlock_bh(rt_hash_lock_addr(hash));
 968
 969                         rt_drop(rt);
 970                         *rp = rth;
 971                         return 0;
 972                 }
 973
 974                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 975                         u32 score = rt_score(rth);
 976
 977                         if (score <= min_score) {
 978                                 cand = rth;
 979                                 candp = rthp;
 980                                 min_score = score;
 981                         }
 982                 }
 983
 984                 chain_length++;
 985
 986                 rthp = &rth->u.rt_next;
 987         }
 988
 989         if (cand) {
 990                 /* ip_rt_gc_elasticity used to be average length of chain
 991                  * length, when exceeded gc becomes really aggressive.
 992                  *
 993                  * The second limit is less certain. At the moment it allows
 994                  * only 2 entries per bucket. We will see.
 995                  */
 996                 if (chain_length > ip_rt_gc_elasticity) {
 997                         *candp = cand->u.rt_next;
 998                         rt_free(cand);
 999                 }
1000         }
1001
1002         /* Try to bind route to arp only if it is output
1003            route or unicast forwarding path.
1004          */
1005         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006                 int err = arp_bind_neighbour(&rt->u.dst);
1007                 if (err) {
1008                         spin_unlock_bh(rt_hash_lock_addr(hash));
1009
1010                         if (err != -ENOBUFS) {
1011                                 rt_drop(rt);
1012                                 return err;
1013                         }
1014
1015                         /* Neighbour tables are full and nothing
1016                            can be released. Try to shrink route cache,
1017                            it is most likely it holds some neighbour records.
1018                          */
1019                         if (attempts-- > 0) {
1020                                 int saved_elasticity = ip_rt_gc_elasticity;
1021                                 int saved_int = ip_rt_gc_min_interval;
1022                                 ip_rt_gc_elasticity     = 1;
1023                                 ip_rt_gc_min_interval   = 0;
1024                                 rt_garbage_collect();
1025                                 ip_rt_gc_min_interval   = saved_int;
1026                                 ip_rt_gc_elasticity     = saved_elasticity;
1027                                 goto restart;
1028                         }
1029
1030                         if (net_ratelimit())
1031                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1032                         rt_drop(rt);
1033                         return -ENOBUFS;
1034                 }
1035         }
1036
1037         rt->u.rt_next = rt_hash_table[hash].chain;
1038 #if RT_CACHE_DEBUG >= 2
1039         if (rt->u.rt_next) {
1040                 struct rtable *trt;
1041                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042                        NIPQUAD(rt->rt_dst));
1043                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1044                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045                 printk("\n");
1046         }
1047 #endif
1048         rt_hash_table[hash].chain = rt;
1049         spin_unlock_bh(rt_hash_lock_addr(hash));
1050         *rp = rt;
1051         return 0;
1052 }
1053
1054 void rt_bind_peer(struct rtable *rt, int create)
1055 {
1056         static DEFINE_SPINLOCK(rt_peer_lock);
1057         struct inet_peer *peer;
1058
1059         peer = inet_getpeer(rt->rt_dst, create);
1060
1061         spin_lock_bh(&rt_peer_lock);
1062         if (rt->peer == NULL) {
1063                 rt->peer = peer;
1064                 peer = NULL;
1065         }
1066         spin_unlock_bh(&rt_peer_lock);
1067         if (peer)
1068                 inet_putpeer(peer);
1069 }
1070
1071 /*
1072  * Peer allocation may fail only in serious out-of-memory conditions.  However
1073  * we still can generate some output.
1074  * Random ID selection looks a bit dangerous because we have no chances to
1075  * select ID being unique in a reasonable period of time.
1076  * But broken packet identifier may be better than no packet at all.
1077  */
1078 static void ip_select_fb_ident(struct iphdr *iph)
1079 {
1080         static DEFINE_SPINLOCK(ip_fb_id_lock);
1081         static u32 ip_fallback_id;
1082         u32 salt;
1083
1084         spin_lock_bh(&ip_fb_id_lock);
1085         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1086         iph->id = htons(salt & 0xFFFF);
1087         ip_fallback_id = salt;
1088         spin_unlock_bh(&ip_fb_id_lock);
1089 }
1090
1091 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092 {
1093         struct rtable *rt = (struct rtable *) dst;
1094
1095         if (rt) {
1096                 if (rt->peer == NULL)
1097                         rt_bind_peer(rt, 1);
1098
1099                 /* If peer is attached to destination, it is never detached,
1100                    so that we need not to grab a lock to dereference it.
1101                  */
1102                 if (rt->peer) {
1103                         iph->id = htons(inet_getid(rt->peer, more));
1104                         return;
1105                 }
1106         } else
1107                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108                        __builtin_return_address(0));
1109
1110         ip_select_fb_ident(iph);
1111 }
1112
1113 static void rt_del(unsigned hash, struct rtable *rt)
1114 {
1115         struct rtable **rthp;
1116
1117         spin_lock_bh(rt_hash_lock_addr(hash));
1118         ip_rt_put(rt);
1119         for (rthp = &rt_hash_table[hash].chain; *rthp;
1120              rthp = &(*rthp)->u.rt_next)
1121                 if (*rthp == rt) {
1122                         *rthp = rt->u.rt_next;
1123                         rt_free(rt);
1124                         break;
1125                 }
1126         spin_unlock_bh(rt_hash_lock_addr(hash));
1127 }
1128
1129 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130                     __be32 saddr, struct net_device *dev)
1131 {
1132         int i, k;
1133         struct in_device *in_dev = in_dev_get(dev);
1134         struct rtable *rth, **rthp;
1135         __be32  skeys[2] = { saddr, 0 };
1136         int  ikeys[2] = { dev->ifindex, 0 };
1137         struct netevent_redirect netevent;
1138
1139         if (!in_dev)
1140                 return;
1141
1142         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144                 goto reject_redirect;
1145
1146         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148                         goto reject_redirect;
1149                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150                         goto reject_redirect;
1151         } else {
1152                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1153                         goto reject_redirect;
1154         }
1155
1156         for (i = 0; i < 2; i++) {
1157                 for (k = 0; k < 2; k++) {
1158                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1159
1160                         rthp=&rt_hash_table[hash].chain;
1161
1162                         rcu_read_lock();
1163                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1164                                 struct rtable *rt;
1165
1166                                 if (rth->fl.fl4_dst != daddr ||
1167                                     rth->fl.fl4_src != skeys[i] ||
1168                                     rth->fl.oif != ikeys[k] ||
1169                                     rth->fl.iif != 0) {
1170                                         rthp = &rth->u.rt_next;
1171                                         continue;
1172                                 }
1173
1174                                 if (rth->rt_dst != daddr ||
1175                                     rth->rt_src != saddr ||
1176                                     rth->u.dst.error ||
1177                                     rth->rt_gateway != old_gw ||
1178                                     rth->u.dst.dev != dev)
1179                                         break;
1180
1181                                 dst_hold(&rth->u.dst);
1182                                 rcu_read_unlock();
1183
1184                                 rt = dst_alloc(&ipv4_dst_ops);
1185                                 if (rt == NULL) {
1186                                         ip_rt_put(rth);
1187                                         in_dev_put(in_dev);
1188                                         return;
1189                                 }
1190
1191                                 /* Copy all the information. */
1192                                 *rt = *rth;
1193                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194                                 rt->u.dst.__use         = 1;
1195                                 atomic_set(&rt->u.dst.__refcnt, 1);
1196                                 rt->u.dst.child         = NULL;
1197                                 if (rt->u.dst.dev)
1198                                         dev_hold(rt->u.dst.dev);
1199                                 if (rt->idev)
1200                                         in_dev_hold(rt->idev);
1201                                 rt->u.dst.obsolete      = 0;
1202                                 rt->u.dst.lastuse       = jiffies;
1203                                 rt->u.dst.path          = &rt->u.dst;
1204                                 rt->u.dst.neighbour     = NULL;
1205                                 rt->u.dst.hh            = NULL;
1206                                 rt->u.dst.xfrm          = NULL;
1207
1208                                 rt->rt_flags            |= RTCF_REDIRECTED;
1209
1210                                 /* Gateway is different ... */
1211                                 rt->rt_gateway          = new_gw;
1212
1213                                 /* Redirect received -> path was valid */
1214                                 dst_confirm(&rth->u.dst);
1215
1216                                 if (rt->peer)
1217                                         atomic_inc(&rt->peer->refcnt);
1218
1219                                 if (arp_bind_neighbour(&rt->u.dst) ||
1220                                     !(rt->u.dst.neighbour->nud_state &
1221                                             NUD_VALID)) {
1222                                         if (rt->u.dst.neighbour)
1223                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1224                                         ip_rt_put(rth);
1225                                         rt_drop(rt);
1226                                         goto do_next;
1227                                 }
1228
1229                                 netevent.old = &rth->u.dst;
1230                                 netevent.new = &rt->u.dst;
1231                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1232                                                         &netevent);
1233
1234                                 rt_del(hash, rth);
1235                                 if (!rt_intern_hash(hash, rt, &rt))
1236                                         ip_rt_put(rt);
1237                                 goto do_next;
1238                         }
1239                         rcu_read_unlock();
1240                 do_next:
1241                         ;
1242                 }
1243         }
1244         in_dev_put(in_dev);
1245         return;
1246
1247 reject_redirect:
1248 #ifdef CONFIG_IP_ROUTE_VERBOSE
1249         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251                         "%u.%u.%u.%u ignored.\n"
1252                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1253                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1254                        NIPQUAD(saddr), NIPQUAD(daddr));
1255 #endif
1256         in_dev_put(in_dev);
1257 }
1258
1259 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260 {
1261         struct rtable *rt = (struct rtable*)dst;
1262         struct dst_entry *ret = dst;
1263
1264         if (rt) {
1265                 if (dst->obsolete) {
1266                         ip_rt_put(rt);
1267                         ret = NULL;
1268                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269                            rt->u.dst.expires) {
1270                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271                                                 rt->fl.oif);
1272 #if RT_CACHE_DEBUG >= 1
1273                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274                                           "%u.%u.%u.%u/%02x dropped\n",
1275                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276 #endif
1277                         rt_del(hash, rt);
1278                         ret = NULL;
1279                 }
1280         }
1281         return ret;
1282 }
1283
1284 /*
1285  * Algorithm:
1286  *      1. The first ip_rt_redirect_number redirects are sent
1287  *         with exponential backoff, then we stop sending them at all,
1288  *         assuming that the host ignores our redirects.
1289  *      2. If we did not see packets requiring redirects
1290  *         during ip_rt_redirect_silence, we assume that the host
1291  *         forgot redirected route and start to send redirects again.
1292  *
1293  * This algorithm is much cheaper and more intelligent than dumb load limiting
1294  * in icmp.c.
1295  *
1296  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1298  */
1299
1300 void ip_rt_send_redirect(struct sk_buff *skb)
1301 {
1302         struct rtable *rt = (struct rtable*)skb->dst;
1303         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1304
1305         if (!in_dev)
1306                 return;
1307
1308         if (!IN_DEV_TX_REDIRECTS(in_dev))
1309                 goto out;
1310
1311         /* No redirected packets during ip_rt_redirect_silence;
1312          * reset the algorithm.
1313          */
1314         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315                 rt->u.dst.rate_tokens = 0;
1316
1317         /* Too many ignored redirects; do not send anything
1318          * set u.dst.rate_last to the last seen redirected packet.
1319          */
1320         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321                 rt->u.dst.rate_last = jiffies;
1322                 goto out;
1323         }
1324
1325         /* Check for load limit; set rate_last to the latest sent
1326          * redirect.
1327          */
1328         if (time_after(jiffies,
1329                        (rt->u.dst.rate_last +
1330                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332                 rt->u.dst.rate_last = jiffies;
1333                 ++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337                     net_ratelimit())
1338                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1341                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342 #endif
1343         }
1344 out:
1345         in_dev_put(in_dev);
1346 }
1347
1348 static int ip_error(struct sk_buff *skb)
1349 {
1350         struct rtable *rt = (struct rtable*)skb->dst;
1351         unsigned long now;
1352         int code;
1353
1354         switch (rt->u.dst.error) {
1355                 case EINVAL:
1356                 default:
1357                         goto out;
1358                 case EHOSTUNREACH:
1359                         code = ICMP_HOST_UNREACH;
1360                         break;
1361                 case ENETUNREACH:
1362                         code = ICMP_NET_UNREACH;
1363                         break;
1364                 case EACCES:
1365                         code = ICMP_PKT_FILTERED;
1366                         break;
1367         }
1368
1369         now = jiffies;
1370         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1373         rt->u.dst.rate_last = now;
1374         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377         }
1378
1379 out:    kfree_skb(skb);
1380         return 0;
1381 }
1382
/*
 *	MTU plateau table, in descending order.  The last two values are
 *	not from the RFC but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above any table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau < end; plateau++) {
		if (*plateau < old_mtu)
			return *plateau;
	}
	return 68;
}
1400
1401 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402 {
1403         int i;
1404         unsigned short old_mtu = ntohs(iph->tot_len);
1405         struct rtable *rth;
1406         __be32  skeys[2] = { iph->saddr, 0, };
1407         __be32  daddr = iph->daddr;
1408         unsigned short est_mtu = 0;
1409
1410         if (ipv4_config.no_pmtu_disc)
1411                 return 0;
1412
1413         for (i = 0; i < 2; i++) {
1414                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1415
1416                 rcu_read_lock();
1417                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418                      rth = rcu_dereference(rth->u.rt_next)) {
1419                         if (rth->fl.fl4_dst == daddr &&
1420                             rth->fl.fl4_src == skeys[i] &&
1421                             rth->rt_dst  == daddr &&
1422                             rth->rt_src  == iph->saddr &&
1423                             rth->fl.iif == 0 &&
1424                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425                                 unsigned short mtu = new_mtu;
1426
1427                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1428
1429                                         /* BSD 4.2 compatibility hack :-( */
1430                                         if (mtu == 0 &&
1431                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432                                             old_mtu >= 68 + (iph->ihl << 2))
1433                                                 old_mtu -= iph->ihl << 2;
1434
1435                                         mtu = guess_mtu(old_mtu);
1436                                 }
1437                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1439                                                 dst_confirm(&rth->u.dst);
1440                                                 if (mtu < ip_rt_min_pmtu) {
1441                                                         mtu = ip_rt_min_pmtu;
1442                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1443                                                                 (1 << RTAX_MTU);
1444                                                 }
1445                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446                                                 dst_set_expires(&rth->u.dst,
1447                                                         ip_rt_mtu_expires);
1448                                         }
1449                                         est_mtu = mtu;
1450                                 }
1451                         }
1452                 }
1453                 rcu_read_unlock();
1454         }
1455         return est_mtu ? : new_mtu;
1456 }
1457
1458 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459 {
1460         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461             !(dst_metric_locked(dst, RTAX_MTU))) {
1462                 if (mtu < ip_rt_min_pmtu) {
1463                         mtu = ip_rt_min_pmtu;
1464                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465                 }
1466                 dst->metrics[RTAX_MTU-1] = mtu;
1467                 dst_set_expires(dst, ip_rt_mtu_expires);
1468                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1469         }
1470 }
1471
1472 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1473 {
1474         return NULL;
1475 }
1476
1477 static void ipv4_dst_destroy(struct dst_entry *dst)
1478 {
1479         struct rtable *rt = (struct rtable *) dst;
1480         struct inet_peer *peer = rt->peer;
1481         struct in_device *idev = rt->idev;
1482
1483         if (peer) {
1484                 rt->peer = NULL;
1485                 inet_putpeer(peer);
1486         }
1487
1488         if (idev) {
1489                 rt->idev = NULL;
1490                 in_dev_put(idev);
1491         }
1492 }
1493
1494 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495                             int how)
1496 {
1497         struct rtable *rt = (struct rtable *) dst;
1498         struct in_device *idev = rt->idev;
1499         if (dev != &loopback_dev && idev && idev->dev == dev) {
1500                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501                 if (loopback_idev) {
1502                         rt->idev = loopback_idev;
1503                         in_dev_put(idev);
1504                 }
1505         }
1506 }
1507
1508 static void ipv4_link_failure(struct sk_buff *skb)
1509 {
1510         struct rtable *rt;
1511
1512         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513
1514         rt = (struct rtable *) skb->dst;
1515         if (rt)
1516                 dst_set_expires(&rt->u.dst, 0);
1517 }
1518
1519 static int ip_rt_bug(struct sk_buff *skb)
1520 {
1521         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1523                 skb->dev ? skb->dev->name : "?");
1524         kfree_skb(skb);
1525         return 0;
1526 }
1527
1528 /*
1529    We do not cache source address of outgoing interface,
1530    because it is used only by IP RR, TS and SRR options,
1531    so that it out of fast path.
1532
1533    BTW remember: "addr" is allowed to be not aligned
1534    in IP options!
1535  */
1536
1537 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538 {
1539         __be32 src;
1540         struct fib_result res;
1541
1542         if (rt->fl.iif == 0)
1543                 src = rt->rt_src;
1544         else if (fib_lookup(&rt->fl, &res) == 0) {
1545                 src = FIB_RES_PREFSRC(res);
1546                 fib_res_put(&res);
1547         } else
1548                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549                                         RT_SCOPE_UNIVERSE);
1550         memcpy(addr, &src, 4);
1551 }
1552
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time:
 * a half of tclassid is filled from @tag only while it is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;
	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1562
1563 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564 {
1565         struct fib_info *fi = res->fi;
1566
1567         if (fi) {
1568                 if (FIB_RES_GW(*res) &&
1569                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570                         rt->rt_gateway = FIB_RES_GW(*res);
1571                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572                        sizeof(rt->u.dst.metrics));
1573                 if (fi->fib_mtu == 0) {
1574                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576                             rt->rt_gateway != rt->rt_dst &&
1577                             rt->u.dst.dev->mtu > 576)
1578                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579                 }
1580 #ifdef CONFIG_NET_CLS_ROUTE
1581                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582 #endif
1583         } else
1584                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585
1586         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592                                        ip_rt_min_advmss);
1593         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597 #ifdef CONFIG_IP_MULTIPLE_TABLES
1598         set_class_tag(rt, fib_rules_tclass(res));
1599 #endif
1600         set_class_tag(rt, itag);
1601 #endif
1602         rt->rt_type = res->type;
1603 }
1604
1605 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606                                 u8 tos, struct net_device *dev, int our)
1607 {
1608         unsigned hash;
1609         struct rtable *rth;
1610         __be32 spec_dst;
1611         struct in_device *in_dev = in_dev_get(dev);
1612         u32 itag = 0;
1613
1614         /* Primary sanity checks. */
1615
1616         if (in_dev == NULL)
1617                 return -EINVAL;
1618
1619         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620             skb->protocol != htons(ETH_P_IP))
1621                 goto e_inval;
1622
1623         if (ZERONET(saddr)) {
1624                 if (!LOCAL_MCAST(daddr))
1625                         goto e_inval;
1626                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627         } else if (fib_validate_source(saddr, 0, tos, 0,
1628                                         dev, &spec_dst, &itag) < 0)
1629                 goto e_inval;
1630
1631         rth = dst_alloc(&ipv4_dst_ops);
1632         if (!rth)
1633                 goto e_nobufs;
1634
1635         rth->u.dst.output= ip_rt_bug;
1636
1637         atomic_set(&rth->u.dst.__refcnt, 1);
1638         rth->u.dst.flags= DST_HOST;
1639         if (in_dev->cnf.no_policy)
1640                 rth->u.dst.flags |= DST_NOPOLICY;
1641         rth->fl.fl4_dst = daddr;
1642         rth->rt_dst     = daddr;
1643         rth->fl.fl4_tos = tos;
1644         rth->fl.mark    = skb->mark;
1645         rth->fl.fl4_src = saddr;
1646         rth->rt_src     = saddr;
1647 #ifdef CONFIG_NET_CLS_ROUTE
1648         rth->u.dst.tclassid = itag;
1649 #endif
1650         rth->rt_iif     =
1651         rth->fl.iif     = dev->ifindex;
1652         rth->u.dst.dev  = &loopback_dev;
1653         dev_hold(rth->u.dst.dev);
1654         rth->idev       = in_dev_get(rth->u.dst.dev);
1655         rth->fl.oif     = 0;
1656         rth->rt_gateway = daddr;
1657         rth->rt_spec_dst= spec_dst;
1658         rth->rt_type    = RTN_MULTICAST;
1659         rth->rt_flags   = RTCF_MULTICAST;
1660         if (our) {
1661                 rth->u.dst.input= ip_local_deliver;
1662                 rth->rt_flags |= RTCF_LOCAL;
1663         }
1664
1665 #ifdef CONFIG_IP_MROUTE
1666         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667                 rth->u.dst.input = ip_mr_input;
1668 #endif
1669         RT_CACHE_STAT_INC(in_slow_mc);
1670
1671         in_dev_put(in_dev);
1672         hash = rt_hash(daddr, saddr, dev->ifindex);
1673         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674
1675 e_nobufs:
1676         in_dev_put(in_dev);
1677         return -ENOBUFS;
1678
1679 e_inval:
1680         in_dev_put(in_dev);
1681         return -EINVAL;
1682 }
1683
1684
1685 static void ip_handle_martian_source(struct net_device *dev,
1686                                      struct in_device *in_dev,
1687                                      struct sk_buff *skb,
1688                                      __be32 daddr,
1689                                      __be32 saddr)
1690 {
1691         RT_CACHE_STAT_INC(in_martian_src);
1692 #ifdef CONFIG_IP_ROUTE_VERBOSE
1693         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694                 /*
1695                  *      RFC1812 recommendation, if source is martian,
1696                  *      the only hint is MAC header.
1697                  */
1698                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699                         "%u.%u.%u.%u, on dev %s\n",
1700                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701                 if (dev->hard_header_len && skb->mac.raw) {
1702                         int i;
1703                         unsigned char *p = skb->mac.raw;
1704                         printk(KERN_WARNING "ll header: ");
1705                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1706                                 printk("%02x", *p);
1707                                 if (i < (dev->hard_header_len - 1))
1708                                         printk(":");
1709                         }
1710                         printk("\n");
1711                 }
1712         }
1713 #endif
1714 }
1715
1716 static inline int __mkroute_input(struct sk_buff *skb,
1717                                   struct fib_result* res,
1718                                   struct in_device *in_dev,
1719                                   __be32 daddr, __be32 saddr, u32 tos,
1720                                   struct rtable **result)
1721 {
1722
1723         struct rtable *rth;
1724         int err;
1725         struct in_device *out_dev;
1726         unsigned flags = 0;
1727         __be32 spec_dst;
1728         u32 itag;
1729
1730         /* get a working reference to the output device */
1731         out_dev = in_dev_get(FIB_RES_DEV(*res));
1732         if (out_dev == NULL) {
1733                 if (net_ratelimit())
1734                         printk(KERN_CRIT "Bug in ip_route_input" \
1735                                "_slow(). Please, report\n");
1736                 return -EINVAL;
1737         }
1738
1739
1740         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1741                                   in_dev->dev, &spec_dst, &itag);
1742         if (err < 0) {
1743                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1744                                          saddr);
1745
1746                 err = -EINVAL;
1747                 goto cleanup;
1748         }
1749
1750         if (err)
1751                 flags |= RTCF_DIRECTSRC;
1752
1753         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754             (IN_DEV_SHARED_MEDIA(out_dev) ||
1755              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756                 flags |= RTCF_DOREDIRECT;
1757
1758         if (skb->protocol != htons(ETH_P_IP)) {
1759                 /* Not IP (i.e. ARP). Do not create route, if it is
1760                  * invalid for proxy arp. DNAT routes are always valid.
1761                  */
1762                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1763                         err = -EINVAL;
1764                         goto cleanup;
1765                 }
1766         }
1767
1768
1769         rth = dst_alloc(&ipv4_dst_ops);
1770         if (!rth) {
1771                 err = -ENOBUFS;
1772                 goto cleanup;
1773         }
1774
1775         atomic_set(&rth->u.dst.__refcnt, 1);
1776         rth->u.dst.flags= DST_HOST;
1777 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778         if (res->fi->fib_nhs > 1)
1779                 rth->u.dst.flags |= DST_BALANCED;
1780 #endif
1781         if (in_dev->cnf.no_policy)
1782                 rth->u.dst.flags |= DST_NOPOLICY;
1783         if (in_dev->cnf.no_xfrm)
1784                 rth->u.dst.flags |= DST_NOXFRM;
1785         rth->fl.fl4_dst = daddr;
1786         rth->rt_dst     = daddr;
1787         rth->fl.fl4_tos = tos;
1788         rth->fl.mark    = skb->mark;
1789         rth->fl.fl4_src = saddr;
1790         rth->rt_src     = saddr;
1791         rth->rt_gateway = daddr;
1792         rth->rt_iif     =
1793                 rth->fl.iif     = in_dev->dev->ifindex;
1794         rth->u.dst.dev  = (out_dev)->dev;
1795         dev_hold(rth->u.dst.dev);
1796         rth->idev       = in_dev_get(rth->u.dst.dev);
1797         rth->fl.oif     = 0;
1798         rth->rt_spec_dst= spec_dst;
1799
1800         rth->u.dst.input = ip_forward;
1801         rth->u.dst.output = ip_output;
1802
1803         rt_set_nexthop(rth, res, itag);
1804
1805         rth->rt_flags = flags;
1806
1807         *result = rth;
1808         err = 0;
1809  cleanup:
1810         /* release the working reference to the output device */
1811         in_dev_put(out_dev);
1812         return err;
1813 }
1814
1815 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816                                        struct fib_result* res,
1817                                        const struct flowi *fl,
1818                                        struct in_device *in_dev,
1819                                        __be32 daddr, __be32 saddr, u32 tos)
1820 {
1821         struct rtable* rth = NULL;
1822         int err;
1823         unsigned hash;
1824
1825 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1826         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827                 fib_select_multipath(fl, res);
1828 #endif
1829
1830         /* create a routing cache entry */
1831         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832         if (err)
1833                 return err;
1834
1835         /* put it into the cache */
1836         hash = rt_hash(daddr, saddr, fl->iif);
1837         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1838 }
1839
1840 static inline int ip_mkroute_input(struct sk_buff *skb,
1841                                    struct fib_result* res,
1842                                    const struct flowi *fl,
1843                                    struct in_device *in_dev,
1844                                    __be32 daddr, __be32 saddr, u32 tos)
1845 {
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847         struct rtable* rth = NULL, *rtres;
1848         unsigned char hop, hopcount;
1849         int err = -EINVAL;
1850         unsigned int hash;
1851
1852         if (res->fi)
1853                 hopcount = res->fi->fib_nhs;
1854         else
1855                 hopcount = 1;
1856
1857         /* distinguish between multipath and singlepath */
1858         if (hopcount < 2)
1859                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860                                             saddr, tos);
1861
1862         /* add all alternatives to the routing cache */
1863         for (hop = 0; hop < hopcount; hop++) {
1864                 res->nh_sel = hop;
1865
1866                 /* put reference to previous result */
1867                 if (hop)
1868                         ip_rt_put(rtres);
1869
1870                 /* create a routing cache entry */
1871                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872                                       &rth);
1873                 if (err)
1874                         return err;
1875
1876                 /* put it into the cache */
1877                 hash = rt_hash(daddr, saddr, fl->iif);
1878                 err = rt_intern_hash(hash, rth, &rtres);
1879                 if (err)
1880                         return err;
1881
1882                 /* forward hop information to multipath impl. */
1883                 multipath_set_nhinfo(rth,
1884                                      FIB_RES_NETWORK(*res),
1885                                      FIB_RES_NETMASK(*res),
1886                                      res->prefixlen,
1887                                      &FIB_RES_NH(*res));
1888         }
1889         skb->dst = &rtres->u.dst;
1890         return err;
1891 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1892         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1894 }
1895
1896
1897 /*
1898  *      NOTE. We drop all the packets that has local source
1899  *      addresses, because every properly looped back packet
1900  *      must have correct destination already attached by output routine.
1901  *
1902  *      Such approach solves two big problems:
1903  *      1. Not simplex devices are handled properly.
1904  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1905  */
1906
1907 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908                                u8 tos, struct net_device *dev)
1909 {
1910         struct fib_result res;
1911         struct in_device *in_dev = in_dev_get(dev);
1912         struct flowi fl = { .nl_u = { .ip4_u =
1913                                       { .daddr = daddr,
1914                                         .saddr = saddr,
1915                                         .tos = tos,
1916                                         .scope = RT_SCOPE_UNIVERSE,
1917                                       } },
1918                             .mark = skb->mark,
1919                             .iif = dev->ifindex };
1920         unsigned        flags = 0;
1921         u32             itag = 0;
1922         struct rtable * rth;
1923         unsigned        hash;
1924         __be32          spec_dst;
1925         int             err = -EINVAL;
1926         int             free_res = 0;
1927
1928         /* IP on this device is disabled. */
1929
1930         if (!in_dev)
1931                 goto out;
1932
1933         /* Check for the most weird martians, which can be not detected
1934            by fib_lookup.
1935          */
1936
1937         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938                 goto martian_source;
1939
1940         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1941                 goto brd_input;
1942
1943         /* Accept zero addresses only to limited broadcast;
1944          * I even do not know to fix it or not. Waiting for complains :-)
1945          */
1946         if (ZERONET(saddr))
1947                 goto martian_source;
1948
1949         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950                 goto martian_destination;
1951
1952         /*
1953          *      Now we are ready to route packet.
1954          */
1955         if ((err = fib_lookup(&fl, &res)) != 0) {
1956                 if (!IN_DEV_FORWARD(in_dev))
1957                         goto e_hostunreach;
1958                 goto no_route;
1959         }
1960         free_res = 1;
1961
1962         RT_CACHE_STAT_INC(in_slow_tot);
1963
1964         if (res.type == RTN_BROADCAST)
1965                 goto brd_input;
1966
1967         if (res.type == RTN_LOCAL) {
1968                 int result;
1969                 result = fib_validate_source(saddr, daddr, tos,
1970                                              loopback_dev.ifindex,
1971                                              dev, &spec_dst, &itag);
1972                 if (result < 0)
1973                         goto martian_source;
1974                 if (result)
1975                         flags |= RTCF_DIRECTSRC;
1976                 spec_dst = daddr;
1977                 goto local_input;
1978         }
1979
1980         if (!IN_DEV_FORWARD(in_dev))
1981                 goto e_hostunreach;
1982         if (res.type != RTN_UNICAST)
1983                 goto martian_destination;
1984
1985         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986         if (err == -ENOBUFS)
1987                 goto e_nobufs;
1988         if (err == -EINVAL)
1989                 goto e_inval;
1990
1991 done:
1992         in_dev_put(in_dev);
1993         if (free_res)
1994                 fib_res_put(&res);
1995 out:    return err;
1996
1997 brd_input:
1998         if (skb->protocol != htons(ETH_P_IP))
1999                 goto e_inval;
2000
2001         if (ZERONET(saddr))
2002                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2003         else {
2004                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2005                                           &itag);
2006                 if (err < 0)
2007                         goto martian_source;
2008                 if (err)
2009                         flags |= RTCF_DIRECTSRC;
2010         }
2011         flags |= RTCF_BROADCAST;
2012         res.type = RTN_BROADCAST;
2013         RT_CACHE_STAT_INC(in_brd);
2014
2015 local_input:
2016         rth = dst_alloc(&ipv4_dst_ops);
2017         if (!rth)
2018                 goto e_nobufs;
2019
2020         rth->u.dst.output= ip_rt_bug;
2021
2022         atomic_set(&rth->u.dst.__refcnt, 1);
2023         rth->u.dst.flags= DST_HOST;
2024         if (in_dev->cnf.no_policy)
2025                 rth->u.dst.flags |= DST_NOPOLICY;
2026         rth->fl.fl4_dst = daddr;
2027         rth->rt_dst     = daddr;
2028         rth->fl.fl4_tos = tos;
2029         rth->fl.mark    = skb->mark;
2030         rth->fl.fl4_src = saddr;
2031         rth->rt_src     = saddr;
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033         rth->u.dst.tclassid = itag;
2034 #endif
2035         rth->rt_iif     =
2036         rth->fl.iif     = dev->ifindex;
2037         rth->u.dst.dev  = &loopback_dev;
2038         dev_hold(rth->u.dst.dev);
2039         rth->idev       = in_dev_get(rth->u.dst.dev);
2040         rth->rt_gateway = daddr;
2041         rth->rt_spec_dst= spec_dst;
2042         rth->u.dst.input= ip_local_deliver;
2043         rth->rt_flags   = flags|RTCF_LOCAL;
2044         if (res.type == RTN_UNREACHABLE) {
2045                 rth->u.dst.input= ip_error;
2046                 rth->u.dst.error= -err;
2047                 rth->rt_flags   &= ~RTCF_LOCAL;
2048         }
2049         rth->rt_type    = res.type;
2050         hash = rt_hash(daddr, saddr, fl.iif);
2051         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052         goto done;
2053
2054 no_route:
2055         RT_CACHE_STAT_INC(in_no_route);
2056         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057         res.type = RTN_UNREACHABLE;
2058         goto local_input;
2059
2060         /*
2061          *      Do not cache martian addresses: they should be logged (RFC1812)
2062          */
2063 martian_destination:
2064         RT_CACHE_STAT_INC(in_martian_dst);
2065 #ifdef CONFIG_IP_ROUTE_VERBOSE
2066         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068                         "%u.%u.%u.%u, dev %s\n",
2069                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070 #endif
2071
2072 e_hostunreach:
2073         err = -EHOSTUNREACH;
2074         goto done;
2075
2076 e_inval:
2077         err = -EINVAL;
2078         goto done;
2079
2080 e_nobufs:
2081         err = -ENOBUFS;
2082         goto done;
2083
2084 martian_source:
2085         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086         goto e_inval;
2087 }
2088
2089 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090                    u8 tos, struct net_device *dev)
2091 {
2092         struct rtable * rth;
2093         unsigned        hash;
2094         int iif = dev->ifindex;
2095
2096         tos &= IPTOS_RT_MASK;
2097         hash = rt_hash(daddr, saddr, iif);
2098
2099         rcu_read_lock();
2100         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101              rth = rcu_dereference(rth->u.rt_next)) {
2102                 if (rth->fl.fl4_dst == daddr &&
2103                     rth->fl.fl4_src == saddr &&
2104                     rth->fl.iif == iif &&
2105                     rth->fl.oif == 0 &&
2106                     rth->fl.mark == skb->mark &&
2107                     rth->fl.fl4_tos == tos) {
2108                         rth->u.dst.lastuse = jiffies;
2109                         dst_hold(&rth->u.dst);
2110                         rth->u.dst.__use++;
2111                         RT_CACHE_STAT_INC(in_hit);
2112                         rcu_read_unlock();
2113                         skb->dst = (struct dst_entry*)rth;
2114                         return 0;
2115                 }
2116                 RT_CACHE_STAT_INC(in_hlist_search);
2117         }
2118         rcu_read_unlock();
2119
2120         /* Multicast recognition logic is moved from route cache to here.
2121            The problem was that too many Ethernet cards have broken/missing
2122            hardware multicast filters :-( As result the host on multicasting
2123            network acquires a lot of useless route cache entries, sort of
2124            SDR messages from all the world. Now we try to get rid of them.
2125            Really, provided software IP multicast filter is organized
2126            reasonably (at least, hashed), it does not result in a slowdown
2127            comparing with route cache reject entries.
2128            Note, that multicast routers are not affected, because
2129            route cache entry is created eventually.
2130          */
2131         if (MULTICAST(daddr)) {
2132                 struct in_device *in_dev;
2133
2134                 rcu_read_lock();
2135                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136                         int our = ip_check_mc(in_dev, daddr, saddr,
2137                                 skb->nh.iph->protocol);
2138                         if (our
2139 #ifdef CONFIG_IP_MROUTE
2140                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141 #endif
2142                             ) {
2143                                 rcu_read_unlock();
2144                                 return ip_route_input_mc(skb, daddr, saddr,
2145                                                          tos, dev, our);
2146                         }
2147                 }
2148                 rcu_read_unlock();
2149                 return -EINVAL;
2150         }
2151         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152 }
2153
2154 static inline int __mkroute_output(struct rtable **result,
2155                                    struct fib_result* res,
2156                                    const struct flowi *fl,
2157                                    const struct flowi *oldflp,
2158                                    struct net_device *dev_out,
2159                                    unsigned flags)
2160 {
2161         struct rtable *rth;
2162         struct in_device *in_dev;
2163         u32 tos = RT_FL_TOS(oldflp);
2164         int err = 0;
2165
2166         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167                 return -EINVAL;
2168
2169         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2170                 res->type = RTN_BROADCAST;
2171         else if (MULTICAST(fl->fl4_dst))
2172                 res->type = RTN_MULTICAST;
2173         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174                 return -EINVAL;
2175
2176         if (dev_out->flags & IFF_LOOPBACK)
2177                 flags |= RTCF_LOCAL;
2178
2179         /* get work reference to inet device */
2180         in_dev = in_dev_get(dev_out);
2181         if (!in_dev)
2182                 return -EINVAL;
2183
2184         if (res->type == RTN_BROADCAST) {
2185                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186                 if (res->fi) {
2187                         fib_info_put(res->fi);
2188                         res->fi = NULL;
2189                 }
2190         } else if (res->type == RTN_MULTICAST) {
2191                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2192                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2193                                  oldflp->proto))
2194                         flags &= ~RTCF_LOCAL;
2195                 /* If multicast route do not exist use
2196                    default one, but do not gateway in this case.
2197                    Yes, it is hack.
2198                  */
2199                 if (res->fi && res->prefixlen < 4) {
2200                         fib_info_put(res->fi);
2201                         res->fi = NULL;
2202                 }
2203         }
2204
2205
2206         rth = dst_alloc(&ipv4_dst_ops);
2207         if (!rth) {
2208                 err = -ENOBUFS;
2209                 goto cleanup;
2210         }
2211
2212         atomic_set(&rth->u.dst.__refcnt, 1);
2213         rth->u.dst.flags= DST_HOST;
2214 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215         if (res->fi) {
2216                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217                 if (res->fi->fib_nhs > 1)
2218                         rth->u.dst.flags |= DST_BALANCED;
2219         }
2220 #endif
2221         if (in_dev->cnf.no_xfrm)
2222                 rth->u.dst.flags |= DST_NOXFRM;
2223         if (in_dev->cnf.no_policy)
2224                 rth->u.dst.flags |= DST_NOPOLICY;
2225
2226         rth->fl.fl4_dst = oldflp->fl4_dst;
2227         rth->fl.fl4_tos = tos;
2228         rth->fl.fl4_src = oldflp->fl4_src;
2229         rth->fl.oif     = oldflp->oif;
2230         rth->fl.mark    = oldflp->mark;
2231         rth->rt_dst     = fl->fl4_dst;
2232         rth->rt_src     = fl->fl4_src;
2233         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2234         /* get references to the devices that are to be hold by the routing
2235            cache entry */
2236         rth->u.dst.dev  = dev_out;
2237         dev_hold(dev_out);
2238         rth->idev       = in_dev_get(dev_out);
2239         rth->rt_gateway = fl->fl4_dst;
2240         rth->rt_spec_dst= fl->fl4_src;
2241
2242         rth->u.dst.output=ip_output;
2243
2244         RT_CACHE_STAT_INC(out_slow_tot);
2245
2246         if (flags & RTCF_LOCAL) {
2247                 rth->u.dst.input = ip_local_deliver;
2248                 rth->rt_spec_dst = fl->fl4_dst;
2249         }
2250         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251                 rth->rt_spec_dst = fl->fl4_src;
2252                 if (flags & RTCF_LOCAL &&
2253                     !(dev_out->flags & IFF_LOOPBACK)) {
2254                         rth->u.dst.output = ip_mc_output;
2255                         RT_CACHE_STAT_INC(out_slow_mc);
2256                 }
2257 #ifdef CONFIG_IP_MROUTE
2258                 if (res->type == RTN_MULTICAST) {
2259                         if (IN_DEV_MFORWARD(in_dev) &&
2260                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2261                                 rth->u.dst.input = ip_mr_input;
2262                                 rth->u.dst.output = ip_mc_output;
2263                         }
2264                 }
2265 #endif
2266         }
2267
2268         rt_set_nexthop(rth, res, 0);
2269
2270         rth->rt_flags = flags;
2271
2272         *result = rth;
2273  cleanup:
2274         /* release work reference to inet device */
2275         in_dev_put(in_dev);
2276
2277         return err;
2278 }
2279
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281                                         struct fib_result* res,
2282                                         const struct flowi *fl,
2283                                         const struct flowi *oldflp,
2284                                         struct net_device *dev_out,
2285                                         unsigned flags)
2286 {
2287         struct rtable *rth = NULL;
2288         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289         unsigned hash;
2290         if (err == 0) {
2291                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2292                 err = rt_intern_hash(hash, rth, rp);
2293         }
2294
2295         return err;
2296 }
2297
/*
 * Create the output cache entry (or entries) for a FIB result.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or for a result with fewer
 * than two nexthops, this is a plain call to ip_mkroute_output_def().
 * Otherwise one cache entry is built and interned per nexthop; *rp ends
 * up pointing at the entry interned last.
 *
 * Returns 0 on success or the first error reported by
 * __mkroute_output() / rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
					oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not touch rth once interning has failed; this
			 * matches ip_mkroute_input(), which returns before
			 * calling multipath_set_nhinfo() in that case.
			 * NOTE(review): rt_intern_hash() is understood to
			 * release rth on failure - confirm against its
			 * definition.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2359
2360 /*
2361  * Major route resolver routine.
      *
      * Slow-path resolution of an output route for the flow key @oldflp; it is
      * what __ip_route_output_key() falls back to when the cache walk finds
      * no match.  A private key is built from @oldflp and then:
      *   - a supplied source address is validated (-EINVAL if it is
      *     multicast, bad class or zero-net, or if ip_dev_find() returns no
      *     device for it);
      *   - a supplied output interface is resolved (-ENODEV if the device
      *     does not exist or has no in_device);
      *   - an empty destination is routed via the loopback device;
      *   - otherwise fib_lookup() decides; a failed lookup gives
      *     -ENETUNREACH unless an output interface was supplied, in which
      *     case the destination is assumed to be on-link.
      * Every path that reaches make_route passes the result to
      * ip_mkroute_output() and returns its return value.
2362  */
2363
2364 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365 {
2366         u32 tos = RT_FL_TOS(oldflp);
             /* Private lookup key; @oldflp itself is never modified. */
2367         struct flowi fl = { .nl_u = { .ip4_u =
2368                                       { .daddr = oldflp->fl4_dst,
2369                                         .saddr = oldflp->fl4_src,
2370                                         .tos = tos & IPTOS_RT_MASK,
2371                                         .scope = ((tos & RTO_ONLINK) ?
2372                                                   RT_SCOPE_LINK :
2373                                                   RT_SCOPE_UNIVERSE),
2374                                       } },
2375                             .mark = oldflp->mark,
2376                             .iif = loopback_dev.ifindex,
2377                             .oif = oldflp->oif };
2378         struct fib_result res;
2379         unsigned flags = 0;
2380         struct net_device *dev_out = NULL;
             /* Set once fib_lookup() succeeded, so res is released at the end. */
2381         int free_res = 0;
2382         int err;
2383
2384
2385         res.fi          = NULL;
2386 #ifdef CONFIG_IP_MULTIPLE_TABLES
2387         res.r           = NULL;
2388 #endif
2389
2390         if (oldflp->fl4_src) {
2391                 err = -EINVAL;
2392                 if (MULTICAST(oldflp->fl4_src) ||
2393                     BADCLASS(oldflp->fl4_src) ||
2394                     ZERONET(oldflp->fl4_src))
2395                         goto out;
2396
2397                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398                 dev_out = ip_dev_find(oldflp->fl4_src);
2399                 if (dev_out == NULL)
2400                         goto out;
2401
2402                 /* I removed check for oif == dev_out->oif here.
2403                    It was wrong for two reasons:
2404                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2405                       assigned to multiple interfaces.
2406                    2. Moreover, we are allowed to send packets with saddr
2407                       of another iface. --ANK
2408                  */
2409
2410                 if (oldflp->oif == 0
2411                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412                         /* Special hack: user can direct multicasts
2413                            and limited broadcast via necessary interface
2414                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415                            This hack is not just for fun, it allows
2416                            vic,vat and friends to work.
2417                            They bind socket to loopback, set ttl to zero
2418                            and expect that it will work.
2419                            From the viewpoint of routing cache they are broken,
2420                            because we are not allowed to build multicast path
2421                            with loopback source addr (look, routing cache
2422                            cannot know, that ttl is zero, so that packet
2423                            will not leave this host and route is valid).
2424                            Luckily, this hack is good workaround.
2425                          */
2426
2427                         fl.oif = dev_out->ifindex;
2428                         goto make_route;
2429                 }
                     /* The device was only needed to validate the source address. */
2430                 if (dev_out)
2431                         dev_put(dev_out);
2432                 dev_out = NULL;
2433         }
2434
2435
2436         if (oldflp->oif) {
2437                 dev_out = dev_get_by_index(oldflp->oif);
2438                 err = -ENODEV;
2439                 if (dev_out == NULL)
2440                         goto out;
2441
2442                 /* RACE: Check return value of inet_select_addr instead. */
2443                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2444                         dev_put(dev_out);
2445                         goto out;       /* Wrong error code */
2446                 }
2447
2448                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2449                         if (!fl.fl4_src)
2450                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2451                                                               RT_SCOPE_LINK);
2452                         goto make_route;
2453                 }
2454                 if (!fl.fl4_src) {
2455                         if (MULTICAST(oldflp->fl4_dst))
2456                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2457                                                               fl.fl4_scope);
2458                         else if (!oldflp->fl4_dst)
2459                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2460                                                               RT_SCOPE_HOST);
2461                 }
2462         }
2463
             /* No destination: route to ourselves through the loopback device. */
2464         if (!fl.fl4_dst) {
2465                 fl.fl4_dst = fl.fl4_src;
2466                 if (!fl.fl4_dst)
2467                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2468                 if (dev_out)
2469                         dev_put(dev_out);
2470                 dev_out = &loopback_dev;
2471                 dev_hold(dev_out);
2472                 fl.oif = loopback_dev.ifindex;
2473                 res.type = RTN_LOCAL;
2474                 flags |= RTCF_LOCAL;
2475                 goto make_route;
2476         }
2477
2478         if (fib_lookup(&fl, &res)) {
2479                 res.fi = NULL;
2480                 if (oldflp->oif) {
2481                         /* Apparently, routing tables are wrong. Assume,
2482                            that the destination is on link.
2483
2484                            WHY? DW.
2485                            Because we are allowed to send to iface
2486                            even if it has NO routes and NO assigned
2487                            addresses. When oif is specified, routing
2488                            tables are looked up with only one purpose:
2489                            to catch if destination is gatewayed, rather than
2490                            direct. Moreover, if MSG_DONTROUTE is set,
2491                            we send packet, ignoring both routing tables
2492                            and ifaddr state. --ANK
2493
2494
2495                            We could make it even if oif is unknown,
2496                            likely IPv6, but we do not.
2497                          */
2498
2499                         if (fl.fl4_src == 0)
2500                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2501                                                               RT_SCOPE_LINK);
2502                         res.type = RTN_UNICAST;
2503                         goto make_route;
2504                 }
2505                 if (dev_out)
2506                         dev_put(dev_out);
2507                 err = -ENETUNREACH;
2508                 goto out;
2509         }
             /* From here on res came from a successful lookup and is put at the end. */
2510         free_res = 1;
2511
2512         if (res.type == RTN_LOCAL) {
2513                 if (!fl.fl4_src)
2514                         fl.fl4_src = fl.fl4_dst;
2515                 if (dev_out)
2516                         dev_put(dev_out);
2517                 dev_out = &loopback_dev;
2518                 dev_hold(dev_out);
2519                 fl.oif = dev_out->ifindex;
2520                 if (res.fi)
2521                         fib_info_put(res.fi);
2522                 res.fi = NULL;
2523                 flags |= RTCF_LOCAL;
2524                 goto make_route;
2525         }
2526
2527 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2528         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529                 fib_select_multipath(&fl, &res);
2530         else
2531 #endif
2532         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533                 fib_select_default(&fl, &res);
2534
2535         if (!fl.fl4_src)
2536                 fl.fl4_src = FIB_RES_PREFSRC(res);
2537
             /* Switch to the device named by the lookup result, holding it instead. */
2538         if (dev_out)
2539                 dev_put(dev_out);
2540         dev_out = FIB_RES_DEV(res);
2541         dev_hold(dev_out);
2542         fl.oif = dev_out->ifindex;
2543
2544
2545 make_route:
2546         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2547
2548
             /* Drop the working references taken above, whatever the outcome. */
2549         if (free_res)
2550                 fib_res_put(&res);
2551         if (dev_out)
2552                 dev_put(dev_out);
2553 out:    return err;
2554 }
2555
     /*
      * Look up an output route for the key @flp, trying the route cache first.
      *
      * The hash chain selected by (dst, src, oif) is walked under
      * rcu_read_lock_bh().  An entry matches when destination, source, oif
      * and mark are equal, its iif is 0, and the tos values agree in the
      * IPTOS_RT_MASK | RTO_ONLINK bits.  On a match a reference is taken on
      * the chosen entry, it is stored in *rp and 0 is returned.  If nothing
      * matches, the result of ip_route_output_slow() is returned.
      */
2556 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2557 {
2558         unsigned hash;
2559         struct rtable *rth;
2560
2561         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2562
2563         rcu_read_lock_bh();
2564         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2565                 rth = rcu_dereference(rth->u.rt_next)) {
2566                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2567                     rth->fl.fl4_src == flp->fl4_src &&
2568                     rth->fl.iif == 0 &&
2569                     rth->fl.oif == flp->oif &&
2570                     rth->fl.mark == flp->mark &&
2571                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2573
2574                         /* check for multipath routes and choose one if
2575                          * necessary
2576                          */
                             /* NOTE(review): a non-zero return presumably means
                              * *rp was filled in by multipath_select_route() -
                              * confirm against its definition.
                              */
2577                         if (multipath_select_route(flp, rth, rp)) {
2578                                 dst_hold(&(*rp)->u.dst);
2579                                 RT_CACHE_STAT_INC(out_hit);
2580                                 rcu_read_unlock_bh();
2581                                 return 0;
2582                         }
2583
                             /* Plain hit: refresh usage data and take the reference
                              * before leaving the RCU read-side section.
                              */
2584                         rth->u.dst.lastuse = jiffies;
2585                         dst_hold(&rth->u.dst);
2586                         rth->u.dst.__use++;
2587                         RT_CACHE_STAT_INC(out_hit);
2588                         rcu_read_unlock_bh();
2589                         *rp = rth;
2590                         return 0;
2591                 }
                     /* Counts chain entries inspected without a match. */
2592                 RT_CACHE_STAT_INC(out_hlist_search);
2593         }
2594         rcu_read_unlock_bh();
2595
2596         return ip_route_output_slow(rp, flp);
2597 }
2598
2599 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2600
     /*
      * Resolve an output route for @flp and, when the key names a protocol,
      * pass the result through xfrm_lookup().
      *
      * A non-zero result of __ip_route_output_key() is returned unchanged.
      * If flp->proto is zero, 0 is returned right after the route lookup.
      * Otherwise zero source/destination addresses in @flp are filled in from
      * the route found and the return value of xfrm_lookup() is returned.
      */
2601 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2602 {
2603         struct rtable *rt;
2604         int rc = __ip_route_output_key(rp, flp);
2605
2606         if (rc != 0)
2607                 return rc;
2608         if (!flp->proto)
2609                 return 0;
2610
2611         rt = *rp;
2612         if (flp->fl4_src == 0)
2613                 flp->fl4_src = rt->rt_src;
2614         if (flp->fl4_dst == 0)
2615                 flp->fl4_dst = rt->rt_dst;
2616         return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2617 }
2618
2619 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2620
     /*
      * Convenience form of ip_route_output_flow() for callers that have no
      * socket: passes sk = NULL and flags = 0 and returns its result.
      */
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622 {
2623         return ip_route_output_flow(rp, flp, NULL, 0);
2624 }
2625
     /*
      * Append one netlink route message to @skb describing the route cache
      * entry attached to it as skb->dst.
      *
      * @pid, @seq, @event and @flags are handed to nlmsg_put() for the message
      * header.  @nowait is forwarded to ipmr_get_route() and selects how its
      * failures are handled.
      *
      * Returns -ENOBUFS if nlmsg_put() fails, otherwise the result of
      * nlmsg_end() on success or of nlmsg_cancel() on the failure path (one
      * early exit returns 0 directly, see below).
      *
      * NOTE(review): the NLA_PUT*() macros presumably jump to
      * nla_put_failure when an attribute does not fit - confirm against the
      * netlink headers.
      */
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627                         int nowait, unsigned int flags)
2628 {
2629         struct rtable *rt = (struct rtable*)skb->dst;
2630         struct rtmsg *r;
2631         struct nlmsghdr *nlh;
2632         long expires;
2633         u32 id = 0, ts = 0, tsage = 0, error;
2634
2635         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2636         if (nlh == NULL)
2637                 return -ENOBUFS;
2638
             /* Fixed rtmsg header: always reported as a cloned /32 AF_INET route. */
2639         r = nlmsg_data(nlh);
2640         r->rtm_family    = AF_INET;
2641         r->rtm_dst_len  = 32;
2642         r->rtm_src_len  = 0;
2643         r->rtm_tos      = rt->fl.fl4_tos;
2644         r->rtm_table    = RT_TABLE_MAIN;
2645         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2646         r->rtm_type     = rt->rt_type;
2647         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2648         r->rtm_protocol = RTPROT_UNSPEC;
2649         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2650         if (rt->rt_flags & RTCF_NOTIFY)
2651                 r->rtm_flags |= RTM_F_NOTIFY;
2652
2653         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2654
2655         if (rt->fl.fl4_src) {
2656                 r->rtm_src_len = 32;
2657                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2658         }
2659         if (rt->u.dst.dev)
2660                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2661 #ifdef CONFIG_NET_CLS_ROUTE
2662         if (rt->u.dst.tclassid)
2663                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2664 #endif
2665 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2666         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2667                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2668 #endif
             /* Input routes (iif set) report rt_spec_dst; output routes report
              * rt_src, but only when it differs from the key's source.
              */
2669         if (rt->fl.iif)
2670                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2671         else if (rt->rt_src != rt->fl.fl4_src)
2672                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2673
2674         if (rt->rt_dst != rt->rt_gateway)
2675                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2676
2677         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2678                 goto nla_put_failure;
2679
             /* Gather the values passed to rtnl_put_cacheinfo() below. */
2680         error = rt->u.dst.error;
2681         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2682         if (rt->peer) {
2683                 id = rt->peer->ip_id_count;
2684                 if (rt->peer->tcp_ts_stamp) {
2685                         ts = rt->peer->tcp_ts;
2686                         tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2687                 }
2688         }
2689
2690         if (rt->fl.iif) {
2691 #ifdef CONFIG_IP_MROUTE
2692                 __be32 dst = rt->rt_dst;
2693
2694                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2695                     ipv4_devconf.mc_forwarding) {
2696                         int err = ipmr_get_route(skb, r, nowait);
                             /* err <= 0: without nowait, 0 ends the function
                              * with 0 and a negative value cancels the message;
                              * with nowait, only -EMSGSIZE cancels and any other
                              * value is reported through the cacheinfo error.
                              */
2697                         if (err <= 0) {
2698                                 if (!nowait) {
2699                                         if (err == 0)
2700                                                 return 0;
2701                                         goto nla_put_failure;
2702                                 } else {
2703                                         if (err == -EMSGSIZE)
2704                                                 goto nla_put_failure;
2705                                         error = err;
2706                                 }
2707                         }
2708                 } else
2709 #endif
2710                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2711         }
2712
2713         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2714                                expires, error) < 0)
2715                 goto nla_put_failure;
2716
2717         return nlmsg_end(skb, nlh);
2718
2719 nla_put_failure:
2720         return nlmsg_cancel(skb, nlh);
2721 }
2722
     /*
      * Netlink request handler (presumably for RTM_GETROUTE - confirm at the
      * registration site): resolves a route for the addresses given in the
      * request and unicasts the description back to the requester.
      *
      * With RTA_IIF set the lookup goes through ip_route_input() on a dummy
      * skb as if a packet had arrived on that device; otherwise
      * ip_route_output_key() is used with RTA_OIF as the output interface.
      *
      * Returns a negative value from parsing, allocation (-ENOBUFS), device
      * lookup (-ENODEV) or route resolution on failure; otherwise the result
      * of rtnl_unicast().  On the errout_free path the reply skb is freed.
      */
2723 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2724 {
2725         struct rtmsg *rtm;
2726         struct nlattr *tb[RTA_MAX+1];
2727         struct rtable *rt = NULL;
2728         __be32 dst = 0;
2729         __be32 src = 0;
2730         u32 iif;
2731         int err;
2732         struct sk_buff *skb;
2733
2734         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2735         if (err < 0)
2736                 goto errout;
2737
2738         rtm = nlmsg_data(nlh);
2739
2740         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2741         if (skb == NULL) {
2742                 err = -ENOBUFS;
2743                 goto errout;
2744         }
2745
2746         /* Reserve room for dummy headers, this skb can pass
2747            through good chunk of routing engine.
2748          */
2749         skb->mac.raw = skb->nh.raw = skb->data;
2750
2751         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2752         skb->nh.iph->protocol = IPPROTO_ICMP;
2753         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754
             /* Absent attributes default to 0. */
2755         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2756         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2757         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2758
2759         if (iif) {
2760                 struct net_device *dev;
2761
2762                 dev = __dev_get_by_index(iif);
2763                 if (dev == NULL) {
2764                         err = -ENODEV;
2765                         goto errout_free;
2766                 }
2767
2768                 skb->protocol   = htons(ETH_P_IP);
2769                 skb->dev        = dev;
2770                 local_bh_disable();
2771                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2772                 local_bh_enable();
2773
                     /* A route that carries an error is reported as that error. */
2774                 rt = (struct rtable*) skb->dst;
2775                 if (err == 0 && rt->u.dst.error)
2776                         err = -rt->u.dst.error;
2777         } else {
2778                 struct flowi fl = {
2779                         .nl_u = {
2780                                 .ip4_u = {
2781                                         .daddr = dst,
2782                                         .saddr = src,
2783                                         .tos = rtm->rtm_tos,
2784                                 },
2785                         },
2786                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2787                 };
2788                 err = ip_route_output_key(&rt, &fl);
2789         }
2790
2791         if (err)
2792                 goto errout_free;
2793
             /* rt_fill_info() reads the route from skb->dst. */
2794         skb->dst = &rt->u.dst;
2795         if (rtm->rtm_flags & RTM_F_NOTIFY)
2796                 rt->rt_flags |= RTCF_NOTIFY;
2797
2798         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2799                                 RTM_NEWROUTE, 0, 0);
2800         if (err <= 0)
2801                 goto errout_free;
2802
2803         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2804 errout:
2805         return err;
2806
2807 errout_free:
2808         kfree_skb(skb);
2809         goto errout;
2810 }
2811
     /*
      * Netlink dump callback: emit one RTM_NEWROUTE message per route cache
      * entry into @skb, resuming where the previous call stopped.
      *
      * cb->args[0] holds the hash bucket and cb->args[1] the index within
      * that bucket at which to resume; both are updated before returning.
      * The walk stops at the first entry for which rt_fill_info() returns
      * <= 0.  Returns skb->len.
      */
2812 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2813 {
2814         struct rtable *rt;
2815         int h, s_h;
2816         int idx, s_idx;
2817
2818         s_h = cb->args[0];
2819         s_idx = idx = cb->args[1];
2820         for (h = 0; h <= rt_hash_mask; h++) {
                     /* Skip buckets already dumped; later buckets start at entry 0. */
2821                 if (h < s_h) continue;
2822                 if (h > s_h)
2823                         s_idx = 0;
2824                 rcu_read_lock_bh();
2825                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2826                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2827                         if (idx < s_idx)
2828                                 continue;
                             /* rt_fill_info() reads the route from skb->dst, so
                              * attach it for the call and release it afterwards.
                              */
2829                         skb->dst = dst_clone(&rt->u.dst);
2830                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2831                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2832                                          1, NLM_F_MULTI) <= 0) {
2833                                 dst_release(xchg(&skb->dst, NULL));
2834                                 rcu_read_unlock_bh();
2835                                 goto done;
2836                         }
2837                         dst_release(xchg(&skb->dst, NULL));
2838                 }
2839                 rcu_read_unlock_bh();
2840         }
2841
2842 done:
             /* Remember the position for the next invocation. */
2843         cb->args[0] = h;
2844         cb->args[1] = idx;
2845         return skb->len;
2846 }
2847
     /*
      * Multicast event hook: calls rt_cache_flush() with a delay argument of
      * 0.  @in_dev is not used.
      */
2848 void ip_rt_multicast_event(struct in_device *in_dev)
2849 {
2850         rt_cache_flush(0);
2851 }
2852
2853 #ifdef CONFIG_SYSCTL
2854 static int flush_delay;
2855
     /*
      * proc handler of the "flush" sysctl entry.  A write lets
      * proc_dointvec() process the user buffer for @ctl and then flushes the
      * route cache with flush_delay as the delay; the result of
      * proc_dointvec() is not checked.  A read is rejected with -EINVAL.
      */
2856 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2857                                         struct file *filp, void __user *buffer,
2858                                         size_t *lenp, loff_t *ppos)
2859 {
2860         /* Only a write triggers a flush. */
2861         if (!write)
2862                 return -EINVAL;
2863
2864         proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2865         rt_cache_flush(flush_delay);
2866         return 0;
2867 }
2868
     /*
      * sysctl strategy routine of the "flush" entry: reads one int from
      * @newval and flushes the route cache with it as the delay.
      *
      * Returns -EINVAL unless @newlen is exactly the size of an int, -EFAULT
      * if the value cannot be fetched from user space, and 0 otherwise.  All
      * other arguments are unused.
      */
2869 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2870                                                 int __user *name,
2871                                                 int nlen,
2872                                                 void __user *oldval,
2873                                                 size_t __user *oldlenp,
2874                                                 void __user *newval,
2875                                                 size_t newlen,
2876                                                 void **context)
2877 {
2878         int delay;
2879         if (sizeof(delay) != newlen)
2880                 return -EINVAL;
2881         if (get_user(delay, (int __user *)newval) != 0)
2882                 return -EFAULT;
2883         rt_cache_flush(delay);
2884         return 0;
2885 }
2886
     /*
      * sysctl table for the IPv4 route cache tunables defined in this file.
      *
      * "flush" is write-only (mode 0200) and uses the dedicated handlers
      * above; every other entry is mode 0644 and backed directly by an int
      * variable.  Entries that use proc_dointvec_jiffies or
      * proc_dointvec_ms_jiffies go through the jiffies-converting handlers;
      * the rest use plain proc_dointvec.  The table ends with an entry whose
      * ctl_name is 0.
      */
2887 ctl_table ipv4_route_table[] = {
2888         {
2889                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2890                 .procname       = "flush",
2891                 .data           = &flush_delay,
2892                 .maxlen         = sizeof(int),
2893                 .mode           = 0200,
2894                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2895                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2896         },
2897         {
2898                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2899                 .procname       = "min_delay",
2900                 .data           = &ip_rt_min_delay,
2901                 .maxlen         = sizeof(int),
2902                 .mode           = 0644,
2903                 .proc_handler   = &proc_dointvec_jiffies,
2904                 .strategy       = &sysctl_jiffies,
2905         },
2906         {
2907                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2908                 .procname       = "max_delay",
2909                 .data           = &ip_rt_max_delay,
2910                 .maxlen         = sizeof(int),
2911                 .mode           = 0644,
2912                 .proc_handler   = &proc_dointvec_jiffies,
2913                 .strategy       = &sysctl_jiffies,
2914         },
2915         {
2916                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2917                 .procname       = "gc_thresh",
2918                 .data           = &ipv4_dst_ops.gc_thresh,
2919                 .maxlen         = sizeof(int),
2920                 .mode           = 0644,
2921                 .proc_handler   = &proc_dointvec,
2922         },
2923         {
2924                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2925                 .procname       = "max_size",
2926                 .data           = &ip_rt_max_size,
2927                 .maxlen         = sizeof(int),
2928                 .mode           = 0644,
2929                 .proc_handler   = &proc_dointvec,
2930         },
2931         {
2932                 /*  Deprecated. Use gc_min_interval_ms */
2933
2934                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935                 .procname       = "gc_min_interval",
2936                 .data           = &ip_rt_gc_min_interval,
2937                 .maxlen         = sizeof(int),
2938                 .mode           = 0644,
2939                 .proc_handler   = &proc_dointvec_jiffies,
2940                 .strategy       = &sysctl_jiffies,
2941         },
2942         {
                     /* Same variable as gc_min_interval, via the ms handlers. */
2943                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944                 .procname       = "gc_min_interval_ms",
2945                 .data           = &ip_rt_gc_min_interval,
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0644,
2948                 .proc_handler   = &proc_dointvec_ms_jiffies,
2949                 .strategy       = &sysctl_ms_jiffies,
2950         },
2951         {
2952                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2953                 .procname       = "gc_timeout",
2954                 .data           = &ip_rt_gc_timeout,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = &proc_dointvec_jiffies,
2958                 .strategy       = &sysctl_jiffies,
2959         },
2960         {
2961                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2962                 .procname       = "gc_interval",
2963                 .data           = &ip_rt_gc_interval,
2964                 .maxlen         = sizeof(int),
2965                 .mode           = 0644,
2966                 .proc_handler   = &proc_dointvec_jiffies,
2967                 .strategy       = &sysctl_jiffies,
2968         },
2969         {
2970                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2971                 .procname       = "redirect_load",
2972                 .data           = &ip_rt_redirect_load,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = &proc_dointvec,
2976         },
2977         {
2978                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979                 .procname       = "redirect_number",
2980                 .data           = &ip_rt_redirect_number,
2981                 .maxlen         = sizeof(int),
2982                 .mode           = 0644,
2983                 .proc_handler   = &proc_dointvec,
2984         },
2985         {
2986                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987                 .procname       = "redirect_silence",
2988                 .data           = &ip_rt_redirect_silence,
2989                 .maxlen         = sizeof(int),
2990                 .mode           = 0644,
2991                 .proc_handler   = &proc_dointvec,
2992         },
2993         {
2994                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2995                 .procname       = "error_cost",
2996                 .data           = &ip_rt_error_cost,
2997                 .maxlen         = sizeof(int),
2998                 .mode           = 0644,
2999                 .proc_handler   = &proc_dointvec,
3000         },
3001         {
3002                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3003                 .procname       = "error_burst",
3004                 .data           = &ip_rt_error_burst,
3005                 .maxlen         = sizeof(int),
3006                 .mode           = 0644,
3007                 .proc_handler   = &proc_dointvec,
3008         },
3009         {
3010                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3011                 .procname       = "gc_elasticity",
3012                 .data           = &ip_rt_gc_elasticity,
3013                 .maxlen         = sizeof(int),
3014                 .mode           = 0644,
3015                 .proc_handler   = &proc_dointvec,
3016         },
3017         {
3018                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3019                 .procname       = "mtu_expires",
3020                 .data           = &ip_rt_mtu_expires,
3021                 .maxlen         = sizeof(int),
3022                 .mode           = 0644,
3023                 .proc_handler   = &proc_dointvec_jiffies,
3024                 .strategy       = &sysctl_jiffies,
3025         },
3026         {
3027                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3028                 .procname       = "min_pmtu",
3029                 .data           = &ip_rt_min_pmtu,
3030                 .maxlen         = sizeof(int),
3031                 .mode           = 0644,
3032                 .proc_handler   = &proc_dointvec,
3033         },
3034         {
3035                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3036                 .procname       = "min_adv_mss",
3037                 .data           = &ip_rt_min_advmss,
3038                 .maxlen         = sizeof(int),
3039                 .mode           = 0644,
3040                 .proc_handler   = &proc_dointvec,
3041         },
3042         {
3043                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3044                 .procname       = "secret_interval",
3045                 .data           = &ip_rt_secret_interval,
3046                 .maxlen         = sizeof(int),
3047                 .mode           = 0644,
3048                 .proc_handler   = &proc_dointvec_jiffies,
3049                 .strategy       = &sysctl_jiffies,
3050         },
             /* Terminator. */
3051         { .ctl_name = 0 }
3052 };
3053 #endif
3054
3055 #ifdef CONFIG_NET_CLS_ROUTE
3056 struct ip_rt_acct *ip_rt_acct;
3057
3058 /* This code sucks.  But you should have seen it before! --RR */
3059
3060 /* IP route accounting ptr for this logical cpu number: each cpu owns a
      * run of 256 struct ip_rt_acct entries inside the ip_rt_acct array.
      * The argument is parenthesized so that an expression such as
      * IP_RT_ACCT_CPU(a + b) indexes cpu (a + b) rather than a + b * 256.
      */
3061 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3062
3063 #ifdef CONFIG_PROC_FS
     /*
      * proc read handler for the route accounting counters (registered as
      * "rt_acct" by ip_rt_init()).
      *
      * Produces the window [@offset, @offset + @length) of a 256-entry
      * struct ip_rt_acct table in which every u32 is the sum of the matching
      * per-cpu counters over all possible cpus.
      *
      * @offset and @length must both be multiples of 4, otherwise -EIO is
      * returned.  An @offset at or beyond the end of the table sets *eof and
      * returns 0; a window reaching the end is truncated and sets *eof.
      * Returns the number of bytes placed in @buffer; *start is set to
      * @buffer whenever data is produced.
      */
3064 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065                            int length, int *eof, void *data)
3066 {
3067         unsigned int i;
3068
3069         if ((offset & 3) || (length & 3))
3070                 return -EIO;
3071
3072         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073                 *eof = 1;
3074                 return 0;
3075         }
3076
3077         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3079                 *eof = 1;
3080         }
3081
             /* From here on offset counts u32 words, not bytes. */
3082         offset /= sizeof(u32);
3083
3084         if (length > 0) {
3085                 u32 *src;
3086                 u32 *dst = (u32 *) buffer;
3087
3088                 /* Start from zero: the loop below visits every possible cpu,
                      * cpu 0 included.  Seeding the buffer with a copy of cpu 0
                      * would count that cpu twice.
                      */
3089                 *start = buffer;
3090                 memset(dst, 0, length);
3091
3092                 /* Add each possible cpu in, one int at a time */
3093                 for_each_possible_cpu(i) {
3094                         unsigned int j;
3095
3096                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097
3098                         for (j = 0; j < length/4; j++)
3099                                 dst[j] += src[j];
3100                 }
3101         }
3102         return length;
3103 }
3104 #endif /* CONFIG_PROC_FS */
3105 #endif /* CONFIG_NET_CLS_ROUTE */
3106
     /* Value of the "rhash_entries=" boot parameter; passed to
      * alloc_large_system_hash() by ip_rt_init().
      */
3107 static __initdata unsigned long rhash_entries;
     /*
      * Boot parameter parser for "rhash_entries=".  Stores the number parsed
      * by simple_strtoul() (base 0, i.e. auto-detected) in rhash_entries.
      * Returns 0 when no string is given, 1 otherwise; trailing characters
      * after the number are not checked.
      */
3108 static int __init set_rhash_entries(char *str)
3109 {
3110         if (!str)
3111                 return 0;
3112         rhash_entries = simple_strtoul(str, &str, 0);
3113         return 1;
3114 }
3115 __setup("rhash_entries=", set_rhash_entries);
3116
     /*
      * Boot-time initialisation of the IPv4 routing cache.
      *
      * Seeds rt_hash_rnd, allocates the accounting area (CONFIG_NET_CLS_ROUTE),
      * the rtable slab cache and the route cache hash table, derives the
      * default gc_thresh and max_size from the table size, initialises
      * devinet and the FIB, arms the periodic and secret-rebuild timers,
      * creates the proc entries and finally initialises xfrm when configured.
      *
      * Returns 0, or -ENOMEM if one of the "rt_cache" proc entries cannot be
      * created.  Failure to allocate the accounting area panics.
      */
3117 int __init ip_rt_init(void)
3118 {
3119         int rc = 0;
3120
3121         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122                              (jiffies ^ (jiffies >> 7)));
3123
3124 #ifdef CONFIG_NET_CLS_ROUTE
3125         {
3126         int order;
             /* Smallest page order that holds 256 entries for each of NR_CPUS. */
3127         for (order = 0;
3128              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129                 /* NOTHING */;
3130         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131         if (!ip_rt_acct)
3132                 panic("IP: failed to allocate ip_rt_acct\n");
3133         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3134         }
3135 #endif
3136
3137         ipv4_dst_ops.kmem_cachep =
3138                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3139                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3140
             /* rhash_entries comes from the "rhash_entries=" boot parameter. */
3141         rt_hash_table = (struct rt_hash_bucket *)
3142                 alloc_large_system_hash("IP route cache",
3143                                         sizeof(struct rt_hash_bucket),
3144                                         rhash_entries,
3145                                         (num_physpages >= 128 * 1024) ?
3146                                         15 : 17,
3147                                         0,
3148                                         &rt_hash_log,
3149                                         &rt_hash_mask,
3150                                         0);
3151         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152         rt_hash_lock_init();
3153
             /* Defaults scale with the number of hash buckets. */
3154         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156
3157         devinet_init();
3158         ip_fib_init();
3159
3160         init_timer(&rt_flush_timer);
3161         rt_flush_timer.function = rt_run_flush;
3162         init_timer(&rt_periodic_timer);
3163         rt_periodic_timer.function = rt_check_expire;
3164         init_timer(&rt_secret_timer);
3165         rt_secret_timer.function = rt_secret_rebuild;
3166
3167         /* All the timers, started at system startup tend
3168            to synchronize. Perturb it a bit.
3169          */
3170         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171                                         ip_rt_gc_interval;
3172         add_timer(&rt_periodic_timer);
3173
3174         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3175                 ip_rt_secret_interval;
3176         add_timer(&rt_secret_timer);
3177
3178 #ifdef CONFIG_PROC_FS
3179         {
3180         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
             /* Two "rt_cache" entries: one created through proc_net_fops_create(),
              * one under proc_net_stat that is given rt_cpu_seq_fops.
              */
3181         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3182             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3183                                              proc_net_stat))) {
3184                 return -ENOMEM;
3185         }
3186         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187         }
3188 #ifdef CONFIG_NET_CLS_ROUTE
3189         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3190 #endif
3191 #endif
3192 #ifdef CONFIG_XFRM
3193         xfrm_init();
3194         xfrm4_init();
3195 #endif
3196         return rc;
3197 }
3198
     /* Routing entry points exported for use by modules. */
3199 EXPORT_SYMBOL(__ip_select_ident);
3200 EXPORT_SYMBOL(ip_route_input);
3201 EXPORT_SYMBOL(ip_route_output_key);