release/src-rt/linux/linux-2.6/net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/mm.h>
  74 #include <linux/bootmem.h>
  75 #include <linux/string.h>
  76 #include <linux/socket.h>
  77 #include <linux/sockios.h>
  78 #include <linux/errno.h>
  79 #include <linux/in.h>
  80 #include <linux/inet.h>
  81 #include <linux/netdevice.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/init.h>
  84 #include <linux/workqueue.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/inetdevice.h>
  87 #include <linux/igmp.h>
  88 #include <linux/pkt_sched.h>
  89 #include <linux/mroute.h>
  90 #include <linux/netfilter_ipv4.h>
  91 #include <linux/random.h>
  92 #include <linux/jhash.h>
  93 #include <linux/rcupdate.h>
  94 #include <linux/times.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/ip_mp_alg.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_min_delay              = 2 * HZ;
 120 static int ip_rt_max_delay              = 10 * HZ;
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval            = 60 * HZ;
 124 static int ip_rt_gc_min_interval        = HZ / 2;
 125 static int ip_rt_redirect_number        = 9;
 126 static int ip_rt_redirect_load          = HZ / 50;
 127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost             = HZ;
 129 static int ip_rt_error_burst            = 5 * HZ;
 130 static int ip_rt_gc_elasticity          = 8;
 131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 133 static int ip_rt_min_advmss             = 256;
 134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 135 static int ip_rt_flush_expected;
 136 static unsigned long rt_deadline;
 137
 138 #define RTprint(a...)   printk(KERN_DEBUG a)
 139
 140 static struct timer_list rt_flush_timer;
 141 static void rt_worker_func(struct work_struct *work);
 142 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 143 static struct timer_list rt_secret_timer;
 144
 145 /*
 146  *      Interface to generic destination cache.
 147  */
 148
 149 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 150 static void              ipv4_dst_destroy(struct dst_entry *dst);
 151 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 152                                          struct net_device *dev, int how);
 153 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 154 static void              ipv4_link_failure(struct sk_buff *skb);
 155 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 156 static int rt_garbage_collect(void);
 157
 158
 159 static struct dst_ops ipv4_dst_ops = {
 160         .family =               AF_INET,
 161         .protocol =             __constant_htons(ETH_P_IP),
 162         .gc =                   rt_garbage_collect,
 163         .check =                ipv4_dst_check,
 164         .destroy =              ipv4_dst_destroy,
 165         .ifdown =               ipv4_dst_ifdown,
 166         .negative_advice =      ipv4_negative_advice,
 167         .link_failure =         ipv4_link_failure,
 168         .update_pmtu =          ip_rt_update_pmtu,
 169         .entry_size =           sizeof(struct rtable),
 170 };
 171
 172 #define ECN_OR_COST(class)      TC_PRIO_##class
 173
 174 __u8 ip_tos2prio[16] = {
 175         TC_PRIO_BESTEFFORT,
 176         ECN_OR_COST(FILLER),
 177         TC_PRIO_BESTEFFORT,
 178         ECN_OR_COST(BESTEFFORT),
 179         TC_PRIO_BULK,
 180         ECN_OR_COST(BULK),
 181         TC_PRIO_BULK,
 182         ECN_OR_COST(BULK),
 183         TC_PRIO_INTERACTIVE,
 184         ECN_OR_COST(INTERACTIVE),
 185         TC_PRIO_INTERACTIVE,
 186         ECN_OR_COST(INTERACTIVE),
 187         TC_PRIO_INTERACTIVE_BULK,
 188         ECN_OR_COST(INTERACTIVE_BULK),
 189         TC_PRIO_INTERACTIVE_BULK,
 190         ECN_OR_COST(INTERACTIVE_BULK)
 191 };
 192
 193
 194 /*
 195  * Route cache.
 196  */
 197
/* The locking scheme is rather straightforward:
 199  *
 200  * 1) Read-Copy Update protects the buckets of the central route hash.
 201  * 2) Only writers remove entries, and they hold the lock
 202  *    as they look at rtable reference counts.
 203  * 3) Only readers acquire references to rtable entries,
 204  *    they do so with atomic increments and with the
 205  *    lock held.
 206  */
 207
 208 struct rt_hash_bucket {
 209         struct rtable   *chain;
 210 };
 211 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 212         defined(CONFIG_PROVE_LOCKING)
 213 /*
 214  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 215  * The size of this table is a power of two and depends on the number of CPUS.
 216  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 217  */
 218 #ifdef CONFIG_LOCKDEP
 219 # define RT_HASH_LOCK_SZ        256
 220 #else
 221 # if NR_CPUS >= 32
 222 #  define RT_HASH_LOCK_SZ       4096
 223 # elif NR_CPUS >= 16
 224 #  define RT_HASH_LOCK_SZ       2048
 225 # elif NR_CPUS >= 8
 226 #  define RT_HASH_LOCK_SZ       1024
 227 # elif NR_CPUS >= 4
 228 #  define RT_HASH_LOCK_SZ       512
 229 # else
 230 #  define RT_HASH_LOCK_SZ       256
 231 # endif
 232 #endif
 233
 234 static spinlock_t       *rt_hash_locks;
 235 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 236 # define rt_hash_lock_init()    { \
 237                 int i; \
 238                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 239                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 240                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 241                         spin_lock_init(&rt_hash_locks[i]); \
 242                 }
 243 #else
 244 # define rt_hash_lock_addr(slot) NULL
 245 # define rt_hash_lock_init()
 246 #endif
 247
 248 static struct rt_hash_bucket    *rt_hash_table;
 249 static unsigned                 rt_hash_mask;
 250 static int                      rt_hash_log;
 251 static unsigned int             rt_hash_rnd;
 252
 253 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 254 #define RT_CACHE_STAT_INC(field) \
 255         (__raw_get_cpu_var(rt_cache_stat).field++)
 256
 257 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 258                                 struct rtable **res);
 259
 260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
 261 {
 262         return jhash_3words((__force u32)(__be32)(daddr),
 263                             (__force u32)(__be32)(saddr),
 264                             idx, rt_hash_rnd)
 265                 & rt_hash_mask;
 266 }
 267
 268 #ifdef CONFIG_PROC_FS
 269 struct rt_cache_iter_state {
 270         int bucket;
 271 };
 272
 273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 274 {
 275         struct rtable *r = NULL;
 276         struct rt_cache_iter_state *st = seq->private;
 277
 278         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 279                 if (!rt_hash_table[st->bucket].chain)
 280                         continue;
 281                 rcu_read_lock_bh();
 282                 r = rt_hash_table[st->bucket].chain;
 283                 if (r)
 284                         break;
 285                 rcu_read_unlock_bh();
 286         }
 287         return rcu_dereference(r);
 288 }
 289
 290 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 291 {
 292         struct rt_cache_iter_state *st = seq->private;
 293
 294         r = r->u.dst.rt_next;
 295         while (!r) {
 296                 rcu_read_unlock_bh();
 297                 do {
 298                         if (--st->bucket < 0)
 299                                 return NULL;
 300                 } while (!rt_hash_table[st->bucket].chain);
 301                 rcu_read_lock_bh();
 302                 r = rt_hash_table[st->bucket].chain;
 303         }
 304         return rcu_dereference(r);
 305 }
 306
 307 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 308 {
 309         struct rtable *r = rt_cache_get_first(seq);
 310
 311         if (r)
 312                 while (pos && (r = rt_cache_get_next(seq, r)))
 313                         --pos;
 314         return pos ? NULL : r;
 315 }
 316
 317 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 318 {
 319         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 320 }
 321
 322 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 323 {
 324         struct rtable *r = NULL;
 325
 326         if (v == SEQ_START_TOKEN)
 327                 r = rt_cache_get_first(seq);
 328         else
 329                 r = rt_cache_get_next(seq, v);
 330         ++*pos;
 331         return r;
 332 }
 333
 334 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 335 {
 336         if (v && v != SEQ_START_TOKEN)
 337                 rcu_read_unlock_bh();
 338 }
 339
 340 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 341 {
 342         if (v == SEQ_START_TOKEN)
 343                 seq_printf(seq, "%-127s\n",
 344                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 345                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 346                            "HHUptod\tSpecDst");
 347         else {
 348                 struct rtable *r = v;
 349                 char temp[256];
 350
 351                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 352                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 353                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 354                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 355                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 356                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 357                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 358                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 359                         dst_metric(&r->u.dst, RTAX_WINDOW),
 360                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 361                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 362                         r->fl.fl4_tos,
 363                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 364                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 365                                        dev_queue_xmit) : 0,
 366                         r->rt_spec_dst);
 367                 seq_printf(seq, "%-127s\n", temp);
 368         }
 369         return 0;
 370 }
 371
 372 static const struct seq_operations rt_cache_seq_ops = {
 373         .start  = rt_cache_seq_start,
 374         .next   = rt_cache_seq_next,
 375         .stop   = rt_cache_seq_stop,
 376         .show   = rt_cache_seq_show,
 377 };
 378
 379 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 380 {
 381         struct seq_file *seq;
 382         int rc = -ENOMEM;
 383         struct rt_cache_iter_state *s;
 384
 385         s = kzalloc(sizeof(*s), GFP_KERNEL);
 386         if (!s)
 387                 goto out;
 388         rc = seq_open(file, &rt_cache_seq_ops);
 389         if (rc)
 390                 goto out_kfree;
 391         seq          = file->private_data;
 392         seq->private = s;
 393 out:
 394         return rc;
 395 out_kfree:
 396         kfree(s);
 397         goto out;
 398 }
 399
 400 static const struct file_operations rt_cache_seq_fops = {
 401         .owner   = THIS_MODULE,
 402         .open    = rt_cache_seq_open,
 403         .read    = seq_read,
 404         .llseek  = seq_lseek,
 405         .release = seq_release_private,
 406 };
 407
 408
 409 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 410 {
 411         int cpu;
 412
 413         if (*pos == 0)
 414                 return SEQ_START_TOKEN;
 415
 416         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 417                 if (!cpu_possible(cpu))
 418                         continue;
 419                 *pos = cpu+1;
 420                 return &per_cpu(rt_cache_stat, cpu);
 421         }
 422         return NULL;
 423 }
 424
 425 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 426 {
 427         int cpu;
 428
 429         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 430                 if (!cpu_possible(cpu))
 431                         continue;
 432                 *pos = cpu+1;
 433                 return &per_cpu(rt_cache_stat, cpu);
 434         }
 435         return NULL;
 436
 437 }
 438
 439 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 440 {
 441
 442 }
 443
 444 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 445 {
 446         struct rt_cache_stat *st = v;
 447
 448         if (v == SEQ_START_TOKEN) {
 449                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 450                 return 0;
 451         }
 452
 453         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 454                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 455                    atomic_read(&ipv4_dst_ops.entries),
 456                    st->in_hit,
 457                    st->in_slow_tot,
 458                    st->in_slow_mc,
 459                    st->in_no_route,
 460                    st->in_brd,
 461                    st->in_martian_dst,
 462                    st->in_martian_src,
 463
 464                    st->out_hit,
 465                    st->out_slow_tot,
 466                    st->out_slow_mc,
 467
 468                    st->gc_total,
 469                    st->gc_ignored,
 470                    st->gc_goal_miss,
 471                    st->gc_dst_overflow,
 472                    st->in_hlist_search,
 473                    st->out_hlist_search
 474                 );
 475         return 0;
 476 }
 477
 478 static const struct seq_operations rt_cpu_seq_ops = {
 479         .start  = rt_cpu_seq_start,
 480         .next   = rt_cpu_seq_next,
 481         .stop   = rt_cpu_seq_stop,
 482         .show   = rt_cpu_seq_show,
 483 };
 484
 485
 486 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 487 {
 488         return seq_open(file, &rt_cpu_seq_ops);
 489 }
 490
 491 static const struct file_operations rt_cpu_seq_fops = {
 492         .owner   = THIS_MODULE,
 493         .open    = rt_cpu_seq_open,
 494         .read    = seq_read,
 495         .llseek  = seq_lseek,
 496         .release = seq_release,
 497 };
 498
 499 #endif /* CONFIG_PROC_FS */
 500
 501 static inline void rt_free(struct rtable *rt)
 502 {
 503         multipath_remove(rt);
 504         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 505 }
 506
 507 static inline void rt_drop(struct rtable *rt)
 508 {
 509         multipath_remove(rt);
 510         ip_rt_put(rt);
 511         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 512 }
 513
 514 static inline int rt_fast_clean(struct rtable *rth)
 515 {
 516         /* Kill broadcast/multicast entries very aggresively, if they
 517            collide in hash table with more useful entries */
 518         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 519                 rth->fl.iif && rth->u.dst.rt_next;
 520 }
 521
 522 static inline int rt_valuable(struct rtable *rth)
 523 {
 524         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 525                 rth->u.dst.expires;
 526 }
 527
 528 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 529 {
 530         unsigned long age;
 531         int ret = 0;
 532
 533         if (atomic_read(&rth->u.dst.__refcnt))
 534                 goto out;
 535
 536         ret = 1;
 537         if (rth->u.dst.expires &&
 538             time_after_eq(jiffies, rth->u.dst.expires))
 539                 goto out;
 540
 541         age = jiffies - rth->u.dst.lastuse;
 542         ret = 0;
 543         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 544             (age <= tmo2 && rt_valuable(rth)))
 545                 goto out;
 546         ret = 1;
 547 out:    return ret;
 548 }
 549
 550 /* Bits of score are:
 551  * 31: very valuable
 552  * 30: not quite useless
 553  * 29..0: usage counter
 554  */
 555 static inline u32 rt_score(struct rtable *rt)
 556 {
 557         u32 score = jiffies - rt->u.dst.lastuse;
 558
 559         score = ~score & ~(3<<30);
 560
 561         if (rt_valuable(rt))
 562                 score |= (1<<31);
 563
 564         if (!rt->fl.iif ||
 565             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 566                 score |= (1<<30);
 567
 568         return score;
 569 }
 570
 571 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 572 {
 573         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 574                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 575                 (fl1->mark ^ fl2->mark) |
 576                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 577                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 578                 (fl1->oif ^ fl2->oif) |
 579                 (fl1->iif ^ fl2->iif)) == 0;
 580 }
 581
 582 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 583 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 584                                                 struct rtable *expentry,
 585                                                 int *removed_count)
 586 {
 587         int passedexpired = 0;
 588         struct rtable **nextstep = NULL;
 589         struct rtable **rthp = chain_head;
 590         struct rtable *rth;
 591
 592         if (removed_count)
 593                 *removed_count = 0;
 594
 595         while ((rth = *rthp) != NULL) {
 596                 if (rth == expentry)
 597                         passedexpired = 1;
 598
 599                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 600                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 601                         if (*rthp == expentry) {
 602                                 *rthp = rth->u.dst.rt_next;
 603                                 continue;
 604                         } else {
 605                                 *rthp = rth->u.dst.rt_next;
 606                                 rt_free(rth);
 607                                 if (removed_count)
 608                                         ++(*removed_count);
 609                         }
 610                 } else {
 611                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 612                             passedexpired && !nextstep)
 613                                 nextstep = &rth->u.dst.rt_next;
 614
 615                         rthp = &rth->u.dst.rt_next;
 616                 }
 617         }
 618
 619         rt_free(expentry);
 620         if (removed_count)
 621                 ++(*removed_count);
 622
 623         return nextstep;
 624 }
 625 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 626
 627
 628 /*
 629  * Perform a full scan of hash table and free all entries.
 630  * Can be called by a softirq or a process.
 631  * In the later case, we want to be reschedule if necessary
 632  */
 633 static void rt_do_flush(int process_context)
 634 {
 635         unsigned int i;
 636         struct rtable *rth, *next;
 637
 638         for (i = 0; i <= rt_hash_mask; i++) {
 639                 if (process_context && need_resched())
 640                         cond_resched();
 641                 rth = rt_hash_table[i].chain;
 642                 if (!rth)
 643                         continue;
 644
 645                 spin_lock_bh(rt_hash_lock_addr(i));
 646                 rth = rt_hash_table[i].chain;
 647                 rt_hash_table[i].chain = NULL;
 648                 spin_unlock_bh(rt_hash_lock_addr(i));
 649
 650                 for (; rth; rth = next) {
 651                         next = rth->u.dst.rt_next;
 652                         rt_free(rth);
 653                 }
 654         }
 655 }
 656
 657 static void rt_check_expire(void)
 658 {
 659         static unsigned int rover;
 660         unsigned int i = rover, goal;
 661         struct rtable *rth, **rthp;
 662         u64 mult;
 663
 664         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 665         if (ip_rt_gc_timeout > 1)
 666                 do_div(mult, ip_rt_gc_timeout);
 667         goal = (unsigned int)mult;
 668         if (goal > rt_hash_mask)
 669                 goal = rt_hash_mask + 1;
 670         for (; goal > 0; goal--) {
 671                 unsigned long tmo = ip_rt_gc_timeout;
 672
 673                 i = (i + 1) & rt_hash_mask;
 674                 rthp = &rt_hash_table[i].chain;
 675
 676                 if (need_resched())
 677                         cond_resched();
 678
 679                 if (*rthp == 0)
 680                         continue;
 681                 spin_lock_bh(rt_hash_lock_addr(i));
 682                 while ((rth = *rthp) != NULL) {
 683                         if (rth->u.dst.expires) {
 684                                 /* Entry is expired even if it is in use */
 685                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 686                                         tmo >>= 1;
 687                                         rthp = &rth->u.dst.rt_next;
 688                                         continue;
 689                                 }
 690                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 691                                 tmo >>= 1;
 692                                 rthp = &rth->u.dst.rt_next;
 693                                 continue;
 694                         }
 695
 696                         /* Cleanup aged off entries. */
 697 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 698                         /* remove all related balanced entries if necessary */
 699                         if (rth->u.dst.flags & DST_BALANCED) {
 700                                 rthp = rt_remove_balanced_route(
 701                                         &rt_hash_table[i].chain,
 702                                         rth, NULL);
 703                                 if (!rthp)
 704                                         break;
 705                         } else {
 706                                 *rthp = rth->u.dst.rt_next;
 707                                 rt_free(rth);
 708                         }
 709 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 710                         *rthp = rth->u.dst.rt_next;
 711                         rt_free(rth);
 712 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 713                 }
 714                 spin_unlock_bh(rt_hash_lock_addr(i));
 715         }
 716         rover = i;
 717 }
 718
 719 /*
 720  * rt_worker_func() is run in process context.
 721  * If a whole flush was scheduled, it is done.
 722  * Else, we call rt_check_expire() to scan part of the hash table
 723  */
 724 static void rt_worker_func(struct work_struct *work)
 725 {
 726         if (ip_rt_flush_expected) {
 727                 ip_rt_flush_expected = 0;
 728                 rt_do_flush(1);
 729         } else
 730                 rt_check_expire();
 731         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 732 }
 733
 734 /* This can run from both BH and non-BH contexts, the latter
 735  * in the case of a forced flush event.
 736  */
 737 static void rt_run_flush(unsigned long process_context)
 738 {
 739         rt_deadline = 0;
 740
 741         get_random_bytes(&rt_hash_rnd, 4);
 742
 743         rt_do_flush(process_context);
 744 }
 745
 746 static DEFINE_SPINLOCK(rt_flush_lock);
 747
 748 void rt_cache_flush(int delay)
 749 {
 750         unsigned long now = jiffies;
 751         int user_mode = !in_softirq();
 752
 753         if (delay < 0)
 754                 delay = ip_rt_min_delay;
 755
 756         /* flush existing multipath state*/
 757         multipath_flush();
 758
 759         spin_lock_bh(&rt_flush_lock);
 760
 761         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 762                 long tmo = (long)(rt_deadline - now);
 763
 764                 /* If flush timer is already running
 765                    and flush request is not immediate (delay > 0):
 766
 767                    if deadline is not achieved, prolongate timer to "delay",
 768                    otherwise fire it at deadline time.
 769                  */
 770
 771                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 772                         tmo = 0;
 773
 774                 if (delay > tmo)
 775                         delay = tmo;
 776         }
 777
 778         if (delay <= 0) {
 779                 spin_unlock_bh(&rt_flush_lock);
 780                 rt_run_flush(user_mode);
 781                 return;
 782         }
 783
 784         if (rt_deadline == 0)
 785                 rt_deadline = now + ip_rt_max_delay;
 786
 787         mod_timer(&rt_flush_timer, now+delay);
 788         spin_unlock_bh(&rt_flush_lock);
 789 }
 790
 791 /*
 792  * We change rt_hash_rnd and ask next rt_worker_func() invocation
 793  * to perform a flush in process context
 794  */
 795 static void rt_secret_rebuild(unsigned long dummy)
 796 {
 797         get_random_bytes(&rt_hash_rnd, 4);
 798         ip_rt_flush_expected = 1;
 799         cancel_delayed_work(&expires_work);
 800         schedule_delayed_work(&expires_work, HZ/10);
 801         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 802 }
 803
/*
   Short description of GC goals.

   We want to build an algorithm that keeps the routing cache
   at an equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
 816
 817 static int rt_garbage_collect(void)
 818 {
 819         static unsigned long expire = RT_GC_TIMEOUT;
 820         static unsigned long last_gc;
 821         static int rover;
 822         static int equilibrium;
 823         struct rtable *rth, **rthp;
 824         unsigned long now = jiffies;
 825         int goal;
 826
 827         /*
 828          * Garbage collection is pretty expensive,
 829          * do not make it too frequently.
 830          */
 831
 832         RT_CACHE_STAT_INC(gc_total);
 833
 834         if (now - last_gc < ip_rt_gc_min_interval &&
 835             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 836                 RT_CACHE_STAT_INC(gc_ignored);
 837                 goto out;
 838         }
 839
 840         /* Calculate number of entries, which we want to expire now. */
 841         goal = atomic_read(&ipv4_dst_ops.entries) -
 842                 (ip_rt_gc_elasticity << rt_hash_log);
 843         if (goal <= 0) {
 844                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 845                         equilibrium = ipv4_dst_ops.gc_thresh;
 846                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 847                 if (goal > 0) {
 848                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 849                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 850                 }
 851         } else {
 852                 /* We are in dangerous area. Try to reduce cache really
 853                  * aggressively.
 854                  */
 855                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 856                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 857         }
 858
 859         if (now - last_gc >= ip_rt_gc_min_interval)
 860                 last_gc = now;
 861
 862         if (goal <= 0) {
 863                 equilibrium += goal;
 864                 goto work_done;
 865         }
 866
 867         do {
 868                 int i, k;
 869
 870                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 871                         unsigned long tmo = expire;
 872
 873                         k = (k + 1) & rt_hash_mask;
 874                         rthp = &rt_hash_table[k].chain;
 875                         spin_lock_bh(rt_hash_lock_addr(k));
 876                         while ((rth = *rthp) != NULL) {
 877                                 if (!rt_may_expire(rth, tmo, expire)) {
 878                                         tmo >>= 1;
 879                                         rthp = &rth->u.dst.rt_next;
 880                                         continue;
 881                                 }
 882 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 883                                 /* remove all related balanced entries
 884                                  * if necessary
 885                                  */
 886                                 if (rth->u.dst.flags & DST_BALANCED) {
 887                                         int r;
 888
 889                                         rthp = rt_remove_balanced_route(
 890                                                 &rt_hash_table[k].chain,
 891                                                 rth,
 892                                                 &r);
 893                                         goal -= r;
 894                                         if (!rthp)
 895                                                 break;
 896                                 } else {
 897                                         *rthp = rth->u.dst.rt_next;
 898                                         rt_free(rth);
 899                                         goal--;
 900                                 }
 901 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 902                                 *rthp = rth->u.dst.rt_next;
 903                                 rt_free(rth);
 904                                 goal--;
 905 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 906                         }
 907                         spin_unlock_bh(rt_hash_lock_addr(k));
 908                         if (goal <= 0)
 909                                 break;
 910                 }
 911                 rover = k;
 912
 913                 if (goal <= 0)
 914                         goto work_done;
 915
 916                 /* Goal is not achieved. We stop process if:
 917
 918                    - if expire reduced to zero. Otherwise, expire is halfed.
 919                    - if table is not full.
 920                    - if we are called from interrupt.
 921                    - jiffies check is just fallback/debug loop breaker.
 922                      We will not spin here for long time in any case.
 923                  */
 924
 925                 RT_CACHE_STAT_INC(gc_goal_miss);
 926
 927                 if (expire == 0)
 928                         break;
 929
 930                 expire >>= 1;
 931 #if RT_CACHE_DEBUG >= 2
 932                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 933                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 934 #endif
 935
 936                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 937                         goto out;
 938         } while (!in_softirq() && time_before_eq(jiffies, now));
 939
 940         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 941                 goto out;
 942         if (net_ratelimit())
 943                 printk(KERN_WARNING "dst cache overflow\n");
 944         RT_CACHE_STAT_INC(gc_dst_overflow);
 945         return 1;
 946
 947 work_done:
 948         expire += ip_rt_gc_min_interval;
 949         if (expire > ip_rt_gc_timeout ||
 950             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 951                 expire = ip_rt_gc_timeout;
 952 #if RT_CACHE_DEBUG >= 2
 953         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 954                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 955 #endif
 956 out:    return 0;
 957 }
 958
/*
 * Insert route "rt" into hash bucket "hash", or reuse an equivalent entry.
 *
 * If the chain already holds an entry whose flow keys compare equal to
 * rt's, that entry is moved to the chain head, gets an extra reference,
 * "rt" is dropped, and the existing entry is returned through *rp.
 * Otherwise "rt" is bound to a neighbour (for unicast or output routes)
 * and linked at the head of the chain, and *rp is set to "rt".
 *
 * Returns 0 on success, or the error from arp_bind_neighbour() (in which
 * case "rt" has been dropped and *rp is not written).
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        /* One GC-and-retry attempt is allowed, but only outside softirq. */
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                if (!(rth->u.dst.flags & DST_BALANCED) &&
                    compare_keys(&rth->fl, &rt->fl)) {
#else
                if (compare_keys(&rth->fl, &rt->fl)) {
#endif
                        /* Put it first */
                        *rthp = rth->u.dst.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.dst.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        /* The caller's new route is redundant. */
                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                /* Among unreferenced entries, remember the one with the
                 * lowest score (the last one on ties) as eviction candidate.
                 */
                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.dst.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be average length of chain
                 * length, when exceeded gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                /* Temporarily force the most aggressive GC
                                 * settings, run it, then restore them.
                                 */
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        /* Link the new route in front of the current chain head. */
        rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        /*
         * Since lookup is lockfree, we must make sure
         * previous writes to rt are comitted to memory
         * before making rt visible to other CPUS.
         */
        rcu_assign_pointer(rt_hash_table[hash].chain, rt);
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}
1094
1095 void rt_bind_peer(struct rtable *rt, int create)
1096 {
1097         static DEFINE_SPINLOCK(rt_peer_lock);
1098         struct inet_peer *peer;
1099
1100         peer = inet_getpeer(rt->rt_dst, create);
1101
1102         spin_lock_bh(&rt_peer_lock);
1103         if (rt->peer == NULL) {
1104                 rt->peer = peer;
1105                 peer = NULL;
1106         }
1107         spin_unlock_bh(&rt_peer_lock);
1108         if (peer)
1109                 inet_putpeer(peer);
1110 }
1111
1112 /*
1113  * Peer allocation may fail only in serious out-of-memory conditions.  However
1114  * we still can generate some output.
1115  * Random ID selection looks a bit dangerous because we have no chances to
1116  * select ID being unique in a reasonable period of time.
1117  * But broken packet identifier may be better than no packet at all.
1118  */
1119 static void ip_select_fb_ident(struct iphdr *iph)
1120 {
1121         static DEFINE_SPINLOCK(ip_fb_id_lock);
1122         static u32 ip_fallback_id;
1123         u32 salt;
1124
1125         spin_lock_bh(&ip_fb_id_lock);
1126         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1127         iph->id = htons(salt & 0xFFFF);
1128         ip_fallback_id = salt;
1129         spin_unlock_bh(&ip_fb_id_lock);
1130 }
1131
1132 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1133 {
1134         struct rtable *rt = (struct rtable *) dst;
1135
1136         if (rt) {
1137                 if (rt->peer == NULL)
1138                         rt_bind_peer(rt, 1);
1139
1140                 /* If peer is attached to destination, it is never detached,
1141                    so that we need not to grab a lock to dereference it.
1142                  */
1143                 if (rt->peer) {
1144                         iph->id = htons(inet_getid(rt->peer, more));
1145                         return;
1146                 }
1147         } else
1148                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1149                        __builtin_return_address(0));
1150
1151         ip_select_fb_ident(iph);
1152 }
1153
1154 static void rt_del(unsigned hash, struct rtable *rt)
1155 {
1156         struct rtable **rthp;
1157
1158         spin_lock_bh(rt_hash_lock_addr(hash));
1159         ip_rt_put(rt);
1160         for (rthp = &rt_hash_table[hash].chain; *rthp;
1161              rthp = &(*rthp)->u.dst.rt_next)
1162                 if (*rthp == rt) {
1163                         *rthp = rt->u.dst.rt_next;
1164                         rt_free(rt);
1165                         break;
1166                 }
1167         spin_unlock_bh(rt_hash_lock_addr(hash));
1168 }
1169
/*
 * Act on a redirect that advises using new_gw instead of old_gw for
 * traffic from saddr to daddr on device dev.
 *
 * After validating the advised gateway, the cache is searched under every
 * combination of source key {saddr, 0} and interface key {dev->ifindex, 0}.
 * For a matching output route that currently goes through old_gw on dev,
 * a copy pointing at new_gw (flagged RTCF_REDIRECTED) is created and
 * replaces the old entry in the hash.  Rejected redirects are optionally
 * logged.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        __be32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;

        if (!in_dev)
                return;

        /* Refuse no-op redirects, redirects disabled on this device, and
         * gateways that are multicast, bad-class or zeronet addresses.
         */
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                /* Skip entries with different flow keys. */
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }

                                /* Keys match but the route is not the one
                                 * the redirect talks about: give up on
                                 * this chain.
                                 */
                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                /* Pin the entry before leaving the RCU
                                 * read-side section.
                                 */
                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                /* Then reset the per-entry state that must
                                 * not be shared with the original.
                                 */
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;
                                if (rt->fl.fl4_gw) rt->fl.fl4_gw = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                /* The copy shares the peer; take its own
                                 * reference.
                                 */
                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                /* Use the new route only if its neighbour
                                 * could be bound and is in a NUD_VALID state.
                                 */
                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                /* Replace the old entry with the new one;
                                 * rt_del() also drops the reference taken
                                 * by dst_hold() above.
                                 */
                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
        in_dev_put(in_dev);
}
1300
1301 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1302 {
1303         struct rtable *rt = (struct rtable*)dst;
1304         struct dst_entry *ret = dst;
1305
1306         if (rt) {
1307                 if (dst->obsolete) {
1308                         ip_rt_put(rt);
1309                         ret = NULL;
1310                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1311                            (rt->u.dst.expires &&
1312                             time_after_eq(jiffies, rt->u.dst.expires))) {
1313                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1314                                                 rt->fl.oif);
1315 #if RT_CACHE_DEBUG >= 1
1316                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1317                                           "%u.%u.%u.%u/%02x dropped\n",
1318                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1319 #endif
1320                         rt_del(hash, rt);
1321                         ret = NULL;
1322                 }
1323         }
1324         return ret;
1325 }
1326
1327 /*
1328  * Algorithm:
1329  *      1. The first ip_rt_redirect_number redirects are sent
1330  *         with exponential backoff, then we stop sending them at all,
1331  *         assuming that the host ignores our redirects.
1332  *      2. If we did not see packets requiring redirects
1333  *         during ip_rt_redirect_silence, we assume that the host
1334  *         forgot redirected route and start to send redirects again.
1335  *
1336  * This algorithm is much cheaper and more intelligent than dumb load limiting
1337  * in icmp.c.
1338  *
1339  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1340  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1341  */
1342
1343 void ip_rt_send_redirect(struct sk_buff *skb)
1344 {
1345         struct rtable *rt = (struct rtable*)skb->dst;
1346         struct in_device *in_dev;
1347         int log_martians;
1348
1349         rcu_read_lock();
1350         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1351         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1352                 rcu_read_unlock();
1353                 return;
1354         }
1355         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1356         rcu_read_unlock();
1357
1358         /* No redirected packets during ip_rt_redirect_silence;
1359          * reset the algorithm.
1360          */
1361         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1362                 rt->u.dst.rate_tokens = 0;
1363
1364         /* Too many ignored redirects; do not send anything
1365          * set u.dst.rate_last to the last seen redirected packet.
1366          */
1367         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1368                 rt->u.dst.rate_last = jiffies;
1369                 return;
1370         }
1371
1372         /* Check for load limit; set rate_last to the latest sent
1373          * redirect.
1374          */
1375         if (rt->u.dst.rate_tokens == 0 ||
1376             time_after(jiffies,
1377                        (rt->u.dst.rate_last +
1378                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1379                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1380                 rt->u.dst.rate_last = jiffies;
1381                 ++rt->u.dst.rate_tokens;
1382 #ifdef CONFIG_IP_ROUTE_VERBOSE
1383                 if (log_martians &&
1384                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1385                     net_ratelimit())
1386                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1387                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1388                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1389                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1390 #endif
1391         }
1392 }
1393
1394 static int ip_error(struct sk_buff *skb)
1395 {
1396         struct rtable *rt = (struct rtable*)skb->dst;
1397         unsigned long now;
1398         int code;
1399
1400         switch (rt->u.dst.error) {
1401                 case EINVAL:
1402                 default:
1403                         goto out;
1404                 case EHOSTUNREACH:
1405                         code = ICMP_HOST_UNREACH;
1406                         break;
1407                 case ENETUNREACH:
1408                         code = ICMP_NET_UNREACH;
1409                         break;
1410                 case EACCES:
1411                         code = ICMP_PKT_FILTERED;
1412                         break;
1413         }
1414
1415         now = jiffies;
1416         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1417         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1418                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1419         rt->u.dst.rate_last = now;
1420         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1421                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1422                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1423         }
1424
1425 out:    kfree_skb(skb);
1426         return 0;
1427 }
1428
/*
 *      MTU plateau values, in descending order.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below old_mtu,
 * or 68 when old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        /* The table is sorted high to low, so the first hit is the answer. */
        while (plateau != end) {
                if (*plateau < old_mtu)
                        return *plateau;
                plateau++;
        }
        return 68;
}
1446
/*
 * Lower the cached path MTU for the flow described by "iph" after a
 * "fragmentation needed" indication advertising new_mtu.
 *
 * Cached output routes for iph->daddr are searched under two source keys
 * (iph->saddr and 0).  On every match whose MTU metric is not locked, the
 * MTU metric is lowered when the chosen value is smaller than the current
 * one.
 *
 * Returns 0 when PMTU discovery is disabled (ipv4_config.no_pmtu_disc),
 * the MTU chosen for the last matching entry that accepted one, or
 * new_mtu when no entry did.
 */
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        __be32  skeys[2] = { iph->saddr, 0, };
        __be32  daddr = iph->daddr;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash(daddr, skeys[i], 0);

                rcu_read_lock();
                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                     rth = rcu_dereference(rth->u.dst.rt_next)) {
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                /* The advertised value is unusable (too
                                 * small, or not below the size of the
                                 * offending packet): guess from the
                                 * plateau table instead.
                                 */
                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                /* Never go below the
                                                 * configured minimum; when
                                                 * clamping, lock the metric.
                                                 */
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        /* GNU "?:" shorthand: est_mtu if non-zero, otherwise new_mtu. */
        return est_mtu ? : new_mtu;
}
1503
1504 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1505 {
1506         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1507             !(dst_metric_locked(dst, RTAX_MTU))) {
1508                 if (mtu < ip_rt_min_pmtu) {
1509                         mtu = ip_rt_min_pmtu;
1510                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1511                 }
1512                 dst->metrics[RTAX_MTU-1] = mtu;
1513                 dst_set_expires(dst, ip_rt_mtu_expires);
1514                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1515         }
1516 }
1517
/*
 * Validity check for a cached entry.  This implementation ignores both
 * arguments and always returns NULL.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}
1522
1523 static void ipv4_dst_destroy(struct dst_entry *dst)
1524 {
1525         struct rtable *rt = (struct rtable *) dst;
1526         struct inet_peer *peer = rt->peer;
1527         struct in_device *idev = rt->idev;
1528
1529         if (peer) {
1530                 rt->peer = NULL;
1531                 inet_putpeer(peer);
1532         }
1533
1534         if (idev) {
1535                 rt->idev = NULL;
1536                 in_dev_put(idev);
1537         }
1538 }
1539
1540 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1541                             int how)
1542 {
1543         struct rtable *rt = (struct rtable *) dst;
1544         struct in_device *idev = rt->idev;
1545         if (dev != &loopback_dev && idev && idev->dev == dev) {
1546                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1547                 if (loopback_idev) {
1548                         rt->idev = loopback_idev;
1549                         in_dev_put(idev);
1550                 }
1551         }
1552 }
1553
1554 static void ipv4_link_failure(struct sk_buff *skb)
1555 {
1556         struct rtable *rt;
1557
1558         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1559
1560         rt = (struct rtable *) skb->dst;
1561         if (rt)
1562                 dst_set_expires(&rt->u.dst, 0);
1563 }
1564
1565 static int ip_rt_bug(struct sk_buff *skb)
1566 {
1567         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1568                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1569                 skb->dev ? skb->dev->name : "?");
1570         kfree_skb(skb);
1571         return 0;
1572 }
1573
1574 /*
1575    We do not cache source address of outgoing interface,
1576    because it is used only by IP RR, TS and SRR options,
1577    so that it out of fast path.
1578
1579    BTW remember: "addr" is allowed to be not aligned
1580    in IP options!
1581  */
1582
1583 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1584 {
1585         __be32 src;
1586         struct fib_result res;
1587
1588         if (rt->fl.iif == 0)
1589                 src = rt->rt_src;
1590         else if (fib_lookup(&rt->fl, &res) == 0) {
1591                 src = FIB_RES_PREFSRC(res);
1592                 fib_res_put(&res);
1593         } else
1594                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1595                                         RT_SCOPE_UNIVERSE);
1596         memcpy(addr, &src, 4);
1597 }
1598
1599 #ifdef CONFIG_NET_CLS_ROUTE
1600 static void set_class_tag(struct rtable *rt, u32 tag)
1601 {
1602         if (!(rt->u.dst.tclassid & 0xFFFF))
1603                 rt->u.dst.tclassid |= tag & 0xFFFF;
1604         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1605                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1606 }
1607 #endif
1608
1609 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1610 {
1611         struct fib_info *fi = res->fi;
1612
1613         if (fi) {
1614                 if (FIB_RES_GW(*res) &&
1615                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1616                         rt->rt_gateway = FIB_RES_GW(*res);
1617                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1618                        sizeof(rt->u.dst.metrics));
1619                 if (fi->fib_mtu == 0) {
1620                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1621                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1622                             rt->rt_gateway != rt->rt_dst &&
1623                             rt->u.dst.dev->mtu > 576)
1624                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1625                 }
1626 #ifdef CONFIG_NET_CLS_ROUTE
1627                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1628 #endif
1629         } else
1630                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1631
1632         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1633                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1634         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1635                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1636         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1637                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1638                                        ip_rt_min_advmss);
1639         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1640                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1641
1642 #ifdef CONFIG_NET_CLS_ROUTE
1643 #ifdef CONFIG_IP_MULTIPLE_TABLES
1644         set_class_tag(rt, fib_rules_tclass(res));
1645 #endif
1646         set_class_tag(rt, itag);
1647 #endif
1648         rt->rt_type = res->type;
1649 }
1650
1651 /* called in rcu_read_lock() section */
1652 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1653                                 u8 tos, struct net_device *dev, int our)
1654 {
1655         unsigned int hash;
1656         struct rtable *rth;
1657         __be32 spec_dst;
1658         struct in_device *in_dev = __in_dev_get_rcu(dev);
1659         u32 itag = 0;
1660
1661         /* Primary sanity checks. */
1662
1663         if (in_dev == NULL)
1664                 return -EINVAL;
1665
1666         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1667             skb->protocol != htons(ETH_P_IP))
1668                 return -EINVAL;
1669
1670         if (ZERONET(saddr)) {
1671                 if (!LOCAL_MCAST(daddr))
1672                         return -EINVAL;
1673                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1674         } else if (fib_validate_source(saddr, 0, tos, 0,
1675                                         dev, &spec_dst, &itag) < 0)
1676                 return -EINVAL;
1677
1678         rth = dst_alloc(&ipv4_dst_ops);
1679         if (!rth)
1680                 return -ENOBUFS;
1681
1682         rth->u.dst.output= ip_rt_bug;
1683
1684         atomic_set(&rth->u.dst.__refcnt, 1);
1685         rth->u.dst.flags= DST_HOST;
1686         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1687                 rth->u.dst.flags |= DST_NOPOLICY;
1688         rth->fl.fl4_dst = daddr;
1689         rth->rt_dst     = daddr;
1690         rth->fl.fl4_tos = tos;
1691         rth->fl.mark    = skb->mark;
1692         rth->fl.fl4_src = saddr;
1693         rth->fl.fl4_lsrc = 0;
1694         rth->rt_src     = saddr;
1695 #ifdef CONFIG_NET_CLS_ROUTE
1696         rth->u.dst.tclassid = itag;
1697 #endif
1698         rth->rt_iif     =
1699         rth->fl.iif     = dev->ifindex;
1700         rth->u.dst.dev  = &loopback_dev;
1701         dev_hold(rth->u.dst.dev);
1702         rth->idev       = in_dev_get(rth->u.dst.dev);
1703         rth->fl.oif     = 0;
1704         rth->fl.fl4_gw  = 0;
1705         rth->rt_gateway = daddr;
1706         rth->rt_spec_dst= spec_dst;
1707         rth->rt_type    = RTN_MULTICAST;
1708         rth->rt_flags   = RTCF_MULTICAST;
1709         if (our) {
1710                 rth->u.dst.input= ip_local_deliver;
1711                 rth->rt_flags |= RTCF_LOCAL;
1712         }
1713
1714 #ifdef CONFIG_IP_MROUTE
1715         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1716                 rth->u.dst.input = ip_mr_input;
1717 #endif
1718         RT_CACHE_STAT_INC(in_slow_mc);
1719
1720         hash = rt_hash(daddr, saddr, dev->ifindex);
1721         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1722 }
1723
1724
1725 static void ip_handle_martian_source(struct net_device *dev,
1726                                      struct in_device *in_dev,
1727                                      struct sk_buff *skb,
1728                                      __be32 daddr,
1729                                      __be32 saddr)
1730 {
1731         RT_CACHE_STAT_INC(in_martian_src);
1732 #ifdef CONFIG_IP_ROUTE_VERBOSE
1733         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1734                 /*
1735                  *      RFC1812 recommendation, if source is martian,
1736                  *      the only hint is MAC header.
1737                  */
1738                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1739                         "%u.%u.%u.%u, on dev %s\n",
1740                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1741                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1742                         int i;
1743                         const unsigned char *p = skb_mac_header(skb);
1744                         printk(KERN_WARNING "ll header: ");
1745                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1746                                 printk("%02x", *p);
1747                                 if (i < (dev->hard_header_len - 1))
1748                                         printk(":");
1749                         }
1750                         printk("\n");
1751                 }
1752         }
1753 #endif
1754 }
1755
1756 /* called in rcu_read_lock() section */
1757 static int __mkroute_input(struct sk_buff *skb,
1758                            struct fib_result *res,
1759                            struct in_device *in_dev,
1760                            __be32 daddr, __be32 saddr, u32 tos, u32 lsrc,
1761                            struct rtable **result)
1762 {
1763
1764         struct rtable *rth;
1765         int err;
1766         struct in_device *out_dev;
1767         unsigned int flags = 0;
1768         __be32 spec_dst;
1769         u32 itag;
1770
1771         /* get a working reference to the output device */
1772         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1773         if (out_dev == NULL) {
1774                 if (net_ratelimit())
1775                         printk(KERN_CRIT "Bug in ip_route_input" \
1776                                "_slow(). Please, report\n");
1777                 return -EINVAL;
1778         }
1779
1780
1781         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1782                                   in_dev->dev, &spec_dst, &itag);
1783         if (err < 0) {
1784                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1785                                          saddr);
1786
1787                 err = -EINVAL;
1788                 goto cleanup;
1789         }
1790
1791         if (err)
1792                 flags |= RTCF_DIRECTSRC;
1793
1794         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1795             !lsrc &&
1796             (IN_DEV_SHARED_MEDIA(out_dev) ||
1797              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1798                 flags |= RTCF_DOREDIRECT;
1799
1800         if (skb->protocol != htons(ETH_P_IP)) {
1801                 /* Not IP (i.e. ARP). Do not create route, if it is
1802                  * invalid for proxy arp. DNAT routes are always valid.
1803                  */
1804                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1805                         err = -EINVAL;
1806                         goto cleanup;
1807                 }
1808         }
1809
1810
1811         rth = dst_alloc(&ipv4_dst_ops);
1812         if (!rth) {
1813                 err = -ENOBUFS;
1814                 goto cleanup;
1815         }
1816
1817         atomic_set(&rth->u.dst.__refcnt, 1);
1818         rth->u.dst.flags= DST_HOST;
1819 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1820         if (res->fi->fib_nhs > 1)
1821                 rth->u.dst.flags |= DST_BALANCED;
1822 #endif
1823         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1824                 rth->u.dst.flags |= DST_NOPOLICY;
1825         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1826                 rth->u.dst.flags |= DST_NOXFRM;
1827         rth->fl.fl4_dst = daddr;
1828         rth->rt_dst     = daddr;
1829         rth->fl.fl4_tos = tos;
1830         rth->fl.mark    = skb->mark;
1831         rth->fl.fl4_src = saddr;
1832         rth->rt_src     = saddr;
1833         rth->fl.fl4_lsrc        = lsrc;
1834         rth->rt_gateway = daddr;
1835         rth->rt_iif     =
1836                 rth->fl.iif     = in_dev->dev->ifindex;
1837         rth->u.dst.dev  = (out_dev)->dev;
1838         dev_hold(rth->u.dst.dev);
1839         rth->idev       = in_dev_get(rth->u.dst.dev);
1840         rth->fl.oif     = 0;
1841         rth->fl.fl4_gw  = 0;
1842         rth->rt_spec_dst= spec_dst;
1843
1844         rth->u.dst.input = ip_forward;
1845         rth->u.dst.output = ip_output;
1846
1847         rt_set_nexthop(rth, res, itag);
1848
1849         rth->rt_flags = flags;
1850
1851         *result = rth;
1852         err = 0;
1853  cleanup:
1854         return err;
1855 }
1856
1857 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1858                                        struct fib_result* res,
1859                                        const struct flowi *fl,
1860                                        struct in_device *in_dev,
1861                                        __be32 daddr, __be32 saddr, u32 tos,
1862                                        u32 lsrc)
1863 {
1864         struct rtable* rth = NULL;
1865         int err;
1866         unsigned hash;
1867
1868         fib_select_default(fl, res);
1869 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1870         if (res->fi && res->fi->fib_nhs > 1)
1871                 fib_select_multipath(fl, res);
1872 #endif
1873
1874         /* create a routing cache entry */
1875         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, lsrc, &rth);
1876         if (err)
1877                 return err;
1878
1879         /* put it into the cache */
1880         hash = rt_hash(daddr, saddr, fl->iif);
1881         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1882 }
1883
1884 static int ip_mkroute_input(struct sk_buff *skb,
1885                             struct fib_result *res,
1886                             const struct flowi *fl,
1887                             struct in_device *in_dev,
1888                             __be32 daddr, __be32 saddr, u32 tos,
1889                             u32 lsrc)
1890 {
1891 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1892         struct rtable* rth = NULL, *rtres;
1893         unsigned char hop, hopcount;
1894         int err = -EINVAL;
1895         unsigned int hash;
1896
1897         if (res->fi)
1898                 hopcount = res->fi->fib_nhs;
1899         else
1900                 hopcount = 1;
1901
1902         /* distinguish between multipath and singlepath */
1903         if (hopcount < 2)
1904                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1905                                             saddr, tos, lsrc);
1906
1907         /* add all alternatives to the routing cache */
1908         for (hop = 0; hop < hopcount; hop++) {
1909                 res->nh_sel = hop;
1910
1911                 /* put reference to previous result */
1912                 if (hop)
1913                         ip_rt_put(rtres);
1914
1915                 /* create a routing cache entry */
1916                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, lsrc,
1917                                       &rth);
1918                 if (err)
1919                         return err;
1920
1921                 /* put it into the cache */
1922                 hash = rt_hash(daddr, saddr, fl->iif);
1923                 err = rt_intern_hash(hash, rth, &rtres);
1924                 if (err)
1925                         return err;
1926
1927                 /* forward hop information to multipath impl. */
1928                 multipath_set_nhinfo(rth,
1929                                      FIB_RES_NETWORK(*res),
1930                                      FIB_RES_NETMASK(*res),
1931                                      res->prefixlen,
1932                                      &FIB_RES_NH(*res));
1933         }
1934         skb->dst = &rtres->u.dst;
1935         return err;
1936 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1937         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos, lsrc);
1938 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1939 }
1940
1941
1942 /*
1943  *      NOTE. We drop all the packets that has local source
1944  *      addresses, because every properly looped back packet
1945  *      must have correct destination already attached by output routine.
1946  *
1947  *      Such approach solves two big problems:
1948  *      1. Not simplex devices are handled properly.
1949  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1950  */
1951
1952 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1953                                u8 tos, struct net_device *dev, u32 lsrc)
1954 {
1955         struct fib_result res;
1956         struct in_device *in_dev = __in_dev_get_rcu(dev);
1957         struct flowi fl = { .nl_u = { .ip4_u =
1958                                       { .daddr = daddr,
1959                                         .saddr = lsrc ? : saddr,
1960                                         .tos = tos,
1961                                         .scope = RT_SCOPE_UNIVERSE,
1962                                       } },
1963                             .mark = skb->mark,
1964                             .iif = lsrc? loopback_dev.ifindex : dev->ifindex };
1965         unsigned        flags = 0;
1966         u32             itag = 0;
1967         struct rtable * rth;
1968         unsigned        hash;
1969         __be32          spec_dst;
1970         int             err = -EINVAL;
1971         int             free_res = 0;
1972
1973         /* IP on this device is disabled. */
1974
1975         if (!in_dev)
1976                 goto out;
1977
1978         /* Check for the most weird martians, which can be not detected
1979            by fib_lookup.
1980          */
1981
1982         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1983                 goto martian_source;
1984
1985         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1986                 goto brd_input;
1987
1988         /* Accept zero addresses only to limited broadcast;
1989          * I even do not know to fix it or not. Waiting for complains :-)
1990          */
1991         if (ZERONET(saddr))
1992                 goto martian_source;
1993
1994         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1995                 goto martian_destination;
1996
1997         if (lsrc) {
1998                 if (MULTICAST(lsrc) || BADCLASS(lsrc) ||
1999                     ZERONET(lsrc) || LOOPBACK(lsrc))
2000                         goto e_inval;
2001         }
2002
2003         /*
2004          *      Now we are ready to route packet.
2005          */
2006         if ((err = fib_lookup(&fl, &res)) != 0) {
2007                 if (!IN_DEV_FORWARD(in_dev))
2008                         goto e_hostunreach;
2009                 goto no_route;
2010         }
2011         free_res = 1;
2012         if (lsrc && res.type != RTN_UNICAST && res.type != RTN_NAT)
2013                 goto e_inval;
2014         fl.iif = dev->ifindex;
2015         fl.fl4_src = saddr;
2016
2017         RT_CACHE_STAT_INC(in_slow_tot);
2018
2019         if (res.type == RTN_BROADCAST)
2020                 goto brd_input;
2021
2022         if (res.type == RTN_LOCAL) {
2023                 int result;
2024                 result = fib_validate_source(saddr, daddr, tos,
2025                                              loopback_dev.ifindex,
2026                                              dev, &spec_dst, &itag);
2027                 if (result < 0)
2028                         goto martian_source;
2029                 if (result)
2030                         flags |= RTCF_DIRECTSRC;
2031                 spec_dst = daddr;
2032                 goto local_input;
2033         }
2034
2035         if (!IN_DEV_FORWARD(in_dev))
2036                 goto e_hostunreach;
2037         if (res.type != RTN_UNICAST)
2038                 goto martian_destination;
2039
2040         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos, lsrc);
2041         if (err == -ENOBUFS)
2042                 goto e_nobufs;
2043         if (err == -EINVAL)
2044                 goto e_inval;
2045
2046 done:
2047         if (free_res)
2048                 fib_res_put(&res);
2049 out:    return err;
2050
2051 brd_input:
2052         if (skb->protocol != htons(ETH_P_IP))
2053                 goto e_inval;
2054         if (lsrc)
2055                 goto e_inval;
2056
2057         if (ZERONET(saddr))
2058                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2059         else {
2060                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2061                                           &itag);
2062                 if (err < 0)
2063                         goto martian_source;
2064                 if (err)
2065                         flags |= RTCF_DIRECTSRC;
2066         }
2067         flags |= RTCF_BROADCAST;
2068         res.type = RTN_BROADCAST;
2069         RT_CACHE_STAT_INC(in_brd);
2070
2071 local_input:
2072         rth = dst_alloc(&ipv4_dst_ops);
2073         if (!rth)
2074                 goto e_nobufs;
2075
2076         rth->u.dst.output= ip_rt_bug;
2077
2078         atomic_set(&rth->u.dst.__refcnt, 1);
2079         rth->u.dst.flags= DST_HOST;
2080         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2081                 rth->u.dst.flags |= DST_NOPOLICY;
2082         rth->fl.fl4_dst = daddr;
2083         rth->rt_dst     = daddr;
2084         rth->fl.fl4_tos = tos;
2085         rth->fl.mark    = skb->mark;
2086         rth->fl.fl4_src = saddr;
2087         rth->rt_src     = saddr;
2088 #ifdef CONFIG_NET_CLS_ROUTE
2089         rth->u.dst.tclassid = itag;
2090 #endif
2091         rth->rt_iif     =
2092         rth->fl.iif     = dev->ifindex;
2093         rth->u.dst.dev  = &loopback_dev;
2094         dev_hold(rth->u.dst.dev);
2095         rth->idev       = in_dev_get(rth->u.dst.dev);
2096         rth->fl.fl4_gw  = 0;
2097         rth->rt_gateway = daddr;
2098         rth->rt_spec_dst= spec_dst;
2099         rth->u.dst.input= ip_local_deliver;
2100         rth->rt_flags   = flags|RTCF_LOCAL;
2101         if (res.type == RTN_UNREACHABLE) {
2102                 rth->u.dst.input= ip_error;
2103                 rth->u.dst.error= -err;
2104                 rth->rt_flags   &= ~RTCF_LOCAL;
2105         }
2106         rth->rt_type    = res.type;
2107         hash = rt_hash(daddr, saddr, fl.iif);
2108         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2109         goto done;
2110
2111 no_route:
2112         RT_CACHE_STAT_INC(in_no_route);
2113         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2114         res.type = RTN_UNREACHABLE;
2115         goto local_input;
2116
2117         /*
2118          *      Do not cache martian addresses: they should be logged (RFC1812)
2119          */
2120 martian_destination:
2121         RT_CACHE_STAT_INC(in_martian_dst);
2122 #ifdef CONFIG_IP_ROUTE_VERBOSE
2123         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2124                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2125                         "%u.%u.%u.%u, dev %s\n",
2126                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2127 #endif
2128
2129 e_hostunreach:
2130         err = -EHOSTUNREACH;
2131         goto done;
2132
2133 e_inval:
2134         err = -EINVAL;
2135         goto done;
2136
2137 e_nobufs:
2138         err = -ENOBUFS;
2139         goto done;
2140
2141 martian_source:
2142         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2143         goto e_inval;
2144 }
2145
2146 static inline int
2147 ip_route_input_cached(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2148                    u8 tos, struct net_device *dev, u32 lsrc)
2149 {
2150         struct rtable * rth;
2151         unsigned        hash;
2152         int iif = dev->ifindex;
2153         int res;
2154
2155         rcu_read_lock();
2156
2157         tos &= IPTOS_RT_MASK;
2158         hash = rt_hash(daddr, saddr, iif);
2159
2160         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2161              rth = rcu_dereference(rth->u.dst.rt_next)) {
2162                 if (((rth->fl.fl4_dst ^ daddr) |
2163                      (rth->fl.fl4_src ^ saddr) |
2164                      (rth->fl.iif ^ iif) |
2165                      (rth->fl.fl4_lsrc ^ lsrc) |
2166                      rth->fl.oif |
2167                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2168                     rth->fl.mark == skb->mark) {
2169                         rth->u.dst.lastuse = jiffies;
2170                         dst_hold(&rth->u.dst);
2171                         rth->u.dst.__use++;
2172                         RT_CACHE_STAT_INC(in_hit);
2173                         rcu_read_unlock();
2174                         skb->dst = (struct dst_entry*)rth;
2175                         return 0;
2176                 }
2177                 RT_CACHE_STAT_INC(in_hlist_search);
2178         }
2179
2180         /* Multicast recognition logic is moved from route cache to here.
2181            The problem was that too many Ethernet cards have broken/missing
2182            hardware multicast filters :-( As result the host on multicasting
2183            network acquires a lot of useless route cache entries, sort of
2184            SDR messages from all the world. Now we try to get rid of them.
2185            Really, provided software IP multicast filter is organized
2186            reasonably (at least, hashed), it does not result in a slowdown
2187            comparing with route cache reject entries.
2188            Note, that multicast routers are not affected, because
2189            route cache entry is created eventually.
2190          */
2191         if (MULTICAST(daddr)) {
2192                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2193
2194                 if (in_dev != NULL) {
2195                         int our = ip_check_mc(in_dev, daddr, saddr,
2196                                 ip_hdr(skb)->protocol);
2197                         if (our
2198 #ifdef CONFIG_IP_MROUTE
2199                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2200 #endif
2201                             ) {
2202                                 res = ip_route_input_mc(skb, daddr, saddr,
2203                                                          tos, dev, our);
2204                                 rcu_read_unlock();
2205                                 return res;
2206                         }
2207                 }
2208                 rcu_read_unlock();
2209                 return -EINVAL;
2210         }
2211         res = ip_route_input_slow(skb, daddr, saddr, tos, dev, lsrc);
2212         rcu_read_unlock();
2213         return res;
2214 }
2215
2216 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2217                    u8 tos, struct net_device *dev)
2218 {
2219         return ip_route_input_cached(skb, daddr, saddr, tos, dev, 0);
2220 }
2221
2222 int ip_route_input_lookup(struct sk_buff *skb, u32 daddr, u32 saddr,
2223                           u8 tos, struct net_device *dev, u32 lsrc)
2224 {
2225         return ip_route_input_cached(skb, daddr, saddr, tos, dev, lsrc);
2226 }
2227
2228 static int __mkroute_output(struct rtable **result,
2229                             struct fib_result *res,
2230                             const struct flowi *fl,
2231                             const struct flowi *oldflp,
2232                             struct net_device *dev_out,
2233                             unsigned flags)
2234 {
2235         struct rtable *rth;
2236         struct in_device *in_dev;
2237         u32 tos = RT_FL_TOS(oldflp);
2238         int err = 0;
2239
2240         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2241                 return -EINVAL;
2242
2243         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2244                 res->type = RTN_BROADCAST;
2245         else if (MULTICAST(fl->fl4_dst))
2246                 res->type = RTN_MULTICAST;
2247         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2248                 return -EINVAL;
2249
2250         if (dev_out->flags & IFF_LOOPBACK)
2251                 flags |= RTCF_LOCAL;
2252
2253         /* get work reference to inet device */
2254         in_dev = in_dev_get(dev_out);
2255         if (!in_dev)
2256                 return -EINVAL;
2257
2258         if (res->type == RTN_BROADCAST) {
2259                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2260                 if (res->fi) {
2261                         fib_info_put(res->fi);
2262                         res->fi = NULL;
2263                 }
2264         } else if (res->type == RTN_MULTICAST) {
2265                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2266                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2267                                  oldflp->proto))
2268                         flags &= ~RTCF_LOCAL;
2269                 /* If multicast route do not exist use
2270                    default one, but do not gateway in this case.
2271                    Yes, it is hack.
2272                  */
2273                 if (res->fi && res->prefixlen < 4) {
2274                         fib_info_put(res->fi);
2275                         res->fi = NULL;
2276                 }
2277         }
2278
2279
2280         rth = dst_alloc(&ipv4_dst_ops);
2281         if (!rth) {
2282                 err = -ENOBUFS;
2283                 goto cleanup;
2284         }
2285
2286         atomic_set(&rth->u.dst.__refcnt, 1);
2287         rth->u.dst.flags= DST_HOST;
2288 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2289         if (res->fi) {
2290                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2291                 if (res->fi->fib_nhs > 1)
2292                         rth->u.dst.flags |= DST_BALANCED;
2293         }
2294 #endif
2295         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2296                 rth->u.dst.flags |= DST_NOXFRM;
2297         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2298                 rth->u.dst.flags |= DST_NOPOLICY;
2299
2300         rth->fl.fl4_dst = oldflp->fl4_dst;
2301         rth->fl.fl4_tos = tos;
2302         rth->fl.fl4_src = oldflp->fl4_src;
2303         rth->fl.oif     = oldflp->oif;
2304         rth->fl.fl4_gw  = oldflp->fl4_gw;
2305         rth->fl.mark    = oldflp->mark;
2306         rth->rt_dst     = fl->fl4_dst;
2307         rth->rt_src     = fl->fl4_src;
2308         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2309         /* get references to the devices that are to be hold by the routing
2310            cache entry */
2311         rth->u.dst.dev  = dev_out;
2312         dev_hold(dev_out);
2313         rth->idev       = in_dev_get(dev_out);
2314         rth->rt_gateway = fl->fl4_dst;
2315         rth->rt_spec_dst= fl->fl4_src;
2316
2317         rth->u.dst.output=ip_output;
2318
2319         RT_CACHE_STAT_INC(out_slow_tot);
2320
2321         if (flags & RTCF_LOCAL) {
2322                 rth->u.dst.input = ip_local_deliver;
2323                 rth->rt_spec_dst = fl->fl4_dst;
2324         }
2325         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2326                 rth->rt_spec_dst = fl->fl4_src;
2327                 if (flags & RTCF_LOCAL &&
2328                     !(dev_out->flags & IFF_LOOPBACK)) {
2329                         rth->u.dst.output = ip_mc_output;
2330                         RT_CACHE_STAT_INC(out_slow_mc);
2331                 }
2332 #ifdef CONFIG_IP_MROUTE
2333                 if (res->type == RTN_MULTICAST) {
2334                         if (IN_DEV_MFORWARD(in_dev) &&
2335                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2336                                 rth->u.dst.input = ip_mr_input;
2337                                 rth->u.dst.output = ip_mc_output;
2338                         }
2339                 }
2340 #endif
2341         }
2342
2343         rt_set_nexthop(rth, res, 0);
2344
2345         rth->rt_flags = flags;
2346
2347         *result = rth;
2348  cleanup:
2349         /* release work reference to inet device */
2350         in_dev_put(in_dev);
2351
2352         return err;
2353 }
2354
2355 static int ip_mkroute_output_def(struct rtable **rp,
2356                                 struct fib_result* res,
2357                                 const struct flowi *fl,
2358                                 const struct flowi *oldflp,
2359                                 struct net_device *dev_out,
2360                                 unsigned flags)
2361 {
2362         struct rtable *rth = NULL;
2363         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2364         unsigned hash;
2365         if (err == 0) {
2366                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2367                 err = rt_intern_hash(hash, rth, rp);
2368         }
2369
2370         return err;
2371 }
2372
/*
 * Create the output route cache entry (or entries) for a FIB result.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the result has at
 * most one nexthop, this is just ip_mkroute_output_def().  Otherwise one
 * cache entry is created per nexthop, each successfully inserted entry
 * is reported to the multipath implementation, and *rp ends up
 * referring to the entry returned for the last nexthop.
 *
 * Returns 0 on success or the first error from __mkroute_output() /
 * rt_intern_hash().
 */
static int ip_mkroute_output(struct rtable** rp,
			     struct fib_result* res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
					oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not report the entry to the multipath code when
			 * inserting it failed; the input-side counterpart
			 * (ip_mkroute_input) bails out at this point too.
			 * NOTE(review): rt_intern_hash() is not visible here;
			 * presumably it disposes of rth on failure - confirm.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2434
2435 /*
2436  * Major route resolver routine.
2437  */
2438
2439 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2440 {
2441         u32 tos = RT_FL_TOS(oldflp);
2442         struct flowi fl = { .nl_u = { .ip4_u =
2443                                       { .daddr = oldflp->fl4_dst,
2444                                         .saddr = oldflp->fl4_src,
2445                                         .gw = oldflp->fl4_gw,
2446                                         .tos = tos & IPTOS_RT_MASK,
2447                                         .scope = ((tos & RTO_ONLINK) ?
2448                                                   RT_SCOPE_LINK :
2449                                                   RT_SCOPE_UNIVERSE),
2450                                       } },
2451                             .mark = oldflp->mark,
2452                             .iif = loopback_dev.ifindex,
2453                             .oif = oldflp->oif };
2454         struct fib_result res;
2455         unsigned flags = 0;
2456         struct net_device *dev_out = NULL;
2457         int free_res = 0;
2458         int err;
2459
2460
2461         res.fi          = NULL;
2462 #ifdef CONFIG_IP_MULTIPLE_TABLES
2463         res.r           = NULL;
2464 #endif
2465
2466         if (oldflp->fl4_src) {
2467                 err = -EINVAL;
2468                 if (MULTICAST(oldflp->fl4_src) ||
2469                     BADCLASS(oldflp->fl4_src) ||
2470                     ZERONET(oldflp->fl4_src))
2471                         goto out;
2472
2473                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2474                 dev_out = ip_dev_find(oldflp->fl4_src);
2475                 if (dev_out == NULL)
2476                         goto out;
2477
2478                 /* I removed check for oif == dev_out->oif here.
2479                    It was wrong for two reasons:
2480                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2481                       assigned to multiple interfaces.
2482                    2. Moreover, we are allowed to send packets with saddr
2483                       of another iface. --ANK
2484                  */
2485
2486                 if (oldflp->oif == 0
2487                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2488                         /* Special hack: user can direct multicasts
2489                            and limited broadcast via necessary interface
2490                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2491                            This hack is not just for fun, it allows
2492                            vic,vat and friends to work.
2493                            They bind socket to loopback, set ttl to zero
2494                            and expect that it will work.
2495                            From the viewpoint of routing cache they are broken,
2496                            because we are not allowed to build multicast path
2497                            with loopback source addr (look, routing cache
2498                            cannot know, that ttl is zero, so that packet
2499                            will not leave this host and route is valid).
2500                            Luckily, this hack is good workaround.
2501                          */
2502
2503                         fl.oif = dev_out->ifindex;
2504                         goto make_route;
2505                 }
2506                 if (dev_out)
2507                         dev_put(dev_out);
2508                 dev_out = NULL;
2509         }
2510
2511
2512         if (oldflp->oif) {
2513                 dev_out = dev_get_by_index(oldflp->oif);
2514                 err = -ENODEV;
2515                 if (dev_out == NULL)
2516                         goto out;
2517
2518                 /* RACE: Check return value of inet_select_addr instead. */
2519                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rtnl(dev_out)) {
2520                         dev_put(dev_out);
2521                         err = -ENETUNREACH;
2522                         goto out;
2523                 }
2524
2525                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2526                         if (!fl.fl4_src)
2527                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2528                                                               RT_SCOPE_LINK);
2529                         goto make_route;
2530                 }
2531                 if (!fl.fl4_src) {
2532                         if (MULTICAST(oldflp->fl4_dst))
2533                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2534                                                               fl.fl4_scope);
2535                         else if (!oldflp->fl4_dst)
2536                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2537                                                               RT_SCOPE_HOST);
2538                 }
2539         }
2540
2541         if (!fl.fl4_dst) {
2542                 fl.fl4_dst = fl.fl4_src;
2543                 if (!fl.fl4_dst)
2544                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2545                 if (dev_out)
2546                         dev_put(dev_out);
2547                 dev_out = &loopback_dev;
2548                 dev_hold(dev_out);
2549                 fl.oif = loopback_dev.ifindex;
2550                 fl.fl4_gw = 0;
2551                 res.type = RTN_LOCAL;
2552                 flags |= RTCF_LOCAL;
2553                 goto make_route;
2554         }
2555
2556         if (fib_lookup(&fl, &res)) {
2557                 res.fi = NULL;
2558                 if (oldflp->oif && dev_out->flags & IFF_UP) {
2559                         /* Apparently, routing tables are wrong. Assume,
2560                            that the destination is on link.
2561
2562                            WHY? DW.
2563                            Because we are allowed to send to iface
2564                            even if it has NO routes and NO assigned
2565                            addresses. When oif is specified, routing
2566                            tables are looked up with only one purpose:
2567                            to catch if destination is gatewayed, rather than
2568                            direct. Moreover, if MSG_DONTROUTE is set,
2569                            we send packet, ignoring both routing tables
2570                            and ifaddr state. --ANK
2571
2572
2573                            We could make it even if oif is unknown,
2574                            likely IPv6, but we do not.
2575                          */
2576
2577                         if (fl.fl4_src == 0)
2578                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2579                                                               RT_SCOPE_LINK);
2580                         res.type = RTN_UNICAST;
2581                         goto make_route;
2582                 }
2583                 if (dev_out)
2584                         dev_put(dev_out);
2585                 err = -ENETUNREACH;
2586                 goto out;
2587         }
2588         free_res = 1;
2589
2590         if (res.type == RTN_LOCAL) {
2591                 if (!fl.fl4_src) {
2592                         if (res.fi->fib_prefsrc)
2593                                 fl.fl4_src = res.fi->fib_prefsrc;
2594                         else
2595                                 fl.fl4_src = fl.fl4_dst;
2596                 }
2597                 if (dev_out)
2598                         dev_put(dev_out);
2599                 dev_out = &loopback_dev;
2600                 dev_hold(dev_out);
2601                 fl.oif = dev_out->ifindex;
2602                 fl.fl4_gw = 0;
2603                 if (res.fi)
2604                         fib_info_put(res.fi);
2605                 res.fi = NULL;
2606                 flags |= RTCF_LOCAL;
2607                 goto make_route;
2608         }
2609
2610         if (res.type == RTN_UNICAST)
2611                 fib_select_default(&fl, &res);
2612 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2613         if (res.fi->fib_nhs > 1)
2614                 fib_select_multipath(&fl, &res);
2615 #endif
2616
2617         if (!fl.fl4_src)
2618                 fl.fl4_src = FIB_RES_PREFSRC(res);
2619
2620         if (dev_out)
2621                 dev_put(dev_out);
2622         dev_out = FIB_RES_DEV(res);
2623         dev_hold(dev_out);
2624         fl.oif = dev_out->ifindex;
2625
2626
2627 make_route:
2628         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2629
2630
2631         if (free_res)
2632                 fib_res_put(&res);
2633         if (dev_out)
2634                 dev_put(dev_out);
2635 out:    return err;
2636 }
2637
2638 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2639 {
2640         unsigned hash;
2641         struct rtable *rth;
2642
2643         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2644
2645         rcu_read_lock_bh();
2646         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2647                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2648                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2649                     rth->fl.fl4_src == flp->fl4_src &&
2650                     rth->fl.iif == 0 &&
2651                     rth->fl.oif == flp->oif &&
2652                     rth->fl.fl4_gw == flp->fl4_gw &&
2653                     rth->fl.mark == flp->mark &&
2654                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2655                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2656
2657                         /* check for multipath routes and choose one if
2658                          * necessary
2659                          */
2660                         if (multipath_select_route(flp, rth, rp)) {
2661                                 dst_hold(&(*rp)->u.dst);
2662                                 RT_CACHE_STAT_INC(out_hit);
2663                                 rcu_read_unlock_bh();
2664                                 return 0;
2665                         }
2666
2667                         rth->u.dst.lastuse = jiffies;
2668                         dst_hold(&rth->u.dst);
2669                         rth->u.dst.__use++;
2670                         RT_CACHE_STAT_INC(out_hit);
2671                         rcu_read_unlock_bh();
2672                         *rp = rth;
2673                         return 0;
2674                 }
2675                 RT_CACHE_STAT_INC(out_hlist_search);
2676         }
2677         rcu_read_unlock_bh();
2678
2679         return ip_route_output_slow(rp, flp);
2680 }
2681
2682 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2683
2684 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2685 {
2686 }
2687
2688 static struct dst_ops ipv4_dst_blackhole_ops = {
2689         .family                 =       AF_INET,
2690         .protocol               =       __constant_htons(ETH_P_IP),
2691         .destroy                =       ipv4_dst_destroy,
2692         .check                  =       ipv4_dst_check,
2693         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2694         .entry_size             =       sizeof(struct rtable),
2695 };
2696
2697
/*
 * Packet handler installed as both input and output of a blackhole route:
 * the packet is freed and 0 is returned.
 */
static int ipv4_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
2703
2704 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2705 {
2706         struct rtable *ort = *rp;
2707         struct rtable *rt = (struct rtable *)
2708                 dst_alloc(&ipv4_dst_blackhole_ops);
2709
2710         if (rt) {
2711                 struct dst_entry *new = &rt->u.dst;
2712
2713                 atomic_set(&new->__refcnt, 1);
2714                 new->__use = 1;
2715                 new->input = ipv4_blackhole_output;
2716                 new->output = ipv4_blackhole_output;
2717                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2718
2719                 new->dev = ort->u.dst.dev;
2720                 if (new->dev)
2721                         dev_hold(new->dev);
2722
2723                 rt->fl = ort->fl;
2724
2725                 rt->idev = ort->idev;
2726                 if (rt->idev)
2727                         in_dev_hold(rt->idev);
2728                 rt->rt_flags = ort->rt_flags;
2729                 rt->rt_type = ort->rt_type;
2730                 rt->rt_dst = ort->rt_dst;
2731                 rt->rt_src = ort->rt_src;
2732                 rt->rt_iif = ort->rt_iif;
2733                 rt->rt_gateway = ort->rt_gateway;
2734                 rt->rt_spec_dst = ort->rt_spec_dst;
2735                 rt->peer = ort->peer;
2736                 if (rt->peer)
2737                         atomic_inc(&rt->peer->refcnt);
2738
2739                 dst_free(new);
2740         }
2741
2742         dst_release(&(*rp)->u.dst);
2743         *rp = rt;
2744         return (rt ? 0 : -ENOMEM);
2745 }
2746
2747 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2748 {
2749         int err;
2750
2751         if ((err = __ip_route_output_key(rp, flp)) != 0)
2752                 return err;
2753
2754         if (flp->proto) {
2755                 if (!flp->fl4_src)
2756                         flp->fl4_src = (*rp)->rt_src;
2757                 if (!flp->fl4_dst)
2758                         flp->fl4_dst = (*rp)->rt_dst;
2759                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2760                 if (err == -EREMOTE)
2761                         err = ipv4_dst_blackhole(rp, flp, sk);
2762
2763                 return err;
2764         }
2765
2766         return 0;
2767 }
2768
2769 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2770
2771 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2772 {
2773         return ip_route_output_flow(rp, flp, NULL, 0);
2774 }
2775
2776 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2777                         int nowait, unsigned int flags)
2778 {
2779         struct rtable *rt = (struct rtable*)skb->dst;
2780         struct rtmsg *r;
2781         struct nlmsghdr *nlh;
2782         long expires;
2783         u32 id = 0, ts = 0, tsage = 0, error;
2784
2785         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2786         if (nlh == NULL)
2787                 return -EMSGSIZE;
2788
2789         r = nlmsg_data(nlh);
2790         r->rtm_family    = AF_INET;
2791         r->rtm_dst_len  = 32;
2792         r->rtm_src_len  = 0;
2793         r->rtm_tos      = rt->fl.fl4_tos;
2794         r->rtm_table    = RT_TABLE_MAIN;
2795         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2796         r->rtm_type     = rt->rt_type;
2797         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2798         r->rtm_protocol = RTPROT_UNSPEC;
2799         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2800         if (rt->rt_flags & RTCF_NOTIFY)
2801                 r->rtm_flags |= RTM_F_NOTIFY;
2802
2803         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2804
2805         if (rt->fl.fl4_src) {
2806                 r->rtm_src_len = 32;
2807                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2808         }
2809         if (rt->u.dst.dev)
2810                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2811 #ifdef CONFIG_NET_CLS_ROUTE
2812         if (rt->u.dst.tclassid)
2813                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2814 #endif
2815 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2816         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2817                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2818 #endif
2819         if (rt->fl.iif)
2820                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2821         else if (rt->rt_src != rt->fl.fl4_src)
2822                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2823
2824         if (rt->rt_dst != rt->rt_gateway)
2825                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2826
2827         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2828                 goto nla_put_failure;
2829
2830         error = rt->u.dst.error;
2831         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2832         if (rt->peer) {
2833                 id = rt->peer->ip_id_count;
2834                 if (rt->peer->tcp_ts_stamp) {
2835                         ts = rt->peer->tcp_ts;
2836                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2837                 }
2838         }
2839
2840         if (rt->fl.iif) {
2841 #ifdef CONFIG_IP_MROUTE
2842                 __be32 dst = rt->rt_dst;
2843
2844                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2845                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2846                         int err = ipmr_get_route(skb, r, nowait);
2847                         if (err <= 0) {
2848                                 if (!nowait) {
2849                                         if (err == 0)
2850                                                 return 0;
2851                                         goto nla_put_failure;
2852                                 } else {
2853                                         if (err == -EMSGSIZE)
2854                                                 goto nla_put_failure;
2855                                         error = err;
2856                                 }
2857                         }
2858                 } else
2859 #endif
2860                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2861         }
2862
2863         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2864                                expires, error) < 0)
2865                 goto nla_put_failure;
2866
2867         return nlmsg_end(skb, nlh);
2868
2869 nla_put_failure:
2870         nlmsg_cancel(skb, nlh);
2871         return -EMSGSIZE;
2872 }
2873
2874 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2875 {
2876         struct rtmsg *rtm;
2877         struct nlattr *tb[RTA_MAX+1];
2878         struct rtable *rt = NULL;
2879         __be32 dst = 0;
2880         __be32 src = 0;
2881         u32 iif;
2882         int err;
2883         struct sk_buff *skb;
2884
2885         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2886         if (err < 0)
2887                 goto errout;
2888
2889         rtm = nlmsg_data(nlh);
2890
2891         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2892         if (skb == NULL) {
2893                 err = -ENOBUFS;
2894                 goto errout;
2895         }
2896
2897         /* Reserve room for dummy headers, this skb can pass
2898            through good chunk of routing engine.
2899          */
2900         skb_reset_mac_header(skb);
2901         skb_reset_network_header(skb);
2902
2903         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2904         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2905         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2906
2907         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2908         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2909         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2910
2911         if (iif) {
2912                 struct net_device *dev;
2913
2914                 dev = __dev_get_by_index(iif);
2915                 if (dev == NULL) {
2916                         err = -ENODEV;
2917                         goto errout_free;
2918                 }
2919
2920                 skb->protocol   = htons(ETH_P_IP);
2921                 skb->dev        = dev;
2922                 local_bh_disable();
2923                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2924                 local_bh_enable();
2925
2926                 rt = (struct rtable*) skb->dst;
2927                 if (err == 0 && rt->u.dst.error)
2928                         err = -rt->u.dst.error;
2929         } else {
2930                 struct flowi fl = {
2931                         .nl_u = {
2932                                 .ip4_u = {
2933                                         .daddr = dst,
2934                                         .saddr = src,
2935                                         .tos = rtm->rtm_tos,
2936                                 },
2937                         },
2938                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2939                 };
2940                 err = ip_route_output_key(&rt, &fl);
2941         }
2942
2943         if (err)
2944                 goto errout_free;
2945
2946         skb->dst = &rt->u.dst;
2947         if (rtm->rtm_flags & RTM_F_NOTIFY)
2948                 rt->rt_flags |= RTCF_NOTIFY;
2949
2950         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2951                                 RTM_NEWROUTE, 0, 0);
2952         if (err <= 0)
2953                 goto errout_free;
2954
2955         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2956 errout:
2957         return err;
2958
2959 errout_free:
2960         kfree_skb(skb);
2961         goto errout;
2962 }
2963
2964 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2965 {
2966         struct rtable *rt;
2967         int h, s_h;
2968         int idx, s_idx;
2969
2970         s_h = cb->args[0];
2971         if (s_h < 0)
2972                 s_h = 0;
2973         s_idx = idx = cb->args[1];
2974         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2975                 if (!rt_hash_table[h].chain)
2976                         continue;
2977                 rcu_read_lock_bh();
2978                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2979                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2980                         if (idx < s_idx)
2981                                 continue;
2982                         skb->dst = dst_clone(&rt->u.dst);
2983                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2984                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2985                                          1, NLM_F_MULTI) <= 0) {
2986                                 dst_release(xchg(&skb->dst, NULL));
2987                                 rcu_read_unlock_bh();
2988                                 goto done;
2989                         }
2990                         dst_release(xchg(&skb->dst, NULL));
2991                 }
2992                 rcu_read_unlock_bh();
2993         }
2994
2995 done:
2996         cb->args[0] = h;
2997         cb->args[1] = idx;
2998         return skb->len;
2999 }
3000
/*
 * Multicast event on @in_dev: flush the routing cache with a delay
 * argument of 0.  @in_dev itself is not examined.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
3005
3006 #ifdef CONFIG_SYSCTL
3007 static int flush_delay;
3008
3009 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
3010                                         struct file *filp, void __user *buffer,
3011                                         size_t *lenp, loff_t *ppos)
3012 {
3013         if (write) {
3014                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
3015                 rt_cache_flush(flush_delay);
3016                 return 0;
3017         }
3018
3019         return -EINVAL;
3020 }
3021
3022 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3023                                                 int __user *name,
3024                                                 int nlen,
3025                                                 void __user *oldval,
3026                                                 size_t __user *oldlenp,
3027                                                 void __user *newval,
3028                                                 size_t newlen)
3029 {
3030         int delay;
3031         if (newlen != sizeof(int))
3032                 return -EINVAL;
3033         if (get_user(delay, (int __user *)newval))
3034                 return -EFAULT;
3035         rt_cache_flush(delay);
3036         return 0;
3037 }
3038
3039 ctl_table ipv4_route_table[] = {
3040         {
3041                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3042                 .procname       = "flush",
3043                 .data           = &flush_delay,
3044                 .maxlen         = sizeof(int),
3045                 .mode           = 0200,
3046                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
3047                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
3048         },
3049         {
3050                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
3051                 .procname       = "min_delay",
3052                 .data           = &ip_rt_min_delay,
3053                 .maxlen         = sizeof(int),
3054                 .mode           = 0644,
3055                 .proc_handler   = &proc_dointvec_jiffies,
3056                 .strategy       = &sysctl_jiffies,
3057         },
3058         {
3059                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
3060                 .procname       = "max_delay",
3061                 .data           = &ip_rt_max_delay,
3062                 .maxlen         = sizeof(int),
3063                 .mode           = 0644,
3064                 .proc_handler   = &proc_dointvec_jiffies,
3065                 .strategy       = &sysctl_jiffies,
3066         },
3067         {
3068                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
3069                 .procname       = "gc_thresh",
3070                 .data           = &ipv4_dst_ops.gc_thresh,
3071                 .maxlen         = sizeof(int),
3072                 .mode           = 0644,
3073                 .proc_handler   = &proc_dointvec,
3074         },
3075         {
3076                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3077                 .procname       = "max_size",
3078                 .data           = &ip_rt_max_size,
3079                 .maxlen         = sizeof(int),
3080                 .mode           = 0644,
3081                 .proc_handler   = &proc_dointvec,
3082         },
3083         {
3084                 /*  Deprecated. Use gc_min_interval_ms */
3085
3086                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3087                 .procname       = "gc_min_interval",
3088                 .data           = &ip_rt_gc_min_interval,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = &proc_dointvec_jiffies,
3092                 .strategy       = &sysctl_jiffies,
3093         },
3094         {
3095                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3096                 .procname       = "gc_min_interval_ms",
3097                 .data           = &ip_rt_gc_min_interval,
3098                 .maxlen         = sizeof(int),
3099                 .mode           = 0644,
3100                 .proc_handler   = &proc_dointvec_ms_jiffies,
3101                 .strategy       = &sysctl_ms_jiffies,
3102         },
3103         {
3104                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3105                 .procname       = "gc_timeout",
3106                 .data           = &ip_rt_gc_timeout,
3107                 .maxlen         = sizeof(int),
3108                 .mode           = 0644,
3109                 .proc_handler   = &proc_dointvec_jiffies,
3110                 .strategy       = &sysctl_jiffies,
3111         },
3112         {
3113                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3114                 .procname       = "gc_interval",
3115                 .data           = &ip_rt_gc_interval,
3116                 .maxlen         = sizeof(int),
3117                 .mode           = 0644,
3118                 .proc_handler   = &proc_dointvec_jiffies,
3119                 .strategy       = &sysctl_jiffies,
3120         },
3121         {
3122                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3123                 .procname       = "redirect_load",
3124                 .data           = &ip_rt_redirect_load,
3125                 .maxlen         = sizeof(int),
3126                 .mode           = 0644,
3127                 .proc_handler   = &proc_dointvec,
3128         },
3129         {
3130                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3131                 .procname       = "redirect_number",
3132                 .data           = &ip_rt_redirect_number,
3133                 .maxlen         = sizeof(int),
3134                 .mode           = 0644,
3135                 .proc_handler   = &proc_dointvec,
3136         },
3137         {
3138                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3139                 .procname       = "redirect_silence",
3140                 .data           = &ip_rt_redirect_silence,
3141                 .maxlen         = sizeof(int),
3142                 .mode           = 0644,
3143                 .proc_handler   = &proc_dointvec,
3144         },
3145         {
3146                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3147                 .procname       = "error_cost",
3148                 .data           = &ip_rt_error_cost,
3149                 .maxlen         = sizeof(int),
3150                 .mode           = 0644,
3151                 .proc_handler   = &proc_dointvec,
3152         },
3153         {
3154                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3155                 .procname       = "error_burst",
3156                 .data           = &ip_rt_error_burst,
3157                 .maxlen         = sizeof(int),
3158                 .mode           = 0644,
3159                 .proc_handler   = &proc_dointvec,
3160         },
3161         {
3162                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3163                 .procname       = "gc_elasticity",
3164                 .data           = &ip_rt_gc_elasticity,
3165                 .maxlen         = sizeof(int),
3166                 .mode           = 0644,
3167                 .proc_handler   = &proc_dointvec,
3168         },
3169         {
3170                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3171                 .procname       = "mtu_expires",
3172                 .data           = &ip_rt_mtu_expires,
3173                 .maxlen         = sizeof(int),
3174                 .mode           = 0644,
3175                 .proc_handler   = &proc_dointvec_jiffies,
3176                 .strategy       = &sysctl_jiffies,
3177         },
3178         {
3179                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3180                 .procname       = "min_pmtu",
3181                 .data           = &ip_rt_min_pmtu,
3182                 .maxlen         = sizeof(int),
3183                 .mode           = 0644,
3184                 .proc_handler   = &proc_dointvec,
3185         },
3186         {
3187                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3188                 .procname       = "min_adv_mss",
3189                 .data           = &ip_rt_min_advmss,
3190                 .maxlen         = sizeof(int),
3191                 .mode           = 0644,
3192                 .proc_handler   = &proc_dointvec,
3193         },
3194         {
3195                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3196                 .procname       = "secret_interval",
3197                 .data           = &ip_rt_secret_interval,
3198                 .maxlen         = sizeof(int),
3199                 .mode           = 0644,
3200                 .proc_handler   = &proc_dointvec_jiffies,
3201                 .strategy       = &sysctl_jiffies,
3202         },
3203         { .ctl_name = 0 }
3204 };
3205 #endif
3206
3207 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting table; allocated and zeroed in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 *
 * Each CPU owns a consecutive run of 256 struct ip_rt_acct entries inside
 * the table.  The argument is parenthesized so that a compound expression
 * such as IP_RT_ACCT_CPU(cpu + 1) indexes the intended slice instead of
 * expanding to ip_rt_acct + cpu + 1 * 256.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3214
3215 #ifdef CONFIG_PROC_FS
3216 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3217                            int length, int *eof, void *data)
3218 {
3219         unsigned int i;
3220
3221         if ((offset & 3) || (length & 3))
3222                 return -EIO;
3223
3224         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3225                 *eof = 1;
3226                 return 0;
3227         }
3228
3229         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3230                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3231                 *eof = 1;
3232         }
3233
3234         offset /= sizeof(u32);
3235
3236         if (length > 0) {
3237                 u32 *dst = (u32 *) buffer;
3238
3239                 *start = buffer;
3240                 memset(dst, 0, length);
3241
3242                 for_each_possible_cpu(i) {
3243                         unsigned int j;
3244                         u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3245
3246                         for (j = 0; j < length/4; j++)
3247                                 dst[j] += src[j];
3248                 }
3249         }
3250         return length;
3251 }
3252 #endif /* CONFIG_PROC_FS */
3253 #endif /* CONFIG_NET_CLS_ROUTE */
3254
3255 static __initdata unsigned long rhash_entries;
3256 static int __init set_rhash_entries(char *str)
3257 {
3258         if (!str)
3259                 return 0;
3260         rhash_entries = simple_strtoul(str, &str, 0);
3261         return 1;
3262 }
3263 __setup("rhash_entries=", set_rhash_entries);
3264
3265 int __init ip_rt_init(void)
3266 {
3267         int rc = 0;
3268
3269         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3270                              (jiffies ^ (jiffies >> 7)));
3271
3272 #ifdef CONFIG_NET_CLS_ROUTE
3273         {
3274         int order;
3275         for (order = 0;
3276              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3277                 /* NOTHING */;
3278         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3279         if (!ip_rt_acct)
3280                 panic("IP: failed to allocate ip_rt_acct\n");
3281         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3282         }
3283 #endif
3284
3285         ipv4_dst_ops.kmem_cachep =
3286                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3287                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3288
3289         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3290
3291         rt_hash_table = (struct rt_hash_bucket *)
3292                 alloc_large_system_hash("IP route cache",
3293                                         sizeof(struct rt_hash_bucket),
3294                                         rhash_entries,
3295                                         (num_physpages >= 128 * 1024) ?
3296                                         15 : 17,
3297                                         0,
3298                                         &rt_hash_log,
3299                                         &rt_hash_mask,
3300                                         0);
3301         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3302         rt_hash_lock_init();
3303
3304         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3305         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3306
3307         devinet_init();
3308         ip_fib_init();
3309
3310         init_timer(&rt_flush_timer);
3311         rt_flush_timer.function = rt_run_flush;
3312         init_timer(&rt_secret_timer);
3313         rt_secret_timer.function = rt_secret_rebuild;
3314
3315         /* All the timers, started at system startup tend
3316            to synchronize. Perturb it a bit.
3317          */
3318         schedule_delayed_work(&expires_work,
3319                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3320
3321         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3322                 ip_rt_secret_interval;
3323         add_timer(&rt_secret_timer);
3324
3325 #ifdef CONFIG_PROC_FS
3326         {
3327         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3328         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3329             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3330                                              proc_net_stat))) {
3331                 return -ENOMEM;
3332         }
3333         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3334         }
3335 #ifdef CONFIG_NET_CLS_ROUTE
3336         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3337 #endif
3338 #endif
3339 #ifdef CONFIG_XFRM
3340         xfrm_init();
3341         xfrm4_init();
3342 #endif
3343         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3344
3345         return rc;
3346 }
3347
3348 EXPORT_SYMBOL(__ip_select_ident);
3349 EXPORT_SYMBOL(ip_route_input);
3350 EXPORT_SYMBOL(ip_route_output_key);
3351 EXPORT_SYMBOL(ip_route_input_lookup);