net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/mm.h>
  75 #include <linux/bootmem.h>
  76 #include <linux/string.h>
  77 #include <linux/socket.h>
  78 #include <linux/sockios.h>
  79 #include <linux/errno.h>
  80 #include <linux/in.h>
  81 #include <linux/inet.h>
  82 #include <linux/netdevice.h>
  83 #include <linux/proc_fs.h>
  84 #include <linux/init.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/rtnetlink.h>
  87 #include <linux/inetdevice.h>
  88 #include <linux/igmp.h>
  89 #include <linux/pkt_sched.h>
  90 #include <linux/mroute.h>
  91 #include <linux/netfilter_ipv4.h>
  92 #include <linux/random.h>
  93 #include <linux/jhash.h>
  94 #include <linux/rcupdate.h>
  95 #include <linux/times.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/ip_mp_alg.h>
 107 #include <net/netevent.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_min_delay              = 2 * HZ;
 120 static int ip_rt_max_delay              = 10 * HZ;
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval            = 60 * HZ;
 124 static int ip_rt_gc_min_interval        = HZ / 2;
 125 static int ip_rt_redirect_number        = 9;
 126 static int ip_rt_redirect_load          = HZ / 50;
 127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost             = HZ;
 129 static int ip_rt_error_burst            = 5 * HZ;
 130 static int ip_rt_gc_elasticity          = 8;
 131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 133 static int ip_rt_min_advmss             = 256;
 134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 135 static unsigned long rt_deadline;
 136
 137 #define RTprint(a...)   printk(KERN_DEBUG a)
 138
 139 static struct timer_list rt_flush_timer;
 140 static struct timer_list rt_periodic_timer;
 141 static struct timer_list rt_secret_timer;
 142
 143 /*
 144  *      Interface to generic destination cache.
 145  */
 146
 147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 150                                          struct net_device *dev, int how);
 151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 152 static void              ipv4_link_failure(struct sk_buff *skb);
 153 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 154 static int rt_garbage_collect(void);
 155
 156
 157 static struct dst_ops ipv4_dst_ops = {
 158         .family =               AF_INET,
 159         .protocol =             __constant_htons(ETH_P_IP),
 160         .gc =                   rt_garbage_collect,
 161         .check =                ipv4_dst_check,
 162         .destroy =              ipv4_dst_destroy,
 163         .ifdown =               ipv4_dst_ifdown,
 164         .negative_advice =      ipv4_negative_advice,
 165         .link_failure =         ipv4_link_failure,
 166         .update_pmtu =          ip_rt_update_pmtu,
 167         .entry_size =           sizeof(struct rtable),
 168 };
 169
 170 #define ECN_OR_COST(class)      TC_PRIO_##class
 171
 172 __u8 ip_tos2prio[16] = {
 173         TC_PRIO_BESTEFFORT,
 174         ECN_OR_COST(FILLER),
 175         TC_PRIO_BESTEFFORT,
 176         ECN_OR_COST(BESTEFFORT),
 177         TC_PRIO_BULK,
 178         ECN_OR_COST(BULK),
 179         TC_PRIO_BULK,
 180         ECN_OR_COST(BULK),
 181         TC_PRIO_INTERACTIVE,
 182         ECN_OR_COST(INTERACTIVE),
 183         TC_PRIO_INTERACTIVE,
 184         ECN_OR_COST(INTERACTIVE),
 185         TC_PRIO_INTERACTIVE_BULK,
 186         ECN_OR_COST(INTERACTIVE_BULK),
 187         TC_PRIO_INTERACTIVE_BULK,
 188         ECN_OR_COST(INTERACTIVE_BULK)
 189 };
 190
 191
 192 /*
 193  * Route cache.
 194  */
 195
  196 /* The locking scheme is rather straightforward:
 197  *
 198  * 1) Read-Copy Update protects the buckets of the central route hash.
 199  * 2) Only writers remove entries, and they hold the lock
 200  *    as they look at rtable reference counts.
 201  * 3) Only readers acquire references to rtable entries,
 202  *    they do so with atomic increments and with the
 203  *    lock held.
 204  */
 205
 206 struct rt_hash_bucket {
 207         struct rtable   *chain;
 208 };
 209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 210         defined(CONFIG_PROVE_LOCKING)
 211 /*
 212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 213  * The size of this table is a power of two and depends on the number of CPUS.
 214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 215  */
 216 #ifdef CONFIG_LOCKDEP
 217 # define RT_HASH_LOCK_SZ        256
 218 #else
 219 # if NR_CPUS >= 32
 220 #  define RT_HASH_LOCK_SZ       4096
 221 # elif NR_CPUS >= 16
 222 #  define RT_HASH_LOCK_SZ       2048
 223 # elif NR_CPUS >= 8
 224 #  define RT_HASH_LOCK_SZ       1024
 225 # elif NR_CPUS >= 4
 226 #  define RT_HASH_LOCK_SZ       512
 227 # else
 228 #  define RT_HASH_LOCK_SZ       256
 229 # endif
 230 #endif
 231
 232 static spinlock_t       *rt_hash_locks;
 233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 234 # define rt_hash_lock_init()    { \
 235                 int i; \
 236                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 237                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 238                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 239                         spin_lock_init(&rt_hash_locks[i]); \
 240                 }
 241 #else
 242 # define rt_hash_lock_addr(slot) NULL
 243 # define rt_hash_lock_init()
 244 #endif
 245
 246 static struct rt_hash_bucket    *rt_hash_table;
 247 static unsigned                 rt_hash_mask;
 248 static int                      rt_hash_log;
 249 static unsigned int             rt_hash_rnd;
 250
 251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 252 #define RT_CACHE_STAT_INC(field) \
 253         (__raw_get_cpu_var(rt_cache_stat).field++)
 254
 255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 256                                 struct rtable **res);
 257
 258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 259 {
 260         return (jhash_2words(daddr, saddr, rt_hash_rnd)
 261                 & rt_hash_mask);
 262 }
 263
 264 #define rt_hash(daddr, saddr, idx) \
 265         rt_hash_code((__force u32)(__be32)(daddr),\
 266                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 267
 268 #ifdef CONFIG_PROC_FS
 269 struct rt_cache_iter_state {
 270         int bucket;
 271 };
 272
 273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 274 {
 275         struct rtable *r = NULL;
 276         struct rt_cache_iter_state *st = seq->private;
 277
 278         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 279                 rcu_read_lock_bh();
 280                 r = rt_hash_table[st->bucket].chain;
 281                 if (r)
 282                         break;
 283                 rcu_read_unlock_bh();
 284         }
 285         return r;
 286 }
 287
 288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 289 {
 290         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 291
 292         r = r->u.rt_next;
 293         while (!r) {
 294                 rcu_read_unlock_bh();
 295                 if (--st->bucket < 0)
 296                         break;
 297                 rcu_read_lock_bh();
 298                 r = rt_hash_table[st->bucket].chain;
 299         }
 300         return r;
 301 }
 302
 303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 304 {
 305         struct rtable *r = rt_cache_get_first(seq);
 306
 307         if (r)
 308                 while (pos && (r = rt_cache_get_next(seq, r)))
 309                         --pos;
 310         return pos ? NULL : r;
 311 }
 312
 313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 314 {
 315         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 316 }
 317
 318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 319 {
 320         struct rtable *r = NULL;
 321
 322         if (v == SEQ_START_TOKEN)
 323                 r = rt_cache_get_first(seq);
 324         else
 325                 r = rt_cache_get_next(seq, v);
 326         ++*pos;
 327         return r;
 328 }
 329
 330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 331 {
 332         if (v && v != SEQ_START_TOKEN)
 333                 rcu_read_unlock_bh();
 334 }
 335
 336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 337 {
 338         if (v == SEQ_START_TOKEN)
 339                 seq_printf(seq, "%-127s\n",
 340                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 341                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 342                            "HHUptod\tSpecDst");
 343         else {
 344                 struct rtable *r = v;
 345                 char temp[256];
 346
 347                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 348                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 349                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 350                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 351                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 352                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 353                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 354                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 355                         dst_metric(&r->u.dst, RTAX_WINDOW),
 356                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 357                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 358                         r->fl.fl4_tos,
 359                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 360                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 361                                        dev_queue_xmit) : 0,
 362                         r->rt_spec_dst);
 363                 seq_printf(seq, "%-127s\n", temp);
 364         }
 365         return 0;
 366 }
 367
 368 static struct seq_operations rt_cache_seq_ops = {
 369         .start  = rt_cache_seq_start,
 370         .next   = rt_cache_seq_next,
 371         .stop   = rt_cache_seq_stop,
 372         .show   = rt_cache_seq_show,
 373 };
 374
 375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 376 {
 377         struct seq_file *seq;
 378         int rc = -ENOMEM;
 379         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 380
 381         if (!s)
 382                 goto out;
 383         rc = seq_open(file, &rt_cache_seq_ops);
 384         if (rc)
 385                 goto out_kfree;
 386         seq          = file->private_data;
 387         seq->private = s;
 388         memset(s, 0, sizeof(*s));
 389 out:
 390         return rc;
 391 out_kfree:
 392         kfree(s);
 393         goto out;
 394 }
 395
 396 static struct file_operations rt_cache_seq_fops = {
 397         .owner   = THIS_MODULE,
 398         .open    = rt_cache_seq_open,
 399         .read    = seq_read,
 400         .llseek  = seq_lseek,
 401         .release = seq_release_private,
 402 };
 403
 404
 405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 406 {
 407         int cpu;
 408
 409         if (*pos == 0)
 410                 return SEQ_START_TOKEN;
 411
 412         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 413                 if (!cpu_possible(cpu))
 414                         continue;
 415                 *pos = cpu+1;
 416                 return &per_cpu(rt_cache_stat, cpu);
 417         }
 418         return NULL;
 419 }
 420
 421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 422 {
 423         int cpu;
 424
 425         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 426                 if (!cpu_possible(cpu))
 427                         continue;
 428                 *pos = cpu+1;
 429                 return &per_cpu(rt_cache_stat, cpu);
 430         }
 431         return NULL;
 432
 433 }
 434
 435 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 436 {
 437
 438 }
 439
 440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 441 {
 442         struct rt_cache_stat *st = v;
 443
 444         if (v == SEQ_START_TOKEN) {
 445                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 446                 return 0;
 447         }
 448
 449         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 450                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 451                    atomic_read(&ipv4_dst_ops.entries),
 452                    st->in_hit,
 453                    st->in_slow_tot,
 454                    st->in_slow_mc,
 455                    st->in_no_route,
 456                    st->in_brd,
 457                    st->in_martian_dst,
 458                    st->in_martian_src,
 459
 460                    st->out_hit,
 461                    st->out_slow_tot,
 462                    st->out_slow_mc,
 463
 464                    st->gc_total,
 465                    st->gc_ignored,
 466                    st->gc_goal_miss,
 467                    st->gc_dst_overflow,
 468                    st->in_hlist_search,
 469                    st->out_hlist_search
 470                 );
 471         return 0;
 472 }
 473
 474 static struct seq_operations rt_cpu_seq_ops = {
 475         .start  = rt_cpu_seq_start,
 476         .next   = rt_cpu_seq_next,
 477         .stop   = rt_cpu_seq_stop,
 478         .show   = rt_cpu_seq_show,
 479 };
 480
 481
 482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 483 {
 484         return seq_open(file, &rt_cpu_seq_ops);
 485 }
 486
 487 static struct file_operations rt_cpu_seq_fops = {
 488         .owner   = THIS_MODULE,
 489         .open    = rt_cpu_seq_open,
 490         .read    = seq_read,
 491         .llseek  = seq_lseek,
 492         .release = seq_release,
 493 };
 494
 495 #endif /* CONFIG_PROC_FS */
 496
 497 static __inline__ void rt_free(struct rtable *rt)
 498 {
 499         multipath_remove(rt);
 500         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 501 }
 502
 503 static __inline__ void rt_drop(struct rtable *rt)
 504 {
 505         multipath_remove(rt);
 506         ip_rt_put(rt);
 507         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 508 }
 509
 510 static __inline__ int rt_fast_clean(struct rtable *rth)
 511 {
 512         /* Kill broadcast/multicast entries very aggresively, if they
 513            collide in hash table with more useful entries */
 514         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 515                 rth->fl.iif && rth->u.rt_next;
 516 }
 517
 518 static __inline__ int rt_valuable(struct rtable *rth)
 519 {
 520         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 521                 rth->u.dst.expires;
 522 }
 523
 524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 525 {
 526         unsigned long age;
 527         int ret = 0;
 528
 529         if (atomic_read(&rth->u.dst.__refcnt))
 530                 goto out;
 531
 532         ret = 1;
 533         if (rth->u.dst.expires &&
 534             time_after_eq(jiffies, rth->u.dst.expires))
 535                 goto out;
 536
 537         age = jiffies - rth->u.dst.lastuse;
 538         ret = 0;
 539         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 540             (age <= tmo2 && rt_valuable(rth)))
 541                 goto out;
 542         ret = 1;
 543 out:    return ret;
 544 }
 545
 546 /* Bits of score are:
 547  * 31: very valuable
 548  * 30: not quite useless
 549  * 29..0: usage counter
 550  */
 551 static inline u32 rt_score(struct rtable *rt)
 552 {
 553         u32 score = jiffies - rt->u.dst.lastuse;
 554
 555         score = ~score & ~(3<<30);
 556
 557         if (rt_valuable(rt))
 558                 score |= (1<<31);
 559
 560         if (!rt->fl.iif ||
 561             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 562                 score |= (1<<30);
 563
 564         return score;
 565 }
 566
 567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 568 {
 569         return ((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 570                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 571 #ifdef CONFIG_IP_ROUTE_FWMARK
 572                 (fl1->nl_u.ip4_u.fwmark ^ fl2->nl_u.ip4_u.fwmark) |
 573 #endif
 574                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 575                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 576                 (fl1->oif ^ fl2->oif) |
 577                 (fl1->iif ^ fl2->iif)) == 0;
 578 }
 579
 580 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 581 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 582                                                 struct rtable *expentry,
 583                                                 int *removed_count)
 584 {
 585         int passedexpired = 0;
 586         struct rtable **nextstep = NULL;
 587         struct rtable **rthp = chain_head;
 588         struct rtable *rth;
 589
 590         if (removed_count)
 591                 *removed_count = 0;
 592
 593         while ((rth = *rthp) != NULL) {
 594                 if (rth == expentry)
 595                         passedexpired = 1;
 596
 597                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 598                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 599                         if (*rthp == expentry) {
 600                                 *rthp = rth->u.rt_next;
 601                                 continue;
 602                         } else {
 603                                 *rthp = rth->u.rt_next;
 604                                 rt_free(rth);
 605                                 if (removed_count)
 606                                         ++(*removed_count);
 607                         }
 608                 } else {
 609                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 610                             passedexpired && !nextstep)
 611                                 nextstep = &rth->u.rt_next;
 612
 613                         rthp = &rth->u.rt_next;
 614                 }
 615         }
 616
 617         rt_free(expentry);
 618         if (removed_count)
 619                 ++(*removed_count);
 620
 621         return nextstep;
 622 }
 623 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 624
 625
 626 /* This runs via a timer and thus is always in BH context. */
 627 static void rt_check_expire(unsigned long dummy)
 628 {
 629         static unsigned int rover;
 630         unsigned int i = rover, goal;
 631         struct rtable *rth, **rthp;
 632         unsigned long now = jiffies;
 633         u64 mult;
 634
 635         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 636         if (ip_rt_gc_timeout > 1)
 637                 do_div(mult, ip_rt_gc_timeout);
 638         goal = (unsigned int)mult;
 639         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
 640         for (; goal > 0; goal--) {
 641                 unsigned long tmo = ip_rt_gc_timeout;
 642
 643                 i = (i + 1) & rt_hash_mask;
 644                 rthp = &rt_hash_table[i].chain;
 645
 646                 if (*rthp == 0)
 647                         continue;
 648                 spin_lock(rt_hash_lock_addr(i));
 649                 while ((rth = *rthp) != NULL) {
 650                         if (rth->u.dst.expires) {
 651                                 /* Entry is expired even if it is in use */
 652                                 if (time_before_eq(now, rth->u.dst.expires)) {
 653                                         tmo >>= 1;
 654                                         rthp = &rth->u.rt_next;
 655                                         continue;
 656                                 }
 657                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 658                                 tmo >>= 1;
 659                                 rthp = &rth->u.rt_next;
 660                                 continue;
 661                         }
 662
 663                         /* Cleanup aged off entries. */
 664 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 665                         /* remove all related balanced entries if necessary */
 666                         if (rth->u.dst.flags & DST_BALANCED) {
 667                                 rthp = rt_remove_balanced_route(
 668                                         &rt_hash_table[i].chain,
 669                                         rth, NULL);
 670                                 if (!rthp)
 671                                         break;
 672                         } else {
 673                                 *rthp = rth->u.rt_next;
 674                                 rt_free(rth);
 675                         }
 676 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 677                         *rthp = rth->u.rt_next;
 678                         rt_free(rth);
 679 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 680                 }
 681                 spin_unlock(rt_hash_lock_addr(i));
 682
 683                 /* Fallback loop breaker. */
 684                 if (time_after(jiffies, now))
 685                         break;
 686         }
 687         rover = i;
 688         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 689 }
 690
 691 /* This can run from both BH and non-BH contexts, the latter
 692  * in the case of a forced flush event.
 693  */
 694 static void rt_run_flush(unsigned long dummy)
 695 {
 696         int i;
 697         struct rtable *rth, *next;
 698
 699         rt_deadline = 0;
 700
 701         get_random_bytes(&rt_hash_rnd, 4);
 702
 703         for (i = rt_hash_mask; i >= 0; i--) {
 704                 spin_lock_bh(rt_hash_lock_addr(i));
 705                 rth = rt_hash_table[i].chain;
 706                 if (rth)
 707                         rt_hash_table[i].chain = NULL;
 708                 spin_unlock_bh(rt_hash_lock_addr(i));
 709
 710                 for (; rth; rth = next) {
 711                         next = rth->u.rt_next;
 712                         rt_free(rth);
 713                 }
 714         }
 715 }
 716
 717 static DEFINE_SPINLOCK(rt_flush_lock);
 718
 719 void rt_cache_flush(int delay)
 720 {
 721         unsigned long now = jiffies;
 722         int user_mode = !in_softirq();
 723
 724         if (delay < 0)
 725                 delay = ip_rt_min_delay;
 726
 727         /* flush existing multipath state*/
 728         multipath_flush();
 729
 730         spin_lock_bh(&rt_flush_lock);
 731
 732         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 733                 long tmo = (long)(rt_deadline - now);
 734
 735                 /* If flush timer is already running
 736                    and flush request is not immediate (delay > 0):
 737
 738                    if deadline is not achieved, prolongate timer to "delay",
 739                    otherwise fire it at deadline time.
 740                  */
 741
 742                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 743                         tmo = 0;
 744
 745                 if (delay > tmo)
 746                         delay = tmo;
 747         }
 748
 749         if (delay <= 0) {
 750                 spin_unlock_bh(&rt_flush_lock);
 751                 rt_run_flush(0);
 752                 return;
 753         }
 754
 755         if (rt_deadline == 0)
 756                 rt_deadline = now + ip_rt_max_delay;
 757
 758         mod_timer(&rt_flush_timer, now+delay);
 759         spin_unlock_bh(&rt_flush_lock);
 760 }
 761
 762 static void rt_secret_rebuild(unsigned long dummy)
 763 {
 764         unsigned long now = jiffies;
 765
 766         rt_cache_flush(0);
 767         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 768 }
 769
 770 /*
 771    Short description of GC goals.
 772
  773    We want an algorithm that keeps the routing cache at an
  774    equilibrium point, where the number of aged-off entries
  775    stays approximately equal to the number of newly created ones.
  776
  777    The current expiration strength is the variable "expire".
  778    We adjust it dynamically: when networking is idle, "expire"
  779    is large enough to keep plenty of warm entries, and when load
  780    increases it shrinks to limit the cache size.
 781  */
 782
 783 static int rt_garbage_collect(void)
 784 {
             /*
              * Shrink the route cache.  Returns 0 in the normal case and 1
              * (after a rate-limited "dst cache overflow" warning) when the
              * scan gave up and the cache still holds at least
              * ip_rt_max_size entries.
              *
              * expire, last_gc, rover and equilibrium are static, so their
              * values carry over from one call to the next.
              */
 785         static unsigned long expire = RT_GC_TIMEOUT;
 786         static unsigned long last_gc;
 787         static int rover;
 788         static int equilibrium;
 789         struct rtable *rth, **rthp;
 790         unsigned long now = jiffies;
 791         int goal;
 792
 793         /*
 794          * Garbage collection is pretty expensive,
 795          * do not make it too frequently.
 796          */
 797
 798         RT_CACHE_STAT_INC(gc_total);
 799
             /* Skip the run only if it is both too soon and the cache is below its hard limit. */
 800         if (now - last_gc < ip_rt_gc_min_interval &&
 801             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 802                 RT_CACHE_STAT_INC(gc_ignored);
 803                 goto out;
 804         }
 805
 806         /* Calculate number of entries, which we want to expire now. */
 807         goal = atomic_read(&ipv4_dst_ops.entries) -
 808                 (ip_rt_gc_elasticity << rt_hash_log);
 809         if (goal <= 0) {
                     /*
                      * Entry count is within ip_rt_gc_elasticity << rt_hash_log:
                      * aim for "equilibrium" (never below gc_thresh), and let
                      * equilibrium creep upwards when we are above it.
                      */
 810                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 811                         equilibrium = ipv4_dst_ops.gc_thresh;
 812                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 813                 if (goal > 0) {
 814                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 815                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 816                 }
 817         } else {
 818                 /* We are in dangerous area. Try to reduce cache really
 819                  * aggressively.
 820                  */
 821                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 822                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 823         }
 824
 825         if (now - last_gc >= ip_rt_gc_min_interval)
 826                 last_gc = now;
 827
             /* Nothing to expire: fold the (non-positive) goal into equilibrium and finish. */
 828         if (goal <= 0) {
 829                 equilibrium += goal;
 830                 goto work_done;
 831         }
 832
 833         do {
 834                 int i, k;
 835
                     /*
                      * Visit at most rt_hash_mask + 1 buckets, starting with
                      * the one after "rover" (where the previous scan stopped).
                      */
 836                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 837                         unsigned long tmo = expire;
 838
 839                         k = (k + 1) & rt_hash_mask;
 840                         rthp = &rt_hash_table[k].chain;
 841                         spin_lock_bh(rt_hash_lock_addr(k));
 842                         while ((rth = *rthp) != NULL) {
                                     /*
                                      * Each entry that is kept halves the
                                      * timeout applied to the entries behind
                                      * it in the same chain.
                                      */
 843                                 if (!rt_may_expire(rth, tmo, expire)) {
 844                                         tmo >>= 1;
 845                                         rthp = &rth->u.rt_next;
 846                                         continue;
 847                                 }
 848 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 849                                 /* remove all related balanced entries
 850                                  * if necessary
 851                                  */
 852                                 if (rth->u.dst.flags & DST_BALANCED) {
 853                                         int r;
 854
 855                                         rthp = rt_remove_balanced_route(
 856                                                 &rt_hash_table[k].chain,
 857                                                 rth,
 858                                                 &r);
 859                                         goal -= r;
 860                                         if (!rthp)
 861                                                 break;
 862                                 } else {
 863                                         *rthp = rth->u.rt_next;
 864                                         rt_free(rth);
 865                                         goal--;
 866                                 }
 867 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 868                                 *rthp = rth->u.rt_next;
 869                                 rt_free(rth);
 870                                 goal--;
 871 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 872                         }
 873                         spin_unlock_bh(rt_hash_lock_addr(k));
 874                         if (goal <= 0)
 875                                 break;
 876                 }
 877                 rover = k;
 878
 879                 if (goal <= 0)
 880                         goto work_done;
 881
 882                 /* Goal is not achieved. We stop process if:
 883
 884                    - if expire reduced to zero. Otherwise, expire is halved.
 885                    - if table is not full.
 886                    - if we are called from interrupt.
 887                    - jiffies check is just fallback/debug loop breaker.
 888                      We will not spin here for long time in any case.
 889                  */
 890
 891                 RT_CACHE_STAT_INC(gc_goal_miss);
 892
 893                 if (expire == 0)
 894                         break;
 895
 896                 expire >>= 1;
                     /* NOTE(review): expire is unsigned long but is printed with %u below. */
 897 #if RT_CACHE_DEBUG >= 2
 898                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 899                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 900 #endif
 901
 902                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 903                         goto out;
 904         } while (!in_softirq() && time_before_eq(jiffies, now));
 905
             /* Gave up: this is only an overflow if the hard limit is still reached. */
 906         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 907                 goto out;
 908         if (net_ratelimit())
 909                 printk(KERN_WARNING "dst cache overflow\n");
 910         RT_CACHE_STAT_INC(gc_dst_overflow);
 911         return 1;
 912
 913 work_done:
             /*
              * Goal met: relax expire by ip_rt_gc_min_interval, capped at
              * ip_rt_gc_timeout (and reset to it once below gc_thresh).
              */
 914         expire += ip_rt_gc_min_interval;
 915         if (expire > ip_rt_gc_timeout ||
 916             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 917                 expire = ip_rt_gc_timeout;
 918 #if RT_CACHE_DEBUG >= 2
 919         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 920                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 921 #endif
 922 out:    return 0;
 923 }
 924
 925 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 926 {
 927         struct rtable   *rth, **rthp;
 928         unsigned long   now;
 929         struct rtable *cand, **candp;
 930         u32             min_score;
 931         int             chain_length;
 932         int attempts = !in_softirq();
 933
 934 restart:
 935         chain_length = 0;
 936         min_score = ~(u32)0;
 937         cand = NULL;
 938         candp = NULL;
 939         now = jiffies;
 940
 941         rthp = &rt_hash_table[hash].chain;
 942
 943         spin_lock_bh(rt_hash_lock_addr(hash));
 944         while ((rth = *rthp) != NULL) {
 945 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 946                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 947                     compare_keys(&rth->fl, &rt->fl)) {
 948 #else
 949                 if (compare_keys(&rth->fl, &rt->fl)) {
 950 #endif
 951                         /* Put it first */
 952                         *rthp = rth->u.rt_next;
 953                         /*
 954                          * Since lookup is lockfree, the deletion
 955                          * must be visible to another weakly ordered CPU before
 956                          * the insertion at the start of the hash chain.
 957                          */
 958                         rcu_assign_pointer(rth->u.rt_next,
 959                                            rt_hash_table[hash].chain);
 960                         /*
 961                          * Since lookup is lockfree, the update writes
 962                          * must be ordered for consistency on SMP.
 963                          */
 964                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 965
 966                         rth->u.dst.__use++;
 967                         dst_hold(&rth->u.dst);
 968                         rth->u.dst.lastuse = now;
 969                         spin_unlock_bh(rt_hash_lock_addr(hash));
 970
 971                         rt_drop(rt);
 972                         *rp = rth;
 973                         return 0;
 974                 }
 975
 976                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 977                         u32 score = rt_score(rth);
 978
 979                         if (score <= min_score) {
 980                                 cand = rth;
 981                                 candp = rthp;
 982                                 min_score = score;
 983                         }
 984                 }
 985
 986                 chain_length++;
 987
 988                 rthp = &rth->u.rt_next;
 989         }
 990
 991         if (cand) {
 992                 /* ip_rt_gc_elasticity used to be average length of chain
 993                  * length, when exceeded gc becomes really aggressive.
 994                  *
 995                  * The second limit is less certain. At the moment it allows
 996                  * only 2 entries per bucket. We will see.
 997                  */
 998                 if (chain_length > ip_rt_gc_elasticity) {
 999                         *candp = cand->u.rt_next;
1000                         rt_free(cand);
1001                 }
1002         }
1003
1004         /* Try to bind route to arp only if it is output
1005            route or unicast forwarding path.
1006          */
1007         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1008                 int err = arp_bind_neighbour(&rt->u.dst);
1009                 if (err) {
1010                         spin_unlock_bh(rt_hash_lock_addr(hash));
1011
1012                         if (err != -ENOBUFS) {
1013                                 rt_drop(rt);
1014                                 return err;
1015                         }
1016
1017                         /* Neighbour tables are full and nothing
1018                            can be released. Try to shrink route cache,
1019                            it is most likely it holds some neighbour records.
1020                          */
1021                         if (attempts-- > 0) {
1022                                 int saved_elasticity = ip_rt_gc_elasticity;
1023                                 int saved_int = ip_rt_gc_min_interval;
1024                                 ip_rt_gc_elasticity     = 1;
1025                                 ip_rt_gc_min_interval   = 0;
1026                                 rt_garbage_collect();
1027                                 ip_rt_gc_min_interval   = saved_int;
1028                                 ip_rt_gc_elasticity     = saved_elasticity;
1029                                 goto restart;
1030                         }
1031
1032                         if (net_ratelimit())
1033                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1034                         rt_drop(rt);
1035                         return -ENOBUFS;
1036                 }
1037         }
1038
1039         rt->u.rt_next = rt_hash_table[hash].chain;
1040 #if RT_CACHE_DEBUG >= 2
1041         if (rt->u.rt_next) {
1042                 struct rtable *trt;
1043                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1044                        NIPQUAD(rt->rt_dst));
1045                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1046                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1047                 printk("\n");
1048         }
1049 #endif
1050         rt_hash_table[hash].chain = rt;
1051         spin_unlock_bh(rt_hash_lock_addr(hash));
1052         *rp = rt;
1053         return 0;
1054 }
1055
1056 void rt_bind_peer(struct rtable *rt, int create)
1057 {
1058         static DEFINE_SPINLOCK(rt_peer_lock);
1059         struct inet_peer *peer;
1060
1061         peer = inet_getpeer(rt->rt_dst, create);
1062
1063         spin_lock_bh(&rt_peer_lock);
1064         if (rt->peer == NULL) {
1065                 rt->peer = peer;
1066                 peer = NULL;
1067         }
1068         spin_unlock_bh(&rt_peer_lock);
1069         if (peer)
1070                 inet_putpeer(peer);
1071 }
1072
1073 /*
1074  * Peer allocation may fail only in serious out-of-memory conditions.  However
1075  * we still can generate some output.
1076  * Random ID selection looks a bit dangerous because we have no chances to
1077  * select ID being unique in a reasonable period of time.
1078  * But broken packet identifier may be better than no packet at all.
1079  */
1080 static void ip_select_fb_ident(struct iphdr *iph)
1081 {
1082         static DEFINE_SPINLOCK(ip_fb_id_lock);
1083         static u32 ip_fallback_id;
1084         u32 salt;
1085
1086         spin_lock_bh(&ip_fb_id_lock);
1087         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1088         iph->id = htons(salt & 0xFFFF);
1089         ip_fallback_id = salt;
1090         spin_unlock_bh(&ip_fb_id_lock);
1091 }
1092
1093 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1094 {
1095         struct rtable *rt = (struct rtable *) dst;
1096
1097         if (rt) {
1098                 if (rt->peer == NULL)
1099                         rt_bind_peer(rt, 1);
1100
1101                 /* If peer is attached to destination, it is never detached,
1102                    so that we need not to grab a lock to dereference it.
1103                  */
1104                 if (rt->peer) {
1105                         iph->id = htons(inet_getid(rt->peer, more));
1106                         return;
1107                 }
1108         } else
1109                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1110                        __builtin_return_address(0));
1111
1112         ip_select_fb_ident(iph);
1113 }
1114
1115 static void rt_del(unsigned hash, struct rtable *rt)
1116 {
1117         struct rtable **rthp;
1118
1119         spin_lock_bh(rt_hash_lock_addr(hash));
1120         ip_rt_put(rt);
1121         for (rthp = &rt_hash_table[hash].chain; *rthp;
1122              rthp = &(*rthp)->u.rt_next)
1123                 if (*rthp == rt) {
1124                         *rthp = rt->u.rt_next;
1125                         rt_free(rt);
1126                         break;
1127                 }
1128         spin_unlock_bh(rt_hash_lock_addr(hash));
1129 }
1130
1131 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1132                     __be32 saddr, struct net_device *dev)
1133 {
             /*
              * Handle a redirect that advises new_gw instead of old_gw for
              * traffic from saddr to daddr, received on dev.
              *
              * After validating the advice, every cached output route
              * (fl.iif == 0) to daddr that currently goes through old_gw on
              * dev is replaced by a copy that uses new_gw and carries
              * RTCF_REDIRECTED.  The cache is probed with all four key
              * combinations of source {saddr, 0} and oif {dev->ifindex, 0}.
              */
1134         int i, k;
1135         struct in_device *in_dev = in_dev_get(dev);
1136         struct rtable *rth, **rthp;
1137         __be32  skeys[2] = { saddr, 0 };
1138         int  ikeys[2] = { dev->ifindex, 0 };
1139         struct netevent_redirect netevent;
1140
1141         if (!in_dev)
1142                 return;
1143
             /* Reject a no-op redirect, disabled redirects and unusable gateway addresses. */
1144         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1145             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1146                 goto reject_redirect;
1147
1148         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1149                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1150                         goto reject_redirect;
1151                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1152                         goto reject_redirect;
1153         } else {
1154                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1155                         goto reject_redirect;
1156         }
1157
1158         for (i = 0; i < 2; i++) {
1159                 for (k = 0; k < 2; k++) {
1160                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1161
1162                         rthp=&rt_hash_table[hash].chain;
1163
1164                         rcu_read_lock();
1165                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1166                                 struct rtable *rt;
1167
                                     /* Skip entries whose flow key is not the one being probed. */
1168                                 if (rth->fl.fl4_dst != daddr ||
1169                                     rth->fl.fl4_src != skeys[i] ||
1170                                     rth->fl.oif != ikeys[k] ||
1171                                     rth->fl.iif != 0) {
1172                                         rthp = &rth->u.rt_next;
1173                                         continue;
1174                                 }
1175
                                     /*
                                      * Key matched but the route is not the
                                      * one the redirect talks about: stop
                                      * scanning this chain.
                                      */
1176                                 if (rth->rt_dst != daddr ||
1177                                     rth->rt_src != saddr ||
1178                                     rth->u.dst.error ||
1179                                     rth->rt_gateway != old_gw ||
1180                                     rth->u.dst.dev != dev)
1181                                         break;
1182
                                     /* Take a reference on rth before leaving the RCU read section. */
1183                                 dst_hold(&rth->u.dst);
1184                                 rcu_read_unlock();
1185
1186                                 rt = dst_alloc(&ipv4_dst_ops);
1187                                 if (rt == NULL) {
1188                                         ip_rt_put(rth);
1189                                         in_dev_put(in_dev);
1190                                         return;
1191                                 }
1192
1193                                 /* Copy all the information. */
1194                                 *rt = *rth;
                                     /*
                                      * The structure copy duplicated rth's
                                      * pointers and counters; reset the
                                      * per-entry state and take the copy's
                                      * own references on dev and idev.
                                      */
1195                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1196                                 rt->u.dst.__use         = 1;
1197                                 atomic_set(&rt->u.dst.__refcnt, 1);
1198                                 rt->u.dst.child         = NULL;
1199                                 if (rt->u.dst.dev)
1200                                         dev_hold(rt->u.dst.dev);
1201                                 if (rt->idev)
1202                                         in_dev_hold(rt->idev);
1203                                 rt->u.dst.obsolete      = 0;
1204                                 rt->u.dst.lastuse       = jiffies;
1205                                 rt->u.dst.path          = &rt->u.dst;
1206                                 rt->u.dst.neighbour     = NULL;
1207                                 rt->u.dst.hh            = NULL;
1208                                 rt->u.dst.xfrm          = NULL;
1209
1210                                 rt->rt_flags            |= RTCF_REDIRECTED;
1211
1212                                 /* Gateway is different ... */
1213                                 rt->rt_gateway          = new_gw;
1214
1215                                 /* Redirect received -> path was valid */
1216                                 dst_confirm(&rth->u.dst);
1217
1218                                 if (rt->peer)
1219                                         atomic_inc(&rt->peer->refcnt);
1220
                                     /*
                                      * Give up on this entry if no neighbour
                                      * can be bound for the new gateway or
                                      * its state is not NUD_VALID; a bound
                                      * neighbour is passed to
                                      * neigh_event_send() first.
                                      */
1221                                 if (arp_bind_neighbour(&rt->u.dst) ||
1222                                     !(rt->u.dst.neighbour->nud_state &
1223                                             NUD_VALID)) {
1224                                         if (rt->u.dst.neighbour)
1225                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1226                                         ip_rt_put(rth);
1227                                         rt_drop(rt);
1228                                         goto do_next;
1229                                 }
1230
1231                                 netevent.old = &rth->u.dst;
1232                                 netevent.new = &rt->u.dst;
1233                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1234                                                         &netevent);
1235
                                     /*
                                      * rt_del() puts the reference taken on
                                      * rth above and unlinks it; the copy is
                                      * then inserted and our reference on
                                      * the inserted entry released.
                                      */
1236                                 rt_del(hash, rth);
1237                                 if (!rt_intern_hash(hash, rt, &rt))
1238                                         ip_rt_put(rt);
1239                                 goto do_next;
1240                         }
1241                         rcu_read_unlock();
                             /* do_next is reached with the RCU read lock already released. */
1242                 do_next:
1243                         ;
1244                 }
1245         }
1246         in_dev_put(in_dev);
1247         return;
1248
1249 reject_redirect:
1250 #ifdef CONFIG_IP_ROUTE_VERBOSE
1251         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1252                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1253                         "%u.%u.%u.%u ignored.\n"
1254                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1255                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1256                        NIPQUAD(saddr), NIPQUAD(daddr));
1257 #endif
1258         in_dev_put(in_dev);
1259 }
1260
1261 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1262 {
1263         struct rtable *rt = (struct rtable*)dst;
1264         struct dst_entry *ret = dst;
1265
1266         if (rt) {
1267                 if (dst->obsolete) {
1268                         ip_rt_put(rt);
1269                         ret = NULL;
1270                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1271                            rt->u.dst.expires) {
1272                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1273                                                 rt->fl.oif);
1274 #if RT_CACHE_DEBUG >= 1
1275                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1276                                           "%u.%u.%u.%u/%02x dropped\n",
1277                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1278 #endif
1279                         rt_del(hash, rt);
1280                         ret = NULL;
1281                 }
1282         }
1283         return ret;
1284 }
1285
1286 /*
1287  * Algorithm:
1288  *      1. The first ip_rt_redirect_number redirects are sent
1289  *         with exponential backoff, then we stop sending them at all,
1290  *         assuming that the host ignores our redirects.
1291  *      2. If we did not see packets requiring redirects
1292  *         during ip_rt_redirect_silence, we assume that the host
1293  *         forgot redirected route and start to send redirects again.
1294  *
1295  * This algorithm is much cheaper and more intelligent than dumb load limiting
1296  * in icmp.c.
1297  *
1298  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1299  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1300  */
1301
1302 void ip_rt_send_redirect(struct sk_buff *skb)
1303 {
1304         struct rtable *rt = (struct rtable*)skb->dst;
1305         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1306
1307         if (!in_dev)
1308                 return;
1309
1310         if (!IN_DEV_TX_REDIRECTS(in_dev))
1311                 goto out;
1312
1313         /* No redirected packets during ip_rt_redirect_silence;
1314          * reset the algorithm.
1315          */
1316         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1317                 rt->u.dst.rate_tokens = 0;
1318
1319         /* Too many ignored redirects; do not send anything
1320          * set u.dst.rate_last to the last seen redirected packet.
1321          */
1322         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1323                 rt->u.dst.rate_last = jiffies;
1324                 goto out;
1325         }
1326
1327         /* Check for load limit; set rate_last to the latest sent
1328          * redirect.
1329          */
1330         if (time_after(jiffies,
1331                        (rt->u.dst.rate_last +
1332                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1333                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1334                 rt->u.dst.rate_last = jiffies;
1335                 ++rt->u.dst.rate_tokens;
1336 #ifdef CONFIG_IP_ROUTE_VERBOSE
1337                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1338                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1339                     net_ratelimit())
1340                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1341                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1342                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1343                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1344 #endif
1345         }
1346 out:
1347         in_dev_put(in_dev);
1348 }
1349
1350 static int ip_error(struct sk_buff *skb)
1351 {
1352         struct rtable *rt = (struct rtable*)skb->dst;
1353         unsigned long now;
1354         int code;
1355
1356         switch (rt->u.dst.error) {
1357                 case EINVAL:
1358                 default:
1359                         goto out;
1360                 case EHOSTUNREACH:
1361                         code = ICMP_HOST_UNREACH;
1362                         break;
1363                 case ENETUNREACH:
1364                         code = ICMP_NET_UNREACH;
1365                         break;
1366                 case EACCES:
1367                         code = ICMP_PKT_FILTERED;
1368                         break;
1369         }
1370
1371         now = jiffies;
1372         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1373         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1374                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1375         rt->u.dst.rate_last = now;
1376         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1377                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1378                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1379         }
1380
1381 out:    kfree_skb(skb);
1382         return 0;
1383 }
1384
1385 /*
1386  *      The last two values are not from the RFC but
1387  *      are needed for AMPRnet AX.25 paths.
1388  */
1389
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the first (i.e. largest) plateau value that is strictly below
 * old_mtu, or 68 when old_mtu is not above any entry of the table.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1402
1403 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1404 {
             /*
              * Lower the cached path MTU of output routes matching the
              * addresses in iph.  new_mtu is the reported MTU; when it is
              * unusable a value is guessed from the header's total length.
              *
              * Returns 0 when ipv4_config.no_pmtu_disc is set, otherwise
              * the MTU estimate taken from a matching route or, if none
              * produced one, new_mtu itself.
              */
1405         int i;
1406         unsigned short old_mtu = ntohs(iph->tot_len);
1407         struct rtable *rth;
1408         __be32  skeys[2] = { iph->saddr, 0, };
1409         __be32  daddr = iph->daddr;
1410         unsigned short est_mtu = 0;
1411
1412         if (ipv4_config.no_pmtu_disc)
1413                 return 0;
1414
             /* Two probes: keyed on the packet's source address, then on source 0. */
1415         for (i = 0; i < 2; i++) {
1416                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1417
1418                 rcu_read_lock();
1419                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1420                      rth = rcu_dereference(rth->u.rt_next)) {
                             /* Only output routes (iif == 0) whose MTU metric is not locked. */
1421                         if (rth->fl.fl4_dst == daddr &&
1422                             rth->fl.fl4_src == skeys[i] &&
1423                             rth->rt_dst  == daddr &&
1424                             rth->rt_src  == iph->saddr &&
1425                             rth->fl.iif == 0 &&
1426                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1427                                 unsigned short mtu = new_mtu;
1428
                                     /*
                                      * Reported MTU is below 68 or not
                                      * smaller than the packet itself:
                                      * fall back to the plateau table.
                                      */
1429                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1430
1431                                         /* BSD 4.2 compatibility hack :-( */
1432                                         if (mtu == 0 &&
1433                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1434                                             old_mtu >= 68 + (iph->ihl << 2))
1435                                                 old_mtu -= iph->ihl << 2;
1436
1437                                         mtu = guess_mtu(old_mtu);
1438                                 }
1439                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1440                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1441                                                 dst_confirm(&rth->u.dst);
                                                     /*
                                                      * Never go below
                                                      * ip_rt_min_pmtu; when
                                                      * clamping, also set the
                                                      * RTAX_MTU lock bit.
                                                      */
1442                                                 if (mtu < ip_rt_min_pmtu) {
1443                                                         mtu = ip_rt_min_pmtu;
1444                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1445                                                                 (1 << RTAX_MTU);
1446                                                 }
1447                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1448                                                 dst_set_expires(&rth->u.dst,
1449                                                         ip_rt_mtu_expires);
1450                                         }
1451                                         est_mtu = mtu;
1452                                 }
1453                         }
1454                 }
1455                 rcu_read_unlock();
1456         }
             /* GNU "?:" shorthand: est_mtu if it is non-zero, otherwise new_mtu. */
1457         return est_mtu ? : new_mtu;
1458 }
1459
1460 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1461 {
1462         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1463             !(dst_metric_locked(dst, RTAX_MTU))) {
1464                 if (mtu < ip_rt_min_pmtu) {
1465                         mtu = ip_rt_min_pmtu;
1466                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1467                 }
1468                 dst->metrics[RTAX_MTU-1] = mtu;
1469                 dst_set_expires(dst, ip_rt_mtu_expires);
1470                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1471         }
1472 }
1473
1474 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1475 {
             /* Always returns NULL; neither dst nor cookie is examined. */
1476         return NULL;
1477 }
1478
1479 static void ipv4_dst_destroy(struct dst_entry *dst)
1480 {
1481         struct rtable *rt = (struct rtable *) dst;
1482         struct inet_peer *peer = rt->peer;
1483         struct in_device *idev = rt->idev;
1484
1485         if (peer) {
1486                 rt->peer = NULL;
1487                 inet_putpeer(peer);
1488         }
1489
1490         if (idev) {
1491                 rt->idev = NULL;
1492                 in_dev_put(idev);
1493         }
1494 }
1495
1496 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1497                             int how)
1498 {
1499         struct rtable *rt = (struct rtable *) dst;
1500         struct in_device *idev = rt->idev;
1501         if (dev != &loopback_dev && idev && idev->dev == dev) {
1502                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1503                 if (loopback_idev) {
1504                         rt->idev = loopback_idev;
1505                         in_dev_put(idev);
1506                 }
1507         }
1508 }
1509
1510 static void ipv4_link_failure(struct sk_buff *skb)
1511 {
1512         struct rtable *rt;
1513
1514         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1515
1516         rt = (struct rtable *) skb->dst;
1517         if (rt)
1518                 dst_set_expires(&rt->u.dst, 0);
1519 }
1520
1521 static int ip_rt_bug(struct sk_buff *skb)
1522 {
1523         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1524                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1525                 skb->dev ? skb->dev->name : "?");
1526         kfree_skb(skb);
1527         return 0;
1528 }
1529
1530 /*
1531    We do not cache source address of outgoing interface,
1532    because it is used only by IP RR, TS and SRR options,
1533    so that it out of fast path.
1534
1535    BTW remember: "addr" is allowed to be not aligned
1536    in IP options!
1537  */
1538
1539 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1540 {
1541         __be32 src;
1542         struct fib_result res;
1543
1544         if (rt->fl.iif == 0)
1545                 src = rt->rt_src;
1546         else if (fib_lookup(&rt->fl, &res) == 0) {
1547                 src = FIB_RES_PREFSRC(res);
1548                 fib_res_put(&res);
1549         } else
1550                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1551                                         RT_SCOPE_UNIVERSE);
1552         memcpy(addr, &src, 4);
1553 }
1554
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Fill in whichever 16-bit half of the route's tclassid is still zero
 * with the corresponding half of @tag.  A half that is already set is
 * left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	static const u32 half_mask[] = { 0xFFFF, 0xFFFF0000 };
	int n;

	for (n = 0; n < 2; n++) {
		if (!(rt->u.dst.tclassid & half_mask[n]))
			rt->u.dst.tclassid |= tag & half_mask[n];
	}
}
#endif
1564
1565 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1566 {
1567         struct fib_info *fi = res->fi;
1568
1569         if (fi) {
1570                 if (FIB_RES_GW(*res) &&
1571                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1572                         rt->rt_gateway = FIB_RES_GW(*res);
1573                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1574                        sizeof(rt->u.dst.metrics));
1575                 if (fi->fib_mtu == 0) {
1576                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1577                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1578                             rt->rt_gateway != rt->rt_dst &&
1579                             rt->u.dst.dev->mtu > 576)
1580                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1581                 }
1582 #ifdef CONFIG_NET_CLS_ROUTE
1583                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1584 #endif
1585         } else
1586                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1587
1588         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1589                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1590         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1591                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1592         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1593                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1594                                        ip_rt_min_advmss);
1595         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1596                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1597
1598 #ifdef CONFIG_NET_CLS_ROUTE
1599 #ifdef CONFIG_IP_MULTIPLE_TABLES
1600         set_class_tag(rt, fib_rules_tclass(res));
1601 #endif
1602         set_class_tag(rt, itag);
1603 #endif
1604         rt->rt_type = res->type;
1605 }
1606
1607 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1608                                 u8 tos, struct net_device *dev, int our)
1609 {
1610         unsigned hash;
1611         struct rtable *rth;
1612         __be32 spec_dst;
1613         struct in_device *in_dev = in_dev_get(dev);
1614         u32 itag = 0;
1615
1616         /* Primary sanity checks. */
1617
1618         if (in_dev == NULL)
1619                 return -EINVAL;
1620
1621         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1622             skb->protocol != htons(ETH_P_IP))
1623                 goto e_inval;
1624
1625         if (ZERONET(saddr)) {
1626                 if (!LOCAL_MCAST(daddr))
1627                         goto e_inval;
1628                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1629         } else if (fib_validate_source(saddr, 0, tos, 0,
1630                                         dev, &spec_dst, &itag) < 0)
1631                 goto e_inval;
1632
1633         rth = dst_alloc(&ipv4_dst_ops);
1634         if (!rth)
1635                 goto e_nobufs;
1636
1637         rth->u.dst.output= ip_rt_bug;
1638
1639         atomic_set(&rth->u.dst.__refcnt, 1);
1640         rth->u.dst.flags= DST_HOST;
1641         if (in_dev->cnf.no_policy)
1642                 rth->u.dst.flags |= DST_NOPOLICY;
1643         rth->fl.fl4_dst = daddr;
1644         rth->rt_dst     = daddr;
1645         rth->fl.fl4_tos = tos;
1646 #ifdef CONFIG_IP_ROUTE_FWMARK
1647         rth->fl.fl4_fwmark= skb->mark;
1648 #endif
1649         rth->fl.fl4_src = saddr;
1650         rth->rt_src     = saddr;
1651 #ifdef CONFIG_NET_CLS_ROUTE
1652         rth->u.dst.tclassid = itag;
1653 #endif
1654         rth->rt_iif     =
1655         rth->fl.iif     = dev->ifindex;
1656         rth->u.dst.dev  = &loopback_dev;
1657         dev_hold(rth->u.dst.dev);
1658         rth->idev       = in_dev_get(rth->u.dst.dev);
1659         rth->fl.oif     = 0;
1660         rth->rt_gateway = daddr;
1661         rth->rt_spec_dst= spec_dst;
1662         rth->rt_type    = RTN_MULTICAST;
1663         rth->rt_flags   = RTCF_MULTICAST;
1664         if (our) {
1665                 rth->u.dst.input= ip_local_deliver;
1666                 rth->rt_flags |= RTCF_LOCAL;
1667         }
1668
1669 #ifdef CONFIG_IP_MROUTE
1670         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1671                 rth->u.dst.input = ip_mr_input;
1672 #endif
1673         RT_CACHE_STAT_INC(in_slow_mc);
1674
1675         in_dev_put(in_dev);
1676         hash = rt_hash(daddr, saddr, dev->ifindex);
1677         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1678
1679 e_nobufs:
1680         in_dev_put(in_dev);
1681         return -ENOBUFS;
1682
1683 e_inval:
1684         in_dev_put(in_dev);
1685         return -EINVAL;
1686 }
1687
1688
1689 static void ip_handle_martian_source(struct net_device *dev,
1690                                      struct in_device *in_dev,
1691                                      struct sk_buff *skb,
1692                                      __be32 daddr,
1693                                      __be32 saddr)
1694 {
1695         RT_CACHE_STAT_INC(in_martian_src);
1696 #ifdef CONFIG_IP_ROUTE_VERBOSE
1697         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1698                 /*
1699                  *      RFC1812 recommendation, if source is martian,
1700                  *      the only hint is MAC header.
1701                  */
1702                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1703                         "%u.%u.%u.%u, on dev %s\n",
1704                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1705                 if (dev->hard_header_len && skb->mac.raw) {
1706                         int i;
1707                         unsigned char *p = skb->mac.raw;
1708                         printk(KERN_WARNING "ll header: ");
1709                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1710                                 printk("%02x", *p);
1711                                 if (i < (dev->hard_header_len - 1))
1712                                         printk(":");
1713                         }
1714                         printk("\n");
1715                 }
1716         }
1717 #endif
1718 }
1719
1720 static inline int __mkroute_input(struct sk_buff *skb,
1721                                   struct fib_result* res,
1722                                   struct in_device *in_dev,
1723                                   __be32 daddr, __be32 saddr, u32 tos,
1724                                   struct rtable **result)
1725 {
1726
1727         struct rtable *rth;
1728         int err;
1729         struct in_device *out_dev;
1730         unsigned flags = 0;
1731         __be32 spec_dst;
1732         u32 itag;
1733
1734         /* get a working reference to the output device */
1735         out_dev = in_dev_get(FIB_RES_DEV(*res));
1736         if (out_dev == NULL) {
1737                 if (net_ratelimit())
1738                         printk(KERN_CRIT "Bug in ip_route_input" \
1739                                "_slow(). Please, report\n");
1740                 return -EINVAL;
1741         }
1742
1743
1744         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1745                                   in_dev->dev, &spec_dst, &itag);
1746         if (err < 0) {
1747                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1748                                          saddr);
1749
1750                 err = -EINVAL;
1751                 goto cleanup;
1752         }
1753
1754         if (err)
1755                 flags |= RTCF_DIRECTSRC;
1756
1757         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1758             (IN_DEV_SHARED_MEDIA(out_dev) ||
1759              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1760                 flags |= RTCF_DOREDIRECT;
1761
1762         if (skb->protocol != htons(ETH_P_IP)) {
1763                 /* Not IP (i.e. ARP). Do not create route, if it is
1764                  * invalid for proxy arp. DNAT routes are always valid.
1765                  */
1766                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1767                         err = -EINVAL;
1768                         goto cleanup;
1769                 }
1770         }
1771
1772
1773         rth = dst_alloc(&ipv4_dst_ops);
1774         if (!rth) {
1775                 err = -ENOBUFS;
1776                 goto cleanup;
1777         }
1778
1779         atomic_set(&rth->u.dst.__refcnt, 1);
1780         rth->u.dst.flags= DST_HOST;
1781 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1782         if (res->fi->fib_nhs > 1)
1783                 rth->u.dst.flags |= DST_BALANCED;
1784 #endif
1785         if (in_dev->cnf.no_policy)
1786                 rth->u.dst.flags |= DST_NOPOLICY;
1787         if (in_dev->cnf.no_xfrm)
1788                 rth->u.dst.flags |= DST_NOXFRM;
1789         rth->fl.fl4_dst = daddr;
1790         rth->rt_dst     = daddr;
1791         rth->fl.fl4_tos = tos;
1792 #ifdef CONFIG_IP_ROUTE_FWMARK
1793         rth->fl.fl4_fwmark= skb->mark;
1794 #endif
1795         rth->fl.fl4_src = saddr;
1796         rth->rt_src     = saddr;
1797         rth->rt_gateway = daddr;
1798         rth->rt_iif     =
1799                 rth->fl.iif     = in_dev->dev->ifindex;
1800         rth->u.dst.dev  = (out_dev)->dev;
1801         dev_hold(rth->u.dst.dev);
1802         rth->idev       = in_dev_get(rth->u.dst.dev);
1803         rth->fl.oif     = 0;
1804         rth->rt_spec_dst= spec_dst;
1805
1806         rth->u.dst.input = ip_forward;
1807         rth->u.dst.output = ip_output;
1808
1809         rt_set_nexthop(rth, res, itag);
1810
1811         rth->rt_flags = flags;
1812
1813         *result = rth;
1814         err = 0;
1815  cleanup:
1816         /* release the working reference to the output device */
1817         in_dev_put(out_dev);
1818         return err;
1819 }
1820
1821 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1822                                        struct fib_result* res,
1823                                        const struct flowi *fl,
1824                                        struct in_device *in_dev,
1825                                        __be32 daddr, __be32 saddr, u32 tos)
1826 {
1827         struct rtable* rth = NULL;
1828         int err;
1829         unsigned hash;
1830
1831 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1832         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1833                 fib_select_multipath(fl, res);
1834 #endif
1835
1836         /* create a routing cache entry */
1837         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1838         if (err)
1839                 return err;
1840
1841         /* put it into the cache */
1842         hash = rt_hash(daddr, saddr, fl->iif);
1843         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1844 }
1845
1846 static inline int ip_mkroute_input(struct sk_buff *skb,
1847                                    struct fib_result* res,
1848                                    const struct flowi *fl,
1849                                    struct in_device *in_dev,
1850                                    __be32 daddr, __be32 saddr, u32 tos)
1851 {
1852 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1853         struct rtable* rth = NULL, *rtres;
1854         unsigned char hop, hopcount;
1855         int err = -EINVAL;
1856         unsigned int hash;
1857
1858         if (res->fi)
1859                 hopcount = res->fi->fib_nhs;
1860         else
1861                 hopcount = 1;
1862
1863         /* distinguish between multipath and singlepath */
1864         if (hopcount < 2)
1865                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1866                                             saddr, tos);
1867
1868         /* add all alternatives to the routing cache */
1869         for (hop = 0; hop < hopcount; hop++) {
1870                 res->nh_sel = hop;
1871
1872                 /* put reference to previous result */
1873                 if (hop)
1874                         ip_rt_put(rtres);
1875
1876                 /* create a routing cache entry */
1877                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1878                                       &rth);
1879                 if (err)
1880                         return err;
1881
1882                 /* put it into the cache */
1883                 hash = rt_hash(daddr, saddr, fl->iif);
1884                 err = rt_intern_hash(hash, rth, &rtres);
1885                 if (err)
1886                         return err;
1887
1888                 /* forward hop information to multipath impl. */
1889                 multipath_set_nhinfo(rth,
1890                                      FIB_RES_NETWORK(*res),
1891                                      FIB_RES_NETMASK(*res),
1892                                      res->prefixlen,
1893                                      &FIB_RES_NH(*res));
1894         }
1895         skb->dst = &rtres->u.dst;
1896         return err;
1897 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1898         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1899 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1900 }
1901
1902
1903 /*
1904  *      NOTE. We drop all the packets that has local source
1905  *      addresses, because every properly looped back packet
1906  *      must have correct destination already attached by output routine.
1907  *
1908  *      Such approach solves two big problems:
1909  *      1. Not simplex devices are handled properly.
1910  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1911  */
1912
1913 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1914                                u8 tos, struct net_device *dev)
1915 {
1916         struct fib_result res;
1917         struct in_device *in_dev = in_dev_get(dev);
1918         struct flowi fl = { .nl_u = { .ip4_u =
1919                                       { .daddr = daddr,
1920                                         .saddr = saddr,
1921                                         .tos = tos,
1922                                         .scope = RT_SCOPE_UNIVERSE,
1923 #ifdef CONFIG_IP_ROUTE_FWMARK
1924                                         .fwmark = skb->mark
1925 #endif
1926                                       } },
1927                             .iif = dev->ifindex };
1928         unsigned        flags = 0;
1929         u32             itag = 0;
1930         struct rtable * rth;
1931         unsigned        hash;
1932         __be32          spec_dst;
1933         int             err = -EINVAL;
1934         int             free_res = 0;
1935
1936         /* IP on this device is disabled. */
1937
1938         if (!in_dev)
1939                 goto out;
1940
1941         /* Check for the most weird martians, which can be not detected
1942            by fib_lookup.
1943          */
1944
1945         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1946                 goto martian_source;
1947
1948         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1949                 goto brd_input;
1950
1951         /* Accept zero addresses only to limited broadcast;
1952          * I even do not know to fix it or not. Waiting for complains :-)
1953          */
1954         if (ZERONET(saddr))
1955                 goto martian_source;
1956
1957         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1958                 goto martian_destination;
1959
1960         /*
1961          *      Now we are ready to route packet.
1962          */
1963         if ((err = fib_lookup(&fl, &res)) != 0) {
1964                 if (!IN_DEV_FORWARD(in_dev))
1965                         goto e_hostunreach;
1966                 goto no_route;
1967         }
1968         free_res = 1;
1969
1970         RT_CACHE_STAT_INC(in_slow_tot);
1971
1972         if (res.type == RTN_BROADCAST)
1973                 goto brd_input;
1974
1975         if (res.type == RTN_LOCAL) {
1976                 int result;
1977                 result = fib_validate_source(saddr, daddr, tos,
1978                                              loopback_dev.ifindex,
1979                                              dev, &spec_dst, &itag);
1980                 if (result < 0)
1981                         goto martian_source;
1982                 if (result)
1983                         flags |= RTCF_DIRECTSRC;
1984                 spec_dst = daddr;
1985                 goto local_input;
1986         }
1987
1988         if (!IN_DEV_FORWARD(in_dev))
1989                 goto e_hostunreach;
1990         if (res.type != RTN_UNICAST)
1991                 goto martian_destination;
1992
1993         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1994         if (err == -ENOBUFS)
1995                 goto e_nobufs;
1996         if (err == -EINVAL)
1997                 goto e_inval;
1998
1999 done:
2000         in_dev_put(in_dev);
2001         if (free_res)
2002                 fib_res_put(&res);
2003 out:    return err;
2004
2005 brd_input:
2006         if (skb->protocol != htons(ETH_P_IP))
2007                 goto e_inval;
2008
2009         if (ZERONET(saddr))
2010                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2011         else {
2012                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2013                                           &itag);
2014                 if (err < 0)
2015                         goto martian_source;
2016                 if (err)
2017                         flags |= RTCF_DIRECTSRC;
2018         }
2019         flags |= RTCF_BROADCAST;
2020         res.type = RTN_BROADCAST;
2021         RT_CACHE_STAT_INC(in_brd);
2022
2023 local_input:
2024         rth = dst_alloc(&ipv4_dst_ops);
2025         if (!rth)
2026                 goto e_nobufs;
2027
2028         rth->u.dst.output= ip_rt_bug;
2029
2030         atomic_set(&rth->u.dst.__refcnt, 1);
2031         rth->u.dst.flags= DST_HOST;
2032         if (in_dev->cnf.no_policy)
2033                 rth->u.dst.flags |= DST_NOPOLICY;
2034         rth->fl.fl4_dst = daddr;
2035         rth->rt_dst     = daddr;
2036         rth->fl.fl4_tos = tos;
2037 #ifdef CONFIG_IP_ROUTE_FWMARK
2038         rth->fl.fl4_fwmark= skb->mark;
2039 #endif
2040         rth->fl.fl4_src = saddr;
2041         rth->rt_src     = saddr;
2042 #ifdef CONFIG_NET_CLS_ROUTE
2043         rth->u.dst.tclassid = itag;
2044 #endif
2045         rth->rt_iif     =
2046         rth->fl.iif     = dev->ifindex;
2047         rth->u.dst.dev  = &loopback_dev;
2048         dev_hold(rth->u.dst.dev);
2049         rth->idev       = in_dev_get(rth->u.dst.dev);
2050         rth->rt_gateway = daddr;
2051         rth->rt_spec_dst= spec_dst;
2052         rth->u.dst.input= ip_local_deliver;
2053         rth->rt_flags   = flags|RTCF_LOCAL;
2054         if (res.type == RTN_UNREACHABLE) {
2055                 rth->u.dst.input= ip_error;
2056                 rth->u.dst.error= -err;
2057                 rth->rt_flags   &= ~RTCF_LOCAL;
2058         }
2059         rth->rt_type    = res.type;
2060         hash = rt_hash(daddr, saddr, fl.iif);
2061         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2062         goto done;
2063
2064 no_route:
2065         RT_CACHE_STAT_INC(in_no_route);
2066         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2067         res.type = RTN_UNREACHABLE;
2068         goto local_input;
2069
2070         /*
2071          *      Do not cache martian addresses: they should be logged (RFC1812)
2072          */
2073 martian_destination:
2074         RT_CACHE_STAT_INC(in_martian_dst);
2075 #ifdef CONFIG_IP_ROUTE_VERBOSE
2076         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2077                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2078                         "%u.%u.%u.%u, dev %s\n",
2079                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2080 #endif
2081
2082 e_hostunreach:
2083         err = -EHOSTUNREACH;
2084         goto done;
2085
2086 e_inval:
2087         err = -EINVAL;
2088         goto done;
2089
2090 e_nobufs:
2091         err = -ENOBUFS;
2092         goto done;
2093
2094 martian_source:
2095         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2096         goto e_inval;
2097 }
2098
2099 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2100                    u8 tos, struct net_device *dev)
2101 {
2102         struct rtable * rth;
2103         unsigned        hash;
2104         int iif = dev->ifindex;
2105
2106         tos &= IPTOS_RT_MASK;
2107         hash = rt_hash(daddr, saddr, iif);
2108
2109         rcu_read_lock();
2110         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2111              rth = rcu_dereference(rth->u.rt_next)) {
2112                 if (rth->fl.fl4_dst == daddr &&
2113                     rth->fl.fl4_src == saddr &&
2114                     rth->fl.iif == iif &&
2115                     rth->fl.oif == 0 &&
2116 #ifdef CONFIG_IP_ROUTE_FWMARK
2117                     rth->fl.fl4_fwmark == skb->mark &&
2118 #endif
2119                     rth->fl.fl4_tos == tos) {
2120                         rth->u.dst.lastuse = jiffies;
2121                         dst_hold(&rth->u.dst);
2122                         rth->u.dst.__use++;
2123                         RT_CACHE_STAT_INC(in_hit);
2124                         rcu_read_unlock();
2125                         skb->dst = (struct dst_entry*)rth;
2126                         return 0;
2127                 }
2128                 RT_CACHE_STAT_INC(in_hlist_search);
2129         }
2130         rcu_read_unlock();
2131
2132         /* Multicast recognition logic is moved from route cache to here.
2133            The problem was that too many Ethernet cards have broken/missing
2134            hardware multicast filters :-( As result the host on multicasting
2135            network acquires a lot of useless route cache entries, sort of
2136            SDR messages from all the world. Now we try to get rid of them.
2137            Really, provided software IP multicast filter is organized
2138            reasonably (at least, hashed), it does not result in a slowdown
2139            comparing with route cache reject entries.
2140            Note, that multicast routers are not affected, because
2141            route cache entry is created eventually.
2142          */
2143         if (MULTICAST(daddr)) {
2144                 struct in_device *in_dev;
2145
2146                 rcu_read_lock();
2147                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2148                         int our = ip_check_mc(in_dev, daddr, saddr,
2149                                 skb->nh.iph->protocol);
2150                         if (our
2151 #ifdef CONFIG_IP_MROUTE
2152                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2153 #endif
2154                             ) {
2155                                 rcu_read_unlock();
2156                                 return ip_route_input_mc(skb, daddr, saddr,
2157                                                          tos, dev, our);
2158                         }
2159                 }
2160                 rcu_read_unlock();
2161                 return -EINVAL;
2162         }
2163         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2164 }
2165
2166 static inline int __mkroute_output(struct rtable **result,
2167                                    struct fib_result* res,
2168                                    const struct flowi *fl,
2169                                    const struct flowi *oldflp,
2170                                    struct net_device *dev_out,
2171                                    unsigned flags)
2172 {
2173         struct rtable *rth;
2174         struct in_device *in_dev;
2175         u32 tos = RT_FL_TOS(oldflp);
2176         int err = 0;
2177
2178         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2179                 return -EINVAL;
2180
2181         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2182                 res->type = RTN_BROADCAST;
2183         else if (MULTICAST(fl->fl4_dst))
2184                 res->type = RTN_MULTICAST;
2185         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2186                 return -EINVAL;
2187
2188         if (dev_out->flags & IFF_LOOPBACK)
2189                 flags |= RTCF_LOCAL;
2190
2191         /* get work reference to inet device */
2192         in_dev = in_dev_get(dev_out);
2193         if (!in_dev)
2194                 return -EINVAL;
2195
2196         if (res->type == RTN_BROADCAST) {
2197                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2198                 if (res->fi) {
2199                         fib_info_put(res->fi);
2200                         res->fi = NULL;
2201                 }
2202         } else if (res->type == RTN_MULTICAST) {
2203                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2204                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2205                                  oldflp->proto))
2206                         flags &= ~RTCF_LOCAL;
2207                 /* If multicast route do not exist use
2208                    default one, but do not gateway in this case.
2209                    Yes, it is hack.
2210                  */
2211                 if (res->fi && res->prefixlen < 4) {
2212                         fib_info_put(res->fi);
2213                         res->fi = NULL;
2214                 }
2215         }
2216
2217
2218         rth = dst_alloc(&ipv4_dst_ops);
2219         if (!rth) {
2220                 err = -ENOBUFS;
2221                 goto cleanup;
2222         }
2223
2224         atomic_set(&rth->u.dst.__refcnt, 1);
2225         rth->u.dst.flags= DST_HOST;
2226 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2227         if (res->fi) {
2228                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2229                 if (res->fi->fib_nhs > 1)
2230                         rth->u.dst.flags |= DST_BALANCED;
2231         }
2232 #endif
2233         if (in_dev->cnf.no_xfrm)
2234                 rth->u.dst.flags |= DST_NOXFRM;
2235         if (in_dev->cnf.no_policy)
2236                 rth->u.dst.flags |= DST_NOPOLICY;
2237
2238         rth->fl.fl4_dst = oldflp->fl4_dst;
2239         rth->fl.fl4_tos = tos;
2240         rth->fl.fl4_src = oldflp->fl4_src;
2241         rth->fl.oif     = oldflp->oif;
2242 #ifdef CONFIG_IP_ROUTE_FWMARK
2243         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2244 #endif
2245         rth->rt_dst     = fl->fl4_dst;
2246         rth->rt_src     = fl->fl4_src;
2247         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2248         /* get references to the devices that are to be hold by the routing
2249            cache entry */
2250         rth->u.dst.dev  = dev_out;
2251         dev_hold(dev_out);
2252         rth->idev       = in_dev_get(dev_out);
2253         rth->rt_gateway = fl->fl4_dst;
2254         rth->rt_spec_dst= fl->fl4_src;
2255
2256         rth->u.dst.output=ip_output;
2257
2258         RT_CACHE_STAT_INC(out_slow_tot);
2259
2260         if (flags & RTCF_LOCAL) {
2261                 rth->u.dst.input = ip_local_deliver;
2262                 rth->rt_spec_dst = fl->fl4_dst;
2263         }
2264         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2265                 rth->rt_spec_dst = fl->fl4_src;
2266                 if (flags & RTCF_LOCAL &&
2267                     !(dev_out->flags & IFF_LOOPBACK)) {
2268                         rth->u.dst.output = ip_mc_output;
2269                         RT_CACHE_STAT_INC(out_slow_mc);
2270                 }
2271 #ifdef CONFIG_IP_MROUTE
2272                 if (res->type == RTN_MULTICAST) {
2273                         if (IN_DEV_MFORWARD(in_dev) &&
2274                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2275                                 rth->u.dst.input = ip_mr_input;
2276                                 rth->u.dst.output = ip_mc_output;
2277                         }
2278                 }
2279 #endif
2280         }
2281
2282         rt_set_nexthop(rth, res, 0);
2283
2284         rth->rt_flags = flags;
2285
2286         *result = rth;
2287  cleanup:
2288         /* release work reference to inet device */
2289         in_dev_put(in_dev);
2290
2291         return err;
2292 }
2293
2294 static inline int ip_mkroute_output_def(struct rtable **rp,
2295                                         struct fib_result* res,
2296                                         const struct flowi *fl,
2297                                         const struct flowi *oldflp,
2298                                         struct net_device *dev_out,
2299                                         unsigned flags)
2300 {
2301         struct rtable *rth = NULL;
2302         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2303         unsigned hash;
2304         if (err == 0) {
2305                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2306                 err = rt_intern_hash(hash, rth, rp);
2307         }
2308
2309         return err;
2310 }
2311
/*
 * Create the output route cache entry (or entries) for a FIB result.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED and a result that has more than
 * one nexthop, one cache entry is built and interned per nexthop, each
 * on that nexthop's own device, and each is reported to the multipath
 * implementation; *rp ends up holding the entry interned last.  In every
 * other case the work is delegated to ip_mkroute_output_def().
 *
 * Returns 0 on success or the first error from __mkroute_output() or
 * rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
					oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Interning failed: do not hand rth on to the
			 * multipath code, exactly as ip_mkroute_input()
			 * does on the input side.
			 * NOTE(review): rt_intern_hash() is defined outside
			 * this view; presumably it disposes of the entry on
			 * failure, which would make rth dangling here.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2373
2374 /*
2375  * Major route resolver routine.
2376  */
2377
2378 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2379 {
2380         u32 tos = RT_FL_TOS(oldflp);
2381         struct flowi fl = { .nl_u = { .ip4_u =
2382                                       { .daddr = oldflp->fl4_dst,
2383                                         .saddr = oldflp->fl4_src,
2384                                         .tos = tos & IPTOS_RT_MASK,
2385                                         .scope = ((tos & RTO_ONLINK) ?
2386                                                   RT_SCOPE_LINK :
2387                                                   RT_SCOPE_UNIVERSE),
2388 #ifdef CONFIG_IP_ROUTE_FWMARK
2389                                         .fwmark = oldflp->fl4_fwmark
2390 #endif
2391                                       } },
2392                             .iif = loopback_dev.ifindex,
2393                             .oif = oldflp->oif };
2394         struct fib_result res;
2395         unsigned flags = 0;
2396         struct net_device *dev_out = NULL;
2397         int free_res = 0;
2398         int err;
2399
2400
2401         res.fi          = NULL;
2402 #ifdef CONFIG_IP_MULTIPLE_TABLES
2403         res.r           = NULL;
2404 #endif
2405
2406         if (oldflp->fl4_src) {
2407                 err = -EINVAL;
2408                 if (MULTICAST(oldflp->fl4_src) ||
2409                     BADCLASS(oldflp->fl4_src) ||
2410                     ZERONET(oldflp->fl4_src))
2411                         goto out;
2412
2413                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2414                 dev_out = ip_dev_find(oldflp->fl4_src);
2415                 if (dev_out == NULL)
2416                         goto out;
2417
2418                 /* I removed check for oif == dev_out->oif here.
2419                    It was wrong for two reasons:
2420                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2421                       assigned to multiple interfaces.
2422                    2. Moreover, we are allowed to send packets with saddr
2423                       of another iface. --ANK
2424                  */
2425
2426                 if (oldflp->oif == 0
2427                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2428                         /* Special hack: user can direct multicasts
2429                            and limited broadcast via necessary interface
2430                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2431                            This hack is not just for fun, it allows
2432                            vic,vat and friends to work.
2433                            They bind socket to loopback, set ttl to zero
2434                            and expect that it will work.
2435                            From the viewpoint of routing cache they are broken,
2436                            because we are not allowed to build multicast path
2437                            with loopback source addr (look, routing cache
2438                            cannot know, that ttl is zero, so that packet
2439                            will not leave this host and route is valid).
2440                            Luckily, this hack is good workaround.
2441                          */
2442
2443                         fl.oif = dev_out->ifindex;
2444                         goto make_route;
2445                 }
2446                 if (dev_out)
2447                         dev_put(dev_out);
2448                 dev_out = NULL;
2449         }
2450
2451
2452         if (oldflp->oif) {
2453                 dev_out = dev_get_by_index(oldflp->oif);
2454                 err = -ENODEV;
2455                 if (dev_out == NULL)
2456                         goto out;
2457
2458                 /* RACE: Check return value of inet_select_addr instead. */
2459                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2460                         dev_put(dev_out);
2461                         goto out;       /* Wrong error code */
2462                 }
2463
2464                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2465                         if (!fl.fl4_src)
2466                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2467                                                               RT_SCOPE_LINK);
2468                         goto make_route;
2469                 }
2470                 if (!fl.fl4_src) {
2471                         if (MULTICAST(oldflp->fl4_dst))
2472                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2473                                                               fl.fl4_scope);
2474                         else if (!oldflp->fl4_dst)
2475                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2476                                                               RT_SCOPE_HOST);
2477                 }
2478         }
2479
2480         if (!fl.fl4_dst) {
2481                 fl.fl4_dst = fl.fl4_src;
2482                 if (!fl.fl4_dst)
2483                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2484                 if (dev_out)
2485                         dev_put(dev_out);
2486                 dev_out = &loopback_dev;
2487                 dev_hold(dev_out);
2488                 fl.oif = loopback_dev.ifindex;
2489                 res.type = RTN_LOCAL;
2490                 flags |= RTCF_LOCAL;
2491                 goto make_route;
2492         }
2493
2494         if (fib_lookup(&fl, &res)) {
2495                 res.fi = NULL;
2496                 if (oldflp->oif) {
2497                         /* Apparently, routing tables are wrong. Assume,
2498                            that the destination is on link.
2499
2500                            WHY? DW.
2501                            Because we are allowed to send to iface
2502                            even if it has NO routes and NO assigned
2503                            addresses. When oif is specified, routing
2504                            tables are looked up with only one purpose:
2505                            to catch if destination is gatewayed, rather than
2506                            direct. Moreover, if MSG_DONTROUTE is set,
2507                            we send packet, ignoring both routing tables
2508                            and ifaddr state. --ANK
2509
2510
2511                            We could make it even if oif is unknown,
2512                            likely IPv6, but we do not.
2513                          */
2514
2515                         if (fl.fl4_src == 0)
2516                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2517                                                               RT_SCOPE_LINK);
2518                         res.type = RTN_UNICAST;
2519                         goto make_route;
2520                 }
2521                 if (dev_out)
2522                         dev_put(dev_out);
2523                 err = -ENETUNREACH;
2524                 goto out;
2525         }
2526         free_res = 1;
2527
2528         if (res.type == RTN_LOCAL) {
2529                 if (!fl.fl4_src)
2530                         fl.fl4_src = fl.fl4_dst;
2531                 if (dev_out)
2532                         dev_put(dev_out);
2533                 dev_out = &loopback_dev;
2534                 dev_hold(dev_out);
2535                 fl.oif = dev_out->ifindex;
2536                 if (res.fi)
2537                         fib_info_put(res.fi);
2538                 res.fi = NULL;
2539                 flags |= RTCF_LOCAL;
2540                 goto make_route;
2541         }
2542
2543 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2544         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2545                 fib_select_multipath(&fl, &res);
2546         else
2547 #endif
2548         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2549                 fib_select_default(&fl, &res);
2550
2551         if (!fl.fl4_src)
2552                 fl.fl4_src = FIB_RES_PREFSRC(res);
2553
2554         if (dev_out)
2555                 dev_put(dev_out);
2556         dev_out = FIB_RES_DEV(res);
2557         dev_hold(dev_out);
2558         fl.oif = dev_out->ifindex;
2559
2560
2561 make_route:
2562         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2563
2564
2565         if (free_res)
2566                 fib_res_put(&res);
2567         if (dev_out)
2568                 dev_put(dev_out);
2569 out:    return err;
2570 }
2571
2572 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2573 {
2574         unsigned hash;
2575         struct rtable *rth;
2576
2577         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2578
2579         rcu_read_lock_bh();
2580         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2581                 rth = rcu_dereference(rth->u.rt_next)) {
2582                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2583                     rth->fl.fl4_src == flp->fl4_src &&
2584                     rth->fl.iif == 0 &&
2585                     rth->fl.oif == flp->oif &&
2586 #ifdef CONFIG_IP_ROUTE_FWMARK
2587                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2588 #endif
2589                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2590                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2591
2592                         /* check for multipath routes and choose one if
2593                          * necessary
2594                          */
2595                         if (multipath_select_route(flp, rth, rp)) {
2596                                 dst_hold(&(*rp)->u.dst);
2597                                 RT_CACHE_STAT_INC(out_hit);
2598                                 rcu_read_unlock_bh();
2599                                 return 0;
2600                         }
2601
2602                         rth->u.dst.lastuse = jiffies;
2603                         dst_hold(&rth->u.dst);
2604                         rth->u.dst.__use++;
2605                         RT_CACHE_STAT_INC(out_hit);
2606                         rcu_read_unlock_bh();
2607                         *rp = rth;
2608                         return 0;
2609                 }
2610                 RT_CACHE_STAT_INC(out_hlist_search);
2611         }
2612         rcu_read_unlock_bh();
2613
2614         return ip_route_output_slow(rp, flp);
2615 }
2616
2617 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2618
2619 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2620 {
2621         int err;
2622
2623         if ((err = __ip_route_output_key(rp, flp)) != 0)
2624                 return err;
2625
2626         if (flp->proto) {
2627                 if (!flp->fl4_src)
2628                         flp->fl4_src = (*rp)->rt_src;
2629                 if (!flp->fl4_dst)
2630                         flp->fl4_dst = (*rp)->rt_dst;
2631                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2632         }
2633
2634         return 0;
2635 }
2636
2637 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2638
2639 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2640 {
2641         return ip_route_output_flow(rp, flp, NULL, 0);
2642 }
2643
2644 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2645                         int nowait, unsigned int flags)
2646 {
2647         struct rtable *rt = (struct rtable*)skb->dst;
2648         struct rtmsg *r;
2649         struct nlmsghdr *nlh;
2650         struct rta_cacheinfo ci;
2651
2652         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2653         if (nlh == NULL)
2654                 return -ENOBUFS;
2655
2656         r = nlmsg_data(nlh);
2657         r->rtm_family    = AF_INET;
2658         r->rtm_dst_len  = 32;
2659         r->rtm_src_len  = 0;
2660         r->rtm_tos      = rt->fl.fl4_tos;
2661         r->rtm_table    = RT_TABLE_MAIN;
2662         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2663         r->rtm_type     = rt->rt_type;
2664         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2665         r->rtm_protocol = RTPROT_UNSPEC;
2666         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2667         if (rt->rt_flags & RTCF_NOTIFY)
2668                 r->rtm_flags |= RTM_F_NOTIFY;
2669
2670         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2671
2672         if (rt->fl.fl4_src) {
2673                 r->rtm_src_len = 32;
2674                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2675         }
2676         if (rt->u.dst.dev)
2677                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2678 #ifdef CONFIG_NET_CLS_ROUTE
2679         if (rt->u.dst.tclassid)
2680                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2681 #endif
2682 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2683         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2684                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2685 #endif
2686         if (rt->fl.iif)
2687                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2688         else if (rt->rt_src != rt->fl.fl4_src)
2689                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2690
2691         if (rt->rt_dst != rt->rt_gateway)
2692                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2693
2694         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2695                 goto nla_put_failure;
2696
2697         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2698         ci.rta_used     = rt->u.dst.__use;
2699         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2700         if (rt->u.dst.expires)
2701                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2702         else
2703                 ci.rta_expires = 0;
2704         ci.rta_error    = rt->u.dst.error;
2705         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2706         if (rt->peer) {
2707                 ci.rta_id = rt->peer->ip_id_count;
2708                 if (rt->peer->tcp_ts_stamp) {
2709                         ci.rta_ts = rt->peer->tcp_ts;
2710                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2711                 }
2712         }
2713
2714         if (rt->fl.iif) {
2715 #ifdef CONFIG_IP_MROUTE
2716                 __be32 dst = rt->rt_dst;
2717
2718                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2719                     ipv4_devconf.mc_forwarding) {
2720                         int err = ipmr_get_route(skb, r, nowait);
2721                         if (err <= 0) {
2722                                 if (!nowait) {
2723                                         if (err == 0)
2724                                                 return 0;
2725                                         goto nla_put_failure;
2726                                 } else {
2727                                         if (err == -EMSGSIZE)
2728                                                 goto nla_put_failure;
2729                                         ci.rta_error = err;
2730                                 }
2731                         }
2732                 } else
2733 #endif
2734                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2735         }
2736
2737         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2738
2739         return nlmsg_end(skb, nlh);
2740
2741 nla_put_failure:
2742         return nlmsg_cancel(skb, nlh);
2743 }
2744
2745 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2746 {
2747         struct rtmsg *rtm;
2748         struct nlattr *tb[RTA_MAX+1];
2749         struct rtable *rt = NULL;
2750         __be32 dst = 0;
2751         __be32 src = 0;
2752         u32 iif;
2753         int err;
2754         struct sk_buff *skb;
2755
2756         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2757         if (err < 0)
2758                 goto errout;
2759
2760         rtm = nlmsg_data(nlh);
2761
2762         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2763         if (skb == NULL) {
2764                 err = -ENOBUFS;
2765                 goto errout;
2766         }
2767
2768         /* Reserve room for dummy headers, this skb can pass
2769            through good chunk of routing engine.
2770          */
2771         skb->mac.raw = skb->nh.raw = skb->data;
2772
2773         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2774         skb->nh.iph->protocol = IPPROTO_ICMP;
2775         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2776
2777         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2778         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2779         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2780
2781         if (iif) {
2782                 struct net_device *dev;
2783
2784                 dev = __dev_get_by_index(iif);
2785                 if (dev == NULL) {
2786                         err = -ENODEV;
2787                         goto errout_free;
2788                 }
2789
2790                 skb->protocol   = htons(ETH_P_IP);
2791                 skb->dev        = dev;
2792                 local_bh_disable();
2793                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2794                 local_bh_enable();
2795
2796                 rt = (struct rtable*) skb->dst;
2797                 if (err == 0 && rt->u.dst.error)
2798                         err = -rt->u.dst.error;
2799         } else {
2800                 struct flowi fl = {
2801                         .nl_u = {
2802                                 .ip4_u = {
2803                                         .daddr = dst,
2804                                         .saddr = src,
2805                                         .tos = rtm->rtm_tos,
2806                                 },
2807                         },
2808                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2809                 };
2810                 err = ip_route_output_key(&rt, &fl);
2811         }
2812
2813         if (err)
2814                 goto errout_free;
2815
2816         skb->dst = &rt->u.dst;
2817         if (rtm->rtm_flags & RTM_F_NOTIFY)
2818                 rt->rt_flags |= RTCF_NOTIFY;
2819
2820         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2821                                 RTM_NEWROUTE, 0, 0);
2822         if (err <= 0)
2823                 goto errout_free;
2824
2825         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2826 errout:
2827         return err;
2828
2829 errout_free:
2830         kfree_skb(skb);
2831         goto errout;
2832 }
2833
2834 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2835 {
2836         struct rtable *rt;
2837         int h, s_h;
2838         int idx, s_idx;
2839
2840         s_h = cb->args[0];
2841         s_idx = idx = cb->args[1];
2842         for (h = 0; h <= rt_hash_mask; h++) {
2843                 if (h < s_h) continue;
2844                 if (h > s_h)
2845                         s_idx = 0;
2846                 rcu_read_lock_bh();
2847                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2848                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2849                         if (idx < s_idx)
2850                                 continue;
2851                         skb->dst = dst_clone(&rt->u.dst);
2852                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2853                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2854                                          1, NLM_F_MULTI) <= 0) {
2855                                 dst_release(xchg(&skb->dst, NULL));
2856                                 rcu_read_unlock_bh();
2857                                 goto done;
2858                         }
2859                         dst_release(xchg(&skb->dst, NULL));
2860                 }
2861                 rcu_read_unlock_bh();
2862         }
2863
2864 done:
2865         cb->args[0] = h;
2866         cb->args[1] = idx;
2867         return skb->len;
2868 }
2869
/*
 * Multicast event hook.  @in_dev is not examined: the routing cache is
 * simply flushed by calling rt_cache_flush() with a delay argument of 0.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(0);
}
2874
2875 #ifdef CONFIG_SYSCTL
2876 static int flush_delay;
2877
2878 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2879                                         struct file *filp, void __user *buffer,
2880                                         size_t *lenp, loff_t *ppos)
2881 {
2882         if (write) {
2883                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2884                 rt_cache_flush(flush_delay);
2885                 return 0;
2886         }
2887
2888         return -EINVAL;
2889 }
2890
2891 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2892                                                 int __user *name,
2893                                                 int nlen,
2894                                                 void __user *oldval,
2895                                                 size_t __user *oldlenp,
2896                                                 void __user *newval,
2897                                                 size_t newlen,
2898                                                 void **context)
2899 {
2900         int delay;
2901         if (newlen != sizeof(int))
2902                 return -EINVAL;
2903         if (get_user(delay, (int __user *)newval))
2904                 return -EFAULT;
2905         rt_cache_flush(delay);
2906         return 0;
2907 }
2908
2909 ctl_table ipv4_route_table[] = {
2910         {
2911                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2912                 .procname       = "flush",
2913                 .data           = &flush_delay,
2914                 .maxlen         = sizeof(int),
2915                 .mode           = 0200,
2916                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2917                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2918         },
2919         {
2920                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2921                 .procname       = "min_delay",
2922                 .data           = &ip_rt_min_delay,
2923                 .maxlen         = sizeof(int),
2924                 .mode           = 0644,
2925                 .proc_handler   = &proc_dointvec_jiffies,
2926                 .strategy       = &sysctl_jiffies,
2927         },
2928         {
2929                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2930                 .procname       = "max_delay",
2931                 .data           = &ip_rt_max_delay,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = &proc_dointvec_jiffies,
2935                 .strategy       = &sysctl_jiffies,
2936         },
2937         {
2938                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2939                 .procname       = "gc_thresh",
2940                 .data           = &ipv4_dst_ops.gc_thresh,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = &proc_dointvec,
2944         },
2945         {
2946                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2947                 .procname       = "max_size",
2948                 .data           = &ip_rt_max_size,
2949                 .maxlen         = sizeof(int),
2950                 .mode           = 0644,
2951                 .proc_handler   = &proc_dointvec,
2952         },
2953         {
2954                 /*  Deprecated. Use gc_min_interval_ms */
2955
2956                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2957                 .procname       = "gc_min_interval",
2958                 .data           = &ip_rt_gc_min_interval,
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0644,
2961                 .proc_handler   = &proc_dointvec_jiffies,
2962                 .strategy       = &sysctl_jiffies,
2963         },
2964         {
2965                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2966                 .procname       = "gc_min_interval_ms",
2967                 .data           = &ip_rt_gc_min_interval,
2968                 .maxlen         = sizeof(int),
2969                 .mode           = 0644,
2970                 .proc_handler   = &proc_dointvec_ms_jiffies,
2971                 .strategy       = &sysctl_ms_jiffies,
2972         },
2973         {
2974                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2975                 .procname       = "gc_timeout",
2976                 .data           = &ip_rt_gc_timeout,
2977                 .maxlen         = sizeof(int),
2978                 .mode           = 0644,
2979                 .proc_handler   = &proc_dointvec_jiffies,
2980                 .strategy       = &sysctl_jiffies,
2981         },
2982         {
2983                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2984                 .procname       = "gc_interval",
2985                 .data           = &ip_rt_gc_interval,
2986                 .maxlen         = sizeof(int),
2987                 .mode           = 0644,
2988                 .proc_handler   = &proc_dointvec_jiffies,
2989                 .strategy       = &sysctl_jiffies,
2990         },
2991         {
2992                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2993                 .procname       = "redirect_load",
2994                 .data           = &ip_rt_redirect_load,
2995                 .maxlen         = sizeof(int),
2996                 .mode           = 0644,
2997                 .proc_handler   = &proc_dointvec,
2998         },
2999         {
3000                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3001                 .procname       = "redirect_number",
3002                 .data           = &ip_rt_redirect_number,
3003                 .maxlen         = sizeof(int),
3004                 .mode           = 0644,
3005                 .proc_handler   = &proc_dointvec,
3006         },
3007         {
3008                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3009                 .procname       = "redirect_silence",
3010                 .data           = &ip_rt_redirect_silence,
3011                 .maxlen         = sizeof(int),
3012                 .mode           = 0644,
3013                 .proc_handler   = &proc_dointvec,
3014         },
3015         {
3016                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3017                 .procname       = "error_cost",
3018                 .data           = &ip_rt_error_cost,
3019                 .maxlen         = sizeof(int),
3020                 .mode           = 0644,
3021                 .proc_handler   = &proc_dointvec,
3022         },
3023         {
3024                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3025                 .procname       = "error_burst",
3026                 .data           = &ip_rt_error_burst,
3027                 .maxlen         = sizeof(int),
3028                 .mode           = 0644,
3029                 .proc_handler   = &proc_dointvec,
3030         },
3031         {
3032                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3033                 .procname       = "gc_elasticity",
3034                 .data           = &ip_rt_gc_elasticity,
3035                 .maxlen         = sizeof(int),
3036                 .mode           = 0644,
3037                 .proc_handler   = &proc_dointvec,
3038         },
3039         {
3040                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3041                 .procname       = "mtu_expires",
3042                 .data           = &ip_rt_mtu_expires,
3043                 .maxlen         = sizeof(int),
3044                 .mode           = 0644,
3045                 .proc_handler   = &proc_dointvec_jiffies,
3046                 .strategy       = &sysctl_jiffies,
3047         },
3048         {
3049                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3050                 .procname       = "min_pmtu",
3051                 .data           = &ip_rt_min_pmtu,
3052                 .maxlen         = sizeof(int),
3053                 .mode           = 0644,
3054                 .proc_handler   = &proc_dointvec,
3055         },
3056         {
3057                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3058                 .procname       = "min_adv_mss",
3059                 .data           = &ip_rt_min_advmss,
3060                 .maxlen         = sizeof(int),
3061                 .mode           = 0644,
3062                 .proc_handler   = &proc_dointvec,
3063         },
3064         {
3065                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3066                 .procname       = "secret_interval",
3067                 .data           = &ip_rt_secret_interval,
3068                 .maxlen         = sizeof(int),
3069                 .mode           = 0644,
3070                 .proc_handler   = &proc_dointvec_jiffies,
3071                 .strategy       = &sysctl_jiffies,
3072         },
3073         { .ctl_name = 0 }
3074 };
3075 #endif
3076
3077 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting counters; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.  Each cpu owns a
 * run of 256 consecutive entries.  The argument is parenthesized so that
 * a compound expression such as IP_RT_ACCT_CPU(a + b) still selects
 * entry (a + b) * 256 rather than a + b * 256.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3084
3085 #ifdef CONFIG_PROC_FS
3086 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3087                            int length, int *eof, void *data)
3088 {
3089         unsigned int i;
3090
3091         if ((offset & 3) || (length & 3))
3092                 return -EIO;
3093
3094         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3095                 *eof = 1;
3096                 return 0;
3097         }
3098
3099         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3100                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3101                 *eof = 1;
3102         }
3103
3104         offset /= sizeof(u32);
3105
3106         if (length > 0) {
3107                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3108                 u32 *dst = (u32 *) buffer;
3109
3110                 /* Copy first cpu. */
3111                 *start = buffer;
3112                 memcpy(dst, src, length);
3113
3114                 /* Add the other cpus in, one int at a time */
3115                 for_each_possible_cpu(i) {
3116                         unsigned int j;
3117
3118                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3119
3120                         for (j = 0; j < length/4; j++)
3121                                 dst[j] += src[j];
3122                 }
3123         }
3124         return length;
3125 }
3126 #endif /* CONFIG_PROC_FS */
3127 #endif /* CONFIG_NET_CLS_ROUTE */
3128
3129 static __initdata unsigned long rhash_entries;
3130 static int __init set_rhash_entries(char *str)
3131 {
3132         if (!str)
3133                 return 0;
3134         rhash_entries = simple_strtoul(str, &str, 0);
3135         return 1;
3136 }
3137 __setup("rhash_entries=", set_rhash_entries);
3138
3139 int __init ip_rt_init(void)
3140 {
3141         int rc = 0;
3142
3143         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3144                              (jiffies ^ (jiffies >> 7)));
3145
3146 #ifdef CONFIG_NET_CLS_ROUTE
3147         {
3148         int order;
3149         for (order = 0;
3150              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3151                 /* NOTHING */;
3152         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3153         if (!ip_rt_acct)
3154                 panic("IP: failed to allocate ip_rt_acct\n");
3155         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3156         }
3157 #endif
3158
3159         ipv4_dst_ops.kmem_cachep =
3160                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3161                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3162
3163         rt_hash_table = (struct rt_hash_bucket *)
3164                 alloc_large_system_hash("IP route cache",
3165                                         sizeof(struct rt_hash_bucket),
3166                                         rhash_entries,
3167                                         (num_physpages >= 128 * 1024) ?
3168                                         15 : 17,
3169                                         0,
3170                                         &rt_hash_log,
3171                                         &rt_hash_mask,
3172                                         0);
3173         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3174         rt_hash_lock_init();
3175
3176         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3177         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3178
3179         devinet_init();
3180         ip_fib_init();
3181
3182         init_timer(&rt_flush_timer);
3183         rt_flush_timer.function = rt_run_flush;
3184         init_timer(&rt_periodic_timer);
3185         rt_periodic_timer.function = rt_check_expire;
3186         init_timer(&rt_secret_timer);
3187         rt_secret_timer.function = rt_secret_rebuild;
3188
3189         /* All the timers, started at system startup tend
3190            to synchronize. Perturb it a bit.
3191          */
3192         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3193                                         ip_rt_gc_interval;
3194         add_timer(&rt_periodic_timer);
3195
3196         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3197                 ip_rt_secret_interval;
3198         add_timer(&rt_secret_timer);
3199
3200 #ifdef CONFIG_PROC_FS
3201         {
3202         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3203         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3204             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3205                                              proc_net_stat))) {
3206                 return -ENOMEM;
3207         }
3208         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3209         }
3210 #ifdef CONFIG_NET_CLS_ROUTE
3211         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3212 #endif
3213 #endif
3214 #ifdef CONFIG_XFRM
3215         xfrm_init();
3216         xfrm4_init();
3217 #endif
3218         return rc;
3219 }
3220
3221 EXPORT_SYMBOL(__ip_select_ident);
3222 EXPORT_SYMBOL(ip_route_input);
3223 EXPORT_SYMBOL(ip_route_output_key);