net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_max_size;
 120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_gc_elasticity __read_mostly    = 8;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static struct delayed_work expires_work;
 135 static unsigned long expires_ljiffies;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 143 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 144 static void              ipv4_dst_destroy(struct dst_entry *dst);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149
 150 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 151                             int how)
 152 {
 153 }
 154
 155 static struct dst_ops ipv4_dst_ops = {
 156         .family =               AF_INET,
 157         .protocol =             cpu_to_be16(ETH_P_IP),
 158         .gc =                   rt_garbage_collect,
 159         .check =                ipv4_dst_check,
 160         .default_advmss =       ipv4_default_advmss,
 161         .default_mtu =          ipv4_default_mtu,
 162         .destroy =              ipv4_dst_destroy,
 163         .ifdown =               ipv4_dst_ifdown,
 164         .negative_advice =      ipv4_negative_advice,
 165         .link_failure =         ipv4_link_failure,
 166         .update_pmtu =          ip_rt_update_pmtu,
 167         .local_out =            __ip_local_out,
 168 };
 169
 170 #define ECN_OR_COST(class)      TC_PRIO_##class
 171
 172 const __u8 ip_tos2prio[16] = {
 173         TC_PRIO_BESTEFFORT,
 174         ECN_OR_COST(BESTEFFORT),
 175         TC_PRIO_BESTEFFORT,
 176         ECN_OR_COST(BESTEFFORT),
 177         TC_PRIO_BULK,
 178         ECN_OR_COST(BULK),
 179         TC_PRIO_BULK,
 180         ECN_OR_COST(BULK),
 181         TC_PRIO_INTERACTIVE,
 182         ECN_OR_COST(INTERACTIVE),
 183         TC_PRIO_INTERACTIVE,
 184         ECN_OR_COST(INTERACTIVE),
 185         TC_PRIO_INTERACTIVE_BULK,
 186         ECN_OR_COST(INTERACTIVE_BULK),
 187         TC_PRIO_INTERACTIVE_BULK,
 188         ECN_OR_COST(INTERACTIVE_BULK)
 189 };
 190
 191
 192 /*
 193  * Route cache.
 194  */
 195
 196 /* The locking scheme is rather straight forward:
 197  *
 198  * 1) Read-Copy Update protects the buckets of the central route hash.
 199  * 2) Only writers remove entries, and they hold the lock
 200  *    as they look at rtable reference counts.
 201  * 3) Only readers acquire references to rtable entries,
 202  *    they do so with atomic increments and with the
 203  *    lock held.
 204  */
 205
 206 struct rt_hash_bucket {
 207         struct rtable __rcu     *chain;
 208 };
 209
 210 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 211         defined(CONFIG_PROVE_LOCKING)
 212 /*
 213  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 214  * The size of this table is a power of two and depends on the number of CPUS.
 215  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 216  */
 217 #ifdef CONFIG_LOCKDEP
 218 # define RT_HASH_LOCK_SZ        256
 219 #else
 220 # if NR_CPUS >= 32
 221 #  define RT_HASH_LOCK_SZ       4096
 222 # elif NR_CPUS >= 16
 223 #  define RT_HASH_LOCK_SZ       2048
 224 # elif NR_CPUS >= 8
 225 #  define RT_HASH_LOCK_SZ       1024
 226 # elif NR_CPUS >= 4
 227 #  define RT_HASH_LOCK_SZ       512
 228 # else
 229 #  define RT_HASH_LOCK_SZ       256
 230 # endif
 231 #endif
 232
 233 static spinlock_t       *rt_hash_locks;
 234 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 235
 236 static __init void rt_hash_lock_init(void)
 237 {
 238         int i;
 239
 240         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 241                         GFP_KERNEL);
 242         if (!rt_hash_locks)
 243                 panic("IP: failed to allocate rt_hash_locks\n");
 244
 245         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 246                 spin_lock_init(&rt_hash_locks[i]);
 247 }
 248 #else
 249 # define rt_hash_lock_addr(slot) NULL
 250
 251 static inline void rt_hash_lock_init(void)
 252 {
 253 }
 254 #endif
 255
 256 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 257 static unsigned                 rt_hash_mask __read_mostly;
 258 static unsigned int             rt_hash_log  __read_mostly;
 259
 260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 261 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 262
 263 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 264                                    int genid)
 265 {
 266         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 267                             idx, genid)
 268                 & rt_hash_mask;
 269 }
 270
 271 static inline int rt_genid(struct net *net)
 272 {
 273         return atomic_read(&net->ipv4.rt_genid);
 274 }
 275
 276 #ifdef CONFIG_PROC_FS
 277 struct rt_cache_iter_state {
 278         struct seq_net_private p;
 279         int bucket;
 280         int genid;
 281 };
 282
 283 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 284 {
 285         struct rt_cache_iter_state *st = seq->private;
 286         struct rtable *r = NULL;
 287
 288         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 289                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 290                         continue;
 291                 rcu_read_lock_bh();
 292                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 293                 while (r) {
 294                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 295                             r->rt_genid == st->genid)
 296                                 return r;
 297                         r = rcu_dereference_bh(r->dst.rt_next);
 298                 }
 299                 rcu_read_unlock_bh();
 300         }
 301         return r;
 302 }
 303
 304 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 305                                           struct rtable *r)
 306 {
 307         struct rt_cache_iter_state *st = seq->private;
 308
 309         r = rcu_dereference_bh(r->dst.rt_next);
 310         while (!r) {
 311                 rcu_read_unlock_bh();
 312                 do {
 313                         if (--st->bucket < 0)
 314                                 return NULL;
 315                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 316                 rcu_read_lock_bh();
 317                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 318         }
 319         return r;
 320 }
 321
 322 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 323                                         struct rtable *r)
 324 {
 325         struct rt_cache_iter_state *st = seq->private;
 326         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 327                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 328                         continue;
 329                 if (r->rt_genid == st->genid)
 330                         break;
 331         }
 332         return r;
 333 }
 334
 335 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 336 {
 337         struct rtable *r = rt_cache_get_first(seq);
 338
 339         if (r)
 340                 while (pos && (r = rt_cache_get_next(seq, r)))
 341                         --pos;
 342         return pos ? NULL : r;
 343 }
 344
 345 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 346 {
 347         struct rt_cache_iter_state *st = seq->private;
 348         if (*pos)
 349                 return rt_cache_get_idx(seq, *pos - 1);
 350         st->genid = rt_genid(seq_file_net(seq));
 351         return SEQ_START_TOKEN;
 352 }
 353
 354 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 355 {
 356         struct rtable *r;
 357
 358         if (v == SEQ_START_TOKEN)
 359                 r = rt_cache_get_first(seq);
 360         else
 361                 r = rt_cache_get_next(seq, v);
 362         ++*pos;
 363         return r;
 364 }
 365
 366 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 367 {
 368         if (v && v != SEQ_START_TOKEN)
 369                 rcu_read_unlock_bh();
 370 }
 371
 372 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 373 {
 374         if (v == SEQ_START_TOKEN)
 375                 seq_printf(seq, "%-127s\n",
 376                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 377                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 378                            "HHUptod\tSpecDst");
 379         else {
 380                 struct rtable *r = v;
 381                 int len;
 382
 383                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 384                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 385                         r->dst.dev ? r->dst.dev->name : "*",
 386                         (__force u32)r->rt_dst,
 387                         (__force u32)r->rt_gateway,
 388                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 389                         r->dst.__use, 0, (__force u32)r->rt_src,
 390                         dst_metric_advmss(&r->dst) + 40,
 391                         dst_metric(&r->dst, RTAX_WINDOW),
 392                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 393                               dst_metric(&r->dst, RTAX_RTTVAR)),
 394                         r->fl.fl4_tos,
 395                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
 396                         r->dst.hh ? (r->dst.hh->hh_output ==
 397                                        dev_queue_xmit) : 0,
 398                         r->rt_spec_dst, &len);
 399
 400                 seq_printf(seq, "%*s\n", 127 - len, "");
 401         }
 402         return 0;
 403 }
 404
 405 static const struct seq_operations rt_cache_seq_ops = {
 406         .start  = rt_cache_seq_start,
 407         .next   = rt_cache_seq_next,
 408         .stop   = rt_cache_seq_stop,
 409         .show   = rt_cache_seq_show,
 410 };
 411
 412 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 413 {
 414         return seq_open_net(inode, file, &rt_cache_seq_ops,
 415                         sizeof(struct rt_cache_iter_state));
 416 }
 417
 418 static const struct file_operations rt_cache_seq_fops = {
 419         .owner   = THIS_MODULE,
 420         .open    = rt_cache_seq_open,
 421         .read    = seq_read,
 422         .llseek  = seq_lseek,
 423         .release = seq_release_net,
 424 };
 425
 426
 427 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 428 {
 429         int cpu;
 430
 431         if (*pos == 0)
 432                 return SEQ_START_TOKEN;
 433
 434         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 435                 if (!cpu_possible(cpu))
 436                         continue;
 437                 *pos = cpu+1;
 438                 return &per_cpu(rt_cache_stat, cpu);
 439         }
 440         return NULL;
 441 }
 442
 443 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 444 {
 445         int cpu;
 446
 447         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 448                 if (!cpu_possible(cpu))
 449                         continue;
 450                 *pos = cpu+1;
 451                 return &per_cpu(rt_cache_stat, cpu);
 452         }
 453         return NULL;
 454
 455 }
 456
 457 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 458 {
 459
 460 }
 461
 462 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 463 {
 464         struct rt_cache_stat *st = v;
 465
 466         if (v == SEQ_START_TOKEN) {
 467                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 468                 return 0;
 469         }
 470
 471         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 472                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 473                    dst_entries_get_slow(&ipv4_dst_ops),
 474                    st->in_hit,
 475                    st->in_slow_tot,
 476                    st->in_slow_mc,
 477                    st->in_no_route,
 478                    st->in_brd,
 479                    st->in_martian_dst,
 480                    st->in_martian_src,
 481
 482                    st->out_hit,
 483                    st->out_slow_tot,
 484                    st->out_slow_mc,
 485
 486                    st->gc_total,
 487                    st->gc_ignored,
 488                    st->gc_goal_miss,
 489                    st->gc_dst_overflow,
 490                    st->in_hlist_search,
 491                    st->out_hlist_search
 492                 );
 493         return 0;
 494 }
 495
 496 static const struct seq_operations rt_cpu_seq_ops = {
 497         .start  = rt_cpu_seq_start,
 498         .next   = rt_cpu_seq_next,
 499         .stop   = rt_cpu_seq_stop,
 500         .show   = rt_cpu_seq_show,
 501 };
 502
 503
 504 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 505 {
 506         return seq_open(file, &rt_cpu_seq_ops);
 507 }
 508
 509 static const struct file_operations rt_cpu_seq_fops = {
 510         .owner   = THIS_MODULE,
 511         .open    = rt_cpu_seq_open,
 512         .read    = seq_read,
 513         .llseek  = seq_lseek,
 514         .release = seq_release,
 515 };
 516
 517 #ifdef CONFIG_NET_CLS_ROUTE
 518 static int rt_acct_proc_show(struct seq_file *m, void *v)
 519 {
 520         struct ip_rt_acct *dst, *src;
 521         unsigned int i, j;
 522
 523         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 524         if (!dst)
 525                 return -ENOMEM;
 526
 527         for_each_possible_cpu(i) {
 528                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 529                 for (j = 0; j < 256; j++) {
 530                         dst[j].o_bytes   += src[j].o_bytes;
 531                         dst[j].o_packets += src[j].o_packets;
 532                         dst[j].i_bytes   += src[j].i_bytes;
 533                         dst[j].i_packets += src[j].i_packets;
 534                 }
 535         }
 536
 537         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 538         kfree(dst);
 539         return 0;
 540 }
 541
 542 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 543 {
 544         return single_open(file, rt_acct_proc_show, NULL);
 545 }
 546
 547 static const struct file_operations rt_acct_proc_fops = {
 548         .owner          = THIS_MODULE,
 549         .open           = rt_acct_proc_open,
 550         .read           = seq_read,
 551         .llseek         = seq_lseek,
 552         .release        = single_release,
 553 };
 554 #endif
 555
 556 static int __net_init ip_rt_do_proc_init(struct net *net)
 557 {
 558         struct proc_dir_entry *pde;
 559
 560         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 561                         &rt_cache_seq_fops);
 562         if (!pde)
 563                 goto err1;
 564
 565         pde = proc_create("rt_cache", S_IRUGO,
 566                           net->proc_net_stat, &rt_cpu_seq_fops);
 567         if (!pde)
 568                 goto err2;
 569
 570 #ifdef CONFIG_NET_CLS_ROUTE
 571         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 572         if (!pde)
 573                 goto err3;
 574 #endif
 575         return 0;
 576
 577 #ifdef CONFIG_NET_CLS_ROUTE
 578 err3:
 579         remove_proc_entry("rt_cache", net->proc_net_stat);
 580 #endif
 581 err2:
 582         remove_proc_entry("rt_cache", net->proc_net);
 583 err1:
 584         return -ENOMEM;
 585 }
 586
 587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 588 {
 589         remove_proc_entry("rt_cache", net->proc_net_stat);
 590         remove_proc_entry("rt_cache", net->proc_net);
 591 #ifdef CONFIG_NET_CLS_ROUTE
 592         remove_proc_entry("rt_acct", net->proc_net);
 593 #endif
 594 }
 595
 596 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 597         .init = ip_rt_do_proc_init,
 598         .exit = ip_rt_do_proc_exit,
 599 };
 600
 601 static int __init ip_rt_proc_init(void)
 602 {
 603         return register_pernet_subsys(&ip_rt_proc_ops);
 604 }
 605
 606 #else
 607 static inline int ip_rt_proc_init(void)
 608 {
 609         return 0;
 610 }
 611 #endif /* CONFIG_PROC_FS */
 612
 613 static inline void rt_free(struct rtable *rt)
 614 {
 615         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 616 }
 617
 618 static inline void rt_drop(struct rtable *rt)
 619 {
 620         ip_rt_put(rt);
 621         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 622 }
 623
 624 static inline int rt_fast_clean(struct rtable *rth)
 625 {
 626         /* Kill broadcast/multicast entries very aggresively, if they
 627            collide in hash table with more useful entries */
 628         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 629                 rt_is_input_route(rth) && rth->dst.rt_next;
 630 }
 631
 632 static inline int rt_valuable(struct rtable *rth)
 633 {
 634         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 635                 rth->dst.expires;
 636 }
 637
 638 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 639 {
 640         unsigned long age;
 641         int ret = 0;
 642
 643         if (atomic_read(&rth->dst.__refcnt))
 644                 goto out;
 645
 646         ret = 1;
 647         if (rth->dst.expires &&
 648             time_after_eq(jiffies, rth->dst.expires))
 649                 goto out;
 650
 651         age = jiffies - rth->dst.lastuse;
 652         ret = 0;
 653         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 654             (age <= tmo2 && rt_valuable(rth)))
 655                 goto out;
 656         ret = 1;
 657 out:    return ret;
 658 }
 659
 660 /* Bits of score are:
 661  * 31: very valuable
 662  * 30: not quite useless
 663  * 29..0: usage counter
 664  */
 665 static inline u32 rt_score(struct rtable *rt)
 666 {
 667         u32 score = jiffies - rt->dst.lastuse;
 668
 669         score = ~score & ~(3<<30);
 670
 671         if (rt_valuable(rt))
 672                 score |= (1<<31);
 673
 674         if (rt_is_output_route(rt) ||
 675             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 676                 score |= (1<<30);
 677
 678         return score;
 679 }
 680
 681 static inline bool rt_caching(const struct net *net)
 682 {
 683         return net->ipv4.current_rt_cache_rebuild_count <=
 684                 net->ipv4.sysctl_rt_cache_rebuild_count;
 685 }
 686
 687 static inline bool compare_hash_inputs(const struct flowi *fl1,
 688                                         const struct flowi *fl2)
 689 {
 690         return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
 691                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
 692                 (fl1->iif ^ fl2->iif)) == 0);
 693 }
 694
 695 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 696 {
 697         return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
 698                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
 699                 (fl1->mark ^ fl2->mark) |
 700                 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
 701                 (fl1->oif ^ fl2->oif) |
 702                 (fl1->iif ^ fl2->iif)) == 0;
 703 }
 704
 705 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 706 {
 707         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 708 }
 709
 710 static inline int rt_is_expired(struct rtable *rth)
 711 {
 712         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 713 }
 714
 715 /*
 716  * Perform a full scan of hash table and free all entries.
 717  * Can be called by a softirq or a process.
 718  * In the later case, we want to be reschedule if necessary
 719  */
 720 static void rt_do_flush(struct net *net, int process_context)
 721 {
 722         unsigned int i;
 723         struct rtable *rth, *next;
 724
 725         for (i = 0; i <= rt_hash_mask; i++) {
 726                 struct rtable __rcu **pprev;
 727                 struct rtable *list;
 728
 729                 if (process_context && need_resched())
 730                         cond_resched();
 731                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
 732                 if (!rth)
 733                         continue;
 734
 735                 spin_lock_bh(rt_hash_lock_addr(i));
 736
 737                 list = NULL;
 738                 pprev = &rt_hash_table[i].chain;
 739                 rth = rcu_dereference_protected(*pprev,
 740                         lockdep_is_held(rt_hash_lock_addr(i)));
 741
 742                 while (rth) {
 743                         next = rcu_dereference_protected(rth->dst.rt_next,
 744                                 lockdep_is_held(rt_hash_lock_addr(i)));
 745
 746                         if (!net ||
 747                             net_eq(dev_net(rth->dst.dev), net)) {
 748                                 rcu_assign_pointer(*pprev, next);
 749                                 rcu_assign_pointer(rth->dst.rt_next, list);
 750                                 list = rth;
 751                         } else {
 752                                 pprev = &rth->dst.rt_next;
 753                         }
 754                         rth = next;
 755                 }
 756
 757                 spin_unlock_bh(rt_hash_lock_addr(i));
 758
 759                 for (; list; list = next) {
 760                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 761                         rt_free(list);
 762                 }
 763         }
 764 }
 765
 766 /*
 767  * While freeing expired entries, we compute average chain length
 768  * and standard deviation, using fixed-point arithmetic.
 769  * This to have an estimation of rt_chain_length_max
 770  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 771  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 772  */
 773
 774 #define FRACT_BITS 3
 775 #define ONE (1UL << FRACT_BITS)
 776
 777 /*
 778  * Given a hash chain and an item in this hash chain,
 779  * find if a previous entry has the same hash_inputs
 780  * (but differs on tos, mark or oif)
 781  * Returns 0 if an alias is found.
 782  * Returns ONE if rth has no alias before itself.
 783  */
 784 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 785 {
 786         const struct rtable *aux = head;
 787
 788         while (aux != rth) {
 789                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 790                         return 0;
 791                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 792         }
 793         return ONE;
 794 }
 795
 796 static void rt_check_expire(void)
 797 {
             /*
              * Periodic partial scan of the route cache hash table.  Visits a
              * number of buckets proportional to the jiffies elapsed since the
              * previous call, frees entries that are expired or aged out, and
              * recomputes rt_chain_length_max from the chain lengths observed.
              */
 798         static unsigned int rover;      /* bucket where the previous call stopped */
 799         unsigned int i = rover, goal;
 800         struct rtable *rth;
 801         struct rtable __rcu **rthp;
 802         unsigned long samples = 0;      /* buckets visited, empty ones included */
 803         unsigned long sum = 0, sum2 = 0;        /* sum and sum of squares of chain lengths */
 804         unsigned long delta;
 805         u64 mult;
 806
             /*
              * Number of buckets to visit this time:
              * (elapsed jiffies << rt_hash_log) / ip_rt_gc_timeout,
              * capped at rt_hash_mask + 1.  The division is skipped when
              * ip_rt_gc_timeout is 0 or 1.
              */
 807         delta = jiffies - expires_ljiffies;
 808         expires_ljiffies = jiffies;
 809         mult = ((u64)delta) << rt_hash_log;
 810         if (ip_rt_gc_timeout > 1)
 811                 do_div(mult, ip_rt_gc_timeout);
 812         goal = (unsigned int)mult;
 813         if (goal > rt_hash_mask)
 814                 goal = rt_hash_mask + 1;
 815         for (; goal > 0; goal--) {
 816                 unsigned long tmo = ip_rt_gc_timeout;
 817                 unsigned long length;
 818
 819                 i = (i + 1) & rt_hash_mask;
 820                 rthp = &rt_hash_table[i].chain;
 821
 822                 if (need_resched())
 823                         cond_resched();
 824
 825                 samples++;
 826
                     /*
                      * Empty bucket: it has already been counted in samples,
                      * but is skipped without taking the bucket lock and adds
                      * nothing to sum/sum2.
                      */
 827                 if (rcu_dereference_raw(*rthp) == NULL)
 828                         continue;
 829                 length = 0;
 830                 spin_lock_bh(rt_hash_lock_addr(i));
 831                 while ((rth = rcu_dereference_protected(*rthp,
 832                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 833                         prefetch(rth->dst.rt_next);
                             /* Entries reported by rt_is_expired() are always unlinked and freed. */
 834                         if (rt_is_expired(rth)) {
 835                                 *rthp = rth->dst.rt_next;
 836                                 rt_free(rth);
 837                                 continue;
 838                         }
 839                         if (rth->dst.expires) {
 840                                 /* Entry is expired even if it is in use */
 841                                 if (time_before_eq(jiffies, rth->dst.expires)) {
                                             /*
                                              * Keep path.  Also entered via
                                              * "goto nofree" from the else
                                              * branch below.  tmo is halved for
                                              * every entry kept, so entries
                                              * further down the chain are
                                              * tested with a smaller tmo.
                                              */
 842 nofree:
 843                                         tmo >>= 1;
 844                                         rthp = &rth->dst.rt_next;
 845                                         /*
 846                                          * We only count entries on
 847                                          * a chain with equal hash inputs once
 848                                          * so that entries for different QOS
 849                                          * levels, and other non-hash input
 850                                          * attributes don't unfairly skew
 851                                          * the length computation
 852                                          */
 853                                         length += has_noalias(rt_hash_table[i].chain, rth);
 854                                         continue;
 855                                 }
 856                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 857                                 goto nofree;
 858
 859                         /* Cleanup aged off entries. */
 860                         *rthp = rth->dst.rt_next;
 861                         rt_free(rth);
 862                 }
 863                 spin_unlock_bh(rt_hash_lock_addr(i));
 864                 sum += length;
 865                 sum2 += length*length;
 866         }
             /*
              * avg and sd are the mean and standard deviation of the per-bucket
              * lengths over all sampled buckets.  The new chain length limit is
              * (avg + 4*sd) scaled down by FRACT_BITS, but never less than
              * ip_rt_gc_elasticity.
              */
 867         if (samples) {
 868                 unsigned long avg = sum / samples;
 869                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 870                 rt_chain_length_max = max_t(unsigned long,
 871                                         ip_rt_gc_elasticity,
 872                                         (avg + 4*sd) >> FRACT_BITS);
 873         }
             /* Remember where to resume on the next call. */
 874         rover = i;
 875 }
 876
 877 /*
 878  * rt_worker_func() is run in process context.
 879  * It calls rt_check_expire() to scan part of the hash table, then
      * re-queues expires_work with a delay of ip_rt_gc_interval.
      * The work argument is not used.
 880  */
 881 static void rt_worker_func(struct work_struct *work)
 882 {
 883         rt_check_expire();
 884         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 885 }
 886
 887 /*
 888  * Perturbation of rt_genid by a small quantity [1..256]
 889  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 890  * many times (2^24) without giving recent rt_genid.
 891  * Jenkins hash is strong enough that little changes of rt_genid are OK.
 892  */
 893 static void rt_cache_invalidate(struct net *net)
 894 {
 895         unsigned char shuffle;
 896
 897         get_random_bytes(&shuffle, sizeof(shuffle));
             /* shuffle + 1U is never zero, so rt_genid always changes */
 898         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 899 }
 900
 901 /*
 902  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 903  * delay >= 0 : invalidate & flush cache (can be long)
      *
      * Only the sign of delay is examined.  The flush itself is done by
      * rt_do_flush(), which is passed !in_softirq() as its second argument.
 904  */
 905 void rt_cache_flush(struct net *net, int delay)
 906 {
 907         rt_cache_invalidate(net);
 908         if (delay >= 0)
 909                 rt_do_flush(net, !in_softirq());
 910 }
 911
 912 /* Flush previously invalidated entries from the cache; rt_genid is not changed here */
 913 void rt_cache_flush_batch(struct net *net)
 914 {
 915         rt_do_flush(net, !in_softirq());
 916 }
 917
 918 static void rt_emergency_hash_rebuild(struct net *net)
 919 {
             /*
              * Log a rate-limited warning and invalidate the cache of this
              * namespace by perturbing its rt_genid.  No flush is triggered
              * from here.
              */
 920         if (net_ratelimit())
 921                 printk(KERN_WARNING "Route hash chain too long!\n");
 922         rt_cache_invalidate(net);
 923 }
 924
 925 /*
 926    Short description of GC goals.
 927
 928    We want to build an algorithm that keeps the routing cache
 929    at some equilibrium point, where the number of aged-off entries
 930    is kept approximately equal to the number of newly generated ones.
 931
 932    The current expiration strength is the variable "expire".
 933    We try to adjust it dynamically, so that when networking is idle
 934    "expire" is large enough to keep enough warm entries,
 935    and when load increases it is reduced to limit the cache size.
 936  */
 937
 938 static int rt_garbage_collect(struct dst_ops *ops)
 939 {
             /*
              * Try to shrink the route cache.  Returns 0 in the normal case,
              * or 1 (after a rate-limited "dst cache overflow" warning) when
              * the scan gave up and the cache still holds at least
              * ip_rt_max_size entries.  The ops argument is not referenced;
              * ipv4_dst_ops is used directly.
              */
 940         static unsigned long expire = RT_GC_TIMEOUT;    /* current expiration strength */
 941         static unsigned long last_gc;                   /* jiffies of the last accepted run */
 942         static int rover;                               /* last bucket scanned */
 943         static int equilibrium;
 944         struct rtable *rth;
 945         struct rtable __rcu **rthp;
 946         unsigned long now = jiffies;
 947         int goal;
 948         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 949
 950         /*
 951          * Garbage collection is pretty expensive,
 952          * do not make it too frequently.
 953          */
 954
 955         RT_CACHE_STAT_INC(gc_total);
 956
             /*
              * Skip this run if the previous one was less than
              * ip_rt_gc_min_interval ago, unless the (fast) entry count has
              * reached ip_rt_max_size.
              */
 957         if (now - last_gc < ip_rt_gc_min_interval &&
 958             entries < ip_rt_max_size) {
 959                 RT_CACHE_STAT_INC(gc_ignored);
 960                 goto out;
 961         }
 962
 963         entries = dst_entries_get_slow(&ipv4_dst_ops);
 964         /* Calculate number of entries, which we want to expire now. */
 965         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 966         if (goal <= 0) {
                     /*
                      * Below the elasticity limit: aim for "equilibrium", which
                      * is never allowed to be less than gc_thresh and is moved
                      * up by at most half of the excess (or rt_hash_mask + 1,
                      * whichever is smaller) per run.
                      */
 967                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 968                         equilibrium = ipv4_dst_ops.gc_thresh;
 969                 goal = entries - equilibrium;
 970                 if (goal > 0) {
 971                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 972                         goal = entries - equilibrium;
 973                 }
 974         } else {
 975                 /* We are in dangerous area. Try to reduce cache really
 976                  * aggressively.
 977                  */
 978                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 979                 equilibrium = entries - goal;
 980         }
 981
 982         if (now - last_gc >= ip_rt_gc_min_interval)
 983                 last_gc = now;
 984
             /* Nothing to free: fold the (non-positive) goal into equilibrium. */
 985         if (goal <= 0) {
 986                 equilibrium += goal;
 987                 goto work_done;
 988         }
 989
 990         do {
 991                 int i, k;
 992
                     /*
                      * Visit up to rt_hash_mask + 1 buckets, starting with the
                      * one after rover, and stop early once goal entries have
                      * been freed.
                      */
 993                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 994                         unsigned long tmo = expire;
 995
 996                         k = (k + 1) & rt_hash_mask;
 997                         rthp = &rt_hash_table[k].chain;
 998                         spin_lock_bh(rt_hash_lock_addr(k));
 999                         while ((rth = rcu_dereference_protected(*rthp,
1000                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
                                     /*
                                      * Keep entries that are neither expired nor
                                      * allowed to expire; tmo is halved for
                                      * every entry kept in this chain.
                                      */
1001                                 if (!rt_is_expired(rth) &&
1002                                         !rt_may_expire(rth, tmo, expire)) {
1003                                         tmo >>= 1;
1004                                         rthp = &rth->dst.rt_next;
1005                                         continue;
1006                                 }
1007                                 *rthp = rth->dst.rt_next;
1008                                 rt_free(rth);
1009                                 goal--;
1010                         }
1011                         spin_unlock_bh(rt_hash_lock_addr(k));
1012                         if (goal <= 0)
1013                                 break;
1014                 }
1015                 rover = k;
1016
1017                 if (goal <= 0)
1018                         goto work_done;
1019
1020                 /* Goal is not achieved. We stop process if:
1021
1022                    - if expire reduced to zero. Otherwise, expire is halved.
1023                    - if table is not full.
1024                    - if we are called from interrupt.
1025                    - jiffies check is just fallback/debug loop breaker.
1026                      We will not spin here for long time in any case.
1027                  */
1028
1029                 RT_CACHE_STAT_INC(gc_goal_miss);
1030
1031                 if (expire == 0)
1032                         break;
1033
1034                 expire >>= 1;
                     /* NOTE(review): expire is unsigned long but is printed with %u below. */
1035 #if RT_CACHE_DEBUG >= 2
1036                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1037                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1038 #endif
1039
1040                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1041                         goto out;
1042         } while (!in_softirq() && time_before_eq(jiffies, now));
1043
             /*
              * Gave up without reaching the goal.  Report overflow only if both
              * the fast and the slow entry counts are at or above
              * ip_rt_max_size.
              */
1044         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1045                 goto out;
1046         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1047                 goto out;
1048         if (net_ratelimit())
1049                 printk(KERN_WARNING "dst cache overflow\n");
1050         RT_CACHE_STAT_INC(gc_dst_overflow);
1051         return 1;
1052
             /*
              * Goal reached (or nothing to do): relax expire by
              * ip_rt_gc_min_interval, and reset it to ip_rt_gc_timeout when it
              * exceeds that value or the entry count is below gc_thresh.
              */
1053 work_done:
1054         expire += ip_rt_gc_min_interval;
1055         if (expire > ip_rt_gc_timeout ||
1056             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1057             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1058                 expire = ip_rt_gc_timeout;
             /* NOTE(review): same %u vs unsigned long mismatch as above. */
1059 #if RT_CACHE_DEBUG >= 2
1060         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1061                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1062 #endif
1063 out:    return 0;
1064 }
1065
1066 /*
1067  * Returns number of entries in a hash chain that have different hash_inputs
1068  */
1069 static int slow_chain_length(const struct rtable *head)
1070 {
1071         int length = 0;
1072         const struct rtable *rth = head;
1073
1074         while (rth) {
1075                 length += has_noalias(head, rth);
1076                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1077         }
1078         return length >> FRACT_BITS;
1079 }
1080
1081 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1082                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1083 {
             /*
              * Insert rt into hash bucket "hash", or reuse an entry already in
              * that chain whose keys and namespace match.  The resulting route
              * is stored in *rp when rp is non-NULL, otherwise it is attached
              * to skb with skb_dst_set().  Returns 0 on success, or the error
              * from arp_bind_neighbour() (rt is released on those paths).
              * ifindex is only used to recompute the hash after an emergency
              * rebuild.
              */
1084         struct rtable   *rth, *cand;
1085         struct rtable __rcu **rthp, **candp;
1086         unsigned long   now;
1087         u32             min_score;
1088         int             chain_length;
             /* One GC-and-retry is allowed on -ENOBUFS, and only outside softirq. */
1089         int attempts = !in_softirq();
1090
1091 restart:
1092         chain_length = 0;
1093         min_score = ~(u32)0;
1094         cand = NULL;
1095         candp = NULL;
1096         now = jiffies;
1097
1098         if (!rt_caching(dev_net(rt->dst.dev))) {
1099                 /*
1100                  * If we're not caching, just tell the caller we
1101                  * were successful and don't touch the route.  The
1102                  * caller hold the sole reference to the cache entry, and
1103                  * it will be released when the caller is done with it.
1104                  * If we drop it here, the callers have no way to resolve routes
1105                  * when we're not caching.  Instead, just point *rp at rt, so
1106                  * the caller gets a single use out of the route
1107                  * Note that we do rt_free on this new route entry, so that
1108                  * once its refcount hits zero, we are still able to reap it
1109                  * (Thanks Alexey)
1110                  * Note: To avoid expensive rcu stuff for this uncached dst,
1111                  * we set DST_NOCACHE so that dst_release() can free dst without
1112                  * waiting a grace period.
1113                  */
1114
1115                 rt->dst.flags |= DST_NOCACHE;
1116                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1117                         int err = arp_bind_neighbour(&rt->dst);
1118                         if (err) {
1119                                 if (net_ratelimit())
1120                                         printk(KERN_WARNING
1121                                             "Neighbour table failure & not caching routes.\n");
1122                                 ip_rt_put(rt);
1123                                 return err;
1124                         }
1125                 }
1126
1127                 goto skip_hashing;
1128         }
1129
1130         rthp = &rt_hash_table[hash].chain;
1131
             /*
              * Walk the chain under the bucket lock: free expired entries,
              * return early on a key match, and remember the unreferenced
              * entry with the lowest rt_score() as eviction candidate.
              */
1132         spin_lock_bh(rt_hash_lock_addr(hash));
1133         while ((rth = rcu_dereference_protected(*rthp,
1134                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1135                 if (rt_is_expired(rth)) {
1136                         *rthp = rth->dst.rt_next;
1137                         rt_free(rth);
1138                         continue;
1139                 }
1140                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1141                         /* Put it first */
1142                         *rthp = rth->dst.rt_next;
1143                         /*
1144                          * Since lookup is lockfree, the deletion
1145                          * must be visible to another weakly ordered CPU before
1146                          * the insertion at the start of the hash chain.
1147                          */
1148                         rcu_assign_pointer(rth->dst.rt_next,
1149                                            rt_hash_table[hash].chain);
1150                         /*
1151                          * Since lookup is lockfree, the update writes
1152                          * must be ordered for consistency on SMP.
1153                          */
1154                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1155
1156                         dst_use(&rth->dst, now);
1157                         spin_unlock_bh(rt_hash_lock_addr(hash));
1158
                             /* The existing entry is handed out; the new rt is not needed. */
1159                         rt_drop(rt);
1160                         if (rp)
1161                                 *rp = rth;
1162                         else
1163                                 skb_dst_set(skb, &rth->dst);
1164                         return 0;
1165                 }
1166
1167                 if (!atomic_read(&rth->dst.__refcnt)) {
1168                         u32 score = rt_score(rth);
1169
                             /* "<=" means that on equal scores the later entry wins. */
1170                         if (score <= min_score) {
1171                                 cand = rth;
1172                                 candp = rthp;
1173                                 min_score = score;
1174                         }
1175                 }
1176
1177                 chain_length++;
1178
1179                 rthp = &rth->dst.rt_next;
1180         }
1181
1182         if (cand) {
1183                 /* ip_rt_gc_elasticity used to be average length of chain
1184                  * length, when exceeded gc becomes really aggressive.
1185                  *
1186                  * The second limit is less certain. At the moment it allows
1187                  * only 2 entries per bucket. We will see.
1188                  */
1189                 if (chain_length > ip_rt_gc_elasticity) {
1190                         *candp = cand->dst.rt_next;
1191                         rt_free(cand);
1192                 }
1193         } else {
                     /*
                      * No unreferenced entry to evict and the chain is longer
                      * than rt_chain_length_max: count a rebuild, invalidate
                      * the cache, recompute the hash with the new genid and
                      * start over.  The bucket lock is dropped before the
                      * restart.
                      */
1194                 if (chain_length > rt_chain_length_max &&
1195                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1196                         struct net *net = dev_net(rt->dst.dev);
1197                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1198                         if (!rt_caching(net)) {
1199                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1200                                         rt->dst.dev->name, num);
1201                         }
1202                         rt_emergency_hash_rebuild(net);
1203                         spin_unlock_bh(rt_hash_lock_addr(hash));
1204
1205                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1206                                         ifindex, rt_genid(net));
1207                         goto restart;
1208                 }
1209         }
1210
1211         /* Try to bind route to arp only if it is output
1212            route or unicast forwarding path.
1213          */
1214         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1215                 int err = arp_bind_neighbour(&rt->dst);
1216                 if (err) {
1217                         spin_unlock_bh(rt_hash_lock_addr(hash));
1218
1219                         if (err != -ENOBUFS) {
1220                                 rt_drop(rt);
1221                                 return err;
1222                         }
1223
1224                         /* Neighbour tables are full and nothing
1225                            can be released. Try to shrink route cache,
1226                            it is most likely it holds some neighbour records.
1227                          */
1228                         if (attempts-- > 0) {
                                     /*
                                      * Force an aggressive GC run by temporarily
                                      * overriding the two tunables, then restore
                                      * them and retry the insertion.
                                      */
1229                                 int saved_elasticity = ip_rt_gc_elasticity;
1230                                 int saved_int = ip_rt_gc_min_interval;
1231                                 ip_rt_gc_elasticity     = 1;
1232                                 ip_rt_gc_min_interval   = 0;
1233                                 rt_garbage_collect(&ipv4_dst_ops);
1234                                 ip_rt_gc_min_interval   = saved_int;
1235                                 ip_rt_gc_elasticity     = saved_elasticity;
1236                                 goto restart;
1237                         }
1238
1239                         if (net_ratelimit())
1240                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1241                         rt_drop(rt);
1242                         return -ENOBUFS;
1243                 }
1244         }
1245
             /* Link rt in front of the current chain head; it is published below. */
1246         rt->dst.rt_next = rt_hash_table[hash].chain;
1247
1248 #if RT_CACHE_DEBUG >= 2
1249         if (rt->dst.rt_next) {
1250                 struct rtable *trt;
1251                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1252                        hash, &rt->rt_dst);
1253                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1254                         printk(" . %pI4", &trt->rt_dst);
1255                 printk("\n");
1256         }
1257 #endif
1258         /*
1259          * Since lookup is lockfree, we must make sure
1260          * previous writes to rt are committed to memory
1261          * before making rt visible to other CPUS.
1262          */
1263         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1264
1265         spin_unlock_bh(rt_hash_lock_addr(hash));
1266
             /* Also reached directly, without hashing, when caching is disabled. */
1267 skip_hashing:
1268         if (rp)
1269                 *rp = rt;
1270         else
1271                 skb_dst_set(skb, &rt->dst);
1272         return 0;
1273 }
1274
1275 void rt_bind_peer(struct rtable *rt, int create)
1276 {
1277         struct inet_peer *peer;
1278
1279         peer = inet_getpeer_v4(rt->rt_dst, create);
1280
1281         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1282                 inet_putpeer(peer);
1283 }
1284
1285 /*
1286  * Peer allocation may fail only in serious out-of-memory conditions.  However
1287  * we still can generate some output.
1288  * Random ID selection looks a bit dangerous because we have no chances to
1289  * select ID being unique in a reasonable period of time.
1290  * But broken packet identifier may be better than no packet at all.
1291  */
1292 static void ip_select_fb_ident(struct iphdr *iph)
1293 {
1294         static DEFINE_SPINLOCK(ip_fb_id_lock);
1295         static u32 ip_fallback_id;
1296         u32 salt;
1297
1298         spin_lock_bh(&ip_fb_id_lock);
1299         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1300         iph->id = htons(salt & 0xFFFF);
1301         ip_fallback_id = salt;
1302         spin_unlock_bh(&ip_fb_id_lock);
1303 }
1304
1305 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1306 {
1307         struct rtable *rt = (struct rtable *) dst;
1308
1309         if (rt) {
1310                 if (rt->peer == NULL)
1311                         rt_bind_peer(rt, 1);
1312
1313                 /* If peer is attached to destination, it is never detached,
1314                    so that we need not to grab a lock to dereference it.
1315                  */
1316                 if (rt->peer) {
1317                         iph->id = htons(inet_getid(rt->peer, more));
1318                         return;
1319                 }
1320         } else
1321                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1322                        __builtin_return_address(0));
1323
1324         ip_select_fb_ident(iph);
1325 }
1326 EXPORT_SYMBOL(__ip_select_ident);
1327
1328 static void rt_del(unsigned hash, struct rtable *rt)
1329 {
1330         struct rtable __rcu **rthp;
1331         struct rtable *aux;
1332
1333         rthp = &rt_hash_table[hash].chain;
1334         spin_lock_bh(rt_hash_lock_addr(hash));
1335         ip_rt_put(rt);
1336         while ((aux = rcu_dereference_protected(*rthp,
1337                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1338                 if (aux == rt || rt_is_expired(aux)) {
1339                         *rthp = aux->dst.rt_next;
1340                         rt_free(aux);
1341                         continue;
1342                 }
1343                 rthp = &aux->dst.rt_next;
1344         }
1345         spin_unlock_bh(rt_hash_lock_addr(hash));
1346 }
1347
1348 /* called in rcu_read_lock() section */
1349 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1350                     __be32 saddr, struct net_device *dev)
1351 {
             /*
              * Handle a redirect that asks us to use new_gw instead of old_gw
              * for daddr.  After sanity checks, every cached output route
              * matching (daddr, saddr-or-0, dev->ifindex-or-0) whose gateway is
              * old_gw is replaced by a copy pointing at new_gw and flagged
              * RTCF_REDIRECTED.  Rejected redirects are only logged, and only
              * when CONFIG_IP_ROUTE_VERBOSE is set.
              */
1352         int i, k;
1353         struct in_device *in_dev = __in_dev_get_rcu(dev);
1354         struct rtable *rth;
1355         struct rtable __rcu **rthp;
             /* Cache entries are looked up with and without source / oif key. */
1356         __be32  skeys[2] = { saddr, 0 };
1357         int  ikeys[2] = { dev->ifindex, 0 };
1358         struct netevent_redirect netevent;
1359         struct net *net;
1360
1361         if (!in_dev)
1362                 return;
1363
1364         net = dev_net(dev);
             /*
              * Reject no-op redirects, redirects when the device does not
              * accept them, and multicast / limited broadcast / zeronet
              * gateways.
              */
1365         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1366             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1367             ipv4_is_zeronet(new_gw))
1368                 goto reject_redirect;
1369
             /* Redirects only modify the route cache; without it there is nothing to do. */
1370         if (!rt_caching(net))
1371                 goto reject_redirect;
1372
1373         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1374                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1375                         goto reject_redirect;
1376                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1377                         goto reject_redirect;
1378         } else {
1379                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1380                         goto reject_redirect;
1381         }
1382
1383         for (i = 0; i < 2; i++) {
1384                 for (k = 0; k < 2; k++) {
1385                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1386                                                 rt_genid(net));
1387
1388                         rthp = &rt_hash_table[hash].chain;
1389
                             /* The chain is walked without taking the bucket lock. */
1390                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1391                                 struct rtable *rt;
1392
                                     /* Skip entries that do not match this key combination. */
1393                                 if (rth->fl.fl4_dst != daddr ||
1394                                     rth->fl.fl4_src != skeys[i] ||
1395                                     rth->fl.oif != ikeys[k] ||
1396                                     rt_is_input_route(rth) ||
1397                                     rt_is_expired(rth) ||
1398                                     !net_eq(dev_net(rth->dst.dev), net)) {
1399                                         rthp = &rth->dst.rt_next;
1400                                         continue;
1401                                 }
1402
                                     /*
                                      * Keys match but the route is not the one
                                      * the redirect refers to: stop scanning
                                      * this chain.
                                      */
1403                                 if (rth->rt_dst != daddr ||
1404                                     rth->rt_src != saddr ||
1405                                     rth->dst.error ||
1406                                     rth->rt_gateway != old_gw ||
1407                                     rth->dst.dev != dev)
1408                                         break;
1409
1410                                 dst_hold(&rth->dst);
1411
1412                                 rt = dst_alloc(&ipv4_dst_ops);
1413                                 if (rt == NULL) {
1414                                         ip_rt_put(rth);
1415                                         return;
1416                                 }
1417
1418                                 /* Copy all the information. */
1419                                 *rt = *rth;
                                     /* Reset the fields that must not be shared with the old entry. */
1420                                 rt->dst.__use           = 1;
1421                                 atomic_set(&rt->dst.__refcnt, 1);
1422                                 rt->dst.child           = NULL;
1423                                 if (rt->dst.dev)
1424                                         dev_hold(rt->dst.dev);
1425                                 rt->dst.obsolete        = -1;
1426                                 rt->dst.lastuse = jiffies;
1427                                 rt->dst.path            = &rt->dst;
1428                                 rt->dst.neighbour       = NULL;
1429                                 rt->dst.hh              = NULL;
1430 #ifdef CONFIG_XFRM
1431                                 rt->dst.xfrm            = NULL;
1432 #endif
1433                                 rt->rt_genid            = rt_genid(net);
1434                                 rt->rt_flags            |= RTCF_REDIRECTED;
1435
1436                                 /* Gateway is different ... */
1437                                 rt->rt_gateway          = new_gw;
1438
1439                                 /* Redirect received -> path was valid */
1440                                 dst_confirm(&rth->dst);
1441
                                     /* The peer pointer was copied above, so take a reference for the copy. */
1442                                 if (rt->peer)
1443                                         atomic_inc(&rt->peer->refcnt);
1444
                                     /*
                                      * Give up on this entry if the neighbour
                                      * cannot be bound or is not in a NUD_VALID
                                      * state; in the latter case
                                      * neigh_event_send() is called on it first.
                                      */
1445                                 if (arp_bind_neighbour(&rt->dst) ||
1446                                     !(rt->dst.neighbour->nud_state &
1447                                             NUD_VALID)) {
1448                                         if (rt->dst.neighbour)
1449                                                 neigh_event_send(rt->dst.neighbour, NULL);
1450                                         ip_rt_put(rth);
1451                                         rt_drop(rt);
1452                                         goto do_next;
1453                                 }
1454
1455                                 netevent.old = &rth->dst;
1456                                 netevent.new = &rt->dst;
1457                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1458                                                         &netevent);
1459
                                     /*
                                      * rt_del() does ip_rt_put() on rth
                                      * (presumably balancing the dst_hold()
                                      * above) and unlinks it.  On success
                                      * rt_intern_hash() stores the resulting
                                      * route back into rt, and that reference
                                      * is dropped again right away.
                                      */
1460                                 rt_del(hash, rth);
1461                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462                                         ip_rt_put(rt);
1463                                 goto do_next;
1464                         }
1465                 do_next:
1466                         ;
1467                 }
1468         }
1469         return;
1470
1471 reject_redirect:
1472 #ifdef CONFIG_IP_ROUTE_VERBOSE
1473         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1474                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1475                         "  Advised path = %pI4 -> %pI4\n",
1476                        &old_gw, dev->name, &new_gw,
1477                        &saddr, &daddr);
1478 #endif
             /* Empty statement: the label needs one when the printk is compiled out. */
1479         ;
1480 }
1481
1482 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1483 {
1484         struct rtable *rt = (struct rtable *)dst;
1485         struct dst_entry *ret = dst;
1486
1487         if (rt) {
1488                 if (dst->obsolete > 0) {
1489                         ip_rt_put(rt);
1490                         ret = NULL;
1491                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1492                            (rt->dst.expires &&
1493                             time_after_eq(jiffies, rt->dst.expires))) {
1494                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495                                                 rt->fl.oif,
1496                                                 rt_genid(dev_net(dst->dev)));
1497 #if RT_CACHE_DEBUG >= 1
1498                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1499                                 &rt->rt_dst, rt->fl.fl4_tos);
1500 #endif
1501                         rt_del(hash, rt);
1502                         ret = NULL;
1503                 }
1504         }
1505         return ret;
1506 }
1507
1508 /*
1509  * Algorithm:
1510  *      1. The first ip_rt_redirect_number redirects are sent
1511  *         with exponential backoff, then we stop sending them at all,
1512  *         assuming that the host ignores our redirects.
1513  *      2. If we did not see packets requiring redirects
1514  *         during ip_rt_redirect_silence, we assume that the host
1515  *         forgot redirected route and start to send redirects again.
1516  *
1517  * This algorithm is much cheaper and more intelligent than dumb load limiting
1518  * in icmp.c.
1519  *
1520  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1521  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1522  */
1523
1524 void ip_rt_send_redirect(struct sk_buff *skb)
1525 {
1526         struct rtable *rt = skb_rtable(skb);
1527         struct in_device *in_dev;
1528         int log_martians;
1529
1530         rcu_read_lock();
1531         in_dev = __in_dev_get_rcu(rt->dst.dev);
1532         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1533                 rcu_read_unlock();
1534                 return;
1535         }
1536         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537         rcu_read_unlock();
1538
1539         /* No redirected packets during ip_rt_redirect_silence;
1540          * reset the algorithm.
1541          */
1542         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1543                 rt->dst.rate_tokens = 0;
1544
1545         /* Too many ignored redirects; do not send anything
1546          * set dst.rate_last to the last seen redirected packet.
1547          */
1548         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1549                 rt->dst.rate_last = jiffies;
1550                 return;
1551         }
1552
1553         /* Check for load limit; set rate_last to the latest sent
1554          * redirect.
1555          */
1556         if (rt->dst.rate_tokens == 0 ||
1557             time_after(jiffies,
1558                        (rt->dst.rate_last +
1559                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1560                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561                 rt->dst.rate_last = jiffies;
1562                 ++rt->dst.rate_tokens;
1563 #ifdef CONFIG_IP_ROUTE_VERBOSE
1564                 if (log_martians &&
1565                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1566                     net_ratelimit())
1567                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568                                 &rt->rt_src, rt->rt_iif,
1569                                 &rt->rt_dst, &rt->rt_gateway);
1570 #endif
1571         }
1572 }
1573
1574 static int ip_error(struct sk_buff *skb)
1575 {
1576         struct rtable *rt = skb_rtable(skb);
1577         unsigned long now;
1578         int code;
1579
1580         switch (rt->dst.error) {
1581                 case EINVAL:
1582                 default:
1583                         goto out;
1584                 case EHOSTUNREACH:
1585                         code = ICMP_HOST_UNREACH;
1586                         break;
1587                 case ENETUNREACH:
1588                         code = ICMP_NET_UNREACH;
1589                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1590                                         IPSTATS_MIB_INNOROUTES);
1591                         break;
1592                 case EACCES:
1593                         code = ICMP_PKT_FILTERED;
1594                         break;
1595         }
1596
1597         now = jiffies;
1598         rt->dst.rate_tokens += now - rt->dst.rate_last;
1599         if (rt->dst.rate_tokens > ip_rt_error_burst)
1600                 rt->dst.rate_tokens = ip_rt_error_burst;
1601         rt->dst.rate_last = now;
1602         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1603                 rt->dst.rate_tokens -= ip_rt_error_cost;
1604                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1605         }
1606
1607 out:    kfree_skb(skb);
1608         return 0;
1609 }
1610
/*
 *	MTU plateau table, in descending order.  The last two values are
 *	not from the RFC but are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above any table entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1628
1629 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1630                                  unsigned short new_mtu,
1631                                  struct net_device *dev)
1632 {
1633         int i, k;
1634         unsigned short old_mtu = ntohs(iph->tot_len);
1635         struct rtable *rth;
1636         int  ikeys[2] = { dev->ifindex, 0 };
1637         __be32  skeys[2] = { iph->saddr, 0, };
1638         __be32  daddr = iph->daddr;
1639         unsigned short est_mtu = 0;
1640
1641         for (k = 0; k < 2; k++) {
1642                 for (i = 0; i < 2; i++) {
1643                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1644                                                 rt_genid(net));
1645
1646                         rcu_read_lock();
1647                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1648                              rth = rcu_dereference(rth->dst.rt_next)) {
1649                                 unsigned short mtu = new_mtu;
1650
1651                                 if (rth->fl.fl4_dst != daddr ||
1652                                     rth->fl.fl4_src != skeys[i] ||
1653                                     rth->rt_dst != daddr ||
1654                                     rth->rt_src != iph->saddr ||
1655                                     rth->fl.oif != ikeys[k] ||
1656                                     rt_is_input_route(rth) ||
1657                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658                                     !net_eq(dev_net(rth->dst.dev), net) ||
1659                                     rt_is_expired(rth))
1660                                         continue;
1661
1662                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1663
1664                                         /* BSD 4.2 compatibility hack :-( */
1665                                         if (mtu == 0 &&
1666                                             old_mtu >= dst_mtu(&rth->dst) &&
1667                                             old_mtu >= 68 + (iph->ihl << 2))
1668                                                 old_mtu -= iph->ihl << 2;
1669
1670                                         mtu = guess_mtu(old_mtu);
1671                                 }
1672                                 if (mtu <= dst_mtu(&rth->dst)) {
1673                                         if (mtu < dst_mtu(&rth->dst)) {
1674                                                 dst_confirm(&rth->dst);
1675                                                 if (mtu < ip_rt_min_pmtu) {
1676                                                         u32 lock = dst_metric(&rth->dst,
1677                                                                               RTAX_LOCK);
1678                                                         mtu = ip_rt_min_pmtu;
1679                                                         lock |= (1 << RTAX_MTU);
1680                                                         dst_metric_set(&rth->dst, RTAX_LOCK,
1681                                                                        lock);
1682                                                 }
1683                                                 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1684                                                 dst_set_expires(&rth->dst,
1685                                                         ip_rt_mtu_expires);
1686                                         }
1687                                         est_mtu = mtu;
1688                                 }
1689                         }
1690                         rcu_read_unlock();
1691                 }
1692         }
1693         return est_mtu ? : new_mtu;
1694 }
1695
1696 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1697 {
1698         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1699             !(dst_metric_locked(dst, RTAX_MTU))) {
1700                 if (mtu < ip_rt_min_pmtu) {
1701                         u32 lock = dst_metric(dst, RTAX_LOCK);
1702                         mtu = ip_rt_min_pmtu;
1703                         dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1704                 }
1705                 dst_metric_set(dst, RTAX_MTU, mtu);
1706                 dst_set_expires(dst, ip_rt_mtu_expires);
1707                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1708         }
1709 }
1710
1711 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1712 {
1713         if (rt_is_expired((struct rtable *)dst))
1714                 return NULL;
1715         return dst;
1716 }
1717
1718 static void ipv4_dst_destroy(struct dst_entry *dst)
1719 {
1720         struct rtable *rt = (struct rtable *) dst;
1721         struct inet_peer *peer = rt->peer;
1722
1723         if (peer) {
1724                 rt->peer = NULL;
1725                 inet_putpeer(peer);
1726         }
1727 }
1728
1729
1730 static void ipv4_link_failure(struct sk_buff *skb)
1731 {
1732         struct rtable *rt;
1733
1734         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1735
1736         rt = skb_rtable(skb);
1737         if (rt)
1738                 dst_set_expires(&rt->dst, 0);
1739 }
1740
1741 static int ip_rt_bug(struct sk_buff *skb)
1742 {
1743         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1744                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1745                 skb->dev ? skb->dev->name : "?");
1746         kfree_skb(skb);
1747         return 0;
1748 }
1749
1750 /*
1751    We do not cache source address of outgoing interface,
1752    because it is used only by IP RR, TS and SRR options,
1753    so that it out of fast path.
1754
1755    BTW remember: "addr" is allowed to be not aligned
1756    in IP options!
1757  */
1758
1759 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1760 {
1761         __be32 src;
1762         struct fib_result res;
1763
1764         if (rt_is_output_route(rt))
1765                 src = rt->rt_src;
1766         else {
1767                 rcu_read_lock();
1768                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1769                         src = FIB_RES_PREFSRC(res);
1770                 else
1771                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1772                                         RT_SCOPE_UNIVERSE);
1773                 rcu_read_unlock();
1774         }
1775         memcpy(addr, &src, 4);
1776 }
1777
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves
 * are handled independently, and a half is only filled in when it is
 * still zero, so values set earlier win.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1787
1788 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1789 {
1790         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1791
1792         if (advmss == 0) {
1793                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1794                                ip_rt_min_advmss);
1795                 if (advmss > 65535 - 40)
1796                         advmss = 65535 - 40;
1797         }
1798         return advmss;
1799 }
1800
1801 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1802 {
1803         unsigned int mtu = dst->dev->mtu;
1804
1805         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1806                 const struct rtable *rt = (const struct rtable *) dst;
1807
1808                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1809                         mtu = 576;
1810         }
1811
1812         if (mtu > IP_MAX_MTU)
1813                 mtu = IP_MAX_MTU;
1814
1815         return mtu;
1816 }
1817
1818 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1819 {
1820         struct dst_entry *dst = &rt->dst;
1821         struct fib_info *fi = res->fi;
1822
1823         if (fi) {
1824                 if (FIB_RES_GW(*res) &&
1825                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826                         rt->rt_gateway = FIB_RES_GW(*res);
1827                 dst_import_metrics(dst, fi->fib_metrics);
1828 #ifdef CONFIG_NET_CLS_ROUTE
1829                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1830 #endif
1831         }
1832
1833         if (dst_mtu(dst) > IP_MAX_MTU)
1834                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1835         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1836                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1837
1838 #ifdef CONFIG_NET_CLS_ROUTE
1839 #ifdef CONFIG_IP_MULTIPLE_TABLES
1840         set_class_tag(rt, fib_rules_tclass(res));
1841 #endif
1842         set_class_tag(rt, itag);
1843 #endif
1844         rt->rt_type = res->type;
1845 }
1846
1847 /* called in rcu_read_lock() section */
1848 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1849                                 u8 tos, struct net_device *dev, int our)
1850 {
1851         unsigned int hash;
1852         struct rtable *rth;
1853         __be32 spec_dst;
1854         struct in_device *in_dev = __in_dev_get_rcu(dev);
1855         u32 itag = 0;
1856         int err;
1857
1858         /* Primary sanity checks. */
1859
1860         if (in_dev == NULL)
1861                 return -EINVAL;
1862
1863         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1864             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1865                 goto e_inval;
1866
1867         if (ipv4_is_zeronet(saddr)) {
1868                 if (!ipv4_is_local_multicast(daddr))
1869                         goto e_inval;
1870                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1871         } else {
1872                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1873                                           &itag, 0);
1874                 if (err < 0)
1875                         goto e_err;
1876         }
1877         rth = dst_alloc(&ipv4_dst_ops);
1878         if (!rth)
1879                 goto e_nobufs;
1880
1881         rth->dst.output = ip_rt_bug;
1882         rth->dst.obsolete = -1;
1883
1884         atomic_set(&rth->dst.__refcnt, 1);
1885         rth->dst.flags= DST_HOST;
1886         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1887                 rth->dst.flags |= DST_NOPOLICY;
1888         rth->fl.fl4_dst = daddr;
1889         rth->rt_dst     = daddr;
1890         rth->fl.fl4_tos = tos;
1891         rth->fl.mark    = skb->mark;
1892         rth->fl.fl4_src = saddr;
1893         rth->rt_src     = saddr;
1894 #ifdef CONFIG_NET_CLS_ROUTE
1895         rth->dst.tclassid = itag;
1896 #endif
1897         rth->rt_iif     =
1898         rth->fl.iif     = dev->ifindex;
1899         rth->dst.dev    = init_net.loopback_dev;
1900         dev_hold(rth->dst.dev);
1901         rth->fl.oif     = 0;
1902         rth->rt_gateway = daddr;
1903         rth->rt_spec_dst= spec_dst;
1904         rth->rt_genid   = rt_genid(dev_net(dev));
1905         rth->rt_flags   = RTCF_MULTICAST;
1906         rth->rt_type    = RTN_MULTICAST;
1907         if (our) {
1908                 rth->dst.input= ip_local_deliver;
1909                 rth->rt_flags |= RTCF_LOCAL;
1910         }
1911
1912 #ifdef CONFIG_IP_MROUTE
1913         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1914                 rth->dst.input = ip_mr_input;
1915 #endif
1916         RT_CACHE_STAT_INC(in_slow_mc);
1917
1918         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1919         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1920
1921 e_nobufs:
1922         return -ENOBUFS;
1923 e_inval:
1924         return -EINVAL;
1925 e_err:
1926         return err;
1927 }
1928
1929
1930 static void ip_handle_martian_source(struct net_device *dev,
1931                                      struct in_device *in_dev,
1932                                      struct sk_buff *skb,
1933                                      __be32 daddr,
1934                                      __be32 saddr)
1935 {
1936         RT_CACHE_STAT_INC(in_martian_src);
1937 #ifdef CONFIG_IP_ROUTE_VERBOSE
1938         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1939                 /*
1940                  *      RFC1812 recommendation, if source is martian,
1941                  *      the only hint is MAC header.
1942                  */
1943                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1944                         &daddr, &saddr, dev->name);
1945                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1946                         int i;
1947                         const unsigned char *p = skb_mac_header(skb);
1948                         printk(KERN_WARNING "ll header: ");
1949                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1950                                 printk("%02x", *p);
1951                                 if (i < (dev->hard_header_len - 1))
1952                                         printk(":");
1953                         }
1954                         printk("\n");
1955                 }
1956         }
1957 #endif
1958 }
1959
1960 /* called in rcu_read_lock() section */
1961 static int __mkroute_input(struct sk_buff *skb,
1962                            struct fib_result *res,
1963                            struct in_device *in_dev,
1964                            __be32 daddr, __be32 saddr, u32 tos,
1965                            struct rtable **result)
1966 {
1967         struct rtable *rth;
1968         int err;
1969         struct in_device *out_dev;
1970         unsigned int flags = 0;
1971         __be32 spec_dst;
1972         u32 itag;
1973
1974         /* get a working reference to the output device */
1975         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1976         if (out_dev == NULL) {
1977                 if (net_ratelimit())
1978                         printk(KERN_CRIT "Bug in ip_route_input" \
1979                                "_slow(). Please, report\n");
1980                 return -EINVAL;
1981         }
1982
1983
1984         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1985                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1986         if (err < 0) {
1987                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1988                                          saddr);
1989
1990                 goto cleanup;
1991         }
1992
1993         if (err)
1994                 flags |= RTCF_DIRECTSRC;
1995
1996         if (out_dev == in_dev && err &&
1997             (IN_DEV_SHARED_MEDIA(out_dev) ||
1998              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1999                 flags |= RTCF_DOREDIRECT;
2000
2001         if (skb->protocol != htons(ETH_P_IP)) {
2002                 /* Not IP (i.e. ARP). Do not create route, if it is
2003                  * invalid for proxy arp. DNAT routes are always valid.
2004                  *
2005                  * Proxy arp feature have been extended to allow, ARP
2006                  * replies back to the same interface, to support
2007                  * Private VLAN switch technologies. See arp.c.
2008                  */
2009                 if (out_dev == in_dev &&
2010                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2011                         err = -EINVAL;
2012                         goto cleanup;
2013                 }
2014         }
2015
2016
2017         rth = dst_alloc(&ipv4_dst_ops);
2018         if (!rth) {
2019                 err = -ENOBUFS;
2020                 goto cleanup;
2021         }
2022
2023         atomic_set(&rth->dst.__refcnt, 1);
2024         rth->dst.flags= DST_HOST;
2025         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026                 rth->dst.flags |= DST_NOPOLICY;
2027         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028                 rth->dst.flags |= DST_NOXFRM;
2029         rth->fl.fl4_dst = daddr;
2030         rth->rt_dst     = daddr;
2031         rth->fl.fl4_tos = tos;
2032         rth->fl.mark    = skb->mark;
2033         rth->fl.fl4_src = saddr;
2034         rth->rt_src     = saddr;
2035         rth->rt_gateway = daddr;
2036         rth->rt_iif     =
2037                 rth->fl.iif     = in_dev->dev->ifindex;
2038         rth->dst.dev    = (out_dev)->dev;
2039         dev_hold(rth->dst.dev);
2040         rth->fl.oif     = 0;
2041         rth->rt_spec_dst= spec_dst;
2042
2043         rth->dst.obsolete = -1;
2044         rth->dst.input = ip_forward;
2045         rth->dst.output = ip_output;
2046         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2047
2048         rt_set_nexthop(rth, res, itag);
2049
2050         rth->rt_flags = flags;
2051
2052         *result = rth;
2053         err = 0;
2054  cleanup:
2055         return err;
2056 }
2057
2058 static int ip_mkroute_input(struct sk_buff *skb,
2059                             struct fib_result *res,
2060                             const struct flowi *fl,
2061                             struct in_device *in_dev,
2062                             __be32 daddr, __be32 saddr, u32 tos)
2063 {
2064         struct rtable* rth = NULL;
2065         int err;
2066         unsigned hash;
2067
2068 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2069         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2070                 fib_select_multipath(fl, res);
2071 #endif
2072
2073         /* create a routing cache entry */
2074         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2075         if (err)
2076                 return err;
2077
2078         /* put it into the cache */
2079         hash = rt_hash(daddr, saddr, fl->iif,
2080                        rt_genid(dev_net(rth->dst.dev)));
2081         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2082 }
2083
2084 /*
2085  *      NOTE. We drop all the packets that has local source
2086  *      addresses, because every properly looped back packet
2087  *      must have correct destination already attached by output routine.
2088  *
2089  *      Such approach solves two big problems:
2090  *      1. Not simplex devices are handled properly.
2091  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2092  *      called with rcu_read_lock()
2093  */
2094
2095 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096                                u8 tos, struct net_device *dev)
2097 {
2098         struct fib_result res;
2099         struct in_device *in_dev = __in_dev_get_rcu(dev);
2100         struct flowi fl = { .fl4_dst    = daddr,
2101                             .fl4_src    = saddr,
2102                             .fl4_tos    = tos,
2103                             .fl4_scope  = RT_SCOPE_UNIVERSE,
2104                             .mark = skb->mark,
2105                             .iif = dev->ifindex };
2106         unsigned        flags = 0;
2107         u32             itag = 0;
2108         struct rtable * rth;
2109         unsigned        hash;
2110         __be32          spec_dst;
2111         int             err = -EINVAL;
2112         struct net    * net = dev_net(dev);
2113
2114         /* IP on this device is disabled. */
2115
2116         if (!in_dev)
2117                 goto out;
2118
2119         /* Check for the most weird martians, which can be not detected
2120            by fib_lookup.
2121          */
2122
2123         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2124             ipv4_is_loopback(saddr))
2125                 goto martian_source;
2126
2127         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2128                 goto brd_input;
2129
2130         /* Accept zero addresses only to limited broadcast;
2131          * I even do not know to fix it or not. Waiting for complains :-)
2132          */
2133         if (ipv4_is_zeronet(saddr))
2134                 goto martian_source;
2135
2136         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2137                 goto martian_destination;
2138
2139         /*
2140          *      Now we are ready to route packet.
2141          */
2142         err = fib_lookup(net, &fl, &res);
2143         if (err != 0) {
2144                 if (!IN_DEV_FORWARD(in_dev))
2145                         goto e_hostunreach;
2146                 goto no_route;
2147         }
2148
2149         RT_CACHE_STAT_INC(in_slow_tot);
2150
2151         if (res.type == RTN_BROADCAST)
2152                 goto brd_input;
2153
2154         if (res.type == RTN_LOCAL) {
2155                 err = fib_validate_source(saddr, daddr, tos,
2156                                           net->loopback_dev->ifindex,
2157                                           dev, &spec_dst, &itag, skb->mark);
2158                 if (err < 0)
2159                         goto martian_source_keep_err;
2160                 if (err)
2161                         flags |= RTCF_DIRECTSRC;
2162                 spec_dst = daddr;
2163                 goto local_input;
2164         }
2165
2166         if (!IN_DEV_FORWARD(in_dev))
2167                 goto e_hostunreach;
2168         if (res.type != RTN_UNICAST)
2169                 goto martian_destination;
2170
2171         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2172 out:    return err;
2173
2174 brd_input:
2175         if (skb->protocol != htons(ETH_P_IP))
2176                 goto e_inval;
2177
2178         if (ipv4_is_zeronet(saddr))
2179                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2180         else {
2181                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2182                                           &itag, skb->mark);
2183                 if (err < 0)
2184                         goto martian_source_keep_err;
2185                 if (err)
2186                         flags |= RTCF_DIRECTSRC;
2187         }
2188         flags |= RTCF_BROADCAST;
2189         res.type = RTN_BROADCAST;
2190         RT_CACHE_STAT_INC(in_brd);
2191
2192 local_input:
2193         rth = dst_alloc(&ipv4_dst_ops);
2194         if (!rth)
2195                 goto e_nobufs;
2196
2197         rth->dst.output= ip_rt_bug;
2198         rth->dst.obsolete = -1;
2199         rth->rt_genid = rt_genid(net);
2200
2201         atomic_set(&rth->dst.__refcnt, 1);
2202         rth->dst.flags= DST_HOST;
2203         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2204                 rth->dst.flags |= DST_NOPOLICY;
2205         rth->fl.fl4_dst = daddr;
2206         rth->rt_dst     = daddr;
2207         rth->fl.fl4_tos = tos;
2208         rth->fl.mark    = skb->mark;
2209         rth->fl.fl4_src = saddr;
2210         rth->rt_src     = saddr;
2211 #ifdef CONFIG_NET_CLS_ROUTE
2212         rth->dst.tclassid = itag;
2213 #endif
2214         rth->rt_iif     =
2215         rth->fl.iif     = dev->ifindex;
2216         rth->dst.dev    = net->loopback_dev;
2217         dev_hold(rth->dst.dev);
2218         rth->rt_gateway = daddr;
2219         rth->rt_spec_dst= spec_dst;
2220         rth->dst.input= ip_local_deliver;
2221         rth->rt_flags   = flags|RTCF_LOCAL;
2222         if (res.type == RTN_UNREACHABLE) {
2223                 rth->dst.input= ip_error;
2224                 rth->dst.error= -err;
2225                 rth->rt_flags   &= ~RTCF_LOCAL;
2226         }
2227         rth->rt_type    = res.type;
2228         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2229         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2230         goto out;
2231
2232 no_route:
2233         RT_CACHE_STAT_INC(in_no_route);
2234         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2235         res.type = RTN_UNREACHABLE;
2236         if (err == -ESRCH)
2237                 err = -ENETUNREACH;
2238         goto local_input;
2239
2240         /*
2241          *      Do not cache martian addresses: they should be logged (RFC1812)
2242          */
2243 martian_destination:
2244         RT_CACHE_STAT_INC(in_martian_dst);
2245 #ifdef CONFIG_IP_ROUTE_VERBOSE
2246         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2247                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2248                         &daddr, &saddr, dev->name);
2249 #endif
2250
2251 e_hostunreach:
2252         err = -EHOSTUNREACH;
2253         goto out;
2254
2255 e_inval:
2256         err = -EINVAL;
2257         goto out;
2258
2259 e_nobufs:
2260         err = -ENOBUFS;
2261         goto out;
2262
2263 martian_source:
2264         err = -EINVAL;
2265 martian_source_keep_err:
2266         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2267         goto out;
2268 }
2269
2270 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2271                            u8 tos, struct net_device *dev, bool noref)
2272 {
2273         struct rtable * rth;
2274         unsigned        hash;
2275         int iif = dev->ifindex;
2276         struct net *net;
2277         int res;
2278
2279         net = dev_net(dev);
2280
2281         rcu_read_lock();
2282
2283         if (!rt_caching(net))
2284                 goto skip_cache;
2285
2286         tos &= IPTOS_RT_MASK;
2287         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2288
2289         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2290              rth = rcu_dereference(rth->dst.rt_next)) {
2291                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2292                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2293                      (rth->fl.iif ^ iif) |
2294                      rth->fl.oif |
2295                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2296                     rth->fl.mark == skb->mark &&
2297                     net_eq(dev_net(rth->dst.dev), net) &&
2298                     !rt_is_expired(rth)) {
2299                         if (noref) {
2300                                 dst_use_noref(&rth->dst, jiffies);
2301                                 skb_dst_set_noref(skb, &rth->dst);
2302                         } else {
2303                                 dst_use(&rth->dst, jiffies);
2304                                 skb_dst_set(skb, &rth->dst);
2305                         }
2306                         RT_CACHE_STAT_INC(in_hit);
2307                         rcu_read_unlock();
2308                         return 0;
2309                 }
2310                 RT_CACHE_STAT_INC(in_hlist_search);
2311         }
2312
2313 skip_cache:
2314         /* Multicast recognition logic is moved from route cache to here.
2315            The problem was that too many Ethernet cards have broken/missing
2316            hardware multicast filters :-( As result the host on multicasting
2317            network acquires a lot of useless route cache entries, sort of
2318            SDR messages from all the world. Now we try to get rid of them.
2319            Really, provided software IP multicast filter is organized
2320            reasonably (at least, hashed), it does not result in a slowdown
2321            comparing with route cache reject entries.
2322            Note, that multicast routers are not affected, because
2323            route cache entry is created eventually.
2324          */
2325         if (ipv4_is_multicast(daddr)) {
2326                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2327
2328                 if (in_dev) {
2329                         int our = ip_check_mc(in_dev, daddr, saddr,
2330                                               ip_hdr(skb)->protocol);
2331                         if (our
2332 #ifdef CONFIG_IP_MROUTE
2333                                 ||
2334                             (!ipv4_is_local_multicast(daddr) &&
2335                              IN_DEV_MFORWARD(in_dev))
2336 #endif
2337                            ) {
2338                                 int res = ip_route_input_mc(skb, daddr, saddr,
2339                                                             tos, dev, our);
2340                                 rcu_read_unlock();
2341                                 return res;
2342                         }
2343                 }
2344                 rcu_read_unlock();
2345                 return -EINVAL;
2346         }
2347         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2348         rcu_read_unlock();
2349         return res;
2350 }
2351 EXPORT_SYMBOL(ip_route_input_common);
2352
2353 /* called with rcu_read_lock() */
2354 static int __mkroute_output(struct rtable **result,
2355                             struct fib_result *res,
2356                             const struct flowi *fl,
2357                             const struct flowi *oldflp,
2358                             struct net_device *dev_out,
2359                             unsigned flags)
2360 {
2361         struct rtable *rth;
2362         struct in_device *in_dev;
2363         u32 tos = RT_FL_TOS(oldflp);
2364
2365         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2366                 return -EINVAL;
2367
2368         if (ipv4_is_lbcast(fl->fl4_dst))
2369                 res->type = RTN_BROADCAST;
2370         else if (ipv4_is_multicast(fl->fl4_dst))
2371                 res->type = RTN_MULTICAST;
2372         else if (ipv4_is_zeronet(fl->fl4_dst))
2373                 return -EINVAL;
2374
2375         if (dev_out->flags & IFF_LOOPBACK)
2376                 flags |= RTCF_LOCAL;
2377
2378         in_dev = __in_dev_get_rcu(dev_out);
2379         if (!in_dev)
2380                 return -EINVAL;
2381
2382         if (res->type == RTN_BROADCAST) {
2383                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384                 res->fi = NULL;
2385         } else if (res->type == RTN_MULTICAST) {
2386                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2387                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2388                                  oldflp->proto))
2389                         flags &= ~RTCF_LOCAL;
2390                 /* If multicast route do not exist use
2391                  * default one, but do not gateway in this case.
2392                  * Yes, it is hack.
2393                  */
2394                 if (res->fi && res->prefixlen < 4)
2395                         res->fi = NULL;
2396         }
2397
2398
2399         rth = dst_alloc(&ipv4_dst_ops);
2400         if (!rth)
2401                 return -ENOBUFS;
2402
2403         atomic_set(&rth->dst.__refcnt, 1);
2404         rth->dst.flags= DST_HOST;
2405         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2406                 rth->dst.flags |= DST_NOXFRM;
2407         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2408                 rth->dst.flags |= DST_NOPOLICY;
2409
2410         rth->fl.fl4_dst = oldflp->fl4_dst;
2411         rth->fl.fl4_tos = tos;
2412         rth->fl.fl4_src = oldflp->fl4_src;
2413         rth->fl.oif     = oldflp->oif;
2414         rth->fl.mark    = oldflp->mark;
2415         rth->rt_dst     = fl->fl4_dst;
2416         rth->rt_src     = fl->fl4_src;
2417         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2418         /* get references to the devices that are to be hold by the routing
2419            cache entry */
2420         rth->dst.dev    = dev_out;
2421         dev_hold(dev_out);
2422         rth->rt_gateway = fl->fl4_dst;
2423         rth->rt_spec_dst= fl->fl4_src;
2424
2425         rth->dst.output=ip_output;
2426         rth->dst.obsolete = -1;
2427         rth->rt_genid = rt_genid(dev_net(dev_out));
2428
2429         RT_CACHE_STAT_INC(out_slow_tot);
2430
2431         if (flags & RTCF_LOCAL) {
2432                 rth->dst.input = ip_local_deliver;
2433                 rth->rt_spec_dst = fl->fl4_dst;
2434         }
2435         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2436                 rth->rt_spec_dst = fl->fl4_src;
2437                 if (flags & RTCF_LOCAL &&
2438                     !(dev_out->flags & IFF_LOOPBACK)) {
2439                         rth->dst.output = ip_mc_output;
2440                         RT_CACHE_STAT_INC(out_slow_mc);
2441                 }
2442 #ifdef CONFIG_IP_MROUTE
2443                 if (res->type == RTN_MULTICAST) {
2444                         if (IN_DEV_MFORWARD(in_dev) &&
2445                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2446                                 rth->dst.input = ip_mr_input;
2447                                 rth->dst.output = ip_mc_output;
2448                         }
2449                 }
2450 #endif
2451         }
2452
2453         rt_set_nexthop(rth, res, 0);
2454
2455         rth->rt_flags = flags;
2456         *result = rth;
2457         return 0;
2458 }
2459
2460 /* called with rcu_read_lock() */
2461 static int ip_mkroute_output(struct rtable **rp,
2462                              struct fib_result *res,
2463                              const struct flowi *fl,
2464                              const struct flowi *oldflp,
2465                              struct net_device *dev_out,
2466                              unsigned flags)
2467 {
2468         struct rtable *rth = NULL;
2469         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2470         unsigned hash;
2471         if (err == 0) {
2472                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2473                                rt_genid(dev_net(dev_out)));
2474                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2475         }
2476
2477         return err;
2478 }
2479
2480 /*
2481  * Major route resolver routine.
2482  * called with rcu_read_lock();
2483  */
2484
2485 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2486                                 const struct flowi *oldflp)
2487 {
2488         u32 tos = RT_FL_TOS(oldflp);
2489         struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2490                             .fl4_src = oldflp->fl4_src,
2491                             .fl4_tos = tos & IPTOS_RT_MASK,
2492                             .fl4_scope = ((tos & RTO_ONLINK) ?
2493                                           RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2494                             .mark = oldflp->mark,
2495                             .iif = net->loopback_dev->ifindex,
2496                             .oif = oldflp->oif };
2497         struct fib_result res;
2498         unsigned int flags = 0;
2499         struct net_device *dev_out = NULL;
2500         int err;
2501
2502
2503         res.fi          = NULL;
2504 #ifdef CONFIG_IP_MULTIPLE_TABLES
2505         res.r           = NULL;
2506 #endif
2507
2508         if (oldflp->fl4_src) {
2509                 err = -EINVAL;
2510                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2511                     ipv4_is_lbcast(oldflp->fl4_src) ||
2512                     ipv4_is_zeronet(oldflp->fl4_src))
2513                         goto out;
2514
2515                 /* I removed check for oif == dev_out->oif here.
2516                    It was wrong for two reasons:
2517                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2518                       is assigned to multiple interfaces.
2519                    2. Moreover, we are allowed to send packets with saddr
2520                       of another iface. --ANK
2521                  */
2522
2523                 if (oldflp->oif == 0 &&
2524                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2525                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2526                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2527                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2528                         if (dev_out == NULL)
2529                                 goto out;
2530
2531                         /* Special hack: user can direct multicasts
2532                            and limited broadcast via necessary interface
2533                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2534                            This hack is not just for fun, it allows
2535                            vic,vat and friends to work.
2536                            They bind socket to loopback, set ttl to zero
2537                            and expect that it will work.
2538                            From the viewpoint of routing cache they are broken,
2539                            because we are not allowed to build multicast path
2540                            with loopback source addr (look, routing cache
2541                            cannot know, that ttl is zero, so that packet
2542                            will not leave this host and route is valid).
2543                            Luckily, this hack is good workaround.
2544                          */
2545
2546                         fl.oif = dev_out->ifindex;
2547                         goto make_route;
2548                 }
2549
2550                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2551                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2552                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2553                                 goto out;
2554                 }
2555         }
2556
2557
2558         if (oldflp->oif) {
2559                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2560                 err = -ENODEV;
2561                 if (dev_out == NULL)
2562                         goto out;
2563
2564                 /* RACE: Check return value of inet_select_addr instead. */
2565                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2566                         err = -ENETUNREACH;
2567                         goto out;
2568                 }
2569                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2570                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2571                         if (!fl.fl4_src)
2572                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2573                                                               RT_SCOPE_LINK);
2574                         goto make_route;
2575                 }
2576                 if (!fl.fl4_src) {
2577                         if (ipv4_is_multicast(oldflp->fl4_dst))
2578                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2579                                                               fl.fl4_scope);
2580                         else if (!oldflp->fl4_dst)
2581                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2582                                                               RT_SCOPE_HOST);
2583                 }
2584         }
2585
2586         if (!fl.fl4_dst) {
2587                 fl.fl4_dst = fl.fl4_src;
2588                 if (!fl.fl4_dst)
2589                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2590                 dev_out = net->loopback_dev;
2591                 fl.oif = net->loopback_dev->ifindex;
2592                 res.type = RTN_LOCAL;
2593                 flags |= RTCF_LOCAL;
2594                 goto make_route;
2595         }
2596
2597         if (fib_lookup(net, &fl, &res)) {
2598                 res.fi = NULL;
2599                 if (oldflp->oif) {
2600                         /* Apparently, routing tables are wrong. Assume,
2601                            that the destination is on link.
2602
2603                            WHY? DW.
2604                            Because we are allowed to send to iface
2605                            even if it has NO routes and NO assigned
2606                            addresses. When oif is specified, routing
2607                            tables are looked up with only one purpose:
2608                            to catch if destination is gatewayed, rather than
2609                            direct. Moreover, if MSG_DONTROUTE is set,
2610                            we send packet, ignoring both routing tables
2611                            and ifaddr state. --ANK
2612
2613
2614                            We could make it even if oif is unknown,
2615                            likely IPv6, but we do not.
2616                          */
2617
2618                         if (fl.fl4_src == 0)
2619                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2620                                                               RT_SCOPE_LINK);
2621                         res.type = RTN_UNICAST;
2622                         goto make_route;
2623                 }
2624                 err = -ENETUNREACH;
2625                 goto out;
2626         }
2627
2628         if (res.type == RTN_LOCAL) {
2629                 if (!fl.fl4_src) {
2630                         if (res.fi->fib_prefsrc)
2631                                 fl.fl4_src = res.fi->fib_prefsrc;
2632                         else
2633                                 fl.fl4_src = fl.fl4_dst;
2634                 }
2635                 dev_out = net->loopback_dev;
2636                 fl.oif = dev_out->ifindex;
2637                 res.fi = NULL;
2638                 flags |= RTCF_LOCAL;
2639                 goto make_route;
2640         }
2641
2642 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2643         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2644                 fib_select_multipath(&fl, &res);
2645         else
2646 #endif
2647         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2648                 fib_select_default(net, &fl, &res);
2649
2650         if (!fl.fl4_src)
2651                 fl.fl4_src = FIB_RES_PREFSRC(res);
2652
2653         dev_out = FIB_RES_DEV(res);
2654         fl.oif = dev_out->ifindex;
2655
2656
2657 make_route:
2658         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2659
2660 out:    return err;
2661 }
2662
2663 int __ip_route_output_key(struct net *net, struct rtable **rp,
2664                           const struct flowi *flp)
2665 {
2666         unsigned int hash;
2667         int res;
2668         struct rtable *rth;
2669
2670         if (!rt_caching(net))
2671                 goto slow_output;
2672
2673         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2674
2675         rcu_read_lock_bh();
2676         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2677                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2678                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2679                     rth->fl.fl4_src == flp->fl4_src &&
2680                     rt_is_output_route(rth) &&
2681                     rth->fl.oif == flp->oif &&
2682                     rth->fl.mark == flp->mark &&
2683                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2684                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2685                     net_eq(dev_net(rth->dst.dev), net) &&
2686                     !rt_is_expired(rth)) {
2687                         dst_use(&rth->dst, jiffies);
2688                         RT_CACHE_STAT_INC(out_hit);
2689                         rcu_read_unlock_bh();
2690                         *rp = rth;
2691                         return 0;
2692                 }
2693                 RT_CACHE_STAT_INC(out_hlist_search);
2694         }
2695         rcu_read_unlock_bh();
2696
2697 slow_output:
2698         rcu_read_lock();
2699         res = ip_route_output_slow(net, rp, flp);
2700         rcu_read_unlock();
2701         return res;
2702 }
2703 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2704
2705 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2706 {
2707         return NULL;
2708 }
2709
/* dst_ops->default_mtu hook for blackhole routes: always reports 0. */
static unsigned int
ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
        return 0;
}
2714
2715 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2716 {
2717 }
2718
2719 static struct dst_ops ipv4_dst_blackhole_ops = {
2720         .family                 =       AF_INET,
2721         .protocol               =       cpu_to_be16(ETH_P_IP),
2722         .destroy                =       ipv4_dst_destroy,
2723         .check                  =       ipv4_blackhole_dst_check,
2724         .default_mtu            =       ipv4_blackhole_default_mtu,
2725         .default_advmss         =       ipv4_default_advmss,
2726         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2727 };
2728
2729
2730 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2731 {
2732         struct rtable *ort = *rp;
2733         struct rtable *rt = (struct rtable *)
2734                 dst_alloc(&ipv4_dst_blackhole_ops);
2735
2736         if (rt) {
2737                 struct dst_entry *new = &rt->dst;
2738
2739                 atomic_set(&new->__refcnt, 1);
2740                 new->__use = 1;
2741                 new->input = dst_discard;
2742                 new->output = dst_discard;
2743                 dst_copy_metrics(new, &ort->dst);
2744
2745                 new->dev = ort->dst.dev;
2746                 if (new->dev)
2747                         dev_hold(new->dev);
2748
2749                 rt->fl = ort->fl;
2750
2751                 rt->rt_genid = rt_genid(net);
2752                 rt->rt_flags = ort->rt_flags;
2753                 rt->rt_type = ort->rt_type;
2754                 rt->rt_dst = ort->rt_dst;
2755                 rt->rt_src = ort->rt_src;
2756                 rt->rt_iif = ort->rt_iif;
2757                 rt->rt_gateway = ort->rt_gateway;
2758                 rt->rt_spec_dst = ort->rt_spec_dst;
2759                 rt->peer = ort->peer;
2760                 if (rt->peer)
2761                         atomic_inc(&rt->peer->refcnt);
2762
2763                 dst_free(new);
2764         }
2765
2766         dst_release(&(*rp)->dst);
2767         *rp = rt;
2768         return rt ? 0 : -ENOMEM;
2769 }
2770
2771 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2772                          struct sock *sk, int flags)
2773 {
2774         int err;
2775
2776         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2777                 return err;
2778
2779         if (flp->proto) {
2780                 if (!flp->fl4_src)
2781                         flp->fl4_src = (*rp)->rt_src;
2782                 if (!flp->fl4_dst)
2783                         flp->fl4_dst = (*rp)->rt_dst;
2784                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2785                                     flags ? XFRM_LOOKUP_WAIT : 0);
2786                 if (err == -EREMOTE)
2787                         err = ipv4_dst_blackhole(net, rp, flp);
2788
2789                 return err;
2790         }
2791
2792         return 0;
2793 }
2794 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2795
2796 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2797 {
2798         return ip_route_output_flow(net, rp, flp, NULL, 0);
2799 }
2800 EXPORT_SYMBOL(ip_route_output_key);
2801
2802 static int rt_fill_info(struct net *net,
2803                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2804                         int nowait, unsigned int flags)
2805 {
2806         struct rtable *rt = skb_rtable(skb);
2807         struct rtmsg *r;
2808         struct nlmsghdr *nlh;
2809         long expires;
2810         u32 id = 0, ts = 0, tsage = 0, error;
2811
2812         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2813         if (nlh == NULL)
2814                 return -EMSGSIZE;
2815
2816         r = nlmsg_data(nlh);
2817         r->rtm_family    = AF_INET;
2818         r->rtm_dst_len  = 32;
2819         r->rtm_src_len  = 0;
2820         r->rtm_tos      = rt->fl.fl4_tos;
2821         r->rtm_table    = RT_TABLE_MAIN;
2822         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2823         r->rtm_type     = rt->rt_type;
2824         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2825         r->rtm_protocol = RTPROT_UNSPEC;
2826         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2827         if (rt->rt_flags & RTCF_NOTIFY)
2828                 r->rtm_flags |= RTM_F_NOTIFY;
2829
2830         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2831
2832         if (rt->fl.fl4_src) {
2833                 r->rtm_src_len = 32;
2834                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2835         }
2836         if (rt->dst.dev)
2837                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2838 #ifdef CONFIG_NET_CLS_ROUTE
2839         if (rt->dst.tclassid)
2840                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2841 #endif
2842         if (rt_is_input_route(rt))
2843                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2844         else if (rt->rt_src != rt->fl.fl4_src)
2845                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2846
2847         if (rt->rt_dst != rt->rt_gateway)
2848                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2849
2850         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2851                 goto nla_put_failure;
2852
2853         if (rt->fl.mark)
2854                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2855
2856         error = rt->dst.error;
2857         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2858         if (rt->peer) {
2859                 inet_peer_refcheck(rt->peer);
2860                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2861                 if (rt->peer->tcp_ts_stamp) {
2862                         ts = rt->peer->tcp_ts;
2863                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2864                 }
2865         }
2866
2867         if (rt_is_input_route(rt)) {
2868 #ifdef CONFIG_IP_MROUTE
2869                 __be32 dst = rt->rt_dst;
2870
2871                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2872                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2873                         int err = ipmr_get_route(net, skb, r, nowait);
2874                         if (err <= 0) {
2875                                 if (!nowait) {
2876                                         if (err == 0)
2877                                                 return 0;
2878                                         goto nla_put_failure;
2879                                 } else {
2880                                         if (err == -EMSGSIZE)
2881                                                 goto nla_put_failure;
2882                                         error = err;
2883                                 }
2884                         }
2885                 } else
2886 #endif
2887                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2888         }
2889
2890         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2891                                expires, error) < 0)
2892                 goto nla_put_failure;
2893
2894         return nlmsg_end(skb, nlh);
2895
2896 nla_put_failure:
2897         nlmsg_cancel(skb, nlh);
2898         return -EMSGSIZE;
2899 }
2900
2901 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2902 {
2903         struct net *net = sock_net(in_skb->sk);
2904         struct rtmsg *rtm;
2905         struct nlattr *tb[RTA_MAX+1];
2906         struct rtable *rt = NULL;
2907         __be32 dst = 0;
2908         __be32 src = 0;
2909         u32 iif;
2910         int err;
2911         int mark;
2912         struct sk_buff *skb;
2913
2914         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2915         if (err < 0)
2916                 goto errout;
2917
2918         rtm = nlmsg_data(nlh);
2919
2920         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2921         if (skb == NULL) {
2922                 err = -ENOBUFS;
2923                 goto errout;
2924         }
2925
2926         /* Reserve room for dummy headers, this skb can pass
2927            through good chunk of routing engine.
2928          */
2929         skb_reset_mac_header(skb);
2930         skb_reset_network_header(skb);
2931
2932         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2933         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2934         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2935
2936         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2937         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2938         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2939         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2940
2941         if (iif) {
2942                 struct net_device *dev;
2943
2944                 dev = __dev_get_by_index(net, iif);
2945                 if (dev == NULL) {
2946                         err = -ENODEV;
2947                         goto errout_free;
2948                 }
2949
2950                 skb->protocol   = htons(ETH_P_IP);
2951                 skb->dev        = dev;
2952                 skb->mark       = mark;
2953                 local_bh_disable();
2954                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2955                 local_bh_enable();
2956
2957                 rt = skb_rtable(skb);
2958                 if (err == 0 && rt->dst.error)
2959                         err = -rt->dst.error;
2960         } else {
2961                 struct flowi fl = {
2962                         .fl4_dst = dst,
2963                         .fl4_src = src,
2964                         .fl4_tos = rtm->rtm_tos,
2965                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2966                         .mark = mark,
2967                 };
2968                 err = ip_route_output_key(net, &rt, &fl);
2969         }
2970
2971         if (err)
2972                 goto errout_free;
2973
2974         skb_dst_set(skb, &rt->dst);
2975         if (rtm->rtm_flags & RTM_F_NOTIFY)
2976                 rt->rt_flags |= RTCF_NOTIFY;
2977
2978         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2979                            RTM_NEWROUTE, 0, 0);
2980         if (err <= 0)
2981                 goto errout_free;
2982
2983         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2984 errout:
2985         return err;
2986
2987 errout_free:
2988         kfree_skb(skb);
2989         goto errout;
2990 }
2991
2992 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2993 {
2994         struct rtable *rt;
2995         int h, s_h;
2996         int idx, s_idx;
2997         struct net *net;
2998
2999         net = sock_net(skb->sk);
3000
3001         s_h = cb->args[0];
3002         if (s_h < 0)
3003                 s_h = 0;
3004         s_idx = idx = cb->args[1];
3005         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3006                 if (!rt_hash_table[h].chain)
3007                         continue;
3008                 rcu_read_lock_bh();
3009                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3010                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3011                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3012                                 continue;
3013                         if (rt_is_expired(rt))
3014                                 continue;
3015                         skb_dst_set_noref(skb, &rt->dst);
3016                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3017                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3018                                          1, NLM_F_MULTI) <= 0) {
3019                                 skb_dst_drop(skb);
3020                                 rcu_read_unlock_bh();
3021                                 goto done;
3022                         }
3023                         skb_dst_drop(skb);
3024                 }
3025                 rcu_read_unlock_bh();
3026         }
3027
3028 done:
3029         cb->args[0] = h;
3030         cb->args[1] = idx;
3031         return skb->len;
3032 }
3033
3034 void ip_rt_multicast_event(struct in_device *in_dev)
3035 {
3036         rt_cache_flush(dev_net(in_dev->dev), 0);
3037 }
3038
3039 #ifdef CONFIG_SYSCTL
3040 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3041                                         void __user *buffer,
3042                                         size_t *lenp, loff_t *ppos)
3043 {
3044         if (write) {
3045                 int flush_delay;
3046                 ctl_table ctl;
3047                 struct net *net;
3048
3049                 memcpy(&ctl, __ctl, sizeof(ctl));
3050                 ctl.data = &flush_delay;
3051                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3052
3053                 net = (struct net *)__ctl->extra1;
3054                 rt_cache_flush(net, flush_delay);
3055                 return 0;
3056         }
3057
3058         return -EINVAL;
3059 }
3060
3061 static ctl_table ipv4_route_table[] = {
3062         {
3063                 .procname       = "gc_thresh",
3064                 .data           = &ipv4_dst_ops.gc_thresh,
3065                 .maxlen         = sizeof(int),
3066                 .mode           = 0644,
3067                 .proc_handler   = proc_dointvec,
3068         },
3069         {
3070                 .procname       = "max_size",
3071                 .data           = &ip_rt_max_size,
3072                 .maxlen         = sizeof(int),
3073                 .mode           = 0644,
3074                 .proc_handler   = proc_dointvec,
3075         },
3076         {
3077                 /*  Deprecated. Use gc_min_interval_ms */
3078
3079                 .procname       = "gc_min_interval",
3080                 .data           = &ip_rt_gc_min_interval,
3081                 .maxlen         = sizeof(int),
3082                 .mode           = 0644,
3083                 .proc_handler   = proc_dointvec_jiffies,
3084         },
3085         {
3086                 .procname       = "gc_min_interval_ms",
3087                 .data           = &ip_rt_gc_min_interval,
3088                 .maxlen         = sizeof(int),
3089                 .mode           = 0644,
3090                 .proc_handler   = proc_dointvec_ms_jiffies,
3091         },
3092         {
3093                 .procname       = "gc_timeout",
3094                 .data           = &ip_rt_gc_timeout,
3095                 .maxlen         = sizeof(int),
3096                 .mode           = 0644,
3097                 .proc_handler   = proc_dointvec_jiffies,
3098         },
3099         {
3100                 .procname       = "gc_interval",
3101                 .data           = &ip_rt_gc_interval,
3102                 .maxlen         = sizeof(int),
3103                 .mode           = 0644,
3104                 .proc_handler   = proc_dointvec_jiffies,
3105         },
3106         {
3107                 .procname       = "redirect_load",
3108                 .data           = &ip_rt_redirect_load,
3109                 .maxlen         = sizeof(int),
3110                 .mode           = 0644,
3111                 .proc_handler   = proc_dointvec,
3112         },
3113         {
3114                 .procname       = "redirect_number",
3115                 .data           = &ip_rt_redirect_number,
3116                 .maxlen         = sizeof(int),
3117                 .mode           = 0644,
3118                 .proc_handler   = proc_dointvec,
3119         },
3120         {
3121                 .procname       = "redirect_silence",
3122                 .data           = &ip_rt_redirect_silence,
3123                 .maxlen         = sizeof(int),
3124                 .mode           = 0644,
3125                 .proc_handler   = proc_dointvec,
3126         },
3127         {
3128                 .procname       = "error_cost",
3129                 .data           = &ip_rt_error_cost,
3130                 .maxlen         = sizeof(int),
3131                 .mode           = 0644,
3132                 .proc_handler   = proc_dointvec,
3133         },
3134         {
3135                 .procname       = "error_burst",
3136                 .data           = &ip_rt_error_burst,
3137                 .maxlen         = sizeof(int),
3138                 .mode           = 0644,
3139                 .proc_handler   = proc_dointvec,
3140         },
3141         {
3142                 .procname       = "gc_elasticity",
3143                 .data           = &ip_rt_gc_elasticity,
3144                 .maxlen         = sizeof(int),
3145                 .mode           = 0644,
3146                 .proc_handler   = proc_dointvec,
3147         },
3148         {
3149                 .procname       = "mtu_expires",
3150                 .data           = &ip_rt_mtu_expires,
3151                 .maxlen         = sizeof(int),
3152                 .mode           = 0644,
3153                 .proc_handler   = proc_dointvec_jiffies,
3154         },
3155         {
3156                 .procname       = "min_pmtu",
3157                 .data           = &ip_rt_min_pmtu,
3158                 .maxlen         = sizeof(int),
3159                 .mode           = 0644,
3160                 .proc_handler   = proc_dointvec,
3161         },
3162         {
3163                 .procname       = "min_adv_mss",
3164                 .data           = &ip_rt_min_advmss,
3165                 .maxlen         = sizeof(int),
3166                 .mode           = 0644,
3167                 .proc_handler   = proc_dointvec,
3168         },
3169         { }
3170 };
3171
3172 static struct ctl_table empty[1];
3173
3174 static struct ctl_table ipv4_skeleton[] =
3175 {
3176         { .procname = "route",
3177           .mode = 0555, .child = ipv4_route_table},
3178         { .procname = "neigh",
3179           .mode = 0555, .child = empty},
3180         { }
3181 };
3182
3183 static __net_initdata struct ctl_path ipv4_path[] = {
3184         { .procname = "net", },
3185         { .procname = "ipv4", },
3186         { },
3187 };
3188
3189 static struct ctl_table ipv4_route_flush_table[] = {
3190         {
3191                 .procname       = "flush",
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0200,
3194                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3195         },
3196         { },
3197 };
3198
3199 static __net_initdata struct ctl_path ipv4_route_path[] = {
3200         { .procname = "net", },
3201         { .procname = "ipv4", },
3202         { .procname = "route", },
3203         { },
3204 };
3205
3206 static __net_init int sysctl_route_net_init(struct net *net)
3207 {
3208         struct ctl_table *tbl;
3209
3210         tbl = ipv4_route_flush_table;
3211         if (!net_eq(net, &init_net)) {
3212                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3213                 if (tbl == NULL)
3214                         goto err_dup;
3215         }
3216         tbl[0].extra1 = net;
3217
3218         net->ipv4.route_hdr =
3219                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3220         if (net->ipv4.route_hdr == NULL)
3221                 goto err_reg;
3222         return 0;
3223
3224 err_reg:
3225         if (tbl != ipv4_route_flush_table)
3226                 kfree(tbl);
3227 err_dup:
3228         return -ENOMEM;
3229 }
3230
3231 static __net_exit void sysctl_route_net_exit(struct net *net)
3232 {
3233         struct ctl_table *tbl;
3234
3235         tbl = net->ipv4.route_hdr->ctl_table_arg;
3236         unregister_net_sysctl_table(net->ipv4.route_hdr);
3237         BUG_ON(tbl == ipv4_route_flush_table);
3238         kfree(tbl);
3239 }
3240
3241 static __net_initdata struct pernet_operations sysctl_route_ops = {
3242         .init = sysctl_route_net_init,
3243         .exit = sysctl_route_net_exit,
3244 };
3245 #endif
3246
3247 static __net_init int rt_genid_init(struct net *net)
3248 {
3249         get_random_bytes(&net->ipv4.rt_genid,
3250                          sizeof(net->ipv4.rt_genid));
3251         return 0;
3252 }
3253
3254 static __net_initdata struct pernet_operations rt_genid_ops = {
3255         .init = rt_genid_init,
3256 };
3257
3258
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-cpu route accounting area.  ip_rt_init() allocates it with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3262
3263 static __initdata unsigned long rhash_entries;
3264 static int __init set_rhash_entries(char *str)
3265 {
3266         if (!str)
3267                 return 0;
3268         rhash_entries = simple_strtoul(str, &str, 0);
3269         return 1;
3270 }
3271 __setup("rhash_entries=", set_rhash_entries);
3272
3273 int __init ip_rt_init(void)
3274 {
3275         int rc = 0;
3276
3277 #ifdef CONFIG_NET_CLS_ROUTE
3278         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3279         if (!ip_rt_acct)
3280                 panic("IP: failed to allocate ip_rt_acct\n");
3281 #endif
3282
3283         ipv4_dst_ops.kmem_cachep =
3284                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3285                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3286
3287         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3288
3289         if (dst_entries_init(&ipv4_dst_ops) < 0)
3290                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3291
3292         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3293                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3294
3295         rt_hash_table = (struct rt_hash_bucket *)
3296                 alloc_large_system_hash("IP route cache",
3297                                         sizeof(struct rt_hash_bucket),
3298                                         rhash_entries,
3299                                         (totalram_pages >= 128 * 1024) ?
3300                                         15 : 17,
3301                                         0,
3302                                         &rt_hash_log,
3303                                         &rt_hash_mask,
3304                                         rhash_entries ? 0 : 512 * 1024);
3305         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3306         rt_hash_lock_init();
3307
3308         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3309         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3310
3311         devinet_init();
3312         ip_fib_init();
3313
3314         /* All the timers, started at system startup tend
3315            to synchronize. Perturb it a bit.
3316          */
3317         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3318         expires_ljiffies = jiffies;
3319         schedule_delayed_work(&expires_work,
3320                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3321
3322         if (ip_rt_proc_init())
3323                 printk(KERN_ERR "Unable to create route proc files\n");
3324 #ifdef CONFIG_XFRM
3325         xfrm_init();
3326         xfrm4_init(ip_rt_max_size);
3327 #endif
3328         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3329
3330 #ifdef CONFIG_SYSCTL
3331         register_pernet_subsys(&sysctl_route_ops);
3332 #endif
3333         register_pernet_subsys(&rt_genid_ops);
3334         return rc;
3335 }
3336
#ifdef CONFIG_SYSCTL
/*
 * Register the static sysctl skeleton (ipv4_skeleton under ipv4_path).
 *
 * This separate entry point is a workaround for the current ipv4 init
 * order; once that order is sanitized it can go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */