net: Fix various endianness glitches
[linux-2.6/libata-dev.git] / net / ipv4 / route.c
blob a947428ef0aecac3e606f3eaf67b74d0bd398a82
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
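/* RT_FL_TOS() extracts the routing-relevant TOS bits from a flow key,
 * together with the RTO_ONLINK flag that is carried in the same field. */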
112 #define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115 #define IP_MAX_MTU 0xFFF0
117 #define RT_GC_TIMEOUT (300*HZ)
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly = 8;
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly = 256;
132 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
133 static int rt_chain_length_max __read_mostly = 20;
135 static struct delayed_work expires_work;
136 static unsigned long expires_ljiffies;
139 * Interface to generic destination cache.
142 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143 static void ipv4_dst_destroy(struct dst_entry *dst);
144 static void ipv4_dst_ifdown(struct dst_entry *dst,
145 struct net_device *dev, int how);
146 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
147 static void ipv4_link_failure(struct sk_buff *skb);
148 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
149 static int rt_garbage_collect(struct dst_ops *ops);
152 static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
154 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out,
163 .entries = ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
189 * Route cache.
192 /* The locking scheme is rather straightforward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
202 struct rt_hash_bucket {
203 struct rtable *chain;
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210 * The size of this table is a power of two and depends on the number of CPUs.
211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ 256
215 #else
216 # if NR_CPUS >= 32
217 # define RT_HASH_LOCK_SZ 4096
218 # elif NR_CPUS >= 16
219 # define RT_HASH_LOCK_SZ 2048
220 # elif NR_CPUS >= 8
221 # define RT_HASH_LOCK_SZ 1024
222 # elif NR_CPUS >= 4
223 # define RT_HASH_LOCK_SZ 512
224 # else
225 # define RT_HASH_LOCK_SZ 256
226 # endif
227 #endif
229 static spinlock_t *rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 static __init void rt_hash_lock_init(void)
234 int i;
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 GFP_KERNEL);
238 if (!rt_hash_locks)
239 panic("IP: failed to allocate rt_hash_locks\n");
241 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 spin_lock_init(&rt_hash_locks[i]);
244 #else
245 # define rt_hash_lock_addr(slot) NULL
247 static inline void rt_hash_lock_init(void)
250 #endif
252 static struct rt_hash_bucket *rt_hash_table __read_mostly;
253 static unsigned rt_hash_mask __read_mostly;
254 static unsigned int rt_hash_log __read_mostly;
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
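/* The cache hash mixes both addresses, the interface index and the
 * per-namespace generation id through jhash; bumping the genid therefore
 * re-keys lookups and effectively invalidates the whole cache. */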
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
263 return jhash_3words((__force u32)daddr, (__force u32)saddr,
264 idx, genid)
265 & rt_hash_mask;
268 static inline int rt_genid(struct net *net)
270 return atomic_read(&net->ipv4.rt_genid);
273 #ifdef CONFIG_PROC_FS
274 struct rt_cache_iter_state {
275 struct seq_net_private p;
276 int bucket;
277 int genid;
280 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 struct rt_cache_iter_state *st = seq->private;
283 struct rtable *r = NULL;
285 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
286 if (!rt_hash_table[st->bucket].chain)
287 continue;
288 rcu_read_lock_bh();
289 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
290 while (r) {
291 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
292 r->rt_genid == st->genid)
293 return r;
294 r = rcu_dereference_bh(r->u.dst.rt_next);
296 rcu_read_unlock_bh();
298 return r;
301 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
302 struct rtable *r)
304 struct rt_cache_iter_state *st = seq->private;
306 r = r->u.dst.rt_next;
307 while (!r) {
308 rcu_read_unlock_bh();
309 do {
310 if (--st->bucket < 0)
311 return NULL;
312 } while (!rt_hash_table[st->bucket].chain);
313 rcu_read_lock_bh();
314 r = rt_hash_table[st->bucket].chain;
316 return rcu_dereference_bh(r);
319 static struct rtable *rt_cache_get_next(struct seq_file *seq,
320 struct rtable *r)
322 struct rt_cache_iter_state *st = seq->private;
323 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
324 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
325 continue;
326 if (r->rt_genid == st->genid)
327 break;
329 return r;
332 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 struct rtable *r = rt_cache_get_first(seq);
336 if (r)
337 while (pos && (r = rt_cache_get_next(seq, r)))
338 --pos;
339 return pos ? NULL : r;
342 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 struct rt_cache_iter_state *st = seq->private;
345 if (*pos)
346 return rt_cache_get_idx(seq, *pos - 1);
347 st->genid = rt_genid(seq_file_net(seq));
348 return SEQ_START_TOKEN;
351 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 struct rtable *r;
355 if (v == SEQ_START_TOKEN)
356 r = rt_cache_get_first(seq);
357 else
358 r = rt_cache_get_next(seq, v);
359 ++*pos;
360 return r;
363 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 if (v && v != SEQ_START_TOKEN)
366 rcu_read_unlock_bh();
369 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 if (v == SEQ_START_TOKEN)
372 seq_printf(seq, "%-127s\n",
373 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
374 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
375 "HHUptod\tSpecDst");
376 else {
377 struct rtable *r = v;
378 int len;
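/* rt_dst, rt_src and the flow addresses are __be32; the (__force u32)
 * casts below print their raw network-byte-order values with %X without
 * triggering sparse endianness warnings. */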
380 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
381 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
382 r->u.dst.dev ? r->u.dst.dev->name : "*",
383 (__force u32)r->rt_dst,
384 (__force u32)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (__force u32)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len);
398 seq_printf(seq, "%*s\n", 127 - len, "");
400 return 0;
403 static const struct seq_operations rt_cache_seq_ops = {
404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
412 return seq_open_net(inode, file, &rt_cache_seq_ops,
413 sizeof(struct rt_cache_iter_state));
416 static const struct file_operations rt_cache_seq_fops = {
417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
421 .release = seq_release_net,
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
427 int cpu;
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
436 return &per_cpu(rt_cache_stat, cpu);
438 return NULL;
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
443 int cpu;
445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
449 return &per_cpu(rt_cache_stat, cpu);
451 return NULL;
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
462 struct rt_cache_stat *st = v;
464 if (v == SEQ_START_TOKEN) {
465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 return 0;
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
480 st->out_hit,
481 st->out_slow_tot,
482 st->out_slow_mc,
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
491 return 0;
494 static const struct seq_operations rt_cpu_seq_ops = {
495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
504 return seq_open(file, &rt_cpu_seq_ops);
507 static const struct file_operations rt_cpu_seq_fops = {
508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
518 struct ip_rt_acct *dst, *src;
519 unsigned int i, j;
521 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 if (!dst)
523 return -ENOMEM;
525 for_each_possible_cpu(i) {
526 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 for (j = 0; j < 256; j++) {
528 dst[j].o_bytes += src[j].o_bytes;
529 dst[j].o_packets += src[j].o_packets;
530 dst[j].i_bytes += src[j].i_bytes;
531 dst[j].i_packets += src[j].i_packets;
535 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 kfree(dst);
537 return 0;
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
542 return single_open(file, rt_acct_proc_show, NULL);
545 static const struct file_operations rt_acct_proc_fops = {
546 .owner = THIS_MODULE,
547 .open = rt_acct_proc_open,
548 .read = seq_read,
549 .llseek = seq_lseek,
550 .release = single_release,
552 #endif
554 static int __net_init ip_rt_do_proc_init(struct net *net)
556 struct proc_dir_entry *pde;
558 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 &rt_cache_seq_fops);
560 if (!pde)
561 goto err1;
563 pde = proc_create("rt_cache", S_IRUGO,
564 net->proc_net_stat, &rt_cpu_seq_fops);
565 if (!pde)
566 goto err2;
568 #ifdef CONFIG_NET_CLS_ROUTE
569 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570 if (!pde)
571 goto err3;
572 #endif
573 return 0;
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577 remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580 remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582 return -ENOMEM;
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
587 remove_proc_entry("rt_cache", net->proc_net_stat);
588 remove_proc_entry("rt_cache", net->proc_net);
589 #ifdef CONFIG_NET_CLS_ROUTE
590 remove_proc_entry("rt_acct", net->proc_net);
591 #endif
594 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
595 .init = ip_rt_do_proc_init,
596 .exit = ip_rt_do_proc_exit,
599 static int __init ip_rt_proc_init(void)
601 return register_pernet_subsys(&ip_rt_proc_ops);
604 #else
605 static inline int ip_rt_proc_init(void)
607 return 0;
609 #endif /* CONFIG_PROC_FS */
611 static inline void rt_free(struct rtable *rt)
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
616 static inline void rt_drop(struct rtable *rt)
618 ip_rt_put(rt);
619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
622 static inline int rt_fast_clean(struct rtable *rth)
624 /* Kill broadcast/multicast entries very aggressively, if they
625 collide in the hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next;
630 static inline int rt_valuable(struct rtable *rth)
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
638 unsigned long age;
639 int ret = 0;
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655 out: return ret;
658 /* Bits of score are:
659 * 31: very valuable
660 * 30: not quite useless
661 * 29..0: usage counter
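 *
 * rt_intern_hash() evicts the unreferenced entry with the lowest score,
 * so valuable entries (bit 31) and output/unicast-forward entries (bit 30)
 * are protected, and among the rest the least recently used entry goes first.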
663 static inline u32 rt_score(struct rtable *rt)
665 u32 score = jiffies - rt->u.dst.lastuse;
667 score = ~score & ~(3<<30);
669 if (rt_valuable(rt))
670 score |= (1<<31);
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
676 return score;
679 static inline bool rt_caching(const struct net *net)
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
688 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
689 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
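/* compare_hash_inputs() above only checks the fields that feed rt_hash()
 * (daddr, saddr and iif); compare_keys() below is the exact flow match and
 * also compares mark, tos and oif. */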
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
695 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
696 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
697 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
699 (fl1->oif ^ fl2->oif) |
700 (fl1->iif ^ fl2->iif)) == 0;
703 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
708 static inline int rt_is_expired(struct rtable *rth)
710 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
714 * Perform a full scan of hash table and free all entries.
715 * Can be called by a softirq or a process.
716 * In the latter case, we want to be rescheduled if necessary
718 static void rt_do_flush(int process_context)
720 unsigned int i;
721 struct rtable *rth, *next;
722 struct rtable * tail;
724 for (i = 0; i <= rt_hash_mask; i++) {
725 if (process_context && need_resched())
726 cond_resched();
727 rth = rt_hash_table[i].chain;
728 if (!rth)
729 continue;
731 spin_lock_bh(rt_hash_lock_addr(i));
732 #ifdef CONFIG_NET_NS
734 struct rtable ** prev, * p;
736 rth = rt_hash_table[i].chain;
738 /* defer releasing the head of the list after spin_unlock */
739 for (tail = rth; tail; tail = tail->u.dst.rt_next)
740 if (!rt_is_expired(tail))
741 break;
742 if (rth != tail)
743 rt_hash_table[i].chain = tail;
745 /* call rt_free on entries after the tail requiring flush */
746 prev = &rt_hash_table[i].chain;
747 for (p = *prev; p; p = next) {
748 next = p->u.dst.rt_next;
749 if (!rt_is_expired(p)) {
750 prev = &p->u.dst.rt_next;
751 } else {
752 *prev = next;
753 rt_free(p);
757 #else
758 rth = rt_hash_table[i].chain;
759 rt_hash_table[i].chain = NULL;
760 tail = NULL;
761 #endif
762 spin_unlock_bh(rt_hash_lock_addr(i));
764 for (; rth != tail; rth = next) {
765 next = rth->u.dst.rt_next;
766 rt_free(rth);
772 * While freeing expired entries, we compute average chain length
773 * and standard deviation, using fixed-point arithmetic.
774 * This is to have an estimation of rt_chain_length_max:
775 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
776 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
779 #define FRACT_BITS 3
780 #define ONE (1UL << FRACT_BITS)
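/* Example: has_noalias() below counts chain entries in units of ONE (1/8ths),
 * so an average chain length of 2.5 distinct-input entries accumulates as
 * avg = 20, and (avg + 4*sd) >> FRACT_BITS converts back to whole entries
 * when updating rt_chain_length_max. */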
783 * Given a hash chain and an item in this hash chain,
784 * find if a previous entry has the same hash_inputs
785 * (but differs on tos, mark or oif)
786 * Returns 0 if an alias is found.
787 * Returns ONE if rth has no alias before itself.
789 static int has_noalias(const struct rtable *head, const struct rtable *rth)
791 const struct rtable *aux = head;
793 while (aux != rth) {
794 if (compare_hash_inputs(&aux->fl, &rth->fl))
795 return 0;
796 aux = aux->u.dst.rt_next;
798 return ONE;
801 static void rt_check_expire(void)
803 static unsigned int rover;
804 unsigned int i = rover, goal;
805 struct rtable *rth, **rthp;
806 unsigned long samples = 0;
807 unsigned long sum = 0, sum2 = 0;
808 unsigned long delta;
809 u64 mult;
811 delta = jiffies - expires_ljiffies;
812 expires_ljiffies = jiffies;
813 mult = ((u64)delta) << rt_hash_log;
814 if (ip_rt_gc_timeout > 1)
815 do_div(mult, ip_rt_gc_timeout);
816 goal = (unsigned int)mult;
817 if (goal > rt_hash_mask)
818 goal = rt_hash_mask + 1;
819 for (; goal > 0; goal--) {
820 unsigned long tmo = ip_rt_gc_timeout;
821 unsigned long length;
823 i = (i + 1) & rt_hash_mask;
824 rthp = &rt_hash_table[i].chain;
826 if (need_resched())
827 cond_resched();
829 samples++;
831 if (*rthp == NULL)
832 continue;
833 length = 0;
834 spin_lock_bh(rt_hash_lock_addr(i));
835 while ((rth = *rthp) != NULL) {
836 prefetch(rth->u.dst.rt_next);
837 if (rt_is_expired(rth)) {
838 *rthp = rth->u.dst.rt_next;
839 rt_free(rth);
840 continue;
842 if (rth->u.dst.expires) {
843 /* Entry is expired even if it is in use */
844 if (time_before_eq(jiffies, rth->u.dst.expires)) {
845 nofree:
846 tmo >>= 1;
847 rthp = &rth->u.dst.rt_next;
849 * We only count entries on
850 * a chain with equal hash inputs once
851 * so that entries for different QOS
852 * levels, and other non-hash input
853 * attributes don't unfairly skew
854 * the length computation
856 length += has_noalias(rt_hash_table[i].chain, rth);
857 continue;
859 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
860 goto nofree;
862 /* Cleanup aged off entries. */
863 *rthp = rth->u.dst.rt_next;
864 rt_free(rth);
866 spin_unlock_bh(rt_hash_lock_addr(i));
867 sum += length;
868 sum2 += length*length;
870 if (samples) {
871 unsigned long avg = sum / samples;
872 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
873 rt_chain_length_max = max_t(unsigned long,
874 ip_rt_gc_elasticity,
875 (avg + 4*sd) >> FRACT_BITS);
877 rover = i;
881 * rt_worker_func() is run in process context.
882 * We call rt_check_expire() to scan part of the hash table.
884 static void rt_worker_func(struct work_struct *work)
886 rt_check_expire();
887 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
891 * Perturbation of rt_genid by a small quantity [1..256]
892 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
893 * many times (2^24) without repeating a recent rt_genid.
894 * Jenkins hash is strong enough that little changes of rt_genid are OK.
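 *
 * For example, with a 32-bit genid and increments drawn from [1, 256],
 * at least 2^32 / 2^8 = 2^24 invalidations are needed before a previously
 * used genid value can wrap around and recur.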
896 static void rt_cache_invalidate(struct net *net)
898 unsigned char shuffle;
900 get_random_bytes(&shuffle, sizeof(shuffle));
901 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
905 * delay < 0 : invalidate cache (fast : entries will be deleted later)
906 * delay >= 0 : invalidate & flush cache (can be long)
908 void rt_cache_flush(struct net *net, int delay)
910 rt_cache_invalidate(net);
911 if (delay >= 0)
912 rt_do_flush(!in_softirq());
915 /* Flush previously invalidated entries from the cache */
916 void rt_cache_flush_batch(void)
918 rt_do_flush(!in_softirq());
922 * We change rt_genid and let gc do the cleanup
924 static void rt_secret_rebuild(unsigned long __net)
926 struct net *net = (struct net *)__net;
927 rt_cache_invalidate(net);
928 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
931 static void rt_secret_rebuild_oneshot(struct net *net)
933 del_timer_sync(&net->ipv4.rt_secret_timer);
934 rt_cache_invalidate(net);
935 if (ip_rt_secret_interval)
936 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
939 static void rt_emergency_hash_rebuild(struct net *net)
941 if (net_ratelimit()) {
942 printk(KERN_WARNING "Route hash chain too long!\n");
943 printk(KERN_WARNING "Adjust your secret_interval!\n");
946 rt_secret_rebuild_oneshot(net);
950 Short description of GC goals.
952 We want to build an algorithm which will keep the routing cache
953 at some equilibrium point, where the number of aged-off entries
954 is kept approximately equal to the number of newly generated ones.
956 The current expiration strength is the variable "expire".
957 We try to adjust it dynamically, so that when the network
958 is idle, expire is large enough to keep enough warm entries,
959 and when load increases it shrinks to limit the cache size.
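Concretely, rt_garbage_collect() below starts from
goal = entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. the cache is
allowed roughly ip_rt_gc_elasticity entries per hash bucket before any
expiration work is attempted.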
962 static int rt_garbage_collect(struct dst_ops *ops)
964 static unsigned long expire = RT_GC_TIMEOUT;
965 static unsigned long last_gc;
966 static int rover;
967 static int equilibrium;
968 struct rtable *rth, **rthp;
969 unsigned long now = jiffies;
970 int goal;
973 * Garbage collection is pretty expensive,
974 * do not run it too frequently.
977 RT_CACHE_STAT_INC(gc_total);
979 if (now - last_gc < ip_rt_gc_min_interval &&
980 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
981 RT_CACHE_STAT_INC(gc_ignored);
982 goto out;
985 /* Calculate the number of entries we want to expire now. */
986 goal = atomic_read(&ipv4_dst_ops.entries) -
987 (ip_rt_gc_elasticity << rt_hash_log);
988 if (goal <= 0) {
989 if (equilibrium < ipv4_dst_ops.gc_thresh)
990 equilibrium = ipv4_dst_ops.gc_thresh;
991 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
992 if (goal > 0) {
993 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
994 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
996 } else {
997 /* We are in a dangerous area. Try to reduce the cache really
998 * aggressively.
1000 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1001 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1004 if (now - last_gc >= ip_rt_gc_min_interval)
1005 last_gc = now;
1007 if (goal <= 0) {
1008 equilibrium += goal;
1009 goto work_done;
1012 do {
1013 int i, k;
1015 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1016 unsigned long tmo = expire;
1018 k = (k + 1) & rt_hash_mask;
1019 rthp = &rt_hash_table[k].chain;
1020 spin_lock_bh(rt_hash_lock_addr(k));
1021 while ((rth = *rthp) != NULL) {
1022 if (!rt_is_expired(rth) &&
1023 !rt_may_expire(rth, tmo, expire)) {
1024 tmo >>= 1;
1025 rthp = &rth->u.dst.rt_next;
1026 continue;
1028 *rthp = rth->u.dst.rt_next;
1029 rt_free(rth);
1030 goal--;
1032 spin_unlock_bh(rt_hash_lock_addr(k));
1033 if (goal <= 0)
1034 break;
1036 rover = k;
1038 if (goal <= 0)
1039 goto work_done;
1041 /* The goal was not achieved. We stop the process if:
1043 - expire has been reduced to zero (otherwise, expire is halved).
1044 - the table is not full.
1045 - we are called from interrupt context.
1046 - the jiffies check is just a fallback/debug loop breaker.
1047 We will not spin here for a long time in any case.
1050 RT_CACHE_STAT_INC(gc_goal_miss);
1052 if (expire == 0)
1053 break;
1055 expire >>= 1;
1056 #if RT_CACHE_DEBUG >= 2
1057 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1058 atomic_read(&ipv4_dst_ops.entries), goal, i);
1059 #endif
1061 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1062 goto out;
1063 } while (!in_softirq() && time_before_eq(jiffies, now));
1065 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1066 goto out;
1067 if (net_ratelimit())
1068 printk(KERN_WARNING "dst cache overflow\n");
1069 RT_CACHE_STAT_INC(gc_dst_overflow);
1070 return 1;
1072 work_done:
1073 expire += ip_rt_gc_min_interval;
1074 if (expire > ip_rt_gc_timeout ||
1075 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1076 expire = ip_rt_gc_timeout;
1077 #if RT_CACHE_DEBUG >= 2
1078 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1079 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1080 #endif
1081 out: return 0;
1085 * Returns the number of entries in a hash chain that have different hash_inputs
1087 static int slow_chain_length(const struct rtable *head)
1089 int length = 0;
1090 const struct rtable *rth = head;
1092 while (rth) {
1093 length += has_noalias(head, rth);
1094 rth = rth->u.dst.rt_next;
1096 return length >> FRACT_BITS;
1099 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1100 struct rtable **rp, struct sk_buff *skb, int ifindex)
1102 struct rtable *rth, **rthp;
1103 unsigned long now;
1104 struct rtable *cand, **candp;
1105 u32 min_score;
1106 int chain_length;
1107 int attempts = !in_softirq();
1109 restart:
1110 chain_length = 0;
1111 min_score = ~(u32)0;
1112 cand = NULL;
1113 candp = NULL;
1114 now = jiffies;
1116 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1118 * If we're not caching, just tell the caller we
1119 * were successful and don't touch the route. The
1120 * caller holds the sole reference to the cache entry, and
1121 * it will be released when the caller is done with it.
1122 * If we drop it here, the callers have no way to resolve routes
1123 * when we're not caching. Instead, just point *rp at rt, so
1124 * the caller gets a single use out of the route
1125 * Note that we do rt_free on this new route entry, so that
1126 * once its refcount hits zero, we are still able to reap it
1127 * (Thanks Alexey)
1128 * Note also that rt_free uses call_rcu. We don't actually
1129 * need rcu protection here, this is just our path to get
1130 * on the route gc list.
1133 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1134 int err = arp_bind_neighbour(&rt->u.dst);
1135 if (err) {
1136 if (net_ratelimit())
1137 printk(KERN_WARNING
1138 "Neighbour table failure & not caching routes.\n");
1139 rt_drop(rt);
1140 return err;
1144 rt_free(rt);
1145 goto skip_hashing;
1148 rthp = &rt_hash_table[hash].chain;
1150 spin_lock_bh(rt_hash_lock_addr(hash));
1151 while ((rth = *rthp) != NULL) {
1152 if (rt_is_expired(rth)) {
1153 *rthp = rth->u.dst.rt_next;
1154 rt_free(rth);
1155 continue;
1157 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1158 /* Put it first */
1159 *rthp = rth->u.dst.rt_next;
1161 * Since lookup is lockfree, the deletion
1162 * must be visible to another weakly ordered CPU before
1163 * the insertion at the start of the hash chain.
1165 rcu_assign_pointer(rth->u.dst.rt_next,
1166 rt_hash_table[hash].chain);
1168 * Since lookup is lockfree, the update writes
1169 * must be ordered for consistency on SMP.
1171 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1173 dst_use(&rth->u.dst, now);
1174 spin_unlock_bh(rt_hash_lock_addr(hash));
1176 rt_drop(rt);
1177 if (rp)
1178 *rp = rth;
1179 else
1180 skb_dst_set(skb, &rth->u.dst);
1181 return 0;
1184 if (!atomic_read(&rth->u.dst.__refcnt)) {
1185 u32 score = rt_score(rth);
1187 if (score <= min_score) {
1188 cand = rth;
1189 candp = rthp;
1190 min_score = score;
1194 chain_length++;
1196 rthp = &rth->u.dst.rt_next;
1199 if (cand) {
1200 /* ip_rt_gc_elasticity used to be the average chain
1201 * length; when exceeded, gc becomes really aggressive.
1203 * The second limit is less certain. At the moment it allows
1204 * only 2 entries per bucket. We will see.
1206 if (chain_length > ip_rt_gc_elasticity) {
1207 *candp = cand->u.dst.rt_next;
1208 rt_free(cand);
1210 } else {
1211 if (chain_length > rt_chain_length_max &&
1212 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1213 struct net *net = dev_net(rt->u.dst.dev);
1214 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1215 if (!rt_caching(net)) {
1216 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1217 rt->u.dst.dev->name, num);
1219 rt_emergency_hash_rebuild(net);
1220 spin_unlock_bh(rt_hash_lock_addr(hash));
1222 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1223 ifindex, rt_genid(net));
1224 goto restart;
1228 /* Try to bind the route to arp only if it is an output
1229 route or a unicast forwarding path.
1231 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1232 int err = arp_bind_neighbour(&rt->u.dst);
1233 if (err) {
1234 spin_unlock_bh(rt_hash_lock_addr(hash));
1236 if (err != -ENOBUFS) {
1237 rt_drop(rt);
1238 return err;
1241 /* Neighbour tables are full and nothing
1242 can be released. Try to shrink the route cache;
1243 it most likely holds some neighbour records.
1245 if (attempts-- > 0) {
1246 int saved_elasticity = ip_rt_gc_elasticity;
1247 int saved_int = ip_rt_gc_min_interval;
1248 ip_rt_gc_elasticity = 1;
1249 ip_rt_gc_min_interval = 0;
1250 rt_garbage_collect(&ipv4_dst_ops);
1251 ip_rt_gc_min_interval = saved_int;
1252 ip_rt_gc_elasticity = saved_elasticity;
1253 goto restart;
1256 if (net_ratelimit())
1257 printk(KERN_WARNING "Neighbour table overflow.\n");
1258 rt_drop(rt);
1259 return -ENOBUFS;
1263 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1265 #if RT_CACHE_DEBUG >= 2
1266 if (rt->u.dst.rt_next) {
1267 struct rtable *trt;
1268 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1269 hash, &rt->rt_dst);
1270 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1271 printk(" . %pI4", &trt->rt_dst);
1272 printk("\n");
1274 #endif
1276 * Since lookup is lockfree, we must make sure
1277 * previous writes to rt are committed to memory
1278 * before making rt visible to other CPUs.
1280 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1282 spin_unlock_bh(rt_hash_lock_addr(hash));
1284 skip_hashing:
1285 if (rp)
1286 *rp = rt;
1287 else
1288 skb_dst_set(skb, &rt->u.dst);
1289 return 0;
1292 void rt_bind_peer(struct rtable *rt, int create)
1294 static DEFINE_SPINLOCK(rt_peer_lock);
1295 struct inet_peer *peer;
1297 peer = inet_getpeer(rt->rt_dst, create);
1299 spin_lock_bh(&rt_peer_lock);
1300 if (rt->peer == NULL) {
1301 rt->peer = peer;
1302 peer = NULL;
1304 spin_unlock_bh(&rt_peer_lock);
1305 if (peer)
1306 inet_putpeer(peer);
1310 * Peer allocation may fail only in serious out-of-memory conditions. However
1311 * we can still generate some output.
1312 * Random ID selection looks a bit dangerous because we have no chance of
1313 * selecting an ID that is unique for a reasonable period of time.
1314 * But a broken packet identifier may be better than no packet at all.
1316 static void ip_select_fb_ident(struct iphdr *iph)
1318 static DEFINE_SPINLOCK(ip_fb_id_lock);
1319 static u32 ip_fallback_id;
1320 u32 salt;
1322 spin_lock_bh(&ip_fb_id_lock);
1323 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1324 iph->id = htons(salt & 0xFFFF);
1325 ip_fallback_id = salt;
1326 spin_unlock_bh(&ip_fb_id_lock);
1329 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1331 struct rtable *rt = (struct rtable *) dst;
1333 if (rt) {
1334 if (rt->peer == NULL)
1335 rt_bind_peer(rt, 1);
1337 /* If peer is attached to destination, it is never detached,
1338 so we do not need to grab a lock to dereference it.
1340 if (rt->peer) {
1341 iph->id = htons(inet_getid(rt->peer, more));
1342 return;
1344 } else
1345 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1346 __builtin_return_address(0));
1348 ip_select_fb_ident(iph);
1351 static void rt_del(unsigned hash, struct rtable *rt)
1353 struct rtable **rthp, *aux;
1355 rthp = &rt_hash_table[hash].chain;
1356 spin_lock_bh(rt_hash_lock_addr(hash));
1357 ip_rt_put(rt);
1358 while ((aux = *rthp) != NULL) {
1359 if (aux == rt || rt_is_expired(aux)) {
1360 *rthp = aux->u.dst.rt_next;
1361 rt_free(aux);
1362 continue;
1364 rthp = &aux->u.dst.rt_next;
1366 spin_unlock_bh(rt_hash_lock_addr(hash));
1369 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1370 __be32 saddr, struct net_device *dev)
1372 int i, k;
1373 struct in_device *in_dev = in_dev_get(dev);
1374 struct rtable *rth, **rthp;
1375 __be32 skeys[2] = { saddr, 0 };
1376 int ikeys[2] = { dev->ifindex, 0 };
1377 struct netevent_redirect netevent;
1378 struct net *net;
1380 if (!in_dev)
1381 return;
1383 net = dev_net(dev);
1384 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1385 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1386 ipv4_is_zeronet(new_gw))
1387 goto reject_redirect;
1389 if (!rt_caching(net))
1390 goto reject_redirect;
1392 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1393 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1394 goto reject_redirect;
1395 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1396 goto reject_redirect;
1397 } else {
1398 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1399 goto reject_redirect;
1402 for (i = 0; i < 2; i++) {
1403 for (k = 0; k < 2; k++) {
1404 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1405 rt_genid(net));
1407 rthp=&rt_hash_table[hash].chain;
1409 rcu_read_lock();
1410 while ((rth = rcu_dereference(*rthp)) != NULL) {
1411 struct rtable *rt;
1413 if (rth->fl.fl4_dst != daddr ||
1414 rth->fl.fl4_src != skeys[i] ||
1415 rth->fl.oif != ikeys[k] ||
1416 rth->fl.iif != 0 ||
1417 rt_is_expired(rth) ||
1418 !net_eq(dev_net(rth->u.dst.dev), net)) {
1419 rthp = &rth->u.dst.rt_next;
1420 continue;
1423 if (rth->rt_dst != daddr ||
1424 rth->rt_src != saddr ||
1425 rth->u.dst.error ||
1426 rth->rt_gateway != old_gw ||
1427 rth->u.dst.dev != dev)
1428 break;
1430 dst_hold(&rth->u.dst);
1431 rcu_read_unlock();
1433 rt = dst_alloc(&ipv4_dst_ops);
1434 if (rt == NULL) {
1435 ip_rt_put(rth);
1436 in_dev_put(in_dev);
1437 return;
1440 /* Copy all the information. */
1441 *rt = *rth;
1442 rt->u.dst.__use = 1;
1443 atomic_set(&rt->u.dst.__refcnt, 1);
1444 rt->u.dst.child = NULL;
1445 if (rt->u.dst.dev)
1446 dev_hold(rt->u.dst.dev);
1447 if (rt->idev)
1448 in_dev_hold(rt->idev);
1449 rt->u.dst.obsolete = -1;
1450 rt->u.dst.lastuse = jiffies;
1451 rt->u.dst.path = &rt->u.dst;
1452 rt->u.dst.neighbour = NULL;
1453 rt->u.dst.hh = NULL;
1454 #ifdef CONFIG_XFRM
1455 rt->u.dst.xfrm = NULL;
1456 #endif
1457 rt->rt_genid = rt_genid(net);
1458 rt->rt_flags |= RTCF_REDIRECTED;
1460 /* Gateway is different ... */
1461 rt->rt_gateway = new_gw;
1463 /* Redirect received -> path was valid */
1464 dst_confirm(&rth->u.dst);
1466 if (rt->peer)
1467 atomic_inc(&rt->peer->refcnt);
1469 if (arp_bind_neighbour(&rt->u.dst) ||
1470 !(rt->u.dst.neighbour->nud_state &
1471 NUD_VALID)) {
1472 if (rt->u.dst.neighbour)
1473 neigh_event_send(rt->u.dst.neighbour, NULL);
1474 ip_rt_put(rth);
1475 rt_drop(rt);
1476 goto do_next;
1479 netevent.old = &rth->u.dst;
1480 netevent.new = &rt->u.dst;
1481 call_netevent_notifiers(NETEVENT_REDIRECT,
1482 &netevent);
1484 rt_del(hash, rth);
1485 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1486 ip_rt_put(rt);
1487 goto do_next;
1489 rcu_read_unlock();
1490 do_next:
1494 in_dev_put(in_dev);
1495 return;
1497 reject_redirect:
1498 #ifdef CONFIG_IP_ROUTE_VERBOSE
1499 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1500 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1501 " Advised path = %pI4 -> %pI4\n",
1502 &old_gw, dev->name, &new_gw,
1503 &saddr, &daddr);
1504 #endif
1505 in_dev_put(in_dev);
1508 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1510 struct rtable *rt = (struct rtable *)dst;
1511 struct dst_entry *ret = dst;
1513 if (rt) {
1514 if (dst->obsolete > 0) {
1515 ip_rt_put(rt);
1516 ret = NULL;
1517 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1518 (rt->u.dst.expires &&
1519 time_after_eq(jiffies, rt->u.dst.expires))) {
1520 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1521 rt->fl.oif,
1522 rt_genid(dev_net(dst->dev)));
1523 #if RT_CACHE_DEBUG >= 1
1524 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1525 &rt->rt_dst, rt->fl.fl4_tos);
1526 #endif
1527 rt_del(hash, rt);
1528 ret = NULL;
1531 return ret;
1535 * Algorithm:
1536 * 1. The first ip_rt_redirect_number redirects are sent
1537 * with exponential backoff, then we stop sending them at all,
1538 * assuming that the host ignores our redirects.
1539 * 2. If we did not see packets requiring redirects
1540 * during ip_rt_redirect_silence, we assume that the host
1541 * forgot the redirected route and we start sending redirects again.
1543 * This algorithm is much cheaper and more intelligent than dumb load limiting
1544 * in icmp.c.
1546 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1547 * and "frag. need" (breaks PMTU discovery) in icmp.c.
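 *
 * With the defaults above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number
 * = 9, ip_rt_redirect_silence = (HZ/50) << 10) and assuming HZ = 1000,
 * successive redirects to the same destination are spaced at least
 * 40 ms, 80 ms, 160 ms, ... apart (ip_rt_redirect_load << rate_tokens),
 * sending stops entirely after 9 unanswered redirects, and the backoff
 * state resets after roughly 20 seconds without redirect-worthy traffic.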
1550 void ip_rt_send_redirect(struct sk_buff *skb)
1552 struct rtable *rt = skb_rtable(skb);
1553 struct in_device *in_dev;
1554 int log_martians;
1556 rcu_read_lock();
1557 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1558 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1559 rcu_read_unlock();
1560 return;
1562 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1563 rcu_read_unlock();
1565 /* No redirected packets during ip_rt_redirect_silence;
1566 * reset the algorithm.
1568 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1569 rt->u.dst.rate_tokens = 0;
1571 /* Too many ignored redirects; do not send anything.
1572 * Set u.dst.rate_last to the last seen redirected packet.
1574 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1575 rt->u.dst.rate_last = jiffies;
1576 return;
1579 /* Check for load limit; set rate_last to the latest sent
1580 * redirect.
1582 if (rt->u.dst.rate_tokens == 0 ||
1583 time_after(jiffies,
1584 (rt->u.dst.rate_last +
1585 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1586 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1587 rt->u.dst.rate_last = jiffies;
1588 ++rt->u.dst.rate_tokens;
1589 #ifdef CONFIG_IP_ROUTE_VERBOSE
1590 if (log_martians &&
1591 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1592 net_ratelimit())
1593 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1594 &rt->rt_src, rt->rt_iif,
1595 &rt->rt_dst, &rt->rt_gateway);
1596 #endif
1600 static int ip_error(struct sk_buff *skb)
1602 struct rtable *rt = skb_rtable(skb);
1603 unsigned long now;
1604 int code;
1606 switch (rt->u.dst.error) {
1607 case EINVAL:
1608 default:
1609 goto out;
1610 case EHOSTUNREACH:
1611 code = ICMP_HOST_UNREACH;
1612 break;
1613 case ENETUNREACH:
1614 code = ICMP_NET_UNREACH;
1615 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1616 IPSTATS_MIB_INNOROUTES);
1617 break;
1618 case EACCES:
1619 code = ICMP_PKT_FILTERED;
1620 break;
1623 now = jiffies;
1624 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1625 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1626 rt->u.dst.rate_tokens = ip_rt_error_burst;
1627 rt->u.dst.rate_last = now;
1628 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1629 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1630 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1633 out: kfree_skb(skb);
1634 return 0;
1638 * The last two values are not from the RFC but
1639 * are needed for AMPRnet AX.25 paths.
1642 static const unsigned short mtu_plateau[] =
1643 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
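/* guess_mtu() walks this plateau table (cf. the RFC 1191 heuristic) and
 * returns the largest plateau strictly below old_mtu, falling back to 68,
 * the minimum MTU an IPv4 host must accept. */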
1645 static inline unsigned short guess_mtu(unsigned short old_mtu)
1647 int i;
1649 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1650 if (old_mtu > mtu_plateau[i])
1651 return mtu_plateau[i];
1652 return 68;
1655 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1656 unsigned short new_mtu,
1657 struct net_device *dev)
1659 int i, k;
1660 unsigned short old_mtu = ntohs(iph->tot_len);
1661 struct rtable *rth;
1662 int ikeys[2] = { dev->ifindex, 0 };
1663 __be32 skeys[2] = { iph->saddr, 0, };
1664 __be32 daddr = iph->daddr;
1665 unsigned short est_mtu = 0;
1667 for (k = 0; k < 2; k++) {
1668 for (i = 0; i < 2; i++) {
1669 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1670 rt_genid(net));
1672 rcu_read_lock();
1673 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1674 rth = rcu_dereference(rth->u.dst.rt_next)) {
1675 unsigned short mtu = new_mtu;
1677 if (rth->fl.fl4_dst != daddr ||
1678 rth->fl.fl4_src != skeys[i] ||
1679 rth->rt_dst != daddr ||
1680 rth->rt_src != iph->saddr ||
1681 rth->fl.oif != ikeys[k] ||
1682 rth->fl.iif != 0 ||
1683 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1684 !net_eq(dev_net(rth->u.dst.dev), net) ||
1685 rt_is_expired(rth))
1686 continue;
1688 if (new_mtu < 68 || new_mtu >= old_mtu) {
1690 /* BSD 4.2 compatibility hack :-( */
1691 if (mtu == 0 &&
1692 old_mtu >= dst_mtu(&rth->u.dst) &&
1693 old_mtu >= 68 + (iph->ihl << 2))
1694 old_mtu -= iph->ihl << 2;
1696 mtu = guess_mtu(old_mtu);
1698 if (mtu <= dst_mtu(&rth->u.dst)) {
1699 if (mtu < dst_mtu(&rth->u.dst)) {
1700 dst_confirm(&rth->u.dst);
1701 if (mtu < ip_rt_min_pmtu) {
1702 mtu = ip_rt_min_pmtu;
1703 rth->u.dst.metrics[RTAX_LOCK-1] |=
1704 (1 << RTAX_MTU);
1706 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1707 dst_set_expires(&rth->u.dst,
1708 ip_rt_mtu_expires);
1710 est_mtu = mtu;
1713 rcu_read_unlock();
1716 return est_mtu ? : new_mtu;
1719 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1721 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1722 !(dst_metric_locked(dst, RTAX_MTU))) {
1723 if (mtu < ip_rt_min_pmtu) {
1724 mtu = ip_rt_min_pmtu;
1725 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1727 dst->metrics[RTAX_MTU-1] = mtu;
1728 dst_set_expires(dst, ip_rt_mtu_expires);
1729 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1733 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1735 if (rt_is_expired((struct rtable *)dst))
1736 return NULL;
1737 return dst;
1740 static void ipv4_dst_destroy(struct dst_entry *dst)
1742 struct rtable *rt = (struct rtable *) dst;
1743 struct inet_peer *peer = rt->peer;
1744 struct in_device *idev = rt->idev;
1746 if (peer) {
1747 rt->peer = NULL;
1748 inet_putpeer(peer);
1751 if (idev) {
1752 rt->idev = NULL;
1753 in_dev_put(idev);
1757 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1758 int how)
1760 struct rtable *rt = (struct rtable *) dst;
1761 struct in_device *idev = rt->idev;
1762 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1763 struct in_device *loopback_idev =
1764 in_dev_get(dev_net(dev)->loopback_dev);
1765 if (loopback_idev) {
1766 rt->idev = loopback_idev;
1767 in_dev_put(idev);
1772 static void ipv4_link_failure(struct sk_buff *skb)
1774 struct rtable *rt;
1776 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1778 rt = skb_rtable(skb);
1779 if (rt)
1780 dst_set_expires(&rt->u.dst, 0);
1783 static int ip_rt_bug(struct sk_buff *skb)
1785 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1786 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1787 skb->dev ? skb->dev->name : "?");
1788 kfree_skb(skb);
1789 return 0;
1793 We do not cache the source address of the outgoing interface,
1794 because it is used only by the IP RR, TS and SRR options,
1795 so it is out of the fast path.
1797 BTW remember: "addr" is allowed to be unaligned
1798 in IP options!
1801 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1803 __be32 src;
1804 struct fib_result res;
1806 if (rt->fl.iif == 0)
1807 src = rt->rt_src;
1808 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1809 src = FIB_RES_PREFSRC(res);
1810 fib_res_put(&res);
1811 } else
1812 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1813 RT_SCOPE_UNIVERSE);
1814 memcpy(addr, &src, 4);
1817 #ifdef CONFIG_NET_CLS_ROUTE
1818 static void set_class_tag(struct rtable *rt, u32 tag)
1820 if (!(rt->u.dst.tclassid & 0xFFFF))
1821 rt->u.dst.tclassid |= tag & 0xFFFF;
1822 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1823 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1825 #endif
1827 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1829 struct fib_info *fi = res->fi;
1831 if (fi) {
1832 if (FIB_RES_GW(*res) &&
1833 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1834 rt->rt_gateway = FIB_RES_GW(*res);
1835 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1836 sizeof(rt->u.dst.metrics));
1837 if (fi->fib_mtu == 0) {
1838 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1839 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1840 rt->rt_gateway != rt->rt_dst &&
1841 rt->u.dst.dev->mtu > 576)
1842 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1844 #ifdef CONFIG_NET_CLS_ROUTE
1845 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1846 #endif
1847 } else
1848 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1850 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1851 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1852 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1853 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1854 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1855 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1856 ip_rt_min_advmss);
1857 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1858 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1860 #ifdef CONFIG_NET_CLS_ROUTE
1861 #ifdef CONFIG_IP_MULTIPLE_TABLES
1862 set_class_tag(rt, fib_rules_tclass(res));
1863 #endif
1864 set_class_tag(rt, itag);
1865 #endif
1866 rt->rt_type = res->type;
1869 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1870 u8 tos, struct net_device *dev, int our)
1872 unsigned hash;
1873 struct rtable *rth;
1874 __be32 spec_dst;
1875 struct in_device *in_dev = in_dev_get(dev);
1876 u32 itag = 0;
1878 /* Primary sanity checks. */
1880 if (in_dev == NULL)
1881 return -EINVAL;
1883 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1884 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1885 goto e_inval;
1887 if (ipv4_is_zeronet(saddr)) {
1888 if (!ipv4_is_local_multicast(daddr))
1889 goto e_inval;
1890 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1891 } else if (fib_validate_source(saddr, 0, tos, 0,
1892 dev, &spec_dst, &itag, 0) < 0)
1893 goto e_inval;
1895 rth = dst_alloc(&ipv4_dst_ops);
1896 if (!rth)
1897 goto e_nobufs;
1899 rth->u.dst.output = ip_rt_bug;
1900 rth->u.dst.obsolete = -1;
1902 atomic_set(&rth->u.dst.__refcnt, 1);
1903 rth->u.dst.flags= DST_HOST;
1904 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1905 rth->u.dst.flags |= DST_NOPOLICY;
1906 rth->fl.fl4_dst = daddr;
1907 rth->rt_dst = daddr;
1908 rth->fl.fl4_tos = tos;
1909 rth->fl.mark = skb->mark;
1910 rth->fl.fl4_src = saddr;
1911 rth->rt_src = saddr;
1912 #ifdef CONFIG_NET_CLS_ROUTE
1913 rth->u.dst.tclassid = itag;
1914 #endif
1915 rth->rt_iif =
1916 rth->fl.iif = dev->ifindex;
1917 rth->u.dst.dev = init_net.loopback_dev;
1918 dev_hold(rth->u.dst.dev);
1919 rth->idev = in_dev_get(rth->u.dst.dev);
1920 rth->fl.oif = 0;
1921 rth->rt_gateway = daddr;
1922 rth->rt_spec_dst= spec_dst;
1923 rth->rt_genid = rt_genid(dev_net(dev));
1924 rth->rt_flags = RTCF_MULTICAST;
1925 rth->rt_type = RTN_MULTICAST;
1926 if (our) {
1927 rth->u.dst.input= ip_local_deliver;
1928 rth->rt_flags |= RTCF_LOCAL;
1931 #ifdef CONFIG_IP_MROUTE
1932 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1933 rth->u.dst.input = ip_mr_input;
1934 #endif
1935 RT_CACHE_STAT_INC(in_slow_mc);
1937 in_dev_put(in_dev);
1938 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1939 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1941 e_nobufs:
1942 in_dev_put(in_dev);
1943 return -ENOBUFS;
1945 e_inval:
1946 in_dev_put(in_dev);
1947 return -EINVAL;
1951 static void ip_handle_martian_source(struct net_device *dev,
1952 struct in_device *in_dev,
1953 struct sk_buff *skb,
1954 __be32 daddr,
1955 __be32 saddr)
1957 RT_CACHE_STAT_INC(in_martian_src);
1958 #ifdef CONFIG_IP_ROUTE_VERBOSE
1959 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1961 * RFC1812 recommendation: if the source is martian,
1962 * the only hint is the MAC header.
1964 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1965 &daddr, &saddr, dev->name);
1966 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1967 int i;
1968 const unsigned char *p = skb_mac_header(skb);
1969 printk(KERN_WARNING "ll header: ");
1970 for (i = 0; i < dev->hard_header_len; i++, p++) {
1971 printk("%02x", *p);
1972 if (i < (dev->hard_header_len - 1))
1973 printk(":");
1975 printk("\n");
1978 #endif
1981 static int __mkroute_input(struct sk_buff *skb,
1982 struct fib_result *res,
1983 struct in_device *in_dev,
1984 __be32 daddr, __be32 saddr, u32 tos,
1985 struct rtable **result)
1988 struct rtable *rth;
1989 int err;
1990 struct in_device *out_dev;
1991 unsigned flags = 0;
1992 __be32 spec_dst;
1993 u32 itag;
1995 /* get a working reference to the output device */
1996 out_dev = in_dev_get(FIB_RES_DEV(*res));
1997 if (out_dev == NULL) {
1998 if (net_ratelimit())
1999 printk(KERN_CRIT "Bug in ip_route_input" \
2000 "_slow(). Please, report\n");
2001 return -EINVAL;
2005 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2006 in_dev->dev, &spec_dst, &itag, skb->mark);
2007 if (err < 0) {
2008 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2009 saddr);
2011 err = -EINVAL;
2012 goto cleanup;
2015 if (err)
2016 flags |= RTCF_DIRECTSRC;
2018 if (out_dev == in_dev && err &&
2019 (IN_DEV_SHARED_MEDIA(out_dev) ||
2020 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2021 flags |= RTCF_DOREDIRECT;
2023 if (skb->protocol != htons(ETH_P_IP)) {
2024 /* Not IP (i.e. ARP). Do not create a route if it is
2025 * invalid for proxy arp. DNAT routes are always valid.
2027 * The proxy arp feature has been extended to allow ARP
2028 * replies back on the same interface, to support
2029 * Private VLAN switch technologies. See arp.c.
2031 if (out_dev == in_dev &&
2032 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2033 err = -EINVAL;
2034 goto cleanup;
2039 rth = dst_alloc(&ipv4_dst_ops);
2040 if (!rth) {
2041 err = -ENOBUFS;
2042 goto cleanup;
2045 atomic_set(&rth->u.dst.__refcnt, 1);
2046 rth->u.dst.flags= DST_HOST;
2047 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2048 rth->u.dst.flags |= DST_NOPOLICY;
2049 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2050 rth->u.dst.flags |= DST_NOXFRM;
2051 rth->fl.fl4_dst = daddr;
2052 rth->rt_dst = daddr;
2053 rth->fl.fl4_tos = tos;
2054 rth->fl.mark = skb->mark;
2055 rth->fl.fl4_src = saddr;
2056 rth->rt_src = saddr;
2057 rth->rt_gateway = daddr;
2058 rth->rt_iif =
2059 rth->fl.iif = in_dev->dev->ifindex;
2060 rth->u.dst.dev = (out_dev)->dev;
2061 dev_hold(rth->u.dst.dev);
2062 rth->idev = in_dev_get(rth->u.dst.dev);
2063 rth->fl.oif = 0;
2064 rth->rt_spec_dst= spec_dst;
2066 rth->u.dst.obsolete = -1;
2067 rth->u.dst.input = ip_forward;
2068 rth->u.dst.output = ip_output;
2069 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2071 rt_set_nexthop(rth, res, itag);
2073 rth->rt_flags = flags;
2075 *result = rth;
2076 err = 0;
2077 cleanup:
2078 /* release the working reference to the output device */
2079 in_dev_put(out_dev);
2080 return err;
2083 static int ip_mkroute_input(struct sk_buff *skb,
2084 struct fib_result *res,
2085 const struct flowi *fl,
2086 struct in_device *in_dev,
2087 __be32 daddr, __be32 saddr, u32 tos)
2089 struct rtable* rth = NULL;
2090 int err;
2091 unsigned hash;
2093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2095 fib_select_multipath(fl, res);
2096 #endif
2098 /* create a routing cache entry */
2099 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2100 if (err)
2101 return err;
2103 /* put it into the cache */
2104 hash = rt_hash(daddr, saddr, fl->iif,
2105 rt_genid(dev_net(rth->u.dst.dev)));
2106 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2110 * NOTE. We drop all packets that have local source
2111 * addresses, because every properly looped-back packet
2112 * must have its correct destination already attached by the output routine.
2114 * Such an approach solves two big problems:
2115 * 1. Non-simplex devices are handled properly.
2116 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2119 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120 u8 tos, struct net_device *dev)
2122 struct fib_result res;
2123 struct in_device *in_dev = in_dev_get(dev);
2124 struct flowi fl = { .nl_u = { .ip4_u =
2125 { .daddr = daddr,
2126 .saddr = saddr,
2127 .tos = tos,
2128 .scope = RT_SCOPE_UNIVERSE,
2129 } },
2130 .mark = skb->mark,
2131 .iif = dev->ifindex };
2132 unsigned flags = 0;
2133 u32 itag = 0;
2134 struct rtable * rth;
2135 unsigned hash;
2136 __be32 spec_dst;
2137 int err = -EINVAL;
2138 int free_res = 0;
2139 struct net * net = dev_net(dev);
2141 /* IP on this device is disabled. */
2143 if (!in_dev)
2144 goto out;
	/* Check for the weirdest martians, which cannot be detected
	 * by fib_lookup.
	 */
2150 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2151 ipv4_is_loopback(saddr))
2152 goto martian_source;
2154 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2155 goto brd_input;
	/* Accept zero addresses only for the limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
2160 if (ipv4_is_zeronet(saddr))
2161 goto martian_source;
2163 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2164 ipv4_is_loopback(daddr))
2165 goto martian_destination;
	/*
	 *	Now we are ready to route the packet.
	 */
2170 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2171 if (!IN_DEV_FORWARD(in_dev))
2172 goto e_hostunreach;
2173 goto no_route;
2175 free_res = 1;
2177 RT_CACHE_STAT_INC(in_slow_tot);
2179 if (res.type == RTN_BROADCAST)
2180 goto brd_input;
2182 if (res.type == RTN_LOCAL) {
2183 int result;
2184 result = fib_validate_source(saddr, daddr, tos,
2185 net->loopback_dev->ifindex,
2186 dev, &spec_dst, &itag, skb->mark);
2187 if (result < 0)
2188 goto martian_source;
2189 if (result)
2190 flags |= RTCF_DIRECTSRC;
2191 spec_dst = daddr;
2192 goto local_input;
2195 if (!IN_DEV_FORWARD(in_dev))
2196 goto e_hostunreach;
2197 if (res.type != RTN_UNICAST)
2198 goto martian_destination;
2200 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2201 done:
2202 in_dev_put(in_dev);
2203 if (free_res)
2204 fib_res_put(&res);
2205 out: return err;
2207 brd_input:
2208 if (skb->protocol != htons(ETH_P_IP))
2209 goto e_inval;
2211 if (ipv4_is_zeronet(saddr))
2212 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2213 else {
2214 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2215 &itag, skb->mark);
2216 if (err < 0)
2217 goto martian_source;
2218 if (err)
2219 flags |= RTCF_DIRECTSRC;
2221 flags |= RTCF_BROADCAST;
2222 res.type = RTN_BROADCAST;
2223 RT_CACHE_STAT_INC(in_brd);
2225 local_input:
2226 rth = dst_alloc(&ipv4_dst_ops);
2227 if (!rth)
2228 goto e_nobufs;
2230 rth->u.dst.output= ip_rt_bug;
2231 rth->u.dst.obsolete = -1;
2232 rth->rt_genid = rt_genid(net);
2234 atomic_set(&rth->u.dst.__refcnt, 1);
2235 rth->u.dst.flags= DST_HOST;
2236 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2237 rth->u.dst.flags |= DST_NOPOLICY;
2238 rth->fl.fl4_dst = daddr;
2239 rth->rt_dst = daddr;
2240 rth->fl.fl4_tos = tos;
2241 rth->fl.mark = skb->mark;
2242 rth->fl.fl4_src = saddr;
2243 rth->rt_src = saddr;
2244 #ifdef CONFIG_NET_CLS_ROUTE
2245 rth->u.dst.tclassid = itag;
2246 #endif
2247 rth->rt_iif =
2248 rth->fl.iif = dev->ifindex;
2249 rth->u.dst.dev = net->loopback_dev;
2250 dev_hold(rth->u.dst.dev);
2251 rth->idev = in_dev_get(rth->u.dst.dev);
2252 rth->rt_gateway = daddr;
2253 rth->rt_spec_dst= spec_dst;
2254 rth->u.dst.input= ip_local_deliver;
2255 rth->rt_flags = flags|RTCF_LOCAL;
2256 if (res.type == RTN_UNREACHABLE) {
2257 rth->u.dst.input= ip_error;
2258 rth->u.dst.error= -err;
2259 rth->rt_flags &= ~RTCF_LOCAL;
2261 rth->rt_type = res.type;
2262 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2263 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2264 goto done;
2266 no_route:
2267 RT_CACHE_STAT_INC(in_no_route);
2268 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2269 res.type = RTN_UNREACHABLE;
2270 if (err == -ESRCH)
2271 err = -ENETUNREACH;
2272 goto local_input;
	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
2277 martian_destination:
2278 RT_CACHE_STAT_INC(in_martian_dst);
2279 #ifdef CONFIG_IP_ROUTE_VERBOSE
2280 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2281 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2282 &daddr, &saddr, dev->name);
2283 #endif
2285 e_hostunreach:
2286 err = -EHOSTUNREACH;
2287 goto done;
2289 e_inval:
2290 err = -EINVAL;
2291 goto done;
2293 e_nobufs:
2294 err = -ENOBUFS;
2295 goto done;
2297 martian_source:
2298 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
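/*
 * ip_route_input() is the entry point for routing received packets.
 * It first probes the route hash cache under RCU; on a miss it either
 * handles multicast destinations directly (see the comment inside) or
 * falls back to ip_route_input_slow().
 *
 * A minimal sketch of how the receive path is expected to call it
 * (illustrative only; see ip_input.c for the real caller):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				 iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 */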
2302 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2303 u8 tos, struct net_device *dev)
2305 struct rtable * rth;
2306 unsigned hash;
2307 int iif = dev->ifindex;
2308 struct net *net;
2310 net = dev_net(dev);
2312 if (!rt_caching(net))
2313 goto skip_cache;
2315 tos &= IPTOS_RT_MASK;
2316 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2318 rcu_read_lock();
2319 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2320 rth = rcu_dereference(rth->u.dst.rt_next)) {
2321 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2322 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2323 (rth->fl.iif ^ iif) |
2324 rth->fl.oif |
2325 (rth->fl.fl4_tos ^ tos)) == 0 &&
2326 rth->fl.mark == skb->mark &&
2327 net_eq(dev_net(rth->u.dst.dev), net) &&
2328 !rt_is_expired(rth)) {
2329 dst_use(&rth->u.dst, jiffies);
2330 RT_CACHE_STAT_INC(in_hit);
2331 rcu_read_unlock();
2332 skb_dst_set(skb, &rth->u.dst);
2333 return 0;
2335 RT_CACHE_STAT_INC(in_hlist_search);
2337 rcu_read_unlock();
2339 skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world.  Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not cause a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
2351 if (ipv4_is_multicast(daddr)) {
2352 struct in_device *in_dev;
2354 rcu_read_lock();
2355 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2356 int our = ip_check_mc(in_dev, daddr, saddr,
2357 ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
2365 rcu_read_unlock();
2366 return ip_route_input_mc(skb, daddr, saddr,
2367 tos, dev, our);
2370 rcu_read_unlock();
2371 return -EINVAL;
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
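/*
 * __mkroute_output() allocates and fills a route cache entry for an
 * output route described by the FIB result and flow keys.  It rejects
 * obviously invalid combinations (e.g. a loopback source on a
 * non-loopback device) and classifies the destination as broadcast,
 * multicast or unicast before wiring up the dst input/output handlers.
 */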
2376 static int __mkroute_output(struct rtable **result,
2377 struct fib_result *res,
2378 const struct flowi *fl,
2379 const struct flowi *oldflp,
2380 struct net_device *dev_out,
2381 unsigned flags)
2383 struct rtable *rth;
2384 struct in_device *in_dev;
2385 u32 tos = RT_FL_TOS(oldflp);
2386 int err = 0;
2388 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2389 return -EINVAL;
2391 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2392 res->type = RTN_BROADCAST;
2393 else if (ipv4_is_multicast(fl->fl4_dst))
2394 res->type = RTN_MULTICAST;
2395 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2396 return -EINVAL;
2398 if (dev_out->flags & IFF_LOOPBACK)
2399 flags |= RTCF_LOCAL;
	/* get a working reference to the inet device */
2402 in_dev = in_dev_get(dev_out);
2403 if (!in_dev)
2404 return -EINVAL;
2406 if (res->type == RTN_BROADCAST) {
2407 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2408 if (res->fi) {
2409 fib_info_put(res->fi);
2410 res->fi = NULL;
2412 } else if (res->type == RTN_MULTICAST) {
2413 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2414 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2415 oldflp->proto))
2416 flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
2421 if (res->fi && res->prefixlen < 4) {
2422 fib_info_put(res->fi);
2423 res->fi = NULL;
2428 rth = dst_alloc(&ipv4_dst_ops);
2429 if (!rth) {
2430 err = -ENOBUFS;
2431 goto cleanup;
2434 atomic_set(&rth->u.dst.__refcnt, 1);
2435 rth->u.dst.flags= DST_HOST;
2436 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2437 rth->u.dst.flags |= DST_NOXFRM;
2438 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2439 rth->u.dst.flags |= DST_NOPOLICY;
2441 rth->fl.fl4_dst = oldflp->fl4_dst;
2442 rth->fl.fl4_tos = tos;
2443 rth->fl.fl4_src = oldflp->fl4_src;
2444 rth->fl.oif = oldflp->oif;
2445 rth->fl.mark = oldflp->mark;
2446 rth->rt_dst = fl->fl4_dst;
2447 rth->rt_src = fl->fl4_src;
2448 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
2451 rth->u.dst.dev = dev_out;
2452 dev_hold(dev_out);
2453 rth->idev = in_dev_get(dev_out);
2454 rth->rt_gateway = fl->fl4_dst;
2455 rth->rt_spec_dst= fl->fl4_src;
2457 rth->u.dst.output=ip_output;
2458 rth->u.dst.obsolete = -1;
2459 rth->rt_genid = rt_genid(dev_net(dev_out));
2461 RT_CACHE_STAT_INC(out_slow_tot);
2463 if (flags & RTCF_LOCAL) {
2464 rth->u.dst.input = ip_local_deliver;
2465 rth->rt_spec_dst = fl->fl4_dst;
2467 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468 rth->rt_spec_dst = fl->fl4_src;
2469 if (flags & RTCF_LOCAL &&
2470 !(dev_out->flags & IFF_LOOPBACK)) {
2471 rth->u.dst.output = ip_mc_output;
2472 RT_CACHE_STAT_INC(out_slow_mc);
2474 #ifdef CONFIG_IP_MROUTE
2475 if (res->type == RTN_MULTICAST) {
2476 if (IN_DEV_MFORWARD(in_dev) &&
2477 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2478 rth->u.dst.input = ip_mr_input;
2479 rth->u.dst.output = ip_mc_output;
2482 #endif
2485 rt_set_nexthop(rth, res, 0);
2487 rth->rt_flags = flags;
2489 *result = rth;
2490 cleanup:
	/* release the working reference to the inet device */
2492 in_dev_put(in_dev);
	return err;
}
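/*
 * ip_mkroute_output() is a thin wrapper: build the output route cache
 * entry via __mkroute_output() and, on success, insert it into the
 * route hash table keyed on the original flow.
 */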
2497 static int ip_mkroute_output(struct rtable **rp,
2498 struct fib_result *res,
2499 const struct flowi *fl,
2500 const struct flowi *oldflp,
2501 struct net_device *dev_out,
2502 unsigned flags)
2504 struct rtable *rth = NULL;
2505 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2506 unsigned hash;
2507 if (err == 0) {
2508 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2509 rt_genid(dev_net(dev_out)));
2510 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2513 return err;
2517 * Major route resolver routine.
2520 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2521 const struct flowi *oldflp)
2523 u32 tos = RT_FL_TOS(oldflp);
2524 struct flowi fl = { .nl_u = { .ip4_u =
2525 { .daddr = oldflp->fl4_dst,
2526 .saddr = oldflp->fl4_src,
2527 .tos = tos & IPTOS_RT_MASK,
2528 .scope = ((tos & RTO_ONLINK) ?
2529 RT_SCOPE_LINK :
2530 RT_SCOPE_UNIVERSE),
2531 } },
2532 .mark = oldflp->mark,
2533 .iif = net->loopback_dev->ifindex,
2534 .oif = oldflp->oif };
2535 struct fib_result res;
2536 unsigned flags = 0;
2537 struct net_device *dev_out = NULL;
2538 int free_res = 0;
2539 int err;
2542 res.fi = NULL;
2543 #ifdef CONFIG_IP_MULTIPLE_TABLES
2544 res.r = NULL;
2545 #endif
2547 if (oldflp->fl4_src) {
2548 err = -EINVAL;
2549 if (ipv4_is_multicast(oldflp->fl4_src) ||
2550 ipv4_is_lbcast(oldflp->fl4_src) ||
2551 ipv4_is_zeronet(oldflp->fl4_src))
2552 goto out;
		/* I removed a check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with an saddr
		      of another iface. --ANK
		 */
2562 if (oldflp->oif == 0 &&
2563 (ipv4_is_multicast(oldflp->fl4_dst) ||
2564 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2565 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2566 dev_out = ip_dev_find(net, oldflp->fl4_src);
2567 if (dev_out == NULL)
2568 goto out;
			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (the routing cache cannot
			   know that ttl is zero, so the packet will not leave
			   this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */
2585 fl.oif = dev_out->ifindex;
2586 goto make_route;
2589 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2590 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2591 dev_out = ip_dev_find(net, oldflp->fl4_src);
2592 if (dev_out == NULL)
2593 goto out;
2594 dev_put(dev_out);
2595 dev_out = NULL;
2600 if (oldflp->oif) {
2601 dev_out = dev_get_by_index(net, oldflp->oif);
2602 err = -ENODEV;
2603 if (dev_out == NULL)
2604 goto out;
2606 /* RACE: Check return value of inet_select_addr instead. */
2607 if (__in_dev_get_rtnl(dev_out) == NULL) {
2608 dev_put(dev_out);
2609 goto out; /* Wrong error code */
2612 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2613 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2614 if (!fl.fl4_src)
2615 fl.fl4_src = inet_select_addr(dev_out, 0,
2616 RT_SCOPE_LINK);
2617 goto make_route;
2619 if (!fl.fl4_src) {
2620 if (ipv4_is_multicast(oldflp->fl4_dst))
2621 fl.fl4_src = inet_select_addr(dev_out, 0,
2622 fl.fl4_scope);
2623 else if (!oldflp->fl4_dst)
2624 fl.fl4_src = inet_select_addr(dev_out, 0,
2625 RT_SCOPE_HOST);
2629 if (!fl.fl4_dst) {
2630 fl.fl4_dst = fl.fl4_src;
2631 if (!fl.fl4_dst)
2632 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2633 if (dev_out)
2634 dev_put(dev_out);
2635 dev_out = net->loopback_dev;
2636 dev_hold(dev_out);
2637 fl.oif = net->loopback_dev->ifindex;
2638 res.type = RTN_LOCAL;
2639 flags |= RTCF_LOCAL;
2640 goto make_route;
2643 if (fib_lookup(net, &fl, &res)) {
2644 res.fi = NULL;
2645 if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to check whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could do this even if oif is unknown,
			   as IPv6 likely does, but we do not.
			 */
2664 if (fl.fl4_src == 0)
2665 fl.fl4_src = inet_select_addr(dev_out, 0,
2666 RT_SCOPE_LINK);
2667 res.type = RTN_UNICAST;
2668 goto make_route;
2670 if (dev_out)
2671 dev_put(dev_out);
2672 err = -ENETUNREACH;
2673 goto out;
2675 free_res = 1;
2677 if (res.type == RTN_LOCAL) {
2678 if (!fl.fl4_src)
2679 fl.fl4_src = fl.fl4_dst;
2680 if (dev_out)
2681 dev_put(dev_out);
2682 dev_out = net->loopback_dev;
2683 dev_hold(dev_out);
2684 fl.oif = dev_out->ifindex;
2685 if (res.fi)
2686 fib_info_put(res.fi);
2687 res.fi = NULL;
2688 flags |= RTCF_LOCAL;
2689 goto make_route;
2692 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2693 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2694 fib_select_multipath(&fl, &res);
2695 else
2696 #endif
2697 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2698 fib_select_default(net, &fl, &res);
2700 if (!fl.fl4_src)
2701 fl.fl4_src = FIB_RES_PREFSRC(res);
2703 if (dev_out)
2704 dev_put(dev_out);
2705 dev_out = FIB_RES_DEV(res);
2706 dev_hold(dev_out);
2707 fl.oif = dev_out->ifindex;
2710 make_route:
2711 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2714 if (free_res)
2715 fib_res_put(&res);
2716 if (dev_out)
2717 dev_put(dev_out);
out:	return err;
}
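/*
 * __ip_route_output_key() is the output-route fast path: look the flow
 * up in the route hash cache (under rcu_read_lock_bh) and fall back to
 * ip_route_output_slow() on a miss or when caching is disabled.
 */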
2721 int __ip_route_output_key(struct net *net, struct rtable **rp,
2722 const struct flowi *flp)
2724 unsigned hash;
2725 struct rtable *rth;
2727 if (!rt_caching(net))
2728 goto slow_output;
2730 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2732 rcu_read_lock_bh();
2733 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2734 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2735 if (rth->fl.fl4_dst == flp->fl4_dst &&
2736 rth->fl.fl4_src == flp->fl4_src &&
2737 rth->fl.iif == 0 &&
2738 rth->fl.oif == flp->oif &&
2739 rth->fl.mark == flp->mark &&
2740 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2741 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2742 net_eq(dev_net(rth->u.dst.dev), net) &&
2743 !rt_is_expired(rth)) {
2744 dst_use(&rth->u.dst, jiffies);
2745 RT_CACHE_STAT_INC(out_hit);
2746 rcu_read_unlock_bh();
2747 *rp = rth;
2748 return 0;
2750 RT_CACHE_STAT_INC(out_hlist_search);
2752 rcu_read_unlock_bh();
2754 slow_output:
	return ip_route_output_slow(net, rp, flp);
}
2758 EXPORT_SYMBOL_GPL(__ip_route_output_key);
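/*
 * Blackhole routes deliberately ignore PMTU updates, so the handler
 * below is an empty stub; the dst_ops otherwise reuse the normal IPv4
 * destroy/check callbacks.
 */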
2760 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2764 static struct dst_ops ipv4_dst_blackhole_ops = {
2765 .family = AF_INET,
2766 .protocol = cpu_to_be16(ETH_P_IP),
2767 .destroy = ipv4_dst_destroy,
2768 .check = ipv4_dst_check,
2769 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
	.entries		= ATOMIC_INIT(0),
};
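/*
 * ipv4_dst_blackhole() clones an existing route into a "blackhole"
 * entry whose input/output handlers simply discard packets.  It is
 * used below when __xfrm_lookup() returns -EREMOTE, so that a
 * non-blocking caller still gets a usable (but packet-dropping) dst,
 * typically while IPsec state resolution is pending.
 */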
2774 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2776 struct rtable *ort = *rp;
2777 struct rtable *rt = (struct rtable *)
2778 dst_alloc(&ipv4_dst_blackhole_ops);
2780 if (rt) {
2781 struct dst_entry *new = &rt->u.dst;
2783 atomic_set(&new->__refcnt, 1);
2784 new->__use = 1;
2785 new->input = dst_discard;
2786 new->output = dst_discard;
2787 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2789 new->dev = ort->u.dst.dev;
2790 if (new->dev)
2791 dev_hold(new->dev);
2793 rt->fl = ort->fl;
2795 rt->idev = ort->idev;
2796 if (rt->idev)
2797 in_dev_hold(rt->idev);
2798 rt->rt_genid = rt_genid(net);
2799 rt->rt_flags = ort->rt_flags;
2800 rt->rt_type = ort->rt_type;
2801 rt->rt_dst = ort->rt_dst;
2802 rt->rt_src = ort->rt_src;
2803 rt->rt_iif = ort->rt_iif;
2804 rt->rt_gateway = ort->rt_gateway;
2805 rt->rt_spec_dst = ort->rt_spec_dst;
2806 rt->peer = ort->peer;
2807 if (rt->peer)
2808 atomic_inc(&rt->peer->refcnt);
2810 dst_free(new);
2813 dst_release(&(*rp)->u.dst);
2814 *rp = rt;
	return (rt ? 0 : -ENOMEM);
}
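/*
 * ip_route_output_flow() resolves an output route for @flp and, when a
 * protocol is set, runs the result through the xfrm (IPsec) lookup.
 *
 * A minimal sketch of a typical caller (field values are illustrative):
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = daddr,
 *						 .saddr = saddr,
 *						 .tos = tos } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_flow(net, &rt, &fl, sk, 0))
 *		goto no_route;
 */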
2818 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2819 struct sock *sk, int flags)
2821 int err;
2823 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2824 return err;
2826 if (flp->proto) {
2827 if (!flp->fl4_src)
2828 flp->fl4_src = (*rp)->rt_src;
2829 if (!flp->fl4_dst)
2830 flp->fl4_dst = (*rp)->rt_dst;
2831 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2832 flags ? XFRM_LOOKUP_WAIT : 0);
2833 if (err == -EREMOTE)
2834 err = ipv4_dst_blackhole(net, rp, flp);
2836 return err;
2839 return 0;
2842 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2844 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
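/*
 * rt_fill_info() dumps one route cache entry into a netlink RTM_NEWROUTE
 * message: flow keys, device, metrics, peer info and, for input routes,
 * possible multicast forwarding data via ipmr_get_route().
 */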
2849 static int rt_fill_info(struct net *net,
2850 struct sk_buff *skb, u32 pid, u32 seq, int event,
2851 int nowait, unsigned int flags)
2853 struct rtable *rt = skb_rtable(skb);
2854 struct rtmsg *r;
2855 struct nlmsghdr *nlh;
2856 long expires;
2857 u32 id = 0, ts = 0, tsage = 0, error;
2859 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2860 if (nlh == NULL)
2861 return -EMSGSIZE;
2863 r = nlmsg_data(nlh);
2864 r->rtm_family = AF_INET;
2865 r->rtm_dst_len = 32;
2866 r->rtm_src_len = 0;
2867 r->rtm_tos = rt->fl.fl4_tos;
2868 r->rtm_table = RT_TABLE_MAIN;
2869 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2870 r->rtm_type = rt->rt_type;
2871 r->rtm_scope = RT_SCOPE_UNIVERSE;
2872 r->rtm_protocol = RTPROT_UNSPEC;
2873 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2874 if (rt->rt_flags & RTCF_NOTIFY)
2875 r->rtm_flags |= RTM_F_NOTIFY;
2877 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2879 if (rt->fl.fl4_src) {
2880 r->rtm_src_len = 32;
2881 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2883 if (rt->u.dst.dev)
2884 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2885 #ifdef CONFIG_NET_CLS_ROUTE
2886 if (rt->u.dst.tclassid)
2887 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2888 #endif
2889 if (rt->fl.iif)
2890 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2891 else if (rt->rt_src != rt->fl.fl4_src)
2892 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2894 if (rt->rt_dst != rt->rt_gateway)
2895 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2897 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2898 goto nla_put_failure;
2900 error = rt->u.dst.error;
2901 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2902 if (rt->peer) {
2903 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2904 if (rt->peer->tcp_ts_stamp) {
2905 ts = rt->peer->tcp_ts;
2906 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2910 if (rt->fl.iif) {
2911 #ifdef CONFIG_IP_MROUTE
2912 __be32 dst = rt->rt_dst;
2914 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2915 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2916 int err = ipmr_get_route(net, skb, r, nowait);
2917 if (err <= 0) {
2918 if (!nowait) {
2919 if (err == 0)
2920 return 0;
2921 goto nla_put_failure;
2922 } else {
2923 if (err == -EMSGSIZE)
2924 goto nla_put_failure;
2925 error = err;
2928 } else
2929 #endif
2930 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2933 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2934 expires, error) < 0)
2935 goto nla_put_failure;
2937 return nlmsg_end(skb, nlh);
2939 nla_put_failure:
2940 nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
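/*
 * inet_rtm_getroute() services RTM_GETROUTE requests (e.g. "ip route
 * get"): it builds a dummy skb, resolves the route through either
 * ip_route_input() or ip_route_output_key() depending on whether an
 * input interface was given, and replies with rt_fill_info().
 */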
2944 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2946 struct net *net = sock_net(in_skb->sk);
2947 struct rtmsg *rtm;
2948 struct nlattr *tb[RTA_MAX+1];
2949 struct rtable *rt = NULL;
2950 __be32 dst = 0;
2951 __be32 src = 0;
2952 u32 iif;
2953 int err;
2954 struct sk_buff *skb;
2956 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2957 if (err < 0)
2958 goto errout;
2960 rtm = nlmsg_data(nlh);
2962 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2963 if (skb == NULL) {
2964 err = -ENOBUFS;
2965 goto errout;
	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
2971 skb_reset_mac_header(skb);
2972 skb_reset_network_header(skb);
2974 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2975 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2976 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2978 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2979 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2980 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2982 if (iif) {
2983 struct net_device *dev;
2985 dev = __dev_get_by_index(net, iif);
2986 if (dev == NULL) {
2987 err = -ENODEV;
2988 goto errout_free;
2991 skb->protocol = htons(ETH_P_IP);
2992 skb->dev = dev;
2993 local_bh_disable();
2994 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2995 local_bh_enable();
2997 rt = skb_rtable(skb);
2998 if (err == 0 && rt->u.dst.error)
2999 err = -rt->u.dst.error;
3000 } else {
3001 struct flowi fl = {
3002 .nl_u = {
3003 .ip4_u = {
3004 .daddr = dst,
3005 .saddr = src,
3006 .tos = rtm->rtm_tos,
3009 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3011 err = ip_route_output_key(net, &rt, &fl);
3014 if (err)
3015 goto errout_free;
3017 skb_dst_set(skb, &rt->u.dst);
3018 if (rtm->rtm_flags & RTM_F_NOTIFY)
3019 rt->rt_flags |= RTCF_NOTIFY;
3021 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3022 RTM_NEWROUTE, 0, 0);
3023 if (err <= 0)
3024 goto errout_free;
3026 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3027 errout:
3028 return err;
3030 errout_free:
3031 kfree_skb(skb);
	goto errout;
}
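/*
 * ip_rt_dump() walks the whole route cache hash table for an
 * RTM_GETROUTE dump request, resuming from the bucket/index stored in
 * the netlink callback args.
 */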
3035 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3037 struct rtable *rt;
3038 int h, s_h;
3039 int idx, s_idx;
3040 struct net *net;
3042 net = sock_net(skb->sk);
3044 s_h = cb->args[0];
3045 if (s_h < 0)
3046 s_h = 0;
3047 s_idx = idx = cb->args[1];
3048 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3049 if (!rt_hash_table[h].chain)
3050 continue;
3051 rcu_read_lock_bh();
3052 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3053 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3054 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3055 continue;
3056 if (rt_is_expired(rt))
3057 continue;
3058 skb_dst_set(skb, dst_clone(&rt->u.dst));
3059 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3060 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3061 1, NLM_F_MULTI) <= 0) {
3062 skb_dst_drop(skb);
3063 rcu_read_unlock_bh();
3064 goto done;
3066 skb_dst_drop(skb);
3068 rcu_read_unlock_bh();
3071 done:
3072 cb->args[0] = h;
3073 cb->args[1] = idx;
3074 return skb->len;
3077 void ip_rt_multicast_event(struct in_device *in_dev)
3079 rt_cache_flush(dev_net(in_dev->dev), 0);
3082 #ifdef CONFIG_SYSCTL
3083 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3084 void __user *buffer,
3085 size_t *lenp, loff_t *ppos)
3087 if (write) {
3088 int flush_delay;
3089 ctl_table ctl;
3090 struct net *net;
3092 memcpy(&ctl, __ctl, sizeof(ctl));
3093 ctl.data = &flush_delay;
3094 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3096 net = (struct net *)__ctl->extra1;
3097 rt_cache_flush(net, flush_delay);
3098 return 0;
	return -EINVAL;
}
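/*
 * rt_secret_reschedule() adjusts the per-namespace secret-rebuild timer
 * after ip_rt_secret_interval has been changed via sysctl: a zero
 * interval stops the timers, otherwise pending expiries are shifted by
 * the difference between the old and new interval.
 */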
3104 static void rt_secret_reschedule(int old)
3106 struct net *net;
3107 int new = ip_rt_secret_interval;
3108 int diff = new - old;
3110 if (!diff)
3111 return;
3113 rtnl_lock();
3114 for_each_net(net) {
3115 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3116 long time;
3118 if (!new)
3119 continue;
3121 if (deleted) {
3122 time = net->ipv4.rt_secret_timer.expires - jiffies;
3124 if (time <= 0 || (time += diff) <= 0)
3125 time = 0;
3126 } else
3127 time = new;
3129 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
3131 rtnl_unlock();
3134 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3135 void __user *buffer, size_t *lenp,
3136 loff_t *ppos)
3138 int old = ip_rt_secret_interval;
3139 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3141 rt_secret_reschedule(old);
3143 return ret;
3146 static ctl_table ipv4_route_table[] = {
3148 .procname = "gc_thresh",
3149 .data = &ipv4_dst_ops.gc_thresh,
3150 .maxlen = sizeof(int),
3151 .mode = 0644,
3152 .proc_handler = proc_dointvec,
3155 .procname = "max_size",
3156 .data = &ip_rt_max_size,
3157 .maxlen = sizeof(int),
3158 .mode = 0644,
3159 .proc_handler = proc_dointvec,
3162 /* Deprecated. Use gc_min_interval_ms */
3164 .procname = "gc_min_interval",
3165 .data = &ip_rt_gc_min_interval,
3166 .maxlen = sizeof(int),
3167 .mode = 0644,
3168 .proc_handler = proc_dointvec_jiffies,
3171 .procname = "gc_min_interval_ms",
3172 .data = &ip_rt_gc_min_interval,
3173 .maxlen = sizeof(int),
3174 .mode = 0644,
3175 .proc_handler = proc_dointvec_ms_jiffies,
3178 .procname = "gc_timeout",
3179 .data = &ip_rt_gc_timeout,
3180 .maxlen = sizeof(int),
3181 .mode = 0644,
3182 .proc_handler = proc_dointvec_jiffies,
3185 .procname = "gc_interval",
3186 .data = &ip_rt_gc_interval,
3187 .maxlen = sizeof(int),
3188 .mode = 0644,
3189 .proc_handler = proc_dointvec_jiffies,
3192 .procname = "redirect_load",
3193 .data = &ip_rt_redirect_load,
3194 .maxlen = sizeof(int),
3195 .mode = 0644,
3196 .proc_handler = proc_dointvec,
3199 .procname = "redirect_number",
3200 .data = &ip_rt_redirect_number,
3201 .maxlen = sizeof(int),
3202 .mode = 0644,
3203 .proc_handler = proc_dointvec,
3206 .procname = "redirect_silence",
3207 .data = &ip_rt_redirect_silence,
3208 .maxlen = sizeof(int),
3209 .mode = 0644,
3210 .proc_handler = proc_dointvec,
3213 .procname = "error_cost",
3214 .data = &ip_rt_error_cost,
3215 .maxlen = sizeof(int),
3216 .mode = 0644,
3217 .proc_handler = proc_dointvec,
3220 .procname = "error_burst",
3221 .data = &ip_rt_error_burst,
3222 .maxlen = sizeof(int),
3223 .mode = 0644,
3224 .proc_handler = proc_dointvec,
3227 .procname = "gc_elasticity",
3228 .data = &ip_rt_gc_elasticity,
3229 .maxlen = sizeof(int),
3230 .mode = 0644,
3231 .proc_handler = proc_dointvec,
3234 .procname = "mtu_expires",
3235 .data = &ip_rt_mtu_expires,
3236 .maxlen = sizeof(int),
3237 .mode = 0644,
3238 .proc_handler = proc_dointvec_jiffies,
3241 .procname = "min_pmtu",
3242 .data = &ip_rt_min_pmtu,
3243 .maxlen = sizeof(int),
3244 .mode = 0644,
3245 .proc_handler = proc_dointvec,
3248 .procname = "min_adv_mss",
3249 .data = &ip_rt_min_advmss,
3250 .maxlen = sizeof(int),
3251 .mode = 0644,
3252 .proc_handler = proc_dointvec,
3255 .procname = "secret_interval",
3256 .data = &ip_rt_secret_interval,
3257 .maxlen = sizeof(int),
3258 .mode = 0644,
3259 .proc_handler = ipv4_sysctl_rt_secret_interval,
3264 static struct ctl_table empty[1];
3266 static struct ctl_table ipv4_skeleton[] =
3268 { .procname = "route",
3269 .mode = 0555, .child = ipv4_route_table},
3270 { .procname = "neigh",
3271 .mode = 0555, .child = empty},
3275 static __net_initdata struct ctl_path ipv4_path[] = {
3276 { .procname = "net", },
3277 { .procname = "ipv4", },
3278 { },
3281 static struct ctl_table ipv4_route_flush_table[] = {
3283 .procname = "flush",
3284 .maxlen = sizeof(int),
3285 .mode = 0200,
3286 .proc_handler = ipv4_sysctl_rtcache_flush,
3288 { },
3291 static __net_initdata struct ctl_path ipv4_route_path[] = {
3292 { .procname = "net", },
3293 { .procname = "ipv4", },
3294 { .procname = "route", },
	{ },
};
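/*
 * Register the per-namespace "flush" sysctl.  Namespaces other than
 * init_net get their own copy of the table so that extra1 can point at
 * the right struct net for rt_cache_flush().
 */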
3298 static __net_init int sysctl_route_net_init(struct net *net)
3300 struct ctl_table *tbl;
3302 tbl = ipv4_route_flush_table;
3303 if (!net_eq(net, &init_net)) {
3304 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3305 if (tbl == NULL)
3306 goto err_dup;
3308 tbl[0].extra1 = net;
3310 net->ipv4.route_hdr =
3311 register_net_sysctl_table(net, ipv4_route_path, tbl);
3312 if (net->ipv4.route_hdr == NULL)
3313 goto err_reg;
3314 return 0;
3316 err_reg:
3317 if (tbl != ipv4_route_flush_table)
3318 kfree(tbl);
3319 err_dup:
3320 return -ENOMEM;
3323 static __net_exit void sysctl_route_net_exit(struct net *net)
3325 struct ctl_table *tbl;
3327 tbl = net->ipv4.route_hdr->ctl_table_arg;
3328 unregister_net_sysctl_table(net->ipv4.route_hdr);
3329 BUG_ON(tbl == ipv4_route_flush_table);
3330 kfree(tbl);
3333 static __net_initdata struct pernet_operations sysctl_route_ops = {
3334 .init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
3337 #endif
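/*
 * Per-namespace init: seed rt_genid and, if ip_rt_secret_interval is
 * non-zero, start the deferrable timer that periodically rebuilds the
 * route cache hash secret (rt_secret_rebuild).
 */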
3340 static __net_init int rt_secret_timer_init(struct net *net)
3342 atomic_set(&net->ipv4.rt_genid,
3343 (int) ((num_physpages ^ (num_physpages>>8)) ^
3344 (jiffies ^ (jiffies >> 7))));
3346 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3347 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3348 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3350 if (ip_rt_secret_interval) {
3351 net->ipv4.rt_secret_timer.expires =
3352 jiffies + net_random() % ip_rt_secret_interval +
3353 ip_rt_secret_interval;
3354 add_timer(&net->ipv4.rt_secret_timer);
3356 return 0;
3359 static __net_exit void rt_secret_timer_exit(struct net *net)
3361 del_timer_sync(&net->ipv4.rt_secret_timer);
3364 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3365 .init = rt_secret_timer_init,
3366 .exit = rt_secret_timer_exit,
3370 #ifdef CONFIG_NET_CLS_ROUTE
3371 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3372 #endif /* CONFIG_NET_CLS_ROUTE */
3374 static __initdata unsigned long rhash_entries;
3375 static int __init set_rhash_entries(char *str)
3377 if (!str)
3378 return 0;
3379 rhash_entries = simple_strtoul(str, &str, 0);
3380 return 1;
3382 __setup("rhash_entries=", set_rhash_entries);
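/*
 * ip_rt_init() is called at boot: allocate the route cache hash table
 * (sized by the "rhash_entries=" boot parameter when given), set up the
 * dst kmem cache, GC thresholds, periodic workers, per-net subsystems
 * and the RTM_GETROUTE handler.
 */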
3384 int __init ip_rt_init(void)
3386 int rc = 0;
3388 #ifdef CONFIG_NET_CLS_ROUTE
3389 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3390 if (!ip_rt_acct)
3391 panic("IP: failed to allocate ip_rt_acct\n");
3392 #endif
3394 ipv4_dst_ops.kmem_cachep =
3395 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3396 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3398 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3400 rt_hash_table = (struct rt_hash_bucket *)
3401 alloc_large_system_hash("IP route cache",
3402 sizeof(struct rt_hash_bucket),
3403 rhash_entries,
3404 (totalram_pages >= 128 * 1024) ?
3405 15 : 17,
3407 &rt_hash_log,
3408 &rt_hash_mask,
3409 rhash_entries ? 0 : 512 * 1024);
3410 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3411 rt_hash_lock_init();
3413 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3414 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3416 devinet_init();
3417 ip_fib_init();
	/* All the timers started at system startup tend
	 * to synchronize. Perturb them a bit.
	 */
3422 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3423 expires_ljiffies = jiffies;
3424 schedule_delayed_work(&expires_work,
3425 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3427 if (register_pernet_subsys(&rt_secret_timer_ops))
3428 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3430 if (ip_rt_proc_init())
3431 printk(KERN_ERR "Unable to create route proc files\n");
3432 #ifdef CONFIG_XFRM
3433 xfrm_init();
3434 xfrm4_init(ip_rt_max_size);
3435 #endif
3436 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3438 #ifdef CONFIG_SYSCTL
3439 register_pernet_subsys(&sysctl_route_ops);
3440 #endif
3441 return rc;
3444 #ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
3449 void __init ip_static_sysctl_init(void)
3451 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3453 #endif
3455 EXPORT_SYMBOL(__ip_select_ident);
3456 EXPORT_SYMBOL(ip_route_input);
3457 EXPORT_SYMBOL(ip_route_output_key);