net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static struct delayed_work expires_work;
 135 static unsigned long expires_ljiffies;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149 static void rt_emergency_hash_rebuild(struct net *net);
 150
 151
 152 static struct dst_ops ipv4_dst_ops = {
 153         .family =               AF_INET,
 154         .protocol =             cpu_to_be16(ETH_P_IP),
 155         .gc =                   rt_garbage_collect,
 156         .check =                ipv4_dst_check,
 157         .destroy =              ipv4_dst_destroy,
 158         .ifdown =               ipv4_dst_ifdown,
 159         .negative_advice =      ipv4_negative_advice,
 160         .link_failure =         ipv4_link_failure,
 161         .update_pmtu =          ip_rt_update_pmtu,
 162         .local_out =            __ip_local_out,
 163         .entries =              ATOMIC_INIT(0),
 164 };
 165
 166 #define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168 const __u8 ip_tos2prio[16] = {
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(FILLER),
 171         TC_PRIO_BESTEFFORT,
 172         ECN_OR_COST(BESTEFFORT),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_BULK,
 176         ECN_OR_COST(BULK),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE,
 180         ECN_OR_COST(INTERACTIVE),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK),
 183         TC_PRIO_INTERACTIVE_BULK,
 184         ECN_OR_COST(INTERACTIVE_BULK)
 185 };
 186
 187
 188 /*
 189  * Route cache.
 190  */
 191
 192 /* The locking scheme is rather straightforward:
 193  *
 194  * 1) Read-Copy Update protects the buckets of the central route hash.
 195  * 2) Only writers remove entries, and they hold the lock
 196  *    as they look at rtable reference counts.
 197  * 3) Only readers acquire references to rtable entries,
 198  *    they do so with atomic increments and with the
 199  *    lock held.
 200  */
 201
 202 struct rt_hash_bucket {
 203         struct rtable   *chain;
 204 };
 205
 206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 207         defined(CONFIG_PROVE_LOCKING)
 208 /*
 209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 210  * The size of this table is a power of two and depends on the number of CPUS.
 211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 212  */
 213 #ifdef CONFIG_LOCKDEP
 214 # define RT_HASH_LOCK_SZ        256
 215 #else
 216 # if NR_CPUS >= 32
 217 #  define RT_HASH_LOCK_SZ       4096
 218 # elif NR_CPUS >= 16
 219 #  define RT_HASH_LOCK_SZ       2048
 220 # elif NR_CPUS >= 8
 221 #  define RT_HASH_LOCK_SZ       1024
 222 # elif NR_CPUS >= 4
 223 #  define RT_HASH_LOCK_SZ       512
 224 # else
 225 #  define RT_HASH_LOCK_SZ       256
 226 # endif
 227 #endif
 228
 229 static spinlock_t       *rt_hash_locks;
 230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 231
 232 static __init void rt_hash_lock_init(void)
 233 {
 234         int i;
 235
 236         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 237                         GFP_KERNEL);
 238         if (!rt_hash_locks)
 239                 panic("IP: failed to allocate rt_hash_locks\n");
 240
 241         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 242                 spin_lock_init(&rt_hash_locks[i]);
 243 }
 244 #else
 245 # define rt_hash_lock_addr(slot) NULL
 246
 247 static inline void rt_hash_lock_init(void)
 248 {
 249 }
 250 #endif
 251
 252 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 253 static unsigned                 rt_hash_mask __read_mostly;
 254 static unsigned int             rt_hash_log  __read_mostly;
 255
 256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 257 #define RT_CACHE_STAT_INC(field) \
 258         (__raw_get_cpu_var(rt_cache_stat).field++)
 259
 260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 261                 int genid)
 262 {
 263         return jhash_3words((__force u32)(__be32)(daddr),
 264                             (__force u32)(__be32)(saddr),
 265                             idx, genid)
 266                 & rt_hash_mask;
 267 }
 268
 269 static inline int rt_genid(struct net *net)
 270 {
 271         return atomic_read(&net->ipv4.rt_genid);
 272 }
 273
 274 #ifdef CONFIG_PROC_FS
 275 struct rt_cache_iter_state {
 276         struct seq_net_private p;
 277         int bucket;
 278         int genid;
 279 };
 280
 281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 282 {
 283         struct rt_cache_iter_state *st = seq->private;
 284         struct rtable *r = NULL;
 285
 286         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 287                 if (!rt_hash_table[st->bucket].chain)
 288                         continue;
 289                 rcu_read_lock_bh();
 290                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 291                 while (r) {
 292                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 293                             r->rt_genid == st->genid)
 294                                 return r;
 295                         r = rcu_dereference(r->u.dst.rt_next);
 296                 }
 297                 rcu_read_unlock_bh();
 298         }
 299         return r;
 300 }
 301
 302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 303                                           struct rtable *r)
 304 {
 305         struct rt_cache_iter_state *st = seq->private;
 306
 307         r = r->u.dst.rt_next;
 308         while (!r) {
 309                 rcu_read_unlock_bh();
 310                 do {
 311                         if (--st->bucket < 0)
 312                                 return NULL;
 313                 } while (!rt_hash_table[st->bucket].chain);
 314                 rcu_read_lock_bh();
 315                 r = rt_hash_table[st->bucket].chain;
 316         }
 317         return rcu_dereference(r);
 318 }
 319
 320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 321                                         struct rtable *r)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 325                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 326                         continue;
 327                 if (r->rt_genid == st->genid)
 328                         break;
 329         }
 330         return r;
 331 }
 332
 333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 334 {
 335         struct rtable *r = rt_cache_get_first(seq);
 336
 337         if (r)
 338                 while (pos && (r = rt_cache_get_next(seq, r)))
 339                         --pos;
 340         return pos ? NULL : r;
 341 }
 342
 343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346         if (*pos)
 347                 return rt_cache_get_idx(seq, *pos - 1);
 348         st->genid = rt_genid(seq_file_net(seq));
 349         return SEQ_START_TOKEN;
 350 }
 351
 352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 353 {
 354         struct rtable *r;
 355
 356         if (v == SEQ_START_TOKEN)
 357                 r = rt_cache_get_first(seq);
 358         else
 359                 r = rt_cache_get_next(seq, v);
 360         ++*pos;
 361         return r;
 362 }
 363
 364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 365 {
 366         if (v && v != SEQ_START_TOKEN)
 367                 rcu_read_unlock_bh();
 368 }
 369
 370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 371 {
 372         if (v == SEQ_START_TOKEN)
 373                 seq_printf(seq, "%-127s\n",
 374                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 375                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 376                            "HHUptod\tSpecDst");
 377         else {
 378                 struct rtable *r = v;
 379                 int len;
 380
 381                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 382                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 383                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 384                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 385                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 386                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 387                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 388                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 389                         dst_metric(&r->u.dst, RTAX_WINDOW),
 390                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 391                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 392                         r->fl.fl4_tos,
 393                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 394                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 395                                        dev_queue_xmit) : 0,
 396                         r->rt_spec_dst, &len);
 397
 398                 seq_printf(seq, "%*s\n", 127 - len, "");
 399         }
 400         return 0;
 401 }
 402
 403 static const struct seq_operations rt_cache_seq_ops = {
 404         .start  = rt_cache_seq_start,
 405         .next   = rt_cache_seq_next,
 406         .stop   = rt_cache_seq_stop,
 407         .show   = rt_cache_seq_show,
 408 };
 409
 410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 411 {
 412         return seq_open_net(inode, file, &rt_cache_seq_ops,
 413                         sizeof(struct rt_cache_iter_state));
 414 }
 415
 416 static const struct file_operations rt_cache_seq_fops = {
 417         .owner   = THIS_MODULE,
 418         .open    = rt_cache_seq_open,
 419         .read    = seq_read,
 420         .llseek  = seq_lseek,
 421         .release = seq_release_net,
 422 };
 423
 424
 425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 426 {
 427         int cpu;
 428
 429         if (*pos == 0)
 430                 return SEQ_START_TOKEN;
 431
 432         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 433                 if (!cpu_possible(cpu))
 434                         continue;
 435                 *pos = cpu+1;
 436                 return &per_cpu(rt_cache_stat, cpu);
 437         }
 438         return NULL;
 439 }
 440
 441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 442 {
 443         int cpu;
 444
 445         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 446                 if (!cpu_possible(cpu))
 447                         continue;
 448                 *pos = cpu+1;
 449                 return &per_cpu(rt_cache_stat, cpu);
 450         }
 451         return NULL;
 452
 453 }
 454
 455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 456 {
 457
 458 }
 459
 460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 461 {
 462         struct rt_cache_stat *st = v;
 463
 464         if (v == SEQ_START_TOKEN) {
 465                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 466                 return 0;
 467         }
 468
 469         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 470                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 471                    atomic_read(&ipv4_dst_ops.entries),
 472                    st->in_hit,
 473                    st->in_slow_tot,
 474                    st->in_slow_mc,
 475                    st->in_no_route,
 476                    st->in_brd,
 477                    st->in_martian_dst,
 478                    st->in_martian_src,
 479
 480                    st->out_hit,
 481                    st->out_slow_tot,
 482                    st->out_slow_mc,
 483
 484                    st->gc_total,
 485                    st->gc_ignored,
 486                    st->gc_goal_miss,
 487                    st->gc_dst_overflow,
 488                    st->in_hlist_search,
 489                    st->out_hlist_search
 490                 );
 491         return 0;
 492 }
 493
 494 static const struct seq_operations rt_cpu_seq_ops = {
 495         .start  = rt_cpu_seq_start,
 496         .next   = rt_cpu_seq_next,
 497         .stop   = rt_cpu_seq_stop,
 498         .show   = rt_cpu_seq_show,
 499 };
 500
 501
 502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 503 {
 504         return seq_open(file, &rt_cpu_seq_ops);
 505 }
 506
 507 static const struct file_operations rt_cpu_seq_fops = {
 508         .owner   = THIS_MODULE,
 509         .open    = rt_cpu_seq_open,
 510         .read    = seq_read,
 511         .llseek  = seq_lseek,
 512         .release = seq_release,
 513 };
 514
 515 #ifdef CONFIG_NET_CLS_ROUTE
 516 static int rt_acct_proc_show(struct seq_file *m, void *v)
 517 {
 518         struct ip_rt_acct *dst, *src;
 519         unsigned int i, j;
 520
 521         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 522         if (!dst)
 523                 return -ENOMEM;
 524
 525         for_each_possible_cpu(i) {
 526                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 527                 for (j = 0; j < 256; j++) {
 528                         dst[j].o_bytes   += src[j].o_bytes;
 529                         dst[j].o_packets += src[j].o_packets;
 530                         dst[j].i_bytes   += src[j].i_bytes;
 531                         dst[j].i_packets += src[j].i_packets;
 532                 }
 533         }
 534
 535         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 536         kfree(dst);
 537         return 0;
 538 }
 539
 540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 541 {
 542         return single_open(file, rt_acct_proc_show, NULL);
 543 }
 544
 545 static const struct file_operations rt_acct_proc_fops = {
 546         .owner          = THIS_MODULE,
 547         .open           = rt_acct_proc_open,
 548         .read           = seq_read,
 549         .llseek         = seq_lseek,
 550         .release        = single_release,
 551 };
 552 #endif
 553
 554 static int __net_init ip_rt_do_proc_init(struct net *net)
 555 {
 556         struct proc_dir_entry *pde;
 557
 558         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 559                         &rt_cache_seq_fops);
 560         if (!pde)
 561                 goto err1;
 562
 563         pde = proc_create("rt_cache", S_IRUGO,
 564                           net->proc_net_stat, &rt_cpu_seq_fops);
 565         if (!pde)
 566                 goto err2;
 567
 568 #ifdef CONFIG_NET_CLS_ROUTE
 569         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 570         if (!pde)
 571                 goto err3;
 572 #endif
 573         return 0;
 574
 575 #ifdef CONFIG_NET_CLS_ROUTE
 576 err3:
 577         remove_proc_entry("rt_cache", net->proc_net_stat);
 578 #endif
 579 err2:
 580         remove_proc_entry("rt_cache", net->proc_net);
 581 err1:
 582         return -ENOMEM;
 583 }
 584
 585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 586 {
 587         remove_proc_entry("rt_cache", net->proc_net_stat);
 588         remove_proc_entry("rt_cache", net->proc_net);
 589 #ifdef CONFIG_NET_CLS_ROUTE
 590         remove_proc_entry("rt_acct", net->proc_net);
 591 #endif
 592 }
 593
 594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 595         .init = ip_rt_do_proc_init,
 596         .exit = ip_rt_do_proc_exit,
 597 };
 598
 599 static int __init ip_rt_proc_init(void)
 600 {
 601         return register_pernet_subsys(&ip_rt_proc_ops);
 602 }
 603
 604 #else
 605 static inline int ip_rt_proc_init(void)
 606 {
 607         return 0;
 608 }
 609 #endif /* CONFIG_PROC_FS */
 610
 611 static inline void rt_free(struct rtable *rt)
 612 {
 613         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 614 }
 615
 616 static inline void rt_drop(struct rtable *rt)
 617 {
 618         ip_rt_put(rt);
 619         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 620 }
 621
 622 static inline int rt_fast_clean(struct rtable *rth)
 623 {
 624         /* Kill broadcast/multicast entries very aggresively, if they
 625            collide in hash table with more useful entries */
 626         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 627                 rth->fl.iif && rth->u.dst.rt_next;
 628 }
 629
 630 static inline int rt_valuable(struct rtable *rth)
 631 {
 632         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 633                 rth->u.dst.expires;
 634 }
 635
 636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 637 {
 638         unsigned long age;
 639         int ret = 0;
 640
 641         if (atomic_read(&rth->u.dst.__refcnt))
 642                 goto out;
 643
 644         ret = 1;
 645         if (rth->u.dst.expires &&
 646             time_after_eq(jiffies, rth->u.dst.expires))
 647                 goto out;
 648
 649         age = jiffies - rth->u.dst.lastuse;
 650         ret = 0;
 651         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 652             (age <= tmo2 && rt_valuable(rth)))
 653                 goto out;
 654         ret = 1;
 655 out:    return ret;
 656 }
 657
 658 /* Bits of score are:
 659  * 31: very valuable
 660  * 30: not quite useless
 661  * 29..0: usage counter
 662  */
 663 static inline u32 rt_score(struct rtable *rt)
 664 {
 665         u32 score = jiffies - rt->u.dst.lastuse;
 666
 667         score = ~score & ~(3<<30);
 668
 669         if (rt_valuable(rt))
 670                 score |= (1<<31);
 671
 672         if (!rt->fl.iif ||
 673             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 674                 score |= (1<<30);
 675
 676         return score;
 677 }
 678
 679 static inline bool rt_caching(const struct net *net)
 680 {
 681         return net->ipv4.current_rt_cache_rebuild_count <=
 682                 net->ipv4.sysctl_rt_cache_rebuild_count;
 683 }
 684
 685 static inline bool compare_hash_inputs(const struct flowi *fl1,
 686                                         const struct flowi *fl2)
 687 {
 688         return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 689                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 690                 (fl1->iif ^ fl2->iif)) == 0);
 691 }
 692
 693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 694 {
 695         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 696                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 697                 (fl1->mark ^ fl2->mark) |
 698                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 699                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 700                 (fl1->oif ^ fl2->oif) |
 701                 (fl1->iif ^ fl2->iif)) == 0;
 702 }
 703
 704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 705 {
 706         return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
 707 }
 708
 709 static inline int rt_is_expired(struct rtable *rth)
 710 {
 711         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 712 }
 713
 714 /*
 715  * Perform a full scan of the hash table and free the flushed entries.
 716  * Can be called by a softirq or a process.
 717  * In the latter case we want to reschedule if necessary.
      *
      * @process_context: non-zero when the caller may reschedule; then
      *                   cond_resched() is called between buckets whenever
      *                   need_resched() is set.
      *
      * With CONFIG_NET_NS only entries for which rt_is_expired() is true are
      * removed; without it every chain is emptied completely.
 718  */
 719 static void rt_do_flush(int process_context)
 720 {
 721         unsigned int i;
 722         struct rtable *rth, *next;
 723         struct rtable * tail;
 724
 725         for (i = 0; i <= rt_hash_mask; i++) {
 726                 if (process_context && need_resched())
 727                         cond_resched();
                     /* Unlocked peek: empty buckets are skipped without locking. */
 728                 rth = rt_hash_table[i].chain;
 729                 if (!rth)
 730                         continue;
 731
 732                 spin_lock_bh(rt_hash_lock_addr(i));
 733 #ifdef CONFIG_NET_NS
 734                 {
 735                 struct rtable ** prev, * p;
 736
                     /* Re-read the head now that the bucket lock is held. */
 737                 rth = rt_hash_table[i].chain;
 738
 739                 /* defer releasing the head of the list until after spin_unlock */
                     /* tail becomes the first non-expired entry (or NULL). */
 740                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 741                         if (!rt_is_expired(tail))
 742                                 break;
                     /* Detach the leading run of expired entries [rth, tail). */
 743                 if (rth != tail)
 744                         rt_hash_table[i].chain = tail;
 745
 746                 /* call rt_free on entries after the tail requiring flush */
 747                 prev = &rt_hash_table[i].chain;
 748                 for (p = *prev; p; p = next) {
 749                         next = p->u.dst.rt_next;
 750                         if (!rt_is_expired(p)) {
 751                                 prev = &p->u.dst.rt_next;
 752                         } else {
                                     /* Unlink the expired entry in place, then free it. */
 753                                 *prev = next;
 754                                 rt_free(p);
 755                         }
 756                 }
 757                 }
 758 #else
                     /* Detach the whole chain; all of it is freed after unlock. */
 759                 rth = rt_hash_table[i].chain;
 760                 rt_hash_table[i].chain = NULL;
 761                 tail = NULL;
 762 #endif
 763                 spin_unlock_bh(rt_hash_lock_addr(i));
 764
                     /* Free the detached entries [rth, tail) outside the lock. */
 765                 for (; rth != tail; rth = next) {
 766                         next = rth->u.dst.rt_next;
 767                         rt_free(rth);
 768                 }
 769         }
 770 }
 771
 772 /*
 773  * While freeing expired entries, we compute average chain length
 774  * and standard deviation, using fixed-point arithmetic.
 775  * This gives an estimate for rt_chain_length_max:
 776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 777  * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 778  */
 779
 780 #define FRACT_BITS 3
 781 #define ONE (1UL << FRACT_BITS)
 782
     /*
      * Scan part of the hash table, free the entries that are stale or aged
      * off, and refresh rt_chain_length_max from the chain lengths seen.
      *
      * The number of buckets visited grows with the jiffies elapsed since the
      * previous run; the scan resumes after the bucket where that run stopped.
      */
 783 static void rt_check_expire(void)
 784 {
         /* Last bucket visited; kept across calls. */
 785         static unsigned int rover;
 786         unsigned int i = rover, goal;
 787         struct rtable *rth, *aux, **rthp;
 788         unsigned long samples = 0;
 789         unsigned long sum = 0, sum2 = 0;
 790         unsigned long delta;
 791         u64 mult;
 792
 793         delta = jiffies - expires_ljiffies;
 794         expires_ljiffies = jiffies;
         /* goal = (delta << rt_hash_log) / ip_rt_gc_timeout, capped below. */
 795         mult = ((u64)delta) << rt_hash_log;
 796         if (ip_rt_gc_timeout > 1)
 797                 do_div(mult, ip_rt_gc_timeout);
 798         goal = (unsigned int)mult;
 799         if (goal > rt_hash_mask)
 800                 goal = rt_hash_mask + 1;
 801         for (; goal > 0; goal--) {
                 /* Age limit; halved for every entry kept in this chain. */
 802                 unsigned long tmo = ip_rt_gc_timeout;
 803                 unsigned long length;
 804
 805                 i = (i + 1) & rt_hash_mask;
 806                 rthp = &rt_hash_table[i].chain;
 807
 808                 if (need_resched())
 809                         cond_resched();
 810
                 /* Empty buckets count as samples of length 0. */
 811                 samples++;
 812
 813                 if (*rthp == NULL)
 814                         continue;
 815                 length = 0;
 816                 spin_lock_bh(rt_hash_lock_addr(i));
 817                 while ((rth = *rthp) != NULL) {
 818                         prefetch(rth->u.dst.rt_next);
                         /* Stale generation id: unlink and free right away. */
 819                         if (rt_is_expired(rth)) {
 820                                 *rthp = rth->u.dst.rt_next;
 821                                 rt_free(rth);
 822                                 continue;
 823                         }
 824                         if (rth->u.dst.expires) {
 825                                 /* Entry is expired even if it is in use */
 826                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
     /* Keep path; also entered by the goto below when rt_may_expire() says no. */
 827 nofree:
 828                                         tmo >>= 1;
 829                                         rthp = &rth->u.dst.rt_next;
 830                                         /*
 831                                          * We only count entries on
 832                                          * a chain with equal hash inputs once
 833                                          * so that entries for different QOS
 834                                          * levels, and other non-hash input
 835                                          * attributes don't unfairly skew
 836                                          * the length computation
 837                                          */
 838                                         for (aux = rt_hash_table[i].chain;;) {
 839                                                 if (aux == rth) {
 840                                                         length += ONE;
 841                                                         break;
 842                                                 }
 843                                                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 844                                                         break;
 845                                                 aux = aux->u.dst.rt_next;
 846                                         }
 847                                         continue;
 848                                 }
 849                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 850                                 goto nofree;
 851
 852                         /* Cleanup aged off entries. */
 853                         *rthp = rth->u.dst.rt_next;
 854                         rt_free(rth);
 855                 }
 856                 spin_unlock_bh(rt_hash_lock_addr(i));
                 /* length is in fixed point (units of 1/ONE). */
 857                 sum += length;
 858                 sum2 += length*length;
 859         }
 860         if (samples) {
 861                 unsigned long avg = sum / samples;
 862                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
                 /* Convert back from fixed point; never below the elasticity. */
 863                 rt_chain_length_max = max_t(unsigned long,
 864                                         ip_rt_gc_elasticity,
 865                                         (avg + 4*sd) >> FRACT_BITS);
 866         }
 867         rover = i;
 868 }
 869
 870 /*
 871  * rt_worker_func() is run in process context.
 872  * We call rt_check_expire() to scan part of the hash table.
      *
      * After the scan, the delayed work item expires_work is queued again
      * with a delay of ip_rt_gc_interval.  Presumably expires_work is the
      * item bound to this function, which would make the scan periodic; its
      * definition is outside this section, so confirm there.
      * The @work argument itself is not used.
 873  */
 874 static void rt_worker_func(struct work_struct *work)
 875 {
 876         rt_check_expire();
 877         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 878 }
 879
 880 /*
 881  * Perturbation of rt_genid by a small quantity [1..256]
 882  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 883  * many times (2^24) without giving a recent rt_genid.
 884  * Jenkins hash is strong enough that little changes of rt_genid are OK.
 885  */
 886 static void rt_cache_invalidate(struct net *net)
 887 {
 888         unsigned char shuffle;
 889
 890         get_random_bytes(&shuffle, sizeof(shuffle));
              /* One random byte plus 1: the increment can never be zero. */
 891         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 892 }
 893
 894 /*
 895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 896  * delay >= 0 : invalidate & flush cache (can be long)
      *
      * Only the sign of @delay is examined; its magnitude is ignored.
      * The invalidation is done for @net, whereas rt_do_flush() is called
      * without any namespace argument.
 897  */
 898 void rt_cache_flush(struct net *net, int delay)
 899 {
 900         rt_cache_invalidate(net);
              /* rt_do_flush() gets 1 when in_softirq() is false, 0 otherwise. */
 901         if (delay >= 0)
 902                 rt_do_flush(!in_softirq());
 903 }
 904
 905 /*
      * Flush previously invalidated entries from the cache.
      * Unlike rt_cache_flush(), nothing is invalidated here; the function
      * only calls rt_do_flush(), passing 1 when in_softirq() is false.
      */
 906 void rt_cache_flush_batch(void)
 907 {
 908         rt_do_flush(!in_softirq());
 909 }
 910
 911 /*
 912  * We change rt_genid and let gc do the cleanup
      *
      * @__net carries a struct net pointer cast to unsigned long.
      * NOTE(review): the signature looks like a timer callback; confirm that
      * it is installed as the handler of net->ipv4.rt_secret_timer.
      *
      * After the invalidation the timer is re-armed to fire
      * ip_rt_secret_interval jiffies from now.  The interval is not checked
      * for zero here (rt_secret_rebuild_oneshot() does check it).
 913  */
 914 static void rt_secret_rebuild(unsigned long __net)
 915 {
 916         struct net *net = (struct net *)__net;
 917         rt_cache_invalidate(net);
 918         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 919 }
 920
     /*
      * Invalidate the cache of @net immediately instead of waiting for the
      * secret timer to fire: the timer is cancelled with del_timer_sync(),
      * rt_genid is perturbed, and the timer is re-armed only when
      * ip_rt_secret_interval is non-zero.
      */
 921 static void rt_secret_rebuild_oneshot(struct net *net)
 922 {
 923         del_timer_sync(&net->ipv4.rt_secret_timer);
 924         rt_cache_invalidate(net);
 925         if (ip_rt_secret_interval)
 926                 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 927 }
 928
     /*
      * Emergency reaction to an over-long hash chain (rt_intern_hash() calls
      * this when a chain exceeds rt_chain_length_max): print a rate-limited
      * warning, then invalidate the cache of @net right away through
      * rt_secret_rebuild_oneshot().
      */
 929 static void rt_emergency_hash_rebuild(struct net *net)
 930 {
 931         if (net_ratelimit()) {
 932                 printk(KERN_WARNING "Route hash chain too long!\n");
 933                 printk(KERN_WARNING "Adjust your secret_interval!\n");
 934         }
 935
              /* The invalidation itself is not rate limited, only the message. */
 936         rt_secret_rebuild_oneshot(net);
 937 }
 938
 939 /*
 940    Short description of GC goals.
 941
 942    We want to build algorithm, which will keep routing cache
 943    at some equilibrium point, when number of aged off entries
 944    is kept approximately equal to newly generated ones.
 945
 946    Current expiration strength is variable "expire".
 947    We try to adjust it dynamically, so that if networking
 948    is idle expires is large enough to keep enough of warm entries,
 949    and when load increases it reduces to limit cache size.
 950  */
 951
 952 static int rt_garbage_collect(struct dst_ops *ops)
 953 {
 954         static unsigned long expire = RT_GC_TIMEOUT;
 955         static unsigned long last_gc;
 956         static int rover;
 957         static int equilibrium;
 958         struct rtable *rth, **rthp;
 959         unsigned long now = jiffies;
 960         int goal;
 961
 962         /*
 963          * Garbage collection is pretty expensive,
 964          * do not make it too frequently.
 965          */
 966
 967         RT_CACHE_STAT_INC(gc_total);
 968
 969         if (now - last_gc < ip_rt_gc_min_interval &&
 970             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 971                 RT_CACHE_STAT_INC(gc_ignored);
 972                 goto out;
 973         }
 974
 975         /* Calculate number of entries, which we want to expire now. */
 976         goal = atomic_read(&ipv4_dst_ops.entries) -
 977                 (ip_rt_gc_elasticity << rt_hash_log);
 978         if (goal <= 0) {
 979                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 980                         equilibrium = ipv4_dst_ops.gc_thresh;
 981                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 982                 if (goal > 0) {
 983                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 984                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 985                 }
 986         } else {
 987                 /* We are in dangerous area. Try to reduce cache really
 988                  * aggressively.
 989                  */
 990                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 991                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 992         }
 993
 994         if (now - last_gc >= ip_rt_gc_min_interval)
 995                 last_gc = now;
 996
 997         if (goal <= 0) {
 998                 equilibrium += goal;
 999                 goto work_done;
1000         }
1001
1002         do {
1003                 int i, k;
1004
1005                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1006                         unsigned long tmo = expire;
1007
1008                         k = (k + 1) & rt_hash_mask;
1009                         rthp = &rt_hash_table[k].chain;
1010                         spin_lock_bh(rt_hash_lock_addr(k));
1011                         while ((rth = *rthp) != NULL) {
1012                                 if (!rt_is_expired(rth) &&
1013                                         !rt_may_expire(rth, tmo, expire)) {
1014                                         tmo >>= 1;
1015                                         rthp = &rth->u.dst.rt_next;
1016                                         continue;
1017                                 }
1018                                 *rthp = rth->u.dst.rt_next;
1019                                 rt_free(rth);
1020                                 goal--;
1021                         }
1022                         spin_unlock_bh(rt_hash_lock_addr(k));
1023                         if (goal <= 0)
1024                                 break;
1025                 }
1026                 rover = k;
1027
1028                 if (goal <= 0)
1029                         goto work_done;
1030
1031                 /* Goal is not achieved. We stop process if:
1032
1033                    - if expire reduced to zero. Otherwise, expire is halfed.
1034                    - if table is not full.
1035                    - if we are called from interrupt.
1036                    - jiffies check is just fallback/debug loop breaker.
1037                      We will not spin here for long time in any case.
1038                  */
1039
1040                 RT_CACHE_STAT_INC(gc_goal_miss);
1041
1042                 if (expire == 0)
1043                         break;
1044
1045                 expire >>= 1;
1046 #if RT_CACHE_DEBUG >= 2
1047                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1048                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1049 #endif
1050
1051                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052                         goto out;
1053         } while (!in_softirq() && time_before_eq(jiffies, now));
1054
1055         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1056                 goto out;
1057         if (net_ratelimit())
1058                 printk(KERN_WARNING "dst cache overflow\n");
1059         RT_CACHE_STAT_INC(gc_dst_overflow);
1060         return 1;
1061
1062 work_done:
1063         expire += ip_rt_gc_min_interval;
1064         if (expire > ip_rt_gc_timeout ||
1065             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1066                 expire = ip_rt_gc_timeout;
1067 #if RT_CACHE_DEBUG >= 2
1068         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1069                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1070 #endif
1071 out:    return 0;
1072 }
1073
     /*
      * rt_intern_hash - put a freshly built route into the cache, or reuse an
      * equivalent entry that is already cached.
      *
      * @hash: index of the bucket of rt_hash_table to work on
      * @rt:   the new route; released with rt_drop() on every path that does
      *        not hand it out
      * @rp:   if non-NULL, receives the resulting route
      * @skb:  used only when @rp is NULL: the resulting route is attached to
      *        it with skb_dst_set()
      *
      * When caching is disabled for the namespace of @rt, the route is not
      * hashed at all and is handed straight to the caller.  Otherwise the
      * chain is walked under the bucket lock: expired entries are freed, an
      * entry with the same keys and namespace is moved to the front and
      * returned instead of @rt, and an unreferenced entry may be evicted
      * when the chain is longer than ip_rt_gc_elasticity.
      *
      * Returns 0 on success, otherwise the error from arp_bind_neighbour().
      */
1074 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1075                           struct rtable **rp, struct sk_buff *skb)
1076 {
1077         struct rtable   *rth, **rthp;
1078         unsigned long   now;
1079         struct rtable *cand, **candp;
1080         u32             min_score;
1081         int             chain_length;
              /* One GC-and-retry is allowed, and only outside softirq. */
1082         int attempts = !in_softirq();
1083
1084 restart:
1085         chain_length = 0;
1086         min_score = ~(u32)0;
1087         cand = NULL;
1088         candp = NULL;
1089         now = jiffies;
1090
1091         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1092                 /*
1093                  * If we're not caching, just tell the caller we
1094                  * were successful and don't touch the route.  The
1095                  * caller hold the sole reference to the cache entry, and
1096                  * it will be released when the caller is done with it.
1097                  * If we drop it here, the callers have no way to resolve routes
1098                  * when we're not caching.  Instead, just point *rp at rt, so
1099                  * the caller gets a single use out of the route
1100                  * Note that we do rt_free on this new route entry, so that
1101                  * once its refcount hits zero, we are still able to reap it
1102                  * (Thanks Alexey)
1103                  * Note also the rt_free uses call_rcu.  We don't actually
1104                  * need rcu protection here, this is just our path to get
1105                  * on the route gc list.
1106                  */
1107
1108                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1109                         int err = arp_bind_neighbour(&rt->u.dst);
1110                         if (err) {
1111                                 if (net_ratelimit())
1112                                         printk(KERN_WARNING
1113                                             "Neighbour table failure & not caching routes.\n");
1114                                 rt_drop(rt);
1115                                 return err;
1116                         }
1117                 }
1118
1119                 rt_free(rt);
1120                 goto skip_hashing;
1121         }
1122
1123         rthp = &rt_hash_table[hash].chain;
1124
1125         spin_lock_bh(rt_hash_lock_addr(hash));
1126         while ((rth = *rthp) != NULL) {
1127                 if (rt_is_expired(rth)) {
1128                         *rthp = rth->u.dst.rt_next;
1129                         rt_free(rth);
1130                         continue;
1131                 }
1132                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1133                         /* Put it first */
1134                         *rthp = rth->u.dst.rt_next;
1135                         /*
1136                          * Since lookup is lockfree, the deletion
1137                          * must be visible to another weakly ordered CPU before
1138                          * the insertion at the start of the hash chain.
1139                          */
1140                         rcu_assign_pointer(rth->u.dst.rt_next,
1141                                            rt_hash_table[hash].chain);
1142                         /*
1143                          * Since lookup is lockfree, the update writes
1144                          * must be ordered for consistency on SMP.
1145                          */
1146                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1147
1148                         dst_use(&rth->u.dst, now);
1149                         spin_unlock_bh(rt_hash_lock_addr(hash));
1150
                              /* The cached entry wins; the new route is discarded. */
1151                         rt_drop(rt);
1152                         if (rp)
1153                                 *rp = rth;
1154                         else
1155                                 skb_dst_set(skb, &rth->u.dst);
1156                         return 0;
1157                 }
1158
                      /*
                       * Only unreferenced entries are eviction candidates.  Keep
                       * the one with the lowest rt_score(); because of "<=" a
                       * later entry replaces an earlier one with the same score.
                       */
1159                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1160                         u32 score = rt_score(rth);
1161
1162                         if (score <= min_score) {
1163                                 cand = rth;
1164                                 candp = rthp;
1165                                 min_score = score;
1166                         }
1167                 }
1168
1169                 chain_length++;
1170
1171                 rthp = &rth->u.dst.rt_next;
1172         }
1173
1174         if (cand) {
1175                 /* ip_rt_gc_elasticity used to be average length of chain
1176                  * length, when exceeded gc becomes really aggressive.
1177                  *
1178                  * The second limit is less certain. At the moment it allows
1179                  * only 2 entries per bucket. We will see.
1180                  */
1181                 if (chain_length > ip_rt_gc_elasticity) {
1182                         *candp = cand->u.dst.rt_next;
1183                         rt_free(cand);
1184                 }
1185         } else {
                      /*
                       * No evictable entry.  A chain longer than
                       * rt_chain_length_max triggers an emergency rebuild.
                       *
                       * NOTE(review): rt_emergency_hash_rebuild() ends up in
                       * del_timer_sync() and rt_cache_invalidate() (which changes
                       * rt_genid) while the bucket lock is still held, and rt is
                       * nevertheless inserted below into the bucket selected by
                       * the caller-supplied hash.  Confirm that both are intended.
                       */
1186                 if (chain_length > rt_chain_length_max) {
1187                         struct net *net = dev_net(rt->u.dst.dev);
1188                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1189                         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1190                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1191                                         rt->u.dst.dev->name, num);
1192                         }
1193                         rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1194                 }
1195         }
1196
1197         /* Try to bind route to arp only if it is output
1198            route or unicast forwarding path.
1199          */
1200         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1201                 int err = arp_bind_neighbour(&rt->u.dst);
1202                 if (err) {
1203                         spin_unlock_bh(rt_hash_lock_addr(hash));
1204
1205                         if (err != -ENOBUFS) {
1206                                 rt_drop(rt);
1207                                 return err;
1208                         }
1209
1210                         /* Neighbour tables are full and nothing
1211                            can be released. Try to shrink route cache,
1212                            it is most likely it holds some neighbour records.
1213                          */
1214                         if (attempts-- > 0) {
                                      /*
                                       * Force elasticity 1 and a zero minimum
                                       * interval for this one GC call, then put
                                       * the previous values back.
                                       * NOTE(review): these are the global
                                       * tunables and no lock is held at this
                                       * point (the bucket lock was dropped above).
                                       */
1215                                 int saved_elasticity = ip_rt_gc_elasticity;
1216                                 int saved_int = ip_rt_gc_min_interval;
1217                                 ip_rt_gc_elasticity     = 1;
1218                                 ip_rt_gc_min_interval   = 0;
1219                                 rt_garbage_collect(&ipv4_dst_ops);
1220                                 ip_rt_gc_min_interval   = saved_int;
1221                                 ip_rt_gc_elasticity     = saved_elasticity;
1222                                 goto restart;
1223                         }
1224
1225                         if (net_ratelimit())
1226                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1227                         rt_drop(rt);
1228                         return -ENOBUFS;
1229                 }
1230         }
1231
              /* Link rt in front of the current chain head. */
1232         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1233
1234 #if RT_CACHE_DEBUG >= 2
1235         if (rt->u.dst.rt_next) {
1236                 struct rtable *trt;
1237                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1238                        hash, &rt->rt_dst);
1239                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1240                         printk(" . %pI4", &trt->rt_dst);
1241                 printk("\n");
1242         }
1243 #endif
1244         /*
1245          * Since lookup is lockfree, we must make sure
1246          * previous writes to rt are committed to memory
1247          * before making rt visible to other CPUS.
1248          */
1249         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1250
1251         spin_unlock_bh(rt_hash_lock_addr(hash));
1252
1253 skip_hashing:
              /* Hand the route out: through *rp if given, else via the skb. */
1254         if (rp)
1255                 *rp = rt;
1256         else
1257                 skb_dst_set(skb, &rt->u.dst);
1258         return 0;
1259 }
1260
1261 void rt_bind_peer(struct rtable *rt, int create)
1262 {
1263         static DEFINE_SPINLOCK(rt_peer_lock);
1264         struct inet_peer *peer;
1265
1266         peer = inet_getpeer(rt->rt_dst, create);
1267
1268         spin_lock_bh(&rt_peer_lock);
1269         if (rt->peer == NULL) {
1270                 rt->peer = peer;
1271                 peer = NULL;
1272         }
1273         spin_unlock_bh(&rt_peer_lock);
1274         if (peer)
1275                 inet_putpeer(peer);
1276 }
1277
1278 /*
1279  * Peer allocation may fail only in serious out-of-memory conditions.  However
1280  * we still can generate some output.
1281  * Random ID selection looks a bit dangerous because we have no chances to
1282  * select ID being unique in a reasonable period of time.
1283  * But broken packet identifier may be better than no packet at all.
1284  */
1285 static void ip_select_fb_ident(struct iphdr *iph)
1286 {
1287         static DEFINE_SPINLOCK(ip_fb_id_lock);
1288         static u32 ip_fallback_id;
1289         u32 salt;
1290
1291         spin_lock_bh(&ip_fb_id_lock);
1292         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1293         iph->id = htons(salt & 0xFFFF);
1294         ip_fallback_id = salt;
1295         spin_unlock_bh(&ip_fb_id_lock);
1296 }
1297
1298 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1299 {
1300         struct rtable *rt = (struct rtable *) dst;
1301
1302         if (rt) {
1303                 if (rt->peer == NULL)
1304                         rt_bind_peer(rt, 1);
1305
1306                 /* If peer is attached to destination, it is never detached,
1307                    so that we need not to grab a lock to dereference it.
1308                  */
1309                 if (rt->peer) {
1310                         iph->id = htons(inet_getid(rt->peer, more));
1311                         return;
1312                 }
1313         } else
1314                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1315                        __builtin_return_address(0));
1316
1317         ip_select_fb_ident(iph);
1318 }
1319
1320 static void rt_del(unsigned hash, struct rtable *rt)
1321 {
1322         struct rtable **rthp, *aux;
1323
1324         rthp = &rt_hash_table[hash].chain;
1325         spin_lock_bh(rt_hash_lock_addr(hash));
1326         ip_rt_put(rt);
1327         while ((aux = *rthp) != NULL) {
1328                 if (aux == rt || rt_is_expired(aux)) {
1329                         *rthp = aux->u.dst.rt_next;
1330                         rt_free(aux);
1331                         continue;
1332                 }
1333                 rthp = &aux->u.dst.rt_next;
1334         }
1335         spin_unlock_bh(rt_hash_lock_addr(hash));
1336 }
1337
     /*
      * ip_rt_redirect - act on a redirect that advises reaching @daddr through
      * @new_gw instead of @old_gw.
      *
      * @saddr: source address; used as a lookup key (together with 0) and
      *         compared against rt_src of the cached route
      * @dev:   device used for the in_device lookup, as an oif lookup key
      *         (together with 0), and to match the device of the cached route
      *
      * The redirect is rejected when the gateways are equal, when redirects
      * are not accepted on the device, when @new_gw is a multicast, limited
      * broadcast or zeronet address, when route caching is off, or when the
      * on-link / address type checks below fail.
      *
      * Otherwise each of the four key combinations is looked up in the cache.
      * A matching output route is cloned with the new gateway and the
      * RTCF_REDIRECTED flag; the old entry is deleted and the clone interned.
      */
1338 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1339                     __be32 saddr, struct net_device *dev)
1340 {
1341         int i, k;
1342         struct in_device *in_dev = in_dev_get(dev);
1343         struct rtable *rth, **rthp;
1344         __be32  skeys[2] = { saddr, 0 };
1345         int  ikeys[2] = { dev->ifindex, 0 };
1346         struct netevent_redirect netevent;
1347         struct net *net;
1348
1349         if (!in_dev)
1350                 return;
1351
1352         net = dev_net(dev);
1353         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1354             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1355             ipv4_is_zeronet(new_gw))
1356                 goto reject_redirect;
1357
1358         if (!rt_caching(net))
1359                 goto reject_redirect;
1360
1361         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1362                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1363                         goto reject_redirect;
1364                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1365                         goto reject_redirect;
1366         } else {
1367                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1368                         goto reject_redirect;
1369         }
1370
              /* Try every combination of {saddr, 0} and {dev->ifindex, 0}. */
1371         for (i = 0; i < 2; i++) {
1372                 for (k = 0; k < 2; k++) {
1373                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1374                                                 rt_genid(net));
1375
1376                         rthp=&rt_hash_table[hash].chain;
1377
1378                         rcu_read_lock();
1379                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1380                                 struct rtable *rt;
1381
                                      /* Skip entries whose keys do not match. */
1382                                 if (rth->fl.fl4_dst != daddr ||
1383                                     rth->fl.fl4_src != skeys[i] ||
1384                                     rth->fl.oif != ikeys[k] ||
1385                                     rth->fl.iif != 0 ||
1386                                     rt_is_expired(rth) ||
1387                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1388                                         rthp = &rth->u.dst.rt_next;
1389                                         continue;
1390                                 }
1391
                                      /*
                                       * Keys match, but the route does not
                                       * describe the path being redirected:
                                       * give up on this chain.
                                       */
1392                                 if (rth->rt_dst != daddr ||
1393                                     rth->rt_src != saddr ||
1394                                     rth->u.dst.error ||
1395                                     rth->rt_gateway != old_gw ||
1396                                     rth->u.dst.dev != dev)
1397                                         break;
1398
                                      /* Pin rth before leaving the RCU section. */
1399                                 dst_hold(&rth->u.dst);
1400                                 rcu_read_unlock();
1401
1402                                 rt = dst_alloc(&ipv4_dst_ops);
1403                                 if (rt == NULL) {
1404                                         ip_rt_put(rth);
1405                                         in_dev_put(in_dev);
1406                                         return;
1407                                 }
1408
1409                                 /* Copy all the information. */
1410                                 *rt = *rth;
                                      /*
                                       * The structure copy duplicated pointers
                                       * and counters; reset the per-entry state
                                       * and take references of our own.
                                       */
1411                                 rt->u.dst.__use         = 1;
1412                                 atomic_set(&rt->u.dst.__refcnt, 1);
1413                                 rt->u.dst.child         = NULL;
1414                                 if (rt->u.dst.dev)
1415                                         dev_hold(rt->u.dst.dev);
1416                                 if (rt->idev)
1417                                         in_dev_hold(rt->idev);
1418                                 rt->u.dst.obsolete      = -1;
1419                                 rt->u.dst.lastuse       = jiffies;
1420                                 rt->u.dst.path          = &rt->u.dst;
1421                                 rt->u.dst.neighbour     = NULL;
1422                                 rt->u.dst.hh            = NULL;
1423 #ifdef CONFIG_XFRM
1424                                 rt->u.dst.xfrm          = NULL;
1425 #endif
1426                                 rt->rt_genid            = rt_genid(net);
1427                                 rt->rt_flags            |= RTCF_REDIRECTED;
1428
1429                                 /* Gateway is different ... */
1430                                 rt->rt_gateway          = new_gw;
1431
1432                                 /* Redirect received -> path was valid */
1433                                 dst_confirm(&rth->u.dst);
1434
1435                                 if (rt->peer)
1436                                         atomic_inc(&rt->peer->refcnt);
1437
                                      /*
                                       * The new gateway must resolve to a
                                       * neighbour in a NUD_VALID state; if not,
                                       * kick the neighbour (when there is one)
                                       * and drop the clone.
                                       */
1438                                 if (arp_bind_neighbour(&rt->u.dst) ||
1439                                     !(rt->u.dst.neighbour->nud_state &
1440                                             NUD_VALID)) {
1441                                         if (rt->u.dst.neighbour)
1442                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1443                                         ip_rt_put(rth);
1444                                         rt_drop(rt);
1445                                         goto do_next;
1446                                 }
1447
1448                                 netevent.old = &rth->u.dst;
1449                                 netevent.new = &rt->u.dst;
1450                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1451                                                         &netevent);
1452
                                      /*
                                       * rt_del() calls ip_rt_put(rth), which
                                       * pairs with the dst_hold() above.  On
                                       * success rt_intern_hash() stores the
                                       * resulting entry back into rt, which is
                                       * then released.
                                       */
1453                                 rt_del(hash, rth);
1454                                 if (!rt_intern_hash(hash, rt, &rt, NULL))
1455                                         ip_rt_put(rt);
1456                                 goto do_next;
1457                         }
1458                         rcu_read_unlock();
1459                 do_next:
1460                         ;
1461                 }
1462         }
1463         in_dev_put(in_dev);
1464         return;
1465
1466 reject_redirect:
1467 #ifdef CONFIG_IP_ROUTE_VERBOSE
1468         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1469                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1470                         "  Advised path = %pI4 -> %pI4\n",
1471                        &old_gw, dev->name, &new_gw,
1472                        &saddr, &daddr);
1473 #endif
1474         in_dev_put(in_dev);
1475 }
1476
1477 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1478 {
1479         struct rtable *rt = (struct rtable *)dst;
1480         struct dst_entry *ret = dst;
1481
1482         if (rt) {
1483                 if (dst->obsolete > 0) {
1484                         ip_rt_put(rt);
1485                         ret = NULL;
1486                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1487                            (rt->u.dst.expires &&
1488                             time_after_eq(jiffies, rt->u.dst.expires))) {
1489                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1490                                                 rt->fl.oif,
1491                                                 rt_genid(dev_net(dst->dev)));
1492 #if RT_CACHE_DEBUG >= 1
1493                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1494                                 &rt->rt_dst, rt->fl.fl4_tos);
1495 #endif
1496                         rt_del(hash, rt);
1497                         ret = NULL;
1498                 }
1499         }
1500         return ret;
1501 }
1502
1503 /*
1504  * Algorithm:
1505  *      1. The first ip_rt_redirect_number redirects are sent
1506  *         with exponential backoff, then we stop sending them at all,
1507  *         assuming that the host ignores our redirects.
1508  *      2. If we did not see packets requiring redirects
1509  *         during ip_rt_redirect_silence, we assume that the host
1510  *         forgot redirected route and start to send redirects again.
1511  *
1512  * This algorithm is much cheaper and more intelligent than dumb load limiting
1513  * in icmp.c.
1514  *
1515  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1516  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1517  */
1518
1519 void ip_rt_send_redirect(struct sk_buff *skb)
1520 {
1521         struct rtable *rt = skb_rtable(skb);
1522         struct in_device *in_dev;
1523         int log_martians;
1524
1525         rcu_read_lock();
1526         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1527         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1528                 rcu_read_unlock();
1529                 return;
1530         }
1531         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1532         rcu_read_unlock();
1533
1534         /* No redirected packets during ip_rt_redirect_silence;
1535          * reset the algorithm.
1536          */
1537         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1538                 rt->u.dst.rate_tokens = 0;
1539
1540         /* Too many ignored redirects; do not send anything
1541          * set u.dst.rate_last to the last seen redirected packet.
1542          */
1543         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1544                 rt->u.dst.rate_last = jiffies;
1545                 return;
1546         }
1547
1548         /* Check for load limit; set rate_last to the latest sent
1549          * redirect.
1550          */
1551         if (rt->u.dst.rate_tokens == 0 ||
1552             time_after(jiffies,
1553                        (rt->u.dst.rate_last +
1554                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1555                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1556                 rt->u.dst.rate_last = jiffies;
1557                 ++rt->u.dst.rate_tokens;
1558 #ifdef CONFIG_IP_ROUTE_VERBOSE
1559                 if (log_martians &&
1560                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1561                     net_ratelimit())
1562                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1563                                 &rt->rt_src, rt->rt_iif,
1564                                 &rt->rt_dst, &rt->rt_gateway);
1565 #endif
1566         }
1567 }
1568
1569 static int ip_error(struct sk_buff *skb)
1570 {
1571         struct rtable *rt = skb_rtable(skb);
1572         unsigned long now;
1573         int code;
1574
1575         switch (rt->u.dst.error) {
1576                 case EINVAL:
1577                 default:
1578                         goto out;
1579                 case EHOSTUNREACH:
1580                         code = ICMP_HOST_UNREACH;
1581                         break;
1582                 case ENETUNREACH:
1583                         code = ICMP_NET_UNREACH;
1584                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1585                                         IPSTATS_MIB_INNOROUTES);
1586                         break;
1587                 case EACCES:
1588                         code = ICMP_PKT_FILTERED;
1589                         break;
1590         }
1591
1592         now = jiffies;
1593         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1594         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1595                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1596         rt->u.dst.rate_last = now;
1597         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1598                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1599                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1600         }
1601
1602 out:    kfree_skb(skb);
1603         return 0;
1604 }
1605
1606 /*
1607  *      The last two values are not from the RFC but
1608  *      are needed for AMPRnet AX.25 paths.
1609  */
1610
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Pick the next MTU to try below old_mtu.
 *
 * The table is sorted in descending order, so the first entry that is
 * strictly smaller than old_mtu is the largest plateau below it.  When
 * old_mtu is already at or below the smallest plateau, 68 is returned.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int idx = 0;

	while (idx < ARRAY_SIZE(mtu_plateau)) {
		if (mtu_plateau[idx] < old_mtu)
			return mtu_plateau[idx];
		idx++;
	}
	return 68;
}
1623
1624 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1625                                  unsigned short new_mtu,
1626                                  struct net_device *dev)
1627 {
1628         int i, k;
1629         unsigned short old_mtu = ntohs(iph->tot_len);
1630         struct rtable *rth;
1631         int  ikeys[2] = { dev->ifindex, 0 };
1632         __be32  skeys[2] = { iph->saddr, 0, };
1633         __be32  daddr = iph->daddr;
1634         unsigned short est_mtu = 0;
1635
1636         for (k = 0; k < 2; k++) {
1637                 for (i = 0; i < 2; i++) {
1638                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1639                                                 rt_genid(net));
1640
1641                         rcu_read_lock();
1642                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1643                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1644                                 unsigned short mtu = new_mtu;
1645
1646                                 if (rth->fl.fl4_dst != daddr ||
1647                                     rth->fl.fl4_src != skeys[i] ||
1648                                     rth->rt_dst != daddr ||
1649                                     rth->rt_src != iph->saddr ||
1650                                     rth->fl.oif != ikeys[k] ||
1651                                     rth->fl.iif != 0 ||
1652                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1653                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1654                                     rt_is_expired(rth))
1655                                         continue;
1656
1657                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1658
1659                                         /* BSD 4.2 compatibility hack :-( */
1660                                         if (mtu == 0 &&
1661                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1662                                             old_mtu >= 68 + (iph->ihl << 2))
1663                                                 old_mtu -= iph->ihl << 2;
1664
1665                                         mtu = guess_mtu(old_mtu);
1666                                 }
1667                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1668                                         if (mtu < dst_mtu(&rth->u.dst)) {
1669                                                 dst_confirm(&rth->u.dst);
1670                                                 if (mtu < ip_rt_min_pmtu) {
1671                                                         mtu = ip_rt_min_pmtu;
1672                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1673                                                                 (1 << RTAX_MTU);
1674                                                 }
1675                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1676                                                 dst_set_expires(&rth->u.dst,
1677                                                         ip_rt_mtu_expires);
1678                                         }
1679                                         est_mtu = mtu;
1680                                 }
1681                         }
1682                         rcu_read_unlock();
1683                 }
1684         }
1685         return est_mtu ? : new_mtu;
1686 }
1687
1688 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1689 {
1690         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1691             !(dst_metric_locked(dst, RTAX_MTU))) {
1692                 if (mtu < ip_rt_min_pmtu) {
1693                         mtu = ip_rt_min_pmtu;
1694                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1695                 }
1696                 dst->metrics[RTAX_MTU-1] = mtu;
1697                 dst_set_expires(dst, ip_rt_mtu_expires);
1698                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1699         }
1700 }
1701
1702 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1703 {
1704         if (rt_is_expired((struct rtable *)dst))
1705                 return NULL;
1706         return dst;
1707 }
1708
1709 static void ipv4_dst_destroy(struct dst_entry *dst)
1710 {
1711         struct rtable *rt = (struct rtable *) dst;
1712         struct inet_peer *peer = rt->peer;
1713         struct in_device *idev = rt->idev;
1714
1715         if (peer) {
1716                 rt->peer = NULL;
1717                 inet_putpeer(peer);
1718         }
1719
1720         if (idev) {
1721                 rt->idev = NULL;
1722                 in_dev_put(idev);
1723         }
1724 }
1725
1726 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1727                             int how)
1728 {
1729         struct rtable *rt = (struct rtable *) dst;
1730         struct in_device *idev = rt->idev;
1731         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1732                 struct in_device *loopback_idev =
1733                         in_dev_get(dev_net(dev)->loopback_dev);
1734                 if (loopback_idev) {
1735                         rt->idev = loopback_idev;
1736                         in_dev_put(idev);
1737                 }
1738         }
1739 }
1740
1741 static void ipv4_link_failure(struct sk_buff *skb)
1742 {
1743         struct rtable *rt;
1744
1745         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1746
1747         rt = skb_rtable(skb);
1748         if (rt)
1749                 dst_set_expires(&rt->u.dst, 0);
1750 }
1751
1752 static int ip_rt_bug(struct sk_buff *skb)
1753 {
1754         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1755                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1756                 skb->dev ? skb->dev->name : "?");
1757         kfree_skb(skb);
1758         return 0;
1759 }
1760
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1769
1770 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1771 {
1772         __be32 src;
1773         struct fib_result res;
1774
1775         if (rt->fl.iif == 0)
1776                 src = rt->rt_src;
1777         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1778                 src = FIB_RES_PREFSRC(res);
1779                 fib_res_put(&res);
1780         } else
1781                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1782                                         RT_SCOPE_UNIVERSE);
1783         memcpy(addr, &src, 4);
1784 }
1785
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a half
 * of tclassid that is already non-zero is kept, a half that is still zero
 * takes the corresponding half of @tag.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;
	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1795
1796 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1797 {
1798         struct fib_info *fi = res->fi;
1799
1800         if (fi) {
1801                 if (FIB_RES_GW(*res) &&
1802                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1803                         rt->rt_gateway = FIB_RES_GW(*res);
1804                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1805                        sizeof(rt->u.dst.metrics));
1806                 if (fi->fib_mtu == 0) {
1807                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1808                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1809                             rt->rt_gateway != rt->rt_dst &&
1810                             rt->u.dst.dev->mtu > 576)
1811                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1812                 }
1813 #ifdef CONFIG_NET_CLS_ROUTE
1814                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1815 #endif
1816         } else
1817                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1818
1819         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1820                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1821         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1822                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1823         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1824                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1825                                        ip_rt_min_advmss);
1826         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1827                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1828
1829 #ifdef CONFIG_NET_CLS_ROUTE
1830 #ifdef CONFIG_IP_MULTIPLE_TABLES
1831         set_class_tag(rt, fib_rules_tclass(res));
1832 #endif
1833         set_class_tag(rt, itag);
1834 #endif
1835         rt->rt_type = res->type;
1836 }
1837
1838 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1839                                 u8 tos, struct net_device *dev, int our)
1840 {
1841         unsigned hash;
1842         struct rtable *rth;
1843         __be32 spec_dst;
1844         struct in_device *in_dev = in_dev_get(dev);
1845         u32 itag = 0;
1846
1847         /* Primary sanity checks. */
1848
1849         if (in_dev == NULL)
1850                 return -EINVAL;
1851
1852         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1853             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1854                 goto e_inval;
1855
1856         if (ipv4_is_zeronet(saddr)) {
1857                 if (!ipv4_is_local_multicast(daddr))
1858                         goto e_inval;
1859                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1860         } else if (fib_validate_source(saddr, 0, tos, 0,
1861                                         dev, &spec_dst, &itag, 0) < 0)
1862                 goto e_inval;
1863
1864         rth = dst_alloc(&ipv4_dst_ops);
1865         if (!rth)
1866                 goto e_nobufs;
1867
1868         rth->u.dst.output = ip_rt_bug;
1869         rth->u.dst.obsolete = -1;
1870
1871         atomic_set(&rth->u.dst.__refcnt, 1);
1872         rth->u.dst.flags= DST_HOST;
1873         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1874                 rth->u.dst.flags |= DST_NOPOLICY;
1875         rth->fl.fl4_dst = daddr;
1876         rth->rt_dst     = daddr;
1877         rth->fl.fl4_tos = tos;
1878         rth->fl.mark    = skb->mark;
1879         rth->fl.fl4_src = saddr;
1880         rth->rt_src     = saddr;
1881 #ifdef CONFIG_NET_CLS_ROUTE
1882         rth->u.dst.tclassid = itag;
1883 #endif
1884         rth->rt_iif     =
1885         rth->fl.iif     = dev->ifindex;
1886         rth->u.dst.dev  = init_net.loopback_dev;
1887         dev_hold(rth->u.dst.dev);
1888         rth->idev       = in_dev_get(rth->u.dst.dev);
1889         rth->fl.oif     = 0;
1890         rth->rt_gateway = daddr;
1891         rth->rt_spec_dst= spec_dst;
1892         rth->rt_genid   = rt_genid(dev_net(dev));
1893         rth->rt_flags   = RTCF_MULTICAST;
1894         rth->rt_type    = RTN_MULTICAST;
1895         if (our) {
1896                 rth->u.dst.input= ip_local_deliver;
1897                 rth->rt_flags |= RTCF_LOCAL;
1898         }
1899
1900 #ifdef CONFIG_IP_MROUTE
1901         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1902                 rth->u.dst.input = ip_mr_input;
1903 #endif
1904         RT_CACHE_STAT_INC(in_slow_mc);
1905
1906         in_dev_put(in_dev);
1907         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1908         return rt_intern_hash(hash, rth, NULL, skb);
1909
1910 e_nobufs:
1911         in_dev_put(in_dev);
1912         return -ENOBUFS;
1913
1914 e_inval:
1915         in_dev_put(in_dev);
1916         return -EINVAL;
1917 }
1918
1919
1920 static void ip_handle_martian_source(struct net_device *dev,
1921                                      struct in_device *in_dev,
1922                                      struct sk_buff *skb,
1923                                      __be32 daddr,
1924                                      __be32 saddr)
1925 {
1926         RT_CACHE_STAT_INC(in_martian_src);
1927 #ifdef CONFIG_IP_ROUTE_VERBOSE
1928         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1929                 /*
1930                  *      RFC1812 recommendation, if source is martian,
1931                  *      the only hint is MAC header.
1932                  */
1933                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1934                         &daddr, &saddr, dev->name);
1935                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1936                         int i;
1937                         const unsigned char *p = skb_mac_header(skb);
1938                         printk(KERN_WARNING "ll header: ");
1939                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1940                                 printk("%02x", *p);
1941                                 if (i < (dev->hard_header_len - 1))
1942                                         printk(":");
1943                         }
1944                         printk("\n");
1945                 }
1946         }
1947 #endif
1948 }
1949
1950 static int __mkroute_input(struct sk_buff *skb,
1951                            struct fib_result *res,
1952                            struct in_device *in_dev,
1953                            __be32 daddr, __be32 saddr, u32 tos,
1954                            struct rtable **result)
1955 {
1956
1957         struct rtable *rth;
1958         int err;
1959         struct in_device *out_dev;
1960         unsigned flags = 0;
1961         __be32 spec_dst;
1962         u32 itag;
1963
1964         /* get a working reference to the output device */
1965         out_dev = in_dev_get(FIB_RES_DEV(*res));
1966         if (out_dev == NULL) {
1967                 if (net_ratelimit())
1968                         printk(KERN_CRIT "Bug in ip_route_input" \
1969                                "_slow(). Please, report\n");
1970                 return -EINVAL;
1971         }
1972
1973
1974         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1975                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1976         if (err < 0) {
1977                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1978                                          saddr);
1979
1980                 err = -EINVAL;
1981                 goto cleanup;
1982         }
1983
1984         if (err)
1985                 flags |= RTCF_DIRECTSRC;
1986
1987         if (out_dev == in_dev && err &&
1988             (IN_DEV_SHARED_MEDIA(out_dev) ||
1989              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1990                 flags |= RTCF_DOREDIRECT;
1991
1992         if (skb->protocol != htons(ETH_P_IP)) {
1993                 /* Not IP (i.e. ARP). Do not create route, if it is
1994                  * invalid for proxy arp. DNAT routes are always valid.
1995                  */
1996                 if (out_dev == in_dev) {
1997                         err = -EINVAL;
1998                         goto cleanup;
1999                 }
2000         }
2001
2002
2003         rth = dst_alloc(&ipv4_dst_ops);
2004         if (!rth) {
2005                 err = -ENOBUFS;
2006                 goto cleanup;
2007         }
2008
2009         atomic_set(&rth->u.dst.__refcnt, 1);
2010         rth->u.dst.flags= DST_HOST;
2011         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2012                 rth->u.dst.flags |= DST_NOPOLICY;
2013         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2014                 rth->u.dst.flags |= DST_NOXFRM;
2015         rth->fl.fl4_dst = daddr;
2016         rth->rt_dst     = daddr;
2017         rth->fl.fl4_tos = tos;
2018         rth->fl.mark    = skb->mark;
2019         rth->fl.fl4_src = saddr;
2020         rth->rt_src     = saddr;
2021         rth->rt_gateway = daddr;
2022         rth->rt_iif     =
2023                 rth->fl.iif     = in_dev->dev->ifindex;
2024         rth->u.dst.dev  = (out_dev)->dev;
2025         dev_hold(rth->u.dst.dev);
2026         rth->idev       = in_dev_get(rth->u.dst.dev);
2027         rth->fl.oif     = 0;
2028         rth->rt_spec_dst= spec_dst;
2029
2030         rth->u.dst.obsolete = -1;
2031         rth->u.dst.input = ip_forward;
2032         rth->u.dst.output = ip_output;
2033         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2034
2035         rt_set_nexthop(rth, res, itag);
2036
2037         rth->rt_flags = flags;
2038
2039         *result = rth;
2040         err = 0;
2041  cleanup:
2042         /* release the working reference to the output device */
2043         in_dev_put(out_dev);
2044         return err;
2045 }
2046
2047 static int ip_mkroute_input(struct sk_buff *skb,
2048                             struct fib_result *res,
2049                             const struct flowi *fl,
2050                             struct in_device *in_dev,
2051                             __be32 daddr, __be32 saddr, u32 tos)
2052 {
2053         struct rtable* rth = NULL;
2054         int err;
2055         unsigned hash;
2056
2057 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2058         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2059                 fib_select_multipath(fl, res);
2060 #endif
2061
2062         /* create a routing cache entry */
2063         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2064         if (err)
2065                 return err;
2066
2067         /* put it into the cache */
2068         hash = rt_hash(daddr, saddr, fl->iif,
2069                        rt_genid(dev_net(rth->u.dst.dev)));
2070         return rt_intern_hash(hash, rth, NULL, skb);
2071 }
2072
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
2082
2083 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2084                                u8 tos, struct net_device *dev)
2085 {
2086         struct fib_result res;
2087         struct in_device *in_dev = in_dev_get(dev);
2088         struct flowi fl = { .nl_u = { .ip4_u =
2089                                       { .daddr = daddr,
2090                                         .saddr = saddr,
2091                                         .tos = tos,
2092                                         .scope = RT_SCOPE_UNIVERSE,
2093                                       } },
2094                             .mark = skb->mark,
2095                             .iif = dev->ifindex };
2096         unsigned        flags = 0;
2097         u32             itag = 0;
2098         struct rtable * rth;
2099         unsigned        hash;
2100         __be32          spec_dst;
2101         int             err = -EINVAL;
2102         int             free_res = 0;
2103         struct net    * net = dev_net(dev);
2104
2105         /* IP on this device is disabled. */
2106
2107         if (!in_dev)
2108                 goto out;
2109
2110         /* Check for the most weird martians, which can be not detected
2111            by fib_lookup.
2112          */
2113
2114         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2115             ipv4_is_loopback(saddr))
2116                 goto martian_source;
2117
2118         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2119                 goto brd_input;
2120
2121         /* Accept zero addresses only to limited broadcast;
2122          * I even do not know to fix it or not. Waiting for complains :-)
2123          */
2124         if (ipv4_is_zeronet(saddr))
2125                 goto martian_source;
2126
2127         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2128             ipv4_is_loopback(daddr))
2129                 goto martian_destination;
2130
2131         /*
2132          *      Now we are ready to route packet.
2133          */
2134         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2135                 if (!IN_DEV_FORWARD(in_dev))
2136                         goto e_hostunreach;
2137                 goto no_route;
2138         }
2139         free_res = 1;
2140
2141         RT_CACHE_STAT_INC(in_slow_tot);
2142
2143         if (res.type == RTN_BROADCAST)
2144                 goto brd_input;
2145
2146         if (res.type == RTN_LOCAL) {
2147                 int result;
2148                 result = fib_validate_source(saddr, daddr, tos,
2149                                              net->loopback_dev->ifindex,
2150                                              dev, &spec_dst, &itag, skb->mark);
2151                 if (result < 0)
2152                         goto martian_source;
2153                 if (result)
2154                         flags |= RTCF_DIRECTSRC;
2155                 spec_dst = daddr;
2156                 goto local_input;
2157         }
2158
2159         if (!IN_DEV_FORWARD(in_dev))
2160                 goto e_hostunreach;
2161         if (res.type != RTN_UNICAST)
2162                 goto martian_destination;
2163
2164         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2165 done:
2166         in_dev_put(in_dev);
2167         if (free_res)
2168                 fib_res_put(&res);
2169 out:    return err;
2170
2171 brd_input:
2172         if (skb->protocol != htons(ETH_P_IP))
2173                 goto e_inval;
2174
2175         if (ipv4_is_zeronet(saddr))
2176                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2177         else {
2178                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2179                                           &itag, skb->mark);
2180                 if (err < 0)
2181                         goto martian_source;
2182                 if (err)
2183                         flags |= RTCF_DIRECTSRC;
2184         }
2185         flags |= RTCF_BROADCAST;
2186         res.type = RTN_BROADCAST;
2187         RT_CACHE_STAT_INC(in_brd);
2188
2189 local_input:
2190         rth = dst_alloc(&ipv4_dst_ops);
2191         if (!rth)
2192                 goto e_nobufs;
2193
2194         rth->u.dst.output= ip_rt_bug;
2195         rth->u.dst.obsolete = -1;
2196         rth->rt_genid = rt_genid(net);
2197
2198         atomic_set(&rth->u.dst.__refcnt, 1);
2199         rth->u.dst.flags= DST_HOST;
2200         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2201                 rth->u.dst.flags |= DST_NOPOLICY;
2202         rth->fl.fl4_dst = daddr;
2203         rth->rt_dst     = daddr;
2204         rth->fl.fl4_tos = tos;
2205         rth->fl.mark    = skb->mark;
2206         rth->fl.fl4_src = saddr;
2207         rth->rt_src     = saddr;
2208 #ifdef CONFIG_NET_CLS_ROUTE
2209         rth->u.dst.tclassid = itag;
2210 #endif
2211         rth->rt_iif     =
2212         rth->fl.iif     = dev->ifindex;
2213         rth->u.dst.dev  = net->loopback_dev;
2214         dev_hold(rth->u.dst.dev);
2215         rth->idev       = in_dev_get(rth->u.dst.dev);
2216         rth->rt_gateway = daddr;
2217         rth->rt_spec_dst= spec_dst;
2218         rth->u.dst.input= ip_local_deliver;
2219         rth->rt_flags   = flags|RTCF_LOCAL;
2220         if (res.type == RTN_UNREACHABLE) {
2221                 rth->u.dst.input= ip_error;
2222                 rth->u.dst.error= -err;
2223                 rth->rt_flags   &= ~RTCF_LOCAL;
2224         }
2225         rth->rt_type    = res.type;
2226         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2227         err = rt_intern_hash(hash, rth, NULL, skb);
2228         goto done;
2229
2230 no_route:
2231         RT_CACHE_STAT_INC(in_no_route);
2232         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2233         res.type = RTN_UNREACHABLE;
2234         if (err == -ESRCH)
2235                 err = -ENETUNREACH;
2236         goto local_input;
2237
2238         /*
2239          *      Do not cache martian addresses: they should be logged (RFC1812)
2240          */
2241 martian_destination:
2242         RT_CACHE_STAT_INC(in_martian_dst);
2243 #ifdef CONFIG_IP_ROUTE_VERBOSE
2244         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2245                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2246                         &daddr, &saddr, dev->name);
2247 #endif
2248
2249 e_hostunreach:
2250         err = -EHOSTUNREACH;
2251         goto done;
2252
2253 e_inval:
2254         err = -EINVAL;
2255         goto done;
2256
2257 e_nobufs:
2258         err = -ENOBUFS;
2259         goto done;
2260
2261 martian_source:
2262         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2263         goto e_inval;
2264 }
2265
2266 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2267                    u8 tos, struct net_device *dev)
2268 {
2269         struct rtable * rth;
2270         unsigned        hash;
2271         int iif = dev->ifindex;
2272         struct net *net;
2273
2274         net = dev_net(dev);
2275
2276         if (!rt_caching(net))
2277                 goto skip_cache;
2278
2279         tos &= IPTOS_RT_MASK;
2280         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2281
2282         rcu_read_lock();
2283         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2284              rth = rcu_dereference(rth->u.dst.rt_next)) {
2285                 if (((rth->fl.fl4_dst ^ daddr) |
2286                      (rth->fl.fl4_src ^ saddr) |
2287                      (rth->fl.iif ^ iif) |
2288                      rth->fl.oif |
2289                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2290                     rth->fl.mark == skb->mark &&
2291                     net_eq(dev_net(rth->u.dst.dev), net) &&
2292                     !rt_is_expired(rth)) {
2293                         dst_use(&rth->u.dst, jiffies);
2294                         RT_CACHE_STAT_INC(in_hit);
2295                         rcu_read_unlock();
2296                         skb_dst_set(skb, &rth->u.dst);
2297                         return 0;
2298                 }
2299                 RT_CACHE_STAT_INC(in_hlist_search);
2300         }
2301         rcu_read_unlock();
2302
2303 skip_cache:
2304         /* Multicast recognition logic is moved from route cache to here.
2305            The problem was that too many Ethernet cards have broken/missing
2306            hardware multicast filters :-( As result the host on multicasting
2307            network acquires a lot of useless route cache entries, sort of
2308            SDR messages from all the world. Now we try to get rid of them.
2309            Really, provided software IP multicast filter is organized
2310            reasonably (at least, hashed), it does not result in a slowdown
2311            comparing with route cache reject entries.
2312            Note, that multicast routers are not affected, because
2313            route cache entry is created eventually.
2314          */
2315         if (ipv4_is_multicast(daddr)) {
2316                 struct in_device *in_dev;
2317
2318                 rcu_read_lock();
2319                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2320                         int our = ip_check_mc(in_dev, daddr, saddr,
2321                                 ip_hdr(skb)->protocol);
2322                         if (our
2323 #ifdef CONFIG_IP_MROUTE
2324                                 ||
2325                             (!ipv4_is_local_multicast(daddr) &&
2326                              IN_DEV_MFORWARD(in_dev))
2327 #endif
2328                            ) {
2329                                 rcu_read_unlock();
2330                                 return ip_route_input_mc(skb, daddr, saddr,
2331                                                          tos, dev, our);
2332                         }
2333                 }
2334                 rcu_read_unlock();
2335                 return -EINVAL;
2336         }
2337         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2338 }
2339
2340 static int __mkroute_output(struct rtable **result,
2341                             struct fib_result *res,
2342                             const struct flowi *fl,
2343                             const struct flowi *oldflp,
2344                             struct net_device *dev_out,
2345                             unsigned flags)
2346 {
2347         struct rtable *rth;
2348         struct in_device *in_dev;
2349         u32 tos = RT_FL_TOS(oldflp);
2350         int err = 0;
2351
2352         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2353                 return -EINVAL;
2354
2355         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2356                 res->type = RTN_BROADCAST;
2357         else if (ipv4_is_multicast(fl->fl4_dst))
2358                 res->type = RTN_MULTICAST;
2359         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2360                 return -EINVAL;
2361
2362         if (dev_out->flags & IFF_LOOPBACK)
2363                 flags |= RTCF_LOCAL;
2364
2365         /* get work reference to inet device */
2366         in_dev = in_dev_get(dev_out);
2367         if (!in_dev)
2368                 return -EINVAL;
2369
2370         if (res->type == RTN_BROADCAST) {
2371                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2372                 if (res->fi) {
2373                         fib_info_put(res->fi);
2374                         res->fi = NULL;
2375                 }
2376         } else if (res->type == RTN_MULTICAST) {
2377                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2378                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2379                                  oldflp->proto))
2380                         flags &= ~RTCF_LOCAL;
2381                 /* If multicast route do not exist use
2382                    default one, but do not gateway in this case.
2383                    Yes, it is hack.
2384                  */
2385                 if (res->fi && res->prefixlen < 4) {
2386                         fib_info_put(res->fi);
2387                         res->fi = NULL;
2388                 }
2389         }
2390
2391
2392         rth = dst_alloc(&ipv4_dst_ops);
2393         if (!rth) {
2394                 err = -ENOBUFS;
2395                 goto cleanup;
2396         }
2397
2398         atomic_set(&rth->u.dst.__refcnt, 1);
2399         rth->u.dst.flags= DST_HOST;
2400         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2401                 rth->u.dst.flags |= DST_NOXFRM;
2402         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2403                 rth->u.dst.flags |= DST_NOPOLICY;
2404
2405         rth->fl.fl4_dst = oldflp->fl4_dst;
2406         rth->fl.fl4_tos = tos;
2407         rth->fl.fl4_src = oldflp->fl4_src;
2408         rth->fl.oif     = oldflp->oif;
2409         rth->fl.mark    = oldflp->mark;
2410         rth->rt_dst     = fl->fl4_dst;
2411         rth->rt_src     = fl->fl4_src;
2412         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2413         /* get references to the devices that are to be hold by the routing
2414            cache entry */
2415         rth->u.dst.dev  = dev_out;
2416         dev_hold(dev_out);
2417         rth->idev       = in_dev_get(dev_out);
2418         rth->rt_gateway = fl->fl4_dst;
2419         rth->rt_spec_dst= fl->fl4_src;
2420
2421         rth->u.dst.output=ip_output;
2422         rth->u.dst.obsolete = -1;
2423         rth->rt_genid = rt_genid(dev_net(dev_out));
2424
2425         RT_CACHE_STAT_INC(out_slow_tot);
2426
2427         if (flags & RTCF_LOCAL) {
2428                 rth->u.dst.input = ip_local_deliver;
2429                 rth->rt_spec_dst = fl->fl4_dst;
2430         }
2431         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2432                 rth->rt_spec_dst = fl->fl4_src;
2433                 if (flags & RTCF_LOCAL &&
2434                     !(dev_out->flags & IFF_LOOPBACK)) {
2435                         rth->u.dst.output = ip_mc_output;
2436                         RT_CACHE_STAT_INC(out_slow_mc);
2437                 }
2438 #ifdef CONFIG_IP_MROUTE
2439                 if (res->type == RTN_MULTICAST) {
2440                         if (IN_DEV_MFORWARD(in_dev) &&
2441                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2442                                 rth->u.dst.input = ip_mr_input;
2443                                 rth->u.dst.output = ip_mc_output;
2444                         }
2445                 }
2446 #endif
2447         }
2448
2449         rt_set_nexthop(rth, res, 0);
2450
2451         rth->rt_flags = flags;
2452
2453         *result = rth;
2454  cleanup:
2455         /* release work reference to inet device */
2456         in_dev_put(in_dev);
2457
2458         return err;
2459 }
2460
2461 static int ip_mkroute_output(struct rtable **rp,
2462                              struct fib_result *res,
2463                              const struct flowi *fl,
2464                              const struct flowi *oldflp,
2465                              struct net_device *dev_out,
2466                              unsigned flags)
2467 {
2468         struct rtable *rth = NULL;
2469         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2470         unsigned hash;
2471         if (err == 0) {
2472                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2473                                rt_genid(dev_net(dev_out)));
2474                 err = rt_intern_hash(hash, rth, rp, NULL);
2475         }
2476
2477         return err;
2478 }
2479
2480 /*
2481  * Major route resolver routine.
      *
      * Resolves an output route for the flow key oldflp.  A private copy of
      * the key (fl) is completed along the way (source address, output
      * interface) and the route itself is built by ip_mkroute_output(),
      * which receives rp.
      *
      * Order of the decisions taken below:
      *   1. a caller-supplied source address is validated;
      *   2. a caller-supplied output interface (oif) is looked up;
      *   3. an empty destination is routed over the loopback device;
      *   4. otherwise fib_lookup() is consulted; if it fails and oif was
      *      given, the destination is assumed to be on-link;
      *   5. a RTN_LOCAL result is redirected to the loopback device;
      *   6. multipath / default-route selection picks the final nexthop.
      *
      * Returns the value of ip_mkroute_output() when a route is built,
      * otherwise a negative errno assigned here: -EINVAL (rejected source
      * address), -ENODEV (oif lookup failed or device has no in_device),
      * -ENETUNREACH (fib_lookup() failed and no oif was given).
2482  */
2483
2484 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2485                                 const struct flowi *oldflp)
2486 {
2487         u32 tos = RT_FL_TOS(oldflp);
             /* Working copy of the key; oldflp itself is never modified. */
2488         struct flowi fl = { .nl_u = { .ip4_u =
2489                                       { .daddr = oldflp->fl4_dst,
2490                                         .saddr = oldflp->fl4_src,
2491                                         .tos = tos & IPTOS_RT_MASK,
2492                                         .scope = ((tos & RTO_ONLINK) ?
2493                                                   RT_SCOPE_LINK :
2494                                                   RT_SCOPE_UNIVERSE),
2495                                       } },
2496                             .mark = oldflp->mark,
2497                             .iif = net->loopback_dev->ifindex,
2498                             .oif = oldflp->oif };
2499         struct fib_result res;
2500         unsigned flags = 0;
2501         struct net_device *dev_out = NULL;
             /* Set once fib_lookup() succeeded; res is then released at make_route. */
2502         int free_res = 0;
2503         int err;
2504
2505
2506         res.fi          = NULL;
2507 #ifdef CONFIG_IP_MULTIPLE_TABLES
2508         res.r           = NULL;
2509 #endif
2510
             /* Step 1: the caller supplied a source address. */
2511         if (oldflp->fl4_src) {
2512                 err = -EINVAL;
2513                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2514                     ipv4_is_lbcast(oldflp->fl4_src) ||
2515                     ipv4_is_zeronet(oldflp->fl4_src))
2516                         goto out;
2517
2518                 /* I removed check for oif == dev_out->oif here.
2519                    It was wrong for two reasons:
2520                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2521                       is assigned to multiple interfaces.
2522                    2. Moreover, we are allowed to send packets with saddr
2523                       of another iface. --ANK
2524                  */
2525
2526                 if (oldflp->oif == 0 &&
2527                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2528                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2529                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2530                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2531                         if (dev_out == NULL)
2532                                 goto out;
2533
2534                         /* Special hack: user can direct multicasts
2535                            and limited broadcast via necessary interface
2536                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2537                            This hack is not just for fun, it allows
2538                            vic,vat and friends to work.
2539                            They bind socket to loopback, set ttl to zero
2540                            and expect that it will work.
2541                            From the viewpoint of routing cache they are broken,
2542                            because we are not allowed to build multicast path
2543                            with loopback source addr (look, routing cache
2544                            cannot know, that ttl is zero, so that packet
2545                            will not leave this host and route is valid).
2546                            Luckily, this hack is good workaround.
2547                          */
2548
2549                         fl.oif = dev_out->ifindex;
2550                         goto make_route;
2551                 }
2552
                     /* Unless FLOWI_FLAG_ANYSRC is set, ip_dev_find() must know
                        the source address; the device it returns is only used
                        as a check and is put back immediately. */
2553                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2554                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2555                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2556                         if (dev_out == NULL)
2557                                 goto out;
2558                         dev_put(dev_out);
2559                         dev_out = NULL;
2560                 }
2561         }
2562
2563
             /* Step 2: the caller pinned the output interface. */
2564         if (oldflp->oif) {
2565                 dev_out = dev_get_by_index(net, oldflp->oif);
2566                 err = -ENODEV;
2567                 if (dev_out == NULL)
2568                         goto out;
2569
2570                 /* RACE: Check return value of inet_select_addr instead. */
2571                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2572                         dev_put(dev_out);
2573                         goto out;       /* Wrong error code */
2574                 }
2575
                     /* Local multicast and limited broadcast skip the FIB
                        lookup; only a missing source address is filled in. */
2576                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2577                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2578                         if (!fl.fl4_src)
2579                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2580                                                               RT_SCOPE_LINK);
2581                         goto make_route;
2582                 }
2583                 if (!fl.fl4_src) {
2584                         if (ipv4_is_multicast(oldflp->fl4_dst))
2585                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2586                                                               fl.fl4_scope);
2587                         else if (!oldflp->fl4_dst)
2588                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2589                                                               RT_SCOPE_HOST);
2590                 }
2591         }
2592
             /* Step 3: no destination.  Use the source address as destination
                (or INADDR_LOOPBACK for both when neither is set) and send
                through the loopback device. */
2593         if (!fl.fl4_dst) {
2594                 fl.fl4_dst = fl.fl4_src;
2595                 if (!fl.fl4_dst)
2596                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2597                 if (dev_out)
2598                         dev_put(dev_out);
2599                 dev_out = net->loopback_dev;
2600                 dev_hold(dev_out);
2601                 fl.oif = net->loopback_dev->ifindex;
2602                 res.type = RTN_LOCAL;
2603                 flags |= RTCF_LOCAL;
2604                 goto make_route;
2605         }
2606
             /* Step 4: FIB lookup; a non-zero return is treated as failure. */
2607         if (fib_lookup(net, &fl, &res)) {
2608                 res.fi = NULL;
2609                 if (oldflp->oif) {
2610                         /* Apparently, routing tables are wrong. Assume,
2611                            that the destination is on link.
2612
2613                            WHY? DW.
2614                            Because we are allowed to send to iface
2615                            even if it has NO routes and NO assigned
2616                            addresses. When oif is specified, routing
2617                            tables are looked up with only one purpose:
2618                            to catch if destination is gatewayed, rather than
2619                            direct. Moreover, if MSG_DONTROUTE is set,
2620                            we send packet, ignoring both routing tables
2621                            and ifaddr state. --ANK
2622
2623
2624                            We could make it even if oif is unknown,
2625                            likely IPv6, but we do not.
2626                          */
2627
2628                         if (fl.fl4_src == 0)
2629                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2630                                                               RT_SCOPE_LINK);
2631                         res.type = RTN_UNICAST;
2632                         goto make_route;
2633                 }
2634                 if (dev_out)
2635                         dev_put(dev_out);
2636                 err = -ENETUNREACH;
2637                 goto out;
2638         }
2639         free_res = 1;
2640
             /* Step 5: the destination is local.  Switch to the loopback
                device and drop the fib_info attached to the result. */
2641         if (res.type == RTN_LOCAL) {
2642                 if (!fl.fl4_src)
2643                         fl.fl4_src = fl.fl4_dst;
2644                 if (dev_out)
2645                         dev_put(dev_out);
2646                 dev_out = net->loopback_dev;
2647                 dev_hold(dev_out);
2648                 fl.oif = dev_out->ifindex;
2649                 if (res.fi)
2650                         fib_info_put(res.fi);
2651                 res.fi = NULL;
2652                 flags |= RTCF_LOCAL;
2653                 goto make_route;
2654         }
2655
             /* Step 6: with several nexthops and no oif, pick one via
                fib_select_multipath(); otherwise a zero-length-prefix unicast
                match without oif goes through fib_select_default(). */
2656 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2657         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2658                 fib_select_multipath(&fl, &res);
2659         else
2660 #endif
2661         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2662                 fib_select_default(net, &fl, &res);
2663
2664         if (!fl.fl4_src)
2665                 fl.fl4_src = FIB_RES_PREFSRC(res);
2666
             /* Replace any device reference taken earlier by a reference on
                the device of the selected FIB result. */
2667         if (dev_out)
2668                 dev_put(dev_out);
2669         dev_out = FIB_RES_DEV(res);
2670         dev_hold(dev_out);
2671         fl.oif = dev_out->ifindex;
2672
2673
2674 make_route:
2675         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2676
2677
             /* Release the FIB result (only if the lookup succeeded) and the
                device reference held across the call above. */
2678         if (free_res)
2679                 fib_res_put(&res);
2680         if (dev_out)
2681                 dev_put(dev_out);
2682 out:    return err;
2683 }
2684
2685 int __ip_route_output_key(struct net *net, struct rtable **rp,
2686                           const struct flowi *flp)
2687 {
2688         unsigned hash;
2689         struct rtable *rth;
2690
2691         if (!rt_caching(net))
2692                 goto slow_output;
2693
2694         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2695
2696         rcu_read_lock_bh();
2697         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2698                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2699                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2700                     rth->fl.fl4_src == flp->fl4_src &&
2701                     rth->fl.iif == 0 &&
2702                     rth->fl.oif == flp->oif &&
2703                     rth->fl.mark == flp->mark &&
2704                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2705                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2706                     net_eq(dev_net(rth->u.dst.dev), net) &&
2707                     !rt_is_expired(rth)) {
2708                         dst_use(&rth->u.dst, jiffies);
2709                         RT_CACHE_STAT_INC(out_hit);
2710                         rcu_read_unlock_bh();
2711                         *rp = rth;
2712                         return 0;
2713                 }
2714                 RT_CACHE_STAT_INC(out_hlist_search);
2715         }
2716         rcu_read_unlock_bh();
2717
2718 slow_output:
2719         return ip_route_output_slow(net, rp, flp);
2720 }
2721
2722 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2723
2724 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2725 {
2726         return NULL;
2727 }
2728
2729 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2730 {
2731 }
2732
2733 static struct dst_ops ipv4_dst_blackhole_ops = {
2734         .family                 =       AF_INET,
2735         .protocol               =       cpu_to_be16(ETH_P_IP),
2736         .destroy                =       ipv4_dst_destroy,
2737         .check                  =       ipv4_blackhole_dst_check,
2738         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2739         .entries                =       ATOMIC_INIT(0),
2740 };
2741
2742
2743 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2744 {
2745         struct rtable *ort = *rp;
2746         struct rtable *rt = (struct rtable *)
2747                 dst_alloc(&ipv4_dst_blackhole_ops);
2748
2749         if (rt) {
2750                 struct dst_entry *new = &rt->u.dst;
2751
2752                 atomic_set(&new->__refcnt, 1);
2753                 new->__use = 1;
2754                 new->input = dst_discard;
2755                 new->output = dst_discard;
2756                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2757
2758                 new->dev = ort->u.dst.dev;
2759                 if (new->dev)
2760                         dev_hold(new->dev);
2761
2762                 rt->fl = ort->fl;
2763
2764                 rt->idev = ort->idev;
2765                 if (rt->idev)
2766                         in_dev_hold(rt->idev);
2767                 rt->rt_genid = rt_genid(net);
2768                 rt->rt_flags = ort->rt_flags;
2769                 rt->rt_type = ort->rt_type;
2770                 rt->rt_dst = ort->rt_dst;
2771                 rt->rt_src = ort->rt_src;
2772                 rt->rt_iif = ort->rt_iif;
2773                 rt->rt_gateway = ort->rt_gateway;
2774                 rt->rt_spec_dst = ort->rt_spec_dst;
2775                 rt->peer = ort->peer;
2776                 if (rt->peer)
2777                         atomic_inc(&rt->peer->refcnt);
2778
2779                 dst_free(new);
2780         }
2781
2782         dst_release(&(*rp)->u.dst);
2783         *rp = rt;
2784         return (rt ? 0 : -ENOMEM);
2785 }
2786
2787 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2788                          struct sock *sk, int flags)
2789 {
2790         int err;
2791
2792         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2793                 return err;
2794
2795         if (flp->proto) {
2796                 if (!flp->fl4_src)
2797                         flp->fl4_src = (*rp)->rt_src;
2798                 if (!flp->fl4_dst)
2799                         flp->fl4_dst = (*rp)->rt_dst;
2800                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2801                                     flags ? XFRM_LOOKUP_WAIT : 0);
2802                 if (err == -EREMOTE)
2803                         err = ipv4_dst_blackhole(net, rp, flp);
2804
2805                 return err;
2806         }
2807
2808         return 0;
2809 }
2810
2811 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2812
2813 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2814 {
2815         return ip_route_output_flow(net, rp, flp, NULL, 0);
2816 }
2817
/*
 * Append one route message describing the rtable attached to skb
 * (skb_rtable(skb)) to the netlink message under construction in skb.
 *
 * pid, seq, event and flags are passed to nlmsg_put() for the message
 * header; nowait is forwarded to ipmr_get_route() (CONFIG_IP_MROUTE only).
 *
 * Returns the value of nlmsg_end() on success.  Returns -EMSGSIZE when the
 * header cannot be added, or when the nla_put_failure label is reached (the
 * partial message is cancelled first).  Returns 0 when ipmr_get_route()
 * returned 0 and nowait is clear.
 *
 * NOTE(review): the NLA_PUT* macros are defined outside this chunk; they
 * are expected to jump to nla_put_failure when an attribute does not fit.
 */
2818 static int rt_fill_info(struct net *net,
2819                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2820                         int nowait, unsigned int flags)
2821 {
2822         struct rtable *rt = skb_rtable(skb);
2823         struct rtmsg *r;
2824         struct nlmsghdr *nlh;
2825         long expires;
2826         u32 id = 0, ts = 0, tsage = 0, error;
2827
2828         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2829         if (nlh == NULL)
2830                 return -EMSGSIZE;
2831
             /* Fixed rtmsg header: always a /32 destination reported from the main table. */
2832         r = nlmsg_data(nlh);
2833         r->rtm_family    = AF_INET;
2834         r->rtm_dst_len  = 32;
2835         r->rtm_src_len  = 0;
2836         r->rtm_tos      = rt->fl.fl4_tos;
2837         r->rtm_table    = RT_TABLE_MAIN;
2838         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2839         r->rtm_type     = rt->rt_type;
2840         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2841         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the bits of rt_flags above the low 16 are exported. */
2842         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2843         if (rt->rt_flags & RTCF_NOTIFY)
2844                 r->rtm_flags |= RTM_F_NOTIFY;
2845
2846         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2847
2848         if (rt->fl.fl4_src) {
2849                 r->rtm_src_len = 32;
2850                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2851         }
2852         if (rt->u.dst.dev)
2853                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2854 #ifdef CONFIG_NET_CLS_ROUTE
2855         if (rt->u.dst.tclassid)
2856                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2857 #endif
             /* Preferred source: rt_spec_dst when fl.iif is set, otherwise
                rt_src, but only if it differs from the source in the key. */
2858         if (rt->fl.iif)
2859                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2860         else if (rt->rt_src != rt->fl.fl4_src)
2861                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2862
2863         if (rt->rt_dst != rt->rt_gateway)
2864                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2865
2866         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2867                 goto nla_put_failure;
2868
             /* Values for the cacheinfo attribute emitted at the end. */
2869         error = rt->u.dst.error;
             /* Remaining time relative to jiffies, or 0 when no expiry is set. */
2870         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2871         if (rt->peer) {
2872                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2873                 if (rt->peer->tcp_ts_stamp) {
2874                         ts = rt->peer->tcp_ts;
2875                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2876                 }
2877         }
2878
2879         if (rt->fl.iif) {
2880 #ifdef CONFIG_IP_MROUTE
2881                 __be32 dst = rt->rt_dst;
2882
                     /* Non-local multicast destination with MC_FORWARDING
                        enabled: ipmr_get_route() is called instead of
                        emitting RTA_IIF. */
2883                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2884                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2885                         int err = ipmr_get_route(net, skb, r, nowait);
2886                         if (err <= 0) {
2887                                 if (!nowait) {
2888                                         if (err == 0)
2889                                                 return 0;
2890                                         goto nla_put_failure;
2891                                 } else {
                                             /* With nowait set, errors other than
                                                -EMSGSIZE are reported through the
                                                cacheinfo error field. */
2892                                         if (err == -EMSGSIZE)
2893                                                 goto nla_put_failure;
2894                                         error = err;
2895                                 }
2896                         }
2897                 } else
2898 #endif
2899                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2900         }
2901
2902         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2903                                expires, error) < 0)
2904                 goto nla_put_failure;
2905
2906         return nlmsg_end(skb, nlh);
2907
2908 nla_put_failure:
             /* Undo the partially built message before reporting the failure. */
2909         nlmsg_cancel(skb, nlh);
2910         return -EMSGSIZE;
2911 }
2912
/*
 * Netlink route query: resolve a route for the addresses given in the
 * request and send the result back to the requester.
 *
 * RTA_SRC, RTA_DST and RTA_IIF are read from the request (0 when absent).
 * With a non-zero RTA_IIF the route is resolved by ip_route_input() on a
 * scratch skb attributed to that device; otherwise ip_route_output_key()
 * is used with RTA_OIF as the output interface.  The route is then
 * described with rt_fill_info() as RTM_NEWROUTE and unicast to the
 * requesting pid.
 *
 * Returns the value of rtnl_unicast() on success, otherwise an error from
 * parsing, allocation (-ENOBUFS), device lookup (-ENODEV), route resolution
 * or rt_fill_info(); the scratch skb is freed on every error after its
 * allocation.
 *
 * NOTE(review): presumably registered as the RTM_GETROUTE handler and run
 * under the rtnl lock (the registration is not part of this chunk).
 */
2913 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2914 {
2915         struct net *net = sock_net(in_skb->sk);
2916         struct rtmsg *rtm;
2917         struct nlattr *tb[RTA_MAX+1];
2918         struct rtable *rt = NULL;
2919         __be32 dst = 0;
2920         __be32 src = 0;
2921         u32 iif;
2922         int err;
2923         struct sk_buff *skb;
2924
2925         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2926         if (err < 0)
2927                 goto errout;
2928
2929         rtm = nlmsg_data(nlh);
2930
             /* Scratch skb: used for the lookup and then as the reply message. */
2931         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2932         if (skb == NULL) {
2933                 err = -ENOBUFS;
2934                 goto errout;
2935         }
2936
2937         /* Reserve room for dummy headers, this skb can pass
2938            through good chunk of routing engine.
2939          */
2940         skb_reset_mac_header(skb);
2941         skb_reset_network_header(skb);
2942
2943         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2944         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2945         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2946
2947         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2948         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2949         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2950
2951         if (iif) {
                     /* Input-route query: resolve as if received on device iif. */
2952                 struct net_device *dev;
2953
2954                 dev = __dev_get_by_index(net, iif);
2955                 if (dev == NULL) {
2956                         err = -ENODEV;
2957                         goto errout_free;
2958                 }
2959
2960                 skb->protocol   = htons(ETH_P_IP);
2961                 skb->dev        = dev;
2962                 local_bh_disable();
2963                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2964                 local_bh_enable();
2965
                     /* A lookup that succeeded but produced a route carrying an
                        error is reported as that error, negated. */
2966                 rt = skb_rtable(skb);
2967                 if (err == 0 && rt->u.dst.error)
2968                         err = -rt->u.dst.error;
2969         } else {
                     /* Output-route query. */
2970                 struct flowi fl = {
2971                         .nl_u = {
2972                                 .ip4_u = {
2973                                         .daddr = dst,
2974                                         .saddr = src,
2975                                         .tos = rtm->rtm_tos,
2976                                 },
2977                         },
2978                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2979                 };
2980                 err = ip_route_output_key(net, &rt, &fl);
2981         }
2982
2983         if (err)
2984                 goto errout_free;
2985
2986         skb_dst_set(skb, &rt->u.dst);
2987         if (rtm->rtm_flags & RTM_F_NOTIFY)
2988                 rt->rt_flags |= RTCF_NOTIFY;
2989
             /* rt_fill_info() is called with nowait = 0 and no message flags;
                a result <= 0 means there is nothing to send from here. */
2990         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2991                            RTM_NEWROUTE, 0, 0);
2992         if (err <= 0)
2993                 goto errout_free;
2994
2995         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2996 errout:
2997         return err;
2998
2999 errout_free:
3000         kfree_skb(skb);
3001         goto errout;
3002 }
3003
3004 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3005 {
3006         struct rtable *rt;
3007         int h, s_h;
3008         int idx, s_idx;
3009         struct net *net;
3010
3011         net = sock_net(skb->sk);
3012
3013         s_h = cb->args[0];
3014         if (s_h < 0)
3015                 s_h = 0;
3016         s_idx = idx = cb->args[1];
3017         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3018                 if (!rt_hash_table[h].chain)
3019                         continue;
3020                 rcu_read_lock_bh();
3021                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3022                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3023                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3024                                 continue;
3025                         if (rt_is_expired(rt))
3026                                 continue;
3027                         skb_dst_set(skb, dst_clone(&rt->u.dst));
3028                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3029                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3030                                          1, NLM_F_MULTI) <= 0) {
3031                                 skb_dst_drop(skb);
3032                                 rcu_read_unlock_bh();
3033                                 goto done;
3034                         }
3035                         skb_dst_drop(skb);
3036                 }
3037                 rcu_read_unlock_bh();
3038         }
3039
3040 done:
3041         cb->args[0] = h;
3042         cb->args[1] = idx;
3043         return skb->len;
3044 }
3045
3046 void ip_rt_multicast_event(struct in_device *in_dev)
3047 {
3048         rt_cache_flush(dev_net(in_dev->dev), 0);
3049 }
3050
3051 #ifdef CONFIG_SYSCTL
3052 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3053                                         void __user *buffer,
3054                                         size_t *lenp, loff_t *ppos)
3055 {
3056         if (write) {
3057                 int flush_delay;
3058                 ctl_table ctl;
3059                 struct net *net;
3060
3061                 memcpy(&ctl, __ctl, sizeof(ctl));
3062                 ctl.data = &flush_delay;
3063                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3064
3065                 net = (struct net *)__ctl->extra1;
3066                 rt_cache_flush(net, flush_delay);
3067                 return 0;
3068         }
3069
3070         return -EINVAL;
3071 }
3072
3073 static void rt_secret_reschedule(int old)
3074 {
3075         struct net *net;
3076         int new = ip_rt_secret_interval;
3077         int diff = new - old;
3078
3079         if (!diff)
3080                 return;
3081
3082         rtnl_lock();
3083         for_each_net(net) {
3084                 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3085                 long time;
3086
3087                 if (!new)
3088                         continue;
3089
3090                 if (deleted) {
3091                         time = net->ipv4.rt_secret_timer.expires - jiffies;
3092
3093                         if (time <= 0 || (time += diff) <= 0)
3094                                 time = 0;
3095                 } else
3096                         time = new;
3097
3098                 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
3099         }
3100         rtnl_unlock();
3101 }
3102
3103 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3104                                           void __user *buffer, size_t *lenp,
3105                                           loff_t *ppos)
3106 {
3107         int old = ip_rt_secret_interval;
3108         int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3109
3110         rt_secret_reschedule(old);
3111
3112         return ret;
3113 }
3114
3115 static ctl_table ipv4_route_table[] = {
3116         {
3117                 .procname       = "gc_thresh",
3118                 .data           = &ipv4_dst_ops.gc_thresh,
3119                 .maxlen         = sizeof(int),
3120                 .mode           = 0644,
3121                 .proc_handler   = proc_dointvec,
3122         },
3123         {
3124                 .procname       = "max_size",
3125                 .data           = &ip_rt_max_size,
3126                 .maxlen         = sizeof(int),
3127                 .mode           = 0644,
3128                 .proc_handler   = proc_dointvec,
3129         },
3130         {
3131                 /*  Deprecated. Use gc_min_interval_ms */
3132
3133                 .procname       = "gc_min_interval",
3134                 .data           = &ip_rt_gc_min_interval,
3135                 .maxlen         = sizeof(int),
3136                 .mode           = 0644,
3137                 .proc_handler   = proc_dointvec_jiffies,
3138         },
3139         {
3140                 .procname       = "gc_min_interval_ms",
3141                 .data           = &ip_rt_gc_min_interval,
3142                 .maxlen         = sizeof(int),
3143                 .mode           = 0644,
3144                 .proc_handler   = proc_dointvec_ms_jiffies,
3145         },
3146         {
3147                 .procname       = "gc_timeout",
3148                 .data           = &ip_rt_gc_timeout,
3149                 .maxlen         = sizeof(int),
3150                 .mode           = 0644,
3151                 .proc_handler   = proc_dointvec_jiffies,
3152         },
3153         {
3154                 .procname       = "gc_interval",
3155                 .data           = &ip_rt_gc_interval,
3156                 .maxlen         = sizeof(int),
3157                 .mode           = 0644,
3158                 .proc_handler   = proc_dointvec_jiffies,
3159         },
3160         {
3161                 .procname       = "redirect_load",
3162                 .data           = &ip_rt_redirect_load,
3163                 .maxlen         = sizeof(int),
3164                 .mode           = 0644,
3165                 .proc_handler   = proc_dointvec,
3166         },
3167         {
3168                 .procname       = "redirect_number",
3169                 .data           = &ip_rt_redirect_number,
3170                 .maxlen         = sizeof(int),
3171                 .mode           = 0644,
3172                 .proc_handler   = proc_dointvec,
3173         },
3174         {
3175                 .procname       = "redirect_silence",
3176                 .data           = &ip_rt_redirect_silence,
3177                 .maxlen         = sizeof(int),
3178                 .mode           = 0644,
3179                 .proc_handler   = proc_dointvec,
3180         },
3181         {
3182                 .procname       = "error_cost",
3183                 .data           = &ip_rt_error_cost,
3184                 .maxlen         = sizeof(int),
3185                 .mode           = 0644,
3186                 .proc_handler   = proc_dointvec,
3187         },
3188         {
3189                 .procname       = "error_burst",
3190                 .data           = &ip_rt_error_burst,
3191                 .maxlen         = sizeof(int),
3192                 .mode           = 0644,
3193                 .proc_handler   = proc_dointvec,
3194         },
3195         {
3196                 .procname       = "gc_elasticity",
3197                 .data           = &ip_rt_gc_elasticity,
3198                 .maxlen         = sizeof(int),
3199                 .mode           = 0644,
3200                 .proc_handler   = proc_dointvec,
3201         },
3202         {
3203                 .procname       = "mtu_expires",
3204                 .data           = &ip_rt_mtu_expires,
3205                 .maxlen         = sizeof(int),
3206                 .mode           = 0644,
3207                 .proc_handler   = proc_dointvec_jiffies,
3208         },
3209         {
3210                 .procname       = "min_pmtu",
3211                 .data           = &ip_rt_min_pmtu,
3212                 .maxlen         = sizeof(int),
3213                 .mode           = 0644,
3214                 .proc_handler   = proc_dointvec,
3215         },
3216         {
3217                 .procname       = "min_adv_mss",
3218                 .data           = &ip_rt_min_advmss,
3219                 .maxlen         = sizeof(int),
3220                 .mode           = 0644,
3221                 .proc_handler   = proc_dointvec,
3222         },
3223         {
3224                 .procname       = "secret_interval",
3225                 .data           = &ip_rt_secret_interval,
3226                 .maxlen         = sizeof(int),
3227                 .mode           = 0644,
3228                 .proc_handler   = ipv4_sysctl_rt_secret_interval,
3229         },
3230         { }
3231 };
3232
3233 static struct ctl_table empty[1];
3234
3235 static struct ctl_table ipv4_skeleton[] =
3236 {
3237         { .procname = "route",
3238           .mode = 0555, .child = ipv4_route_table},
3239         { .procname = "neigh",
3240           .mode = 0555, .child = empty},
3241         { }
3242 };
3243
3244 static __net_initdata struct ctl_path ipv4_path[] = {
3245         { .procname = "net", },
3246         { .procname = "ipv4", },
3247         { },
3248 };
3249
3250 static struct ctl_table ipv4_route_flush_table[] = {
3251         {
3252                 .procname       = "flush",
3253                 .maxlen         = sizeof(int),
3254                 .mode           = 0200,
3255                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3256         },
3257         { },
3258 };
3259
3260 static __net_initdata struct ctl_path ipv4_route_path[] = {
3261         { .procname = "net", },
3262         { .procname = "ipv4", },
3263         { .procname = "route", },
3264         { },
3265 };
3266
3267 static __net_init int sysctl_route_net_init(struct net *net)
3268 {
3269         struct ctl_table *tbl;
3270
3271         tbl = ipv4_route_flush_table;
3272         if (!net_eq(net, &init_net)) {
3273                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3274                 if (tbl == NULL)
3275                         goto err_dup;
3276         }
3277         tbl[0].extra1 = net;
3278
3279         net->ipv4.route_hdr =
3280                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3281         if (net->ipv4.route_hdr == NULL)
3282                 goto err_reg;
3283         return 0;
3284
3285 err_reg:
3286         if (tbl != ipv4_route_flush_table)
3287                 kfree(tbl);
3288 err_dup:
3289         return -ENOMEM;
3290 }
3291
3292 static __net_exit void sysctl_route_net_exit(struct net *net)
3293 {
3294         struct ctl_table *tbl;
3295
3296         tbl = net->ipv4.route_hdr->ctl_table_arg;
3297         unregister_net_sysctl_table(net->ipv4.route_hdr);
3298         BUG_ON(tbl == ipv4_route_flush_table);
3299         kfree(tbl);
3300 }
3301
3302 static __net_initdata struct pernet_operations sysctl_route_ops = {
3303         .init = sysctl_route_net_init,
3304         .exit = sysctl_route_net_exit,
3305 };
3306 #endif
3307
3308
3309 static __net_init int rt_secret_timer_init(struct net *net)
3310 {
3311         atomic_set(&net->ipv4.rt_genid,
3312                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3313                         (jiffies ^ (jiffies >> 7))));
3314
3315         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3316         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3317         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3318
3319         if (ip_rt_secret_interval) {
3320                 net->ipv4.rt_secret_timer.expires =
3321                         jiffies + net_random() % ip_rt_secret_interval +
3322                         ip_rt_secret_interval;
3323                 add_timer(&net->ipv4.rt_secret_timer);
3324         }
3325         return 0;
3326 }
3327
3328 static __net_exit void rt_secret_timer_exit(struct net *net)
3329 {
3330         del_timer_sync(&net->ipv4.rt_secret_timer);
3331 }
3332
3333 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3334         .init = rt_secret_timer_init,
3335         .exit = rt_secret_timer_exit,
3336 };
3337
3338
#if defined(CONFIG_NET_CLS_ROUTE)
/* Per-cpu accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3342
3343 static __initdata unsigned long rhash_entries;
3344 static int __init set_rhash_entries(char *str)
3345 {
3346         if (!str)
3347                 return 0;
3348         rhash_entries = simple_strtoul(str, &str, 0);
3349         return 1;
3350 }
3351 __setup("rhash_entries=", set_rhash_entries);
3352
3353 int __init ip_rt_init(void)
3354 {
3355         int rc = 0;
3356
3357 #ifdef CONFIG_NET_CLS_ROUTE
3358         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3359         if (!ip_rt_acct)
3360                 panic("IP: failed to allocate ip_rt_acct\n");
3361 #endif
3362
3363         ipv4_dst_ops.kmem_cachep =
3364                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3365                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3366
3367         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3368
3369         rt_hash_table = (struct rt_hash_bucket *)
3370                 alloc_large_system_hash("IP route cache",
3371                                         sizeof(struct rt_hash_bucket),
3372                                         rhash_entries,
3373                                         (totalram_pages >= 128 * 1024) ?
3374                                         15 : 17,
3375                                         0,
3376                                         &rt_hash_log,
3377                                         &rt_hash_mask,
3378                                         rhash_entries ? 0 : 512 * 1024);
3379         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3380         rt_hash_lock_init();
3381
3382         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3383         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3384
3385         devinet_init();
3386         ip_fib_init();
3387
3388         /* All the timers, started at system startup tend
3389            to synchronize. Perturb it a bit.
3390          */
3391         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3392         expires_ljiffies = jiffies;
3393         schedule_delayed_work(&expires_work,
3394                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3395
3396         if (register_pernet_subsys(&rt_secret_timer_ops))
3397                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3398
3399         if (ip_rt_proc_init())
3400                 printk(KERN_ERR "Unable to create route proc files\n");
3401 #ifdef CONFIG_XFRM
3402         xfrm_init();
3403         xfrm4_init(ip_rt_max_size);
3404 #endif
3405         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3406
3407 #ifdef CONFIG_SYSCTL
3408         register_pernet_subsys(&sysctl_route_ops);
3409 #endif
3410         return rc;
3411 }
3412
#if defined(CONFIG_SYSCTL)
/*
 * Register the static ipv4_skeleton sysctl tree under ipv4_path.
 *
 * This separate entry point is a workaround for the current IPv4
 * initialisation order; once that order is cleaned up it can be dropped.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */
3423
/* Symbols exported via EXPORT_SYMBOL(). */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);