net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132
 133 static void rt_worker_func(struct work_struct *work);
 134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 135
 136 /*
 137  *      Interface to generic destination cache.
 138  */
 139
 140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141 static void              ipv4_dst_destroy(struct dst_entry *dst);
 142 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 143                                          struct net_device *dev, int how);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 147 static int rt_garbage_collect(struct dst_ops *ops);
 148
 149
 150 static struct dst_ops ipv4_dst_ops = {
 151         .family =               AF_INET,
 152         .protocol =             __constant_htons(ETH_P_IP),
 153         .gc =                   rt_garbage_collect,
 154         .check =                ipv4_dst_check,
 155         .destroy =              ipv4_dst_destroy,
 156         .ifdown =               ipv4_dst_ifdown,
 157         .negative_advice =      ipv4_negative_advice,
 158         .link_failure =         ipv4_link_failure,
 159         .update_pmtu =          ip_rt_update_pmtu,
 160         .local_out =            __ip_local_out,
 161         .entry_size =           sizeof(struct rtable),
 162         .entries =              ATOMIC_INIT(0),
 163 };
 164
 165 #define ECN_OR_COST(class)      TC_PRIO_##class
 166
 167 const __u8 ip_tos2prio[16] = {
 168         TC_PRIO_BESTEFFORT,
 169         ECN_OR_COST(FILLER),
 170         TC_PRIO_BESTEFFORT,
 171         ECN_OR_COST(BESTEFFORT),
 172         TC_PRIO_BULK,
 173         ECN_OR_COST(BULK),
 174         TC_PRIO_BULK,
 175         ECN_OR_COST(BULK),
 176         TC_PRIO_INTERACTIVE,
 177         ECN_OR_COST(INTERACTIVE),
 178         TC_PRIO_INTERACTIVE,
 179         ECN_OR_COST(INTERACTIVE),
 180         TC_PRIO_INTERACTIVE_BULK,
 181         ECN_OR_COST(INTERACTIVE_BULK),
 182         TC_PRIO_INTERACTIVE_BULK,
 183         ECN_OR_COST(INTERACTIVE_BULK)
 184 };
 185
 186
 187 /*
 188  * Route cache.
 189  */
 190
 191 /* The locking scheme is rather straight forward:
 192  *
 193  * 1) Read-Copy Update protects the buckets of the central route hash.
 194  * 2) Only writers remove entries, and they hold the lock
 195  *    as they look at rtable reference counts.
 196  * 3) Only readers acquire references to rtable entries,
 197  *    they do so with atomic increments and with the
 198  *    lock held.
 199  */
 200
 201 struct rt_hash_bucket {
 202         struct rtable   *chain;
 203 };
 204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 205         defined(CONFIG_PROVE_LOCKING)
 206 /*
 207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 208  * The size of this table is a power of two and depends on the number of CPUS.
 209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 210  */
 211 #ifdef CONFIG_LOCKDEP
 212 # define RT_HASH_LOCK_SZ        256
 213 #else
 214 # if NR_CPUS >= 32
 215 #  define RT_HASH_LOCK_SZ       4096
 216 # elif NR_CPUS >= 16
 217 #  define RT_HASH_LOCK_SZ       2048
 218 # elif NR_CPUS >= 8
 219 #  define RT_HASH_LOCK_SZ       1024
 220 # elif NR_CPUS >= 4
 221 #  define RT_HASH_LOCK_SZ       512
 222 # else
 223 #  define RT_HASH_LOCK_SZ       256
 224 # endif
 225 #endif
 226
 227 static spinlock_t       *rt_hash_locks;
 228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 229
 230 static __init void rt_hash_lock_init(void)
 231 {
 232         int i;
 233
 234         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 235                         GFP_KERNEL);
 236         if (!rt_hash_locks)
 237                 panic("IP: failed to allocate rt_hash_locks\n");
 238
 239         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 240                 spin_lock_init(&rt_hash_locks[i]);
 241 }
 242 #else
 243 # define rt_hash_lock_addr(slot) NULL
 244
 245 static inline void rt_hash_lock_init(void)
 246 {
 247 }
 248 #endif
 249
 250 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 251 static unsigned                 rt_hash_mask __read_mostly;
 252 static unsigned int             rt_hash_log  __read_mostly;
 253
 254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 255 #define RT_CACHE_STAT_INC(field) \
 256         (__raw_get_cpu_var(rt_cache_stat).field++)
 257
 258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 259                 int genid)
 260 {
 261         return jhash_3words((__force u32)(__be32)(daddr),
 262                             (__force u32)(__be32)(saddr),
 263                             idx, genid)
 264                 & rt_hash_mask;
 265 }
 266
 267 static inline int rt_genid(struct net *net)
 268 {
 269         return atomic_read(&net->ipv4.rt_genid);
 270 }
 271
 272 #ifdef CONFIG_PROC_FS
 273 struct rt_cache_iter_state {
 274         struct seq_net_private p;
 275         int bucket;
 276         int genid;
 277 };
 278
 279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 280 {
 281         struct rt_cache_iter_state *st = seq->private;
 282         struct rtable *r = NULL;
 283
 284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 285                 if (!rt_hash_table[st->bucket].chain)
 286                         continue;
 287                 rcu_read_lock_bh();
 288                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 289                 while (r) {
 290                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 291                             r->rt_genid == st->genid)
 292                                 return r;
 293                         r = rcu_dereference(r->u.dst.rt_next);
 294                 }
 295                 rcu_read_unlock_bh();
 296         }
 297         return r;
 298 }
 299
 300 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 301                                           struct rtable *r)
 302 {
 303         struct rt_cache_iter_state *st = seq->private;
 304
 305         r = r->u.dst.rt_next;
 306         while (!r) {
 307                 rcu_read_unlock_bh();
 308                 do {
 309                         if (--st->bucket < 0)
 310                                 return NULL;
 311                 } while (!rt_hash_table[st->bucket].chain);
 312                 rcu_read_lock_bh();
 313                 r = rt_hash_table[st->bucket].chain;
 314         }
 315         return rcu_dereference(r);
 316 }
 317
 318 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 319                                         struct rtable *r)
 320 {
 321         struct rt_cache_iter_state *st = seq->private;
 322         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 323                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 324                         continue;
 325                 if (r->rt_genid == st->genid)
 326                         break;
 327         }
 328         return r;
 329 }
 330
 331 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 332 {
 333         struct rtable *r = rt_cache_get_first(seq);
 334
 335         if (r)
 336                 while (pos && (r = rt_cache_get_next(seq, r)))
 337                         --pos;
 338         return pos ? NULL : r;
 339 }
 340
 341 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 342 {
 343         struct rt_cache_iter_state *st = seq->private;
 344         if (*pos)
 345                 return rt_cache_get_idx(seq, *pos - 1);
 346         st->genid = rt_genid(seq_file_net(seq));
 347         return SEQ_START_TOKEN;
 348 }
 349
 350 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 351 {
 352         struct rtable *r;
 353
 354         if (v == SEQ_START_TOKEN)
 355                 r = rt_cache_get_first(seq);
 356         else
 357                 r = rt_cache_get_next(seq, v);
 358         ++*pos;
 359         return r;
 360 }
 361
 362 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 363 {
 364         if (v && v != SEQ_START_TOKEN)
 365                 rcu_read_unlock_bh();
 366 }
 367
 368 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 369 {
 370         if (v == SEQ_START_TOKEN)
 371                 seq_printf(seq, "%-127s\n",
 372                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 373                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 374                            "HHUptod\tSpecDst");
 375         else {
 376                 struct rtable *r = v;
 377                 int len;
 378
 379                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 380                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 381                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 382                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 383                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 384                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 385                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 386                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 387                         dst_metric(&r->u.dst, RTAX_WINDOW),
 388                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 389                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 390                         r->fl.fl4_tos,
 391                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 392                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 393                                        dev_queue_xmit) : 0,
 394                         r->rt_spec_dst, &len);
 395
 396                 seq_printf(seq, "%*s\n", 127 - len, "");
 397         }
 398         return 0;
 399 }
 400
 401 static const struct seq_operations rt_cache_seq_ops = {
 402         .start  = rt_cache_seq_start,
 403         .next   = rt_cache_seq_next,
 404         .stop   = rt_cache_seq_stop,
 405         .show   = rt_cache_seq_show,
 406 };
 407
 408 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 409 {
 410         return seq_open_net(inode, file, &rt_cache_seq_ops,
 411                         sizeof(struct rt_cache_iter_state));
 412 }
 413
 414 static const struct file_operations rt_cache_seq_fops = {
 415         .owner   = THIS_MODULE,
 416         .open    = rt_cache_seq_open,
 417         .read    = seq_read,
 418         .llseek  = seq_lseek,
 419         .release = seq_release_net,
 420 };
 421
 422
 423 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 424 {
 425         int cpu;
 426
 427         if (*pos == 0)
 428                 return SEQ_START_TOKEN;
 429
 430         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 431                 if (!cpu_possible(cpu))
 432                         continue;
 433                 *pos = cpu+1;
 434                 return &per_cpu(rt_cache_stat, cpu);
 435         }
 436         return NULL;
 437 }
 438
 439 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 440 {
 441         int cpu;
 442
 443         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 444                 if (!cpu_possible(cpu))
 445                         continue;
 446                 *pos = cpu+1;
 447                 return &per_cpu(rt_cache_stat, cpu);
 448         }
 449         return NULL;
 450
 451 }
 452
 453 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 454 {
 455
 456 }
 457
 458 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 459 {
 460         struct rt_cache_stat *st = v;
 461
 462         if (v == SEQ_START_TOKEN) {
 463                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 464                 return 0;
 465         }
 466
 467         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 468                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 469                    atomic_read(&ipv4_dst_ops.entries),
 470                    st->in_hit,
 471                    st->in_slow_tot,
 472                    st->in_slow_mc,
 473                    st->in_no_route,
 474                    st->in_brd,
 475                    st->in_martian_dst,
 476                    st->in_martian_src,
 477
 478                    st->out_hit,
 479                    st->out_slow_tot,
 480                    st->out_slow_mc,
 481
 482                    st->gc_total,
 483                    st->gc_ignored,
 484                    st->gc_goal_miss,
 485                    st->gc_dst_overflow,
 486                    st->in_hlist_search,
 487                    st->out_hlist_search
 488                 );
 489         return 0;
 490 }
 491
 492 static const struct seq_operations rt_cpu_seq_ops = {
 493         .start  = rt_cpu_seq_start,
 494         .next   = rt_cpu_seq_next,
 495         .stop   = rt_cpu_seq_stop,
 496         .show   = rt_cpu_seq_show,
 497 };
 498
 499
 500 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 501 {
 502         return seq_open(file, &rt_cpu_seq_ops);
 503 }
 504
 505 static const struct file_operations rt_cpu_seq_fops = {
 506         .owner   = THIS_MODULE,
 507         .open    = rt_cpu_seq_open,
 508         .read    = seq_read,
 509         .llseek  = seq_lseek,
 510         .release = seq_release,
 511 };
 512
 513 #ifdef CONFIG_NET_CLS_ROUTE
 514 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 515                            int length, int *eof, void *data)
 516 {
 517         unsigned int i;
 518
 519         if ((offset & 3) || (length & 3))
 520                 return -EIO;
 521
 522         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 523                 *eof = 1;
 524                 return 0;
 525         }
 526
 527         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 528                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 529                 *eof = 1;
 530         }
 531
 532         offset /= sizeof(u32);
 533
 534         if (length > 0) {
 535                 u32 *dst = (u32 *) buffer;
 536
 537                 *start = buffer;
 538                 memset(dst, 0, length);
 539
 540                 for_each_possible_cpu(i) {
 541                         unsigned int j;
 542                         u32 *src;
 543
 544                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 545                         for (j = 0; j < length/4; j++)
 546                                 dst[j] += src[j];
 547                 }
 548         }
 549         return length;
 550 }
 551 #endif
 552
 553 static int __net_init ip_rt_do_proc_init(struct net *net)
 554 {
 555         struct proc_dir_entry *pde;
 556
 557         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 558                         &rt_cache_seq_fops);
 559         if (!pde)
 560                 goto err1;
 561
 562         pde = proc_create("rt_cache", S_IRUGO,
 563                           net->proc_net_stat, &rt_cpu_seq_fops);
 564         if (!pde)
 565                 goto err2;
 566
 567 #ifdef CONFIG_NET_CLS_ROUTE
 568         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 569                         ip_rt_acct_read, NULL);
 570         if (!pde)
 571                 goto err3;
 572 #endif
 573         return 0;
 574
 575 #ifdef CONFIG_NET_CLS_ROUTE
 576 err3:
 577         remove_proc_entry("rt_cache", net->proc_net_stat);
 578 #endif
 579 err2:
 580         remove_proc_entry("rt_cache", net->proc_net);
 581 err1:
 582         return -ENOMEM;
 583 }
 584
 585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 586 {
 587         remove_proc_entry("rt_cache", net->proc_net_stat);
 588         remove_proc_entry("rt_cache", net->proc_net);
 589         remove_proc_entry("rt_acct", net->proc_net);
 590 }
 591
 592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 593         .init = ip_rt_do_proc_init,
 594         .exit = ip_rt_do_proc_exit,
 595 };
 596
 597 static int __init ip_rt_proc_init(void)
 598 {
 599         return register_pernet_subsys(&ip_rt_proc_ops);
 600 }
 601
 602 #else
 603 static inline int ip_rt_proc_init(void)
 604 {
 605         return 0;
 606 }
 607 #endif /* CONFIG_PROC_FS */
 608
 609 static inline void rt_free(struct rtable *rt)
 610 {
 611         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 612 }
 613
 614 static inline void rt_drop(struct rtable *rt)
 615 {
 616         ip_rt_put(rt);
 617         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 618 }
 619
 620 static inline int rt_fast_clean(struct rtable *rth)
 621 {
 622         /* Kill broadcast/multicast entries very aggresively, if they
 623            collide in hash table with more useful entries */
 624         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 625                 rth->fl.iif && rth->u.dst.rt_next;
 626 }
 627
 628 static inline int rt_valuable(struct rtable *rth)
 629 {
 630         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 631                 rth->u.dst.expires;
 632 }
 633
 634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 635 {
 636         unsigned long age;
 637         int ret = 0;
 638
 639         if (atomic_read(&rth->u.dst.__refcnt))
 640                 goto out;
 641
 642         ret = 1;
 643         if (rth->u.dst.expires &&
 644             time_after_eq(jiffies, rth->u.dst.expires))
 645                 goto out;
 646
 647         age = jiffies - rth->u.dst.lastuse;
 648         ret = 0;
 649         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 650             (age <= tmo2 && rt_valuable(rth)))
 651                 goto out;
 652         ret = 1;
 653 out:    return ret;
 654 }
 655
 656 /* Bits of score are:
 657  * 31: very valuable
 658  * 30: not quite useless
 659  * 29..0: usage counter
 660  */
 661 static inline u32 rt_score(struct rtable *rt)
 662 {
 663         u32 score = jiffies - rt->u.dst.lastuse;
 664
 665         score = ~score & ~(3<<30);
 666
 667         if (rt_valuable(rt))
 668                 score |= (1<<31);
 669
 670         if (!rt->fl.iif ||
 671             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 672                 score |= (1<<30);
 673
 674         return score;
 675 }
 676
 677 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 678 {
 679         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 680                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 681                 (fl1->mark ^ fl2->mark) |
 682                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 683                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 684                 (fl1->oif ^ fl2->oif) |
 685                 (fl1->iif ^ fl2->iif)) == 0;
 686 }
 687
 688 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 689 {
 690         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 691 }
 692
 693 static inline int rt_is_expired(struct rtable *rth)
 694 {
 695         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 696 }
 697
  698 /*
  699  * Perform a full scan of the hash table and release cache entries.
  700  * Can be called by a softirq or a process.
  701  * In the latter case, we want to reschedule if necessary.
       *
       * With CONFIG_NET_NS only entries for which rt_is_expired() is true
       * are unlinked and freed; without it every chain is emptied.
       *
       * @process_context: non-zero when called from process context, which
       *                   allows cond_resched() between buckets.
  702  */
  703 static void rt_do_flush(int process_context)
  704 {
  705         unsigned int i;
  706         struct rtable *rth, *next;
  707         struct rtable * tail;
  708
  709         for (i = 0; i <= rt_hash_mask; i++) {
  710                 if (process_context && need_resched())
  711                         cond_resched();
                      /* unlocked peek: skip empty buckets without locking */
  712                 rth = rt_hash_table[i].chain;
  713                 if (!rth)
  714                         continue;
  715
  716                 spin_lock_bh(rt_hash_lock_addr(i));
  717 #ifdef CONFIG_NET_NS
  718                 {
  719                 struct rtable ** prev, * p;
  720
                      /* re-read the chain head now that the lock is held */
  721                 rth = rt_hash_table[i].chain;
  722
  723                 /*
                       * Find the first non-expired entry (tail) and make it
                       * the new chain head; the expired run rth..tail is
                       * released only after spin_unlock.
                       */
  724                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
  725                         if (!rt_is_expired(tail))
  726                                 break;
  727                 if (rth != tail)
  728                         rt_hash_table[i].chain = tail;
  729
  730                 /*
                       * Walk the remaining chain (starting at tail): unlink
                       * and rt_free every expired entry, keep the others.
                       */
  731                 prev = &rt_hash_table[i].chain;
  732                 for (p = *prev; p; p = next) {
  733                         next = p->u.dst.rt_next;
  734                         if (!rt_is_expired(p)) {
  735                                 prev = &p->u.dst.rt_next;
  736                         } else {
  737                                 *prev = next;
  738                                 rt_free(p);
  739                         }
  740                 }
  741                 }
  742 #else
                      /* detach the whole chain; all of it is freed below */
  743                 rth = rt_hash_table[i].chain;
  744                 rt_hash_table[i].chain = NULL;
  745                 tail = NULL;
  746 #endif
  747                 spin_unlock_bh(rt_hash_lock_addr(i));
  748
                      /* free the detached entries rth..tail outside the lock */
  749                 for (; rth != tail; rth = next) {
  750                         next = rth->u.dst.rt_next;
  751                         rt_free(rth);
  752                 }
  753         }
  754 }
 755
  756 static void rt_check_expire(void)
  757 {
              /*
               * Periodic expiry pass over part of the hash table.
               *
               * "rover" remembers the last bucket visited, so successive
               * calls continue where the previous one stopped.  The number
               * of buckets scanned per call ("goal") is
               * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout
               * (the division is skipped when the timeout is <= 1) and is
               * limited to rt_hash_mask + 1.
               */
  758         static unsigned int rover;
  759         unsigned int i = rover, goal;
  760         struct rtable *rth, **rthp;
  761         u64 mult;
  762
  763         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
  764         if (ip_rt_gc_timeout > 1)
  765                 do_div(mult, ip_rt_gc_timeout);
  766         goal = (unsigned int)mult;
  767         if (goal > rt_hash_mask)
  768                 goal = rt_hash_mask + 1;
  769         for (; goal > 0; goal--) {
                      /*
                       * Per-bucket timeout; halved for every entry that is
                       * kept, so entries further down a chain are allowed
                       * a shorter age.
                       */
  770                 unsigned long tmo = ip_rt_gc_timeout;
  771
  772                 i = (i + 1) & rt_hash_mask;
  773                 rthp = &rt_hash_table[i].chain;
  774
  775                 if (need_resched())
  776                         cond_resched();
  777
                      /* unlocked peek: skip empty buckets without locking */
  778                 if (*rthp == NULL)
  779                         continue;
  780                 spin_lock_bh(rt_hash_lock_addr(i));
  781                 while ((rth = *rthp) != NULL) {
                              /* stale generation id: unlink unconditionally */
  782                         if (rt_is_expired(rth)) {
  783                                 *rthp = rth->u.dst.rt_next;
  784                                 rt_free(rth);
  785                                 continue;
  786                         }
  787                         if (rth->u.dst.expires) {
  788                                 /*
                                       * Entry with an expiry time: kept
                                       * while that time has not passed;
                                       * afterwards it is expired even if
                                       * it is in use.
                                       */
  789                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
  790                                         tmo >>= 1;
  791                                         rthp = &rth->u.dst.rt_next;
  792                                         continue;
  793                                 }
  794                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
  795                                 tmo >>= 1;
  796                                 rthp = &rth->u.dst.rt_next;
  797                                 continue;
  798                         }
  799
  800                         /* Clean up aged-off entries. */
  801                         *rthp = rth->u.dst.rt_next;
  802                         rt_free(rth);
  803                 }
  804                 spin_unlock_bh(rt_hash_lock_addr(i));
  805         }
  806         rover = i;
  807 }
 808
 809 /*
 810  * rt_worker_func() is run in process context.
 811  * we call rt_check_expire() to scan part of the hash table
 812  */
 813 static void rt_worker_func(struct work_struct *work)
 814 {
 815         rt_check_expire();
 816         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 817 }
 818
 819 /*
 820  * Pertubation of rt_genid by a small quantity [1..256]
 821  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 822  * many times (2^24) without giving recent rt_genid.
 823  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 824  */
 825 static void rt_cache_invalidate(struct net *net)
 826 {
 827         unsigned char shuffle;
 828
 829         get_random_bytes(&shuffle, sizeof(shuffle));
 830         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 831 }
 832
 833 /*
 834  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 835  * delay >= 0 : invalidate & flush cache (can be long)
 836  */
 837 void rt_cache_flush(struct net *net, int delay)
 838 {
 839         rt_cache_invalidate(net);
 840         if (delay >= 0)
 841                 rt_do_flush(!in_softirq());
 842 }
 843
 844 /*
 845  * We change rt_genid and let gc do the cleanup
 846  */
 847 static void rt_secret_rebuild(unsigned long __net)
 848 {
 849         struct net *net = (struct net *)__net;
 850         rt_cache_invalidate(net);
 851         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 852 }
 853
 854 /*
 855    Short description of GC goals.
 856
 857    We want an algorithm that keeps the routing cache
 858    at an equilibrium point, where the number of aged-off entries
 859    is kept approximately equal to the number of newly generated ones.
 860
 861    The current expiration strength is the variable "expire".
 862    We try to adjust it dynamically, so that when networking
 863    is idle "expire" is large enough to keep enough warm entries,
 864    and when load increases it is reduced to limit the cache size.
 865  */
 866
 867 static int rt_garbage_collect(struct dst_ops *ops)
 868 {
 869         static unsigned long expire = RT_GC_TIMEOUT;
 870         static unsigned long last_gc;
 871         static int rover;
 872         static int equilibrium;
 873         struct rtable *rth, **rthp;
 874         unsigned long now = jiffies;
 875         int goal;
 876
 877         /*
 878          * Garbage collection is pretty expensive,
 879          * do not make it too frequently.
 880          */
 881
 882         RT_CACHE_STAT_INC(gc_total);
 883
 884         if (now - last_gc < ip_rt_gc_min_interval &&
 885             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 886                 RT_CACHE_STAT_INC(gc_ignored);
 887                 goto out;
 888         }
 889
 890         /* Calculate number of entries, which we want to expire now. */
 891         goal = atomic_read(&ipv4_dst_ops.entries) -
 892                 (ip_rt_gc_elasticity << rt_hash_log);
 893         if (goal <= 0) {
 894                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 895                         equilibrium = ipv4_dst_ops.gc_thresh;
 896                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 897                 if (goal > 0) {
 898                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 899                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 900                 }
 901         } else {
 902                 /* We are in dangerous area. Try to reduce cache really
 903                  * aggressively.
 904                  */
 905                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 906                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 907         }
 908
 909         if (now - last_gc >= ip_rt_gc_min_interval)
 910                 last_gc = now;
 911
 912         if (goal <= 0) {
 913                 equilibrium += goal;
 914                 goto work_done;
 915         }
 916
 917         do {
 918                 int i, k;
 919
 920                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 921                         unsigned long tmo = expire;
 922
 923                         k = (k + 1) & rt_hash_mask;
 924                         rthp = &rt_hash_table[k].chain;
 925                         spin_lock_bh(rt_hash_lock_addr(k));
 926                         while ((rth = *rthp) != NULL) {
 927                                 if (!rt_is_expired(rth) &&
 928                                         !rt_may_expire(rth, tmo, expire)) {
 929                                         tmo >>= 1;
 930                                         rthp = &rth->u.dst.rt_next;
 931                                         continue;
 932                                 }
 933                                 *rthp = rth->u.dst.rt_next;
 934                                 rt_free(rth);
 935                                 goal--;
 936                         }
 937                         spin_unlock_bh(rt_hash_lock_addr(k));
 938                         if (goal <= 0)
 939                                 break;
 940                 }
 941                 rover = k;
 942
 943                 if (goal <= 0)
 944                         goto work_done;
 945
 946                 /* Goal is not achieved. We stop process if:
 947
 948                    - if expire reduced to zero. Otherwise, expire is halfed.
 949                    - if table is not full.
 950                    - if we are called from interrupt.
 951                    - jiffies check is just fallback/debug loop breaker.
 952                      We will not spin here for long time in any case.
 953                  */
 954
 955                 RT_CACHE_STAT_INC(gc_goal_miss);
 956
 957                 if (expire == 0)
 958                         break;
 959
 960                 expire >>= 1;
 961 #if RT_CACHE_DEBUG >= 2
 962                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 963                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 964 #endif
 965
 966                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 967                         goto out;
 968         } while (!in_softirq() && time_before_eq(jiffies, now));
 969
 970         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 971                 goto out;
 972         if (net_ratelimit())
 973                 printk(KERN_WARNING "dst cache overflow\n");
 974         RT_CACHE_STAT_INC(gc_dst_overflow);
 975         return 1;
 976
 977 work_done:
 978         expire += ip_rt_gc_min_interval;
 979         if (expire > ip_rt_gc_timeout ||
 980             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 981                 expire = ip_rt_gc_timeout;
 982 #if RT_CACHE_DEBUG >= 2
 983         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 984                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 985 #endif
 986 out:    return 0;
 987 }
 988
     /*
      * Insert route @rt into hash bucket @hash, or reuse an equivalent entry.
      *
      * Under the bucket lock the chain is walked once.  Entries for which
      * rt_is_expired() is true are unlinked and freed.  If an entry that
      * matches @rt (compare_keys() and compare_netns()) is found, it is moved
      * to the head of the chain, @rt is dropped and the existing entry is
      * returned through @rp.  Otherwise @rt is linked at the head of the
      * chain; when the chain is longer than ip_rt_gc_elasticity the
      * lowest-scoring unreferenced entry is evicted first.
      *
      * Returns 0 with *rp set on success, or the error from
      * arp_bind_neighbour() (with @rt dropped) when neighbour binding fails.
      */
 989 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 990 {
 991         struct rtable   *rth, **rthp;
 992         unsigned long   now;
 993         struct rtable *cand, **candp;
 994         u32             min_score;
 995         int             chain_length;
             /* One garbage-collect-and-retry is allowed, but only outside softirq. */
 996         int attempts = !in_softirq();
 997
 998 restart:
 999         chain_length = 0;
1000         min_score = ~(u32)0;
1001         cand = NULL;
1002         candp = NULL;
1003         now = jiffies;
1004
1005         rthp = &rt_hash_table[hash].chain;
1006
1007         spin_lock_bh(rt_hash_lock_addr(hash));
1008         while ((rth = *rthp) != NULL) {
1009                 if (rt_is_expired(rth)) {
1010                         *rthp = rth->u.dst.rt_next;
1011                         rt_free(rth);
1012                         continue;
1013                 }
1014                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1015                         /* Put it first */
1016                         *rthp = rth->u.dst.rt_next;
1017                         /*
1018                          * Since lookup is lockfree, the deletion
1019                          * must be visible to another weakly ordered CPU before
1020                          * the insertion at the start of the hash chain.
1021                          */
1022                         rcu_assign_pointer(rth->u.dst.rt_next,
1023                                            rt_hash_table[hash].chain);
1024                         /*
1025                          * Since lookup is lockfree, the update writes
1026                          * must be ordered for consistency on SMP.
1027                          */
1028                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1029
1030                         dst_use(&rth->u.dst, now);
1031                         spin_unlock_bh(rt_hash_lock_addr(hash));
1032
                             /* The caller's new route is redundant: hand back the cached one. */
1033                         rt_drop(rt);
1034                         *rp = rth;
1035                         return 0;
1036                 }
1037
                     /*
                      * Only entries with a zero refcount are eviction candidates.
                      * "<=" means that among equal scores the entry later in the
                      * chain becomes the candidate.
                      */
1038                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1039                         u32 score = rt_score(rth);
1040
1041                         if (score <= min_score) {
1042                                 cand = rth;
1043                                 candp = rthp;
1044                                 min_score = score;
1045                         }
1046                 }
1047
1048                 chain_length++;
1049
1050                 rthp = &rth->u.dst.rt_next;
1051         }
1052
1053         if (cand) {
1054                 /* ip_rt_gc_elasticity used to be average length of chain
1055                  * length, when exceeded gc becomes really aggressive.
1056                  *
1057                  * The second limit is less certain. At the moment it allows
1058                  * only 2 entries per bucket. We will see.
1059                  */
1060                 if (chain_length > ip_rt_gc_elasticity) {
1061                         *candp = cand->u.dst.rt_next;
1062                         rt_free(cand);
1063                 }
1064         }
1065
1066         /* Try to bind route to arp only if it is output
1067            route or unicast forwarding path.
1068          */
1069         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1070                 int err = arp_bind_neighbour(&rt->u.dst);
1071                 if (err) {
1072                         spin_unlock_bh(rt_hash_lock_addr(hash));
1073
1074                         if (err != -ENOBUFS) {
1075                                 rt_drop(rt);
1076                                 return err;
1077                         }
1078
1079                         /* Neighbour tables are full and nothing
1080                            can be released. Try to shrink route cache,
1081                            it is most likely it holds some neighbour records.
1082                          */
1083                         if (attempts-- > 0) {
                                     /*
                                      * Force an aggressive, immediate collection by
                                      * temporarily overriding two tunables, then
                                      * retry the whole insertion from "restart".
                                      * NOTE(review): these variables are not local to
                                      * this function and no lock is held while they
                                      * are overridden -- confirm concurrent readers
                                      * tolerate the temporary values.
                                      */
1084                                 int saved_elasticity = ip_rt_gc_elasticity;
1085                                 int saved_int = ip_rt_gc_min_interval;
1086                                 ip_rt_gc_elasticity     = 1;
1087                                 ip_rt_gc_min_interval   = 0;
1088                                 rt_garbage_collect(&ipv4_dst_ops);
1089                                 ip_rt_gc_min_interval   = saved_int;
1090                                 ip_rt_gc_elasticity     = saved_elasticity;
1091                                 goto restart;
1092                         }
1093
1094                         if (net_ratelimit())
1095                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1096                         rt_drop(rt);
1097                         return -ENOBUFS;
1098                 }
1099         }
1100
1101         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1102 #if RT_CACHE_DEBUG >= 2
1103         if (rt->u.dst.rt_next) {
1104                 struct rtable *trt;
1105                 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1106                        NIPQUAD(rt->rt_dst));
1107                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1108                         printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1109                 printk("\n");
1110         }
1111 #endif
1112         /*
1113          * Since lookup is lockfree, we must make sure
1114          * previous writes to rt are committed to memory
1115          * before making rt visible to other CPUS.
1116          */
1117         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1118         spin_unlock_bh(rt_hash_lock_addr(hash));
1119         *rp = rt;
1120         return 0;
1121 }
1122
1123 void rt_bind_peer(struct rtable *rt, int create)
1124 {
1125         static DEFINE_SPINLOCK(rt_peer_lock);
1126         struct inet_peer *peer;
1127
1128         peer = inet_getpeer(rt->rt_dst, create);
1129
1130         spin_lock_bh(&rt_peer_lock);
1131         if (rt->peer == NULL) {
1132                 rt->peer = peer;
1133                 peer = NULL;
1134         }
1135         spin_unlock_bh(&rt_peer_lock);
1136         if (peer)
1137                 inet_putpeer(peer);
1138 }
1139
1140 /*
1141  * Peer allocation may fail only in serious out-of-memory conditions.  However
1142  * we still can generate some output.
1143  * Random ID selection looks a bit dangerous because we have no chances to
1144  * select ID being unique in a reasonable period of time.
1145  * But broken packet identifier may be better than no packet at all.
1146  */
1147 static void ip_select_fb_ident(struct iphdr *iph)
1148 {
1149         static DEFINE_SPINLOCK(ip_fb_id_lock);
1150         static u32 ip_fallback_id;
1151         u32 salt;
1152
1153         spin_lock_bh(&ip_fb_id_lock);
1154         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1155         iph->id = htons(salt & 0xFFFF);
1156         ip_fallback_id = salt;
1157         spin_unlock_bh(&ip_fb_id_lock);
1158 }
1159
1160 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1161 {
1162         struct rtable *rt = (struct rtable *) dst;
1163
1164         if (rt) {
1165                 if (rt->peer == NULL)
1166                         rt_bind_peer(rt, 1);
1167
1168                 /* If peer is attached to destination, it is never detached,
1169                    so that we need not to grab a lock to dereference it.
1170                  */
1171                 if (rt->peer) {
1172                         iph->id = htons(inet_getid(rt->peer, more));
1173                         return;
1174                 }
1175         } else
1176                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1177                        __builtin_return_address(0));
1178
1179         ip_select_fb_ident(iph);
1180 }
1181
1182 static void rt_del(unsigned hash, struct rtable *rt)
1183 {
1184         struct rtable **rthp, *aux;
1185
1186         rthp = &rt_hash_table[hash].chain;
1187         spin_lock_bh(rt_hash_lock_addr(hash));
1188         ip_rt_put(rt);
1189         while ((aux = *rthp) != NULL) {
1190                 if (aux == rt || rt_is_expired(aux)) {
1191                         *rthp = aux->u.dst.rt_next;
1192                         rt_free(aux);
1193                         continue;
1194                 }
1195                 rthp = &aux->u.dst.rt_next;
1196         }
1197         spin_unlock_bh(rt_hash_lock_addr(hash));
1198 }
1199
     /*
      * Process a redirect that advises using @new_gw instead of @old_gw for
      * traffic from @saddr to @daddr, received on @dev.
      *
      * The redirect is rejected (and optionally logged) when @new_gw equals
      * @old_gw, redirects are disabled on the device, @new_gw is a
      * multicast/limited-broadcast/zeronet address, or the on-link / address
      * type checks below fail.
      *
      * Otherwise, for every combination of source key {@saddr, 0} and output
      * interface key {dev->ifindex, 0}, the matching output-route cache entry
      * whose gateway is @old_gw is cloned with rt_gateway = @new_gw and
      * RTCF_REDIRECTED set.  If the clone's neighbour is NUD_VALID the old
      * entry is deleted and the clone is inserted in its place; otherwise a
      * neighbour event is sent and the clone is dropped.
      */
1200 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1201                     __be32 saddr, struct net_device *dev)
1202 {
1203         int i, k;
1204         struct in_device *in_dev = in_dev_get(dev);
1205         struct rtable *rth, **rthp;
1206         __be32  skeys[2] = { saddr, 0 };
1207         int  ikeys[2] = { dev->ifindex, 0 };
1208         struct netevent_redirect netevent;
1209         struct net *net;
1210
1211         if (!in_dev)
1212                 return;
1213
1214         net = dev_net(dev);
1215         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1216             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1217             || ipv4_is_zeronet(new_gw))
1218                 goto reject_redirect;
1219
1220         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1221                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1222                         goto reject_redirect;
1223                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1224                         goto reject_redirect;
1225         } else {
1226                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1227                         goto reject_redirect;
1228         }
1229
             /* Try each (source key, oif key) pair; each may hash to a different bucket. */
1230         for (i = 0; i < 2; i++) {
1231                 for (k = 0; k < 2; k++) {
1232                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1233                                                 rt_genid(net));
1234
1235                         rthp=&rt_hash_table[hash].chain;
1236
1237                         rcu_read_lock();
1238                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1239                                 struct rtable *rt;
1240
                                     /* Skip entries whose lookup key does not match. */
1241                                 if (rth->fl.fl4_dst != daddr ||
1242                                     rth->fl.fl4_src != skeys[i] ||
1243                                     rth->fl.oif != ikeys[k] ||
1244                                     rth->fl.iif != 0 ||
1245                                     rt_is_expired(rth) ||
1246                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1247                                         rthp = &rth->u.dst.rt_next;
1248                                         continue;
1249                                 }
1250
                                     /*
                                      * Key matched but the route itself does not fit the
                                      * redirect: stop searching this chain.
                                      */
1251                                 if (rth->rt_dst != daddr ||
1252                                     rth->rt_src != saddr ||
1253                                     rth->u.dst.error ||
1254                                     rth->rt_gateway != old_gw ||
1255                                     rth->u.dst.dev != dev)
1256                                         break;
1257
                                     /*
                                      * Take a reference before leaving the RCU read
                                      * section; rth is still used below.
                                      */
1258                                 dst_hold(&rth->u.dst);
1259                                 rcu_read_unlock();
1260
1261                                 rt = dst_alloc(&ipv4_dst_ops);
1262                                 if (rt == NULL) {
1263                                         ip_rt_put(rth);
1264                                         in_dev_put(in_dev);
1265                                         return;
1266                                 }
1267
1268                                 /* Copy all the information. */
1269                                 *rt = *rth;
                                     /*
                                      * The structure copy duplicated pointers and counters;
                                      * reset the per-entry state and take our own references
                                      * on the objects the clone points to.
                                      */
1270                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1271                                 rt->u.dst.__use         = 1;
1272                                 atomic_set(&rt->u.dst.__refcnt, 1);
1273                                 rt->u.dst.child         = NULL;
1274                                 if (rt->u.dst.dev)
1275                                         dev_hold(rt->u.dst.dev);
1276                                 if (rt->idev)
1277                                         in_dev_hold(rt->idev);
1278                                 rt->u.dst.obsolete      = 0;
1279                                 rt->u.dst.lastuse       = jiffies;
1280                                 rt->u.dst.path          = &rt->u.dst;
1281                                 rt->u.dst.neighbour     = NULL;
1282                                 rt->u.dst.hh            = NULL;
1283                                 rt->u.dst.xfrm          = NULL;
1284                                 rt->rt_genid            = rt_genid(net);
1285                                 rt->rt_flags            |= RTCF_REDIRECTED;
1286
1287                                 /* Gateway is different ... */
1288                                 rt->rt_gateway          = new_gw;
1289
1290                                 /* Redirect received -> path was valid */
1291                                 dst_confirm(&rth->u.dst);
1292
1293                                 if (rt->peer)
1294                                         atomic_inc(&rt->peer->refcnt);
1295
                                     /*
                                      * Only switch to the new gateway when its neighbour
                                      * entry is already valid; otherwise send a neighbour
                                      * event (if a neighbour was bound) and give up on
                                      * this entry.
                                      */
1296                                 if (arp_bind_neighbour(&rt->u.dst) ||
1297                                     !(rt->u.dst.neighbour->nud_state &
1298                                             NUD_VALID)) {
1299                                         if (rt->u.dst.neighbour)
1300                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1301                                         ip_rt_put(rth);
1302                                         rt_drop(rt);
1303                                         goto do_next;
1304                                 }
1305
1306                                 netevent.old = &rth->u.dst;
1307                                 netevent.new = &rt->u.dst;
1308                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1309                                                         &netevent);
1310
                                     /*
                                      * rt_del() releases the reference taken on rth above
                                      * and unlinks it; on successful insertion the
                                      * reference on the entry returned in rt is released.
                                      */
1311                                 rt_del(hash, rth);
1312                                 if (!rt_intern_hash(hash, rt, &rt))
1313                                         ip_rt_put(rt);
1314                                 goto do_next;
1315                         }
1316                         rcu_read_unlock();
1317                 do_next:
1318                         ;
1319                 }
1320         }
1321         in_dev_put(in_dev);
1322         return;
1323
1324 reject_redirect:
1325 #ifdef CONFIG_IP_ROUTE_VERBOSE
1326         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1327                 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1328                         NIPQUAD_FMT " ignored.\n"
1329                         "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1330                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1331                        NIPQUAD(saddr), NIPQUAD(daddr));
1332 #endif
1333         in_dev_put(in_dev);
1334 }
1335
1336 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1337 {
1338         struct rtable *rt = (struct rtable *)dst;
1339         struct dst_entry *ret = dst;
1340
1341         if (rt) {
1342                 if (dst->obsolete) {
1343                         ip_rt_put(rt);
1344                         ret = NULL;
1345                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1346                            rt->u.dst.expires) {
1347                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1348                                                 rt->fl.oif,
1349                                                 rt_genid(dev_net(dst->dev)));
1350 #if RT_CACHE_DEBUG >= 1
1351                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1352                                           NIPQUAD_FMT "/%02x dropped\n",
1353                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1354 #endif
1355                         rt_del(hash, rt);
1356                         ret = NULL;
1357                 }
1358         }
1359         return ret;
1360 }
1361
1362 /*
1363  * Algorithm:
1364  *      1. The first ip_rt_redirect_number redirects are sent
1365  *         with exponential backoff, then we stop sending them at all,
1366  *         assuming that the host ignores our redirects.
1367  *      2. If we did not see packets requiring redirects
1368  *         during ip_rt_redirect_silence, we assume that the host
1369  *         forgot redirected route and start to send redirects again.
1370  *
1371  * This algorithm is much cheaper and more intelligent than dumb load limiting
1372  * in icmp.c.
1373  *
1374  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1375  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1376  */
1377
1378 void ip_rt_send_redirect(struct sk_buff *skb)
1379 {
1380         struct rtable *rt = skb->rtable;
1381         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1382
1383         if (!in_dev)
1384                 return;
1385
1386         if (!IN_DEV_TX_REDIRECTS(in_dev))
1387                 goto out;
1388
1389         /* No redirected packets during ip_rt_redirect_silence;
1390          * reset the algorithm.
1391          */
1392         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1393                 rt->u.dst.rate_tokens = 0;
1394
1395         /* Too many ignored redirects; do not send anything
1396          * set u.dst.rate_last to the last seen redirected packet.
1397          */
1398         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1399                 rt->u.dst.rate_last = jiffies;
1400                 goto out;
1401         }
1402
1403         /* Check for load limit; set rate_last to the latest sent
1404          * redirect.
1405          */
1406         if (rt->u.dst.rate_tokens == 0 ||
1407             time_after(jiffies,
1408                        (rt->u.dst.rate_last +
1409                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1410                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1411                 rt->u.dst.rate_last = jiffies;
1412                 ++rt->u.dst.rate_tokens;
1413 #ifdef CONFIG_IP_ROUTE_VERBOSE
1414                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1415                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1416                     net_ratelimit())
1417                         printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1418                                 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1419                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1420                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1421 #endif
1422         }
1423 out:
1424         in_dev_put(in_dev);
1425 }
1426
1427 static int ip_error(struct sk_buff *skb)
1428 {
1429         struct rtable *rt = skb->rtable;
1430         unsigned long now;
1431         int code;
1432
1433         switch (rt->u.dst.error) {
1434                 case EINVAL:
1435                 default:
1436                         goto out;
1437                 case EHOSTUNREACH:
1438                         code = ICMP_HOST_UNREACH;
1439                         break;
1440                 case ENETUNREACH:
1441                         code = ICMP_NET_UNREACH;
1442                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1443                                         IPSTATS_MIB_INNOROUTES);
1444                         break;
1445                 case EACCES:
1446                         code = ICMP_PKT_FILTERED;
1447                         break;
1448         }
1449
1450         now = jiffies;
1451         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1452         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1453                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1454         rt->u.dst.rate_last = now;
1455         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1456                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1457                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1458         }
1459
1460 out:    kfree_skb(skb);
1461         return 0;
1462 }
1463
/*
 *      MTU plateau table, in descending order.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above any plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *step = mtu_plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        while (step != end) {
                if (*step < old_mtu)
                        return *step;
                step++;
        }
        return 68;
}
1481
     /*
      * Apply a "fragmentation needed" report to the routing cache.
      *
      * @iph is the IP header the report refers to, @new_mtu the MTU carried
      * in the report and @dev the device it arrived on.  For every
      * combination of output interface key {dev->ifindex, 0} and source key
      * {iph->saddr, 0}, all matching unexpired output-route cache entries of
      * @net whose MTU metric is not locked are visited and, when the
      * resulting mtu is below the entry's current MTU, updated.
      *
      * Returns 0 when ipv4_config.no_pmtu_disc is set; otherwise the mtu
      * estimate accepted for a cache entry, or @new_mtu if no entry accepted
      * one.
      */
1482 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1483                                  unsigned short new_mtu,
1484                                  struct net_device *dev)
1485 {
1486         int i, k;
             /* The total length of the reported packet serves as the old MTU. */
1487         unsigned short old_mtu = ntohs(iph->tot_len);
1488         struct rtable *rth;
1489         int  ikeys[2] = { dev->ifindex, 0 };
1490         __be32  skeys[2] = { iph->saddr, 0, };
1491         __be32  daddr = iph->daddr;
1492         unsigned short est_mtu = 0;
1493
1494         if (ipv4_config.no_pmtu_disc)
1495                 return 0;
1496
1497         for (k = 0; k < 2; k++) {
1498                 for (i = 0; i < 2; i++) {
1499                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1500                                                 rt_genid(net));
1501
1502                         rcu_read_lock();
1503                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1504                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1505                                 unsigned short mtu = new_mtu;
1506
1507                                 if (rth->fl.fl4_dst != daddr ||
1508                                     rth->fl.fl4_src != skeys[i] ||
1509                                     rth->rt_dst != daddr ||
1510                                     rth->rt_src != iph->saddr ||
1511                                     rth->fl.oif != ikeys[k] ||
1512                                     rth->fl.iif != 0 ||
1513                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1514                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1515                                     rt_is_expired(rth))
1516                                         continue;
1517
                                     /*
                                      * The reported MTU is unusable (below 68 or not
                                      * smaller than the packet): guess the next lower
                                      * plateau instead.
                                      */
1518                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1519
1520                                         /* BSD 4.2 compatibility hack :-( */
                                             /*
                                              * NOTE(review): old_mtu is modified in place, so
                                              * this adjustment carries over to cache entries
                                              * visited later by the enclosing loops -- confirm
                                              * that this is intended.
                                              */
1521                                         if (mtu == 0 &&
1522                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1523                                             old_mtu >= 68 + (iph->ihl << 2))
1524                                                 old_mtu -= iph->ihl << 2;
1525
1526                                         mtu = guess_mtu(old_mtu);
1527                                 }
1528                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1529                                         if (mtu < dst_mtu(&rth->u.dst)) {
1530                                                 dst_confirm(&rth->u.dst);
                                                     /*
                                                      * Never go below ip_rt_min_pmtu; when
                                                      * clamping, also lock the MTU metric.
                                                      */
1531                                                 if (mtu < ip_rt_min_pmtu) {
1532                                                         mtu = ip_rt_min_pmtu;
1533                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1534                                                                 (1 << RTAX_MTU);
1535                                                 }
1536                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1537                                                 dst_set_expires(&rth->u.dst,
1538                                                         ip_rt_mtu_expires);
1539                                         }
1540                                         est_mtu = mtu;
1541                                 }
1542                         }
1543                         rcu_read_unlock();
1544                 }
1545         }
1546         return est_mtu ? : new_mtu;
1547 }
1548
1549 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1550 {
1551         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1552             !(dst_metric_locked(dst, RTAX_MTU))) {
1553                 if (mtu < ip_rt_min_pmtu) {
1554                         mtu = ip_rt_min_pmtu;
1555                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1556                 }
1557                 dst->metrics[RTAX_MTU-1] = mtu;
1558                 dst_set_expires(dst, ip_rt_mtu_expires);
1559                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1560         }
1561 }
1562
     /*
      * Always returns NULL; neither @dst nor @cookie is examined.
      * NOTE(review): presumably this tells the caller that the cached
      * entry must not be reused -- confirm against the users of this
      * callback.
      */
1563 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1564 {
1565         return NULL;
1566 }
1567
1568 static void ipv4_dst_destroy(struct dst_entry *dst)
1569 {
1570         struct rtable *rt = (struct rtable *) dst;
1571         struct inet_peer *peer = rt->peer;
1572         struct in_device *idev = rt->idev;
1573
1574         if (peer) {
1575                 rt->peer = NULL;
1576                 inet_putpeer(peer);
1577         }
1578
1579         if (idev) {
1580                 rt->idev = NULL;
1581                 in_dev_put(idev);
1582         }
1583 }
1584
1585 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1586                             int how)
1587 {
1588         struct rtable *rt = (struct rtable *) dst;
1589         struct in_device *idev = rt->idev;
1590         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1591                 struct in_device *loopback_idev =
1592                         in_dev_get(dev_net(dev)->loopback_dev);
1593                 if (loopback_idev) {
1594                         rt->idev = loopback_idev;
1595                         in_dev_put(idev);
1596                 }
1597         }
1598 }
1599
1600 static void ipv4_link_failure(struct sk_buff *skb)
1601 {
1602         struct rtable *rt;
1603
1604         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1605
1606         rt = skb->rtable;
1607         if (rt)
1608                 dst_set_expires(&rt->u.dst, 0);
1609 }
1610
1611 static int ip_rt_bug(struct sk_buff *skb)
1612 {
1613         printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1614                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1615                 skb->dev ? skb->dev->name : "?");
1616         kfree_skb(skb);
1617         return 0;
1618 }
1619
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1628
1629 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1630 {
1631         __be32 src;
1632         struct fib_result res;
1633
1634         if (rt->fl.iif == 0)
1635                 src = rt->rt_src;
1636         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1637                 src = FIB_RES_PREFSRC(res);
1638                 fib_res_put(&res);
1639         } else
1640                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1641                                         RT_SCOPE_UNIVERSE);
1642         memcpy(addr, &src, 4);
1643 }
1644
1645 #ifdef CONFIG_NET_CLS_ROUTE
1646 static void set_class_tag(struct rtable *rt, u32 tag)
1647 {
1648         if (!(rt->u.dst.tclassid & 0xFFFF))
1649                 rt->u.dst.tclassid |= tag & 0xFFFF;
1650         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1651                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1652 }
1653 #endif
1654
1655 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1656 {
1657         struct fib_info *fi = res->fi;
1658
1659         if (fi) {
1660                 if (FIB_RES_GW(*res) &&
1661                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1662                         rt->rt_gateway = FIB_RES_GW(*res);
1663                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1664                        sizeof(rt->u.dst.metrics));
1665                 if (fi->fib_mtu == 0) {
1666                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1667                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1668                             rt->rt_gateway != rt->rt_dst &&
1669                             rt->u.dst.dev->mtu > 576)
1670                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1671                 }
1672 #ifdef CONFIG_NET_CLS_ROUTE
1673                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1674 #endif
1675         } else
1676                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1677
1678         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1679                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1680         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1681                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1682         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1683                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1684                                        ip_rt_min_advmss);
1685         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1686                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1687
1688 #ifdef CONFIG_NET_CLS_ROUTE
1689 #ifdef CONFIG_IP_MULTIPLE_TABLES
1690         set_class_tag(rt, fib_rules_tclass(res));
1691 #endif
1692         set_class_tag(rt, itag);
1693 #endif
1694         rt->rt_type = res->type;
1695 }
1696
1697 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1698                                 u8 tos, struct net_device *dev, int our)
1699 {
1700         unsigned hash;
1701         struct rtable *rth;
1702         __be32 spec_dst;
1703         struct in_device *in_dev = in_dev_get(dev);
1704         u32 itag = 0;
1705
1706         /* Primary sanity checks. */
1707
1708         if (in_dev == NULL)
1709                 return -EINVAL;
1710
1711         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1712             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1713                 goto e_inval;
1714
1715         if (ipv4_is_zeronet(saddr)) {
1716                 if (!ipv4_is_local_multicast(daddr))
1717                         goto e_inval;
1718                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1719         } else if (fib_validate_source(saddr, 0, tos, 0,
1720                                         dev, &spec_dst, &itag) < 0)
1721                 goto e_inval;
1722
1723         rth = dst_alloc(&ipv4_dst_ops);
1724         if (!rth)
1725                 goto e_nobufs;
1726
1727         rth->u.dst.output= ip_rt_bug;
1728
1729         atomic_set(&rth->u.dst.__refcnt, 1);
1730         rth->u.dst.flags= DST_HOST;
1731         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1732                 rth->u.dst.flags |= DST_NOPOLICY;
1733         rth->fl.fl4_dst = daddr;
1734         rth->rt_dst     = daddr;
1735         rth->fl.fl4_tos = tos;
1736         rth->fl.mark    = skb->mark;
1737         rth->fl.fl4_src = saddr;
1738         rth->rt_src     = saddr;
1739 #ifdef CONFIG_NET_CLS_ROUTE
1740         rth->u.dst.tclassid = itag;
1741 #endif
1742         rth->rt_iif     =
1743         rth->fl.iif     = dev->ifindex;
1744         rth->u.dst.dev  = init_net.loopback_dev;
1745         dev_hold(rth->u.dst.dev);
1746         rth->idev       = in_dev_get(rth->u.dst.dev);
1747         rth->fl.oif     = 0;
1748         rth->rt_gateway = daddr;
1749         rth->rt_spec_dst= spec_dst;
1750         rth->rt_genid   = rt_genid(dev_net(dev));
1751         rth->rt_flags   = RTCF_MULTICAST;
1752         rth->rt_type    = RTN_MULTICAST;
1753         if (our) {
1754                 rth->u.dst.input= ip_local_deliver;
1755                 rth->rt_flags |= RTCF_LOCAL;
1756         }
1757
1758 #ifdef CONFIG_IP_MROUTE
1759         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1760                 rth->u.dst.input = ip_mr_input;
1761 #endif
1762         RT_CACHE_STAT_INC(in_slow_mc);
1763
1764         in_dev_put(in_dev);
1765         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1766         return rt_intern_hash(hash, rth, &skb->rtable);
1767
1768 e_nobufs:
1769         in_dev_put(in_dev);
1770         return -ENOBUFS;
1771
1772 e_inval:
1773         in_dev_put(in_dev);
1774         return -EINVAL;
1775 }
1776
1777
1778 static void ip_handle_martian_source(struct net_device *dev,
1779                                      struct in_device *in_dev,
1780                                      struct sk_buff *skb,
1781                                      __be32 daddr,
1782                                      __be32 saddr)
1783 {
1784         RT_CACHE_STAT_INC(in_martian_src);
1785 #ifdef CONFIG_IP_ROUTE_VERBOSE
1786         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1787                 /*
1788                  *      RFC1812 recommendation, if source is martian,
1789                  *      the only hint is MAC header.
1790                  */
1791                 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1792                         NIPQUAD_FMT", on dev %s\n",
1793                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1794                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1795                         int i;
1796                         const unsigned char *p = skb_mac_header(skb);
1797                         printk(KERN_WARNING "ll header: ");
1798                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1799                                 printk("%02x", *p);
1800                                 if (i < (dev->hard_header_len - 1))
1801                                         printk(":");
1802                         }
1803                         printk("\n");
1804                 }
1805         }
1806 #endif
1807 }
1808
1809 static int __mkroute_input(struct sk_buff *skb,
1810                            struct fib_result *res,
1811                            struct in_device *in_dev,
1812                            __be32 daddr, __be32 saddr, u32 tos,
1813                            struct rtable **result)
1814 {
1815
1816         struct rtable *rth;
1817         int err;
1818         struct in_device *out_dev;
1819         unsigned flags = 0;
1820         __be32 spec_dst;
1821         u32 itag;
1822
1823         /* get a working reference to the output device */
1824         out_dev = in_dev_get(FIB_RES_DEV(*res));
1825         if (out_dev == NULL) {
1826                 if (net_ratelimit())
1827                         printk(KERN_CRIT "Bug in ip_route_input" \
1828                                "_slow(). Please, report\n");
1829                 return -EINVAL;
1830         }
1831
1832
1833         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1834                                   in_dev->dev, &spec_dst, &itag);
1835         if (err < 0) {
1836                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1837                                          saddr);
1838
1839                 err = -EINVAL;
1840                 goto cleanup;
1841         }
1842
1843         if (err)
1844                 flags |= RTCF_DIRECTSRC;
1845
1846         if (out_dev == in_dev && err &&
1847             (IN_DEV_SHARED_MEDIA(out_dev) ||
1848              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1849                 flags |= RTCF_DOREDIRECT;
1850
1851         if (skb->protocol != htons(ETH_P_IP)) {
1852                 /* Not IP (i.e. ARP). Do not create route, if it is
1853                  * invalid for proxy arp. DNAT routes are always valid.
1854                  */
1855                 if (out_dev == in_dev) {
1856                         err = -EINVAL;
1857                         goto cleanup;
1858                 }
1859         }
1860
1861
1862         rth = dst_alloc(&ipv4_dst_ops);
1863         if (!rth) {
1864                 err = -ENOBUFS;
1865                 goto cleanup;
1866         }
1867
1868         atomic_set(&rth->u.dst.__refcnt, 1);
1869         rth->u.dst.flags= DST_HOST;
1870         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1871                 rth->u.dst.flags |= DST_NOPOLICY;
1872         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1873                 rth->u.dst.flags |= DST_NOXFRM;
1874         rth->fl.fl4_dst = daddr;
1875         rth->rt_dst     = daddr;
1876         rth->fl.fl4_tos = tos;
1877         rth->fl.mark    = skb->mark;
1878         rth->fl.fl4_src = saddr;
1879         rth->rt_src     = saddr;
1880         rth->rt_gateway = daddr;
1881         rth->rt_iif     =
1882                 rth->fl.iif     = in_dev->dev->ifindex;
1883         rth->u.dst.dev  = (out_dev)->dev;
1884         dev_hold(rth->u.dst.dev);
1885         rth->idev       = in_dev_get(rth->u.dst.dev);
1886         rth->fl.oif     = 0;
1887         rth->rt_spec_dst= spec_dst;
1888
1889         rth->u.dst.input = ip_forward;
1890         rth->u.dst.output = ip_output;
1891         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1892
1893         rt_set_nexthop(rth, res, itag);
1894
1895         rth->rt_flags = flags;
1896
1897         *result = rth;
1898         err = 0;
1899  cleanup:
1900         /* release the working reference to the output device */
1901         in_dev_put(out_dev);
1902         return err;
1903 }
1904
1905 static int ip_mkroute_input(struct sk_buff *skb,
1906                             struct fib_result *res,
1907                             const struct flowi *fl,
1908                             struct in_device *in_dev,
1909                             __be32 daddr, __be32 saddr, u32 tos)
1910 {
1911         struct rtable* rth = NULL;
1912         int err;
1913         unsigned hash;
1914
1915 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1916         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1917                 fib_select_multipath(fl, res);
1918 #endif
1919
1920         /* create a routing cache entry */
1921         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1922         if (err)
1923                 return err;
1924
1925         /* put it into the cache */
1926         hash = rt_hash(daddr, saddr, fl->iif,
1927                        rt_genid(dev_net(rth->u.dst.dev)));
1928         return rt_intern_hash(hash, rth, &skb->rtable);
1929 }
1930
/*
 *      NOTE. We drop all packets that have local source
 *      addresses, because every properly looped back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1940
1941 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1942                                u8 tos, struct net_device *dev)
1943 {
1944         struct fib_result res;
1945         struct in_device *in_dev = in_dev_get(dev);
1946         struct flowi fl = { .nl_u = { .ip4_u =
1947                                       { .daddr = daddr,
1948                                         .saddr = saddr,
1949                                         .tos = tos,
1950                                         .scope = RT_SCOPE_UNIVERSE,
1951                                       } },
1952                             .mark = skb->mark,
1953                             .iif = dev->ifindex };
1954         unsigned        flags = 0;
1955         u32             itag = 0;
1956         struct rtable * rth;
1957         unsigned        hash;
1958         __be32          spec_dst;
1959         int             err = -EINVAL;
1960         int             free_res = 0;
1961         struct net    * net = dev_net(dev);
1962
1963         /* IP on this device is disabled. */
1964
1965         if (!in_dev)
1966                 goto out;
1967
1968         /* Check for the most weird martians, which can be not detected
1969            by fib_lookup.
1970          */
1971
1972         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1973             ipv4_is_loopback(saddr))
1974                 goto martian_source;
1975
1976         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1977                 goto brd_input;
1978
1979         /* Accept zero addresses only to limited broadcast;
1980          * I even do not know to fix it or not. Waiting for complains :-)
1981          */
1982         if (ipv4_is_zeronet(saddr))
1983                 goto martian_source;
1984
1985         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1986             ipv4_is_loopback(daddr))
1987                 goto martian_destination;
1988
1989         /*
1990          *      Now we are ready to route packet.
1991          */
1992         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1993                 if (!IN_DEV_FORWARD(in_dev))
1994                         goto e_hostunreach;
1995                 goto no_route;
1996         }
1997         free_res = 1;
1998
1999         RT_CACHE_STAT_INC(in_slow_tot);
2000
2001         if (res.type == RTN_BROADCAST)
2002                 goto brd_input;
2003
2004         if (res.type == RTN_LOCAL) {
2005                 int result;
2006                 result = fib_validate_source(saddr, daddr, tos,
2007                                              net->loopback_dev->ifindex,
2008                                              dev, &spec_dst, &itag);
2009                 if (result < 0)
2010                         goto martian_source;
2011                 if (result)
2012                         flags |= RTCF_DIRECTSRC;
2013                 spec_dst = daddr;
2014                 goto local_input;
2015         }
2016
2017         if (!IN_DEV_FORWARD(in_dev))
2018                 goto e_hostunreach;
2019         if (res.type != RTN_UNICAST)
2020                 goto martian_destination;
2021
2022         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2023 done:
2024         in_dev_put(in_dev);
2025         if (free_res)
2026                 fib_res_put(&res);
2027 out:    return err;
2028
2029 brd_input:
2030         if (skb->protocol != htons(ETH_P_IP))
2031                 goto e_inval;
2032
2033         if (ipv4_is_zeronet(saddr))
2034                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2035         else {
2036                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2037                                           &itag);
2038                 if (err < 0)
2039                         goto martian_source;
2040                 if (err)
2041                         flags |= RTCF_DIRECTSRC;
2042         }
2043         flags |= RTCF_BROADCAST;
2044         res.type = RTN_BROADCAST;
2045         RT_CACHE_STAT_INC(in_brd);
2046
2047 local_input:
2048         rth = dst_alloc(&ipv4_dst_ops);
2049         if (!rth)
2050                 goto e_nobufs;
2051
2052         rth->u.dst.output= ip_rt_bug;
2053         rth->rt_genid = rt_genid(net);
2054
2055         atomic_set(&rth->u.dst.__refcnt, 1);
2056         rth->u.dst.flags= DST_HOST;
2057         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2058                 rth->u.dst.flags |= DST_NOPOLICY;
2059         rth->fl.fl4_dst = daddr;
2060         rth->rt_dst     = daddr;
2061         rth->fl.fl4_tos = tos;
2062         rth->fl.mark    = skb->mark;
2063         rth->fl.fl4_src = saddr;
2064         rth->rt_src     = saddr;
2065 #ifdef CONFIG_NET_CLS_ROUTE
2066         rth->u.dst.tclassid = itag;
2067 #endif
2068         rth->rt_iif     =
2069         rth->fl.iif     = dev->ifindex;
2070         rth->u.dst.dev  = net->loopback_dev;
2071         dev_hold(rth->u.dst.dev);
2072         rth->idev       = in_dev_get(rth->u.dst.dev);
2073         rth->rt_gateway = daddr;
2074         rth->rt_spec_dst= spec_dst;
2075         rth->u.dst.input= ip_local_deliver;
2076         rth->rt_flags   = flags|RTCF_LOCAL;
2077         if (res.type == RTN_UNREACHABLE) {
2078                 rth->u.dst.input= ip_error;
2079                 rth->u.dst.error= -err;
2080                 rth->rt_flags   &= ~RTCF_LOCAL;
2081         }
2082         rth->rt_type    = res.type;
2083         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2084         err = rt_intern_hash(hash, rth, &skb->rtable);
2085         goto done;
2086
2087 no_route:
2088         RT_CACHE_STAT_INC(in_no_route);
2089         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2090         res.type = RTN_UNREACHABLE;
2091         if (err == -ESRCH)
2092                 err = -ENETUNREACH;
2093         goto local_input;
2094
2095         /*
2096          *      Do not cache martian addresses: they should be logged (RFC1812)
2097          */
2098 martian_destination:
2099         RT_CACHE_STAT_INC(in_martian_dst);
2100 #ifdef CONFIG_IP_ROUTE_VERBOSE
2101         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2102                 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2103                         NIPQUAD_FMT ", dev %s\n",
2104                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2105 #endif
2106
2107 e_hostunreach:
2108         err = -EHOSTUNREACH;
2109         goto done;
2110
2111 e_inval:
2112         err = -EINVAL;
2113         goto done;
2114
2115 e_nobufs:
2116         err = -ENOBUFS;
2117         goto done;
2118
2119 martian_source:
2120         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2121         goto e_inval;
2122 }
2123
2124 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2125                    u8 tos, struct net_device *dev)
2126 {
2127         struct rtable * rth;
2128         unsigned        hash;
2129         int iif = dev->ifindex;
2130         struct net *net;
2131
2132         net = dev_net(dev);
2133         tos &= IPTOS_RT_MASK;
2134         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2135
2136         rcu_read_lock();
2137         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2138              rth = rcu_dereference(rth->u.dst.rt_next)) {
2139                 if (((rth->fl.fl4_dst ^ daddr) |
2140                      (rth->fl.fl4_src ^ saddr) |
2141                      (rth->fl.iif ^ iif) |
2142                      rth->fl.oif |
2143                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2144                     rth->fl.mark == skb->mark &&
2145                     net_eq(dev_net(rth->u.dst.dev), net) &&
2146                     !rt_is_expired(rth)) {
2147                         dst_use(&rth->u.dst, jiffies);
2148                         RT_CACHE_STAT_INC(in_hit);
2149                         rcu_read_unlock();
2150                         skb->rtable = rth;
2151                         return 0;
2152                 }
2153                 RT_CACHE_STAT_INC(in_hlist_search);
2154         }
2155         rcu_read_unlock();
2156
2157         /* Multicast recognition logic is moved from route cache to here.
2158            The problem was that too many Ethernet cards have broken/missing
2159            hardware multicast filters :-( As result the host on multicasting
2160            network acquires a lot of useless route cache entries, sort of
2161            SDR messages from all the world. Now we try to get rid of them.
2162            Really, provided software IP multicast filter is organized
2163            reasonably (at least, hashed), it does not result in a slowdown
2164            comparing with route cache reject entries.
2165            Note, that multicast routers are not affected, because
2166            route cache entry is created eventually.
2167          */
2168         if (ipv4_is_multicast(daddr)) {
2169                 struct in_device *in_dev;
2170
2171                 rcu_read_lock();
2172                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2173                         int our = ip_check_mc(in_dev, daddr, saddr,
2174                                 ip_hdr(skb)->protocol);
2175                         if (our
2176 #ifdef CONFIG_IP_MROUTE
2177                             || (!ipv4_is_local_multicast(daddr) &&
2178                                 IN_DEV_MFORWARD(in_dev))
2179 #endif
2180                             ) {
2181                                 rcu_read_unlock();
2182                                 return ip_route_input_mc(skb, daddr, saddr,
2183                                                          tos, dev, our);
2184                         }
2185                 }
2186                 rcu_read_unlock();
2187                 return -EINVAL;
2188         }
2189         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2190 }
2191
2192 static int __mkroute_output(struct rtable **result,
2193                             struct fib_result *res,
2194                             const struct flowi *fl,
2195                             const struct flowi *oldflp,
2196                             struct net_device *dev_out,
2197                             unsigned flags)
2198 {
2199         struct rtable *rth;
2200         struct in_device *in_dev;
2201         u32 tos = RT_FL_TOS(oldflp);
2202         int err = 0;
2203
2204         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2205                 return -EINVAL;
2206
2207         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2208                 res->type = RTN_BROADCAST;
2209         else if (ipv4_is_multicast(fl->fl4_dst))
2210                 res->type = RTN_MULTICAST;
2211         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2212                 return -EINVAL;
2213
2214         if (dev_out->flags & IFF_LOOPBACK)
2215                 flags |= RTCF_LOCAL;
2216
2217         /* get work reference to inet device */
2218         in_dev = in_dev_get(dev_out);
2219         if (!in_dev)
2220                 return -EINVAL;
2221
2222         if (res->type == RTN_BROADCAST) {
2223                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2224                 if (res->fi) {
2225                         fib_info_put(res->fi);
2226                         res->fi = NULL;
2227                 }
2228         } else if (res->type == RTN_MULTICAST) {
2229                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2230                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2231                                  oldflp->proto))
2232                         flags &= ~RTCF_LOCAL;
2233                 /* If multicast route do not exist use
2234                    default one, but do not gateway in this case.
2235                    Yes, it is hack.
2236                  */
2237                 if (res->fi && res->prefixlen < 4) {
2238                         fib_info_put(res->fi);
2239                         res->fi = NULL;
2240                 }
2241         }
2242
2243
2244         rth = dst_alloc(&ipv4_dst_ops);
2245         if (!rth) {
2246                 err = -ENOBUFS;
2247                 goto cleanup;
2248         }
2249
2250         atomic_set(&rth->u.dst.__refcnt, 1);
2251         rth->u.dst.flags= DST_HOST;
2252         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2253                 rth->u.dst.flags |= DST_NOXFRM;
2254         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2255                 rth->u.dst.flags |= DST_NOPOLICY;
2256
2257         rth->fl.fl4_dst = oldflp->fl4_dst;
2258         rth->fl.fl4_tos = tos;
2259         rth->fl.fl4_src = oldflp->fl4_src;
2260         rth->fl.oif     = oldflp->oif;
2261         rth->fl.mark    = oldflp->mark;
2262         rth->rt_dst     = fl->fl4_dst;
2263         rth->rt_src     = fl->fl4_src;
2264         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2265         /* get references to the devices that are to be hold by the routing
2266            cache entry */
2267         rth->u.dst.dev  = dev_out;
2268         dev_hold(dev_out);
2269         rth->idev       = in_dev_get(dev_out);
2270         rth->rt_gateway = fl->fl4_dst;
2271         rth->rt_spec_dst= fl->fl4_src;
2272
2273         rth->u.dst.output=ip_output;
2274         rth->rt_genid = rt_genid(dev_net(dev_out));
2275
2276         RT_CACHE_STAT_INC(out_slow_tot);
2277
2278         if (flags & RTCF_LOCAL) {
2279                 rth->u.dst.input = ip_local_deliver;
2280                 rth->rt_spec_dst = fl->fl4_dst;
2281         }
2282         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2283                 rth->rt_spec_dst = fl->fl4_src;
2284                 if (flags & RTCF_LOCAL &&
2285                     !(dev_out->flags & IFF_LOOPBACK)) {
2286                         rth->u.dst.output = ip_mc_output;
2287                         RT_CACHE_STAT_INC(out_slow_mc);
2288                 }
2289 #ifdef CONFIG_IP_MROUTE
2290                 if (res->type == RTN_MULTICAST) {
2291                         if (IN_DEV_MFORWARD(in_dev) &&
2292                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2293                                 rth->u.dst.input = ip_mr_input;
2294                                 rth->u.dst.output = ip_mc_output;
2295                         }
2296                 }
2297 #endif
2298         }
2299
2300         rt_set_nexthop(rth, res, 0);
2301
2302         rth->rt_flags = flags;
2303
2304         *result = rth;
2305  cleanup:
2306         /* release work reference to inet device */
2307         in_dev_put(in_dev);
2308
2309         return err;
2310 }
2311
2312 static int ip_mkroute_output(struct rtable **rp,
2313                              struct fib_result *res,
2314                              const struct flowi *fl,
2315                              const struct flowi *oldflp,
2316                              struct net_device *dev_out,
2317                              unsigned flags)
2318 {
2319         struct rtable *rth = NULL;
2320         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2321         unsigned hash;
2322         if (err == 0) {
2323                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2324                                rt_genid(dev_net(dev_out)));
2325                 err = rt_intern_hash(hash, rth, rp);
2326         }
2327
2328         return err;
2329 }
2330
2331 /*
2332  * Major route resolver routine.
2333  */
2334
2335 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2336                                 const struct flowi *oldflp)
2337 {
2338         u32 tos = RT_FL_TOS(oldflp);
2339         struct flowi fl = { .nl_u = { .ip4_u =
2340                                       { .daddr = oldflp->fl4_dst,
2341                                         .saddr = oldflp->fl4_src,
2342                                         .tos = tos & IPTOS_RT_MASK,
2343                                         .scope = ((tos & RTO_ONLINK) ?
2344                                                   RT_SCOPE_LINK :
2345                                                   RT_SCOPE_UNIVERSE),
2346                                       } },
2347                             .mark = oldflp->mark,
2348                             .iif = net->loopback_dev->ifindex,
2349                             .oif = oldflp->oif };
2350         struct fib_result res;
2351         unsigned flags = 0;
2352         struct net_device *dev_out = NULL;
2353         int free_res = 0;
2354         int err;
2355
2356
2357         res.fi          = NULL;
2358 #ifdef CONFIG_IP_MULTIPLE_TABLES
2359         res.r           = NULL;
2360 #endif
2361
2362         if (oldflp->fl4_src) {
2363                 err = -EINVAL;
2364                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2365                     ipv4_is_lbcast(oldflp->fl4_src) ||
2366                     ipv4_is_zeronet(oldflp->fl4_src))
2367                         goto out;
2368
2369                 /* I removed check for oif == dev_out->oif here.
2370                    It was wrong for two reasons:
2371                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2372                       is assigned to multiple interfaces.
2373                    2. Moreover, we are allowed to send packets with saddr
2374                       of another iface. --ANK
2375                  */
2376
2377                 if (oldflp->oif == 0
2378                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2379                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2380                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2381                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2382                         if (dev_out == NULL)
2383                                 goto out;
2384
2385                         /* Special hack: user can direct multicasts
2386                            and limited broadcast via necessary interface
2387                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2388                            This hack is not just for fun, it allows
2389                            vic,vat and friends to work.
2390                            They bind socket to loopback, set ttl to zero
2391                            and expect that it will work.
2392                            From the viewpoint of routing cache they are broken,
2393                            because we are not allowed to build multicast path
2394                            with loopback source addr (look, routing cache
2395                            cannot know, that ttl is zero, so that packet
2396                            will not leave this host and route is valid).
2397                            Luckily, this hack is good workaround.
2398                          */
2399
2400                         fl.oif = dev_out->ifindex;
2401                         goto make_route;
2402                 }
2403
2404                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2405                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2406                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2407                         if (dev_out == NULL)
2408                                 goto out;
2409                         dev_put(dev_out);
2410                         dev_out = NULL;
2411                 }
2412         }
2413
2414
2415         if (oldflp->oif) {
2416                 dev_out = dev_get_by_index(net, oldflp->oif);
2417                 err = -ENODEV;
2418                 if (dev_out == NULL)
2419                         goto out;
2420
2421                 /* RACE: Check return value of inet_select_addr instead. */
2422                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2423                         dev_put(dev_out);
2424                         goto out;       /* Wrong error code */
2425                 }
2426
2427                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2428                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2429                         if (!fl.fl4_src)
2430                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2431                                                               RT_SCOPE_LINK);
2432                         goto make_route;
2433                 }
2434                 if (!fl.fl4_src) {
2435                         if (ipv4_is_multicast(oldflp->fl4_dst))
2436                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2437                                                               fl.fl4_scope);
2438                         else if (!oldflp->fl4_dst)
2439                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2440                                                               RT_SCOPE_HOST);
2441                 }
2442         }
2443
2444         if (!fl.fl4_dst) {
2445                 fl.fl4_dst = fl.fl4_src;
2446                 if (!fl.fl4_dst)
2447                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2448                 if (dev_out)
2449                         dev_put(dev_out);
2450                 dev_out = net->loopback_dev;
2451                 dev_hold(dev_out);
2452                 fl.oif = net->loopback_dev->ifindex;
2453                 res.type = RTN_LOCAL;
2454                 flags |= RTCF_LOCAL;
2455                 goto make_route;
2456         }
2457
2458         if (fib_lookup(net, &fl, &res)) {
2459                 res.fi = NULL;
2460                 if (oldflp->oif) {
2461                         /* Apparently, routing tables are wrong. Assume,
2462                            that the destination is on link.
2463
2464                            WHY? DW.
2465                            Because we are allowed to send to iface
2466                            even if it has NO routes and NO assigned
2467                            addresses. When oif is specified, routing
2468                            tables are looked up with only one purpose:
2469                            to catch if destination is gatewayed, rather than
2470                            direct. Moreover, if MSG_DONTROUTE is set,
2471                            we send packet, ignoring both routing tables
2472                            and ifaddr state. --ANK
2473
2474
2475                            We could make it even if oif is unknown,
2476                            likely IPv6, but we do not.
2477                          */
2478
2479                         if (fl.fl4_src == 0)
2480                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2481                                                               RT_SCOPE_LINK);
2482                         res.type = RTN_UNICAST;
2483                         goto make_route;
2484                 }
2485                 if (dev_out)
2486                         dev_put(dev_out);
2487                 err = -ENETUNREACH;
2488                 goto out;
2489         }
2490         free_res = 1;
2491
2492         if (res.type == RTN_LOCAL) {
2493                 if (!fl.fl4_src)
2494                         fl.fl4_src = fl.fl4_dst;
2495                 if (dev_out)
2496                         dev_put(dev_out);
2497                 dev_out = net->loopback_dev;
2498                 dev_hold(dev_out);
2499                 fl.oif = dev_out->ifindex;
2500                 if (res.fi)
2501                         fib_info_put(res.fi);
2502                 res.fi = NULL;
2503                 flags |= RTCF_LOCAL;
2504                 goto make_route;
2505         }
2506
2507 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2508         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2509                 fib_select_multipath(&fl, &res);
2510         else
2511 #endif
2512         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2513                 fib_select_default(net, &fl, &res);
2514
2515         if (!fl.fl4_src)
2516                 fl.fl4_src = FIB_RES_PREFSRC(res);
2517
2518         if (dev_out)
2519                 dev_put(dev_out);
2520         dev_out = FIB_RES_DEV(res);
2521         dev_hold(dev_out);
2522         fl.oif = dev_out->ifindex;
2523
2524
2525 make_route:
2526         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2527
2528
2529         if (free_res)
2530                 fib_res_put(&res);
2531         if (dev_out)
2532                 dev_put(dev_out);
2533 out:    return err;
2534 }
2535
/*
 * Look up an output route for the flow @flp, trying the route cache first.
 *
 * The hash bucket is chosen from the flow's destination, source and output
 * interface together with the namespace's current rt_genid().  The bucket's
 * chain is walked under rcu_read_lock_bh().  An entry is a hit when its
 * destination, source, oif and mark equal the request, its iif is 0, its TOS
 * agrees with the request in all IPTOS_RT_MASK | RTO_ONLINK bits, its device
 * belongs to @net and rt_is_expired() does not reject it.
 *
 * On a hit the entry goes through dst_use(), is stored in *@rp and 0 is
 * returned.  On a miss the result of ip_route_output_slow() is returned.
 */
2536 int __ip_route_output_key(struct net *net, struct rtable **rp,
2537                           const struct flowi *flp)
2538 {
2539         unsigned hash;
2540         struct rtable *rth;
2541
2542         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2543
2544         rcu_read_lock_bh();
2545         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2546                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2547                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2548                     rth->fl.fl4_src == flp->fl4_src &&
2549                     rth->fl.iif == 0 &&
2550                     rth->fl.oif == flp->oif &&
2551                     rth->fl.mark == flp->mark &&
2552                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2553                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2554                     net_eq(dev_net(rth->u.dst.dev), net) &&
2555                     !rt_is_expired(rth)) {
                             /* Cache hit: account the use before leaving the RCU section. */
2556                         dst_use(&rth->u.dst, jiffies);
2557                         RT_CACHE_STAT_INC(out_hit);
2558                         rcu_read_unlock_bh();
2559                         *rp = rth;
2560                         return 0;
2561                 }
                     /* Bumped once for every chain entry that did not match. */
2562                 RT_CACHE_STAT_INC(out_hlist_search);
2563         }
2564         rcu_read_unlock_bh();
2565
             /* Nothing usable in the cache: fall back to the slow path. */
2566         return ip_route_output_slow(net, rp, flp);
2567 }
2568
2569 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2570
/*
 * update_pmtu hook installed in ipv4_dst_blackhole_ops.  The body is empty,
 * so both @dst and @mtu are ignored.
 */
2571 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2572 {
2573 }
2574
/*
 * dst_ops from which ipv4_dst_blackhole() allocates its replacement entries.
 * Destroy and check use ipv4_dst_destroy / ipv4_dst_check, PMTU updates go
 * to the empty ipv4_rt_blackhole_update_pmtu(), and each entry is sized as a
 * full struct rtable.
 */
2575 static struct dst_ops ipv4_dst_blackhole_ops = {
2576         .family                 =       AF_INET,
2577         .protocol               =       __constant_htons(ETH_P_IP),
2578         .destroy                =       ipv4_dst_destroy,
2579         .check                  =       ipv4_dst_check,
2580         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2581         .entry_size             =       sizeof(struct rtable),
2582         .entries                =       ATOMIC_INIT(0),
2583 };
2584
2585
/*
 * Replace the route in *@rp with a copy whose input and output handlers are
 * both dst_discard.
 *
 * The copy is allocated from ipv4_dst_blackhole_ops and populated from the
 * original: metrics, device, flow key, inet device, the rt_* attributes and
 * the inet peer.  A reference is taken on the device, the inet device and
 * the peer whenever they are set.  The reference held on the original route
 * is dropped in every case, including allocation failure.
 *
 * @flp is not used by this function.
 *
 * Returns 0 with *@rp pointing at the copy, or -ENOMEM with *@rp set to NULL.
 */
2586 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2587 {
2588         struct rtable *ort = *rp;
2589         struct rtable *rt = (struct rtable *)
2590                 dst_alloc(&ipv4_dst_blackhole_ops);
2591
2592         if (rt) {
2593                 struct dst_entry *new = &rt->u.dst;
2594
                     /* The caller ends up owning exactly one reference. */
2595                 atomic_set(&new->__refcnt, 1);
2596                 new->__use = 1;
2597                 new->input = dst_discard;
2598                 new->output = dst_discard;
2599                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2600
2601                 new->dev = ort->u.dst.dev;
2602                 if (new->dev)
2603                         dev_hold(new->dev);
2604
2605                 rt->fl = ort->fl;
2606
2607                 rt->idev = ort->idev;
2608                 if (rt->idev)
2609                         in_dev_hold(rt->idev);
2610                 rt->rt_genid = rt_genid(net);
2611                 rt->rt_flags = ort->rt_flags;
2612                 rt->rt_type = ort->rt_type;
2613                 rt->rt_dst = ort->rt_dst;
2614                 rt->rt_src = ort->rt_src;
2615                 rt->rt_iif = ort->rt_iif;
2616                 rt->rt_gateway = ort->rt_gateway;
2617                 rt->rt_spec_dst = ort->rt_spec_dst;
2618                 rt->peer = ort->peer;
2619                 if (rt->peer)
2620                         atomic_inc(&rt->peer->refcnt);
2621
                     /*
                      * NOTE(review): dst_free() is called while the entry still
                      * carries the reference set above; presumably it only
                      * arranges for the entry to be freed once that reference
                      * is released -- confirm against dst_free().
                      */
2622                 dst_free(new);
2623         }
2624
             /* Drop the original route whether or not the copy was made. */
2625         dst_release(&(*rp)->u.dst);
2626         *rp = rt;
2627         return (rt ? 0 : -ENOMEM);
2628 }
2629
2630 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2631                          struct sock *sk, int flags)
2632 {
2633         int err;
2634
2635         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2636                 return err;
2637
2638         if (flp->proto) {
2639                 if (!flp->fl4_src)
2640                         flp->fl4_src = (*rp)->rt_src;
2641                 if (!flp->fl4_dst)
2642                         flp->fl4_dst = (*rp)->rt_dst;
2643                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2644                                     flags ? XFRM_LOOKUP_WAIT : 0);
2645                 if (err == -EREMOTE)
2646                         err = ipv4_dst_blackhole(net, rp, flp);
2647
2648                 return err;
2649         }
2650
2651         return 0;
2652 }
2653
2654 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2655
2656 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2657 {
2658         return ip_route_output_flow(net, rp, flp, NULL, 0);
2659 }
2660
/*
 * Append one routing netlink message to @skb describing the cached route
 * attached to it (skb->rtable).
 *
 * @pid, @seq, @event and @flags are handed to nlmsg_put() for the message
 * header.  @nowait is forwarded to ipmr_get_route() and selects how that
 * call's failures are treated (see the CONFIG_IP_MROUTE section).
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 with @nowait clear, and -EMSGSIZE on failure.  Apart from the
 * case where nlmsg_put() itself fails, the partially built message is
 * cancelled before -EMSGSIZE is returned.
 *
 * NOTE(review): the NLA_PUT* macros are defined elsewhere; presumably they
 * branch to the nla_put_failure label when an attribute cannot be added --
 * confirm against their definition.
 */
2661 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2662                         int nowait, unsigned int flags)
2663 {
2664         struct rtable *rt = skb->rtable;
2665         struct rtmsg *r;
2666         struct nlmsghdr *nlh;
2667         long expires;
2668         u32 id = 0, ts = 0, tsage = 0, error;
2669
2670         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2671         if (nlh == NULL)
2672                 return -EMSGSIZE;
2673
2674         r = nlmsg_data(nlh);
2675         r->rtm_family    = AF_INET;
2676         r->rtm_dst_len  = 32;
2677         r->rtm_src_len  = 0;
2678         r->rtm_tos      = rt->fl.fl4_tos;
2679         r->rtm_table    = RT_TABLE_MAIN;
2680         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2681         r->rtm_type     = rt->rt_type;
2682         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2683         r->rtm_protocol = RTPROT_UNSPEC;
             /* The low 16 bits of rt_flags are masked off before export. */
2684         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2685         if (rt->rt_flags & RTCF_NOTIFY)
2686                 r->rtm_flags |= RTM_F_NOTIFY;
2687
2688         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2689
2690         if (rt->fl.fl4_src) {
2691                 r->rtm_src_len = 32;
2692                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2693         }
2694         if (rt->u.dst.dev)
2695                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2696 #ifdef CONFIG_NET_CLS_ROUTE
2697         if (rt->u.dst.tclassid)
2698                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2699 #endif
             /*
              * Entries with an input interface report rt_spec_dst as the
              * preferred source; the others report rt_src, and only when it
              * differs from the source address in the flow key.
              */
2700         if (rt->fl.iif)
2701                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2702         else if (rt->rt_src != rt->fl.fl4_src)
2703                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2704
             /* A gateway attribute is emitted only if it differs from the destination. */
2705         if (rt->rt_dst != rt->rt_gateway)
2706                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2707
2708         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2709                 goto nla_put_failure;
2710
2711         error = rt->u.dst.error;
             /* Expiry is reported relative to jiffies; 0 when no expiry is set. */
2712         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2713         if (rt->peer) {
2714                 id = rt->peer->ip_id_count;
2715                 if (rt->peer->tcp_ts_stamp) {
2716                         ts = rt->peer->tcp_ts;
2717                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2718                 }
2719         }
2720
2721         if (rt->fl.iif) {
2722 #ifdef CONFIG_IP_MROUTE
2723                 __be32 dst = rt->rt_dst;
2724
                     /*
                      * Non-link-local multicast destination with MC_FORWARDING
                      * enabled: ask ipmr_get_route() instead of emitting RTA_IIF.
                      * With @nowait clear, a result of 0 ends the function with 0
                      * and a negative result fails the message.  With @nowait
                      * set, -EMSGSIZE fails the message and any other result
                      * <= 0 is stored in 'error' for the cacheinfo below.
                      */
2725                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2726                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2727                         int err = ipmr_get_route(skb, r, nowait);
2728                         if (err <= 0) {
2729                                 if (!nowait) {
2730                                         if (err == 0)
2731                                                 return 0;
2732                                         goto nla_put_failure;
2733                                 } else {
2734                                         if (err == -EMSGSIZE)
2735                                                 goto nla_put_failure;
2736                                         error = err;
2737                                 }
2738                         }
2739                 } else
2740 #endif
2741                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2742         }
2743
2744         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2745                                expires, error) < 0)
2746                 goto nla_put_failure;
2747
2748         return nlmsg_end(skb, nlh);
2749
2750 nla_put_failure:
2751         nlmsg_cancel(skb, nlh);
2752         return -EMSGSIZE;
2753 }
2754
/*
 * Netlink handler that answers a single route query.
 *
 * RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF are read from the request (each
 * defaults to 0 when absent).  With a non-zero input interface the lookup
 * runs through ip_route_input() on a freshly allocated dummy skb; otherwise
 * ip_route_output_key() is called with a flow built from the attributes and
 * rtm_tos.  The resulting route is encoded by rt_fill_info() as an
 * RTM_NEWROUTE message and sent back with rtnl_unicast() to the requester's
 * pid.
 *
 * Returns the rtnl_unicast() result on success.  Failure paths return the
 * nlmsg_parse() error, -ENOBUFS if the reply skb cannot be allocated,
 * -ENODEV for an unknown input interface, the lookup error, or the
 * rt_fill_info() result when that is <= 0.  Every failure after the skb was
 * allocated frees it via kfree_skb().
 *
 * @arg is not used.
 */
2755 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2756 {
2757         struct net *net = sock_net(in_skb->sk);
2758         struct rtmsg *rtm;
2759         struct nlattr *tb[RTA_MAX+1];
2760         struct rtable *rt = NULL;
2761         __be32 dst = 0;
2762         __be32 src = 0;
2763         u32 iif;
2764         int err;
2765         struct sk_buff *skb;
2766
2767         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2768         if (err < 0)
2769                 goto errout;
2770
2771         rtm = nlmsg_data(nlh);
2772
2773         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2774         if (skb == NULL) {
2775                 err = -ENOBUFS;
2776                 goto errout;
2777         }
2778
2779         /* Reserve room for dummy headers, this skb can pass
2780            through good chunk of routing engine.
2781          */
2782         skb_reset_mac_header(skb);
2783         skb_reset_network_header(skb);
2784
2785         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2786         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2787         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2788
2789         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2790         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2791         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2792
2793         if (iif) {
2794                 struct net_device *dev;
2795
                     /*
                      * NOTE(review): __dev_get_by_index() is used without taking
                      * a device reference here; presumably the caller holds the
                      * lock that keeps the device alive -- confirm.
                      */
2796                 dev = __dev_get_by_index(net, iif);
2797                 if (dev == NULL) {
2798                         err = -ENODEV;
2799                         goto errout_free;
2800                 }
2801
2802                 skb->protocol   = htons(ETH_P_IP);
2803                 skb->dev        = dev;
2804                 local_bh_disable();
2805                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2806                 local_bh_enable();
2807
                     /* A route that itself carries an error is reported as that (negated) error. */
2808                 rt = skb->rtable;
2809                 if (err == 0 && rt->u.dst.error)
2810                         err = -rt->u.dst.error;
2811         } else {
2812                 struct flowi fl = {
2813                         .nl_u = {
2814                                 .ip4_u = {
2815                                         .daddr = dst,
2816                                         .saddr = src,
2817                                         .tos = rtm->rtm_tos,
2818                                 },
2819                         },
2820                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2821                 };
2822                 err = ip_route_output_key(net, &rt, &fl);
2823         }
2824
2825         if (err)
2826                 goto errout_free;
2827
             /* rt_fill_info() reads the route back from skb->rtable. */
2828         skb->rtable = rt;
2829         if (rtm->rtm_flags & RTM_F_NOTIFY)
2830                 rt->rt_flags |= RTCF_NOTIFY;
2831
2832         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2833                            RTM_NEWROUTE, 0, 0);
2834         if (err <= 0)
2835                 goto errout_free;
2836
2837         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2838 errout:
2839         return err;
2840
2841 errout_free:
2842         kfree_skb(skb);
2843         goto errout;
2844 }
2845
/*
 * Netlink dump callback for the route cache.
 *
 * Walks rt_hash_table and, for every entry whose device belongs to the
 * requesting socket's namespace and which rt_is_expired() does not reject,
 * appends an RTM_NEWROUTE / NLM_F_MULTI message to @skb via rt_fill_info().
 *
 * The resume position is kept in @cb: args[0] is the hash bucket and args[1]
 * the index within that bucket's chain.  Both are read on entry and written
 * back on exit.  The walk stops early when rt_fill_info() returns <= 0.
 *
 * Returns skb->len.
 */
2846 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2847 {
2848         struct rtable *rt;
2849         int h, s_h;
2850         int idx, s_idx;
2851         struct net *net;
2852
2853         net = sock_net(skb->sk);
2854
2855         s_h = cb->args[0];
2856         if (s_h < 0)
2857                 s_h = 0;
2858         s_idx = idx = cb->args[1];
             /* s_idx applies to the first bucket only; it is zeroed after each bucket. */
2859         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
                     /* Empty buckets are skipped without entering the RCU section. */
2860                 if (!rt_hash_table[h].chain)
2861                         continue;
2862                 rcu_read_lock_bh();
2863                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2864                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2865                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2866                                 continue;
2867                         if (rt_is_expired(rt))
2868                                 continue;
                             /*
                              * The route is attached to the skb (with a reference)
                              * only for the duration of rt_fill_info() and detached
                              * again on both the success and the failure path.
                              */
2869                         skb->dst = dst_clone(&rt->u.dst);
2870                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2871                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2872                                          1, NLM_F_MULTI) <= 0) {
2873                                 dst_release(xchg(&skb->dst, NULL));
2874                                 rcu_read_unlock_bh();
2875                                 goto done;
2876                         }
2877                         dst_release(xchg(&skb->dst, NULL));
2878                 }
2879                 rcu_read_unlock_bh();
2880         }
2881
2882 done:
             /* Record where to resume on the next invocation. */
2883         cb->args[0] = h;
2884         cb->args[1] = idx;
2885         return skb->len;
2886 }
2887
2888 void ip_rt_multicast_event(struct in_device *in_dev)
2889 {
2890         rt_cache_flush(dev_net(in_dev->dev), 0);
2891 }
2892
2893 #ifdef CONFIG_SYSCTL
2894 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2895                                         struct file *filp, void __user *buffer,
2896                                         size_t *lenp, loff_t *ppos)
2897 {
2898         if (write) {
2899                 int flush_delay;
2900                 ctl_table ctl;
2901                 struct net *net;
2902
2903                 memcpy(&ctl, __ctl, sizeof(ctl));
2904                 ctl.data = &flush_delay;
2905                 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
2906
2907                 net = (struct net *)__ctl->extra1;
2908                 rt_cache_flush(net, flush_delay);
2909                 return 0;
2910         }
2911
2912         return -EINVAL;
2913 }
2914
2915 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2916                                                 void __user *oldval,
2917                                                 size_t __user *oldlenp,
2918                                                 void __user *newval,
2919                                                 size_t newlen)
2920 {
2921         int delay;
2922         struct net *net;
2923         if (newlen != sizeof(int))
2924                 return -EINVAL;
2925         if (get_user(delay, (int __user *)newval))
2926                 return -EFAULT;
2927         net = (struct net *)table->extra1;
2928         rt_cache_flush(net, delay);
2929         return 0;
2930 }
2931
/*
 * Re-arm every namespace's rt_secret_timer after ip_rt_secret_interval has
 * changed from @old to its current value.
 *
 * Nothing is done when the value is unchanged.  Otherwise, under rtnl_lock(),
 * each namespace's timer is stopped with del_timer_sync().  If the new
 * interval is 0 the timer is left stopped.  If del_timer_sync() reported
 * that it removed the timer, the time that was left is shifted by the
 * difference between the new and old interval, with any non-positive result
 * clamped to 0.  If the timer was not running, the full new interval is
 * used.  The timer is then re-added relative to the current jiffies.
 */
2932 static void rt_secret_reschedule(int old)
2933 {
2934         struct net *net;
2935         int new = ip_rt_secret_interval;
2936         int diff = new - old;
2937
2938         if (!diff)
2939                 return;
2940
2941         rtnl_lock();
2942         for_each_net(net) {
2943                 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
2944
2945                 if (!new)
2946                         continue;
2947
2948                 if (deleted) {
2949                         long time = net->ipv4.rt_secret_timer.expires - jiffies;
2950
                             /*
                              * Note the side effect in the condition: 'diff' is
                              * added to 'time' only when 'time' was positive.
                              */
2951                         if (time <= 0 || (time += diff) <= 0)
2952                                 time = 0;
2953
2954                         net->ipv4.rt_secret_timer.expires = time;
2955                 } else
2956                         net->ipv4.rt_secret_timer.expires = new;
2957
                     /* 'expires' holds a relative delay up to here; make it absolute. */
2958                 net->ipv4.rt_secret_timer.expires += jiffies;
2959                 add_timer(&net->ipv4.rt_secret_timer);
2960         }
2961         rtnl_unlock();
2962 }
2963
2964 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
2965                                           struct file *filp,
2966                                           void __user *buffer, size_t *lenp,
2967                                           loff_t *ppos)
2968 {
2969         int old = ip_rt_secret_interval;
2970         int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
2971
2972         rt_secret_reschedule(old);
2973
2974         return ret;
2975 }
2976
2977 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
2978                                                    void __user *oldval,
2979                                                    size_t __user *oldlenp,
2980                                                    void __user *newval,
2981                                                    size_t newlen)
2982 {
2983         int old = ip_rt_secret_interval;
2984         int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
2985
2986         rt_secret_reschedule(old);
2987
2988         return ret;
2989 }
2990
/*
 * Sysctl entries for the routing tunables; this table is the child of the
 * "route" directory in ipv4_skeleton.
 *
 * Every entry is an int with mode 0644.  Entries using proc_dointvec have no
 * strategy handler; the jiffies-based ones pair proc_dointvec_jiffies with
 * sysctl_jiffies.  "gc_min_interval" and "gc_min_interval_ms" both point at
 * ip_rt_gc_min_interval and differ only in their handlers.
 * "secret_interval" uses handlers from this file that call
 * rt_secret_reschedule() after each access.
 *
 * The table ends with an entry whose ctl_name is 0.
 */
2991 static ctl_table ipv4_route_table[] = {
2992         {
2993                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2994                 .procname       = "gc_thresh",
2995                 .data           = &ipv4_dst_ops.gc_thresh,
2996                 .maxlen         = sizeof(int),
2997                 .mode           = 0644,
2998                 .proc_handler   = &proc_dointvec,
2999         },
3000         {
3001                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3002                 .procname       = "max_size",
3003                 .data           = &ip_rt_max_size,
3004                 .maxlen         = sizeof(int),
3005                 .mode           = 0644,
3006                 .proc_handler   = &proc_dointvec,
3007         },
3008         {
3009                 /*  Deprecated. Use gc_min_interval_ms */
3010
3011                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3012                 .procname       = "gc_min_interval",
3013                 .data           = &ip_rt_gc_min_interval,
3014                 .maxlen         = sizeof(int),
3015                 .mode           = 0644,
3016                 .proc_handler   = &proc_dointvec_jiffies,
3017                 .strategy       = &sysctl_jiffies,
3018         },
3019         {
3020                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3021                 .procname       = "gc_min_interval_ms",
3022                 .data           = &ip_rt_gc_min_interval,
3023                 .maxlen         = sizeof(int),
3024                 .mode           = 0644,
3025                 .proc_handler   = &proc_dointvec_ms_jiffies,
3026                 .strategy       = &sysctl_ms_jiffies,
3027         },
3028         {
3029                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3030                 .procname       = "gc_timeout",
3031                 .data           = &ip_rt_gc_timeout,
3032                 .maxlen         = sizeof(int),
3033                 .mode           = 0644,
3034                 .proc_handler   = &proc_dointvec_jiffies,
3035                 .strategy       = &sysctl_jiffies,
3036         },
3037         {
3038                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3039                 .procname       = "gc_interval",
3040                 .data           = &ip_rt_gc_interval,
3041                 .maxlen         = sizeof(int),
3042                 .mode           = 0644,
3043                 .proc_handler   = &proc_dointvec_jiffies,
3044                 .strategy       = &sysctl_jiffies,
3045         },
3046         {
3047                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3048                 .procname       = "redirect_load",
3049                 .data           = &ip_rt_redirect_load,
3050                 .maxlen         = sizeof(int),
3051                 .mode           = 0644,
3052                 .proc_handler   = &proc_dointvec,
3053         },
3054         {
3055                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3056                 .procname       = "redirect_number",
3057                 .data           = &ip_rt_redirect_number,
3058                 .maxlen         = sizeof(int),
3059                 .mode           = 0644,
3060                 .proc_handler   = &proc_dointvec,
3061         },
3062         {
3063                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3064                 .procname       = "redirect_silence",
3065                 .data           = &ip_rt_redirect_silence,
3066                 .maxlen         = sizeof(int),
3067                 .mode           = 0644,
3068                 .proc_handler   = &proc_dointvec,
3069         },
3070         {
3071                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3072                 .procname       = "error_cost",
3073                 .data           = &ip_rt_error_cost,
3074                 .maxlen         = sizeof(int),
3075                 .mode           = 0644,
3076                 .proc_handler   = &proc_dointvec,
3077         },
3078         {
3079                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3080                 .procname       = "error_burst",
3081                 .data           = &ip_rt_error_burst,
3082                 .maxlen         = sizeof(int),
3083                 .mode           = 0644,
3084                 .proc_handler   = &proc_dointvec,
3085         },
3086         {
3087                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3088                 .procname       = "gc_elasticity",
3089                 .data           = &ip_rt_gc_elasticity,
3090                 .maxlen         = sizeof(int),
3091                 .mode           = 0644,
3092                 .proc_handler   = &proc_dointvec,
3093         },
3094         {
3095                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3096                 .procname       = "mtu_expires",
3097                 .data           = &ip_rt_mtu_expires,
3098                 .maxlen         = sizeof(int),
3099                 .mode           = 0644,
3100                 .proc_handler   = &proc_dointvec_jiffies,
3101                 .strategy       = &sysctl_jiffies,
3102         },
3103         {
3104                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3105                 .procname       = "min_pmtu",
3106                 .data           = &ip_rt_min_pmtu,
3107                 .maxlen         = sizeof(int),
3108                 .mode           = 0644,
3109                 .proc_handler   = &proc_dointvec,
3110         },
3111         {
3112                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3113                 .procname       = "min_adv_mss",
3114                 .data           = &ip_rt_min_advmss,
3115                 .maxlen         = sizeof(int),
3116                 .mode           = 0644,
3117                 .proc_handler   = &proc_dointvec,
3118         },
3119         {
3120                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3121                 .procname       = "secret_interval",
3122                 .data           = &ip_rt_secret_interval,
3123                 .maxlen         = sizeof(int),
3124                 .mode           = 0644,
3125                 .proc_handler   = &ipv4_sysctl_rt_secret_interval,
3126                 .strategy       = &ipv4_sysctl_rt_secret_interval_strategy,
3127         },
3128         { .ctl_name = 0 }
3129 };
3130
/* One-element, zero-initialised table used as the child of "neigh" in ipv4_skeleton. */
3131 static struct ctl_table empty[1];
3132
/*
 * Directory layout for the ipv4 sysctls defined here: a "route" directory
 * whose children are ipv4_route_table, and a "neigh" directory whose child
 * table is the empty placeholder.  Both directories have mode 0555.
 */
3133 static struct ctl_table ipv4_skeleton[] =
3134 {
3135         { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3136           .mode = 0555, .child = ipv4_route_table},
3137         { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3138           .mode = 0555, .child = empty},
3139         { }
3140 };
3141
/*
 * Sysctl path "net" / "ipv4".
 * NOTE(review): presumably the path under which ipv4_skeleton is
 * registered; the registration call is not in this part of the file.
 */
3142 static __net_initdata struct ctl_path ipv4_path[] = {
3143         { .procname = "net", .ctl_name = CTL_NET, },
3144         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3145         { },
3146 };
3147
/*
 * Table holding the single "flush" entry (mode 0200), served by
 * ipv4_sysctl_rtcache_flush and ipv4_sysctl_rtcache_flush_strategy.
 * sysctl_route_net_init() registers this table for init_net and a
 * kmemdup()'d copy for every other namespace, storing the owning namespace
 * in entry 0's extra1 field.
 */
3148 static struct ctl_table ipv4_route_flush_table[] = {
3149         {
3150                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3151                 .procname       = "flush",
3152                 .maxlen         = sizeof(int),
3153                 .mode           = 0200,
3154                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
3155                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
3156         },
3157         { .ctl_name = 0 },
3158 };
3159
/*
 * Sysctl path "net" / "ipv4" / "route", used by sysctl_route_net_init() when
 * registering the per-namespace flush table.
 */
3160 static __net_initdata struct ctl_path ipv4_route_path[] = {
3161         { .procname = "net", .ctl_name = CTL_NET, },
3162         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3163         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3164         { },
3165 };
3166
3167 static __net_init int sysctl_route_net_init(struct net *net)
3168 {
3169         struct ctl_table *tbl;
3170
3171         tbl = ipv4_route_flush_table;
3172         if (net != &init_net) {
3173                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3174                 if (tbl == NULL)
3175                         goto err_dup;
3176         }
3177         tbl[0].extra1 = net;
3178
3179         net->ipv4.route_hdr =
3180                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3181         if (net->ipv4.route_hdr == NULL)
3182                 goto err_reg;
3183         return 0;
3184
3185 err_reg:
3186         if (tbl != ipv4_route_flush_table)
3187                 kfree(tbl);
3188 err_dup:
3189         return -ENOMEM;
3190 }
3191
3192 static __net_exit void sysctl_route_net_exit(struct net *net)
3193 {
3194         struct ctl_table *tbl;
3195
3196         tbl = net->ipv4.route_hdr->ctl_table_arg;
3197         unregister_net_sysctl_table(net->ipv4.route_hdr);
3198         BUG_ON(tbl == ipv4_route_flush_table);
3199         kfree(tbl);
3200 }
3201
3202 static __net_initdata struct pernet_operations sysctl_route_ops = {
3203         .init = sysctl_route_net_init,
3204         .exit = sysctl_route_net_exit,
3205 };
3206 #endif
3207
3208
3209 static __net_init int rt_secret_timer_init(struct net *net)
3210 {
3211         atomic_set(&net->ipv4.rt_genid,
3212                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3213                         (jiffies ^ (jiffies >> 7))));
3214
3215         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3216         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3217         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3218
3219         if (ip_rt_secret_interval) {
3220                 net->ipv4.rt_secret_timer.expires =
3221                         jiffies + net_random() % ip_rt_secret_interval +
3222                         ip_rt_secret_interval;
3223                 add_timer(&net->ipv4.rt_secret_timer);
3224         }
3225         return 0;
3226 }
3227
3228 static __net_exit void rt_secret_timer_exit(struct net *net)
3229 {
3230         del_timer_sync(&net->ipv4.rt_secret_timer);
3231 }
3232
3233 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3234         .init = rt_secret_timer_init,
3235         .exit = rt_secret_timer_exit,
3236 };
3237
3238
#if defined(CONFIG_NET_CLS_ROUTE)
/*
 * Per-CPU accounting area; ip_rt_init() allocates it with room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3242
3243 static __initdata unsigned long rhash_entries;
3244 static int __init set_rhash_entries(char *str)
3245 {
3246         if (!str)
3247                 return 0;
3248         rhash_entries = simple_strtoul(str, &str, 0);
3249         return 1;
3250 }
3251 __setup("rhash_entries=", set_rhash_entries);
3252
3253 int __init ip_rt_init(void)
3254 {
3255         int rc = 0;
3256
3257 #ifdef CONFIG_NET_CLS_ROUTE
3258         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3259         if (!ip_rt_acct)
3260                 panic("IP: failed to allocate ip_rt_acct\n");
3261 #endif
3262
3263         ipv4_dst_ops.kmem_cachep =
3264                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3265                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3266
3267         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3268
3269         rt_hash_table = (struct rt_hash_bucket *)
3270                 alloc_large_system_hash("IP route cache",
3271                                         sizeof(struct rt_hash_bucket),
3272                                         rhash_entries,
3273                                         (num_physpages >= 128 * 1024) ?
3274                                         15 : 17,
3275                                         0,
3276                                         &rt_hash_log,
3277                                         &rt_hash_mask,
3278                                         0);
3279         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3280         rt_hash_lock_init();
3281
3282         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3283         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3284
3285         devinet_init();
3286         ip_fib_init();
3287
3288         /* All the timers, started at system startup tend
3289            to synchronize. Perturb it a bit.
3290          */
3291         schedule_delayed_work(&expires_work,
3292                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3293
3294         if (register_pernet_subsys(&rt_secret_timer_ops))
3295                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3296
3297         if (ip_rt_proc_init())
3298                 printk(KERN_ERR "Unable to create route proc files\n");
3299 #ifdef CONFIG_XFRM
3300         xfrm_init();
3301         xfrm4_init();
3302 #endif
3303         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3304
3305 #ifdef CONFIG_SYSCTL
3306         register_pernet_subsys(&sysctl_route_ops);
3307 #endif
3308         return rc;
3309 }
3310
#ifdef CONFIG_SYSCTL
/*
 * Register the static ipv4_skeleton sysctl table under ipv4_path
 * ("net" -> "ipv4").
 *
 * This separate init hook exists only because of the current ipv4
 * initialisation order; once that order is sanitized, this nonsense
 * can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */
3321
3322 EXPORT_SYMBOL(__ip_select_ident);
3323 EXPORT_SYMBOL(ip_route_input);
3324 EXPORT_SYMBOL(ip_route_output_key);