net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132
 133 static void rt_worker_func(struct work_struct *work);
 134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 135
 136 /*
 137  *      Interface to generic destination cache.
 138  */
 139
 140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141 static void              ipv4_dst_destroy(struct dst_entry *dst);
 142 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 143                                          struct net_device *dev, int how);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 147 static int rt_garbage_collect(struct dst_ops *ops);
 148
 149
 150 static struct dst_ops ipv4_dst_ops = {
 151         .family =               AF_INET,
 152         .protocol =             __constant_htons(ETH_P_IP),
 153         .gc =                   rt_garbage_collect,
 154         .check =                ipv4_dst_check,
 155         .destroy =              ipv4_dst_destroy,
 156         .ifdown =               ipv4_dst_ifdown,
 157         .negative_advice =      ipv4_negative_advice,
 158         .link_failure =         ipv4_link_failure,
 159         .update_pmtu =          ip_rt_update_pmtu,
 160         .local_out =            __ip_local_out,
 161         .entry_size =           sizeof(struct rtable),
 162         .entries =              ATOMIC_INIT(0),
 163 };
 164
 165 #define ECN_OR_COST(class)      TC_PRIO_##class
 166
 167 const __u8 ip_tos2prio[16] = {
 168         TC_PRIO_BESTEFFORT,
 169         ECN_OR_COST(FILLER),
 170         TC_PRIO_BESTEFFORT,
 171         ECN_OR_COST(BESTEFFORT),
 172         TC_PRIO_BULK,
 173         ECN_OR_COST(BULK),
 174         TC_PRIO_BULK,
 175         ECN_OR_COST(BULK),
 176         TC_PRIO_INTERACTIVE,
 177         ECN_OR_COST(INTERACTIVE),
 178         TC_PRIO_INTERACTIVE,
 179         ECN_OR_COST(INTERACTIVE),
 180         TC_PRIO_INTERACTIVE_BULK,
 181         ECN_OR_COST(INTERACTIVE_BULK),
 182         TC_PRIO_INTERACTIVE_BULK,
 183         ECN_OR_COST(INTERACTIVE_BULK)
 184 };
 185
 186
 187 /*
 188  * Route cache.
 189  */
 190
 191 /* The locking scheme is rather straight forward:
 192  *
 193  * 1) Read-Copy Update protects the buckets of the central route hash.
 194  * 2) Only writers remove entries, and they hold the lock
 195  *    as they look at rtable reference counts.
 196  * 3) Only readers acquire references to rtable entries,
 197  *    they do so with atomic increments and with the
 198  *    lock held.
 199  */
 200
 201 struct rt_hash_bucket {
 202         struct rtable   *chain;
 203 };
 204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 205         defined(CONFIG_PROVE_LOCKING)
 206 /*
 207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 208  * The size of this table is a power of two and depends on the number of CPUS.
 209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 210  */
 211 #ifdef CONFIG_LOCKDEP
 212 # define RT_HASH_LOCK_SZ        256
 213 #else
 214 # if NR_CPUS >= 32
 215 #  define RT_HASH_LOCK_SZ       4096
 216 # elif NR_CPUS >= 16
 217 #  define RT_HASH_LOCK_SZ       2048
 218 # elif NR_CPUS >= 8
 219 #  define RT_HASH_LOCK_SZ       1024
 220 # elif NR_CPUS >= 4
 221 #  define RT_HASH_LOCK_SZ       512
 222 # else
 223 #  define RT_HASH_LOCK_SZ       256
 224 # endif
 225 #endif
 226
 227 static spinlock_t       *rt_hash_locks;
 228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 229
 230 static __init void rt_hash_lock_init(void)
 231 {
 232         int i;
 233
 234         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 235                         GFP_KERNEL);
 236         if (!rt_hash_locks)
 237                 panic("IP: failed to allocate rt_hash_locks\n");
 238
 239         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 240                 spin_lock_init(&rt_hash_locks[i]);
 241 }
 242 #else
 243 # define rt_hash_lock_addr(slot) NULL
 244
 245 static inline void rt_hash_lock_init(void)
 246 {
 247 }
 248 #endif
 249
 250 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 251 static unsigned                 rt_hash_mask __read_mostly;
 252 static unsigned int             rt_hash_log  __read_mostly;
 253
 254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 255 #define RT_CACHE_STAT_INC(field) \
 256         (__raw_get_cpu_var(rt_cache_stat).field++)
 257
 258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 259                 int genid)
 260 {
 261         return jhash_3words((__force u32)(__be32)(daddr),
 262                             (__force u32)(__be32)(saddr),
 263                             idx, genid)
 264                 & rt_hash_mask;
 265 }
 266
 267 static inline int rt_genid(struct net *net)
 268 {
 269         return atomic_read(&net->ipv4.rt_genid);
 270 }
 271
 272 #ifdef CONFIG_PROC_FS
 273 struct rt_cache_iter_state {
 274         struct seq_net_private p;
 275         int bucket;
 276         int genid;
 277 };
 278
 279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 280 {
 281         struct rt_cache_iter_state *st = seq->private;
 282         struct rtable *r = NULL;
 283
 284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 285                 rcu_read_lock_bh();
 286                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 287                 while (r) {
 288                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 289                             r->rt_genid == st->genid)
 290                                 return r;
 291                         r = rcu_dereference(r->u.dst.rt_next);
 292                 }
 293                 rcu_read_unlock_bh();
 294         }
 295         return r;
 296 }
 297
 298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 299                                           struct rtable *r)
 300 {
 301         struct rt_cache_iter_state *st = seq->private;
 302         r = r->u.dst.rt_next;
 303         while (!r) {
 304                 rcu_read_unlock_bh();
 305                 if (--st->bucket < 0)
 306                         break;
 307                 rcu_read_lock_bh();
 308                 r = rt_hash_table[st->bucket].chain;
 309         }
 310         return rcu_dereference(r);
 311 }
 312
 313 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 314                                         struct rtable *r)
 315 {
 316         struct rt_cache_iter_state *st = seq->private;
 317         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 318                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 319                         continue;
 320                 if (r->rt_genid == st->genid)
 321                         break;
 322         }
 323         return r;
 324 }
 325
 326 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 327 {
 328         struct rtable *r = rt_cache_get_first(seq);
 329
 330         if (r)
 331                 while (pos && (r = rt_cache_get_next(seq, r)))
 332                         --pos;
 333         return pos ? NULL : r;
 334 }
 335
 336 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 337 {
 338         struct rt_cache_iter_state *st = seq->private;
 339         if (*pos)
 340                 return rt_cache_get_idx(seq, *pos - 1);
 341         st->genid = rt_genid(seq_file_net(seq));
 342         return SEQ_START_TOKEN;
 343 }
 344
 345 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 346 {
 347         struct rtable *r;
 348
 349         if (v == SEQ_START_TOKEN)
 350                 r = rt_cache_get_first(seq);
 351         else
 352                 r = rt_cache_get_next(seq, v);
 353         ++*pos;
 354         return r;
 355 }
 356
 357 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 358 {
 359         if (v && v != SEQ_START_TOKEN)
 360                 rcu_read_unlock_bh();
 361 }
 362
 363 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 364 {
 365         if (v == SEQ_START_TOKEN)
 366                 seq_printf(seq, "%-127s\n",
 367                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 368                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 369                            "HHUptod\tSpecDst");
 370         else {
 371                 struct rtable *r = v;
 372                 int len;
 373
 374                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 375                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 376                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 377                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 378                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 379                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 380                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 381                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 382                         dst_metric(&r->u.dst, RTAX_WINDOW),
 383                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 384                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 385                         r->fl.fl4_tos,
 386                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 387                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 388                                        dev_queue_xmit) : 0,
 389                         r->rt_spec_dst, &len);
 390
 391                 seq_printf(seq, "%*s\n", 127 - len, "");
 392         }
 393         return 0;
 394 }
 395
 396 static const struct seq_operations rt_cache_seq_ops = {
 397         .start  = rt_cache_seq_start,
 398         .next   = rt_cache_seq_next,
 399         .stop   = rt_cache_seq_stop,
 400         .show   = rt_cache_seq_show,
 401 };
 402
 403 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 404 {
 405         return seq_open_net(inode, file, &rt_cache_seq_ops,
 406                         sizeof(struct rt_cache_iter_state));
 407 }
 408
 409 static const struct file_operations rt_cache_seq_fops = {
 410         .owner   = THIS_MODULE,
 411         .open    = rt_cache_seq_open,
 412         .read    = seq_read,
 413         .llseek  = seq_lseek,
 414         .release = seq_release_net,
 415 };
 416
 417
 418 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 419 {
 420         int cpu;
 421
 422         if (*pos == 0)
 423                 return SEQ_START_TOKEN;
 424
 425         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 426                 if (!cpu_possible(cpu))
 427                         continue;
 428                 *pos = cpu+1;
 429                 return &per_cpu(rt_cache_stat, cpu);
 430         }
 431         return NULL;
 432 }
 433
 434 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 435 {
 436         int cpu;
 437
 438         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 439                 if (!cpu_possible(cpu))
 440                         continue;
 441                 *pos = cpu+1;
 442                 return &per_cpu(rt_cache_stat, cpu);
 443         }
 444         return NULL;
 445
 446 }
 447
 448 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 449 {
 450
 451 }
 452
 453 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 454 {
 455         struct rt_cache_stat *st = v;
 456
 457         if (v == SEQ_START_TOKEN) {
 458                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 459                 return 0;
 460         }
 461
 462         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 463                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 464                    atomic_read(&ipv4_dst_ops.entries),
 465                    st->in_hit,
 466                    st->in_slow_tot,
 467                    st->in_slow_mc,
 468                    st->in_no_route,
 469                    st->in_brd,
 470                    st->in_martian_dst,
 471                    st->in_martian_src,
 472
 473                    st->out_hit,
 474                    st->out_slow_tot,
 475                    st->out_slow_mc,
 476
 477                    st->gc_total,
 478                    st->gc_ignored,
 479                    st->gc_goal_miss,
 480                    st->gc_dst_overflow,
 481                    st->in_hlist_search,
 482                    st->out_hlist_search
 483                 );
 484         return 0;
 485 }
 486
 487 static const struct seq_operations rt_cpu_seq_ops = {
 488         .start  = rt_cpu_seq_start,
 489         .next   = rt_cpu_seq_next,
 490         .stop   = rt_cpu_seq_stop,
 491         .show   = rt_cpu_seq_show,
 492 };
 493
 494
 495 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 496 {
 497         return seq_open(file, &rt_cpu_seq_ops);
 498 }
 499
 500 static const struct file_operations rt_cpu_seq_fops = {
 501         .owner   = THIS_MODULE,
 502         .open    = rt_cpu_seq_open,
 503         .read    = seq_read,
 504         .llseek  = seq_lseek,
 505         .release = seq_release,
 506 };
 507
 508 #ifdef CONFIG_NET_CLS_ROUTE
 509 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 510                            int length, int *eof, void *data)
 511 {
 512         unsigned int i;
 513
 514         if ((offset & 3) || (length & 3))
 515                 return -EIO;
 516
 517         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 518                 *eof = 1;
 519                 return 0;
 520         }
 521
 522         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 523                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 524                 *eof = 1;
 525         }
 526
 527         offset /= sizeof(u32);
 528
 529         if (length > 0) {
 530                 u32 *dst = (u32 *) buffer;
 531
 532                 *start = buffer;
 533                 memset(dst, 0, length);
 534
 535                 for_each_possible_cpu(i) {
 536                         unsigned int j;
 537                         u32 *src;
 538
 539                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 540                         for (j = 0; j < length/4; j++)
 541                                 dst[j] += src[j];
 542                 }
 543         }
 544         return length;
 545 }
 546 #endif
 547
 548 static int __net_init ip_rt_do_proc_init(struct net *net)
 549 {
 550         struct proc_dir_entry *pde;
 551
 552         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 553                         &rt_cache_seq_fops);
 554         if (!pde)
 555                 goto err1;
 556
 557         pde = proc_create("rt_cache", S_IRUGO,
 558                           net->proc_net_stat, &rt_cpu_seq_fops);
 559         if (!pde)
 560                 goto err2;
 561
 562 #ifdef CONFIG_NET_CLS_ROUTE
 563         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 564                         ip_rt_acct_read, NULL);
 565         if (!pde)
 566                 goto err3;
 567 #endif
 568         return 0;
 569
 570 #ifdef CONFIG_NET_CLS_ROUTE
 571 err3:
 572         remove_proc_entry("rt_cache", net->proc_net_stat);
 573 #endif
 574 err2:
 575         remove_proc_entry("rt_cache", net->proc_net);
 576 err1:
 577         return -ENOMEM;
 578 }
 579
 580 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 581 {
 582         remove_proc_entry("rt_cache", net->proc_net_stat);
 583         remove_proc_entry("rt_cache", net->proc_net);
 584         remove_proc_entry("rt_acct", net->proc_net);
 585 }
 586
 587 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 588         .init = ip_rt_do_proc_init,
 589         .exit = ip_rt_do_proc_exit,
 590 };
 591
 592 static int __init ip_rt_proc_init(void)
 593 {
 594         return register_pernet_subsys(&ip_rt_proc_ops);
 595 }
 596
 597 #else
 598 static inline int ip_rt_proc_init(void)
 599 {
 600         return 0;
 601 }
 602 #endif /* CONFIG_PROC_FS */
 603
 604 static inline void rt_free(struct rtable *rt)
 605 {
 606         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 607 }
 608
 609 static inline void rt_drop(struct rtable *rt)
 610 {
 611         ip_rt_put(rt);
 612         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 613 }
 614
 615 static inline int rt_fast_clean(struct rtable *rth)
 616 {
 617         /* Kill broadcast/multicast entries very aggresively, if they
 618            collide in hash table with more useful entries */
 619         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 620                 rth->fl.iif && rth->u.dst.rt_next;
 621 }
 622
 623 static inline int rt_valuable(struct rtable *rth)
 624 {
 625         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 626                 rth->u.dst.expires;
 627 }
 628
 629 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 630 {
 631         unsigned long age;
 632         int ret = 0;
 633
 634         if (atomic_read(&rth->u.dst.__refcnt))
 635                 goto out;
 636
 637         ret = 1;
 638         if (rth->u.dst.expires &&
 639             time_after_eq(jiffies, rth->u.dst.expires))
 640                 goto out;
 641
 642         age = jiffies - rth->u.dst.lastuse;
 643         ret = 0;
 644         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 645             (age <= tmo2 && rt_valuable(rth)))
 646                 goto out;
 647         ret = 1;
 648 out:    return ret;
 649 }
 650
 651 /* Bits of score are:
 652  * 31: very valuable
 653  * 30: not quite useless
 654  * 29..0: usage counter
 655  */
 656 static inline u32 rt_score(struct rtable *rt)
 657 {
 658         u32 score = jiffies - rt->u.dst.lastuse;
 659
 660         score = ~score & ~(3<<30);
 661
 662         if (rt_valuable(rt))
 663                 score |= (1<<31);
 664
 665         if (!rt->fl.iif ||
 666             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 667                 score |= (1<<30);
 668
 669         return score;
 670 }
 671
 672 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 673 {
 674         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 675                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 676                 (fl1->mark ^ fl2->mark) |
 677                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 678                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 679                 (fl1->oif ^ fl2->oif) |
 680                 (fl1->iif ^ fl2->iif)) == 0;
 681 }
 682
 683 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 684 {
 685         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 686 }
 687
 688 static inline int rt_is_expired(struct rtable *rth)
 689 {
 690         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 691 }
 692
 693 /*
 694  * Perform a full scan of hash table and free all entries.
 695  * Can be called by a softirq or a process.
 696  * In the later case, we want to be reschedule if necessary
 697  */
 698 static void rt_do_flush(int process_context)
 699 {
 700         unsigned int i;
 701         struct rtable *rth, *next;
 702         struct rtable * tail;
 703
 704         for (i = 0; i <= rt_hash_mask; i++) {
 705                 if (process_context && need_resched())
 706                         cond_resched();
 707                 rth = rt_hash_table[i].chain;
 708                 if (!rth)
 709                         continue;
 710
 711                 spin_lock_bh(rt_hash_lock_addr(i));
 712 #ifdef CONFIG_NET_NS
 713                 {
 714                 struct rtable ** prev, * p;
 715
 716                 rth = rt_hash_table[i].chain;
 717
 718                 /* defer releasing the head of the list after spin_unlock */
 719                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 720                         if (!rt_is_expired(tail))
 721                                 break;
 722                 if (rth != tail)
 723                         rt_hash_table[i].chain = tail;
 724
 725                 /* call rt_free on entries after the tail requiring flush */
 726                 prev = &rt_hash_table[i].chain;
 727                 for (p = *prev; p; p = next) {
 728                         next = p->u.dst.rt_next;
 729                         if (!rt_is_expired(p)) {
 730                                 prev = &p->u.dst.rt_next;
 731                         } else {
 732                                 *prev = next;
 733                                 rt_free(p);
 734                         }
 735                 }
 736                 }
 737 #else
 738                 rth = rt_hash_table[i].chain;
 739                 rt_hash_table[i].chain = NULL;
 740                 tail = NULL;
 741 #endif
 742                 spin_unlock_bh(rt_hash_lock_addr(i));
 743
 744                 for (; rth != tail; rth = next) {
 745                         next = rth->u.dst.rt_next;
 746                         rt_free(rth);
 747                 }
 748         }
 749 }
 750
 751 static void rt_check_expire(void)
 752 {
 753         static unsigned int rover;
 754         unsigned int i = rover, goal;
 755         struct rtable *rth, **rthp;
 756         u64 mult;
 757
 758         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 759         if (ip_rt_gc_timeout > 1)
 760                 do_div(mult, ip_rt_gc_timeout);
 761         goal = (unsigned int)mult;
 762         if (goal > rt_hash_mask)
 763                 goal = rt_hash_mask + 1;
 764         for (; goal > 0; goal--) {
 765                 unsigned long tmo = ip_rt_gc_timeout;
 766
 767                 i = (i + 1) & rt_hash_mask;
 768                 rthp = &rt_hash_table[i].chain;
 769
 770                 if (need_resched())
 771                         cond_resched();
 772
 773                 if (*rthp == NULL)
 774                         continue;
 775                 spin_lock_bh(rt_hash_lock_addr(i));
 776                 while ((rth = *rthp) != NULL) {
 777                         if (rt_is_expired(rth)) {
 778                                 *rthp = rth->u.dst.rt_next;
 779                                 rt_free(rth);
 780                                 continue;
 781                         }
 782                         if (rth->u.dst.expires) {
 783                                 /* Entry is expired even if it is in use */
 784                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 785                                         tmo >>= 1;
 786                                         rthp = &rth->u.dst.rt_next;
 787                                         continue;
 788                                 }
 789                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 790                                 tmo >>= 1;
 791                                 rthp = &rth->u.dst.rt_next;
 792                                 continue;
 793                         }
 794
 795                         /* Cleanup aged off entries. */
 796                         *rthp = rth->u.dst.rt_next;
 797                         rt_free(rth);
 798                 }
 799                 spin_unlock_bh(rt_hash_lock_addr(i));
 800         }
 801         rover = i;
 802 }
 803
 804 /*
 805  * rt_worker_func() is run in process context.
 806  * we call rt_check_expire() to scan part of the hash table
 807  */
 808 static void rt_worker_func(struct work_struct *work)
 809 {
 810         rt_check_expire();
 811         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 812 }
 813
 814 /*
 815  * Pertubation of rt_genid by a small quantity [1..256]
 816  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 817  * many times (2^24) without giving recent rt_genid.
 818  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 819  */
 820 static void rt_cache_invalidate(struct net *net)
 821 {
 822         unsigned char shuffle;
 823
 824         get_random_bytes(&shuffle, sizeof(shuffle));
 825         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 826 }
 827
 828 /*
 829  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 830  * delay >= 0 : invalidate & flush cache (can be long)
 831  */
 832 void rt_cache_flush(struct net *net, int delay)
 833 {
 834         rt_cache_invalidate(net);
 835         if (delay >= 0)
 836                 rt_do_flush(!in_softirq());
 837 }
 838
 839 /*
 840  * We change rt_genid and let gc do the cleanup
 841  */
 842 static void rt_secret_rebuild(unsigned long __net)
 843 {
 844         struct net *net = (struct net *)__net;
 845         rt_cache_invalidate(net);
 846         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 847 }
 848
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an equilibrium
   point, where the number of aged-off entries stays approximately equal
   to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking is idle
   "expire" is large enough to keep enough warm entries, and when the
   load increases it shrinks to limit the cache size.
 */

 861
 862 static int rt_garbage_collect(struct dst_ops *ops)
 863 {
 864         static unsigned long expire = RT_GC_TIMEOUT;
 865         static unsigned long last_gc;
 866         static int rover;
 867         static int equilibrium;
 868         struct rtable *rth, **rthp;
 869         unsigned long now = jiffies;
 870         int goal;
 871
 872         /*
 873          * Garbage collection is pretty expensive,
 874          * do not make it too frequently.
 875          */
 876
 877         RT_CACHE_STAT_INC(gc_total);
 878
 879         if (now - last_gc < ip_rt_gc_min_interval &&
 880             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 881                 RT_CACHE_STAT_INC(gc_ignored);
 882                 goto out;
 883         }
 884
 885         /* Calculate number of entries, which we want to expire now. */
 886         goal = atomic_read(&ipv4_dst_ops.entries) -
 887                 (ip_rt_gc_elasticity << rt_hash_log);
 888         if (goal <= 0) {
 889                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 890                         equilibrium = ipv4_dst_ops.gc_thresh;
 891                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 892                 if (goal > 0) {
 893                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 894                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 895                 }
 896         } else {
 897                 /* We are in dangerous area. Try to reduce cache really
 898                  * aggressively.
 899                  */
 900                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 901                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 902         }
 903
 904         if (now - last_gc >= ip_rt_gc_min_interval)
 905                 last_gc = now;
 906
 907         if (goal <= 0) {
 908                 equilibrium += goal;
 909                 goto work_done;
 910         }
 911
 912         do {
 913                 int i, k;
 914
 915                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 916                         unsigned long tmo = expire;
 917
 918                         k = (k + 1) & rt_hash_mask;
 919                         rthp = &rt_hash_table[k].chain;
 920                         spin_lock_bh(rt_hash_lock_addr(k));
 921                         while ((rth = *rthp) != NULL) {
 922                                 if (!rt_is_expired(rth) &&
 923                                         !rt_may_expire(rth, tmo, expire)) {
 924                                         tmo >>= 1;
 925                                         rthp = &rth->u.dst.rt_next;
 926                                         continue;
 927                                 }
 928                                 *rthp = rth->u.dst.rt_next;
 929                                 rt_free(rth);
 930                                 goal--;
 931                         }
 932                         spin_unlock_bh(rt_hash_lock_addr(k));
 933                         if (goal <= 0)
 934                                 break;
 935                 }
 936                 rover = k;
 937
 938                 if (goal <= 0)
 939                         goto work_done;
 940
 941                 /* Goal is not achieved. We stop process if:
 942
 943                    - if expire reduced to zero. Otherwise, expire is halfed.
 944                    - if table is not full.
 945                    - if we are called from interrupt.
 946                    - jiffies check is just fallback/debug loop breaker.
 947                      We will not spin here for long time in any case.
 948                  */
 949
 950                 RT_CACHE_STAT_INC(gc_goal_miss);
 951
 952                 if (expire == 0)
 953                         break;
 954
 955                 expire >>= 1;
 956 #if RT_CACHE_DEBUG >= 2
 957                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 958                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 959 #endif
 960
 961                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 962                         goto out;
 963         } while (!in_softirq() && time_before_eq(jiffies, now));
 964
 965         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 966                 goto out;
 967         if (net_ratelimit())
 968                 printk(KERN_WARNING "dst cache overflow\n");
 969         RT_CACHE_STAT_INC(gc_dst_overflow);
 970         return 1;
 971
 972 work_done:
 973         expire += ip_rt_gc_min_interval;
 974         if (expire > ip_rt_gc_timeout ||
 975             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 976                 expire = ip_rt_gc_timeout;
 977 #if RT_CACHE_DEBUG >= 2
 978         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 979                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 980 #endif
 981 out:    return 0;
 982 }
 983
 984 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 985 {
 986         struct rtable   *rth, **rthp;
 987         unsigned long   now;
 988         struct rtable *cand, **candp;
 989         u32             min_score;
 990         int             chain_length;
 991         int attempts = !in_softirq();
 992
 993 restart:
 994         chain_length = 0;
 995         min_score = ~(u32)0;
 996         cand = NULL;
 997         candp = NULL;
 998         now = jiffies;
 999
1000         rthp = &rt_hash_table[hash].chain;
1001
1002         spin_lock_bh(rt_hash_lock_addr(hash));
1003         while ((rth = *rthp) != NULL) {
1004                 if (rt_is_expired(rth)) {
1005                         *rthp = rth->u.dst.rt_next;
1006                         rt_free(rth);
1007                         continue;
1008                 }
1009                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1010                         /* Put it first */
1011                         *rthp = rth->u.dst.rt_next;
1012                         /*
1013                          * Since lookup is lockfree, the deletion
1014                          * must be visible to another weakly ordered CPU before
1015                          * the insertion at the start of the hash chain.
1016                          */
1017                         rcu_assign_pointer(rth->u.dst.rt_next,
1018                                            rt_hash_table[hash].chain);
1019                         /*
1020                          * Since lookup is lockfree, the update writes
1021                          * must be ordered for consistency on SMP.
1022                          */
1023                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1024
1025                         dst_use(&rth->u.dst, now);
1026                         spin_unlock_bh(rt_hash_lock_addr(hash));
1027
1028                         rt_drop(rt);
1029                         *rp = rth;
1030                         return 0;
1031                 }
1032
1033                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1034                         u32 score = rt_score(rth);
1035
1036                         if (score <= min_score) {
1037                                 cand = rth;
1038                                 candp = rthp;
1039                                 min_score = score;
1040                         }
1041                 }
1042
1043                 chain_length++;
1044
1045                 rthp = &rth->u.dst.rt_next;
1046         }
1047
1048         if (cand) {
1049                 /* ip_rt_gc_elasticity used to be average length of chain
1050                  * length, when exceeded gc becomes really aggressive.
1051                  *
1052                  * The second limit is less certain. At the moment it allows
1053                  * only 2 entries per bucket. We will see.
1054                  */
1055                 if (chain_length > ip_rt_gc_elasticity) {
1056                         *candp = cand->u.dst.rt_next;
1057                         rt_free(cand);
1058                 }
1059         }
1060
1061         /* Try to bind route to arp only if it is output
1062            route or unicast forwarding path.
1063          */
1064         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1065                 int err = arp_bind_neighbour(&rt->u.dst);
1066                 if (err) {
1067                         spin_unlock_bh(rt_hash_lock_addr(hash));
1068
1069                         if (err != -ENOBUFS) {
1070                                 rt_drop(rt);
1071                                 return err;
1072                         }
1073
1074                         /* Neighbour tables are full and nothing
1075                            can be released. Try to shrink route cache,
1076                            it is most likely it holds some neighbour records.
1077                          */
1078                         if (attempts-- > 0) {
1079                                 int saved_elasticity = ip_rt_gc_elasticity;
1080                                 int saved_int = ip_rt_gc_min_interval;
1081                                 ip_rt_gc_elasticity     = 1;
1082                                 ip_rt_gc_min_interval   = 0;
1083                                 rt_garbage_collect(&ipv4_dst_ops);
1084                                 ip_rt_gc_min_interval   = saved_int;
1085                                 ip_rt_gc_elasticity     = saved_elasticity;
1086                                 goto restart;
1087                         }
1088
1089                         if (net_ratelimit())
1090                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1091                         rt_drop(rt);
1092                         return -ENOBUFS;
1093                 }
1094         }
1095
1096         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1097 #if RT_CACHE_DEBUG >= 2
1098         if (rt->u.dst.rt_next) {
1099                 struct rtable *trt;
1100                 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1101                        NIPQUAD(rt->rt_dst));
1102                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1103                         printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1104                 printk("\n");
1105         }
1106 #endif
1107         rt_hash_table[hash].chain = rt;
1108         spin_unlock_bh(rt_hash_lock_addr(hash));
1109         *rp = rt;
1110         return 0;
1111 }
1112
1113 void rt_bind_peer(struct rtable *rt, int create)
1114 {
1115         static DEFINE_SPINLOCK(rt_peer_lock);
1116         struct inet_peer *peer;
1117
1118         peer = inet_getpeer(rt->rt_dst, create);
1119
1120         spin_lock_bh(&rt_peer_lock);
1121         if (rt->peer == NULL) {
1122                 rt->peer = peer;
1123                 peer = NULL;
1124         }
1125         spin_unlock_bh(&rt_peer_lock);
1126         if (peer)
1127                 inet_putpeer(peer);
1128 }
1129
1130 /*
1131  * Peer allocation may fail only in serious out-of-memory conditions.  However
1132  * we still can generate some output.
1133  * Random ID selection looks a bit dangerous because we have no chances to
1134  * select ID being unique in a reasonable period of time.
1135  * But broken packet identifier may be better than no packet at all.
1136  */
1137 static void ip_select_fb_ident(struct iphdr *iph)
1138 {
1139         static DEFINE_SPINLOCK(ip_fb_id_lock);
1140         static u32 ip_fallback_id;
1141         u32 salt;
1142
1143         spin_lock_bh(&ip_fb_id_lock);
1144         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1145         iph->id = htons(salt & 0xFFFF);
1146         ip_fallback_id = salt;
1147         spin_unlock_bh(&ip_fb_id_lock);
1148 }
1149
1150 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1151 {
1152         struct rtable *rt = (struct rtable *) dst;
1153
1154         if (rt) {
1155                 if (rt->peer == NULL)
1156                         rt_bind_peer(rt, 1);
1157
1158                 /* If peer is attached to destination, it is never detached,
1159                    so that we need not to grab a lock to dereference it.
1160                  */
1161                 if (rt->peer) {
1162                         iph->id = htons(inet_getid(rt->peer, more));
1163                         return;
1164                 }
1165         } else
1166                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1167                        __builtin_return_address(0));
1168
1169         ip_select_fb_ident(iph);
1170 }
1171
1172 static void rt_del(unsigned hash, struct rtable *rt)
1173 {
1174         struct rtable **rthp, *aux;
1175
1176         rthp = &rt_hash_table[hash].chain;
1177         spin_lock_bh(rt_hash_lock_addr(hash));
1178         ip_rt_put(rt);
1179         while ((aux = *rthp) != NULL) {
1180                 if (aux == rt || rt_is_expired(aux)) {
1181                         *rthp = aux->u.dst.rt_next;
1182                         rt_free(aux);
1183                         continue;
1184                 }
1185                 rthp = &aux->u.dst.rt_next;
1186         }
1187         spin_unlock_bh(rt_hash_lock_addr(hash));
1188 }
1189
/*
 * ip_rt_redirect - act on a redirect that moves traffic to a new gateway
 * @old_gw: gateway the cached route currently uses
 * @daddr:  destination the redirect is about
 * @new_gw: gateway advised by the redirect
 * @saddr:  source address of the affected flow
 * @dev:    device the redirect applies to
 *
 * After validating @new_gw, the cache is searched under four key
 * combinations (source @saddr or 0, output interface @dev or 0).  For a
 * matching output route through @old_gw on @dev, a copy is allocated with
 * the gateway replaced by @new_gw and RTCF_REDIRECTED set; the old entry is
 * deleted from the bucket and the copy is interned in its place.  Rejected
 * redirects are optionally logged under CONFIG_IP_ROUTE_VERBOSE.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        __be32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;
        struct net *net;

        if (!in_dev)
                return;

        net = dev_net(dev);
        /* Basic sanity of the advised gateway and the device policy. */
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
            || ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
                                                rt_genid(net));

                        rthp=&rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                /* Skip entries whose lookup key, generation
                                 * or namespace does not match.
                                 */
                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0 ||
                                    rt_is_expired(rth) ||
                                    !net_eq(dev_net(rth->u.dst.dev), net)) {
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }

                                /* Key matched but the route is not the one
                                 * the redirect talks about: give up on this
                                 * bucket.
                                 */
                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                /* Pin the old entry before leaving the RCU
                                 * read-side section.
                                 */
                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                /* The struct copy duplicated pointers and
                                 * counters; re-initialise the per-entry
                                 * state and take references for the copy.
                                 */
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;
                                rt->rt_genid            = rt_genid(net);
                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                /* The new gateway must resolve to a valid
                                 * neighbour; otherwise kick resolution and
                                 * discard the copy.
                                 */
                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                /* rt_del() also releases the reference taken
                                 * with dst_hold() above.
                                 */
                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
                        NIPQUAD_FMT " ignored.\n"
                        "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
        in_dev_put(in_dev);
}
1325
1326 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1327 {
1328         struct rtable *rt = (struct rtable *)dst;
1329         struct dst_entry *ret = dst;
1330
1331         if (rt) {
1332                 if (dst->obsolete) {
1333                         ip_rt_put(rt);
1334                         ret = NULL;
1335                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1336                            rt->u.dst.expires) {
1337                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1338                                                 rt->fl.oif,
1339                                                 rt_genid(dev_net(dst->dev)));
1340 #if RT_CACHE_DEBUG >= 1
1341                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1342                                           NIPQUAD_FMT "/%02x dropped\n",
1343                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1344 #endif
1345                         rt_del(hash, rt);
1346                         ret = NULL;
1347                 }
1348         }
1349         return ret;
1350 }
1351
1352 /*
1353  * Algorithm:
1354  *      1. The first ip_rt_redirect_number redirects are sent
1355  *         with exponential backoff, then we stop sending them at all,
1356  *         assuming that the host ignores our redirects.
1357  *      2. If we did not see packets requiring redirects
1358  *         during ip_rt_redirect_silence, we assume that the host
1359  *         forgot redirected route and start to send redirects again.
1360  *
1361  * This algorithm is much cheaper and more intelligent than dumb load limiting
1362  * in icmp.c.
1363  *
1364  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1365  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1366  */
1367
1368 void ip_rt_send_redirect(struct sk_buff *skb)
1369 {
1370         struct rtable *rt = skb->rtable;
1371         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1372
1373         if (!in_dev)
1374                 return;
1375
1376         if (!IN_DEV_TX_REDIRECTS(in_dev))
1377                 goto out;
1378
1379         /* No redirected packets during ip_rt_redirect_silence;
1380          * reset the algorithm.
1381          */
1382         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1383                 rt->u.dst.rate_tokens = 0;
1384
1385         /* Too many ignored redirects; do not send anything
1386          * set u.dst.rate_last to the last seen redirected packet.
1387          */
1388         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1389                 rt->u.dst.rate_last = jiffies;
1390                 goto out;
1391         }
1392
1393         /* Check for load limit; set rate_last to the latest sent
1394          * redirect.
1395          */
1396         if (rt->u.dst.rate_tokens == 0 ||
1397             time_after(jiffies,
1398                        (rt->u.dst.rate_last +
1399                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1400                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1401                 rt->u.dst.rate_last = jiffies;
1402                 ++rt->u.dst.rate_tokens;
1403 #ifdef CONFIG_IP_ROUTE_VERBOSE
1404                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1405                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1406                     net_ratelimit())
1407                         printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1408                                 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1409                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1410                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1411 #endif
1412         }
1413 out:
1414         in_dev_put(in_dev);
1415 }
1416
1417 static int ip_error(struct sk_buff *skb)
1418 {
1419         struct rtable *rt = skb->rtable;
1420         unsigned long now;
1421         int code;
1422
1423         switch (rt->u.dst.error) {
1424                 case EINVAL:
1425                 default:
1426                         goto out;
1427                 case EHOSTUNREACH:
1428                         code = ICMP_HOST_UNREACH;
1429                         break;
1430                 case ENETUNREACH:
1431                         code = ICMP_NET_UNREACH;
1432                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1433                         break;
1434                 case EACCES:
1435                         code = ICMP_PKT_FILTERED;
1436                         break;
1437         }
1438
1439         now = jiffies;
1440         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1441         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1442                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1443         rt->u.dst.rate_last = now;
1444         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1445                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1446                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1447         }
1448
1449 out:    kfree_skb(skb);
1450         return 0;
1451 }
1452
/*
 *      MTU plateau table, largest first.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
        32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when @old_mtu
 * is not above the smallest table entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *step = mtu_plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        while (step < end) {
                if (*step < old_mtu)
                        return *step;
                step++;
        }
        return 68;
}
1470
1471 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1472                                  unsigned short new_mtu,
1473                                  struct net_device *dev)
1474 {
1475         int i, k;
1476         unsigned short old_mtu = ntohs(iph->tot_len);
1477         struct rtable *rth;
1478         int  ikeys[2] = { dev->ifindex, 0 };
1479         __be32  skeys[2] = { iph->saddr, 0, };
1480         __be32  daddr = iph->daddr;
1481         unsigned short est_mtu = 0;
1482
1483         if (ipv4_config.no_pmtu_disc)
1484                 return 0;
1485
1486         for (k = 0; k < 2; k++) {
1487                 for (i = 0; i < 2; i++) {
1488                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1489                                                 rt_genid(net));
1490
1491                         rcu_read_lock();
1492                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1493                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1494                                 unsigned short mtu = new_mtu;
1495
1496                                 if (rth->fl.fl4_dst != daddr ||
1497                                     rth->fl.fl4_src != skeys[i] ||
1498                                     rth->rt_dst != daddr ||
1499                                     rth->rt_src != iph->saddr ||
1500                                     rth->fl.oif != ikeys[k] ||
1501                                     rth->fl.iif != 0 ||
1502                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1503                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1504                                     !rt_is_expired(rth))
1505                                         continue;
1506
1507                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1508
1509                                         /* BSD 4.2 compatibility hack :-( */
1510                                         if (mtu == 0 &&
1511                                             old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1512                                             old_mtu >= 68 + (iph->ihl << 2))
1513                                                 old_mtu -= iph->ihl << 2;
1514
1515                                         mtu = guess_mtu(old_mtu);
1516                                 }
1517                                 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1518                                         if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1519                                                 dst_confirm(&rth->u.dst);
1520                                                 if (mtu < ip_rt_min_pmtu) {
1521                                                         mtu = ip_rt_min_pmtu;
1522                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1523                                                                 (1 << RTAX_MTU);
1524                                                 }
1525                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1526                                                 dst_set_expires(&rth->u.dst,
1527                                                         ip_rt_mtu_expires);
1528                                         }
1529                                         est_mtu = mtu;
1530                                 }
1531                         }
1532                         rcu_read_unlock();
1533                 }
1534         }
1535         return est_mtu ? : new_mtu;
1536 }
1537
1538 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1539 {
1540         if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1541             !(dst_metric_locked(dst, RTAX_MTU))) {
1542                 if (mtu < ip_rt_min_pmtu) {
1543                         mtu = ip_rt_min_pmtu;
1544                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1545                 }
1546                 dst->metrics[RTAX_MTU-1] = mtu;
1547                 dst_set_expires(dst, ip_rt_mtu_expires);
1548                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1549         }
1550 }
1551
/*
 * Always returns NULL; neither @dst nor @cookie is examined.
 * NOTE(review): presumably the validity-check hook of ipv4_dst_ops, so a
 * NULL result would mean "do not reuse this entry" - confirm at the ops
 * table definition.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}
1556
1557 static void ipv4_dst_destroy(struct dst_entry *dst)
1558 {
1559         struct rtable *rt = (struct rtable *) dst;
1560         struct inet_peer *peer = rt->peer;
1561         struct in_device *idev = rt->idev;
1562
1563         if (peer) {
1564                 rt->peer = NULL;
1565                 inet_putpeer(peer);
1566         }
1567
1568         if (idev) {
1569                 rt->idev = NULL;
1570                 in_dev_put(idev);
1571         }
1572 }
1573
1574 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1575                             int how)
1576 {
1577         struct rtable *rt = (struct rtable *) dst;
1578         struct in_device *idev = rt->idev;
1579         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1580                 struct in_device *loopback_idev =
1581                         in_dev_get(dev_net(dev)->loopback_dev);
1582                 if (loopback_idev) {
1583                         rt->idev = loopback_idev;
1584                         in_dev_put(idev);
1585                 }
1586         }
1587 }
1588
1589 static void ipv4_link_failure(struct sk_buff *skb)
1590 {
1591         struct rtable *rt;
1592
1593         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1594
1595         rt = skb->rtable;
1596         if (rt)
1597                 dst_set_expires(&rt->u.dst, 0);
1598 }
1599
1600 static int ip_rt_bug(struct sk_buff *skb)
1601 {
1602         printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1603                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1604                 skb->dev ? skb->dev->name : "?");
1605         kfree_skb(skb);
1606         return 0;
1607 }
1608
1609 /*
1610    We do not cache source address of outgoing interface,
1611    because it is used only by IP RR, TS and SRR options,
1612    so that it out of fast path.
1613
1614    BTW remember: "addr" is allowed to be not aligned
1615    in IP options!
1616  */
1617
1618 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1619 {
1620         __be32 src;
1621         struct fib_result res;
1622
1623         if (rt->fl.iif == 0)
1624                 src = rt->rt_src;
1625         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1626                 src = FIB_RES_PREFSRC(res);
1627                 fib_res_put(&res);
1628         } else
1629                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1630                                         RT_SCOPE_UNIVERSE);
1631         memcpy(addr, &src, 4);
1632 }
1633
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time.
 *
 * A half of tclassid that is already non-zero is left alone; a half that
 * is still zero receives the corresponding half of @tag.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 classid = rt->u.dst.tclassid;
	u32 fill = 0;

	if ((classid & 0xFFFF) == 0)
		fill |= tag & 0xFFFF;
	if ((classid & 0xFFFF0000) == 0)
		fill |= tag & 0xFFFF0000;

	if (fill)
		rt->u.dst.tclassid |= fill;
}
#endif
1643
1644 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1645 {
1646         struct fib_info *fi = res->fi;
1647
1648         if (fi) {
1649                 if (FIB_RES_GW(*res) &&
1650                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1651                         rt->rt_gateway = FIB_RES_GW(*res);
1652                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1653                        sizeof(rt->u.dst.metrics));
1654                 if (fi->fib_mtu == 0) {
1655                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1656                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1657                             rt->rt_gateway != rt->rt_dst &&
1658                             rt->u.dst.dev->mtu > 576)
1659                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1660                 }
1661 #ifdef CONFIG_NET_CLS_ROUTE
1662                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1663 #endif
1664         } else
1665                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1666
1667         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1668                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1669         if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1670                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1671         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1672                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1673                                        ip_rt_min_advmss);
1674         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1675                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1676
1677 #ifdef CONFIG_NET_CLS_ROUTE
1678 #ifdef CONFIG_IP_MULTIPLE_TABLES
1679         set_class_tag(rt, fib_rules_tclass(res));
1680 #endif
1681         set_class_tag(rt, itag);
1682 #endif
1683         rt->rt_type = res->type;
1684 }
1685
1686 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1687                                 u8 tos, struct net_device *dev, int our)
1688 {
1689         unsigned hash;
1690         struct rtable *rth;
1691         __be32 spec_dst;
1692         struct in_device *in_dev = in_dev_get(dev);
1693         u32 itag = 0;
1694
1695         /* Primary sanity checks. */
1696
1697         if (in_dev == NULL)
1698                 return -EINVAL;
1699
1700         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1701             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1702                 goto e_inval;
1703
1704         if (ipv4_is_zeronet(saddr)) {
1705                 if (!ipv4_is_local_multicast(daddr))
1706                         goto e_inval;
1707                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1708         } else if (fib_validate_source(saddr, 0, tos, 0,
1709                                         dev, &spec_dst, &itag) < 0)
1710                 goto e_inval;
1711
1712         rth = dst_alloc(&ipv4_dst_ops);
1713         if (!rth)
1714                 goto e_nobufs;
1715
1716         rth->u.dst.output= ip_rt_bug;
1717
1718         atomic_set(&rth->u.dst.__refcnt, 1);
1719         rth->u.dst.flags= DST_HOST;
1720         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1721                 rth->u.dst.flags |= DST_NOPOLICY;
1722         rth->fl.fl4_dst = daddr;
1723         rth->rt_dst     = daddr;
1724         rth->fl.fl4_tos = tos;
1725         rth->fl.mark    = skb->mark;
1726         rth->fl.fl4_src = saddr;
1727         rth->rt_src     = saddr;
1728 #ifdef CONFIG_NET_CLS_ROUTE
1729         rth->u.dst.tclassid = itag;
1730 #endif
1731         rth->rt_iif     =
1732         rth->fl.iif     = dev->ifindex;
1733         rth->u.dst.dev  = init_net.loopback_dev;
1734         dev_hold(rth->u.dst.dev);
1735         rth->idev       = in_dev_get(rth->u.dst.dev);
1736         rth->fl.oif     = 0;
1737         rth->rt_gateway = daddr;
1738         rth->rt_spec_dst= spec_dst;
1739         rth->rt_genid   = rt_genid(dev_net(dev));
1740         rth->rt_flags   = RTCF_MULTICAST;
1741         rth->rt_type    = RTN_MULTICAST;
1742         if (our) {
1743                 rth->u.dst.input= ip_local_deliver;
1744                 rth->rt_flags |= RTCF_LOCAL;
1745         }
1746
1747 #ifdef CONFIG_IP_MROUTE
1748         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1749                 rth->u.dst.input = ip_mr_input;
1750 #endif
1751         RT_CACHE_STAT_INC(in_slow_mc);
1752
1753         in_dev_put(in_dev);
1754         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1755         return rt_intern_hash(hash, rth, &skb->rtable);
1756
1757 e_nobufs:
1758         in_dev_put(in_dev);
1759         return -ENOBUFS;
1760
1761 e_inval:
1762         in_dev_put(in_dev);
1763         return -EINVAL;
1764 }
1765
1766
1767 static void ip_handle_martian_source(struct net_device *dev,
1768                                      struct in_device *in_dev,
1769                                      struct sk_buff *skb,
1770                                      __be32 daddr,
1771                                      __be32 saddr)
1772 {
1773         RT_CACHE_STAT_INC(in_martian_src);
1774 #ifdef CONFIG_IP_ROUTE_VERBOSE
1775         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1776                 /*
1777                  *      RFC1812 recommendation, if source is martian,
1778                  *      the only hint is MAC header.
1779                  */
1780                 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1781                         NIPQUAD_FMT", on dev %s\n",
1782                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1783                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1784                         int i;
1785                         const unsigned char *p = skb_mac_header(skb);
1786                         printk(KERN_WARNING "ll header: ");
1787                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1788                                 printk("%02x", *p);
1789                                 if (i < (dev->hard_header_len - 1))
1790                                         printk(":");
1791                         }
1792                         printk("\n");
1793                 }
1794         }
1795 #endif
1796 }
1797
1798 static int __mkroute_input(struct sk_buff *skb,
1799                            struct fib_result *res,
1800                            struct in_device *in_dev,
1801                            __be32 daddr, __be32 saddr, u32 tos,
1802                            struct rtable **result)
1803 {
1804
1805         struct rtable *rth;
1806         int err;
1807         struct in_device *out_dev;
1808         unsigned flags = 0;
1809         __be32 spec_dst;
1810         u32 itag;
1811
1812         /* get a working reference to the output device */
1813         out_dev = in_dev_get(FIB_RES_DEV(*res));
1814         if (out_dev == NULL) {
1815                 if (net_ratelimit())
1816                         printk(KERN_CRIT "Bug in ip_route_input" \
1817                                "_slow(). Please, report\n");
1818                 return -EINVAL;
1819         }
1820
1821
1822         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1823                                   in_dev->dev, &spec_dst, &itag);
1824         if (err < 0) {
1825                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1826                                          saddr);
1827
1828                 err = -EINVAL;
1829                 goto cleanup;
1830         }
1831
1832         if (err)
1833                 flags |= RTCF_DIRECTSRC;
1834
1835         if (out_dev == in_dev && err &&
1836             (IN_DEV_SHARED_MEDIA(out_dev) ||
1837              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1838                 flags |= RTCF_DOREDIRECT;
1839
1840         if (skb->protocol != htons(ETH_P_IP)) {
1841                 /* Not IP (i.e. ARP). Do not create route, if it is
1842                  * invalid for proxy arp. DNAT routes are always valid.
1843                  */
1844                 if (out_dev == in_dev) {
1845                         err = -EINVAL;
1846                         goto cleanup;
1847                 }
1848         }
1849
1850
1851         rth = dst_alloc(&ipv4_dst_ops);
1852         if (!rth) {
1853                 err = -ENOBUFS;
1854                 goto cleanup;
1855         }
1856
1857         atomic_set(&rth->u.dst.__refcnt, 1);
1858         rth->u.dst.flags= DST_HOST;
1859         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1860                 rth->u.dst.flags |= DST_NOPOLICY;
1861         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1862                 rth->u.dst.flags |= DST_NOXFRM;
1863         rth->fl.fl4_dst = daddr;
1864         rth->rt_dst     = daddr;
1865         rth->fl.fl4_tos = tos;
1866         rth->fl.mark    = skb->mark;
1867         rth->fl.fl4_src = saddr;
1868         rth->rt_src     = saddr;
1869         rth->rt_gateway = daddr;
1870         rth->rt_iif     =
1871                 rth->fl.iif     = in_dev->dev->ifindex;
1872         rth->u.dst.dev  = (out_dev)->dev;
1873         dev_hold(rth->u.dst.dev);
1874         rth->idev       = in_dev_get(rth->u.dst.dev);
1875         rth->fl.oif     = 0;
1876         rth->rt_spec_dst= spec_dst;
1877
1878         rth->u.dst.input = ip_forward;
1879         rth->u.dst.output = ip_output;
1880         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1881
1882         rt_set_nexthop(rth, res, itag);
1883
1884         rth->rt_flags = flags;
1885
1886         *result = rth;
1887         err = 0;
1888  cleanup:
1889         /* release the working reference to the output device */
1890         in_dev_put(out_dev);
1891         return err;
1892 }
1893
1894 static int ip_mkroute_input(struct sk_buff *skb,
1895                             struct fib_result *res,
1896                             const struct flowi *fl,
1897                             struct in_device *in_dev,
1898                             __be32 daddr, __be32 saddr, u32 tos)
1899 {
1900         struct rtable* rth = NULL;
1901         int err;
1902         unsigned hash;
1903
1904 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1905         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1906                 fib_select_multipath(fl, res);
1907 #endif
1908
1909         /* create a routing cache entry */
1910         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1911         if (err)
1912                 return err;
1913
1914         /* put it into the cache */
1915         hash = rt_hash(daddr, saddr, fl->iif,
1916                        rt_genid(dev_net(rth->u.dst.dev)));
1917         return rt_intern_hash(hash, rth, &skb->rtable);
1918 }
1919
1920 /*
1921  *      NOTE. We drop all the packets that has local source
1922  *      addresses, because every properly looped back packet
1923  *      must have correct destination already attached by output routine.
1924  *
1925  *      Such approach solves two big problems:
1926  *      1. Not simplex devices are handled properly.
1927  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1928  */
1929
1930 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1931                                u8 tos, struct net_device *dev)
1932 {
1933         struct fib_result res;
1934         struct in_device *in_dev = in_dev_get(dev);
1935         struct flowi fl = { .nl_u = { .ip4_u =
1936                                       { .daddr = daddr,
1937                                         .saddr = saddr,
1938                                         .tos = tos,
1939                                         .scope = RT_SCOPE_UNIVERSE,
1940                                       } },
1941                             .mark = skb->mark,
1942                             .iif = dev->ifindex };
1943         unsigned        flags = 0;
1944         u32             itag = 0;
1945         struct rtable * rth;
1946         unsigned        hash;
1947         __be32          spec_dst;
1948         int             err = -EINVAL;
1949         int             free_res = 0;
1950         struct net    * net = dev_net(dev);
1951
1952         /* IP on this device is disabled. */
1953
1954         if (!in_dev)
1955                 goto out;
1956
1957         /* Check for the most weird martians, which can be not detected
1958            by fib_lookup.
1959          */
1960
1961         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1962             ipv4_is_loopback(saddr))
1963                 goto martian_source;
1964
1965         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1966                 goto brd_input;
1967
1968         /* Accept zero addresses only to limited broadcast;
1969          * I even do not know to fix it or not. Waiting for complains :-)
1970          */
1971         if (ipv4_is_zeronet(saddr))
1972                 goto martian_source;
1973
1974         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1975             ipv4_is_loopback(daddr))
1976                 goto martian_destination;
1977
1978         /*
1979          *      Now we are ready to route packet.
1980          */
1981         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1982                 if (!IN_DEV_FORWARD(in_dev))
1983                         goto e_hostunreach;
1984                 goto no_route;
1985         }
1986         free_res = 1;
1987
1988         RT_CACHE_STAT_INC(in_slow_tot);
1989
1990         if (res.type == RTN_BROADCAST)
1991                 goto brd_input;
1992
1993         if (res.type == RTN_LOCAL) {
1994                 int result;
1995                 result = fib_validate_source(saddr, daddr, tos,
1996                                              net->loopback_dev->ifindex,
1997                                              dev, &spec_dst, &itag);
1998                 if (result < 0)
1999                         goto martian_source;
2000                 if (result)
2001                         flags |= RTCF_DIRECTSRC;
2002                 spec_dst = daddr;
2003                 goto local_input;
2004         }
2005
2006         if (!IN_DEV_FORWARD(in_dev))
2007                 goto e_hostunreach;
2008         if (res.type != RTN_UNICAST)
2009                 goto martian_destination;
2010
2011         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2012 done:
2013         in_dev_put(in_dev);
2014         if (free_res)
2015                 fib_res_put(&res);
2016 out:    return err;
2017
2018 brd_input:
2019         if (skb->protocol != htons(ETH_P_IP))
2020                 goto e_inval;
2021
2022         if (ipv4_is_zeronet(saddr))
2023                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2024         else {
2025                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2026                                           &itag);
2027                 if (err < 0)
2028                         goto martian_source;
2029                 if (err)
2030                         flags |= RTCF_DIRECTSRC;
2031         }
2032         flags |= RTCF_BROADCAST;
2033         res.type = RTN_BROADCAST;
2034         RT_CACHE_STAT_INC(in_brd);
2035
2036 local_input:
2037         rth = dst_alloc(&ipv4_dst_ops);
2038         if (!rth)
2039                 goto e_nobufs;
2040
2041         rth->u.dst.output= ip_rt_bug;
2042         rth->rt_genid = rt_genid(net);
2043
2044         atomic_set(&rth->u.dst.__refcnt, 1);
2045         rth->u.dst.flags= DST_HOST;
2046         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2047                 rth->u.dst.flags |= DST_NOPOLICY;
2048         rth->fl.fl4_dst = daddr;
2049         rth->rt_dst     = daddr;
2050         rth->fl.fl4_tos = tos;
2051         rth->fl.mark    = skb->mark;
2052         rth->fl.fl4_src = saddr;
2053         rth->rt_src     = saddr;
2054 #ifdef CONFIG_NET_CLS_ROUTE
2055         rth->u.dst.tclassid = itag;
2056 #endif
2057         rth->rt_iif     =
2058         rth->fl.iif     = dev->ifindex;
2059         rth->u.dst.dev  = net->loopback_dev;
2060         dev_hold(rth->u.dst.dev);
2061         rth->idev       = in_dev_get(rth->u.dst.dev);
2062         rth->rt_gateway = daddr;
2063         rth->rt_spec_dst= spec_dst;
2064         rth->u.dst.input= ip_local_deliver;
2065         rth->rt_flags   = flags|RTCF_LOCAL;
2066         if (res.type == RTN_UNREACHABLE) {
2067                 rth->u.dst.input= ip_error;
2068                 rth->u.dst.error= -err;
2069                 rth->rt_flags   &= ~RTCF_LOCAL;
2070         }
2071         rth->rt_type    = res.type;
2072         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2073         err = rt_intern_hash(hash, rth, &skb->rtable);
2074         goto done;
2075
2076 no_route:
2077         RT_CACHE_STAT_INC(in_no_route);
2078         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2079         res.type = RTN_UNREACHABLE;
2080         if (err == -ESRCH)
2081                 err = -ENETUNREACH;
2082         goto local_input;
2083
2084         /*
2085          *      Do not cache martian addresses: they should be logged (RFC1812)
2086          */
2087 martian_destination:
2088         RT_CACHE_STAT_INC(in_martian_dst);
2089 #ifdef CONFIG_IP_ROUTE_VERBOSE
2090         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2091                 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2092                         NIPQUAD_FMT ", dev %s\n",
2093                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2094 #endif
2095
2096 e_hostunreach:
2097         err = -EHOSTUNREACH;
2098         goto done;
2099
2100 e_inval:
2101         err = -EINVAL;
2102         goto done;
2103
2104 e_nobufs:
2105         err = -ENOBUFS;
2106         goto done;
2107
2108 martian_source:
2109         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2110         goto e_inval;
2111 }
2112
2113 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2114                    u8 tos, struct net_device *dev)
2115 {
2116         struct rtable * rth;
2117         unsigned        hash;
2118         int iif = dev->ifindex;
2119         struct net *net;
2120
2121         net = dev_net(dev);
2122         tos &= IPTOS_RT_MASK;
2123         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2124
2125         rcu_read_lock();
2126         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2127              rth = rcu_dereference(rth->u.dst.rt_next)) {
2128                 if (((rth->fl.fl4_dst ^ daddr) |
2129                      (rth->fl.fl4_src ^ saddr) |
2130                      (rth->fl.iif ^ iif) |
2131                      rth->fl.oif |
2132                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2133                     rth->fl.mark == skb->mark &&
2134                     net_eq(dev_net(rth->u.dst.dev), net) &&
2135                     !rt_is_expired(rth)) {
2136                         dst_use(&rth->u.dst, jiffies);
2137                         RT_CACHE_STAT_INC(in_hit);
2138                         rcu_read_unlock();
2139                         skb->rtable = rth;
2140                         return 0;
2141                 }
2142                 RT_CACHE_STAT_INC(in_hlist_search);
2143         }
2144         rcu_read_unlock();
2145
2146         /* Multicast recognition logic is moved from route cache to here.
2147            The problem was that too many Ethernet cards have broken/missing
2148            hardware multicast filters :-( As result the host on multicasting
2149            network acquires a lot of useless route cache entries, sort of
2150            SDR messages from all the world. Now we try to get rid of them.
2151            Really, provided software IP multicast filter is organized
2152            reasonably (at least, hashed), it does not result in a slowdown
2153            comparing with route cache reject entries.
2154            Note, that multicast routers are not affected, because
2155            route cache entry is created eventually.
2156          */
2157         if (ipv4_is_multicast(daddr)) {
2158                 struct in_device *in_dev;
2159
2160                 rcu_read_lock();
2161                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2162                         int our = ip_check_mc(in_dev, daddr, saddr,
2163                                 ip_hdr(skb)->protocol);
2164                         if (our
2165 #ifdef CONFIG_IP_MROUTE
2166                             || (!ipv4_is_local_multicast(daddr) &&
2167                                 IN_DEV_MFORWARD(in_dev))
2168 #endif
2169                             ) {
2170                                 rcu_read_unlock();
2171                                 return ip_route_input_mc(skb, daddr, saddr,
2172                                                          tos, dev, our);
2173                         }
2174                 }
2175                 rcu_read_unlock();
2176                 return -EINVAL;
2177         }
2178         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2179 }
2180
2181 static int __mkroute_output(struct rtable **result,
2182                             struct fib_result *res,
2183                             const struct flowi *fl,
2184                             const struct flowi *oldflp,
2185                             struct net_device *dev_out,
2186                             unsigned flags)
2187 {
2188         struct rtable *rth;
2189         struct in_device *in_dev;
2190         u32 tos = RT_FL_TOS(oldflp);
2191         int err = 0;
2192
2193         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2194                 return -EINVAL;
2195
2196         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2197                 res->type = RTN_BROADCAST;
2198         else if (ipv4_is_multicast(fl->fl4_dst))
2199                 res->type = RTN_MULTICAST;
2200         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2201                 return -EINVAL;
2202
2203         if (dev_out->flags & IFF_LOOPBACK)
2204                 flags |= RTCF_LOCAL;
2205
2206         /* get work reference to inet device */
2207         in_dev = in_dev_get(dev_out);
2208         if (!in_dev)
2209                 return -EINVAL;
2210
2211         if (res->type == RTN_BROADCAST) {
2212                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2213                 if (res->fi) {
2214                         fib_info_put(res->fi);
2215                         res->fi = NULL;
2216                 }
2217         } else if (res->type == RTN_MULTICAST) {
2218                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2219                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2220                                  oldflp->proto))
2221                         flags &= ~RTCF_LOCAL;
2222                 /* If multicast route do not exist use
2223                    default one, but do not gateway in this case.
2224                    Yes, it is hack.
2225                  */
2226                 if (res->fi && res->prefixlen < 4) {
2227                         fib_info_put(res->fi);
2228                         res->fi = NULL;
2229                 }
2230         }
2231
2232
2233         rth = dst_alloc(&ipv4_dst_ops);
2234         if (!rth) {
2235                 err = -ENOBUFS;
2236                 goto cleanup;
2237         }
2238
2239         atomic_set(&rth->u.dst.__refcnt, 1);
2240         rth->u.dst.flags= DST_HOST;
2241         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2242                 rth->u.dst.flags |= DST_NOXFRM;
2243         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2244                 rth->u.dst.flags |= DST_NOPOLICY;
2245
2246         rth->fl.fl4_dst = oldflp->fl4_dst;
2247         rth->fl.fl4_tos = tos;
2248         rth->fl.fl4_src = oldflp->fl4_src;
2249         rth->fl.oif     = oldflp->oif;
2250         rth->fl.mark    = oldflp->mark;
2251         rth->rt_dst     = fl->fl4_dst;
2252         rth->rt_src     = fl->fl4_src;
2253         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2254         /* get references to the devices that are to be hold by the routing
2255            cache entry */
2256         rth->u.dst.dev  = dev_out;
2257         dev_hold(dev_out);
2258         rth->idev       = in_dev_get(dev_out);
2259         rth->rt_gateway = fl->fl4_dst;
2260         rth->rt_spec_dst= fl->fl4_src;
2261
2262         rth->u.dst.output=ip_output;
2263         rth->rt_genid = rt_genid(dev_net(dev_out));
2264
2265         RT_CACHE_STAT_INC(out_slow_tot);
2266
2267         if (flags & RTCF_LOCAL) {
2268                 rth->u.dst.input = ip_local_deliver;
2269                 rth->rt_spec_dst = fl->fl4_dst;
2270         }
2271         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2272                 rth->rt_spec_dst = fl->fl4_src;
2273                 if (flags & RTCF_LOCAL &&
2274                     !(dev_out->flags & IFF_LOOPBACK)) {
2275                         rth->u.dst.output = ip_mc_output;
2276                         RT_CACHE_STAT_INC(out_slow_mc);
2277                 }
2278 #ifdef CONFIG_IP_MROUTE
2279                 if (res->type == RTN_MULTICAST) {
2280                         if (IN_DEV_MFORWARD(in_dev) &&
2281                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2282                                 rth->u.dst.input = ip_mr_input;
2283                                 rth->u.dst.output = ip_mc_output;
2284                         }
2285                 }
2286 #endif
2287         }
2288
2289         rt_set_nexthop(rth, res, 0);
2290
2291         rth->rt_flags = flags;
2292
2293         *result = rth;
2294  cleanup:
2295         /* release work reference to inet device */
2296         in_dev_put(in_dev);
2297
2298         return err;
2299 }
2300
2301 static int ip_mkroute_output(struct rtable **rp,
2302                              struct fib_result *res,
2303                              const struct flowi *fl,
2304                              const struct flowi *oldflp,
2305                              struct net_device *dev_out,
2306                              unsigned flags)
2307 {
2308         struct rtable *rth = NULL;
2309         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2310         unsigned hash;
2311         if (err == 0) {
2312                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2313                                rt_genid(dev_net(dev_out)));
2314                 err = rt_intern_hash(hash, rth, rp);
2315         }
2316
2317         return err;
2318 }
2319
2320 /*
2321  * Major route resolver routine.
2322  */
2323
2324 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2325                                 const struct flowi *oldflp)
2326 {
2327         u32 tos = RT_FL_TOS(oldflp);
2328         struct flowi fl = { .nl_u = { .ip4_u =
2329                                       { .daddr = oldflp->fl4_dst,
2330                                         .saddr = oldflp->fl4_src,
2331                                         .tos = tos & IPTOS_RT_MASK,
2332                                         .scope = ((tos & RTO_ONLINK) ?
2333                                                   RT_SCOPE_LINK :
2334                                                   RT_SCOPE_UNIVERSE),
2335                                       } },
2336                             .mark = oldflp->mark,
2337                             .iif = net->loopback_dev->ifindex,
2338                             .oif = oldflp->oif };
2339         struct fib_result res;
2340         unsigned flags = 0;
2341         struct net_device *dev_out = NULL;
2342         int free_res = 0;
2343         int err;
2344
2345
2346         res.fi          = NULL;
2347 #ifdef CONFIG_IP_MULTIPLE_TABLES
2348         res.r           = NULL;
2349 #endif
2350
2351         if (oldflp->fl4_src) {
2352                 err = -EINVAL;
2353                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2354                     ipv4_is_lbcast(oldflp->fl4_src) ||
2355                     ipv4_is_zeronet(oldflp->fl4_src))
2356                         goto out;
2357
2358                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2359                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2360                 if (dev_out == NULL)
2361                         goto out;
2362
2363                 /* I removed check for oif == dev_out->oif here.
2364                    It was wrong for two reasons:
2365                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2366                       is assigned to multiple interfaces.
2367                    2. Moreover, we are allowed to send packets with saddr
2368                       of another iface. --ANK
2369                  */
2370
2371                 if (oldflp->oif == 0
2372                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2373                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2374                         /* Special hack: user can direct multicasts
2375                            and limited broadcast via necessary interface
2376                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2377                            This hack is not just for fun, it allows
2378                            vic,vat and friends to work.
2379                            They bind socket to loopback, set ttl to zero
2380                            and expect that it will work.
2381                            From the viewpoint of routing cache they are broken,
2382                            because we are not allowed to build multicast path
2383                            with loopback source addr (look, routing cache
2384                            cannot know, that ttl is zero, so that packet
2385                            will not leave this host and route is valid).
2386                            Luckily, this hack is good workaround.
2387                          */
2388
2389                         fl.oif = dev_out->ifindex;
2390                         goto make_route;
2391                 }
2392                 if (dev_out)
2393                         dev_put(dev_out);
2394                 dev_out = NULL;
2395         }
2396
2397
2398         if (oldflp->oif) {
2399                 dev_out = dev_get_by_index(net, oldflp->oif);
2400                 err = -ENODEV;
2401                 if (dev_out == NULL)
2402                         goto out;
2403
2404                 /* RACE: Check return value of inet_select_addr instead. */
2405                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2406                         dev_put(dev_out);
2407                         goto out;       /* Wrong error code */
2408                 }
2409
2410                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2411                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2412                         if (!fl.fl4_src)
2413                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2414                                                               RT_SCOPE_LINK);
2415                         goto make_route;
2416                 }
2417                 if (!fl.fl4_src) {
2418                         if (ipv4_is_multicast(oldflp->fl4_dst))
2419                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2420                                                               fl.fl4_scope);
2421                         else if (!oldflp->fl4_dst)
2422                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2423                                                               RT_SCOPE_HOST);
2424                 }
2425         }
2426
2427         if (!fl.fl4_dst) {
2428                 fl.fl4_dst = fl.fl4_src;
2429                 if (!fl.fl4_dst)
2430                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2431                 if (dev_out)
2432                         dev_put(dev_out);
2433                 dev_out = net->loopback_dev;
2434                 dev_hold(dev_out);
2435                 fl.oif = net->loopback_dev->ifindex;
2436                 res.type = RTN_LOCAL;
2437                 flags |= RTCF_LOCAL;
2438                 goto make_route;
2439         }
2440
2441         if (fib_lookup(net, &fl, &res)) {
2442                 res.fi = NULL;
2443                 if (oldflp->oif) {
2444                         /* Apparently, routing tables are wrong. Assume,
2445                            that the destination is on link.
2446
2447                            WHY? DW.
2448                            Because we are allowed to send to iface
2449                            even if it has NO routes and NO assigned
2450                            addresses. When oif is specified, routing
2451                            tables are looked up with only one purpose:
2452                            to catch if destination is gatewayed, rather than
2453                            direct. Moreover, if MSG_DONTROUTE is set,
2454                            we send packet, ignoring both routing tables
2455                            and ifaddr state. --ANK
2456
2457
2458                            We could make it even if oif is unknown,
2459                            likely IPv6, but we do not.
2460                          */
2461
2462                         if (fl.fl4_src == 0)
2463                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2464                                                               RT_SCOPE_LINK);
2465                         res.type = RTN_UNICAST;
2466                         goto make_route;
2467                 }
2468                 if (dev_out)
2469                         dev_put(dev_out);
2470                 err = -ENETUNREACH;
2471                 goto out;
2472         }
2473         free_res = 1;
2474
2475         if (res.type == RTN_LOCAL) {
2476                 if (!fl.fl4_src)
2477                         fl.fl4_src = fl.fl4_dst;
2478                 if (dev_out)
2479                         dev_put(dev_out);
2480                 dev_out = net->loopback_dev;
2481                 dev_hold(dev_out);
2482                 fl.oif = dev_out->ifindex;
2483                 if (res.fi)
2484                         fib_info_put(res.fi);
2485                 res.fi = NULL;
2486                 flags |= RTCF_LOCAL;
2487                 goto make_route;
2488         }
2489
2490 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2491         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2492                 fib_select_multipath(&fl, &res);
2493         else
2494 #endif
2495         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2496                 fib_select_default(net, &fl, &res);
2497
2498         if (!fl.fl4_src)
2499                 fl.fl4_src = FIB_RES_PREFSRC(res);
2500
2501         if (dev_out)
2502                 dev_put(dev_out);
2503         dev_out = FIB_RES_DEV(res);
2504         dev_hold(dev_out);
2505         fl.oif = dev_out->ifindex;
2506
2507
2508 make_route:
2509         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2510
2511
2512         if (free_res)
2513                 fib_res_put(&res);
2514         if (dev_out)
2515                 dev_put(dev_out);
2516 out:    return err;
2517 }
2518
2519 int __ip_route_output_key(struct net *net, struct rtable **rp,
2520                           const struct flowi *flp)
2521 {
2522         unsigned hash;
2523         struct rtable *rth;
2524
2525         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2526
2527         rcu_read_lock_bh();
2528         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2529                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2530                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2531                     rth->fl.fl4_src == flp->fl4_src &&
2532                     rth->fl.iif == 0 &&
2533                     rth->fl.oif == flp->oif &&
2534                     rth->fl.mark == flp->mark &&
2535                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2536                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2537                     net_eq(dev_net(rth->u.dst.dev), net) &&
2538                     !rt_is_expired(rth)) {
2539                         dst_use(&rth->u.dst, jiffies);
2540                         RT_CACHE_STAT_INC(out_hit);
2541                         rcu_read_unlock_bh();
2542                         *rp = rth;
2543                         return 0;
2544                 }
2545                 RT_CACHE_STAT_INC(out_hlist_search);
2546         }
2547         rcu_read_unlock_bh();
2548
2549         return ip_route_output_slow(net, rp, flp);
2550 }
2551
2552 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2553
/*
 * update_pmtu hook for ipv4_dst_blackhole_ops: intentionally empty, so
 * both @dst and @mtu are ignored.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2557
/*
 * dst_ops used for the entries allocated by ipv4_dst_blackhole().
 * Entries are sized as struct rtable; the PMTU update hook is the empty
 * ipv4_rt_blackhole_update_pmtu().  The kmem cache pointer is not set
 * here; ip_rt_init() assigns it at boot.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                 =       AF_INET,
        .protocol               =       __constant_htons(ETH_P_IP),
        .destroy                =       ipv4_dst_destroy,
        .check                  =       ipv4_dst_check,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
        .entry_size             =       sizeof(struct rtable),
        .entries                =       ATOMIC_INIT(0),
};
2567
2568
2569 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2570 {
2571         struct rtable *ort = *rp;
2572         struct rtable *rt = (struct rtable *)
2573                 dst_alloc(&ipv4_dst_blackhole_ops);
2574
2575         if (rt) {
2576                 struct dst_entry *new = &rt->u.dst;
2577
2578                 atomic_set(&new->__refcnt, 1);
2579                 new->__use = 1;
2580                 new->input = dst_discard;
2581                 new->output = dst_discard;
2582                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2583
2584                 new->dev = ort->u.dst.dev;
2585                 if (new->dev)
2586                         dev_hold(new->dev);
2587
2588                 rt->fl = ort->fl;
2589
2590                 rt->idev = ort->idev;
2591                 if (rt->idev)
2592                         in_dev_hold(rt->idev);
2593                 rt->rt_genid = rt_genid(net);
2594                 rt->rt_flags = ort->rt_flags;
2595                 rt->rt_type = ort->rt_type;
2596                 rt->rt_dst = ort->rt_dst;
2597                 rt->rt_src = ort->rt_src;
2598                 rt->rt_iif = ort->rt_iif;
2599                 rt->rt_gateway = ort->rt_gateway;
2600                 rt->rt_spec_dst = ort->rt_spec_dst;
2601                 rt->peer = ort->peer;
2602                 if (rt->peer)
2603                         atomic_inc(&rt->peer->refcnt);
2604
2605                 dst_free(new);
2606         }
2607
2608         dst_release(&(*rp)->u.dst);
2609         *rp = rt;
2610         return (rt ? 0 : -ENOMEM);
2611 }
2612
2613 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2614                          struct sock *sk, int flags)
2615 {
2616         int err;
2617
2618         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2619                 return err;
2620
2621         if (flp->proto) {
2622                 if (!flp->fl4_src)
2623                         flp->fl4_src = (*rp)->rt_src;
2624                 if (!flp->fl4_dst)
2625                         flp->fl4_dst = (*rp)->rt_dst;
2626                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2627                                     flags ? XFRM_LOOKUP_WAIT : 0);
2628                 if (err == -EREMOTE)
2629                         err = ipv4_dst_blackhole(net, rp, flp);
2630
2631                 return err;
2632         }
2633
2634         return 0;
2635 }
2636
2637 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2638
2639 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2640 {
2641         return ip_route_output_flow(net, rp, flp, NULL, 0);
2642 }
2643
/*
 * Encode the cached route attached to @skb (skb->rtable) as a netlink
 * route message appended to @skb.
 *
 * @pid, @seq, @event and @flags are passed to nlmsg_put() for the
 * message header.  @nowait only affects how a non-positive result from
 * ipmr_get_route() is handled (CONFIG_IP_MROUTE).
 *
 * Returns the value of nlmsg_end() on success, -EMSGSIZE when the
 * message cannot be built (the partial message is cancelled), or 0 when
 * ipmr_get_route() returned 0 and @nowait is not set.
 *
 * NOTE(review): the NLA_PUT* macros have no visible error handling here;
 * presumably they jump to nla_put_failure when the skb is full - confirm
 * against the macro definitions.
 */
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                        int nowait, unsigned int flags)
{
        struct rtable *rt = skb->rtable;
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        long expires;
        u32 id = 0, ts = 0, tsage = 0, error;

        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        /* Fixed rtmsg header: a /32 destination reported from the main table. */
        r = nlmsg_data(nlh);
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = rt->fl.fl4_tos;
        r->rtm_table    = RT_TABLE_MAIN;
        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        /* Low 16 bits of rt_flags are masked out; the entry is always marked cloned. */
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;

        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

        /* A source is only reported when the flow key carries one. */
        if (rt->fl.fl4_src) {
                r->rtm_src_len = 32;
                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
        }
        if (rt->u.dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
        if (rt->u.dst.tclassid)
                NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
        /*
         * Preferred source: rt_spec_dst when the flow has an input
         * interface, otherwise rt_src if it differs from the flow source.
         */
        if (rt->fl.iif)
                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
        else if (rt->rt_src != rt->fl.fl4_src)
                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

        /* The gateway is omitted when it equals the destination. */
        if (rt->rt_dst != rt->rt_gateway)
                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
                goto nla_put_failure;

        /* Gather the values reported through rtnl_put_cacheinfo() below. */
        error = rt->u.dst.error;
        /* Expiry is reported relative to the current jiffies; 0 means none set. */
        expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
        if (rt->peer) {
                id = rt->peer->ip_id_count;
                if (rt->peer->tcp_ts_stamp) {
                        ts = rt->peer->tcp_ts;
                        tsage = get_seconds() - rt->peer->tcp_ts_stamp;
                }
        }

        if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
                __be32 dst = rt->rt_dst;

                /*
                 * Non-local multicast destination with multicast forwarding
                 * enabled: let ipmr_get_route() fill in the route.
                 * NOTE(review): the forwarding check reads init_net rather
                 * than the route's namespace - confirm this is intended.
                 */
                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
                    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
                        int err = ipmr_get_route(skb, r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                        /* Other errors are reported in the cacheinfo. */
                                        error = err;
                                }
                        }
                } else
#endif
                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
        }

        if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
                               expires, error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        /* Undo the partially built message. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2737
/*
 * RTM_GETROUTE handler (registered in ip_rt_init()): resolve the route
 * described by the request and unicast the answer back to the sender.
 *
 * With RTA_IIF set, the lookup is done as an input route for a dummy
 * skb on that device via ip_route_input(); otherwise an output lookup
 * is done with ip_route_output_key() using RTA_DST, RTA_SRC, RTA_OIF and
 * the request's TOS.  The resulting route is encoded by rt_fill_info()
 * as an RTM_NEWROUTE message.
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno:
 * the nlmsg_parse() error, -ENOBUFS when the reply skb cannot be
 * allocated, -ENODEV for an unknown input interface, or the lookup /
 * encoding error.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
        struct net *net = sock_net(in_skb->sk);
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct rtable *rt = NULL;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        struct sk_buff *skb;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        /* This skb doubles as the dummy packet for the lookup and as the reply. */
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers, this skb can pass
           through good chunk of routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
        ip_hdr(skb)->protocol = IPPROTO_ICMP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        /* Absent attributes default to 0. */
        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

        if (iif) {
                struct net_device *dev;

                dev = __dev_get_by_index(net, iif);
                if (dev == NULL) {
                        err = -ENODEV;
                        goto errout_free;
                }

                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                /* The input lookup is done with bottom halves disabled. */
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();

                /* A route that carries an error is reported as that error. */
                rt = skb->rtable;
                if (err == 0 && rt->u.dst.error)
                        err = -rt->u.dst.error;
        } else {
                struct flowi fl = {
                        .nl_u = {
                                .ip4_u = {
                                        .daddr = dst,
                                        .saddr = src,
                                        .tos = rtm->rtm_tos,
                                },
                        },
                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
                };
                err = ip_route_output_key(net, &rt, &fl);
        }

        if (err)
                goto errout_free;

        /* rt_fill_info() reads the route from skb->rtable. */
        skb->rtable = rt;
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
        return err;

errout_free:
        kfree_skb(skb);
        goto errout;
}
2828
/*
 * Netlink dump callback: append one RTM_NEWROUTE message per cached
 * route of the requesting socket's namespace to @skb.
 *
 * The dump position is kept in @cb: args[0] is the hash bucket and
 * args[1] the index within that bucket's chain.  Entries before the
 * saved index, entries of other namespaces and expired entries are
 * skipped.  The walk stops as soon as rt_fill_info() returns a
 * non-positive value, and the position reached is stored back in @cb.
 *
 * Returns skb->len.
 */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
        struct rtable *rt;
        int h, s_h;
        int idx, s_idx;
        struct net *net;

        net = sock_net(skb->sk);

        /* Restore the position saved by the previous call. */
        s_h = cb->args[0];
        if (s_h < 0)
                s_h = 0;
        s_idx = idx = cb->args[1];
        for (h = s_h; h <= rt_hash_mask; h++) {
                rcu_read_lock_bh();
                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
                        if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
                                continue;
                        if (rt_is_expired(rt))
                                continue;
                        /* Attach the route to the skb only while it is being encoded. */
                        skb->dst = dst_clone(&rt->u.dst);
                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
                                         1, NLM_F_MULTI) <= 0) {
                                dst_release(xchg(&skb->dst, NULL));
                                rcu_read_unlock_bh();
                                goto done;
                        }
                        dst_release(xchg(&skb->dst, NULL));
                }
                rcu_read_unlock_bh();
                /* The saved index only applies to the first bucket visited. */
                s_idx = 0;
        }

done:
        cb->args[0] = h;
        cb->args[1] = idx;
        return skb->len;
}
2869
2870 void ip_rt_multicast_event(struct in_device *in_dev)
2871 {
2872         rt_cache_flush(dev_net(in_dev->dev), 0);
2873 }
2874
2875 #ifdef CONFIG_SYSCTL
2876 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2877                                         struct file *filp, void __user *buffer,
2878                                         size_t *lenp, loff_t *ppos)
2879 {
2880         if (write) {
2881                 int flush_delay;
2882                 ctl_table ctl;
2883                 struct net *net;
2884
2885                 memcpy(&ctl, __ctl, sizeof(ctl));
2886                 ctl.data = &flush_delay;
2887                 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
2888
2889                 net = (struct net *)__ctl->extra1;
2890                 rt_cache_flush(net, flush_delay);
2891                 return 0;
2892         }
2893
2894         return -EINVAL;
2895 }
2896
2897 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2898                                                 int __user *name,
2899                                                 int nlen,
2900                                                 void __user *oldval,
2901                                                 size_t __user *oldlenp,
2902                                                 void __user *newval,
2903                                                 size_t newlen)
2904 {
2905         int delay;
2906         struct net *net;
2907         if (newlen != sizeof(int))
2908                 return -EINVAL;
2909         if (get_user(delay, (int __user *)newval))
2910                 return -EFAULT;
2911         net = (struct net *)table->extra1;
2912         rt_cache_flush(net, delay);
2913         return 0;
2914 }
2915
/*
 * Routing cache tunables.  Every entry is an int with mode 0644.
 * Entries using proc_dointvec_jiffies / proc_dointvec_ms_jiffies are
 * paired with the matching sysctl_jiffies / sysctl_ms_jiffies strategy;
 * the others use plain proc_dointvec.  The table ends with an entry
 * whose ctl_name is 0.
 */
ctl_table ipv4_route_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                /*  Deprecated. Use gc_min_interval_ms */

                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                /* Same variable as gc_min_interval, via the ms handlers. */
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_ms_jiffies,
                .strategy       = &sysctl_ms_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                /* Read at namespace setup by rt_secret_timer_init(). */
                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname       = "secret_interval",
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        { .ctl_name = 0 }
};
3055
/*
 * sysctl path "net" / "ipv4" / "route" under which
 * sysctl_route_net_init() registers the per-namespace flush table.
 */
static __net_initdata struct ctl_path ipv4_route_path[] = {
        { .procname = "net", .ctl_name = CTL_NET, },
        { .procname = "ipv4", .ctl_name = NET_IPV4, },
        { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
        { },
};
3062
3063
/*
 * Template for the write-only (mode 0200) "flush" sysctl.
 * sysctl_route_net_init() uses this table directly for init_net and a
 * kmemdup() copy for every other namespace, storing the owning
 * namespace in entry 0's ->extra1.
 */
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = &ipv4_sysctl_rtcache_flush,
                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
        },
        { .ctl_name = 0 },
};
3075
3076 static __net_init int sysctl_route_net_init(struct net *net)
3077 {
3078         struct ctl_table *tbl;
3079
3080         tbl = ipv4_route_flush_table;
3081         if (net != &init_net) {
3082                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3083                 if (tbl == NULL)
3084                         goto err_dup;
3085         }
3086         tbl[0].extra1 = net;
3087
3088         net->ipv4.route_hdr =
3089                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3090         if (net->ipv4.route_hdr == NULL)
3091                 goto err_reg;
3092         return 0;
3093
3094 err_reg:
3095         if (tbl != ipv4_route_flush_table)
3096                 kfree(tbl);
3097 err_dup:
3098         return -ENOMEM;
3099 }
3100
/*
 * Per-namespace teardown of the "flush" sysctl: unregister the header
 * stored in net->ipv4.route_hdr and free the table it was registered
 * with.
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        /* Fetch the table pointer from the header before unregistering it. */
        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        /*
         * The static template is what sysctl_route_net_init() registers
         * for init_net; it must never reach kfree().
         */
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}
3110
/*
 * Per-namespace init/exit hooks for the "flush" sysctl; registered from
 * ip_rt_init() when CONFIG_SYSCTL is enabled.
 */
static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
3115 #endif
3116
3117
3118 static __net_init int rt_secret_timer_init(struct net *net)
3119 {
3120         atomic_set(&net->ipv4.rt_genid,
3121                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3122                         (jiffies ^ (jiffies >> 7))));
3123
3124         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3125         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3126         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3127
3128         net->ipv4.rt_secret_timer.expires =
3129                 jiffies + net_random() % ip_rt_secret_interval +
3130                 ip_rt_secret_interval;
3131         add_timer(&net->ipv4.rt_secret_timer);
3132         return 0;
3133 }
3134
3135 static __net_exit void rt_secret_timer_exit(struct net *net)
3136 {
3137         del_timer_sync(&net->ipv4.rt_secret_timer);
3138 }
3139
/*
 * Per-namespace init/exit hooks for the rt_secret_timer; registered
 * from ip_rt_init().
 */
static __net_initdata struct pernet_operations rt_secret_timer_ops = {
        .init = rt_secret_timer_init,
        .exit = rt_secret_timer_exit,
};
3144
3145
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-CPU accounting area, allocated in ip_rt_init() with room for 256
 * struct ip_rt_acct entries.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3149
3150 static __initdata unsigned long rhash_entries;
3151 static int __init set_rhash_entries(char *str)
3152 {
3153         if (!str)
3154                 return 0;
3155         rhash_entries = simple_strtoul(str, &str, 0);
3156         return 1;
3157 }
3158 __setup("rhash_entries=", set_rhash_entries);
3159
/*
 * Boot-time initialisation of the IPv4 routing code.
 *
 * Sets up the per-CPU accounting area (CONFIG_NET_CLS_ROUTE), the
 * rtable slab cache shared by ipv4_dst_ops and ipv4_dst_blackhole_ops,
 * and the route cache hash table, then derives gc_thresh and
 * ip_rt_max_size from the table size.  Afterwards it initialises the
 * devinet and FIB layers, schedules the periodic expiry work, registers
 * the per-namespace secret timer, the proc files, xfrm (CONFIG_XFRM),
 * the RTM_GETROUTE handler and the per-namespace flush sysctl
 * (CONFIG_SYSCTL).
 *
 * Failures of the secret timer and proc registration are only logged.
 * rc is never modified, so the function always returns 0.
 */
int __init ip_rt_init(void)
{
        int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        /* Blackhole entries are allocated from the same slab cache. */
        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        /* rhash_entries comes from the "rhash_entries=" boot parameter. */
        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        0);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

        /* GC threshold: one entry per bucket; hard limit: 16 per bucket. */
        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

        /* All the timers, started at system startup tend
           to synchronize. Perturb it a bit.
         */
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

        if (register_pernet_subsys(&rt_secret_timer_ops))
                printk(KERN_ERR "Unable to setup rt_secret_timer\n");

        if (ip_rt_proc_init())
                printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
        /* NOTE(review): the result of this registration is not checked. */
        register_pernet_subsys(&sysctl_route_ops);
#endif
        return rc;
}
3217
/* Routing entry points exported to modules. */
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);