ipv4: fix lockdep splat in rt_cache_seq_show
net/ipv4/route.c (blob 8b3661b3cda6cd6ad088aad61d7c93896545889c)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112 #include <net/secure_seq.h>
114 #define RT_FL_TOS(oldflp4) \
115 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
117 #define IP_MAX_MTU 0xFFF0
119 #define RT_GC_TIMEOUT (300*HZ)
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly = 8;
131 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly = 256;
134 static int rt_chain_length_max __read_mostly = 20;
137 * Interface to generic destination cache.
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
143 static void ipv4_dst_destroy(struct dst_entry *dst);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void ipv4_link_failure(struct sk_buff *skb);
146 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
149 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
150 int how)
154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 struct rtable *rt = (struct rtable *) dst;
157 struct inet_peer *peer;
158 u32 *p = NULL;
160 if (!rt->peer)
161 rt_bind_peer(rt, rt->rt_dst, 1);
163 peer = rt->peer;
164 if (peer) {
165 u32 *old_p = __DST_METRICS_PTR(old);
166 unsigned long prev, new;
168 p = peer->metrics;
169 if (inet_metrics_new(peer))
170 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
172 new = (unsigned long) p;
173 prev = cmpxchg(&dst->_metrics, old, new);
175 if (prev != old) {
176 p = __DST_METRICS_PTR(prev);
177 if (prev & DST_METRICS_READ_ONLY)
178 p = NULL;
179 } else {
180 if (rt->fi) {
181 fib_info_put(rt->fi);
182 rt->fi = NULL;
186 return p;
189 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
191 static struct dst_ops ipv4_dst_ops = {
192 .family = AF_INET,
193 .protocol = cpu_to_be16(ETH_P_IP),
194 .gc = rt_garbage_collect,
195 .check = ipv4_dst_check,
196 .default_advmss = ipv4_default_advmss,
197 .default_mtu = ipv4_default_mtu,
198 .cow_metrics = ipv4_cow_metrics,
199 .destroy = ipv4_dst_destroy,
200 .ifdown = ipv4_dst_ifdown,
201 .negative_advice = ipv4_negative_advice,
202 .link_failure = ipv4_link_failure,
203 .update_pmtu = ip_rt_update_pmtu,
204 .local_out = __ip_local_out,
205 .neigh_lookup = ipv4_neigh_lookup,
208 #define ECN_OR_COST(class) TC_PRIO_##class
210 const __u8 ip_tos2prio[16] = {
211 TC_PRIO_BESTEFFORT,
212 ECN_OR_COST(BESTEFFORT),
213 TC_PRIO_BESTEFFORT,
214 ECN_OR_COST(BESTEFFORT),
215 TC_PRIO_BULK,
216 ECN_OR_COST(BULK),
217 TC_PRIO_BULK,
218 ECN_OR_COST(BULK),
219 TC_PRIO_INTERACTIVE,
220 ECN_OR_COST(INTERACTIVE),
221 TC_PRIO_INTERACTIVE,
222 ECN_OR_COST(INTERACTIVE),
223 TC_PRIO_INTERACTIVE_BULK,
224 ECN_OR_COST(INTERACTIVE_BULK),
225 TC_PRIO_INTERACTIVE_BULK,
226 ECN_OR_COST(INTERACTIVE_BULK)
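/*
 * Illustrative note (added, not part of the original file): this table is
 * normally indexed with the four TOS bits shifted down by one, e.g. via a
 * helper along the lines of rt_tos2priority() in include/net/route.h doing
 * ip_tos2prio[IPTOS_TOS(tos) >> 1] (assumed here, not shown in this file).
 * Under that assumption a packet marked IPTOS_LOWDELAY (0x10) maps to
 * index 8 and is queued at TC_PRIO_INTERACTIVE.
 */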
231 * Route cache.
234 /* The locking scheme is rather straightforward:
236 * 1) Read-Copy Update protects the buckets of the central route hash.
237 * 2) Only writers remove entries, and they hold the lock
238 * as they look at rtable reference counts.
239 * 3) Only readers acquire references to rtable entries,
240 * they do so with atomic increments and with the
241 * lock held.
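/*
 * Minimal usage sketch of the scheme above (illustrative only; it assumes the
 * rt_hash_table, rt_hash_lock_addr() and compare_keys() helpers defined below):
 *
 *	rcu_read_lock_bh();                        // reader: lockless lookup
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next))
 *		if (compare_keys(rth, rt))
 *			break;
 *	rcu_read_unlock_bh();
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));     // writer: unlink under the lock
 *	*rthp = rth->dst.rt_next;
 *	rt_free(rth);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */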
244 struct rt_hash_bucket {
245 struct rtable __rcu *chain;
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249 defined(CONFIG_PROVE_LOCKING)
251 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
252 * The size of this table is a power of two and depends on the number of CPUs.
253 * (with lockdep we have a quite big spinlock_t, so keep the size down there)
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ 256
257 #else
258 # if NR_CPUS >= 32
259 # define RT_HASH_LOCK_SZ 4096
260 # elif NR_CPUS >= 16
261 # define RT_HASH_LOCK_SZ 2048
262 # elif NR_CPUS >= 8
263 # define RT_HASH_LOCK_SZ 1024
264 # elif NR_CPUS >= 4
265 # define RT_HASH_LOCK_SZ 512
266 # else
267 # define RT_HASH_LOCK_SZ 256
268 # endif
269 #endif
271 static spinlock_t *rt_hash_locks;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
274 static __init void rt_hash_lock_init(void)
276 int i;
278 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
279 GFP_KERNEL);
280 if (!rt_hash_locks)
281 panic("IP: failed to allocate rt_hash_locks\n");
283 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
284 spin_lock_init(&rt_hash_locks[i]);
286 #else
287 # define rt_hash_lock_addr(slot) NULL
289 static inline void rt_hash_lock_init(void)
292 #endif
294 static struct rt_hash_bucket *rt_hash_table __read_mostly;
295 static unsigned rt_hash_mask __read_mostly;
296 static unsigned int rt_hash_log __read_mostly;
298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
302 int genid)
304 return jhash_3words((__force u32)daddr, (__force u32)saddr,
305 idx, genid)
306 & rt_hash_mask;
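/*
 * Usage sketch (illustrative): the input and output paths below derive the
 * bucket in the same way, e.g.
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
 *	rth  = rcu_dereference_bh(rt_hash_table[hash].chain);
 *
 * Because the per-namespace genid is mixed in, bumping it in
 * rt_cache_invalidate() implicitly moves all future insertions to new buckets.
 */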
309 static inline int rt_genid(struct net *net)
311 return atomic_read(&net->ipv4.rt_genid);
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state {
316 struct seq_net_private p;
317 int bucket;
318 int genid;
321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
323 struct rt_cache_iter_state *st = seq->private;
324 struct rtable *r = NULL;
326 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
327 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
328 continue;
329 rcu_read_lock_bh();
330 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
331 while (r) {
332 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
333 r->rt_genid == st->genid)
334 return r;
335 r = rcu_dereference_bh(r->dst.rt_next);
337 rcu_read_unlock_bh();
339 return r;
342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
343 struct rtable *r)
345 struct rt_cache_iter_state *st = seq->private;
347 r = rcu_dereference_bh(r->dst.rt_next);
348 while (!r) {
349 rcu_read_unlock_bh();
350 do {
351 if (--st->bucket < 0)
352 return NULL;
353 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
354 rcu_read_lock_bh();
355 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
357 return r;
360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
361 struct rtable *r)
363 struct rt_cache_iter_state *st = seq->private;
364 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
365 if (dev_net(r->dst.dev) != seq_file_net(seq))
366 continue;
367 if (r->rt_genid == st->genid)
368 break;
370 return r;
373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
375 struct rtable *r = rt_cache_get_first(seq);
377 if (r)
378 while (pos && (r = rt_cache_get_next(seq, r)))
379 --pos;
380 return pos ? NULL : r;
383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
385 struct rt_cache_iter_state *st = seq->private;
386 if (*pos)
387 return rt_cache_get_idx(seq, *pos - 1);
388 st->genid = rt_genid(seq_file_net(seq));
389 return SEQ_START_TOKEN;
392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
394 struct rtable *r;
396 if (v == SEQ_START_TOKEN)
397 r = rt_cache_get_first(seq);
398 else
399 r = rt_cache_get_next(seq, v);
400 ++*pos;
401 return r;
404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
406 if (v && v != SEQ_START_TOKEN)
407 rcu_read_unlock_bh();
410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
412 if (v == SEQ_START_TOKEN)
413 seq_printf(seq, "%-127s\n",
414 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416 "HHUptod\tSpecDst");
417 else {
418 struct rtable *r = v;
419 struct neighbour *n;
420 int len, HHUptod;
422 rcu_read_lock();
423 n = dst_get_neighbour(&r->dst);
424 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
425 rcu_read_unlock();
427 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
428 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
429 r->dst.dev ? r->dst.dev->name : "*",
430 (__force u32)r->rt_dst,
431 (__force u32)r->rt_gateway,
432 r->rt_flags, atomic_read(&r->dst.__refcnt),
433 r->dst.__use, 0, (__force u32)r->rt_src,
434 dst_metric_advmss(&r->dst) + 40,
435 dst_metric(&r->dst, RTAX_WINDOW),
436 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
437 dst_metric(&r->dst, RTAX_RTTVAR)),
438 r->rt_key_tos,
439 -1,
440 HHUptod,
441 r->rt_spec_dst, &len);
443 seq_printf(seq, "%*s\n", 127 - len, "");
445 return 0;
448 static const struct seq_operations rt_cache_seq_ops = {
449 .start = rt_cache_seq_start,
450 .next = rt_cache_seq_next,
451 .stop = rt_cache_seq_stop,
452 .show = rt_cache_seq_show,
455 static int rt_cache_seq_open(struct inode *inode, struct file *file)
457 return seq_open_net(inode, file, &rt_cache_seq_ops,
458 sizeof(struct rt_cache_iter_state));
461 static const struct file_operations rt_cache_seq_fops = {
462 .owner = THIS_MODULE,
463 .open = rt_cache_seq_open,
464 .read = seq_read,
465 .llseek = seq_lseek,
466 .release = seq_release_net,
470 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
472 int cpu;
474 if (*pos == 0)
475 return SEQ_START_TOKEN;
477 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
478 if (!cpu_possible(cpu))
479 continue;
480 *pos = cpu+1;
481 return &per_cpu(rt_cache_stat, cpu);
483 return NULL;
486 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
488 int cpu;
490 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
491 if (!cpu_possible(cpu))
492 continue;
493 *pos = cpu+1;
494 return &per_cpu(rt_cache_stat, cpu);
496 return NULL;
500 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
507 struct rt_cache_stat *st = v;
509 if (v == SEQ_START_TOKEN) {
510 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
511 return 0;
514 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
515 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
516 dst_entries_get_slow(&ipv4_dst_ops),
517 st->in_hit,
518 st->in_slow_tot,
519 st->in_slow_mc,
520 st->in_no_route,
521 st->in_brd,
522 st->in_martian_dst,
523 st->in_martian_src,
525 st->out_hit,
526 st->out_slow_tot,
527 st->out_slow_mc,
529 st->gc_total,
530 st->gc_ignored,
531 st->gc_goal_miss,
532 st->gc_dst_overflow,
533 st->in_hlist_search,
534 st->out_hlist_search
536 return 0;
539 static const struct seq_operations rt_cpu_seq_ops = {
540 .start = rt_cpu_seq_start,
541 .next = rt_cpu_seq_next,
542 .stop = rt_cpu_seq_stop,
543 .show = rt_cpu_seq_show,
547 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
549 return seq_open(file, &rt_cpu_seq_ops);
552 static const struct file_operations rt_cpu_seq_fops = {
553 .owner = THIS_MODULE,
554 .open = rt_cpu_seq_open,
555 .read = seq_read,
556 .llseek = seq_lseek,
557 .release = seq_release,
560 #ifdef CONFIG_IP_ROUTE_CLASSID
561 static int rt_acct_proc_show(struct seq_file *m, void *v)
563 struct ip_rt_acct *dst, *src;
564 unsigned int i, j;
566 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
567 if (!dst)
568 return -ENOMEM;
570 for_each_possible_cpu(i) {
571 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
572 for (j = 0; j < 256; j++) {
573 dst[j].o_bytes += src[j].o_bytes;
574 dst[j].o_packets += src[j].o_packets;
575 dst[j].i_bytes += src[j].i_bytes;
576 dst[j].i_packets += src[j].i_packets;
580 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
581 kfree(dst);
582 return 0;
585 static int rt_acct_proc_open(struct inode *inode, struct file *file)
587 return single_open(file, rt_acct_proc_show, NULL);
590 static const struct file_operations rt_acct_proc_fops = {
591 .owner = THIS_MODULE,
592 .open = rt_acct_proc_open,
593 .read = seq_read,
594 .llseek = seq_lseek,
595 .release = single_release,
597 #endif
599 static int __net_init ip_rt_do_proc_init(struct net *net)
601 struct proc_dir_entry *pde;
603 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
604 &rt_cache_seq_fops);
605 if (!pde)
606 goto err1;
608 pde = proc_create("rt_cache", S_IRUGO,
609 net->proc_net_stat, &rt_cpu_seq_fops);
610 if (!pde)
611 goto err2;
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
615 if (!pde)
616 goto err3;
617 #endif
618 return 0;
620 #ifdef CONFIG_IP_ROUTE_CLASSID
621 err3:
622 remove_proc_entry("rt_cache", net->proc_net_stat);
623 #endif
624 err2:
625 remove_proc_entry("rt_cache", net->proc_net);
626 err1:
627 return -ENOMEM;
630 static void __net_exit ip_rt_do_proc_exit(struct net *net)
632 remove_proc_entry("rt_cache", net->proc_net_stat);
633 remove_proc_entry("rt_cache", net->proc_net);
634 #ifdef CONFIG_IP_ROUTE_CLASSID
635 remove_proc_entry("rt_acct", net->proc_net);
636 #endif
639 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
640 .init = ip_rt_do_proc_init,
641 .exit = ip_rt_do_proc_exit,
644 static int __init ip_rt_proc_init(void)
646 return register_pernet_subsys(&ip_rt_proc_ops);
649 #else
650 static inline int ip_rt_proc_init(void)
652 return 0;
654 #endif /* CONFIG_PROC_FS */
656 static inline void rt_free(struct rtable *rt)
658 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
661 static inline void rt_drop(struct rtable *rt)
663 ip_rt_put(rt);
664 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
667 static inline int rt_fast_clean(struct rtable *rth)
669 /* Kill broadcast/multicast entries very aggressively, if they
670 collide in the hash table with more useful entries */
671 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
672 rt_is_input_route(rth) && rth->dst.rt_next;
675 static inline int rt_valuable(struct rtable *rth)
677 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
678 (rth->peer && rth->peer->pmtu_expires);
681 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
683 unsigned long age;
684 int ret = 0;
686 if (atomic_read(&rth->dst.__refcnt))
687 goto out;
689 age = jiffies - rth->dst.lastuse;
690 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
691 (age <= tmo2 && rt_valuable(rth)))
692 goto out;
693 ret = 1;
694 out: return ret;
697 /* Bits of score are:
698 * 31: very valuable
699 * 30: not quite useless
700 * 29..0: usage counter
702 static inline u32 rt_score(struct rtable *rt)
704 u32 score = jiffies - rt->dst.lastuse;
706 score = ~score & ~(3<<30);
708 if (rt_valuable(rt))
709 score |= (1<<31);
711 if (rt_is_output_route(rt) ||
712 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
713 score |= (1<<30);
715 return score;
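/*
 * Worked example for the bit layout above (illustrative): an unreferenced
 * output-route entry last used 100 jiffies ago that also passes rt_valuable()
 * gets score = ((~100) & ~(3 << 30)) | (1 << 31) | (1 << 30), i.e. both high
 * bits set on top of the aged usage counter, so rt_intern_hash() below will
 * prefer to evict lower-scored (older, less valuable) entries from a long
 * chain first.
 */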
718 static inline bool rt_caching(const struct net *net)
720 return net->ipv4.current_rt_cache_rebuild_count <=
721 net->ipv4.sysctl_rt_cache_rebuild_count;
724 static inline bool compare_hash_inputs(const struct rtable *rt1,
725 const struct rtable *rt2)
727 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
728 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
729 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
732 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
734 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
735 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
736 (rt1->rt_mark ^ rt2->rt_mark) |
737 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
738 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
739 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
742 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
744 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
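/*
 * Note on the comparison style above (illustrative): XOR-ing each pair of
 * fields and OR-ing the results gives one branch-free test, since
 *
 *	((a1 ^ a2) | (b1 ^ b2)) == 0
 *
 * holds exactly when a1 == a2 && b1 == b2.  That is why compare_keys() and
 * compare_hash_inputs() are written this way instead of chaining &&.
 */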
747 static inline int rt_is_expired(struct rtable *rth)
749 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
753 * Perform a full scan of the hash table and free all entries.
754 * Can be called by a softirq or a process.
755 * In the latter case, we want to reschedule if necessary
757 static void rt_do_flush(struct net *net, int process_context)
759 unsigned int i;
760 struct rtable *rth, *next;
762 for (i = 0; i <= rt_hash_mask; i++) {
763 struct rtable __rcu **pprev;
764 struct rtable *list;
766 if (process_context && need_resched())
767 cond_resched();
768 rth = rcu_dereference_raw(rt_hash_table[i].chain);
769 if (!rth)
770 continue;
772 spin_lock_bh(rt_hash_lock_addr(i));
774 list = NULL;
775 pprev = &rt_hash_table[i].chain;
776 rth = rcu_dereference_protected(*pprev,
777 lockdep_is_held(rt_hash_lock_addr(i)));
779 while (rth) {
780 next = rcu_dereference_protected(rth->dst.rt_next,
781 lockdep_is_held(rt_hash_lock_addr(i)));
783 if (!net ||
784 net_eq(dev_net(rth->dst.dev), net)) {
785 rcu_assign_pointer(*pprev, next);
786 rcu_assign_pointer(rth->dst.rt_next, list);
787 list = rth;
788 } else {
789 pprev = &rth->dst.rt_next;
791 rth = next;
794 spin_unlock_bh(rt_hash_lock_addr(i));
796 for (; list; list = next) {
797 next = rcu_dereference_protected(list->dst.rt_next, 1);
798 rt_free(list);
804 * While freeing expired entries, we compute average chain length
805 * and standard deviation, using fixed-point arithmetic.
806 * This gives an estimate of rt_chain_length_max:
807 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
808 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
811 #define FRACT_BITS 3
812 #define ONE (1UL << FRACT_BITS)
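/*
 * Fixed-point example (illustrative): with FRACT_BITS == 3, ONE == 8 and a
 * real value x is stored as x * 8, so an average chain length of 2.5 is kept
 * as the integer 20 and converted back with ">> FRACT_BITS", as
 * slow_chain_length() does below.
 */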
815 * Given a hash chain and an item in this hash chain,
816 * find if a previous entry has the same hash_inputs
817 * (but differs on tos, mark or oif)
818 * Returns 0 if an alias is found.
819 * Returns ONE if rth has no alias before itself.
821 static int has_noalias(const struct rtable *head, const struct rtable *rth)
823 const struct rtable *aux = head;
825 while (aux != rth) {
826 if (compare_hash_inputs(aux, rth))
827 return 0;
828 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
830 return ONE;
834 * Perturbation of rt_genid by a small quantity [1..256]
835 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
836 * many times (2^24) without reusing a recent rt_genid.
837 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
839 static void rt_cache_invalidate(struct net *net)
841 unsigned char shuffle;
843 get_random_bytes(&shuffle, sizeof(shuffle));
844 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
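/*
 * Illustrative consequence of the genid bump above: an entry whose rt_genid
 * no longer matches the namespace value is treated as dead by rt_is_expired(),
 * roughly
 *
 *	if (rth->rt_genid != rt_genid(dev_net(rth->dst.dev)))
 *		... skip or free the entry ...
 *
 * so invalidation itself is O(1) and stale entries are reaped lazily.
 */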
848 * delay < 0 : invalidate cache (fast : entries will be deleted later)
849 * delay >= 0 : invalidate & flush cache (can be long)
851 void rt_cache_flush(struct net *net, int delay)
853 rt_cache_invalidate(net);
854 if (delay >= 0)
855 rt_do_flush(net, !in_softirq());
858 /* Flush previously invalidated entries from the cache */
859 void rt_cache_flush_batch(struct net *net)
861 rt_do_flush(net, !in_softirq());
864 static void rt_emergency_hash_rebuild(struct net *net)
866 if (net_ratelimit())
867 printk(KERN_WARNING "Route hash chain too long!\n");
868 rt_cache_invalidate(net);
872 Short description of GC goals.
874 We want to build an algorithm which keeps the routing cache
875 at some equilibrium point, where the number of aged-off entries
876 is kept approximately equal to the number of newly generated ones.
878 The current expiration strength is the variable "expire".
879 We try to adjust it dynamically, so that when the network
880 is idle, expire is large enough to keep enough warm entries,
881 and when load increases it shrinks to limit the cache size.
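   Illustrative example of those dynamics (added note): rt_garbage_collect()
   below halves "expire" every time a pass misses its goal and adds
   ip_rt_gc_min_interval back once the goal is met, clamping at
   ip_rt_gc_timeout (RT_GC_TIMEOUT, i.e. 5 minutes).  Under light load expire
   therefore drifts upwards and entries live long; under pressure it drops
   quickly, making even young entries eligible via rt_may_expire().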
884 static int rt_garbage_collect(struct dst_ops *ops)
886 static unsigned long expire = RT_GC_TIMEOUT;
887 static unsigned long last_gc;
888 static int rover;
889 static int equilibrium;
890 struct rtable *rth;
891 struct rtable __rcu **rthp;
892 unsigned long now = jiffies;
893 int goal;
894 int entries = dst_entries_get_fast(&ipv4_dst_ops);
897 * Garbage collection is pretty expensive,
898 * do not run it too frequently.
901 RT_CACHE_STAT_INC(gc_total);
903 if (now - last_gc < ip_rt_gc_min_interval &&
904 entries < ip_rt_max_size) {
905 RT_CACHE_STAT_INC(gc_ignored);
906 goto out;
909 entries = dst_entries_get_slow(&ipv4_dst_ops);
910 /* Calculate number of entries, which we want to expire now. */
911 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
912 if (goal <= 0) {
913 if (equilibrium < ipv4_dst_ops.gc_thresh)
914 equilibrium = ipv4_dst_ops.gc_thresh;
915 goal = entries - equilibrium;
916 if (goal > 0) {
917 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
918 goal = entries - equilibrium;
920 } else {
921 /* We are in a dangerous area. Try to reduce the cache really
922 * aggressively.
924 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
925 equilibrium = entries - goal;
928 if (now - last_gc >= ip_rt_gc_min_interval)
929 last_gc = now;
931 if (goal <= 0) {
932 equilibrium += goal;
933 goto work_done;
936 do {
937 int i, k;
939 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
940 unsigned long tmo = expire;
942 k = (k + 1) & rt_hash_mask;
943 rthp = &rt_hash_table[k].chain;
944 spin_lock_bh(rt_hash_lock_addr(k));
945 while ((rth = rcu_dereference_protected(*rthp,
946 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
947 if (!rt_is_expired(rth) &&
948 !rt_may_expire(rth, tmo, expire)) {
949 tmo >>= 1;
950 rthp = &rth->dst.rt_next;
951 continue;
953 *rthp = rth->dst.rt_next;
954 rt_free(rth);
955 goal--;
957 spin_unlock_bh(rt_hash_lock_addr(k));
958 if (goal <= 0)
959 break;
961 rover = k;
963 if (goal <= 0)
964 goto work_done;
966 /* The goal is not achieved. We stop the process if:
968 - expire has been reduced to zero; otherwise, expire is halved.
969 - the table is not full.
970 - we are called from interrupt context.
971 - the jiffies check is just a fallback/debug loop breaker.
972 We will not spin here for a long time in any case.
975 RT_CACHE_STAT_INC(gc_goal_miss);
977 if (expire == 0)
978 break;
980 expire >>= 1;
982 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
983 goto out;
984 } while (!in_softirq() && time_before_eq(jiffies, now));
986 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
987 goto out;
988 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
989 goto out;
990 if (net_ratelimit())
991 printk(KERN_WARNING "dst cache overflow\n");
992 RT_CACHE_STAT_INC(gc_dst_overflow);
993 return 1;
995 work_done:
996 expire += ip_rt_gc_min_interval;
997 if (expire > ip_rt_gc_timeout ||
998 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
999 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1000 expire = ip_rt_gc_timeout;
1001 out: return 0;
1005 * Returns the number of entries in a hash chain that have different hash_inputs
1007 static int slow_chain_length(const struct rtable *head)
1009 int length = 0;
1010 const struct rtable *rth = head;
1012 while (rth) {
1013 length += has_noalias(head, rth);
1014 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1016 return length >> FRACT_BITS;
1019 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1021 struct neigh_table *tbl = &arp_tbl;
1022 static const __be32 inaddr_any = 0;
1023 struct net_device *dev = dst->dev;
1024 const __be32 *pkey = daddr;
1025 struct neighbour *n;
1027 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1028 if (dev->type == ARPHRD_ATM)
1029 tbl = clip_tbl_hook;
1030 #endif
1031 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1032 pkey = &inaddr_any;
1034 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1035 if (n)
1036 return n;
1037 return neigh_create(tbl, pkey, dev);
1040 static int rt_bind_neighbour(struct rtable *rt)
1042 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1043 if (IS_ERR(n))
1044 return PTR_ERR(n);
1045 dst_set_neighbour(&rt->dst, n);
1047 return 0;
1050 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1051 struct sk_buff *skb, int ifindex)
1053 struct rtable *rth, *cand;
1054 struct rtable __rcu **rthp, **candp;
1055 unsigned long now;
1056 u32 min_score;
1057 int chain_length;
1058 int attempts = !in_softirq();
1060 restart:
1061 chain_length = 0;
1062 min_score = ~(u32)0;
1063 cand = NULL;
1064 candp = NULL;
1065 now = jiffies;
1067 if (!rt_caching(dev_net(rt->dst.dev))) {
1069 * If we're not caching, just tell the caller we
1070 * were successful and don't touch the route. The
1071 * caller holds the sole reference to the cache entry, and
1072 * it will be released when the caller is done with it.
1073 * If we drop it here, the callers have no way to resolve routes
1074 * when we're not caching. Instead, just point *rp at rt, so
1075 * the caller gets a single use out of the route
1076 * Note that we do rt_free on this new route entry, so that
1077 * once its refcount hits zero, we are still able to reap it
1078 * (Thanks Alexey)
1079 * Note: To avoid expensive rcu stuff for this uncached dst,
1080 * we set DST_NOCACHE so that dst_release() can free dst without
1081 * waiting for a grace period.
1084 rt->dst.flags |= DST_NOCACHE;
1085 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1086 int err = rt_bind_neighbour(rt);
1087 if (err) {
1088 if (net_ratelimit())
1089 printk(KERN_WARNING
1090 "Neighbour table failure & not caching routes.\n");
1091 ip_rt_put(rt);
1092 return ERR_PTR(err);
1096 goto skip_hashing;
1099 rthp = &rt_hash_table[hash].chain;
1101 spin_lock_bh(rt_hash_lock_addr(hash));
1102 while ((rth = rcu_dereference_protected(*rthp,
1103 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1104 if (rt_is_expired(rth)) {
1105 *rthp = rth->dst.rt_next;
1106 rt_free(rth);
1107 continue;
1109 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1110 /* Put it first */
1111 *rthp = rth->dst.rt_next;
1113 * Since lookup is lockfree, the deletion
1114 * must be visible to another weakly ordered CPU before
1115 * the insertion at the start of the hash chain.
1117 rcu_assign_pointer(rth->dst.rt_next,
1118 rt_hash_table[hash].chain);
1120 * Since lookup is lockfree, the update writes
1121 * must be ordered for consistency on SMP.
1123 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1125 dst_use(&rth->dst, now);
1126 spin_unlock_bh(rt_hash_lock_addr(hash));
1128 rt_drop(rt);
1129 if (skb)
1130 skb_dst_set(skb, &rth->dst);
1131 return rth;
1134 if (!atomic_read(&rth->dst.__refcnt)) {
1135 u32 score = rt_score(rth);
1137 if (score <= min_score) {
1138 cand = rth;
1139 candp = rthp;
1140 min_score = score;
1144 chain_length++;
1146 rthp = &rth->dst.rt_next;
1149 if (cand) {
1150 /* ip_rt_gc_elasticity used to be the average chain
1151 * length; when exceeded, gc becomes really aggressive.
1153 * The second limit is less certain. At the moment it allows
1154 * only 2 entries per bucket. We will see.
1156 if (chain_length > ip_rt_gc_elasticity) {
1157 *candp = cand->dst.rt_next;
1158 rt_free(cand);
1160 } else {
1161 if (chain_length > rt_chain_length_max &&
1162 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1163 struct net *net = dev_net(rt->dst.dev);
1164 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1165 if (!rt_caching(net)) {
1166 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1167 rt->dst.dev->name, num);
1169 rt_emergency_hash_rebuild(net);
1170 spin_unlock_bh(rt_hash_lock_addr(hash));
1172 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1173 ifindex, rt_genid(net));
1174 goto restart;
1178 /* Try to bind the route to arp only if it is an output
1179 route or a unicast forwarding path.
1181 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1182 int err = rt_bind_neighbour(rt);
1183 if (err) {
1184 spin_unlock_bh(rt_hash_lock_addr(hash));
1186 if (err != -ENOBUFS) {
1187 rt_drop(rt);
1188 return ERR_PTR(err);
1191 /* Neighbour tables are full and nothing
1192 can be released. Try to shrink the route cache;
1193 it most likely holds some neighbour records.
1195 if (attempts-- > 0) {
1196 int saved_elasticity = ip_rt_gc_elasticity;
1197 int saved_int = ip_rt_gc_min_interval;
1198 ip_rt_gc_elasticity = 1;
1199 ip_rt_gc_min_interval = 0;
1200 rt_garbage_collect(&ipv4_dst_ops);
1201 ip_rt_gc_min_interval = saved_int;
1202 ip_rt_gc_elasticity = saved_elasticity;
1203 goto restart;
1206 if (net_ratelimit())
1207 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1208 rt_drop(rt);
1209 return ERR_PTR(-ENOBUFS);
1213 rt->dst.rt_next = rt_hash_table[hash].chain;
1216 * Since lookup is lockfree, we must make sure
1217 * previous writes to rt are committed to memory
1218 * before making rt visible to other CPUS.
1220 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1222 spin_unlock_bh(rt_hash_lock_addr(hash));
1224 skip_hashing:
1225 if (skb)
1226 skb_dst_set(skb, &rt->dst);
1227 return rt;
1230 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1232 static u32 rt_peer_genid(void)
1234 return atomic_read(&__rt_peer_genid);
1237 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1239 struct inet_peer *peer;
1241 peer = inet_getpeer_v4(daddr, create);
1243 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1244 inet_putpeer(peer);
1245 else
1246 rt->rt_peer_genid = rt_peer_genid();
1250 * Peer allocation may fail only in serious out-of-memory conditions. However
1251 * we can still generate some output.
1252 * Random ID selection looks a bit dangerous because we have no chance of
1253 * selecting an ID that is unique over a reasonable period of time.
1254 * But a broken packet identifier may be better than no packet at all.
1256 static void ip_select_fb_ident(struct iphdr *iph)
1258 static DEFINE_SPINLOCK(ip_fb_id_lock);
1259 static u32 ip_fallback_id;
1260 u32 salt;
1262 spin_lock_bh(&ip_fb_id_lock);
1263 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1264 iph->id = htons(salt & 0xFFFF);
1265 ip_fallback_id = salt;
1266 spin_unlock_bh(&ip_fb_id_lock);
1269 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1271 struct rtable *rt = (struct rtable *) dst;
1273 if (rt) {
1274 if (rt->peer == NULL)
1275 rt_bind_peer(rt, rt->rt_dst, 1);
1277 /* If a peer is attached to the destination, it is never detached,
1278 so we do not need to grab a lock to dereference it.
1280 if (rt->peer) {
1281 iph->id = htons(inet_getid(rt->peer, more));
1282 return;
1284 } else
1285 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1286 __builtin_return_address(0));
1288 ip_select_fb_ident(iph);
1290 EXPORT_SYMBOL(__ip_select_ident);
1292 static void rt_del(unsigned hash, struct rtable *rt)
1294 struct rtable __rcu **rthp;
1295 struct rtable *aux;
1297 rthp = &rt_hash_table[hash].chain;
1298 spin_lock_bh(rt_hash_lock_addr(hash));
1299 ip_rt_put(rt);
1300 while ((aux = rcu_dereference_protected(*rthp,
1301 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1302 if (aux == rt || rt_is_expired(aux)) {
1303 *rthp = aux->dst.rt_next;
1304 rt_free(aux);
1305 continue;
1307 rthp = &aux->dst.rt_next;
1309 spin_unlock_bh(rt_hash_lock_addr(hash));
1312 /* called in rcu_read_lock() section */
1313 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1314 __be32 saddr, struct net_device *dev)
1316 struct in_device *in_dev = __in_dev_get_rcu(dev);
1317 struct inet_peer *peer;
1318 struct net *net;
1320 if (!in_dev)
1321 return;
1323 net = dev_net(dev);
1324 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1325 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1326 ipv4_is_zeronet(new_gw))
1327 goto reject_redirect;
1329 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1330 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1331 goto reject_redirect;
1332 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1333 goto reject_redirect;
1334 } else {
1335 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1336 goto reject_redirect;
1339 peer = inet_getpeer_v4(daddr, 1);
1340 if (peer) {
1341 peer->redirect_learned.a4 = new_gw;
1343 inet_putpeer(peer);
1345 atomic_inc(&__rt_peer_genid);
1347 return;
1349 reject_redirect:
1350 #ifdef CONFIG_IP_ROUTE_VERBOSE
1351 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1352 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1353 " Advised path = %pI4 -> %pI4\n",
1354 &old_gw, dev->name, &new_gw,
1355 &saddr, &daddr);
1356 #endif
1360 static bool peer_pmtu_expired(struct inet_peer *peer)
1362 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1364 return orig &&
1365 time_after_eq(jiffies, orig) &&
1366 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1369 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1371 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1373 return orig &&
1374 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1377 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1379 struct rtable *rt = (struct rtable *)dst;
1380 struct dst_entry *ret = dst;
1382 if (rt) {
1383 if (dst->obsolete > 0) {
1384 ip_rt_put(rt);
1385 ret = NULL;
1386 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1387 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1388 rt->rt_oif,
1389 rt_genid(dev_net(dst->dev)));
1390 rt_del(hash, rt);
1391 ret = NULL;
1392 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1393 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1396 return ret;
1400 * Algorithm:
1401 * 1. The first ip_rt_redirect_number redirects are sent
1402 * with exponential backoff, then we stop sending them at all,
1403 * assuming that the host ignores our redirects.
1404 * 2. If we did not see packets requiring redirects
1405 * during ip_rt_redirect_silence, we assume that the host
1406 * forgot the redirected route and start sending redirects again.
1408 * This algorithm is much cheaper and more intelligent than dumb load limiting
1409 * in icmp.c.
1411 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1412 * and "frag. need" (breaks PMTU discovery) in icmp.c.
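 * Worked example (illustrative, using the defaults declared near the top of
 * this file): ip_rt_redirect_load is HZ/50 and ip_rt_redirect_number is 9,
 * so the k-th redirect to a host is only sent once
 * rate_last + (ip_rt_redirect_load << rate_tokens) has passed, i.e. the gap
 * between redirects doubles each time (about 40 ms, 80 ms, 160 ms, ...);
 * after 9 unanswered redirects we stay silent until ip_rt_redirect_silence
 * (roughly 20 s) without redirect-worthy traffic lets ip_rt_send_redirect()
 * below reset rate_tokens.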
1415 void ip_rt_send_redirect(struct sk_buff *skb)
1417 struct rtable *rt = skb_rtable(skb);
1418 struct in_device *in_dev;
1419 struct inet_peer *peer;
1420 int log_martians;
1422 rcu_read_lock();
1423 in_dev = __in_dev_get_rcu(rt->dst.dev);
1424 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1425 rcu_read_unlock();
1426 return;
1428 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1429 rcu_read_unlock();
1431 if (!rt->peer)
1432 rt_bind_peer(rt, rt->rt_dst, 1);
1433 peer = rt->peer;
1434 if (!peer) {
1435 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1436 return;
1439 /* No redirected packets during ip_rt_redirect_silence;
1440 * reset the algorithm.
1442 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1443 peer->rate_tokens = 0;
1445 /* Too many ignored redirects; do not send anything,
1446 * just set peer->rate_last to the time of the last seen redirected packet.
1448 if (peer->rate_tokens >= ip_rt_redirect_number) {
1449 peer->rate_last = jiffies;
1450 return;
1453 /* Check for load limit; set rate_last to the latest sent
1454 * redirect.
1456 if (peer->rate_tokens == 0 ||
1457 time_after(jiffies,
1458 (peer->rate_last +
1459 (ip_rt_redirect_load << peer->rate_tokens)))) {
1460 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1461 peer->rate_last = jiffies;
1462 ++peer->rate_tokens;
1463 #ifdef CONFIG_IP_ROUTE_VERBOSE
1464 if (log_martians &&
1465 peer->rate_tokens == ip_rt_redirect_number &&
1466 net_ratelimit())
1467 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1468 &ip_hdr(skb)->saddr, rt->rt_iif,
1469 &rt->rt_dst, &rt->rt_gateway);
1470 #endif
1474 static int ip_error(struct sk_buff *skb)
1476 struct rtable *rt = skb_rtable(skb);
1477 struct inet_peer *peer;
1478 unsigned long now;
1479 bool send;
1480 int code;
1482 switch (rt->dst.error) {
1483 case EINVAL:
1484 default:
1485 goto out;
1486 case EHOSTUNREACH:
1487 code = ICMP_HOST_UNREACH;
1488 break;
1489 case ENETUNREACH:
1490 code = ICMP_NET_UNREACH;
1491 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1492 IPSTATS_MIB_INNOROUTES);
1493 break;
1494 case EACCES:
1495 code = ICMP_PKT_FILTERED;
1496 break;
1499 if (!rt->peer)
1500 rt_bind_peer(rt, rt->rt_dst, 1);
1501 peer = rt->peer;
1503 send = true;
1504 if (peer) {
1505 now = jiffies;
1506 peer->rate_tokens += now - peer->rate_last;
1507 if (peer->rate_tokens > ip_rt_error_burst)
1508 peer->rate_tokens = ip_rt_error_burst;
1509 peer->rate_last = now;
1510 if (peer->rate_tokens >= ip_rt_error_cost)
1511 peer->rate_tokens -= ip_rt_error_cost;
1512 else
1513 send = false;
1515 if (send)
1516 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1518 out: kfree_skb(skb);
1519 return 0;
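/*
 * Illustrative reading of the rate limiting above: peer->rate_tokens acts as
 * a token bucket refilled by elapsed jiffies and capped at ip_rt_error_burst
 * (5 * HZ); each ICMP_DEST_UNREACH sent costs ip_rt_error_cost (one HZ).
 * With the defaults that allows a burst of about five errors towards one
 * destination and then roughly one per second afterwards.
 */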
1523 * The last two values are not from the RFC but
1524 * are needed for AMPRnet AX.25 paths.
1527 static const unsigned short mtu_plateau[] =
1528 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1530 static inline unsigned short guess_mtu(unsigned short old_mtu)
1532 int i;
1534 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1535 if (old_mtu > mtu_plateau[i])
1536 return mtu_plateau[i];
1537 return 68;
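/*
 * Worked example (illustrative): a "fragmentation needed" report with a zero
 * or bogus MTU for a path that was carrying 1500-byte packets makes
 * guess_mtu(1500) walk the plateau table and return 1492, the first value
 * strictly below the old MTU; if nothing fits, it falls back to the protocol
 * minimum of 68.
 */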
1540 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1541 unsigned short new_mtu,
1542 struct net_device *dev)
1544 unsigned short old_mtu = ntohs(iph->tot_len);
1545 unsigned short est_mtu = 0;
1546 struct inet_peer *peer;
1548 peer = inet_getpeer_v4(iph->daddr, 1);
1549 if (peer) {
1550 unsigned short mtu = new_mtu;
1552 if (new_mtu < 68 || new_mtu >= old_mtu) {
1553 /* BSD 4.2 derived systems incorrectly adjust
1554 * tot_len by the IP header length, and report
1555 * a zero MTU in the ICMP message.
1557 if (mtu == 0 &&
1558 old_mtu >= 68 + (iph->ihl << 2))
1559 old_mtu -= iph->ihl << 2;
1560 mtu = guess_mtu(old_mtu);
1563 if (mtu < ip_rt_min_pmtu)
1564 mtu = ip_rt_min_pmtu;
1565 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1566 unsigned long pmtu_expires;
1568 pmtu_expires = jiffies + ip_rt_mtu_expires;
1569 if (!pmtu_expires)
1570 pmtu_expires = 1UL;
1572 est_mtu = mtu;
1573 peer->pmtu_learned = mtu;
1574 peer->pmtu_expires = pmtu_expires;
1577 inet_putpeer(peer);
1579 atomic_inc(&__rt_peer_genid);
1581 return est_mtu ? : new_mtu;
1584 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1586 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1588 if (!expires)
1589 return;
1590 if (time_before(jiffies, expires)) {
1591 u32 orig_dst_mtu = dst_mtu(dst);
1592 if (peer->pmtu_learned < orig_dst_mtu) {
1593 if (!peer->pmtu_orig)
1594 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1595 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1597 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1598 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1601 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1603 struct rtable *rt = (struct rtable *) dst;
1604 struct inet_peer *peer;
1606 dst_confirm(dst);
1608 if (!rt->peer)
1609 rt_bind_peer(rt, rt->rt_dst, 1);
1610 peer = rt->peer;
1611 if (peer) {
1612 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1614 if (mtu < ip_rt_min_pmtu)
1615 mtu = ip_rt_min_pmtu;
1616 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1618 pmtu_expires = jiffies + ip_rt_mtu_expires;
1619 if (!pmtu_expires)
1620 pmtu_expires = 1UL;
1622 peer->pmtu_learned = mtu;
1623 peer->pmtu_expires = pmtu_expires;
1625 atomic_inc(&__rt_peer_genid);
1626 rt->rt_peer_genid = rt_peer_genid();
1628 check_peer_pmtu(dst, peer);
1632 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1634 struct rtable *rt = (struct rtable *) dst;
1635 __be32 orig_gw = rt->rt_gateway;
1636 struct neighbour *n, *old_n;
1638 dst_confirm(&rt->dst);
1640 rt->rt_gateway = peer->redirect_learned.a4;
1642 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1643 if (IS_ERR(n))
1644 return PTR_ERR(n);
1645 old_n = xchg(&rt->dst._neighbour, n);
1646 if (old_n)
1647 neigh_release(old_n);
1648 if (!n || !(n->nud_state & NUD_VALID)) {
1649 if (n)
1650 neigh_event_send(n, NULL);
1651 rt->rt_gateway = orig_gw;
1652 return -EAGAIN;
1653 } else {
1654 rt->rt_flags |= RTCF_REDIRECTED;
1655 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1657 return 0;
1660 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1662 struct rtable *rt = (struct rtable *) dst;
1664 if (rt_is_expired(rt))
1665 return NULL;
1666 if (rt->rt_peer_genid != rt_peer_genid()) {
1667 struct inet_peer *peer;
1669 if (!rt->peer)
1670 rt_bind_peer(rt, rt->rt_dst, 0);
1672 peer = rt->peer;
1673 if (peer) {
1674 check_peer_pmtu(dst, peer);
1676 if (peer->redirect_learned.a4 &&
1677 peer->redirect_learned.a4 != rt->rt_gateway) {
1678 if (check_peer_redir(dst, peer))
1679 return NULL;
1683 rt->rt_peer_genid = rt_peer_genid();
1685 return dst;
1688 static void ipv4_dst_destroy(struct dst_entry *dst)
1690 struct rtable *rt = (struct rtable *) dst;
1691 struct inet_peer *peer = rt->peer;
1693 if (rt->fi) {
1694 fib_info_put(rt->fi);
1695 rt->fi = NULL;
1697 if (peer) {
1698 rt->peer = NULL;
1699 inet_putpeer(peer);
1704 static void ipv4_link_failure(struct sk_buff *skb)
1706 struct rtable *rt;
1708 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1710 rt = skb_rtable(skb);
1711 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1712 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1715 static int ip_rt_bug(struct sk_buff *skb)
1717 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1718 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1719 skb->dev ? skb->dev->name : "?");
1720 kfree_skb(skb);
1721 WARN_ON(1);
1722 return 0;
1726 We do not cache the source address of the outgoing interface,
1727 because it is used only by IP RR, TS and SRR options,
1728 so it is out of the fast path.
1730 BTW remember: "addr" is allowed to be unaligned
1731 in IP options!
1734 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1736 __be32 src;
1738 if (rt_is_output_route(rt))
1739 src = ip_hdr(skb)->saddr;
1740 else {
1741 struct fib_result res;
1742 struct flowi4 fl4;
1743 struct iphdr *iph;
1745 iph = ip_hdr(skb);
1747 memset(&fl4, 0, sizeof(fl4));
1748 fl4.daddr = iph->daddr;
1749 fl4.saddr = iph->saddr;
1750 fl4.flowi4_tos = RT_TOS(iph->tos);
1751 fl4.flowi4_oif = rt->dst.dev->ifindex;
1752 fl4.flowi4_iif = skb->dev->ifindex;
1753 fl4.flowi4_mark = skb->mark;
1755 rcu_read_lock();
1756 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1757 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1758 else
1759 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1760 RT_SCOPE_UNIVERSE);
1761 rcu_read_unlock();
1763 memcpy(addr, &src, 4);
1766 #ifdef CONFIG_IP_ROUTE_CLASSID
1767 static void set_class_tag(struct rtable *rt, u32 tag)
1769 if (!(rt->dst.tclassid & 0xFFFF))
1770 rt->dst.tclassid |= tag & 0xFFFF;
1771 if (!(rt->dst.tclassid & 0xFFFF0000))
1772 rt->dst.tclassid |= tag & 0xFFFF0000;
1774 #endif
1776 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1778 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1780 if (advmss == 0) {
1781 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1782 ip_rt_min_advmss);
1783 if (advmss > 65535 - 40)
1784 advmss = 65535 - 40;
1786 return advmss;
1789 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1791 unsigned int mtu = dst->dev->mtu;
1793 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1794 const struct rtable *rt = (const struct rtable *) dst;
1796 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1797 mtu = 576;
1800 if (mtu > IP_MAX_MTU)
1801 mtu = IP_MAX_MTU;
1803 return mtu;
1806 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1807 struct fib_info *fi)
1809 struct inet_peer *peer;
1810 int create = 0;
1812 /* If a peer entry exists for this destination, we must hook
1813 * it up in order to get at cached metrics.
1815 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1816 create = 1;
1818 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1819 if (peer) {
1820 rt->rt_peer_genid = rt_peer_genid();
1821 if (inet_metrics_new(peer))
1822 memcpy(peer->metrics, fi->fib_metrics,
1823 sizeof(u32) * RTAX_MAX);
1824 dst_init_metrics(&rt->dst, peer->metrics, false);
1826 check_peer_pmtu(&rt->dst, peer);
1827 if (peer->redirect_learned.a4 &&
1828 peer->redirect_learned.a4 != rt->rt_gateway) {
1829 rt->rt_gateway = peer->redirect_learned.a4;
1830 rt->rt_flags |= RTCF_REDIRECTED;
1832 } else {
1833 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1834 rt->fi = fi;
1835 atomic_inc(&fi->fib_clntref);
1837 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1841 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1842 const struct fib_result *res,
1843 struct fib_info *fi, u16 type, u32 itag)
1845 struct dst_entry *dst = &rt->dst;
1847 if (fi) {
1848 if (FIB_RES_GW(*res) &&
1849 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1850 rt->rt_gateway = FIB_RES_GW(*res);
1851 rt_init_metrics(rt, fl4, fi);
1852 #ifdef CONFIG_IP_ROUTE_CLASSID
1853 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1854 #endif
1857 if (dst_mtu(dst) > IP_MAX_MTU)
1858 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1859 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1860 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1862 #ifdef CONFIG_IP_ROUTE_CLASSID
1863 #ifdef CONFIG_IP_MULTIPLE_TABLES
1864 set_class_tag(rt, fib_rules_tclass(res));
1865 #endif
1866 set_class_tag(rt, itag);
1867 #endif
1870 static struct rtable *rt_dst_alloc(struct net_device *dev,
1871 bool nopolicy, bool noxfrm)
1873 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1874 DST_HOST |
1875 (nopolicy ? DST_NOPOLICY : 0) |
1876 (noxfrm ? DST_NOXFRM : 0));
1879 /* called in rcu_read_lock() section */
1880 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1881 u8 tos, struct net_device *dev, int our)
1883 unsigned int hash;
1884 struct rtable *rth;
1885 __be32 spec_dst;
1886 struct in_device *in_dev = __in_dev_get_rcu(dev);
1887 u32 itag = 0;
1888 int err;
1890 /* Primary sanity checks. */
1892 if (in_dev == NULL)
1893 return -EINVAL;
1895 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1896 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1897 goto e_inval;
1899 if (ipv4_is_zeronet(saddr)) {
1900 if (!ipv4_is_local_multicast(daddr))
1901 goto e_inval;
1902 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1903 } else {
1904 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1905 &itag);
1906 if (err < 0)
1907 goto e_err;
1909 rth = rt_dst_alloc(init_net.loopback_dev,
1910 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1911 if (!rth)
1912 goto e_nobufs;
1914 #ifdef CONFIG_IP_ROUTE_CLASSID
1915 rth->dst.tclassid = itag;
1916 #endif
1917 rth->dst.output = ip_rt_bug;
1919 rth->rt_key_dst = daddr;
1920 rth->rt_key_src = saddr;
1921 rth->rt_genid = rt_genid(dev_net(dev));
1922 rth->rt_flags = RTCF_MULTICAST;
1923 rth->rt_type = RTN_MULTICAST;
1924 rth->rt_key_tos = tos;
1925 rth->rt_dst = daddr;
1926 rth->rt_src = saddr;
1927 rth->rt_route_iif = dev->ifindex;
1928 rth->rt_iif = dev->ifindex;
1929 rth->rt_oif = 0;
1930 rth->rt_mark = skb->mark;
1931 rth->rt_gateway = daddr;
1932 rth->rt_spec_dst= spec_dst;
1933 rth->rt_peer_genid = 0;
1934 rth->peer = NULL;
1935 rth->fi = NULL;
1936 if (our) {
1937 rth->dst.input= ip_local_deliver;
1938 rth->rt_flags |= RTCF_LOCAL;
1941 #ifdef CONFIG_IP_MROUTE
1942 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1943 rth->dst.input = ip_mr_input;
1944 #endif
1945 RT_CACHE_STAT_INC(in_slow_mc);
1947 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1948 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1949 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1951 e_nobufs:
1952 return -ENOBUFS;
1953 e_inval:
1954 return -EINVAL;
1955 e_err:
1956 return err;
1960 static void ip_handle_martian_source(struct net_device *dev,
1961 struct in_device *in_dev,
1962 struct sk_buff *skb,
1963 __be32 daddr,
1964 __be32 saddr)
1966 RT_CACHE_STAT_INC(in_martian_src);
1967 #ifdef CONFIG_IP_ROUTE_VERBOSE
1968 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1970 * RFC1812 recommendation: if the source is martian,
1971 * the only hint is the MAC header.
1973 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1974 &daddr, &saddr, dev->name);
1975 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1976 int i;
1977 const unsigned char *p = skb_mac_header(skb);
1978 printk(KERN_WARNING "ll header: ");
1979 for (i = 0; i < dev->hard_header_len; i++, p++) {
1980 printk("%02x", *p);
1981 if (i < (dev->hard_header_len - 1))
1982 printk(":");
1984 printk("\n");
1987 #endif
1990 /* called in rcu_read_lock() section */
1991 static int __mkroute_input(struct sk_buff *skb,
1992 const struct fib_result *res,
1993 struct in_device *in_dev,
1994 __be32 daddr, __be32 saddr, u32 tos,
1995 struct rtable **result)
1997 struct rtable *rth;
1998 int err;
1999 struct in_device *out_dev;
2000 unsigned int flags = 0;
2001 __be32 spec_dst;
2002 u32 itag;
2004 /* get a working reference to the output device */
2005 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2006 if (out_dev == NULL) {
2007 if (net_ratelimit())
2008 printk(KERN_CRIT "Bug in ip_route_input" \
2009 "_slow(). Please, report\n");
2010 return -EINVAL;
2014 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2015 in_dev->dev, &spec_dst, &itag);
2016 if (err < 0) {
2017 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2018 saddr);
2020 goto cleanup;
2023 if (err)
2024 flags |= RTCF_DIRECTSRC;
2026 if (out_dev == in_dev && err &&
2027 (IN_DEV_SHARED_MEDIA(out_dev) ||
2028 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2029 flags |= RTCF_DOREDIRECT;
2031 if (skb->protocol != htons(ETH_P_IP)) {
2032 /* Not IP (i.e. ARP). Do not create a route if it is
2033 * invalid for proxy arp. DNAT routes are always valid.
2035 * The proxy arp feature has been extended to allow ARP
2036 * replies back on the same interface, to support
2037 * Private VLAN switch technologies. See arp.c.
2039 if (out_dev == in_dev &&
2040 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2041 err = -EINVAL;
2042 goto cleanup;
2046 rth = rt_dst_alloc(out_dev->dev,
2047 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2048 IN_DEV_CONF_GET(out_dev, NOXFRM));
2049 if (!rth) {
2050 err = -ENOBUFS;
2051 goto cleanup;
2054 rth->rt_key_dst = daddr;
2055 rth->rt_key_src = saddr;
2056 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2057 rth->rt_flags = flags;
2058 rth->rt_type = res->type;
2059 rth->rt_key_tos = tos;
2060 rth->rt_dst = daddr;
2061 rth->rt_src = saddr;
2062 rth->rt_route_iif = in_dev->dev->ifindex;
2063 rth->rt_iif = in_dev->dev->ifindex;
2064 rth->rt_oif = 0;
2065 rth->rt_mark = skb->mark;
2066 rth->rt_gateway = daddr;
2067 rth->rt_spec_dst= spec_dst;
2068 rth->rt_peer_genid = 0;
2069 rth->peer = NULL;
2070 rth->fi = NULL;
2072 rth->dst.input = ip_forward;
2073 rth->dst.output = ip_output;
2075 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2077 *result = rth;
2078 err = 0;
2079 cleanup:
2080 return err;
2083 static int ip_mkroute_input(struct sk_buff *skb,
2084 struct fib_result *res,
2085 const struct flowi4 *fl4,
2086 struct in_device *in_dev,
2087 __be32 daddr, __be32 saddr, u32 tos)
2089 struct rtable* rth = NULL;
2090 int err;
2091 unsigned hash;
2093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res->fi && res->fi->fib_nhs > 1)
2095 fib_select_multipath(res);
2096 #endif
2098 /* create a routing cache entry */
2099 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2100 if (err)
2101 return err;
2103 /* put it into the cache */
2104 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2105 rt_genid(dev_net(rth->dst.dev)));
2106 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2107 if (IS_ERR(rth))
2108 return PTR_ERR(rth);
2109 return 0;
2113 * NOTE. We drop all packets that have local source
2114 * addresses, because every properly looped-back packet
2115 * must already have the correct destination attached by the output routine.
2117 * This approach solves two big problems:
2118 * 1. Non-simplex devices are handled properly.
2119 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2120 * called with rcu_read_lock()
2123 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124 u8 tos, struct net_device *dev)
2126 struct fib_result res;
2127 struct in_device *in_dev = __in_dev_get_rcu(dev);
2128 struct flowi4 fl4;
2129 unsigned flags = 0;
2130 u32 itag = 0;
2131 struct rtable * rth;
2132 unsigned hash;
2133 __be32 spec_dst;
2134 int err = -EINVAL;
2135 struct net * net = dev_net(dev);
2137 /* IP on this device is disabled. */
2139 if (!in_dev)
2140 goto out;
2142 /* Check for the weirdest martians, which cannot be detected
2143 by fib_lookup.
2146 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2147 ipv4_is_loopback(saddr))
2148 goto martian_source;
2150 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2151 goto brd_input;
2153 /* Accept zero addresses only for limited broadcast;
2154 * I do not even know whether to fix it or not. Waiting for complaints :-)
2156 if (ipv4_is_zeronet(saddr))
2157 goto martian_source;
2159 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2160 goto martian_destination;
2163 * Now we are ready to route the packet.
2165 fl4.flowi4_oif = 0;
2166 fl4.flowi4_iif = dev->ifindex;
2167 fl4.flowi4_mark = skb->mark;
2168 fl4.flowi4_tos = tos;
2169 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2170 fl4.daddr = daddr;
2171 fl4.saddr = saddr;
2172 err = fib_lookup(net, &fl4, &res);
2173 if (err != 0) {
2174 if (!IN_DEV_FORWARD(in_dev))
2175 goto e_hostunreach;
2176 goto no_route;
2179 RT_CACHE_STAT_INC(in_slow_tot);
2181 if (res.type == RTN_BROADCAST)
2182 goto brd_input;
2184 if (res.type == RTN_LOCAL) {
2185 err = fib_validate_source(skb, saddr, daddr, tos,
2186 net->loopback_dev->ifindex,
2187 dev, &spec_dst, &itag);
2188 if (err < 0)
2189 goto martian_source_keep_err;
2190 if (err)
2191 flags |= RTCF_DIRECTSRC;
2192 spec_dst = daddr;
2193 goto local_input;
2196 if (!IN_DEV_FORWARD(in_dev))
2197 goto e_hostunreach;
2198 if (res.type != RTN_UNICAST)
2199 goto martian_destination;
2201 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2202 out: return err;
2204 brd_input:
2205 if (skb->protocol != htons(ETH_P_IP))
2206 goto e_inval;
2208 if (ipv4_is_zeronet(saddr))
2209 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2210 else {
2211 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2212 &itag);
2213 if (err < 0)
2214 goto martian_source_keep_err;
2215 if (err)
2216 flags |= RTCF_DIRECTSRC;
2218 flags |= RTCF_BROADCAST;
2219 res.type = RTN_BROADCAST;
2220 RT_CACHE_STAT_INC(in_brd);
2222 local_input:
2223 rth = rt_dst_alloc(net->loopback_dev,
2224 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2225 if (!rth)
2226 goto e_nobufs;
2228 rth->dst.input= ip_local_deliver;
2229 rth->dst.output= ip_rt_bug;
2230 #ifdef CONFIG_IP_ROUTE_CLASSID
2231 rth->dst.tclassid = itag;
2232 #endif
2234 rth->rt_key_dst = daddr;
2235 rth->rt_key_src = saddr;
2236 rth->rt_genid = rt_genid(net);
2237 rth->rt_flags = flags|RTCF_LOCAL;
2238 rth->rt_type = res.type;
2239 rth->rt_key_tos = tos;
2240 rth->rt_dst = daddr;
2241 rth->rt_src = saddr;
2242 #ifdef CONFIG_IP_ROUTE_CLASSID
2243 rth->dst.tclassid = itag;
2244 #endif
2245 rth->rt_route_iif = dev->ifindex;
2246 rth->rt_iif = dev->ifindex;
2247 rth->rt_oif = 0;
2248 rth->rt_mark = skb->mark;
2249 rth->rt_gateway = daddr;
2250 rth->rt_spec_dst= spec_dst;
2251 rth->rt_peer_genid = 0;
2252 rth->peer = NULL;
2253 rth->fi = NULL;
2254 if (res.type == RTN_UNREACHABLE) {
2255 rth->dst.input= ip_error;
2256 rth->dst.error= -err;
2257 rth->rt_flags &= ~RTCF_LOCAL;
2259 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2260 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2261 err = 0;
2262 if (IS_ERR(rth))
2263 err = PTR_ERR(rth);
2264 goto out;
2266 no_route:
2267 RT_CACHE_STAT_INC(in_no_route);
2268 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2269 res.type = RTN_UNREACHABLE;
2270 if (err == -ESRCH)
2271 err = -ENETUNREACH;
2272 goto local_input;
2275 * Do not cache martian addresses: they should be logged (RFC1812)
2277 martian_destination:
2278 RT_CACHE_STAT_INC(in_martian_dst);
2279 #ifdef CONFIG_IP_ROUTE_VERBOSE
2280 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2281 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2282 &daddr, &saddr, dev->name);
2283 #endif
2285 e_hostunreach:
2286 err = -EHOSTUNREACH;
2287 goto out;
2289 e_inval:
2290 err = -EINVAL;
2291 goto out;
2293 e_nobufs:
2294 err = -ENOBUFS;
2295 goto out;
2297 martian_source:
2298 err = -EINVAL;
2299 martian_source_keep_err:
2300 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2301 goto out;
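/*
 * Input routing fast path: scan the route cache bucket for a matching
 * entry (keyed on daddr/saddr/iif/tos, plus mark and netns) and attach
 * it to the skb, either refcounted or noref.  On a miss, or when
 * caching is disabled, recognise multicast destinations here and hand
 * everything else to ip_route_input_slow().
 */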
2304 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2305 u8 tos, struct net_device *dev, bool noref)
2307 struct rtable * rth;
2308 unsigned hash;
2309 int iif = dev->ifindex;
2310 struct net *net;
2311 int res;
2313 net = dev_net(dev);
2315 rcu_read_lock();
2317 if (!rt_caching(net))
2318 goto skip_cache;
2320 tos &= IPTOS_RT_MASK;
2321 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2323 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2324 rth = rcu_dereference(rth->dst.rt_next)) {
2325 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2326 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2327 (rth->rt_route_iif ^ iif) |
2328 (rth->rt_key_tos ^ tos)) == 0 &&
2329 rth->rt_mark == skb->mark &&
2330 net_eq(dev_net(rth->dst.dev), net) &&
2331 !rt_is_expired(rth)) {
2332 if (noref) {
2333 dst_use_noref(&rth->dst, jiffies);
2334 skb_dst_set_noref(skb, &rth->dst);
2335 } else {
2336 dst_use(&rth->dst, jiffies);
2337 skb_dst_set(skb, &rth->dst);
2339 RT_CACHE_STAT_INC(in_hit);
2340 rcu_read_unlock();
2341 return 0;
2343 RT_CACHE_STAT_INC(in_hlist_search);
2346 skip_cache:
2347 /* Multicast recognition logic was moved from the route cache to here.
2348 The problem was that too many Ethernet cards have broken/missing
2349 hardware multicast filters :-( As a result, a host on a multicast
2350 network acquires a lot of useless route cache entries, e.g. for
2351 SDR messages from all over the world. Now we try to get rid of them.
2352 Really, provided the software IP multicast filter is organized
2353 reasonably (at least, hashed), it does not result in a slowdown
2354 compared with route cache reject entries.
2355 Note that multicast routers are not affected, because
2356 a route cache entry is created eventually.
2358 if (ipv4_is_multicast(daddr)) {
2359 struct in_device *in_dev = __in_dev_get_rcu(dev);
2361 if (in_dev) {
2362 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2363 ip_hdr(skb)->protocol);
2364 if (our
2365 #ifdef CONFIG_IP_MROUTE
2367 (!ipv4_is_local_multicast(daddr) &&
2368 IN_DEV_MFORWARD(in_dev))
2369 #endif
2371 int res = ip_route_input_mc(skb, daddr, saddr,
2372 tos, dev, our);
2373 rcu_read_unlock();
2374 return res;
2377 rcu_read_unlock();
2378 return -EINVAL;
2380 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2381 rcu_read_unlock();
2382 return res;
2384 EXPORT_SYMBOL(ip_route_input_common);
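/*
 * Build an output route cache entry for a resolved fib_result:
 * reject invalid loopback/zeronet combinations, classify broadcast,
 * multicast and local destinations, allocate the rtable and fill in
 * its keys before rt_set_nexthop() attaches the nexthop data.
 */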
2386 /* called with rcu_read_lock() */
2387 static struct rtable *__mkroute_output(const struct fib_result *res,
2388 const struct flowi4 *fl4,
2389 __be32 orig_daddr, __be32 orig_saddr,
2390 int orig_oif, struct net_device *dev_out,
2391 unsigned int flags)
2393 struct fib_info *fi = res->fi;
2394 u32 tos = RT_FL_TOS(fl4);
2395 struct in_device *in_dev;
2396 u16 type = res->type;
2397 struct rtable *rth;
2399 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2400 return ERR_PTR(-EINVAL);
2402 if (ipv4_is_lbcast(fl4->daddr))
2403 type = RTN_BROADCAST;
2404 else if (ipv4_is_multicast(fl4->daddr))
2405 type = RTN_MULTICAST;
2406 else if (ipv4_is_zeronet(fl4->daddr))
2407 return ERR_PTR(-EINVAL);
2409 if (dev_out->flags & IFF_LOOPBACK)
2410 flags |= RTCF_LOCAL;
2412 in_dev = __in_dev_get_rcu(dev_out);
2413 if (!in_dev)
2414 return ERR_PTR(-EINVAL);
2416 if (type == RTN_BROADCAST) {
2417 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2418 fi = NULL;
2419 } else if (type == RTN_MULTICAST) {
2420 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2421 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2422 fl4->flowi4_proto))
2423 flags &= ~RTCF_LOCAL;
2424 /* If a multicast route does not exist, use the
2425 * default one, but do not use a gateway in this case.
2426 * Yes, it is a hack.
2428 if (fi && res->prefixlen < 4)
2429 fi = NULL;
2432 rth = rt_dst_alloc(dev_out,
2433 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2434 IN_DEV_CONF_GET(in_dev, NOXFRM));
2435 if (!rth)
2436 return ERR_PTR(-ENOBUFS);
2438 rth->dst.output = ip_output;
2440 rth->rt_key_dst = orig_daddr;
2441 rth->rt_key_src = orig_saddr;
2442 rth->rt_genid = rt_genid(dev_net(dev_out));
2443 rth->rt_flags = flags;
2444 rth->rt_type = type;
2445 rth->rt_key_tos = tos;
2446 rth->rt_dst = fl4->daddr;
2447 rth->rt_src = fl4->saddr;
2448 rth->rt_route_iif = 0;
2449 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2450 rth->rt_oif = orig_oif;
2451 rth->rt_mark = fl4->flowi4_mark;
2452 rth->rt_gateway = fl4->daddr;
2453 rth->rt_spec_dst= fl4->saddr;
2454 rth->rt_peer_genid = 0;
2455 rth->peer = NULL;
2456 rth->fi = NULL;
2458 RT_CACHE_STAT_INC(out_slow_tot);
2460 if (flags & RTCF_LOCAL) {
2461 rth->dst.input = ip_local_deliver;
2462 rth->rt_spec_dst = fl4->daddr;
2464 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2465 rth->rt_spec_dst = fl4->saddr;
2466 if (flags & RTCF_LOCAL &&
2467 !(dev_out->flags & IFF_LOOPBACK)) {
2468 rth->dst.output = ip_mc_output;
2469 RT_CACHE_STAT_INC(out_slow_mc);
2471 #ifdef CONFIG_IP_MROUTE
2472 if (type == RTN_MULTICAST) {
2473 if (IN_DEV_MFORWARD(in_dev) &&
2474 !ipv4_is_local_multicast(fl4->daddr)) {
2475 rth->dst.input = ip_mr_input;
2476 rth->dst.output = ip_mc_output;
2479 #endif
2482 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2484 return rth;
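/*
 * ip_route_output_slow() resolves an output route from scratch: it
 * validates any requested source address, handles the multicast /
 * limited-broadcast and bound-oif special cases, performs the FIB
 * lookup (falling back to an on-link assumption when a lookup with an
 * explicit oif fails), applies multipath and default-route selection,
 * and finally hands the result to __mkroute_output().
 */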
2488 * Major route resolver routine.
2489 * called with rcu_read_lock();
2492 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2494 struct net_device *dev_out = NULL;
2495 u32 tos = RT_FL_TOS(fl4);
2496 unsigned int flags = 0;
2497 struct fib_result res;
2498 struct rtable *rth;
2499 __be32 orig_daddr;
2500 __be32 orig_saddr;
2501 int orig_oif;
2503 res.fi = NULL;
2504 #ifdef CONFIG_IP_MULTIPLE_TABLES
2505 res.r = NULL;
2506 #endif
2508 orig_daddr = fl4->daddr;
2509 orig_saddr = fl4->saddr;
2510 orig_oif = fl4->flowi4_oif;
2512 fl4->flowi4_iif = net->loopback_dev->ifindex;
2513 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2514 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2515 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2517 rcu_read_lock();
2518 if (fl4->saddr) {
2519 rth = ERR_PTR(-EINVAL);
2520 if (ipv4_is_multicast(fl4->saddr) ||
2521 ipv4_is_lbcast(fl4->saddr) ||
2522 ipv4_is_zeronet(fl4->saddr))
2523 goto out;
2525 /* I removed the check for oif == dev_out->oif here.
2526 It was wrong for two reasons:
2527 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2528 is assigned to multiple interfaces.
2529 2. Moreover, we are allowed to send packets with the saddr
2530 of another iface. --ANK
2533 if (fl4->flowi4_oif == 0 &&
2534 (ipv4_is_multicast(fl4->daddr) ||
2535 ipv4_is_lbcast(fl4->daddr))) {
2536 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2537 dev_out = __ip_dev_find(net, fl4->saddr, false);
2538 if (dev_out == NULL)
2539 goto out;
2541 /* Special hack: the user can direct multicasts
2542 and limited broadcast via the necessary interface
2543 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2544 This hack is not just for fun; it allows
2545 vic, vat and friends to work.
2546 They bind a socket to loopback, set the ttl to zero
2547 and expect that it will work.
2548 From the viewpoint of the routing cache they are broken,
2549 because we are not allowed to build a multicast path
2550 with a loopback source addr (look, the routing cache
2551 cannot know that the ttl is zero, so the packet
2552 will not leave this host and the route is valid).
2553 Luckily, this hack is a good workaround.
2556 fl4->flowi4_oif = dev_out->ifindex;
2557 goto make_route;
2560 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2561 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2562 if (!__ip_dev_find(net, fl4->saddr, false))
2563 goto out;
2568 if (fl4->flowi4_oif) {
2569 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2570 rth = ERR_PTR(-ENODEV);
2571 if (dev_out == NULL)
2572 goto out;
2574 /* RACE: Check return value of inet_select_addr instead. */
2575 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2576 rth = ERR_PTR(-ENETUNREACH);
2577 goto out;
2579 if (ipv4_is_local_multicast(fl4->daddr) ||
2580 ipv4_is_lbcast(fl4->daddr)) {
2581 if (!fl4->saddr)
2582 fl4->saddr = inet_select_addr(dev_out, 0,
2583 RT_SCOPE_LINK);
2584 goto make_route;
2586 if (fl4->saddr) {
2587 if (ipv4_is_multicast(fl4->daddr))
2588 fl4->saddr = inet_select_addr(dev_out, 0,
2589 fl4->flowi4_scope);
2590 else if (!fl4->daddr)
2591 fl4->saddr = inet_select_addr(dev_out, 0,
2592 RT_SCOPE_HOST);
2596 if (!fl4->daddr) {
2597 fl4->daddr = fl4->saddr;
2598 if (!fl4->daddr)
2599 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2600 dev_out = net->loopback_dev;
2601 fl4->flowi4_oif = net->loopback_dev->ifindex;
2602 res.type = RTN_LOCAL;
2603 flags |= RTCF_LOCAL;
2604 goto make_route;
2607 if (fib_lookup(net, fl4, &res)) {
2608 res.fi = NULL;
2609 if (fl4->flowi4_oif) {
2610 /* Apparently, the routing tables are wrong. Assume
2611 that the destination is on-link.
2613 WHY? DW.
2614 Because we are allowed to send to an iface
2615 even if it has NO routes and NO assigned
2616 addresses. When oif is specified, the routing
2617 tables are looked up with only one purpose:
2618 to catch whether the destination is gatewayed, rather than
2619 direct. Moreover, if MSG_DONTROUTE is set,
2620 we send the packet, ignoring both the routing tables
2621 and the ifaddr state. --ANK
2624 We could do this even if oif is unknown,
2625 likely IPv6, but we do not.
2628 if (fl4->saddr == 0)
2629 fl4->saddr = inet_select_addr(dev_out, 0,
2630 RT_SCOPE_LINK);
2631 res.type = RTN_UNICAST;
2632 goto make_route;
2634 rth = ERR_PTR(-ENETUNREACH);
2635 goto out;
2638 if (res.type == RTN_LOCAL) {
2639 if (!fl4->saddr) {
2640 if (res.fi->fib_prefsrc)
2641 fl4->saddr = res.fi->fib_prefsrc;
2642 else
2643 fl4->saddr = fl4->daddr;
2645 dev_out = net->loopback_dev;
2646 fl4->flowi4_oif = dev_out->ifindex;
2647 res.fi = NULL;
2648 flags |= RTCF_LOCAL;
2649 goto make_route;
2652 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2653 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2654 fib_select_multipath(&res);
2655 else
2656 #endif
2657 if (!res.prefixlen &&
2658 res.table->tb_num_default > 1 &&
2659 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2660 fib_select_default(&res);
2662 if (!fl4->saddr)
2663 fl4->saddr = FIB_RES_PREFSRC(net, res);
2665 dev_out = FIB_RES_DEV(res);
2666 fl4->flowi4_oif = dev_out->ifindex;
2669 make_route:
2670 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2671 dev_out, flags);
2672 if (!IS_ERR(rth)) {
2673 unsigned int hash;
2675 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2676 rt_genid(dev_net(dev_out)));
2677 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2680 out:
2681 rcu_read_unlock();
2682 return rth;
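/*
 * Output routing fast path: when caching is enabled, scan the hash
 * chain for an entry matching daddr/saddr/oif/mark/tos in this netns
 * and take a reference on it; otherwise fall through to
 * ip_route_output_slow().  On a hit, missing saddr/daddr in the flow
 * are filled in from the cached route.
 */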
2685 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2687 struct rtable *rth;
2688 unsigned int hash;
2690 if (!rt_caching(net))
2691 goto slow_output;
2693 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2695 rcu_read_lock_bh();
2696 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2697 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2698 if (rth->rt_key_dst == flp4->daddr &&
2699 rth->rt_key_src == flp4->saddr &&
2700 rt_is_output_route(rth) &&
2701 rth->rt_oif == flp4->flowi4_oif &&
2702 rth->rt_mark == flp4->flowi4_mark &&
2703 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2704 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2705 net_eq(dev_net(rth->dst.dev), net) &&
2706 !rt_is_expired(rth)) {
2707 dst_use(&rth->dst, jiffies);
2708 RT_CACHE_STAT_INC(out_hit);
2709 rcu_read_unlock_bh();
2710 if (!flp4->saddr)
2711 flp4->saddr = rth->rt_src;
2712 if (!flp4->daddr)
2713 flp4->daddr = rth->rt_dst;
2714 return rth;
2716 RT_CACHE_STAT_INC(out_hlist_search);
2718 rcu_read_unlock_bh();
2720 slow_output:
2721 return ip_route_output_slow(net, flp4);
2723 EXPORT_SYMBOL_GPL(__ip_route_output_key);
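/*
 * Blackhole dst_ops: a stripped-down variant whose entries never
 * validate (check returns NULL), report no MTU and ignore PMTU
 * updates.  ipv4_blackhole_route() clones an existing route into such
 * an entry with dst_discard input/output, e.g. so that the xfrm code
 * can quietly drop packets while their transforms are still being
 * resolved.
 */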
2725 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2727 return NULL;
2730 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2732 return 0;
2735 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2739 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2740 unsigned long old)
2742 return NULL;
2745 static struct dst_ops ipv4_dst_blackhole_ops = {
2746 .family = AF_INET,
2747 .protocol = cpu_to_be16(ETH_P_IP),
2748 .destroy = ipv4_dst_destroy,
2749 .check = ipv4_blackhole_dst_check,
2750 .default_mtu = ipv4_blackhole_default_mtu,
2751 .default_advmss = ipv4_default_advmss,
2752 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2753 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2754 .neigh_lookup = ipv4_neigh_lookup,
2757 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2759 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2760 struct rtable *ort = (struct rtable *) dst_orig;
2762 if (rt) {
2763 struct dst_entry *new = &rt->dst;
2765 new->__use = 1;
2766 new->input = dst_discard;
2767 new->output = dst_discard;
2768 dst_copy_metrics(new, &ort->dst);
2770 new->dev = ort->dst.dev;
2771 if (new->dev)
2772 dev_hold(new->dev);
2774 rt->rt_key_dst = ort->rt_key_dst;
2775 rt->rt_key_src = ort->rt_key_src;
2776 rt->rt_key_tos = ort->rt_key_tos;
2777 rt->rt_route_iif = ort->rt_route_iif;
2778 rt->rt_iif = ort->rt_iif;
2779 rt->rt_oif = ort->rt_oif;
2780 rt->rt_mark = ort->rt_mark;
2782 rt->rt_genid = rt_genid(net);
2783 rt->rt_flags = ort->rt_flags;
2784 rt->rt_type = ort->rt_type;
2785 rt->rt_dst = ort->rt_dst;
2786 rt->rt_src = ort->rt_src;
2787 rt->rt_gateway = ort->rt_gateway;
2788 rt->rt_spec_dst = ort->rt_spec_dst;
2789 rt->peer = ort->peer;
2790 if (rt->peer)
2791 atomic_inc(&rt->peer->refcnt);
2792 rt->fi = ort->fi;
2793 if (rt->fi)
2794 atomic_inc(&rt->fi->fib_clntref);
2796 dst_free(new);
2799 dst_release(dst_orig);
2801 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
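/*
 * ip_route_output_flow(): resolve an output route via
 * __ip_route_output_key() and, when the flow carries a protocol,
 * run the result through xfrm_lookup() so IPsec policy is applied.
 */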
2804 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2805 struct sock *sk)
2807 struct rtable *rt = __ip_route_output_key(net, flp4);
2809 if (IS_ERR(rt))
2810 return rt;
2812 if (flp4->flowi4_proto)
2813 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2814 flowi4_to_flowi(flp4),
2815 sk, 0);
2817 return rt;
2819 EXPORT_SYMBOL_GPL(ip_route_output_flow);
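/*
 * Fill an RTM_NEWROUTE netlink message from a cached rtable: route
 * keys, output device, classid, preferred source, gateway, metrics,
 * mark and cache info, plus the multicast forwarding data for input
 * routes when CONFIG_IP_MROUTE is enabled.
 */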
2821 static int rt_fill_info(struct net *net,
2822 struct sk_buff *skb, u32 pid, u32 seq, int event,
2823 int nowait, unsigned int flags)
2825 struct rtable *rt = skb_rtable(skb);
2826 struct rtmsg *r;
2827 struct nlmsghdr *nlh;
2828 long expires = 0;
2829 const struct inet_peer *peer = rt->peer;
2830 u32 id = 0, ts = 0, tsage = 0, error;
2832 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2833 if (nlh == NULL)
2834 return -EMSGSIZE;
2836 r = nlmsg_data(nlh);
2837 r->rtm_family = AF_INET;
2838 r->rtm_dst_len = 32;
2839 r->rtm_src_len = 0;
2840 r->rtm_tos = rt->rt_key_tos;
2841 r->rtm_table = RT_TABLE_MAIN;
2842 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2843 r->rtm_type = rt->rt_type;
2844 r->rtm_scope = RT_SCOPE_UNIVERSE;
2845 r->rtm_protocol = RTPROT_UNSPEC;
2846 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2847 if (rt->rt_flags & RTCF_NOTIFY)
2848 r->rtm_flags |= RTM_F_NOTIFY;
2850 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2852 if (rt->rt_key_src) {
2853 r->rtm_src_len = 32;
2854 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2856 if (rt->dst.dev)
2857 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2858 #ifdef CONFIG_IP_ROUTE_CLASSID
2859 if (rt->dst.tclassid)
2860 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2861 #endif
2862 if (rt_is_input_route(rt))
2863 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2864 else if (rt->rt_src != rt->rt_key_src)
2865 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2867 if (rt->rt_dst != rt->rt_gateway)
2868 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2870 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2871 goto nla_put_failure;
2873 if (rt->rt_mark)
2874 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2876 error = rt->dst.error;
2877 if (peer) {
2878 inet_peer_refcheck(rt->peer);
2879 id = atomic_read(&peer->ip_id_count) & 0xffff;
2880 if (peer->tcp_ts_stamp) {
2881 ts = peer->tcp_ts;
2882 tsage = get_seconds() - peer->tcp_ts_stamp;
2884 expires = ACCESS_ONCE(peer->pmtu_expires);
2885 if (expires)
2886 expires -= jiffies;
2889 if (rt_is_input_route(rt)) {
2890 #ifdef CONFIG_IP_MROUTE
2891 __be32 dst = rt->rt_dst;
2893 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2894 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2895 int err = ipmr_get_route(net, skb,
2896 rt->rt_src, rt->rt_dst,
2897 r, nowait);
2898 if (err <= 0) {
2899 if (!nowait) {
2900 if (err == 0)
2901 return 0;
2902 goto nla_put_failure;
2903 } else {
2904 if (err == -EMSGSIZE)
2905 goto nla_put_failure;
2906 error = err;
2909 } else
2910 #endif
2911 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2914 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2915 expires, error) < 0)
2916 goto nla_put_failure;
2918 return nlmsg_end(skb, nlh);
2920 nla_put_failure:
2921 nlmsg_cancel(skb, nlh);
2922 return -EMSGSIZE;
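/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route with
 * ip_route_input() when RTA_IIF is given or ip_route_output_key()
 * otherwise, and unicast the result back via rt_fill_info().  This is
 * what e.g. "ip route get <addr>" ends up exercising from user space.
 */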
2925 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2927 struct net *net = sock_net(in_skb->sk);
2928 struct rtmsg *rtm;
2929 struct nlattr *tb[RTA_MAX+1];
2930 struct rtable *rt = NULL;
2931 __be32 dst = 0;
2932 __be32 src = 0;
2933 u32 iif;
2934 int err;
2935 int mark;
2936 struct sk_buff *skb;
2938 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2939 if (err < 0)
2940 goto errout;
2942 rtm = nlmsg_data(nlh);
2944 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2945 if (skb == NULL) {
2946 err = -ENOBUFS;
2947 goto errout;
2950 /* Reserve room for dummy headers; this skb can pass
2951 through a good chunk of the routing engine.
2953 skb_reset_mac_header(skb);
2954 skb_reset_network_header(skb);
2956 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2957 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2958 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2960 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2961 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2962 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2963 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2965 if (iif) {
2966 struct net_device *dev;
2968 dev = __dev_get_by_index(net, iif);
2969 if (dev == NULL) {
2970 err = -ENODEV;
2971 goto errout_free;
2974 skb->protocol = htons(ETH_P_IP);
2975 skb->dev = dev;
2976 skb->mark = mark;
2977 local_bh_disable();
2978 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2979 local_bh_enable();
2981 rt = skb_rtable(skb);
2982 if (err == 0 && rt->dst.error)
2983 err = -rt->dst.error;
2984 } else {
2985 struct flowi4 fl4 = {
2986 .daddr = dst,
2987 .saddr = src,
2988 .flowi4_tos = rtm->rtm_tos,
2989 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2990 .flowi4_mark = mark,
2992 rt = ip_route_output_key(net, &fl4);
2994 err = 0;
2995 if (IS_ERR(rt))
2996 err = PTR_ERR(rt);
2999 if (err)
3000 goto errout_free;
3002 skb_dst_set(skb, &rt->dst);
3003 if (rtm->rtm_flags & RTM_F_NOTIFY)
3004 rt->rt_flags |= RTCF_NOTIFY;
3006 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3007 RTM_NEWROUTE, 0, 0);
3008 if (err <= 0)
3009 goto errout_free;
3011 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3012 errout:
3013 return err;
3015 errout_free:
3016 kfree_skb(skb);
3017 goto errout;
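/*
 * Dump the route cache over netlink: walk every hash bucket from the
 * position saved in cb->args, skip entries from other namespaces and
 * expired entries, and emit each remaining one as an RTM_NEWROUTE
 * multipart message.
 */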
3020 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3022 struct rtable *rt;
3023 int h, s_h;
3024 int idx, s_idx;
3025 struct net *net;
3027 net = sock_net(skb->sk);
3029 s_h = cb->args[0];
3030 if (s_h < 0)
3031 s_h = 0;
3032 s_idx = idx = cb->args[1];
3033 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3034 if (!rt_hash_table[h].chain)
3035 continue;
3036 rcu_read_lock_bh();
3037 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3038 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3039 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3040 continue;
3041 if (rt_is_expired(rt))
3042 continue;
3043 skb_dst_set_noref(skb, &rt->dst);
3044 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3045 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3046 1, NLM_F_MULTI) <= 0) {
3047 skb_dst_drop(skb);
3048 rcu_read_unlock_bh();
3049 goto done;
3051 skb_dst_drop(skb);
3053 rcu_read_unlock_bh();
3056 done:
3057 cb->args[0] = h;
3058 cb->args[1] = idx;
3059 return skb->len;
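/*
 * Called when the multicast configuration of an interface changes;
 * simply flushes the route cache for the device's namespace.
 */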
3062 void ip_rt_multicast_event(struct in_device *in_dev)
3064 rt_cache_flush(dev_net(in_dev->dev), 0);
3067 #ifdef CONFIG_SYSCTL
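/*
 * Handler for /proc/sys/net/ipv4/route/flush.  Writes only: the value
 * written is parsed as an integer and passed to rt_cache_flush() as
 * the flush delay, so e.g. "echo 0 > /proc/sys/net/ipv4/route/flush"
 * flushes the cache right away.  Reads return -EINVAL.
 */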
3068 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3069 void __user *buffer,
3070 size_t *lenp, loff_t *ppos)
3072 if (write) {
3073 int flush_delay;
3074 ctl_table ctl;
3075 struct net *net;
3077 memcpy(&ctl, __ctl, sizeof(ctl));
3078 ctl.data = &flush_delay;
3079 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3081 net = (struct net *)__ctl->extra1;
3082 rt_cache_flush(net, flush_delay);
3083 return 0;
3086 return -EINVAL;
3089 static ctl_table ipv4_route_table[] = {
3091 .procname = "gc_thresh",
3092 .data = &ipv4_dst_ops.gc_thresh,
3093 .maxlen = sizeof(int),
3094 .mode = 0644,
3095 .proc_handler = proc_dointvec,
3098 .procname = "max_size",
3099 .data = &ip_rt_max_size,
3100 .maxlen = sizeof(int),
3101 .mode = 0644,
3102 .proc_handler = proc_dointvec,
3105 /* Deprecated. Use gc_min_interval_ms */
3107 .procname = "gc_min_interval",
3108 .data = &ip_rt_gc_min_interval,
3109 .maxlen = sizeof(int),
3110 .mode = 0644,
3111 .proc_handler = proc_dointvec_jiffies,
3114 .procname = "gc_min_interval_ms",
3115 .data = &ip_rt_gc_min_interval,
3116 .maxlen = sizeof(int),
3117 .mode = 0644,
3118 .proc_handler = proc_dointvec_ms_jiffies,
3121 .procname = "gc_timeout",
3122 .data = &ip_rt_gc_timeout,
3123 .maxlen = sizeof(int),
3124 .mode = 0644,
3125 .proc_handler = proc_dointvec_jiffies,
3128 .procname = "gc_interval",
3129 .data = &ip_rt_gc_interval,
3130 .maxlen = sizeof(int),
3131 .mode = 0644,
3132 .proc_handler = proc_dointvec_jiffies,
3135 .procname = "redirect_load",
3136 .data = &ip_rt_redirect_load,
3137 .maxlen = sizeof(int),
3138 .mode = 0644,
3139 .proc_handler = proc_dointvec,
3142 .procname = "redirect_number",
3143 .data = &ip_rt_redirect_number,
3144 .maxlen = sizeof(int),
3145 .mode = 0644,
3146 .proc_handler = proc_dointvec,
3149 .procname = "redirect_silence",
3150 .data = &ip_rt_redirect_silence,
3151 .maxlen = sizeof(int),
3152 .mode = 0644,
3153 .proc_handler = proc_dointvec,
3156 .procname = "error_cost",
3157 .data = &ip_rt_error_cost,
3158 .maxlen = sizeof(int),
3159 .mode = 0644,
3160 .proc_handler = proc_dointvec,
3163 .procname = "error_burst",
3164 .data = &ip_rt_error_burst,
3165 .maxlen = sizeof(int),
3166 .mode = 0644,
3167 .proc_handler = proc_dointvec,
3170 .procname = "gc_elasticity",
3171 .data = &ip_rt_gc_elasticity,
3172 .maxlen = sizeof(int),
3173 .mode = 0644,
3174 .proc_handler = proc_dointvec,
3177 .procname = "mtu_expires",
3178 .data = &ip_rt_mtu_expires,
3179 .maxlen = sizeof(int),
3180 .mode = 0644,
3181 .proc_handler = proc_dointvec_jiffies,
3184 .procname = "min_pmtu",
3185 .data = &ip_rt_min_pmtu,
3186 .maxlen = sizeof(int),
3187 .mode = 0644,
3188 .proc_handler = proc_dointvec,
3191 .procname = "min_adv_mss",
3192 .data = &ip_rt_min_advmss,
3193 .maxlen = sizeof(int),
3194 .mode = 0644,
3195 .proc_handler = proc_dointvec,
3200 static struct ctl_table empty[1];
3202 static struct ctl_table ipv4_skeleton[] =
3204 { .procname = "route",
3205 .mode = 0555, .child = ipv4_route_table},
3206 { .procname = "neigh",
3207 .mode = 0555, .child = empty},
3211 static __net_initdata struct ctl_path ipv4_path[] = {
3212 { .procname = "net", },
3213 { .procname = "ipv4", },
3214 { },
3217 static struct ctl_table ipv4_route_flush_table[] = {
3219 .procname = "flush",
3220 .maxlen = sizeof(int),
3221 .mode = 0200,
3222 .proc_handler = ipv4_sysctl_rtcache_flush,
3224 { },
3227 static __net_initdata struct ctl_path ipv4_route_path[] = {
3228 { .procname = "net", },
3229 { .procname = "ipv4", },
3230 { .procname = "route", },
3231 { },
3234 static __net_init int sysctl_route_net_init(struct net *net)
3236 struct ctl_table *tbl;
3238 tbl = ipv4_route_flush_table;
3239 if (!net_eq(net, &init_net)) {
3240 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3241 if (tbl == NULL)
3242 goto err_dup;
3244 tbl[0].extra1 = net;
3246 net->ipv4.route_hdr =
3247 register_net_sysctl_table(net, ipv4_route_path, tbl);
3248 if (net->ipv4.route_hdr == NULL)
3249 goto err_reg;
3250 return 0;
3252 err_reg:
3253 if (tbl != ipv4_route_flush_table)
3254 kfree(tbl);
3255 err_dup:
3256 return -ENOMEM;
3259 static __net_exit void sysctl_route_net_exit(struct net *net)
3261 struct ctl_table *tbl;
3263 tbl = net->ipv4.route_hdr->ctl_table_arg;
3264 unregister_net_sysctl_table(net->ipv4.route_hdr);
3265 BUG_ON(tbl == ipv4_route_flush_table);
3266 kfree(tbl);
3269 static __net_initdata struct pernet_operations sysctl_route_ops = {
3270 .init = sysctl_route_net_init,
3271 .exit = sysctl_route_net_exit,
3273 #endif
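/*
 * Per-namespace init: seed rt_genid and dev_addr_genid with random
 * values.  rt_genid is the generation counter that cached routes are
 * compared against in rt_is_expired(), so bumping it invalidates the
 * whole cache for that namespace.
 */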
3275 static __net_init int rt_genid_init(struct net *net)
3277 get_random_bytes(&net->ipv4.rt_genid,
3278 sizeof(net->ipv4.rt_genid));
3279 get_random_bytes(&net->ipv4.dev_addr_genid,
3280 sizeof(net->ipv4.dev_addr_genid));
3281 return 0;
3284 static __net_initdata struct pernet_operations rt_genid_ops = {
3285 .init = rt_genid_init,
3289 #ifdef CONFIG_IP_ROUTE_CLASSID
3290 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3291 #endif /* CONFIG_IP_ROUTE_CLASSID */
3293 static __initdata unsigned long rhash_entries;
3294 static int __init set_rhash_entries(char *str)
3296 if (!str)
3297 return 0;
3298 rhash_entries = simple_strtoul(str, &str, 0);
3299 return 1;
3301 __setup("rhash_entries=", set_rhash_entries);
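/*
 * ip_rt_init(): boot-time setup of the routing subsystem.  Allocates
 * the per-cpu classid accounting (if configured) and the dst slab,
 * sizes the route cache hash with alloc_large_system_hash() (the
 * "rhash_entries=" boot parameter, e.g. rhash_entries=262144, overrides
 * the automatic sizing), derives gc_thresh and ip_rt_max_size from the
 * hash size, and registers the proc files, the RTM_GETROUTE handler
 * and the per-net sysctl/genid operations.
 */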
3303 int __init ip_rt_init(void)
3305 int rc = 0;
3307 #ifdef CONFIG_IP_ROUTE_CLASSID
3308 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3309 if (!ip_rt_acct)
3310 panic("IP: failed to allocate ip_rt_acct\n");
3311 #endif
3313 ipv4_dst_ops.kmem_cachep =
3314 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3315 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3317 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3319 if (dst_entries_init(&ipv4_dst_ops) < 0)
3320 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3322 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3323 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3325 rt_hash_table = (struct rt_hash_bucket *)
3326 alloc_large_system_hash("IP route cache",
3327 sizeof(struct rt_hash_bucket),
3328 rhash_entries,
3329 (totalram_pages >= 128 * 1024) ?
3330 15 : 17,
3332 &rt_hash_log,
3333 &rt_hash_mask,
3334 rhash_entries ? 0 : 512 * 1024);
3335 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3336 rt_hash_lock_init();
3338 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3339 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3341 devinet_init();
3342 ip_fib_init();
3344 if (ip_rt_proc_init())
3345 printk(KERN_ERR "Unable to create route proc files\n");
3346 #ifdef CONFIG_XFRM
3347 xfrm_init();
3348 xfrm4_init(ip_rt_max_size);
3349 #endif
3350 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3352 #ifdef CONFIG_SYSCTL
3353 register_pernet_subsys(&sysctl_route_ops);
3354 #endif
3355 register_pernet_subsys(&rt_genid_ops);
3356 return rc;
3359 #ifdef CONFIG_SYSCTL
3361 * We really need to sanitize the damn ipv4 init order, then all
3362 * this nonsense will go away.
3364 void __init ip_static_sysctl_init(void)
3366 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3368 #endif