net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111 #include <net/atmclip.h>
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_gc_elasticity __read_mostly    = 8;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134 static int rt_chain_length_max __read_mostly    = 20;
 135
 136 /*
 137  *      Interface to generic destination cache.
 138  */
 139
 140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 142 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
 143 static void              ipv4_dst_destroy(struct dst_entry *dst);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 147 static int rt_garbage_collect(struct dst_ops *ops);
 148
 149 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 150                             int how)
 151 {
 152 }
 153
 154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 155 {
 156         struct rtable *rt = (struct rtable *) dst;
 157         struct inet_peer *peer;
 158         u32 *p = NULL;
 159
 160         if (!rt->peer)
 161                 rt_bind_peer(rt, rt->rt_dst, 1);
 162
 163         peer = rt->peer;
 164         if (peer) {
 165                 u32 *old_p = __DST_METRICS_PTR(old);
 166                 unsigned long prev, new;
 167
 168                 p = peer->metrics;
 169                 if (inet_metrics_new(peer))
 170                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 171
 172                 new = (unsigned long) p;
 173                 prev = cmpxchg(&dst->_metrics, old, new);
 174
 175                 if (prev != old) {
 176                         p = __DST_METRICS_PTR(prev);
 177                         if (prev & DST_METRICS_READ_ONLY)
 178                                 p = NULL;
 179                 } else {
 180                         if (rt->fi) {
 181                                 fib_info_put(rt->fi);
 182                                 rt->fi = NULL;
 183                         }
 184                 }
 185         }
 186         return p;
 187 }
 188
 189 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 190
 191 static struct dst_ops ipv4_dst_ops = {
 192         .family =               AF_INET,
 193         .protocol =             cpu_to_be16(ETH_P_IP),
 194         .gc =                   rt_garbage_collect,
 195         .check =                ipv4_dst_check,
 196         .default_advmss =       ipv4_default_advmss,
 197         .default_mtu =          ipv4_default_mtu,
 198         .cow_metrics =          ipv4_cow_metrics,
 199         .destroy =              ipv4_dst_destroy,
 200         .ifdown =               ipv4_dst_ifdown,
 201         .negative_advice =      ipv4_negative_advice,
 202         .link_failure =         ipv4_link_failure,
 203         .update_pmtu =          ip_rt_update_pmtu,
 204         .local_out =            __ip_local_out,
 205         .neigh_lookup =         ipv4_neigh_lookup,
 206 };
 207
 208 #define ECN_OR_COST(class)      TC_PRIO_##class
 209
 210 const __u8 ip_tos2prio[16] = {
 211         TC_PRIO_BESTEFFORT,
 212         ECN_OR_COST(BESTEFFORT),
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BULK,
 216         ECN_OR_COST(BULK),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_INTERACTIVE,
 220         ECN_OR_COST(INTERACTIVE),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE_BULK,
 224         ECN_OR_COST(INTERACTIVE_BULK),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK)
 227 };
 228
 229
 230 /*
 231  * Route cache.
 232  */
 233
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    while they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
 243
 244 struct rt_hash_bucket {
 245         struct rtable __rcu     *chain;
 246 };
 247
 248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 249         defined(CONFIG_PROVE_LOCKING)
 250 /*
 251  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 252  * The size of this table is a power of two and depends on the number of CPUS.
 253  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 254  */
 255 #ifdef CONFIG_LOCKDEP
 256 # define RT_HASH_LOCK_SZ        256
 257 #else
 258 # if NR_CPUS >= 32
 259 #  define RT_HASH_LOCK_SZ       4096
 260 # elif NR_CPUS >= 16
 261 #  define RT_HASH_LOCK_SZ       2048
 262 # elif NR_CPUS >= 8
 263 #  define RT_HASH_LOCK_SZ       1024
 264 # elif NR_CPUS >= 4
 265 #  define RT_HASH_LOCK_SZ       512
 266 # else
 267 #  define RT_HASH_LOCK_SZ       256
 268 # endif
 269 #endif
 270
 271 static spinlock_t       *rt_hash_locks;
 272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 273
 274 static __init void rt_hash_lock_init(void)
 275 {
 276         int i;
 277
 278         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 279                         GFP_KERNEL);
 280         if (!rt_hash_locks)
 281                 panic("IP: failed to allocate rt_hash_locks\n");
 282
 283         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 284                 spin_lock_init(&rt_hash_locks[i]);
 285 }
 286 #else
 287 # define rt_hash_lock_addr(slot) NULL
 288
 289 static inline void rt_hash_lock_init(void)
 290 {
 291 }
 292 #endif
 293
 294 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 295 static unsigned                 rt_hash_mask __read_mostly;
 296 static unsigned int             rt_hash_log  __read_mostly;
 297
 298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 300
 301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 302                                    int genid)
 303 {
 304         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 305                             idx, genid)
 306                 & rt_hash_mask;
 307 }
 308
 309 static inline int rt_genid(struct net *net)
 310 {
 311         return atomic_read(&net->ipv4.rt_genid);
 312 }
 313
 314 #ifdef CONFIG_PROC_FS
 315 struct rt_cache_iter_state {
 316         struct seq_net_private p;
 317         int bucket;
 318         int genid;
 319 };
 320
 321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 322 {
 323         struct rt_cache_iter_state *st = seq->private;
 324         struct rtable *r = NULL;
 325
 326         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 327                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 328                         continue;
 329                 rcu_read_lock_bh();
 330                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 331                 while (r) {
 332                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 333                             r->rt_genid == st->genid)
 334                                 return r;
 335                         r = rcu_dereference_bh(r->dst.rt_next);
 336                 }
 337                 rcu_read_unlock_bh();
 338         }
 339         return r;
 340 }
 341
 342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 343                                           struct rtable *r)
 344 {
 345         struct rt_cache_iter_state *st = seq->private;
 346
 347         r = rcu_dereference_bh(r->dst.rt_next);
 348         while (!r) {
 349                 rcu_read_unlock_bh();
 350                 do {
 351                         if (--st->bucket < 0)
 352                                 return NULL;
 353                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 354                 rcu_read_lock_bh();
 355                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 356         }
 357         return r;
 358 }
 359
 360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 361                                         struct rtable *r)
 362 {
 363         struct rt_cache_iter_state *st = seq->private;
 364         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 365                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 366                         continue;
 367                 if (r->rt_genid == st->genid)
 368                         break;
 369         }
 370         return r;
 371 }
 372
 373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 374 {
 375         struct rtable *r = rt_cache_get_first(seq);
 376
 377         if (r)
 378                 while (pos && (r = rt_cache_get_next(seq, r)))
 379                         --pos;
 380         return pos ? NULL : r;
 381 }
 382
 383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 384 {
 385         struct rt_cache_iter_state *st = seq->private;
 386         if (*pos)
 387                 return rt_cache_get_idx(seq, *pos - 1);
 388         st->genid = rt_genid(seq_file_net(seq));
 389         return SEQ_START_TOKEN;
 390 }
 391
 392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 393 {
 394         struct rtable *r;
 395
 396         if (v == SEQ_START_TOKEN)
 397                 r = rt_cache_get_first(seq);
 398         else
 399                 r = rt_cache_get_next(seq, v);
 400         ++*pos;
 401         return r;
 402 }
 403
 404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 405 {
 406         if (v && v != SEQ_START_TOKEN)
 407                 rcu_read_unlock_bh();
 408 }
 409
 410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 411 {
 412         if (v == SEQ_START_TOKEN)
 413                 seq_printf(seq, "%-127s\n",
 414                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 415                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 416                            "HHUptod\tSpecDst");
 417         else {
 418                 struct rtable *r = v;
 419                 struct neighbour *n;
 420                 int len;
 421
 422                 n = dst_get_neighbour(&r->dst);
 423                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 424                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 425                         r->dst.dev ? r->dst.dev->name : "*",
 426                         (__force u32)r->rt_dst,
 427                         (__force u32)r->rt_gateway,
 428                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 429                         r->dst.__use, 0, (__force u32)r->rt_src,
 430                         dst_metric_advmss(&r->dst) + 40,
 431                         dst_metric(&r->dst, RTAX_WINDOW),
 432                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 433                               dst_metric(&r->dst, RTAX_RTTVAR)),
 434                         r->rt_key_tos,
 435                         -1,
 436                         (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
 437                         r->rt_spec_dst, &len);
 438
 439                 seq_printf(seq, "%*s\n", 127 - len, "");
 440         }
 441         return 0;
 442 }
 443
 444 static const struct seq_operations rt_cache_seq_ops = {
 445         .start  = rt_cache_seq_start,
 446         .next   = rt_cache_seq_next,
 447         .stop   = rt_cache_seq_stop,
 448         .show   = rt_cache_seq_show,
 449 };
 450
 451 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 452 {
 453         return seq_open_net(inode, file, &rt_cache_seq_ops,
 454                         sizeof(struct rt_cache_iter_state));
 455 }
 456
 457 static const struct file_operations rt_cache_seq_fops = {
 458         .owner   = THIS_MODULE,
 459         .open    = rt_cache_seq_open,
 460         .read    = seq_read,
 461         .llseek  = seq_lseek,
 462         .release = seq_release_net,
 463 };
 464
 465
 466 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 467 {
 468         int cpu;
 469
 470         if (*pos == 0)
 471                 return SEQ_START_TOKEN;
 472
 473         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 474                 if (!cpu_possible(cpu))
 475                         continue;
 476                 *pos = cpu+1;
 477                 return &per_cpu(rt_cache_stat, cpu);
 478         }
 479         return NULL;
 480 }
 481
 482 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 483 {
 484         int cpu;
 485
 486         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 487                 if (!cpu_possible(cpu))
 488                         continue;
 489                 *pos = cpu+1;
 490                 return &per_cpu(rt_cache_stat, cpu);
 491         }
 492         return NULL;
 493
 494 }
 495
 496 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 497 {
 498
 499 }
 500
 501 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 502 {
 503         struct rt_cache_stat *st = v;
 504
 505         if (v == SEQ_START_TOKEN) {
 506                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 507                 return 0;
 508         }
 509
 510         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 511                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 512                    dst_entries_get_slow(&ipv4_dst_ops),
 513                    st->in_hit,
 514                    st->in_slow_tot,
 515                    st->in_slow_mc,
 516                    st->in_no_route,
 517                    st->in_brd,
 518                    st->in_martian_dst,
 519                    st->in_martian_src,
 520
 521                    st->out_hit,
 522                    st->out_slow_tot,
 523                    st->out_slow_mc,
 524
 525                    st->gc_total,
 526                    st->gc_ignored,
 527                    st->gc_goal_miss,
 528                    st->gc_dst_overflow,
 529                    st->in_hlist_search,
 530                    st->out_hlist_search
 531                 );
 532         return 0;
 533 }
 534
 535 static const struct seq_operations rt_cpu_seq_ops = {
 536         .start  = rt_cpu_seq_start,
 537         .next   = rt_cpu_seq_next,
 538         .stop   = rt_cpu_seq_stop,
 539         .show   = rt_cpu_seq_show,
 540 };
 541
 542
 543 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 544 {
 545         return seq_open(file, &rt_cpu_seq_ops);
 546 }
 547
 548 static const struct file_operations rt_cpu_seq_fops = {
 549         .owner   = THIS_MODULE,
 550         .open    = rt_cpu_seq_open,
 551         .read    = seq_read,
 552         .llseek  = seq_lseek,
 553         .release = seq_release,
 554 };
 555
 556 #ifdef CONFIG_IP_ROUTE_CLASSID
 557 static int rt_acct_proc_show(struct seq_file *m, void *v)
 558 {
 559         struct ip_rt_acct *dst, *src;
 560         unsigned int i, j;
 561
 562         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 563         if (!dst)
 564                 return -ENOMEM;
 565
 566         for_each_possible_cpu(i) {
 567                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 568                 for (j = 0; j < 256; j++) {
 569                         dst[j].o_bytes   += src[j].o_bytes;
 570                         dst[j].o_packets += src[j].o_packets;
 571                         dst[j].i_bytes   += src[j].i_bytes;
 572                         dst[j].i_packets += src[j].i_packets;
 573                 }
 574         }
 575
 576         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 577         kfree(dst);
 578         return 0;
 579 }
 580
 581 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 582 {
 583         return single_open(file, rt_acct_proc_show, NULL);
 584 }
 585
 586 static const struct file_operations rt_acct_proc_fops = {
 587         .owner          = THIS_MODULE,
 588         .open           = rt_acct_proc_open,
 589         .read           = seq_read,
 590         .llseek         = seq_lseek,
 591         .release        = single_release,
 592 };
 593 #endif
 594
 595 static int __net_init ip_rt_do_proc_init(struct net *net)
 596 {
 597         struct proc_dir_entry *pde;
 598
 599         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 600                         &rt_cache_seq_fops);
 601         if (!pde)
 602                 goto err1;
 603
 604         pde = proc_create("rt_cache", S_IRUGO,
 605                           net->proc_net_stat, &rt_cpu_seq_fops);
 606         if (!pde)
 607                 goto err2;
 608
 609 #ifdef CONFIG_IP_ROUTE_CLASSID
 610         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 611         if (!pde)
 612                 goto err3;
 613 #endif
 614         return 0;
 615
 616 #ifdef CONFIG_IP_ROUTE_CLASSID
 617 err3:
 618         remove_proc_entry("rt_cache", net->proc_net_stat);
 619 #endif
 620 err2:
 621         remove_proc_entry("rt_cache", net->proc_net);
 622 err1:
 623         return -ENOMEM;
 624 }
 625
 626 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 627 {
 628         remove_proc_entry("rt_cache", net->proc_net_stat);
 629         remove_proc_entry("rt_cache", net->proc_net);
 630 #ifdef CONFIG_IP_ROUTE_CLASSID
 631         remove_proc_entry("rt_acct", net->proc_net);
 632 #endif
 633 }
 634
 635 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 636         .init = ip_rt_do_proc_init,
 637         .exit = ip_rt_do_proc_exit,
 638 };
 639
 640 static int __init ip_rt_proc_init(void)
 641 {
 642         return register_pernet_subsys(&ip_rt_proc_ops);
 643 }
 644
 645 #else
 646 static inline int ip_rt_proc_init(void)
 647 {
 648         return 0;
 649 }
 650 #endif /* CONFIG_PROC_FS */
 651
 652 static inline void rt_free(struct rtable *rt)
 653 {
 654         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 655 }
 656
 657 static inline void rt_drop(struct rtable *rt)
 658 {
 659         ip_rt_put(rt);
 660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661 }
 662
 663 static inline int rt_fast_clean(struct rtable *rth)
 664 {
 665         /* Kill broadcast/multicast entries very aggresively, if they
 666            collide in hash table with more useful entries */
 667         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 668                 rt_is_input_route(rth) && rth->dst.rt_next;
 669 }
 670
 671 static inline int rt_valuable(struct rtable *rth)
 672 {
 673         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 674                 (rth->peer && rth->peer->pmtu_expires);
 675 }
 676
 677 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 678 {
 679         unsigned long age;
 680         int ret = 0;
 681
 682         if (atomic_read(&rth->dst.__refcnt))
 683                 goto out;
 684
 685         age = jiffies - rth->dst.lastuse;
 686         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 687             (age <= tmo2 && rt_valuable(rth)))
 688                 goto out;
 689         ret = 1;
 690 out:    return ret;
 691 }
 692
 693 /* Bits of score are:
 694  * 31: very valuable
 695  * 30: not quite useless
 696  * 29..0: usage counter
 697  */
 698 static inline u32 rt_score(struct rtable *rt)
 699 {
 700         u32 score = jiffies - rt->dst.lastuse;
 701
 702         score = ~score & ~(3<<30);
 703
 704         if (rt_valuable(rt))
 705                 score |= (1<<31);
 706
 707         if (rt_is_output_route(rt) ||
 708             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 709                 score |= (1<<30);
 710
 711         return score;
 712 }
 713
 714 static inline bool rt_caching(const struct net *net)
 715 {
 716         return net->ipv4.current_rt_cache_rebuild_count <=
 717                 net->ipv4.sysctl_rt_cache_rebuild_count;
 718 }
 719
 720 static inline bool compare_hash_inputs(const struct rtable *rt1,
 721                                        const struct rtable *rt2)
 722 {
 723         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 724                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 725                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 726 }
 727
 728 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 729 {
 730         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 731                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 732                 (rt1->rt_mark ^ rt2->rt_mark) |
 733                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 734                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 735                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 736 }
 737
 738 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 739 {
 740         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 741 }
 742
 743 static inline int rt_is_expired(struct rtable *rth)
 744 {
 745         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 746 }
 747
 748 /*
 749  * Perform a full scan of hash table and free all entries.
 750  * Can be called by a softirq or a process.
 751  * In the later case, we want to be reschedule if necessary
 752  */
 753 static void rt_do_flush(struct net *net, int process_context)
 754 {
 755         unsigned int i;
 756         struct rtable *rth, *next;
 757
 758         for (i = 0; i <= rt_hash_mask; i++) {
 759                 struct rtable __rcu **pprev;
 760                 struct rtable *list;
 761
 762                 if (process_context && need_resched())
 763                         cond_resched();
 764                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 765                 if (!rth)
 766                         continue;
 767
 768                 spin_lock_bh(rt_hash_lock_addr(i));
 769
 770                 list = NULL;
 771                 pprev = &rt_hash_table[i].chain;
 772                 rth = rcu_dereference_protected(*pprev,
 773                         lockdep_is_held(rt_hash_lock_addr(i)));
 774
 775                 while (rth) {
 776                         next = rcu_dereference_protected(rth->dst.rt_next,
 777                                 lockdep_is_held(rt_hash_lock_addr(i)));
 778
 779                         if (!net ||
 780                             net_eq(dev_net(rth->dst.dev), net)) {
 781                                 rcu_assign_pointer(*pprev, next);
 782                                 rcu_assign_pointer(rth->dst.rt_next, list);
 783                                 list = rth;
 784                         } else {
 785                                 pprev = &rth->dst.rt_next;
 786                         }
 787                         rth = next;
 788                 }
 789
 790                 spin_unlock_bh(rt_hash_lock_addr(i));
 791
 792                 for (; list; list = next) {
 793                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 794                         rt_free(list);
 795                 }
 796         }
 797 }
 798
 799 /*
 800  * While freeing expired entries, we compute average chain length
 801  * and standard deviation, using fixed-point arithmetic.
 802  * This to have an estimation of rt_chain_length_max
 803  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 804  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 805  */
 806
 807 #define FRACT_BITS 3
 808 #define ONE (1UL << FRACT_BITS)
 809
 810 /*
 811  * Given a hash chain and an item in this hash chain,
 812  * find if a previous entry has the same hash_inputs
 813  * (but differs on tos, mark or oif)
 814  * Returns 0 if an alias is found.
 815  * Returns ONE if rth has no alias before itself.
 816  */
 817 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 818 {
 819         const struct rtable *aux = head;
 820
 821         while (aux != rth) {
 822                 if (compare_hash_inputs(aux, rth))
 823                         return 0;
 824                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 825         }
 826         return ONE;
 827 }
 828
 829 /*
 830  * Perturbation of rt_genid by a small quantity [1..256]
 831  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 832  * many times (2^24) without giving recent rt_genid.
 833  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 834  */
 835 static void rt_cache_invalidate(struct net *net)
 836 {
 837         unsigned char shuffle;
 838
 839         get_random_bytes(&shuffle, sizeof(shuffle));
 840         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 841 }
 842
 843 /*
 844  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 845  * delay >= 0 : invalidate & flush cache (can be long)
 846  */
 847 void rt_cache_flush(struct net *net, int delay)
 848 {
 849         rt_cache_invalidate(net);
 850         if (delay >= 0)
 851                 rt_do_flush(net, !in_softirq());
 852 }
 853
 854 /* Flush previous cache invalidated entries from the cache */
 855 void rt_cache_flush_batch(struct net *net)
 856 {
 857         rt_do_flush(net, !in_softirq());
 858 }
 859
 860 static void rt_emergency_hash_rebuild(struct net *net)
 861 {
 862         if (net_ratelimit())
 863                 printk(KERN_WARNING "Route hash chain too long!\n");
 864         rt_cache_invalidate(net);
 865 }
 866
 867 /*
 868    Short description of GC goals.
 869
 870    We want an algorithm that keeps the routing cache
 871    at some equilibrium point, where the number of aged-off entries
 872    is kept approximately equal to the number of newly generated ones.
 873
 874    The current expiration strength is the variable "expire".
 875    We try to adjust it dynamically, so that if networking
 876    is idle, "expire" is large enough to keep enough warm entries,
 877    and when load increases it is reduced to limit the cache size.

        rt_garbage_collect() itself:

        - The "ops" argument is not used in the body; the function always
          works on the global ipv4_dst_ops.
        - Tuning state (expire, last_gc, rover, equilibrium) lives in
          function-static variables and therefore persists across calls.
        - Returns 0 normally.  Returns 1 after logging "dst cache overflow"
          when the scan missed its goal and the entry count is still at or
          above ip_rt_max_size.
 878  */
 879
 880 static int rt_garbage_collect(struct dst_ops *ops)
 881 {
 882         static unsigned long expire = RT_GC_TIMEOUT;
 883         static unsigned long last_gc;
 884         static int rover;
 885         static int equilibrium;
 886         struct rtable *rth;
 887         struct rtable __rcu **rthp;
 888         unsigned long now = jiffies;
 889         int goal;
 890         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 891
 892         /*
 893          * Garbage collection is pretty expensive,
 894          * do not make it too frequently.
 895          */
 896
 897         RT_CACHE_STAT_INC(gc_total);
 898
             /*
              * Skip this run only when the last accounted run was recent AND
              * the (fast) entry count is still below ip_rt_max_size.
              */
 899         if (now - last_gc < ip_rt_gc_min_interval &&
 900             entries < ip_rt_max_size) {
 901                 RT_CACHE_STAT_INC(gc_ignored);
 902                 goto out;
 903         }
 904
 905         entries = dst_entries_get_slow(&ipv4_dst_ops);
 906         /* Calculate number of entries, which we want to expire now. */
 907         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 908         if (goal <= 0) {
                     /*
                      * Within the elasticity budget: aim for "equilibrium",
                      * which is never allowed below gc_thresh.  If we are
                      * above it, raise equilibrium by half the excess (at
                      * most rt_hash_mask + 1) and recompute the goal.
                      */
 909                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 910                         equilibrium = ipv4_dst_ops.gc_thresh;
 911                 goal = entries - equilibrium;
 912                 if (goal > 0) {
 913                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 914                         goal = entries - equilibrium;
 915                 }
 916         } else {
 917                 /* We are in dangerous area. Try to reduce cache really
 918                  * aggressively.
 919                  */
 920                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 921                 equilibrium = entries - goal;
 922         }
 923
 924         if (now - last_gc >= ip_rt_gc_min_interval)
 925                 last_gc = now;
 926
             /* Nothing to expire: fold the (non-positive) goal into equilibrium. */
 927         if (goal <= 0) {
 928                 equilibrium += goal;
 929                 goto work_done;
 930         }
 931
 932         do {
 933                 int i, k;
 934
                     /*
                      * Visit up to rt_hash_mask + 1 buckets, starting with
                      * the one after "rover" (where the previous scan
                      * stopped) and wrapping around via the mask.
                      */
 935                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 936                         unsigned long tmo = expire;
 937
 938                         k = (k + 1) & rt_hash_mask;
 939                         rthp = &rt_hash_table[k].chain;
 940                         spin_lock_bh(rt_hash_lock_addr(k));
 941                         while ((rth = rcu_dereference_protected(*rthp,
 942                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 943                                 if (!rt_is_expired(rth) &&
 944                                         !rt_may_expire(rth, tmo, expire)) {
                                             /*
                                              * Entry is kept; the first timeout
                                              * passed to rt_may_expire() is
                                              * halved for the entries behind it
                                              * in this chain.
                                              */
 945                                         tmo >>= 1;
 946                                         rthp = &rth->dst.rt_next;
 947                                         continue;
 948                                 }
                                     /* Unlink and free; rthp stays on the same link. */
 949                                 *rthp = rth->dst.rt_next;
 950                                 rt_free(rth);
 951                                 goal--;
 952                         }
 953                         spin_unlock_bh(rt_hash_lock_addr(k));
 954                         if (goal <= 0)
 955                                 break;
 956                 }
 957                 rover = k;
 958
 959                 if (goal <= 0)
 960                         goto work_done;
 961
 962                 /* Goal is not achieved. We stop process if:
 963
 964                    - if expire reduced to zero. Otherwise, expire is halved.
 965                    - if table is not full.
 966                    - if we are called from interrupt.
 967                    - jiffies check is just fallback/debug loop breaker.
 968                      We will not spin here for long time in any case.
 969                  */
 970
 971                 RT_CACHE_STAT_INC(gc_goal_miss);
 972
 973                 if (expire == 0)
 974                         break;
 975
 976                 expire >>= 1;
 977
 978                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 979                         goto out;
 980         } while (!in_softirq() && time_before_eq(jiffies, now));
 981
             /*
              * Goal missed.  Report overflow only if both the fast and the
              * slow entry counts are at or above ip_rt_max_size.
              */
 982         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 983                 goto out;
 984         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 985                 goto out;
 986         if (net_ratelimit())
 987                 printk(KERN_WARNING "dst cache overflow\n");
 988         RT_CACHE_STAT_INC(gc_dst_overflow);
 989         return 1;
 990
 991 work_done:
             /*
              * Goal reached: relax "expire" by ip_rt_gc_min_interval, and
              * reset it to ip_rt_gc_timeout when it exceeds that value or
              * when the entry count is below gc_thresh.
              */
 992         expire += ip_rt_gc_min_interval;
 993         if (expire > ip_rt_gc_timeout ||
 994             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
 995             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 996                 expire = ip_rt_gc_timeout;
 997 out:    return 0;
 998 }
 999
1000 /*
1001  * Returns number of entries in a hash chain that have different hash_inputs
1002  */
1003 static int slow_chain_length(const struct rtable *head)
1004 {
1005         int length = 0;
1006         const struct rtable *rth = head;
1007
1008         while (rth) {
1009                 length += has_noalias(head, rth);
1010                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1011         }
1012         return length >> FRACT_BITS;
1013 }
1014
1015 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1016 {
1017         struct neigh_table *tbl = &arp_tbl;
1018         static const __be32 inaddr_any = 0;
1019         struct net_device *dev = dst->dev;
1020         const __be32 *pkey = daddr;
1021         struct neighbour *n;
1022
1023 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1024         if (dev->type == ARPHRD_ATM)
1025                 tbl = clip_tbl_hook;
1026 #endif
1027         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1028                 pkey = &inaddr_any;
1029
1030         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1031         if (n)
1032                 return n;
1033         return neigh_create(tbl, pkey, dev);
1034 }
1035
1036 static int rt_bind_neighbour(struct rtable *rt)
1037 {
1038         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1039         if (IS_ERR(n))
1040                 return PTR_ERR(n);
1041         dst_set_neighbour(&rt->dst, n);
1042
1043         return 0;
1044 }
1045
/*
 * rt_intern_hash - insert a freshly built route into the routing cache.
 *
 * @hash:    bucket index for @rt (recomputed here if the cache generation
 *           changes during an emergency rebuild)
 * @rt:      the new route; the caller's reference is consumed on the
 *           error paths and when an equivalent cached entry is found
 * @skb:     optional packet; if non-NULL the returned route is attached
 *           to it with skb_dst_set()
 * @ifindex: interface index used when @hash has to be recomputed
 *
 * Returns the route to use: an already cached entry with the same keys and
 * netns if one exists, otherwise @rt itself.  Returns ERR_PTR() when binding
 * the neighbour fails.
 */
1046 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1047                                      struct sk_buff *skb, int ifindex)
1048 {
1049         struct rtable   *rth, *cand;
1050         struct rtable __rcu **rthp, **candp;
1051         unsigned long   now;
1052         u32             min_score;
1053         int             chain_length;
             /* One garbage-collection retry outside softirq, none inside. */
1054         int attempts = !in_softirq();
1055
1056 restart:
1057         chain_length = 0;
1058         min_score = ~(u32)0;
1059         cand = NULL;
1060         candp = NULL;
1061         now = jiffies;
1062
1063         if (!rt_caching(dev_net(rt->dst.dev))) {
1064                 /*
1065                  * If we're not caching, just tell the caller we
1066                  * were successful and don't touch the route.  The
1067                  * caller hold the sole reference to the cache entry, and
1068                  * it will be released when the caller is done with it.
1069                  * If we drop it here, the callers have no way to resolve routes
1070                  * when we're not caching.  Instead, just point *rp at rt, so
1071                  * the caller gets a single use out of the route
1072                  * Note that we do rt_free on this new route entry, so that
1073                  * once its refcount hits zero, we are still able to reap it
1074                  * (Thanks Alexey)
1075                  * Note: To avoid expensive rcu stuff for this uncached dst,
1076                  * we set DST_NOCACHE so that dst_release() can free dst without
1077                  * waiting a grace period.
1078                  */
1079
1080                 rt->dst.flags |= DST_NOCACHE;
1081                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1082                         int err = rt_bind_neighbour(rt);
1083                         if (err) {
1084                                 if (net_ratelimit())
1085                                         printk(KERN_WARNING
1086                                             "Neighbour table failure & not caching routes.\n");
1087                                 ip_rt_put(rt);
1088                                 return ERR_PTR(err);
1089                         }
1090                 }
1091
1092                 goto skip_hashing;
1093         }
1094
1095         rthp = &rt_hash_table[hash].chain;
1096
             /*
              * Walk the bucket under its lock: reap expired entries, look
              * for an entry equivalent to rt, and remember an eviction
              * candidate among the unreferenced entries.
              */
1097         spin_lock_bh(rt_hash_lock_addr(hash));
1098         while ((rth = rcu_dereference_protected(*rthp,
1099                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1100                 if (rt_is_expired(rth)) {
1101                         *rthp = rth->dst.rt_next;
1102                         rt_free(rth);
1103                         continue;
1104                 }
1105                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1106                         /* Put it first */
1107                         *rthp = rth->dst.rt_next;
1108                         /*
1109                          * Since lookup is lockfree, the deletion
1110                          * must be visible to another weakly ordered CPU before
1111                          * the insertion at the start of the hash chain.
1112                          */
1113                         rcu_assign_pointer(rth->dst.rt_next,
1114                                            rt_hash_table[hash].chain);
1115                         /*
1116                          * Since lookup is lockfree, the update writes
1117                          * must be ordered for consistency on SMP.
1118                          */
1119                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1120
1121                         dst_use(&rth->dst, now);
1122                         spin_unlock_bh(rt_hash_lock_addr(hash));
1123
                             /* The cached entry wins; the new route is dropped. */
1124                         rt_drop(rt);
1125                         if (skb)
1126                                 skb_dst_set(skb, &rth->dst);
1127                         return rth;
1128                 }
1129
                     /*
                      * Only unreferenced entries may become the eviction
                      * candidate; "<=" lets a later entry win a score tie.
                      */
1130                 if (!atomic_read(&rth->dst.__refcnt)) {
1131                         u32 score = rt_score(rth);
1132
1133                         if (score <= min_score) {
1134                                 cand = rth;
1135                                 candp = rthp;
1136                                 min_score = score;
1137                         }
1138                 }
1139
1140                 chain_length++;
1141
1142                 rthp = &rth->dst.rt_next;
1143         }
1144
1145         if (cand) {
1146                 /* ip_rt_gc_elasticity used to be average length of chain
1147                  * length, when exceeded gc becomes really aggressive.
1148                  *
1149                  * The second limit is less certain. At the moment it allows
1150                  * only 2 entries per bucket. We will see.
1151                  */
1152                 if (chain_length > ip_rt_gc_elasticity) {
1153                         *candp = cand->dst.rt_next;
1154                         rt_free(cand);
1155                 }
1156         } else {
                     /*
                      * No evictable entry.  If the chain is too long even
                      * when aliases are not counted, invalidate the cache,
                      * recompute the bucket for the new generation id and
                      * start over.
                      */
1157                 if (chain_length > rt_chain_length_max &&
1158                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1159                         struct net *net = dev_net(rt->dst.dev);
1160                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1161                         if (!rt_caching(net)) {
1162                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1163                                         rt->dst.dev->name, num);
1164                         }
1165                         rt_emergency_hash_rebuild(net);
1166                         spin_unlock_bh(rt_hash_lock_addr(hash));
1167
1168                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1169                                         ifindex, rt_genid(net));
1170                         goto restart;
1171                 }
1172         }
1173
1174         /* Try to bind route to arp only if it is output
1175            route or unicast forwarding path.
1176          */
1177         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                 int err = rt_bind_neighbour(rt);
1179                 if (err) {
1180                         spin_unlock_bh(rt_hash_lock_addr(hash));
1181
1182                         if (err != -ENOBUFS) {
1183                                 rt_drop(rt);
1184                                 return ERR_PTR(err);
1185                         }
1186
1187                         /* Neighbour tables are full and nothing
1188                            can be released. Try to shrink route cache,
1189                            it is most likely it holds some neighbour records.
1190                          */
1191                         if (attempts-- > 0) {
                                     /*
                                      * Temporarily tighten the global GC
                                      * tunables for this one collection
                                      * run, then restore them.
                                      */
1192                                 int saved_elasticity = ip_rt_gc_elasticity;
1193                                 int saved_int = ip_rt_gc_min_interval;
1194                                 ip_rt_gc_elasticity     = 1;
1195                                 ip_rt_gc_min_interval   = 0;
1196                                 rt_garbage_collect(&ipv4_dst_ops);
1197                                 ip_rt_gc_min_interval   = saved_int;
1198                                 ip_rt_gc_elasticity     = saved_elasticity;
1199                                 goto restart;
1200                         }
1201
1202                         if (net_ratelimit())
1203                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1204                         rt_drop(rt);
1205                         return ERR_PTR(-ENOBUFS);
1206                 }
1207         }
1208
             /* Link rt in front of the current chain head. */
1209         rt->dst.rt_next = rt_hash_table[hash].chain;
1210
1211         /*
1212          * Since lookup is lockfree, we must make sure
1213          * previous writes to rt are committed to memory
1214          * before making rt visible to other CPUS.
1215          */
1216         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1217
1218         spin_unlock_bh(rt_hash_lock_addr(hash));
1219
1220 skip_hashing:
1221         if (skb)
1222                 skb_dst_set(skb, &rt->dst);
1223         return rt;
1224 }
1225
1226 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1227
1228 static u32 rt_peer_genid(void)
1229 {
1230         return atomic_read(&__rt_peer_genid);
1231 }
1232
1233 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1234 {
1235         struct inet_peer *peer;
1236
1237         peer = inet_getpeer_v4(daddr, create);
1238
1239         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1240                 inet_putpeer(peer);
1241         else
1242                 rt->rt_peer_genid = rt_peer_genid();
1243 }
1244
1245 /*
1246  * Peer allocation may fail only in serious out-of-memory conditions.  However
1247  * we still can generate some output.
1248  * Random ID selection looks a bit dangerous because we have no chances to
1249  * select ID being unique in a reasonable period of time.
1250  * But broken packet identifier may be better than no packet at all.
1251  */
1252 static void ip_select_fb_ident(struct iphdr *iph)
1253 {
1254         static DEFINE_SPINLOCK(ip_fb_id_lock);
1255         static u32 ip_fallback_id;
1256         u32 salt;
1257
1258         spin_lock_bh(&ip_fb_id_lock);
1259         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1260         iph->id = htons(salt & 0xFFFF);
1261         ip_fallback_id = salt;
1262         spin_unlock_bh(&ip_fb_id_lock);
1263 }
1264
1265 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1266 {
1267         struct rtable *rt = (struct rtable *) dst;
1268
1269         if (rt) {
1270                 if (rt->peer == NULL)
1271                         rt_bind_peer(rt, rt->rt_dst, 1);
1272
1273                 /* If peer is attached to destination, it is never detached,
1274                    so that we need not to grab a lock to dereference it.
1275                  */
1276                 if (rt->peer) {
1277                         iph->id = htons(inet_getid(rt->peer, more));
1278                         return;
1279                 }
1280         } else
1281                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1282                        __builtin_return_address(0));
1283
1284         ip_select_fb_ident(iph);
1285 }
1286 EXPORT_SYMBOL(__ip_select_ident);
1287
1288 static void rt_del(unsigned hash, struct rtable *rt)
1289 {
1290         struct rtable __rcu **rthp;
1291         struct rtable *aux;
1292
1293         rthp = &rt_hash_table[hash].chain;
1294         spin_lock_bh(rt_hash_lock_addr(hash));
1295         ip_rt_put(rt);
1296         while ((aux = rcu_dereference_protected(*rthp,
1297                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1298                 if (aux == rt || rt_is_expired(aux)) {
1299                         *rthp = aux->dst.rt_next;
1300                         rt_free(aux);
1301                         continue;
1302                 }
1303                 rthp = &aux->dst.rt_next;
1304         }
1305         spin_unlock_bh(rt_hash_lock_addr(hash));
1306 }
1307
1308 /* called in rcu_read_lock() section */
1309 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1310                     __be32 saddr, struct net_device *dev)
1311 {
1312         struct in_device *in_dev = __in_dev_get_rcu(dev);
1313         struct inet_peer *peer;
1314         struct net *net;
1315
1316         if (!in_dev)
1317                 return;
1318
1319         net = dev_net(dev);
1320         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1321             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1322             ipv4_is_zeronet(new_gw))
1323                 goto reject_redirect;
1324
1325         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1326                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1327                         goto reject_redirect;
1328                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1329                         goto reject_redirect;
1330         } else {
1331                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1332                         goto reject_redirect;
1333         }
1334
1335         peer = inet_getpeer_v4(daddr, 1);
1336         if (peer) {
1337                 peer->redirect_learned.a4 = new_gw;
1338
1339                 inet_putpeer(peer);
1340
1341                 atomic_inc(&__rt_peer_genid);
1342         }
1343         return;
1344
1345 reject_redirect:
1346 #ifdef CONFIG_IP_ROUTE_VERBOSE
1347         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1348                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1349                         "  Advised path = %pI4 -> %pI4\n",
1350                        &old_gw, dev->name, &new_gw,
1351                        &saddr, &daddr);
1352 #endif
1353         ;
1354 }
1355
1356 static bool peer_pmtu_expired(struct inet_peer *peer)
1357 {
1358         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1359
1360         return orig &&
1361                time_after_eq(jiffies, orig) &&
1362                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1363 }
1364
1365 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1366 {
1367         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1368
1369         return orig &&
1370                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1371 }
1372
1373 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1374 {
1375         struct rtable *rt = (struct rtable *)dst;
1376         struct dst_entry *ret = dst;
1377
1378         if (rt) {
1379                 if (dst->obsolete > 0) {
1380                         ip_rt_put(rt);
1381                         ret = NULL;
1382                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1383                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1384                                                 rt->rt_oif,
1385                                                 rt_genid(dev_net(dst->dev)));
1386                         rt_del(hash, rt);
1387                         ret = NULL;
1388                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1389                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1390                 }
1391         }
1392         return ret;
1393 }
1394
1395 /*
1396  * Algorithm:
1397  *      1. The first ip_rt_redirect_number redirects are sent
1398  *         with exponential backoff, then we stop sending them at all,
1399  *         assuming that the host ignores our redirects.
1400  *      2. If we did not see packets requiring redirects
1401  *         during ip_rt_redirect_silence, we assume that the host
1402  *         forgot redirected route and start to send redirects again.
1403  *
1404  * This algorithm is much cheaper and more intelligent than dumb load limiting
1405  * in icmp.c.
1406  *
1407  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1408  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1409  */
1410
1411 void ip_rt_send_redirect(struct sk_buff *skb)
1412 {
1413         struct rtable *rt = skb_rtable(skb);
1414         struct in_device *in_dev;
1415         struct inet_peer *peer;
1416         int log_martians;
1417
1418         rcu_read_lock();
1419         in_dev = __in_dev_get_rcu(rt->dst.dev);
1420         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1421                 rcu_read_unlock();
1422                 return;
1423         }
1424         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1425         rcu_read_unlock();
1426
1427         if (!rt->peer)
1428                 rt_bind_peer(rt, rt->rt_dst, 1);
1429         peer = rt->peer;
1430         if (!peer) {
1431                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1432                 return;
1433         }
1434
1435         /* No redirected packets during ip_rt_redirect_silence;
1436          * reset the algorithm.
1437          */
1438         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1439                 peer->rate_tokens = 0;
1440
1441         /* Too many ignored redirects; do not send anything
1442          * set dst.rate_last to the last seen redirected packet.
1443          */
1444         if (peer->rate_tokens >= ip_rt_redirect_number) {
1445                 peer->rate_last = jiffies;
1446                 return;
1447         }
1448
1449         /* Check for load limit; set rate_last to the latest sent
1450          * redirect.
1451          */
1452         if (peer->rate_tokens == 0 ||
1453             time_after(jiffies,
1454                        (peer->rate_last +
1455                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1456                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1457                 peer->rate_last = jiffies;
1458                 ++peer->rate_tokens;
1459 #ifdef CONFIG_IP_ROUTE_VERBOSE
1460                 if (log_martians &&
1461                     peer->rate_tokens == ip_rt_redirect_number &&
1462                     net_ratelimit())
1463                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1464                                &ip_hdr(skb)->saddr, rt->rt_iif,
1465                                 &rt->rt_dst, &rt->rt_gateway);
1466 #endif
1467         }
1468 }
1469
1470 static int ip_error(struct sk_buff *skb)
1471 {
1472         struct rtable *rt = skb_rtable(skb);
1473         struct inet_peer *peer;
1474         unsigned long now;
1475         bool send;
1476         int code;
1477
1478         switch (rt->dst.error) {
1479         case EINVAL:
1480         default:
1481                 goto out;
1482         case EHOSTUNREACH:
1483                 code = ICMP_HOST_UNREACH;
1484                 break;
1485         case ENETUNREACH:
1486                 code = ICMP_NET_UNREACH;
1487                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1488                                 IPSTATS_MIB_INNOROUTES);
1489                 break;
1490         case EACCES:
1491                 code = ICMP_PKT_FILTERED;
1492                 break;
1493         }
1494
1495         if (!rt->peer)
1496                 rt_bind_peer(rt, rt->rt_dst, 1);
1497         peer = rt->peer;
1498
1499         send = true;
1500         if (peer) {
1501                 now = jiffies;
1502                 peer->rate_tokens += now - peer->rate_last;
1503                 if (peer->rate_tokens > ip_rt_error_burst)
1504                         peer->rate_tokens = ip_rt_error_burst;
1505                 peer->rate_last = now;
1506                 if (peer->rate_tokens >= ip_rt_error_cost)
1507                         peer->rate_tokens -= ip_rt_error_cost;
1508                 else
1509                         send = false;
1510         }
1511         if (send)
1512                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1513
1514 out:    kfree_skb(skb);
1515         return 0;
1516 }
1517
/*
 *      Table of MTU plateaus, in descending order.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Guess the next lower MTU for a packet of size @old_mtu: the largest
 * plateau that is strictly smaller than @old_mtu, or 68 when @old_mtu is
 * not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

        while (plateau < end) {
                if (*plateau < old_mtu)
                        return *plateau;
                plateau++;
        }

        return 68;
}
1535
1536 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1537                                  unsigned short new_mtu,
1538                                  struct net_device *dev)
1539 {
1540         unsigned short old_mtu = ntohs(iph->tot_len);
1541         unsigned short est_mtu = 0;
1542         struct inet_peer *peer;
1543
1544         peer = inet_getpeer_v4(iph->daddr, 1);
1545         if (peer) {
1546                 unsigned short mtu = new_mtu;
1547
1548                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1549                         /* BSD 4.2 derived systems incorrectly adjust
1550                          * tot_len by the IP header length, and report
1551                          * a zero MTU in the ICMP message.
1552                          */
1553                         if (mtu == 0 &&
1554                             old_mtu >= 68 + (iph->ihl << 2))
1555                                 old_mtu -= iph->ihl << 2;
1556                         mtu = guess_mtu(old_mtu);
1557                 }
1558
1559                 if (mtu < ip_rt_min_pmtu)
1560                         mtu = ip_rt_min_pmtu;
1561                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1562                         unsigned long pmtu_expires;
1563
1564                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1565                         if (!pmtu_expires)
1566                                 pmtu_expires = 1UL;
1567
1568                         est_mtu = mtu;
1569                         peer->pmtu_learned = mtu;
1570                         peer->pmtu_expires = pmtu_expires;
1571                 }
1572
1573                 inet_putpeer(peer);
1574
1575                 atomic_inc(&__rt_peer_genid);
1576         }
1577         return est_mtu ? : new_mtu;
1578 }
1579
1580 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1581 {
1582         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1583
1584         if (!expires)
1585                 return;
1586         if (time_before(jiffies, expires)) {
1587                 u32 orig_dst_mtu = dst_mtu(dst);
1588                 if (peer->pmtu_learned < orig_dst_mtu) {
1589                         if (!peer->pmtu_orig)
1590                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1591                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1592                 }
1593         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1594                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1595 }
1596
1597 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1598 {
1599         struct rtable *rt = (struct rtable *) dst;
1600         struct inet_peer *peer;
1601
1602         dst_confirm(dst);
1603
1604         if (!rt->peer)
1605                 rt_bind_peer(rt, rt->rt_dst, 1);
1606         peer = rt->peer;
1607         if (peer) {
1608                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1609
1610                 if (mtu < ip_rt_min_pmtu)
1611                         mtu = ip_rt_min_pmtu;
1612                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1613
1614                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1615                         if (!pmtu_expires)
1616                                 pmtu_expires = 1UL;
1617
1618                         peer->pmtu_learned = mtu;
1619                         peer->pmtu_expires = pmtu_expires;
1620
1621                         atomic_inc(&__rt_peer_genid);
1622                         rt->rt_peer_genid = rt_peer_genid();
1623                 }
1624                 check_peer_pmtu(dst, peer);
1625         }
1626 }
1627
1628 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1629 {
1630         struct rtable *rt = (struct rtable *) dst;
1631         __be32 orig_gw = rt->rt_gateway;
1632         struct neighbour *n, *old_n;
1633
1634         dst_confirm(&rt->dst);
1635
1636         rt->rt_gateway = peer->redirect_learned.a4;
1637
1638         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1639         if (IS_ERR(n))
1640                 return PTR_ERR(n);
1641         old_n = xchg(&rt->dst._neighbour, n);
1642         if (old_n)
1643                 neigh_release(old_n);
1644         if (!n || !(n->nud_state & NUD_VALID)) {
1645                 if (n)
1646                         neigh_event_send(n, NULL);
1647                 rt->rt_gateway = orig_gw;
1648                 return -EAGAIN;
1649         } else {
1650                 rt->rt_flags |= RTCF_REDIRECTED;
1651                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1652         }
1653         return 0;
1654 }
1655
1656 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1657 {
1658         struct rtable *rt = (struct rtable *) dst;
1659
1660         if (rt_is_expired(rt))
1661                 return NULL;
1662         if (rt->rt_peer_genid != rt_peer_genid()) {
1663                 struct inet_peer *peer;
1664
1665                 if (!rt->peer)
1666                         rt_bind_peer(rt, rt->rt_dst, 0);
1667
1668                 peer = rt->peer;
1669                 if (peer) {
1670                         check_peer_pmtu(dst, peer);
1671
1672                         if (peer->redirect_learned.a4 &&
1673                             peer->redirect_learned.a4 != rt->rt_gateway) {
1674                                 if (check_peer_redir(dst, peer))
1675                                         return NULL;
1676                         }
1677                 }
1678
1679                 rt->rt_peer_genid = rt_peer_genid();
1680         }
1681         return dst;
1682 }
1683
1684 static void ipv4_dst_destroy(struct dst_entry *dst)
1685 {
1686         struct rtable *rt = (struct rtable *) dst;
1687         struct inet_peer *peer = rt->peer;
1688
1689         if (rt->fi) {
1690                 fib_info_put(rt->fi);
1691                 rt->fi = NULL;
1692         }
1693         if (peer) {
1694                 rt->peer = NULL;
1695                 inet_putpeer(peer);
1696         }
1697 }
1698
1699
1700 static void ipv4_link_failure(struct sk_buff *skb)
1701 {
1702         struct rtable *rt;
1703
1704         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1705
1706         rt = skb_rtable(skb);
1707         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1708                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1709 }
1710
1711 static int ip_rt_bug(struct sk_buff *skb)
1712 {
1713         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1714                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1715                 skb->dev ? skb->dev->name : "?");
1716         kfree_skb(skb);
1717         WARN_ON(1);
1718         return 0;
1719 }
1720
1721 /*
1722    We do not cache source address of outgoing interface,
1723    because it is used only by IP RR, TS and SRR options,
1724    so that it out of fast path.
1725
1726    BTW remember: "addr" is allowed to be not aligned
1727    in IP options!
1728  */
1729
1730 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1731 {
1732         __be32 src;
1733
1734         if (rt_is_output_route(rt))
1735                 src = ip_hdr(skb)->saddr;
1736         else {
1737                 struct fib_result res;
1738                 struct flowi4 fl4;
1739                 struct iphdr *iph;
1740
1741                 iph = ip_hdr(skb);
1742
1743                 memset(&fl4, 0, sizeof(fl4));
1744                 fl4.daddr = iph->daddr;
1745                 fl4.saddr = iph->saddr;
1746                 fl4.flowi4_tos = RT_TOS(iph->tos);
1747                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1748                 fl4.flowi4_iif = skb->dev->ifindex;
1749                 fl4.flowi4_mark = skb->mark;
1750
1751                 rcu_read_lock();
1752                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1753                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1754                 else
1755                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1756                                         RT_SCOPE_UNIVERSE);
1757                 rcu_read_unlock();
1758         }
1759         memcpy(addr, &src, 4);
1760 }
1761
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in the halves of the route's tclassid that are still zero from
 * @tag.  The low and high 16 bits are handled independently; a half
 * that is already non-zero is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 classid = rt->dst.tclassid;

	if ((classid & 0xFFFF) == 0)
		classid |= tag & 0xFFFF;
	if ((classid & 0xFFFF0000) == 0)
		classid |= tag & 0xFFFF0000;

	rt->dst.tclassid = classid;
}
#endif
1771
1772 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1773 {
1774         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1775
1776         if (advmss == 0) {
1777                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1778                                ip_rt_min_advmss);
1779                 if (advmss > 65535 - 40)
1780                         advmss = 65535 - 40;
1781         }
1782         return advmss;
1783 }
1784
1785 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1786 {
1787         unsigned int mtu = dst->dev->mtu;
1788
1789         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1790                 const struct rtable *rt = (const struct rtable *) dst;
1791
1792                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1793                         mtu = 576;
1794         }
1795
1796         if (mtu > IP_MAX_MTU)
1797                 mtu = IP_MAX_MTU;
1798
1799         return mtu;
1800 }
1801
1802 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1803                             struct fib_info *fi)
1804 {
1805         struct inet_peer *peer;
1806         int create = 0;
1807
1808         /* If a peer entry exists for this destination, we must hook
1809          * it up in order to get at cached metrics.
1810          */
1811         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1812                 create = 1;
1813
1814         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1815         if (peer) {
1816                 rt->rt_peer_genid = rt_peer_genid();
1817                 if (inet_metrics_new(peer))
1818                         memcpy(peer->metrics, fi->fib_metrics,
1819                                sizeof(u32) * RTAX_MAX);
1820                 dst_init_metrics(&rt->dst, peer->metrics, false);
1821
1822                 check_peer_pmtu(&rt->dst, peer);
1823                 if (peer->redirect_learned.a4 &&
1824                     peer->redirect_learned.a4 != rt->rt_gateway) {
1825                         rt->rt_gateway = peer->redirect_learned.a4;
1826                         rt->rt_flags |= RTCF_REDIRECTED;
1827                 }
1828         } else {
1829                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1830                         rt->fi = fi;
1831                         atomic_inc(&fi->fib_clntref);
1832                 }
1833                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1834         }
1835 }
1836
1837 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1838                            const struct fib_result *res,
1839                            struct fib_info *fi, u16 type, u32 itag)
1840 {
1841         struct dst_entry *dst = &rt->dst;
1842
1843         if (fi) {
1844                 if (FIB_RES_GW(*res) &&
1845                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1846                         rt->rt_gateway = FIB_RES_GW(*res);
1847                 rt_init_metrics(rt, fl4, fi);
1848 #ifdef CONFIG_IP_ROUTE_CLASSID
1849                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1850 #endif
1851         }
1852
1853         if (dst_mtu(dst) > IP_MAX_MTU)
1854                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1855         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1856                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1857
1858 #ifdef CONFIG_IP_ROUTE_CLASSID
1859 #ifdef CONFIG_IP_MULTIPLE_TABLES
1860         set_class_tag(rt, fib_rules_tclass(res));
1861 #endif
1862         set_class_tag(rt, itag);
1863 #endif
1864 }
1865
1866 static struct rtable *rt_dst_alloc(struct net_device *dev,
1867                                    bool nopolicy, bool noxfrm)
1868 {
1869         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1870                          DST_HOST |
1871                          (nopolicy ? DST_NOPOLICY : 0) |
1872                          (noxfrm ? DST_NOXFRM : 0));
1873 }
1874
1875 /* called in rcu_read_lock() section */
1876 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1877                                 u8 tos, struct net_device *dev, int our)
1878 {
1879         unsigned int hash;
1880         struct rtable *rth;
1881         __be32 spec_dst;
1882         struct in_device *in_dev = __in_dev_get_rcu(dev);
1883         u32 itag = 0;
1884         int err;
1885
1886         /* Primary sanity checks. */
1887
1888         if (in_dev == NULL)
1889                 return -EINVAL;
1890
1891         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1892             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1893                 goto e_inval;
1894
1895         if (ipv4_is_zeronet(saddr)) {
1896                 if (!ipv4_is_local_multicast(daddr))
1897                         goto e_inval;
1898                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1899         } else {
1900                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1901                                           &itag);
1902                 if (err < 0)
1903                         goto e_err;
1904         }
1905         rth = rt_dst_alloc(init_net.loopback_dev,
1906                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1907         if (!rth)
1908                 goto e_nobufs;
1909
1910 #ifdef CONFIG_IP_ROUTE_CLASSID
1911         rth->dst.tclassid = itag;
1912 #endif
1913         rth->dst.output = ip_rt_bug;
1914
1915         rth->rt_key_dst = daddr;
1916         rth->rt_key_src = saddr;
1917         rth->rt_genid   = rt_genid(dev_net(dev));
1918         rth->rt_flags   = RTCF_MULTICAST;
1919         rth->rt_type    = RTN_MULTICAST;
1920         rth->rt_key_tos = tos;
1921         rth->rt_dst     = daddr;
1922         rth->rt_src     = saddr;
1923         rth->rt_route_iif = dev->ifindex;
1924         rth->rt_iif     = dev->ifindex;
1925         rth->rt_oif     = 0;
1926         rth->rt_mark    = skb->mark;
1927         rth->rt_gateway = daddr;
1928         rth->rt_spec_dst= spec_dst;
1929         rth->rt_peer_genid = 0;
1930         rth->peer = NULL;
1931         rth->fi = NULL;
1932         if (our) {
1933                 rth->dst.input= ip_local_deliver;
1934                 rth->rt_flags |= RTCF_LOCAL;
1935         }
1936
1937 #ifdef CONFIG_IP_MROUTE
1938         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1939                 rth->dst.input = ip_mr_input;
1940 #endif
1941         RT_CACHE_STAT_INC(in_slow_mc);
1942
1943         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1944         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1945         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1946
1947 e_nobufs:
1948         return -ENOBUFS;
1949 e_inval:
1950         return -EINVAL;
1951 e_err:
1952         return err;
1953 }
1954
1955
1956 static void ip_handle_martian_source(struct net_device *dev,
1957                                      struct in_device *in_dev,
1958                                      struct sk_buff *skb,
1959                                      __be32 daddr,
1960                                      __be32 saddr)
1961 {
1962         RT_CACHE_STAT_INC(in_martian_src);
1963 #ifdef CONFIG_IP_ROUTE_VERBOSE
1964         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1965                 /*
1966                  *      RFC1812 recommendation, if source is martian,
1967                  *      the only hint is MAC header.
1968                  */
1969                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1970                         &daddr, &saddr, dev->name);
1971                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1972                         int i;
1973                         const unsigned char *p = skb_mac_header(skb);
1974                         printk(KERN_WARNING "ll header: ");
1975                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1976                                 printk("%02x", *p);
1977                                 if (i < (dev->hard_header_len - 1))
1978                                         printk(":");
1979                         }
1980                         printk("\n");
1981                 }
1982         }
1983 #endif
1984 }
1985
1986 /* called in rcu_read_lock() section */
1987 static int __mkroute_input(struct sk_buff *skb,
1988                            const struct fib_result *res,
1989                            struct in_device *in_dev,
1990                            __be32 daddr, __be32 saddr, u32 tos,
1991                            struct rtable **result)
1992 {
1993         struct rtable *rth;
1994         int err;
1995         struct in_device *out_dev;
1996         unsigned int flags = 0;
1997         __be32 spec_dst;
1998         u32 itag;
1999
2000         /* get a working reference to the output device */
2001         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2002         if (out_dev == NULL) {
2003                 if (net_ratelimit())
2004                         printk(KERN_CRIT "Bug in ip_route_input" \
2005                                "_slow(). Please, report\n");
2006                 return -EINVAL;
2007         }
2008
2009
2010         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2011                                   in_dev->dev, &spec_dst, &itag);
2012         if (err < 0) {
2013                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2014                                          saddr);
2015
2016                 goto cleanup;
2017         }
2018
2019         if (err)
2020                 flags |= RTCF_DIRECTSRC;
2021
2022         if (out_dev == in_dev && err &&
2023             (IN_DEV_SHARED_MEDIA(out_dev) ||
2024              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2025                 flags |= RTCF_DOREDIRECT;
2026
2027         if (skb->protocol != htons(ETH_P_IP)) {
2028                 /* Not IP (i.e. ARP). Do not create route, if it is
2029                  * invalid for proxy arp. DNAT routes are always valid.
2030                  *
2031                  * Proxy arp feature have been extended to allow, ARP
2032                  * replies back to the same interface, to support
2033                  * Private VLAN switch technologies. See arp.c.
2034                  */
2035                 if (out_dev == in_dev &&
2036                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2037                         err = -EINVAL;
2038                         goto cleanup;
2039                 }
2040         }
2041
2042         rth = rt_dst_alloc(out_dev->dev,
2043                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2044                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2045         if (!rth) {
2046                 err = -ENOBUFS;
2047                 goto cleanup;
2048         }
2049
2050         rth->rt_key_dst = daddr;
2051         rth->rt_key_src = saddr;
2052         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2053         rth->rt_flags = flags;
2054         rth->rt_type = res->type;
2055         rth->rt_key_tos = tos;
2056         rth->rt_dst     = daddr;
2057         rth->rt_src     = saddr;
2058         rth->rt_route_iif = in_dev->dev->ifindex;
2059         rth->rt_iif     = in_dev->dev->ifindex;
2060         rth->rt_oif     = 0;
2061         rth->rt_mark    = skb->mark;
2062         rth->rt_gateway = daddr;
2063         rth->rt_spec_dst= spec_dst;
2064         rth->rt_peer_genid = 0;
2065         rth->peer = NULL;
2066         rth->fi = NULL;
2067
2068         rth->dst.input = ip_forward;
2069         rth->dst.output = ip_output;
2070
2071         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2072
2073         *result = rth;
2074         err = 0;
2075  cleanup:
2076         return err;
2077 }
2078
2079 static int ip_mkroute_input(struct sk_buff *skb,
2080                             struct fib_result *res,
2081                             const struct flowi4 *fl4,
2082                             struct in_device *in_dev,
2083                             __be32 daddr, __be32 saddr, u32 tos)
2084 {
2085         struct rtable* rth = NULL;
2086         int err;
2087         unsigned hash;
2088
2089 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2090         if (res->fi && res->fi->fib_nhs > 1)
2091                 fib_select_multipath(res);
2092 #endif
2093
2094         /* create a routing cache entry */
2095         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2096         if (err)
2097                 return err;
2098
2099         /* put it into the cache */
2100         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2101                        rt_genid(dev_net(rth->dst.dev)));
2102         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2103         if (IS_ERR(rth))
2104                 return PTR_ERR(rth);
2105         return 0;
2106 }
2107
2108 /*
2109  *      NOTE. We drop all the packets that has local source
2110  *      addresses, because every properly looped back packet
2111  *      must have correct destination already attached by output routine.
2112  *
2113  *      Such approach solves two big problems:
2114  *      1. Not simplex devices are handled properly.
2115  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2116  *      called with rcu_read_lock()
2117  */
2118
2119 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120                                u8 tos, struct net_device *dev)
2121 {
2122         struct fib_result res;
2123         struct in_device *in_dev = __in_dev_get_rcu(dev);
2124         struct flowi4   fl4;
2125         unsigned        flags = 0;
2126         u32             itag = 0;
2127         struct rtable * rth;
2128         unsigned        hash;
2129         __be32          spec_dst;
2130         int             err = -EINVAL;
2131         struct net    * net = dev_net(dev);
2132
2133         /* IP on this device is disabled. */
2134
2135         if (!in_dev)
2136                 goto out;
2137
2138         /* Check for the most weird martians, which can be not detected
2139            by fib_lookup.
2140          */
2141
2142         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2143             ipv4_is_loopback(saddr))
2144                 goto martian_source;
2145
2146         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2147                 goto brd_input;
2148
2149         /* Accept zero addresses only to limited broadcast;
2150          * I even do not know to fix it or not. Waiting for complains :-)
2151          */
2152         if (ipv4_is_zeronet(saddr))
2153                 goto martian_source;
2154
2155         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2156                 goto martian_destination;
2157
2158         /*
2159          *      Now we are ready to route packet.
2160          */
2161         fl4.flowi4_oif = 0;
2162         fl4.flowi4_iif = dev->ifindex;
2163         fl4.flowi4_mark = skb->mark;
2164         fl4.flowi4_tos = tos;
2165         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2166         fl4.daddr = daddr;
2167         fl4.saddr = saddr;
2168         err = fib_lookup(net, &fl4, &res);
2169         if (err != 0) {
2170                 if (!IN_DEV_FORWARD(in_dev))
2171                         goto e_hostunreach;
2172                 goto no_route;
2173         }
2174
2175         RT_CACHE_STAT_INC(in_slow_tot);
2176
2177         if (res.type == RTN_BROADCAST)
2178                 goto brd_input;
2179
2180         if (res.type == RTN_LOCAL) {
2181                 err = fib_validate_source(skb, saddr, daddr, tos,
2182                                           net->loopback_dev->ifindex,
2183                                           dev, &spec_dst, &itag);
2184                 if (err < 0)
2185                         goto martian_source_keep_err;
2186                 if (err)
2187                         flags |= RTCF_DIRECTSRC;
2188                 spec_dst = daddr;
2189                 goto local_input;
2190         }
2191
2192         if (!IN_DEV_FORWARD(in_dev))
2193                 goto e_hostunreach;
2194         if (res.type != RTN_UNICAST)
2195                 goto martian_destination;
2196
2197         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2198 out:    return err;
2199
2200 brd_input:
2201         if (skb->protocol != htons(ETH_P_IP))
2202                 goto e_inval;
2203
2204         if (ipv4_is_zeronet(saddr))
2205                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2206         else {
2207                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2208                                           &itag);
2209                 if (err < 0)
2210                         goto martian_source_keep_err;
2211                 if (err)
2212                         flags |= RTCF_DIRECTSRC;
2213         }
2214         flags |= RTCF_BROADCAST;
2215         res.type = RTN_BROADCAST;
2216         RT_CACHE_STAT_INC(in_brd);
2217
2218 local_input:
2219         rth = rt_dst_alloc(net->loopback_dev,
2220                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2221         if (!rth)
2222                 goto e_nobufs;
2223
2224         rth->dst.input= ip_local_deliver;
2225         rth->dst.output= ip_rt_bug;
2226 #ifdef CONFIG_IP_ROUTE_CLASSID
2227         rth->dst.tclassid = itag;
2228 #endif
2229
2230         rth->rt_key_dst = daddr;
2231         rth->rt_key_src = saddr;
2232         rth->rt_genid = rt_genid(net);
2233         rth->rt_flags   = flags|RTCF_LOCAL;
2234         rth->rt_type    = res.type;
2235         rth->rt_key_tos = tos;
2236         rth->rt_dst     = daddr;
2237         rth->rt_src     = saddr;
2238 #ifdef CONFIG_IP_ROUTE_CLASSID
2239         rth->dst.tclassid = itag;
2240 #endif
2241         rth->rt_route_iif = dev->ifindex;
2242         rth->rt_iif     = dev->ifindex;
2243         rth->rt_oif     = 0;
2244         rth->rt_mark    = skb->mark;
2245         rth->rt_gateway = daddr;
2246         rth->rt_spec_dst= spec_dst;
2247         rth->rt_peer_genid = 0;
2248         rth->peer = NULL;
2249         rth->fi = NULL;
2250         if (res.type == RTN_UNREACHABLE) {
2251                 rth->dst.input= ip_error;
2252                 rth->dst.error= -err;
2253                 rth->rt_flags   &= ~RTCF_LOCAL;
2254         }
2255         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2256         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2257         err = 0;
2258         if (IS_ERR(rth))
2259                 err = PTR_ERR(rth);
2260         goto out;
2261
2262 no_route:
2263         RT_CACHE_STAT_INC(in_no_route);
2264         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2265         res.type = RTN_UNREACHABLE;
2266         if (err == -ESRCH)
2267                 err = -ENETUNREACH;
2268         goto local_input;
2269
2270         /*
2271          *      Do not cache martian addresses: they should be logged (RFC1812)
2272          */
2273 martian_destination:
2274         RT_CACHE_STAT_INC(in_martian_dst);
2275 #ifdef CONFIG_IP_ROUTE_VERBOSE
2276         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2277                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2278                         &daddr, &saddr, dev->name);
2279 #endif
2280
2281 e_hostunreach:
2282         err = -EHOSTUNREACH;
2283         goto out;
2284
2285 e_inval:
2286         err = -EINVAL;
2287         goto out;
2288
2289 e_nobufs:
2290         err = -ENOBUFS;
2291         goto out;
2292
2293 martian_source:
2294         err = -EINVAL;
2295 martian_source_keep_err:
2296         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2297         goto out;
2298 }
2299
2300 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301                            u8 tos, struct net_device *dev, bool noref)
2302 {
2303         struct rtable * rth;
2304         unsigned        hash;
2305         int iif = dev->ifindex;
2306         struct net *net;
2307         int res;
2308
2309         net = dev_net(dev);
2310
2311         rcu_read_lock();
2312
2313         if (!rt_caching(net))
2314                 goto skip_cache;
2315
2316         tos &= IPTOS_RT_MASK;
2317         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2318
2319         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2320              rth = rcu_dereference(rth->dst.rt_next)) {
2321                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2322                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2323                      (rth->rt_route_iif ^ iif) |
2324                      (rth->rt_key_tos ^ tos)) == 0 &&
2325                     rth->rt_mark == skb->mark &&
2326                     net_eq(dev_net(rth->dst.dev), net) &&
2327                     !rt_is_expired(rth)) {
2328                         if (noref) {
2329                                 dst_use_noref(&rth->dst, jiffies);
2330                                 skb_dst_set_noref(skb, &rth->dst);
2331                         } else {
2332                                 dst_use(&rth->dst, jiffies);
2333                                 skb_dst_set(skb, &rth->dst);
2334                         }
2335                         RT_CACHE_STAT_INC(in_hit);
2336                         rcu_read_unlock();
2337                         return 0;
2338                 }
2339                 RT_CACHE_STAT_INC(in_hlist_search);
2340         }
2341
2342 skip_cache:
2343         /* Multicast recognition logic is moved from route cache to here.
2344            The problem was that too many Ethernet cards have broken/missing
2345            hardware multicast filters :-( As result the host on multicasting
2346            network acquires a lot of useless route cache entries, sort of
2347            SDR messages from all the world. Now we try to get rid of them.
2348            Really, provided software IP multicast filter is organized
2349            reasonably (at least, hashed), it does not result in a slowdown
2350            comparing with route cache reject entries.
2351            Note, that multicast routers are not affected, because
2352            route cache entry is created eventually.
2353          */
2354         if (ipv4_is_multicast(daddr)) {
2355                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2356
2357                 if (in_dev) {
2358                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2359                                                   ip_hdr(skb)->protocol);
2360                         if (our
2361 #ifdef CONFIG_IP_MROUTE
2362                                 ||
2363                             (!ipv4_is_local_multicast(daddr) &&
2364                              IN_DEV_MFORWARD(in_dev))
2365 #endif
2366                            ) {
2367                                 int res = ip_route_input_mc(skb, daddr, saddr,
2368                                                             tos, dev, our);
2369                                 rcu_read_unlock();
2370                                 return res;
2371                         }
2372                 }
2373                 rcu_read_unlock();
2374                 return -EINVAL;
2375         }
2376         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2377         rcu_read_unlock();
2378         return res;
2379 }
2380 EXPORT_SYMBOL(ip_route_input_common);
2381
2382 /* called with rcu_read_lock() */
2383 static struct rtable *__mkroute_output(const struct fib_result *res,
2384                                        const struct flowi4 *fl4,
2385                                        __be32 orig_daddr, __be32 orig_saddr,
2386                                        int orig_oif, struct net_device *dev_out,
2387                                        unsigned int flags)
2388 {
2389         struct fib_info *fi = res->fi;
2390         u32 tos = RT_FL_TOS(fl4);
2391         struct in_device *in_dev;
2392         u16 type = res->type;
2393         struct rtable *rth;
2394
2395         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2396                 return ERR_PTR(-EINVAL);
2397
2398         if (ipv4_is_lbcast(fl4->daddr))
2399                 type = RTN_BROADCAST;
2400         else if (ipv4_is_multicast(fl4->daddr))
2401                 type = RTN_MULTICAST;
2402         else if (ipv4_is_zeronet(fl4->daddr))
2403                 return ERR_PTR(-EINVAL);
2404
2405         if (dev_out->flags & IFF_LOOPBACK)
2406                 flags |= RTCF_LOCAL;
2407
2408         in_dev = __in_dev_get_rcu(dev_out);
2409         if (!in_dev)
2410                 return ERR_PTR(-EINVAL);
2411
2412         if (type == RTN_BROADCAST) {
2413                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2414                 fi = NULL;
2415         } else if (type == RTN_MULTICAST) {
2416                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2417                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2418                                      fl4->flowi4_proto))
2419                         flags &= ~RTCF_LOCAL;
2420                 /* If multicast route do not exist use
2421                  * default one, but do not gateway in this case.
2422                  * Yes, it is hack.
2423                  */
2424                 if (fi && res->prefixlen < 4)
2425                         fi = NULL;
2426         }
2427
2428         rth = rt_dst_alloc(dev_out,
2429                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2430                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2431         if (!rth)
2432                 return ERR_PTR(-ENOBUFS);
2433
2434         rth->dst.output = ip_output;
2435
2436         rth->rt_key_dst = orig_daddr;
2437         rth->rt_key_src = orig_saddr;
2438         rth->rt_genid = rt_genid(dev_net(dev_out));
2439         rth->rt_flags   = flags;
2440         rth->rt_type    = type;
2441         rth->rt_key_tos = tos;
2442         rth->rt_dst     = fl4->daddr;
2443         rth->rt_src     = fl4->saddr;
2444         rth->rt_route_iif = 0;
2445         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2446         rth->rt_oif     = orig_oif;
2447         rth->rt_mark    = fl4->flowi4_mark;
2448         rth->rt_gateway = fl4->daddr;
2449         rth->rt_spec_dst= fl4->saddr;
2450         rth->rt_peer_genid = 0;
2451         rth->peer = NULL;
2452         rth->fi = NULL;
2453
2454         RT_CACHE_STAT_INC(out_slow_tot);
2455
2456         if (flags & RTCF_LOCAL) {
2457                 rth->dst.input = ip_local_deliver;
2458                 rth->rt_spec_dst = fl4->daddr;
2459         }
2460         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2461                 rth->rt_spec_dst = fl4->saddr;
2462                 if (flags & RTCF_LOCAL &&
2463                     !(dev_out->flags & IFF_LOOPBACK)) {
2464                         rth->dst.output = ip_mc_output;
2465                         RT_CACHE_STAT_INC(out_slow_mc);
2466                 }
2467 #ifdef CONFIG_IP_MROUTE
2468                 if (type == RTN_MULTICAST) {
2469                         if (IN_DEV_MFORWARD(in_dev) &&
2470                             !ipv4_is_local_multicast(fl4->daddr)) {
2471                                 rth->dst.input = ip_mr_input;
2472                                 rth->dst.output = ip_mc_output;
2473                         }
2474                 }
2475 #endif
2476         }
2477
2478         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2479
2480         return rth;
2481 }
2482
2483 /*
2484  * Major route resolver routine (slow path of the output route lookup).
      *
      * Resolves @fl4 without consulting the route cache: the requested source
      * address is sanity checked, an explicitly requested output interface is
      * honoured, the FIB is consulted through fib_lookup() and the route is
      * finally built by __mkroute_output().  On success the new entry is
      * handed to rt_intern_hash() under a hash of the caller's original
      * daddr/saddr/oif.
      *
      * @fl4 is updated in place (iif, tos and scope always; saddr, daddr and
      * oif when they had to be chosen here), so the caller sees the values
      * that were finally used.
      *
      * Returns the route or an ERR_PTR(): -EINVAL for an unusable source
      * address, -ENODEV when the requested oif does not exist,
      * -ENETUNREACH when the oif is down / has no in_device or when the FIB
      * has no route and no oif was given, or whatever __mkroute_output() or
      * rt_intern_hash() report.
      *
2485  * The whole lookup runs under rcu_read_lock(), taken and dropped here.
2486  */
2487
2488 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2489 {
2490         struct net_device *dev_out = NULL;
2491         u32 tos = RT_FL_TOS(fl4);
2492         unsigned int flags = 0;
2493         struct fib_result res;
2494         struct rtable *rth;
2495         __be32 orig_daddr;
2496         __be32 orig_saddr;
2497         int orig_oif;
2498
2499         res.fi          = NULL;
2500 #ifdef CONFIG_IP_MULTIPLE_TABLES
2501         res.r           = NULL;
2502 #endif
2503
             /* Remember the caller's key: it is what the cache entry is hashed on. */
2504         orig_daddr = fl4->daddr;
2505         orig_saddr = fl4->saddr;
2506         orig_oif = fl4->flowi4_oif;
2507
2508         fl4->flowi4_iif = net->loopback_dev->ifindex;
2509         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2510         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2511                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2512
2513         rcu_read_lock();
2514         if (fl4->saddr) {
                     /* Multicast, limited broadcast and 0.x.x.x are not valid sources. */
2515                 rth = ERR_PTR(-EINVAL);
2516                 if (ipv4_is_multicast(fl4->saddr) ||
2517                     ipv4_is_lbcast(fl4->saddr) ||
2518                     ipv4_is_zeronet(fl4->saddr))
2519                         goto out;
2520
2521                 /* I removed check for oif == dev_out->oif here.
2522                    It was wrong for two reasons:
2523                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2524                       is assigned to multiple interfaces.
2525                    2. Moreover, we are allowed to send packets with saddr
2526                       of another iface. --ANK
2527                  */
2528
2529                 if (fl4->flowi4_oif == 0 &&
2530                     (ipv4_is_multicast(fl4->daddr) ||
2531                      ipv4_is_lbcast(fl4->daddr))) {
2532                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2533                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2534                         if (dev_out == NULL)
2535                                 goto out;
2536
2537                         /* Special hack: user can direct multicasts
2538                            and limited broadcast via necessary interface
2539                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2540                            This hack is not just for fun, it allows
2541                            vic,vat and friends to work.
2542                            They bind socket to loopback, set ttl to zero
2543                            and expect that it will work.
2544                            From the viewpoint of routing cache they are broken,
2545                            because we are not allowed to build multicast path
2546                            with loopback source addr (look, routing cache
2547                            cannot know, that ttl is zero, so that packet
2548                            will not leave this host and route is valid).
2549                            Luckily, this hack is good workaround.
2550                          */
2551
2552                         fl4->flowi4_oif = dev_out->ifindex;
2553                         goto make_route;
2554                 }
2555
2556                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2557                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2558                         if (!__ip_dev_find(net, fl4->saddr, false))
2559                                 goto out;
2560                 }
2561         }
2562
2563
2564         if (fl4->flowi4_oif) {
2565                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2566                 rth = ERR_PTR(-ENODEV);
2567                 if (dev_out == NULL)
2568                         goto out;
2569
2570                 /* RACE: Check return value of inet_select_addr instead. */
2571                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2572                         rth = ERR_PTR(-ENETUNREACH);
2573                         goto out;
2574                 }
2575                 if (ipv4_is_local_multicast(fl4->daddr) ||
2576                     ipv4_is_lbcast(fl4->daddr)) {
2577                         if (!fl4->saddr)
2578                                 fl4->saddr = inet_select_addr(dev_out, 0,
2579                                                               RT_SCOPE_LINK);
2580                         goto make_route;
2581                 }
                     /*
                      * Only pick a source address on the requested device when
                      * the caller did not supply one.  A caller-supplied saddr
                      * has already been checked above and must not be
                      * overwritten here.
                      */
2582                 if (!fl4->saddr) {
2583                         if (ipv4_is_multicast(fl4->daddr))
2584                                 fl4->saddr = inet_select_addr(dev_out, 0,
2585                                                               fl4->flowi4_scope);
2586                         else if (!fl4->daddr)
2587                                 fl4->saddr = inet_select_addr(dev_out, 0,
2588                                                               RT_SCOPE_HOST);
2589                 }
2590         }
2591
             /* No destination: route to ourselves through the loopback device. */
2592         if (!fl4->daddr) {
2593                 fl4->daddr = fl4->saddr;
2594                 if (!fl4->daddr)
2595                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2596                 dev_out = net->loopback_dev;
2597                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2598                 res.type = RTN_LOCAL;
2599                 flags |= RTCF_LOCAL;
2600                 goto make_route;
2601         }
2602
2603         if (fib_lookup(net, fl4, &res)) {
2604                 res.fi = NULL;
2605                 if (fl4->flowi4_oif) {
2606                         /* Apparently, routing tables are wrong. Assume,
2607                            that the destination is on link.
2608
2609                            WHY? DW.
2610                            Because we are allowed to send to iface
2611                            even if it has NO routes and NO assigned
2612                            addresses. When oif is specified, routing
2613                            tables are looked up with only one purpose:
2614                            to catch if destination is gatewayed, rather than
2615                            direct. Moreover, if MSG_DONTROUTE is set,
2616                            we send packet, ignoring both routing tables
2617                            and ifaddr state. --ANK
2618
2619
2620                            We could make it even if oif is unknown,
2621                            likely IPv6, but we do not.
2622                          */
2623
2624                         if (fl4->saddr == 0)
2625                                 fl4->saddr = inet_select_addr(dev_out, 0,
2626                                                               RT_SCOPE_LINK);
2627                         res.type = RTN_UNICAST;
2628                         goto make_route;
2629                 }
2630                 rth = ERR_PTR(-ENETUNREACH);
2631                 goto out;
2632         }
2633
             /* Destination is one of our own addresses: deliver via loopback. */
2634         if (res.type == RTN_LOCAL) {
2635                 if (!fl4->saddr) {
2636                         if (res.fi->fib_prefsrc)
2637                                 fl4->saddr = res.fi->fib_prefsrc;
2638                         else
2639                                 fl4->saddr = fl4->daddr;
2640                 }
2641                 dev_out = net->loopback_dev;
2642                 fl4->flowi4_oif = dev_out->ifindex;
2643                 res.fi = NULL;
2644                 flags |= RTCF_LOCAL;
2645                 goto make_route;
2646         }
2647
             /*
              * Without CONFIG_IP_ROUTE_MULTIPATH only the default-route
              * selection below is compiled; with it, the second "if" is the
              * else-branch of the multipath test.
              */
2648 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2649         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2650                 fib_select_multipath(&res);
2651         else
2652 #endif
2653         if (!res.prefixlen &&
2654             res.table->tb_num_default > 1 &&
2655             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2656                 fib_select_default(&res);
2657
2658         if (!fl4->saddr)
2659                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2660
2661         dev_out = FIB_RES_DEV(res);
2662         fl4->flowi4_oif = dev_out->ifindex;
2663
2664
2665 make_route:
2666         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2667                                dev_out, flags);
2668         if (!IS_ERR(rth)) {
2669                 unsigned int hash;
2670
2671                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2672                                rt_genid(dev_net(dev_out)));
2673                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2674         }
2675
2676 out:
2677         rcu_read_unlock();
2678         return rth;
2679 }
2680
/*
 * Look up an output route for @flp4 in namespace @net.
 *
 * Unless rt_caching(net) is false, the route cache chain selected by
 * rt_hash() is searched first under rcu_read_lock_bh().  An entry matches
 * when it is an output route, is not expired, belongs to @net and its
 * lookup key (daddr, saddr, oif, mark and the routing-relevant TOS bits)
 * equals the one in @flp4.  On a hit the entry is marked used with
 * dst_use() and a zero saddr/daddr in @flp4 is filled in from the cached
 * route before it is returned.
 *
 * On a miss, or when caching is disabled, the result of
 * ip_route_output_slow() is returned unchanged (a route or an ERR_PTR()).
 */
2681 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2682 {
2683         struct rtable *rth;
2684         unsigned int hash;
2685
2686         if (!rt_caching(net))
2687                 goto slow_output;
2688
2689         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2690
2691         rcu_read_lock_bh();
2692         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2693                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2694                 if (rth->rt_key_dst == flp4->daddr &&
2695                     rth->rt_key_src == flp4->saddr &&
2696                     rt_is_output_route(rth) &&
2697                     rth->rt_oif == flp4->flowi4_oif &&
2698                     rth->rt_mark == flp4->flowi4_mark &&
2699                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2700                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2701                     net_eq(dev_net(rth->dst.dev), net) &&
2702                     !rt_is_expired(rth)) {
2703                         dst_use(&rth->dst, jiffies);
2704                         RT_CACHE_STAT_INC(out_hit);
2705                         rcu_read_unlock_bh();
                             /* Report the addresses actually used back to the caller. */
2706                         if (!flp4->saddr)
2707                                 flp4->saddr = rth->rt_src;
2708                         if (!flp4->daddr)
2709                                 flp4->daddr = rth->rt_dst;
2710                         return rth;
2711                 }
                     /* Counted once per non-matching entry walked. */
2712                 RT_CACHE_STAT_INC(out_hlist_search);
2713         }
2714         rcu_read_unlock_bh();
2715
2716 slow_output:
2717         return ip_route_output_slow(net, flp4);
2718 }
2719 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2720
/*
 * .check callback of ipv4_dst_blackhole_ops: ignores both arguments and
 * always returns NULL.
 */
2721 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2722 {
2723         return NULL;
2724 }
2725
/*
 * .default_mtu callback of ipv4_dst_blackhole_ops: always reports 0,
 * regardless of @dst.
 */
2726 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2727 {
2728         return 0;
2729 }
2730
/*
 * .update_pmtu callback of ipv4_dst_blackhole_ops: intentionally empty,
 * the @mtu update is dropped.
 */
2731 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2732 {
2733 }
2734
/*
 * .cow_metrics callback of ipv4_dst_blackhole_ops: never provides a
 * writable metrics array, always returns NULL.
 */
2735 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2736                                           unsigned long old)
2737 {
2738         return NULL;
2739 }
2740
/*
 * dst_ops used for the entries allocated by ipv4_blackhole_route().
 * check/default_mtu/update_pmtu/cow_metrics point at the blackhole stubs
 * above; destroy, default_advmss and neigh_lookup reuse the regular IPv4
 * helpers.  kmem_cachep is not set here: ip_rt_init() points it at the
 * cache of ipv4_dst_ops.
 */
2741 static struct dst_ops ipv4_dst_blackhole_ops = {
2742         .family                 =       AF_INET,
2743         .protocol               =       cpu_to_be16(ETH_P_IP),
2744         .destroy                =       ipv4_dst_destroy,
2745         .check                  =       ipv4_blackhole_dst_check,
2746         .default_mtu            =       ipv4_blackhole_default_mtu,
2747         .default_advmss         =       ipv4_default_advmss,
2748         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2749         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2750         .neigh_lookup           =       ipv4_neigh_lookup,
2751 };
2752
/*
 * Build a blackhole twin of the route @dst_orig.
 *
 * A new entry is allocated from ipv4_dst_blackhole_ops with both input and
 * output set to dst_discard.  Metrics, device, lookup keys, flags, type and
 * addresses are copied from the original; references are taken on the
 * device, the inet_peer and the fib_info that the copy now shares.
 * rt_genid is taken from @net rather than copied.
 *
 * The caller's reference on @dst_orig is always dropped, also on failure.
 *
 * Returns the new dst_entry, or ERR_PTR(-ENOMEM) if the allocation failed.
 */
2753 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2754 {
2755         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2756         struct rtable *ort = (struct rtable *) dst_orig;
2757
2758         if (rt) {
2759                 struct dst_entry *new = &rt->dst;
2760
2761                 new->__use = 1;
2762                 new->input = dst_discard;
2763                 new->output = dst_discard;
2764                 dst_copy_metrics(new, &ort->dst);
2765
2766                 new->dev = ort->dst.dev;
2767                 if (new->dev)
2768                         dev_hold(new->dev);
2769
2770                 rt->rt_key_dst = ort->rt_key_dst;
2771                 rt->rt_key_src = ort->rt_key_src;
2772                 rt->rt_key_tos = ort->rt_key_tos;
2773                 rt->rt_route_iif = ort->rt_route_iif;
2774                 rt->rt_iif = ort->rt_iif;
2775                 rt->rt_oif = ort->rt_oif;
2776                 rt->rt_mark = ort->rt_mark;
2777
2778                 rt->rt_genid = rt_genid(net);
2779                 rt->rt_flags = ort->rt_flags;
2780                 rt->rt_type = ort->rt_type;
2781                 rt->rt_dst = ort->rt_dst;
2782                 rt->rt_src = ort->rt_src;
2783                 rt->rt_gateway = ort->rt_gateway;
2784                 rt->rt_spec_dst = ort->rt_spec_dst;
2785                 rt->peer = ort->peer;
2786                 if (rt->peer)
2787                         atomic_inc(&rt->peer->refcnt);
2788                 rt->fi = ort->fi;
2789                 if (rt->fi)
2790                         atomic_inc(&rt->fi->fib_clntref);
2791
                     /*
                      * NOTE(review): dst_free() is called on the entry that is
                      * returned below.  Presumably this only queues it for
                      * release once the reference requested from dst_alloc()
                      * is dropped - confirm against dst_free().
                      */
2792                 dst_free(new);
2793         }
2794
2795         dst_release(dst_orig);
2796
2797         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2798 }
2799
/*
 * Output route lookup followed by an optional xfrm lookup.
 *
 * The route is resolved with __ip_route_output_key(); an ERR_PTR() from it
 * is returned unchanged.  When @flp4 carries a non-zero flowi4_proto, the
 * route is passed through xfrm_lookup() together with @sk and the result of
 * that call is returned instead.
 */
2800 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2801                                     struct sock *sk)
2802 {
2803         struct rtable *rt = __ip_route_output_key(net, flp4);
2804
2805         if (IS_ERR(rt))
2806                 return rt;
2807
2808         if (flp4->flowi4_proto)
2809                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2810                                                    flowi4_to_flowi(flp4),
2811                                                    sk, 0);
2812
2813         return rt;
2814 }
2815 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2816
/*
 * Append one route message describing skb_rtable(@skb) to @skb.
 *
 * @net:    namespace, used for the multicast-forwarding check
 * @skb:    netlink message under construction; also carries the route
 * @pid:    netlink pid placed in the message header
 * @seq:    netlink sequence number placed in the message header
 * @event:  netlink message type (callers here pass RTM_NEWROUTE)
 * @nowait: forwarded to ipmr_get_route(); also selects how a failure of
 *          that call is handled (see below)
 * @flags:  netlink message flags for nlmsg_put()
 *
 * The NLA_PUT_*() helpers are expected to jump to nla_put_failure when the
 * attribute does not fit into @skb.
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 and @nowait is not set, and -EMSGSIZE when the message could
 * not be built (the partial message is cancelled).
 */
2817 static int rt_fill_info(struct net *net,
2818                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2819                         int nowait, unsigned int flags)
2820 {
2821         struct rtable *rt = skb_rtable(skb);
2822         struct rtmsg *r;
2823         struct nlmsghdr *nlh;
2824         long expires = 0;
2825         const struct inet_peer *peer = rt->peer;
2826         u32 id = 0, ts = 0, tsage = 0, error;
2827
2828         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2829         if (nlh == NULL)
2830                 return -EMSGSIZE;
2831
2832         r = nlmsg_data(nlh);
2833         r->rtm_family    = AF_INET;
2834         r->rtm_dst_len  = 32;
2835         r->rtm_src_len  = 0;
2836         r->rtm_tos      = rt->rt_key_tos;
2837         r->rtm_table    = RT_TABLE_MAIN;
2838         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2839         r->rtm_type     = rt->rt_type;
2840         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2841         r->rtm_protocol = RTPROT_UNSPEC;
2842         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2843         if (rt->rt_flags & RTCF_NOTIFY)
2844                 r->rtm_flags |= RTM_F_NOTIFY;
2845
2846         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2847
2848         if (rt->rt_key_src) {
2849                 r->rtm_src_len = 32;
2850                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2851         }
2852         if (rt->dst.dev)
2853                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2854 #ifdef CONFIG_IP_ROUTE_CLASSID
2855         if (rt->dst.tclassid)
2856                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2857 #endif
2858         if (rt_is_input_route(rt))
2859                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2860         else if (rt->rt_src != rt->rt_key_src)
2861                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2862
2863         if (rt->rt_dst != rt->rt_gateway)
2864                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2865
2866         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2867                 goto nla_put_failure;
2868
             /*
              * The mark is a host-order u32 (inet_rtm_getroute() reads
              * RTA_MARK with nla_get_u32()), so emit it as U32, not BE32.
              */
2869         if (rt->rt_mark)
2870                 NLA_PUT_U32(skb, RTA_MARK, rt->rt_mark);
2871
2872         error = rt->dst.error;
2873         if (peer) {
2874                 inet_peer_refcheck(rt->peer);
                     /* Only the low 16 bits of the IP id counter are reported. */
2875                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2876                 if (peer->tcp_ts_stamp) {
2877                         ts = peer->tcp_ts;
2878                         tsage = get_seconds() - peer->tcp_ts_stamp;
2879                 }
                     /* Convert the absolute PMTU expiry into a value relative to now. */
2880                 expires = ACCESS_ONCE(peer->pmtu_expires);
2881                 if (expires)
2882                         expires -= jiffies;
2883         }
2884
2885         if (rt_is_input_route(rt)) {
2886 #ifdef CONFIG_IP_MROUTE
2887                 __be32 dst = rt->rt_dst;
2888
2889                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2890                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2891                         int err = ipmr_get_route(net, skb,
2892                                                  rt->rt_src, rt->rt_dst,
2893                                                  r, nowait);
2894                         if (err <= 0) {
2895                                 if (!nowait) {
2896                                         if (err == 0)
2897                                                 return 0;
2898                                         goto nla_put_failure;
2899                                 } else {
2900                                         if (err == -EMSGSIZE)
2901                                                 goto nla_put_failure;
                                             /* Other errors are reported through the cacheinfo. */
2902                                         error = err;
2903                                 }
2904                         }
2905                 } else
2906 #endif
2907                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2908         }
2909
2910         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2911                                expires, error) < 0)
2912                 goto nla_put_failure;
2913
2914         return nlmsg_end(skb, nlh);
2915
2916 nla_put_failure:
2917         nlmsg_cancel(skb, nlh);
2918         return -EMSGSIZE;
2919 }
2920
/*
 * rtnetlink handler that answers a single route query.
 *
 * The request attributes are parsed with rtm_ipv4_policy and a reply skb is
 * allocated.  When RTA_IIF is present the query is resolved as an input
 * route with ip_route_input() on that device (a route carrying dst.error is
 * turned into that error); otherwise it is resolved as an output route with
 * ip_route_output_key().  The route is attached to the reply skb, described
 * by rt_fill_info() and unicast back to the requesting netlink pid.
 *
 * Returns the result of rtnl_unicast() when the reply was built, otherwise
 * a negative errno (or the non-positive value returned by rt_fill_info());
 * the reply skb is freed on every such failure after allocation.
 */
2921 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2922 {
2923         struct net *net = sock_net(in_skb->sk);
2924         struct rtmsg *rtm;
2925         struct nlattr *tb[RTA_MAX+1];
2926         struct rtable *rt = NULL;
2927         __be32 dst = 0;
2928         __be32 src = 0;
2929         u32 iif;
2930         int err;
2931         int mark;
2932         struct sk_buff *skb;
2933
2934         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2935         if (err < 0)
2936                 goto errout;
2937
2938         rtm = nlmsg_data(nlh);
2939
2940         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2941         if (skb == NULL) {
2942                 err = -ENOBUFS;
2943                 goto errout;
2944         }
2945
2946         /* Reserve room for dummy headers, this skb can pass
2947            through good chunk of routing engine.
2948          */
2949         skb_reset_mac_header(skb);
2950         skb_reset_network_header(skb);
2951
2952         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2953         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2954         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2955
             /* Absent attributes default to 0. */
2956         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2957         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2958         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2959         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2960
2961         if (iif) {
2962                 struct net_device *dev;
2963
2964                 dev = __dev_get_by_index(net, iif);
2965                 if (dev == NULL) {
2966                         err = -ENODEV;
2967                         goto errout_free;
2968                 }
2969
2970                 skb->protocol   = htons(ETH_P_IP);
2971                 skb->dev        = dev;
2972                 skb->mark       = mark;
2973                 local_bh_disable();
2974                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2975                 local_bh_enable();
2976
2977                 rt = skb_rtable(skb);
2978                 if (err == 0 && rt->dst.error)
2979                         err = -rt->dst.error;
2980         } else {
2981                 struct flowi4 fl4 = {
2982                         .daddr = dst,
2983                         .saddr = src,
2984                         .flowi4_tos = rtm->rtm_tos,
2985                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2986                         .flowi4_mark = mark,
2987                 };
2988                 rt = ip_route_output_key(net, &fl4);
2989
2990                 err = 0;
2991                 if (IS_ERR(rt))
2992                         err = PTR_ERR(rt);
2993         }
2994
2995         if (err)
2996                 goto errout_free;
2997
2998         skb_dst_set(skb, &rt->dst);
2999         if (rtm->rtm_flags & RTM_F_NOTIFY)
3000                 rt->rt_flags |= RTCF_NOTIFY;
3001
3002         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3003                            RTM_NEWROUTE, 0, 0);
3004         if (err <= 0)
3005                 goto errout_free;
3006
3007         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3008 errout:
3009         return err;
3010
3011 errout_free:
3012         kfree_skb(skb);
3013         goto errout;
3014 }
3015
/*
 * Netlink dump callback for the route cache.
 *
 * Walks rt_hash_table and emits one RTM_NEWROUTE message (NLM_F_MULTI) via
 * rt_fill_info() for every non-expired entry that belongs to the namespace
 * of the requesting socket.  cb->args[0] and cb->args[1] hold the bucket
 * and the in-chain index to resume from; both are rewritten before
 * returning so that the next call continues where this one stopped (the
 * walk stops as soon as rt_fill_info() returns a value <= 0).
 *
 * Returns skb->len.
 */
3016 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3017 {
3018         struct rtable *rt;
3019         int h, s_h;
3020         int idx, s_idx;
3021         struct net *net;
3022
3023         net = sock_net(skb->sk);
3024
3025         s_h = cb->args[0];
3026         if (s_h < 0)
3027                 s_h = 0;
3028         s_idx = idx = cb->args[1];
             /* s_idx only applies to the first bucket; later buckets start at 0. */
3029         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3030                 if (!rt_hash_table[h].chain)
3031                         continue;
3032                 rcu_read_lock_bh();
3033                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3034                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3035                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3036                                 continue;
3037                         if (rt_is_expired(rt))
3038                                 continue;
                             /* rt_fill_info() reads the route from the skb's dst. */
3039                         skb_dst_set_noref(skb, &rt->dst);
3040                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3041                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3042                                          1, NLM_F_MULTI) <= 0) {
3043                                 skb_dst_drop(skb);
3044                                 rcu_read_unlock_bh();
3045                                 goto done;
3046                         }
3047                         skb_dst_drop(skb);
3048                 }
3049                 rcu_read_unlock_bh();
3050         }
3051
3052 done:
3053         cb->args[0] = h;
3054         cb->args[1] = idx;
3055         return skb->len;
3056 }
3057
/*
 * Flush the route cache of the namespace that owns @in_dev's device,
 * passing a delay argument of 0 to rt_cache_flush().
 */
3058 void ip_rt_multicast_event(struct in_device *in_dev)
3059 {
3060         rt_cache_flush(dev_net(in_dev->dev), 0);
3061 }
3062
3063 #ifdef CONFIG_SYSCTL
/*
 * Handler for the write-only "flush" sysctl (see ipv4_route_flush_table).
 *
 * The written integer is parsed into a local through a private copy of the
 * ctl_table (the shared entry has no .data) and passed as the delay to
 * rt_cache_flush() for the namespace stored in ->extra1.
 *
 * Returns 0 after a flush, the error from proc_dointvec() when the value
 * could not be parsed (no flush is done in that case), and -EINVAL for
 * reads.
 */
3064 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3065                                         void __user *buffer,
3066                                         size_t *lenp, loff_t *ppos)
3067 {
3068         if (write) {
                     /*
                      * Start from 0 (the delay ip_rt_multicast_event() uses) so
                      * that a write which stores no value never flushes with an
                      * uninitialised delay.
                      */
3069                 int flush_delay = 0;
3070                 ctl_table ctl;
3071                 struct net *net;
                     int ret;
3072
3073                 memcpy(&ctl, __ctl, sizeof(ctl));
3074                 ctl.data = &flush_delay;
3075                 ret = proc_dointvec(&ctl, write, buffer, lenp, ppos);
                     if (ret)
                             return ret;
3076
3077                 net = (struct net *)__ctl->extra1;
3078                 rt_cache_flush(net, flush_delay);
3079                 return 0;
3080         }
3081
3082         return -EINVAL;
3083 }
3084
/*
 * Tunables of the "route" sysctl directory (child of ipv4_skeleton).
 * Every entry is an int with mode 0644; the handler decides the unit seen
 * by user space: proc_dointvec for raw values, proc_dointvec_jiffies and
 * proc_dointvec_ms_jiffies for values kept in jiffies.
 */
3085 static ctl_table ipv4_route_table[] = {
3086         {
3087                 .procname       = "gc_thresh",
3088                 .data           = &ipv4_dst_ops.gc_thresh,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = proc_dointvec,
3092         },
3093         {
3094                 .procname       = "max_size",
3095                 .data           = &ip_rt_max_size,
3096                 .maxlen         = sizeof(int),
3097                 .mode           = 0644,
3098                 .proc_handler   = proc_dointvec,
3099         },
3100         {
3101                 /*  Deprecated. Use gc_min_interval_ms */
3102
3103                 .procname       = "gc_min_interval",
3104                 .data           = &ip_rt_gc_min_interval,
3105                 .maxlen         = sizeof(int),
3106                 .mode           = 0644,
3107                 .proc_handler   = proc_dointvec_jiffies,
3108         },
3109         {
                     /* Same variable as gc_min_interval, different handler. */
3110                 .procname       = "gc_min_interval_ms",
3111                 .data           = &ip_rt_gc_min_interval,
3112                 .maxlen         = sizeof(int),
3113                 .mode           = 0644,
3114                 .proc_handler   = proc_dointvec_ms_jiffies,
3115         },
3116         {
3117                 .procname       = "gc_timeout",
3118                 .data           = &ip_rt_gc_timeout,
3119                 .maxlen         = sizeof(int),
3120                 .mode           = 0644,
3121                 .proc_handler   = proc_dointvec_jiffies,
3122         },
3123         {
3124                 .procname       = "gc_interval",
3125                 .data           = &ip_rt_gc_interval,
3126                 .maxlen         = sizeof(int),
3127                 .mode           = 0644,
3128                 .proc_handler   = proc_dointvec_jiffies,
3129         },
3130         {
3131                 .procname       = "redirect_load",
3132                 .data           = &ip_rt_redirect_load,
3133                 .maxlen         = sizeof(int),
3134                 .mode           = 0644,
3135                 .proc_handler   = proc_dointvec,
3136         },
3137         {
3138                 .procname       = "redirect_number",
3139                 .data           = &ip_rt_redirect_number,
3140                 .maxlen         = sizeof(int),
3141                 .mode           = 0644,
3142                 .proc_handler   = proc_dointvec,
3143         },
3144         {
3145                 .procname       = "redirect_silence",
3146                 .data           = &ip_rt_redirect_silence,
3147                 .maxlen         = sizeof(int),
3148                 .mode           = 0644,
3149                 .proc_handler   = proc_dointvec,
3150         },
3151         {
3152                 .procname       = "error_cost",
3153                 .data           = &ip_rt_error_cost,
3154                 .maxlen         = sizeof(int),
3155                 .mode           = 0644,
3156                 .proc_handler   = proc_dointvec,
3157         },
3158         {
3159                 .procname       = "error_burst",
3160                 .data           = &ip_rt_error_burst,
3161                 .maxlen         = sizeof(int),
3162                 .mode           = 0644,
3163                 .proc_handler   = proc_dointvec,
3164         },
3165         {
3166                 .procname       = "gc_elasticity",
3167                 .data           = &ip_rt_gc_elasticity,
3168                 .maxlen         = sizeof(int),
3169                 .mode           = 0644,
3170                 .proc_handler   = proc_dointvec,
3171         },
3172         {
3173                 .procname       = "mtu_expires",
3174                 .data           = &ip_rt_mtu_expires,
3175                 .maxlen         = sizeof(int),
3176                 .mode           = 0644,
3177                 .proc_handler   = proc_dointvec_jiffies,
3178         },
3179         {
3180                 .procname       = "min_pmtu",
3181                 .data           = &ip_rt_min_pmtu,
3182                 .maxlen         = sizeof(int),
3183                 .mode           = 0644,
3184                 .proc_handler   = proc_dointvec,
3185         },
3186         {
3187                 .procname       = "min_adv_mss",
3188                 .data           = &ip_rt_min_advmss,
3189                 .maxlen         = sizeof(int),
3190                 .mode           = 0644,
3191                 .proc_handler   = proc_dointvec,
3192         },
3193         { }
3194 };
3195
/* Zero-filled one-entry table; used as the child of "neigh" in ipv4_skeleton. */
3196 static struct ctl_table empty[1];
3197
/*
 * Directory skeleton with two read-only (0555) directories: "route",
 * populated from ipv4_route_table, and "neigh", whose child table is the
 * empty placeholder above.
 */
3198 static struct ctl_table ipv4_skeleton[] =
3199 {
3200         { .procname = "route",
3201           .mode = 0555, .child = ipv4_route_table},
3202         { .procname = "neigh",
3203           .mode = 0555, .child = empty},
3204         { }
3205 };
3206
/*
 * sysctl path "net" / "ipv4".  Its user is not visible in this part of
 * the file; presumably the path under which ipv4_skeleton is registered.
 */
3207 static __net_initdata struct ctl_path ipv4_path[] = {
3208         { .procname = "net", },
3209         { .procname = "ipv4", },
3210         { },
3211 };
3212
/*
 * Per-namespace table holding only the write-only (0200) "flush" entry,
 * handled by ipv4_sysctl_rtcache_flush().  It has no .data; the owning
 * namespace is stored in ->extra1 by sysctl_route_net_init().
 */
3213 static struct ctl_table ipv4_route_flush_table[] = {
3214         {
3215                 .procname       = "flush",
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0200,
3218                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3219         },
3220         { },
3221 };
3222
/*
 * sysctl path "net" / "ipv4" / "route" under which
 * sysctl_route_net_init() registers the per-namespace flush table.
 */
3223 static __net_initdata struct ctl_path ipv4_route_path[] = {
3224         { .procname = "net", },
3225         { .procname = "ipv4", },
3226         { .procname = "route", },
3227         { },
3228 };
3229
/*
 * Per-namespace init: register the "flush" sysctl for @net.
 *
 * init_net uses ipv4_route_flush_table directly; every other namespace gets
 * its own kmemdup()ed copy.  @net is stored in the entry's ->extra1 so the
 * handler knows which cache to flush, and the registration header is kept
 * in net->ipv4.route_hdr.
 *
 * Returns 0 on success, -ENOMEM if the copy or the registration failed
 * (a private copy is freed again in the latter case).
 */
3230 static __net_init int sysctl_route_net_init(struct net *net)
3231 {
3232         struct ctl_table *tbl;
3233
3234         tbl = ipv4_route_flush_table;
3235         if (!net_eq(net, &init_net)) {
3236                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3237                 if (tbl == NULL)
3238                         goto err_dup;
3239         }
3240         tbl[0].extra1 = net;
3241
3242         net->ipv4.route_hdr =
3243                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3244         if (net->ipv4.route_hdr == NULL)
3245                 goto err_reg;
3246         return 0;
3247
3248 err_reg:
3249         if (tbl != ipv4_route_flush_table)
3250                 kfree(tbl);
3251 err_dup:
3252         return -ENOMEM;
3253 }
3254
/*
 * Per-namespace exit: unregister the "flush" sysctl of @net and free the
 * table copy made by sysctl_route_net_init().  The table pointer is read
 * from the header before unregistering.  The BUG_ON() asserts that this is
 * never run for a namespace still using the static table.
 */
3255 static __net_exit void sysctl_route_net_exit(struct net *net)
3256 {
3257         struct ctl_table *tbl;
3258
3259         tbl = net->ipv4.route_hdr->ctl_table_arg;
3260         unregister_net_sysctl_table(net->ipv4.route_hdr);
3261         BUG_ON(tbl == ipv4_route_flush_table);
3262         kfree(tbl);
3263 }
3264
/* Per-namespace hooks that set up and tear down the "flush" sysctl. */
3265 static __net_initdata struct pernet_operations sysctl_route_ops = {
3266         .init = sysctl_route_net_init,
3267         .exit = sysctl_route_net_exit,
3268 };
3269 #endif
3270
/*
 * Per-namespace init: seed net->ipv4.rt_genid and
 * net->ipv4.dev_addr_genid with random bytes.  Always returns 0.
 */
3271 static __net_init int rt_genid_init(struct net *net)
3272 {
3273         get_random_bytes(&net->ipv4.rt_genid,
3274                          sizeof(net->ipv4.rt_genid));
3275         get_random_bytes(&net->ipv4.dev_addr_genid,
3276                          sizeof(net->ipv4.dev_addr_genid));
3277         return 0;
3278 }
3279
/* Per-namespace hook running rt_genid_init(); there is no exit handler. */
3280 static __net_initdata struct pernet_operations rt_genid_ops = {
3281         .init = rt_genid_init,
3282 };
3283
3284
3285 #ifdef CONFIG_IP_ROUTE_CLASSID
     /*
      * Per-CPU accounting area, only built with CONFIG_IP_ROUTE_CLASSID.
      * ip_rt_init() allocates room for 256 struct ip_rt_acct per CPU.
      */
3286 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3287 #endif /* CONFIG_IP_ROUTE_CLASSID */
3288
/*
 * Value of the "rhash_entries=" boot parameter.  ip_rt_init() passes it to
 * alloc_large_system_hash() when sizing the route cache hash table; it
 * stays 0 when the parameter is not given.
 */
3289 static __initdata unsigned long rhash_entries;
     /*
      * __setup() handler for "rhash_entries=": parses @str with
      * simple_strtoul() (base argument 0) into rhash_entries.
      * Returns 0 when @str is NULL, 1 otherwise.
      */
3290 static int __init set_rhash_entries(char *str)
3291 {
3292         if (!str)
3293                 return 0;
3294         rhash_entries = simple_strtoul(str, &str, 0);
3295         return 1;
3296 }
3297 __setup("rhash_entries=", set_rhash_entries);
3298
3299 int __init ip_rt_init(void)
3300 {
3301         int rc = 0;
3302
3303 #ifdef CONFIG_IP_ROUTE_CLASSID
3304         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3305         if (!ip_rt_acct)
3306                 panic("IP: failed to allocate ip_rt_acct\n");
3307 #endif
3308
3309         ipv4_dst_ops.kmem_cachep =
3310                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3311                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3312
3313         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3314
3315         if (dst_entries_init(&ipv4_dst_ops) < 0)
3316                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3317
3318         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3319                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3320
3321         rt_hash_table = (struct rt_hash_bucket *)
3322                 alloc_large_system_hash("IP route cache",
3323                                         sizeof(struct rt_hash_bucket),
3324                                         rhash_entries,
3325                                         (totalram_pages >= 128 * 1024) ?
3326                                         15 : 17,
3327                                         0,
3328                                         &rt_hash_log,
3329                                         &rt_hash_mask,
3330                                         rhash_entries ? 0 : 512 * 1024);
3331         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3332         rt_hash_lock_init();
3333
3334         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3335         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3336
3337         devinet_init();
3338         ip_fib_init();
3339
3340         if (ip_rt_proc_init())
3341                 printk(KERN_ERR "Unable to create route proc files\n");
3342 #ifdef CONFIG_XFRM
3343         xfrm_init();
3344         xfrm4_init(ip_rt_max_size);
3345 #endif
3346         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3347
3348 #ifdef CONFIG_SYSCTL
3349         register_pernet_subsys(&sysctl_route_ops);
3350 #endif
3351         register_pernet_subsys(&rt_genid_ops);
3352         return rc;
3353 }
3354
#ifdef CONFIG_SYSCTL
/*
 * Register the ipv4_skeleton sysctl table under ipv4_path.
 *
 * This separate init hook exists only because of the current ipv4
 * initialisation order; once that order is sanitized it can go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif