net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp4) \
 113     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
/* NOTE(review): presumably the largest MTU this file will report - confirm at the users. */
#define IP_MAX_MTU      0xFFF0

/* Default GC timeout: 300 * HZ, i.e. a duration expressed in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Route cache tunables.  Time-valued defaults are written as multiples or
 * fractions of HZ, so they are durations in jiffies.
 * NOTE(review): presumably most of these are exposed through sysctl; the
 * sysctl table is not visible here - confirm.
 */
/* No initializer here; compared against the dst entry count in rt_garbage_collect(). */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
/* Minimum spacing between two GC runs, checked in rt_garbage_collect(). */
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
/* Same literals as the two defaults above: (HZ / 50) << (9 + 1). */
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
/* Shifted left by rt_hash_log in rt_garbage_collect() to derive the GC goal. */
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
static int rt_chain_length_max __read_mostly    = 20;
 133
/*
 *      Interface to generic destination cache.
 *
 *      Forward declarations of the callbacks that are wired into
 *      ipv4_dst_ops below; their definitions are not in this part of
 *      the file.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/* Installed as the ->gc callback of ipv4_dst_ops. */
static int rt_garbage_collect(struct dst_ops *ops);
 146
 147 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 148                             int how)
 149 {
 150 }
 151
 152 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 153 {
 154         struct rtable *rt = (struct rtable *) dst;
 155         struct inet_peer *peer;
 156         u32 *p = NULL;
 157
 158         if (!rt->peer)
 159                 rt_bind_peer(rt, rt->rt_dst, 1);
 160
 161         peer = rt->peer;
 162         if (peer) {
 163                 u32 *old_p = __DST_METRICS_PTR(old);
 164                 unsigned long prev, new;
 165
 166                 p = peer->metrics;
 167                 if (inet_metrics_new(peer))
 168                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 169
 170                 new = (unsigned long) p;
 171                 prev = cmpxchg(&dst->_metrics, old, new);
 172
 173                 if (prev != old) {
 174                         p = __DST_METRICS_PTR(prev);
 175                         if (prev & DST_METRICS_READ_ONLY)
 176                                 p = NULL;
 177                 } else {
 178                         if (rt->fi) {
 179                                 fib_info_put(rt->fi);
 180                                 rt->fi = NULL;
 181                         }
 182                 }
 183         }
 184         return p;
 185 }
 186
 187 static struct dst_ops ipv4_dst_ops = {
 188         .family =               AF_INET,
 189         .protocol =             cpu_to_be16(ETH_P_IP),
 190         .gc =                   rt_garbage_collect,
 191         .check =                ipv4_dst_check,
 192         .default_advmss =       ipv4_default_advmss,
 193         .default_mtu =          ipv4_default_mtu,
 194         .cow_metrics =          ipv4_cow_metrics,
 195         .destroy =              ipv4_dst_destroy,
 196         .ifdown =               ipv4_dst_ifdown,
 197         .negative_advice =      ipv4_negative_advice,
 198         .link_failure =         ipv4_link_failure,
 199         .update_pmtu =          ip_rt_update_pmtu,
 200         .local_out =            __ip_local_out,
 201 };
 202
 203 #define ECN_OR_COST(class)      TC_PRIO_##class
 204
 205 const __u8 ip_tos2prio[16] = {
 206         TC_PRIO_BESTEFFORT,
 207         ECN_OR_COST(BESTEFFORT),
 208         TC_PRIO_BESTEFFORT,
 209         ECN_OR_COST(BESTEFFORT),
 210         TC_PRIO_BULK,
 211         ECN_OR_COST(BULK),
 212         TC_PRIO_BULK,
 213         ECN_OR_COST(BULK),
 214         TC_PRIO_INTERACTIVE,
 215         ECN_OR_COST(INTERACTIVE),
 216         TC_PRIO_INTERACTIVE,
 217         ECN_OR_COST(INTERACTIVE),
 218         TC_PRIO_INTERACTIVE_BULK,
 219         ECN_OR_COST(INTERACTIVE_BULK),
 220         TC_PRIO_INTERACTIVE_BULK,
 221         ECN_OR_COST(INTERACTIVE_BULK)
 222 };
 223
 224
 225 /*
 226  * Route cache.
 227  */
 228
 229 /* The locking scheme is rather straight forward:
 230  *
 231  * 1) Read-Copy Update protects the buckets of the central route hash.
 232  * 2) Only writers remove entries, and they hold the lock
 233  *    as they look at rtable reference counts.
 234  * 3) Only readers acquire references to rtable entries,
 235  *    they do so with atomic increments and with the
 236  *    lock held.
 237  */
 238
/* One slot of the route hash table. */
struct rt_hash_bucket {
        /* Head of this slot's RCU-protected chain, linked via dst.rt_next. */
        struct rtable __rcu     *chain;
};
 242
 243 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 244         defined(CONFIG_PROVE_LOCKING)
 245 /*
 246  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 247  * The size of this table is a power of two and depends on the number of CPUS.
 248  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 249  */
 250 #ifdef CONFIG_LOCKDEP
 251 # define RT_HASH_LOCK_SZ        256
 252 #else
 253 # if NR_CPUS >= 32
 254 #  define RT_HASH_LOCK_SZ       4096
 255 # elif NR_CPUS >= 16
 256 #  define RT_HASH_LOCK_SZ       2048
 257 # elif NR_CPUS >= 8
 258 #  define RT_HASH_LOCK_SZ       1024
 259 # elif NR_CPUS >= 4
 260 #  define RT_HASH_LOCK_SZ       512
 261 # else
 262 #  define RT_HASH_LOCK_SZ       256
 263 # endif
 264 #endif
 265
 266 static spinlock_t       *rt_hash_locks;
 267 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 268
 269 static __init void rt_hash_lock_init(void)
 270 {
 271         int i;
 272
 273         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 274                         GFP_KERNEL);
 275         if (!rt_hash_locks)
 276                 panic("IP: failed to allocate rt_hash_locks\n");
 277
 278         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 279                 spin_lock_init(&rt_hash_locks[i]);
 280 }
 281 #else
 282 # define rt_hash_lock_addr(slot) NULL
 283
 284 static inline void rt_hash_lock_init(void)
 285 {
 286 }
 287 #endif
 288
/* The route hash table itself: an array of buckets indexed by rt_hash(). */
static struct rt_hash_bucket    *rt_hash_table __read_mostly;
/* Highest valid bucket index; hashes are masked with it. */
static unsigned                 rt_hash_mask __read_mostly;
/* NOTE(review): presumably log2 of the number of buckets - confirm at the init site. */
static unsigned int             rt_hash_log  __read_mostly;

/* Per-CPU route cache counters; RT_CACHE_STAT_INC bumps one on this CPU. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 295
 296 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 297                                    int genid)
 298 {
 299         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 300                             idx, genid)
 301                 & rt_hash_mask;
 302 }
 303
 304 static inline int rt_genid(struct net *net)
 305 {
 306         return atomic_read(&net->ipv4.rt_genid);
 307 }
 308
 309 #ifdef CONFIG_PROC_FS
/* Private seq_file state for walking the route cache. */
struct rt_cache_iter_state {
        struct seq_net_private p;
        /* Hash bucket currently being walked; counts down from rt_hash_mask. */
        int bucket;
        /* Generation id captured in rt_cache_seq_start(); only matching entries are shown. */
        int genid;
};
 315
 316 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 317 {
 318         struct rt_cache_iter_state *st = seq->private;
 319         struct rtable *r = NULL;
 320
 321         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 322                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 323                         continue;
 324                 rcu_read_lock_bh();
 325                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 326                 while (r) {
 327                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 328                             r->rt_genid == st->genid)
 329                                 return r;
 330                         r = rcu_dereference_bh(r->dst.rt_next);
 331                 }
 332                 rcu_read_unlock_bh();
 333         }
 334         return r;
 335 }
 336
 337 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 338                                           struct rtable *r)
 339 {
 340         struct rt_cache_iter_state *st = seq->private;
 341
 342         r = rcu_dereference_bh(r->dst.rt_next);
 343         while (!r) {
 344                 rcu_read_unlock_bh();
 345                 do {
 346                         if (--st->bucket < 0)
 347                                 return NULL;
 348                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 349                 rcu_read_lock_bh();
 350                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 351         }
 352         return r;
 353 }
 354
 355 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 356                                         struct rtable *r)
 357 {
 358         struct rt_cache_iter_state *st = seq->private;
 359         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 360                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 361                         continue;
 362                 if (r->rt_genid == st->genid)
 363                         break;
 364         }
 365         return r;
 366 }
 367
 368 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 369 {
 370         struct rtable *r = rt_cache_get_first(seq);
 371
 372         if (r)
 373                 while (pos && (r = rt_cache_get_next(seq, r)))
 374                         --pos;
 375         return pos ? NULL : r;
 376 }
 377
 378 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 379 {
 380         struct rt_cache_iter_state *st = seq->private;
 381         if (*pos)
 382                 return rt_cache_get_idx(seq, *pos - 1);
 383         st->genid = rt_genid(seq_file_net(seq));
 384         return SEQ_START_TOKEN;
 385 }
 386
 387 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 388 {
 389         struct rtable *r;
 390
 391         if (v == SEQ_START_TOKEN)
 392                 r = rt_cache_get_first(seq);
 393         else
 394                 r = rt_cache_get_next(seq, v);
 395         ++*pos;
 396         return r;
 397 }
 398
 399 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 400 {
 401         if (v && v != SEQ_START_TOKEN)
 402                 rcu_read_unlock_bh();
 403 }
 404
 405 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 406 {
 407         if (v == SEQ_START_TOKEN)
 408                 seq_printf(seq, "%-127s\n",
 409                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 410                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 411                            "HHUptod\tSpecDst");
 412         else {
 413                 struct rtable *r = v;
 414                 int len;
 415
 416                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 417                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 418                         r->dst.dev ? r->dst.dev->name : "*",
 419                         (__force u32)r->rt_dst,
 420                         (__force u32)r->rt_gateway,
 421                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 422                         r->dst.__use, 0, (__force u32)r->rt_src,
 423                         dst_metric_advmss(&r->dst) + 40,
 424                         dst_metric(&r->dst, RTAX_WINDOW),
 425                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 426                               dst_metric(&r->dst, RTAX_RTTVAR)),
 427                         r->rt_key_tos,
 428                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
 429                         r->dst.hh ? (r->dst.hh->hh_output ==
 430                                        dev_queue_xmit) : 0,
 431                         r->rt_spec_dst, &len);
 432
 433                 seq_printf(seq, "%*s\n", 127 - len, "");
 434         }
 435         return 0;
 436 }
 437
 438 static const struct seq_operations rt_cache_seq_ops = {
 439         .start  = rt_cache_seq_start,
 440         .next   = rt_cache_seq_next,
 441         .stop   = rt_cache_seq_stop,
 442         .show   = rt_cache_seq_show,
 443 };
 444
 445 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 446 {
 447         return seq_open_net(inode, file, &rt_cache_seq_ops,
 448                         sizeof(struct rt_cache_iter_state));
 449 }
 450
 451 static const struct file_operations rt_cache_seq_fops = {
 452         .owner   = THIS_MODULE,
 453         .open    = rt_cache_seq_open,
 454         .read    = seq_read,
 455         .llseek  = seq_lseek,
 456         .release = seq_release_net,
 457 };
 458
 459
 460 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 461 {
 462         int cpu;
 463
 464         if (*pos == 0)
 465                 return SEQ_START_TOKEN;
 466
 467         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 468                 if (!cpu_possible(cpu))
 469                         continue;
 470                 *pos = cpu+1;
 471                 return &per_cpu(rt_cache_stat, cpu);
 472         }
 473         return NULL;
 474 }
 475
 476 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 477 {
 478         int cpu;
 479
 480         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 481                 if (!cpu_possible(cpu))
 482                         continue;
 483                 *pos = cpu+1;
 484                 return &per_cpu(rt_cache_stat, cpu);
 485         }
 486         return NULL;
 487
 488 }
 489
 490 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 491 {
 492
 493 }
 494
 495 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 496 {
 497         struct rt_cache_stat *st = v;
 498
 499         if (v == SEQ_START_TOKEN) {
 500                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 501                 return 0;
 502         }
 503
 504         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 505                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 506                    dst_entries_get_slow(&ipv4_dst_ops),
 507                    st->in_hit,
 508                    st->in_slow_tot,
 509                    st->in_slow_mc,
 510                    st->in_no_route,
 511                    st->in_brd,
 512                    st->in_martian_dst,
 513                    st->in_martian_src,
 514
 515                    st->out_hit,
 516                    st->out_slow_tot,
 517                    st->out_slow_mc,
 518
 519                    st->gc_total,
 520                    st->gc_ignored,
 521                    st->gc_goal_miss,
 522                    st->gc_dst_overflow,
 523                    st->in_hlist_search,
 524                    st->out_hlist_search
 525                 );
 526         return 0;
 527 }
 528
 529 static const struct seq_operations rt_cpu_seq_ops = {
 530         .start  = rt_cpu_seq_start,
 531         .next   = rt_cpu_seq_next,
 532         .stop   = rt_cpu_seq_stop,
 533         .show   = rt_cpu_seq_show,
 534 };
 535
 536
 537 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 538 {
 539         return seq_open(file, &rt_cpu_seq_ops);
 540 }
 541
 542 static const struct file_operations rt_cpu_seq_fops = {
 543         .owner   = THIS_MODULE,
 544         .open    = rt_cpu_seq_open,
 545         .read    = seq_read,
 546         .llseek  = seq_lseek,
 547         .release = seq_release,
 548 };
 549
 550 #ifdef CONFIG_IP_ROUTE_CLASSID
 551 static int rt_acct_proc_show(struct seq_file *m, void *v)
 552 {
 553         struct ip_rt_acct *dst, *src;
 554         unsigned int i, j;
 555
 556         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 557         if (!dst)
 558                 return -ENOMEM;
 559
 560         for_each_possible_cpu(i) {
 561                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 562                 for (j = 0; j < 256; j++) {
 563                         dst[j].o_bytes   += src[j].o_bytes;
 564                         dst[j].o_packets += src[j].o_packets;
 565                         dst[j].i_bytes   += src[j].i_bytes;
 566                         dst[j].i_packets += src[j].i_packets;
 567                 }
 568         }
 569
 570         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 571         kfree(dst);
 572         return 0;
 573 }
 574
 575 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 576 {
 577         return single_open(file, rt_acct_proc_show, NULL);
 578 }
 579
 580 static const struct file_operations rt_acct_proc_fops = {
 581         .owner          = THIS_MODULE,
 582         .open           = rt_acct_proc_open,
 583         .read           = seq_read,
 584         .llseek         = seq_lseek,
 585         .release        = single_release,
 586 };
 587 #endif
 588
 589 static int __net_init ip_rt_do_proc_init(struct net *net)
 590 {
 591         struct proc_dir_entry *pde;
 592
 593         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 594                         &rt_cache_seq_fops);
 595         if (!pde)
 596                 goto err1;
 597
 598         pde = proc_create("rt_cache", S_IRUGO,
 599                           net->proc_net_stat, &rt_cpu_seq_fops);
 600         if (!pde)
 601                 goto err2;
 602
 603 #ifdef CONFIG_IP_ROUTE_CLASSID
 604         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 605         if (!pde)
 606                 goto err3;
 607 #endif
 608         return 0;
 609
 610 #ifdef CONFIG_IP_ROUTE_CLASSID
 611 err3:
 612         remove_proc_entry("rt_cache", net->proc_net_stat);
 613 #endif
 614 err2:
 615         remove_proc_entry("rt_cache", net->proc_net);
 616 err1:
 617         return -ENOMEM;
 618 }
 619
 620 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 621 {
 622         remove_proc_entry("rt_cache", net->proc_net_stat);
 623         remove_proc_entry("rt_cache", net->proc_net);
 624 #ifdef CONFIG_IP_ROUTE_CLASSID
 625         remove_proc_entry("rt_acct", net->proc_net);
 626 #endif
 627 }
 628
 629 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 630         .init = ip_rt_do_proc_init,
 631         .exit = ip_rt_do_proc_exit,
 632 };
 633
 634 static int __init ip_rt_proc_init(void)
 635 {
 636         return register_pernet_subsys(&ip_rt_proc_ops);
 637 }
 638
 639 #else
 640 static inline int ip_rt_proc_init(void)
 641 {
 642         return 0;
 643 }
 644 #endif /* CONFIG_PROC_FS */
 645
 646 static inline void rt_free(struct rtable *rt)
 647 {
 648         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 649 }
 650
 651 static inline void rt_drop(struct rtable *rt)
 652 {
 653         ip_rt_put(rt);
 654         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 655 }
 656
 657 static inline int rt_fast_clean(struct rtable *rth)
 658 {
 659         /* Kill broadcast/multicast entries very aggresively, if they
 660            collide in hash table with more useful entries */
 661         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 662                 rt_is_input_route(rth) && rth->dst.rt_next;
 663 }
 664
 665 static inline int rt_valuable(struct rtable *rth)
 666 {
 667         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 668                 (rth->peer && rth->peer->pmtu_expires);
 669 }
 670
 671 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 672 {
 673         unsigned long age;
 674         int ret = 0;
 675
 676         if (atomic_read(&rth->dst.__refcnt))
 677                 goto out;
 678
 679         age = jiffies - rth->dst.lastuse;
 680         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 681             (age <= tmo2 && rt_valuable(rth)))
 682                 goto out;
 683         ret = 1;
 684 out:    return ret;
 685 }
 686
 687 /* Bits of score are:
 688  * 31: very valuable
 689  * 30: not quite useless
 690  * 29..0: usage counter
 691  */
 692 static inline u32 rt_score(struct rtable *rt)
 693 {
 694         u32 score = jiffies - rt->dst.lastuse;
 695
 696         score = ~score & ~(3<<30);
 697
 698         if (rt_valuable(rt))
 699                 score |= (1<<31);
 700
 701         if (rt_is_output_route(rt) ||
 702             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 703                 score |= (1<<30);
 704
 705         return score;
 706 }
 707
 708 static inline bool rt_caching(const struct net *net)
 709 {
 710         return net->ipv4.current_rt_cache_rebuild_count <=
 711                 net->ipv4.sysctl_rt_cache_rebuild_count;
 712 }
 713
 714 static inline bool compare_hash_inputs(const struct rtable *rt1,
 715                                        const struct rtable *rt2)
 716 {
 717         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 718                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 719                 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
 720 }
 721
 722 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 723 {
 724         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 725                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 726                 (rt1->rt_mark ^ rt2->rt_mark) |
 727                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 728                 (rt1->rt_oif ^ rt2->rt_oif) |
 729                 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
 730 }
 731
 732 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 733 {
 734         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 735 }
 736
 737 static inline int rt_is_expired(struct rtable *rth)
 738 {
 739         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 740 }
 741
 742 /*
 743  * Perform a full scan of hash table and free all entries.
 744  * Can be called by a softirq or a process.
 745  * In the later case, we want to be reschedule if necessary
 746  */
 747 static void rt_do_flush(struct net *net, int process_context)
 748 {
 749         unsigned int i;
 750         struct rtable *rth, *next;
 751
 752         for (i = 0; i <= rt_hash_mask; i++) {
 753                 struct rtable __rcu **pprev;
 754                 struct rtable *list;
 755
 756                 if (process_context && need_resched())
 757                         cond_resched();
 758                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
 759                 if (!rth)
 760                         continue;
 761
 762                 spin_lock_bh(rt_hash_lock_addr(i));
 763
 764                 list = NULL;
 765                 pprev = &rt_hash_table[i].chain;
 766                 rth = rcu_dereference_protected(*pprev,
 767                         lockdep_is_held(rt_hash_lock_addr(i)));
 768
 769                 while (rth) {
 770                         next = rcu_dereference_protected(rth->dst.rt_next,
 771                                 lockdep_is_held(rt_hash_lock_addr(i)));
 772
 773                         if (!net ||
 774                             net_eq(dev_net(rth->dst.dev), net)) {
 775                                 rcu_assign_pointer(*pprev, next);
 776                                 rcu_assign_pointer(rth->dst.rt_next, list);
 777                                 list = rth;
 778                         } else {
 779                                 pprev = &rth->dst.rt_next;
 780                         }
 781                         rth = next;
 782                 }
 783
 784                 spin_unlock_bh(rt_hash_lock_addr(i));
 785
 786                 for (; list; list = next) {
 787                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 788                         rt_free(list);
 789                 }
 790         }
 791 }
 792
/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
 803
 804 /*
 805  * Given a hash chain and an item in this hash chain,
 806  * find if a previous entry has the same hash_inputs
 807  * (but differs on tos, mark or oif)
 808  * Returns 0 if an alias is found.
 809  * Returns ONE if rth has no alias before itself.
 810  */
 811 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 812 {
 813         const struct rtable *aux = head;
 814
 815         while (aux != rth) {
 816                 if (compare_hash_inputs(aux, rth))
 817                         return 0;
 818                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 819         }
 820         return ONE;
 821 }
 822
 823 /*
 824  * Perturbation of rt_genid by a small quantity [1..256]
 825  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 826  * many times (2^24) without giving recent rt_genid.
 827  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 828  */
 829 static void rt_cache_invalidate(struct net *net)
 830 {
 831         unsigned char shuffle;
 832
 833         get_random_bytes(&shuffle, sizeof(shuffle));
 834         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 835 }
 836
 837 /*
 838  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 839  * delay >= 0 : invalidate & flush cache (can be long)
 840  */
 841 void rt_cache_flush(struct net *net, int delay)
 842 {
 843         rt_cache_invalidate(net);
 844         if (delay >= 0)
 845                 rt_do_flush(net, !in_softirq());
 846 }
 847
 848 /* Flush previous cache invalidated entries from the cache */
 849 void rt_cache_flush_batch(struct net *net)
 850 {
 851         rt_do_flush(net, !in_softirq());
 852 }
 853
 854 static void rt_emergency_hash_rebuild(struct net *net)
 855 {
 856         if (net_ratelimit())
 857                 printk(KERN_WARNING "Route hash chain too long!\n");
 858         rt_cache_invalidate(net);
 859 }
 860
 861 /*
 862    Short description of GC goals.
 863
 864    We want to build an algorithm which will keep the routing cache
 865    at some equilibrium point, where the number of aged off entries
 866    is kept approximately equal to the number of newly generated ones.
 867
 868    The current expiration strength is the variable "expire".
 869    We try to adjust it dynamically, so that if networking
 870    is idle "expire" is large enough to keep enough warm entries,
 871    and when load increases it is reduced to limit the cache size.
 872  */
 873
     /*
      * Garbage-collect the IPv4 routing cache.
      *
      * @ops is not referenced; the function always works on ipv4_dst_ops.
      *
      * The call is a no-op (counted as gc_ignored) when the previous run was
      * less than ip_rt_gc_min_interval ago and the cache is still below
      * ip_rt_max_size.  Otherwise a goal (number of entries to free) is
      * computed and the hash table is scanned, starting after the bucket
      * where the previous run stopped, freeing entries that are expired or
      * that rt_may_expire() accepts.
      *
      * Returns 0 normally, and 1 when the cache is still at or above
      * ip_rt_max_size after collection ("dst cache overflow").
      */
 874 static int rt_garbage_collect(struct dst_ops *ops)
 875 {
     /* State kept across calls. */
 876         static unsigned long expire = RT_GC_TIMEOUT;
 877         static unsigned long last_gc;
 878         static int rover;
 879         static int equilibrium;
 880         struct rtable *rth;
 881         struct rtable __rcu **rthp;
 882         unsigned long now = jiffies;
 883         int goal;
 884         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 885
 886         /*
 887          * Garbage collection is pretty expensive,
 888          * do not make it too frequently.
 889          */
 890
 891         RT_CACHE_STAT_INC(gc_total);
 892
 893         if (now - last_gc < ip_rt_gc_min_interval &&
 894             entries < ip_rt_max_size) {
 895                 RT_CACHE_STAT_INC(gc_ignored);
 896                 goto out;
 897         }
 898
 899         entries = dst_entries_get_slow(&ipv4_dst_ops);
 900         /* Calculate number of entries, which we want to expire now. */
 901         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 902         if (goal <= 0) {
             /* Below the elasticity limit: aim for the equilibrium size. */
 903                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 904                         equilibrium = ipv4_dst_ops.gc_thresh;
 905                 goal = entries - equilibrium;
 906                 if (goal > 0) {
 907                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 908                         goal = entries - equilibrium;
 909                 }
 910         } else {
 911                 /* We are in dangerous area. Try to reduce cache really
 912                  * aggressively.
 913                  */
 914                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 915                 equilibrium = entries - goal;
 916         }
 917
 918         if (now - last_gc >= ip_rt_gc_min_interval)
 919                 last_gc = now;
 920
 921         if (goal <= 0) {
 922                 equilibrium += goal;
 923                 goto work_done;
 924         }
 925
 926         do {
 927                 int i, k;
 928
             /* One pass over every bucket, resuming after the saved rover. */
 929                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 930                         unsigned long tmo = expire;
 931
 932                         k = (k + 1) & rt_hash_mask;
 933                         rthp = &rt_hash_table[k].chain;
 934                         spin_lock_bh(rt_hash_lock_addr(k));
 935                         while ((rth = rcu_dereference_protected(*rthp,
 936                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 937                                 if (!rt_is_expired(rth) &&
 938                                         !rt_may_expire(rth, tmo, expire)) {
                                     /*
                                      * Entry is kept: halve the timeout handed to
                                      * rt_may_expire() for the rest of this chain.
                                      */
 939                                         tmo >>= 1;
 940                                         rthp = &rth->dst.rt_next;
 941                                         continue;
 942                                 }
                             /* Unlink the entry from the chain and free it. */
 943                                 *rthp = rth->dst.rt_next;
 944                                 rt_free(rth);
 945                                 goal--;
 946                         }
 947                         spin_unlock_bh(rt_hash_lock_addr(k));
 948                         if (goal <= 0)
 949                                 break;
 950                 }
 951                 rover = k;
 952
 953                 if (goal <= 0)
 954                         goto work_done;
 955
 956                 /* Goal is not achieved. We stop process if:
 957
 958                    - if expire reduced to zero. Otherwise, expire is halved.
 959                    - if table is not full.
 960                    - if we are called from interrupt.
 961                    - jiffies check is just fallback/debug loop breaker.
 962                      We will not spin here for long time in any case.
 963                  */
 964
 965                 RT_CACHE_STAT_INC(gc_goal_miss);
 966
 967                 if (expire == 0)
 968                         break;
 969
 970                 expire >>= 1;
 971
 972                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 973                         goto out;
 974         } while (!in_softirq() && time_before_eq(jiffies, now));
 975
     /* Goal missed: report overflow only if the cache is really still full. */
 976         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 977                 goto out;
 978         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 979                 goto out;
 980         if (net_ratelimit())
 981                 printk(KERN_WARNING "dst cache overflow\n");
 982         RT_CACHE_STAT_INC(gc_dst_overflow);
 983         return 1;
 984
 985 work_done:
     /*
      * Goal reached: let expire grow again, and reset it to
      * ip_rt_gc_timeout once it exceeds that value or the cache has
      * dropped below gc_thresh.
      */
 986         expire += ip_rt_gc_min_interval;
 987         if (expire > ip_rt_gc_timeout ||
 988             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
 989             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 990                 expire = ip_rt_gc_timeout;
 991 out:    return 0;
 992 }
 993
 994 /*
 995  * Returns number of entries in a hash chain that have different hash_inputs
 996  */
 997 static int slow_chain_length(const struct rtable *head)
 998 {
 999         int length = 0;
1000         const struct rtable *rth = head;
1001
1002         while (rth) {
1003                 length += has_noalias(head, rth);
1004                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1005         }
1006         return length >> FRACT_BITS;
1007 }
1008
     /*
      * Insert route @rt into hash bucket @hash of the routing cache, or reuse
      * an equivalent entry that is already cached.
      *
      * @hash:    bucket index for @rt; recomputed here after an emergency
      *           rebuild
      * @rt:      the new route
      * @skb:     if non-NULL, the returned route's dst is attached to it
      * @ifindex: interface index used when the hash has to be recomputed
      *
      * Returns the route to use: @rt itself, or an already cached entry with
      * the same keys and netns (in which case @rt is dropped with rt_drop()).
      * Returns an ERR_PTR() when binding the neighbour fails; @rt is released
      * on those paths as well.
      *
      * When arp_bind_neighbour() fails with -ENOBUFS and we are not in
      * softirq, one garbage collection pass with elasticity 1 is run and the
      * whole insertion is retried once.
      */
1009 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1010                                      struct sk_buff *skb, int ifindex)
1011 {
1012         struct rtable   *rth, *cand;
1013         struct rtable __rcu **rthp, **candp;
1014         unsigned long   now;
1015         u32             min_score;
1016         int             chain_length;
     /* Number of GC-and-retry rounds allowed: one, and only outside softirq. */
1017         int attempts = !in_softirq();
1018
1019 restart:
1020         chain_length = 0;
1021         min_score = ~(u32)0;
1022         cand = NULL;
1023         candp = NULL;
1024         now = jiffies;
1025
1026         if (!rt_caching(dev_net(rt->dst.dev))) {
1027                 /*
1028                  * If we're not caching, just tell the caller we
1029                  * were successful and don't touch the route.  The
1030                  * caller hold the sole reference to the cache entry, and
1031                  * it will be released when the caller is done with it.
1032                  * If we drop it here, the callers have no way to resolve routes
1033                  * when we're not caching.  Instead, just return rt, so
1034                  * the caller gets a single use out of the route
1035                  * Note that we do rt_free on this new route entry, so that
1036                  * once its refcount hits zero, we are still able to reap it
1037                  * (Thanks Alexey)
1038                  * Note: To avoid expensive rcu stuff for this uncached dst,
1039                  * we set DST_NOCACHE so that dst_release() can free dst without
1040                  * waiting a grace period.
1041                  */
1042
1043                 rt->dst.flags |= DST_NOCACHE;
1044                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1045                         int err = arp_bind_neighbour(&rt->dst);
1046                         if (err) {
1047                                 if (net_ratelimit())
1048                                         printk(KERN_WARNING
1049                                             "Neighbour table failure & not caching routes.\n");
1050                                 ip_rt_put(rt);
1051                                 return ERR_PTR(err);
1052                         }
1053                 }
1054
1055                 goto skip_hashing;
1056         }
1057
1058         rthp = &rt_hash_table[hash].chain;
1059
1060         spin_lock_bh(rt_hash_lock_addr(hash));
     /*
      * Scan the chain: purge expired entries, look for an entry equal to
      * rt, and remember the best eviction candidate.
      */
1061         while ((rth = rcu_dereference_protected(*rthp,
1062                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1063                 if (rt_is_expired(rth)) {
1064                         *rthp = rth->dst.rt_next;
1065                         rt_free(rth);
1066                         continue;
1067                 }
1068                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1069                         /* Put it first */
1070                         *rthp = rth->dst.rt_next;
1071                         /*
1072                          * Since lookup is lockfree, the deletion
1073                          * must be visible to another weakly ordered CPU before
1074                          * the insertion at the start of the hash chain.
1075                          */
1076                         rcu_assign_pointer(rth->dst.rt_next,
1077                                            rt_hash_table[hash].chain);
1078                         /*
1079                          * Since lookup is lockfree, the update writes
1080                          * must be ordered for consistency on SMP.
1081                          */
1082                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1083
1084                         dst_use(&rth->dst, now);
1085                         spin_unlock_bh(rt_hash_lock_addr(hash));
1086
                     /* The cached entry wins; the caller's new route is dropped. */
1087                         rt_drop(rt);
1088                         if (skb)
1089                                 skb_dst_set(skb, &rth->dst);
1090                         return rth;
1091                 }
1092
             /*
              * Among unreferenced entries, remember the one with the lowest
              * rt_score(); on ties the later entry in the chain is kept.
              */
1093                 if (!atomic_read(&rth->dst.__refcnt)) {
1094                         u32 score = rt_score(rth);
1095
1096                         if (score <= min_score) {
1097                                 cand = rth;
1098                                 candp = rthp;
1099                                 min_score = score;
1100                         }
1101                 }
1102
1103                 chain_length++;
1104
1105                 rthp = &rth->dst.rt_next;
1106         }
1107
1108         if (cand) {
1109                 /* ip_rt_gc_elasticity used to be average length of chain
1110                  * length, when exceeded gc becomes really aggressive.
1111                  *
1112                  * The second limit is less certain. At the moment it allows
1113                  * only 2 entries per bucket. We will see.
1114                  */
1115                 if (chain_length > ip_rt_gc_elasticity) {
1116                         *candp = cand->dst.rt_next;
1117                         rt_free(cand);
1118                 }
1119         } else {
             /*
              * Nothing can be evicted.  If the chain is too long even when
              * aliases are not counted, invalidate the cache and retry with
              * the hash computed for the new generation id.
              */
1120                 if (chain_length > rt_chain_length_max &&
1121                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1122                         struct net *net = dev_net(rt->dst.dev);
1123                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1124                         if (!rt_caching(net)) {
1125                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1126                                         rt->dst.dev->name, num);
1127                         }
1128                         rt_emergency_hash_rebuild(net);
1129                         spin_unlock_bh(rt_hash_lock_addr(hash));
1130
1131                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1132                                         ifindex, rt_genid(net));
1133                         goto restart;
1134                 }
1135         }
1136
1137         /* Try to bind route to arp only if it is output
1138            route or unicast forwarding path.
1139          */
1140         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1141                 int err = arp_bind_neighbour(&rt->dst);
1142                 if (err) {
1143                         spin_unlock_bh(rt_hash_lock_addr(hash));
1144
1145                         if (err != -ENOBUFS) {
1146                                 rt_drop(rt);
1147                                 return ERR_PTR(err);
1148                         }
1149
1150                         /* Neighbour tables are full and nothing
1151                            can be released. Try to shrink route cache,
1152                            it is most likely it holds some neighbour records.
1153                          */
1154                         if (attempts-- > 0) {
                             /*
                              * Temporarily force the most aggressive GC
                              * settings, then restore the saved values.
                              */
1155                                 int saved_elasticity = ip_rt_gc_elasticity;
1156                                 int saved_int = ip_rt_gc_min_interval;
1157                                 ip_rt_gc_elasticity     = 1;
1158                                 ip_rt_gc_min_interval   = 0;
1159                                 rt_garbage_collect(&ipv4_dst_ops);
1160                                 ip_rt_gc_min_interval   = saved_int;
1161                                 ip_rt_gc_elasticity     = saved_elasticity;
1162                                 goto restart;
1163                         }
1164
1165                         if (net_ratelimit())
1166                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1167                         rt_drop(rt);
1168                         return ERR_PTR(-ENOBUFS);
1169                 }
1170         }
1171
     /* Link rt in front of the current chain head. */
1172         rt->dst.rt_next = rt_hash_table[hash].chain;
1173
1174         /*
1175          * Since lookup is lockfree, we must make sure
1176          * previous writes to rt are committed to memory
1177          * before making rt visible to other CPUS.
1178          */
1179         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1180
1181         spin_unlock_bh(rt_hash_lock_addr(hash));
1182
1183 skip_hashing:
1184         if (skb)
1185                 skb_dst_set(skb, &rt->dst);
1186         return rt;
1187 }
1188
1189 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1190
1191 static u32 rt_peer_genid(void)
1192 {
1193         return atomic_read(&__rt_peer_genid);
1194 }
1195
1196 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1197 {
1198         struct inet_peer *peer;
1199
1200         peer = inet_getpeer_v4(daddr, create);
1201
1202         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1203                 inet_putpeer(peer);
1204         else
1205                 rt->rt_peer_genid = rt_peer_genid();
1206 }
1207
1208 /*
1209  * Peer allocation may fail only in serious out-of-memory conditions.  However
1210  * we still can generate some output.
1211  * Random ID selection looks a bit dangerous because we have no chances to
1212  * select ID being unique in a reasonable period of time.
1213  * But broken packet identifier may be better than no packet at all.
1214  */
1215 static void ip_select_fb_ident(struct iphdr *iph)
1216 {
1217         static DEFINE_SPINLOCK(ip_fb_id_lock);
1218         static u32 ip_fallback_id;
1219         u32 salt;
1220
1221         spin_lock_bh(&ip_fb_id_lock);
1222         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1223         iph->id = htons(salt & 0xFFFF);
1224         ip_fallback_id = salt;
1225         spin_unlock_bh(&ip_fb_id_lock);
1226 }
1227
1228 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1229 {
1230         struct rtable *rt = (struct rtable *) dst;
1231
1232         if (rt) {
1233                 if (rt->peer == NULL)
1234                         rt_bind_peer(rt, rt->rt_dst, 1);
1235
1236                 /* If peer is attached to destination, it is never detached,
1237                    so that we need not to grab a lock to dereference it.
1238                  */
1239                 if (rt->peer) {
1240                         iph->id = htons(inet_getid(rt->peer, more));
1241                         return;
1242                 }
1243         } else
1244                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1245                        __builtin_return_address(0));
1246
1247         ip_select_fb_ident(iph);
1248 }
1249 EXPORT_SYMBOL(__ip_select_ident);
1250
1251 static void rt_del(unsigned hash, struct rtable *rt)
1252 {
1253         struct rtable __rcu **rthp;
1254         struct rtable *aux;
1255
1256         rthp = &rt_hash_table[hash].chain;
1257         spin_lock_bh(rt_hash_lock_addr(hash));
1258         ip_rt_put(rt);
1259         while ((aux = rcu_dereference_protected(*rthp,
1260                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1261                 if (aux == rt || rt_is_expired(aux)) {
1262                         *rthp = aux->dst.rt_next;
1263                         rt_free(aux);
1264                         continue;
1265                 }
1266                 rthp = &aux->dst.rt_next;
1267         }
1268         spin_unlock_bh(rt_hash_lock_addr(hash));
1269 }
1270
1271 /* called in rcu_read_lock() section */
1272 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1273                     __be32 saddr, struct net_device *dev)
1274 {
1275         struct in_device *in_dev = __in_dev_get_rcu(dev);
1276         struct inet_peer *peer;
1277         struct net *net;
1278
1279         if (!in_dev)
1280                 return;
1281
1282         net = dev_net(dev);
1283         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1284             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1285             ipv4_is_zeronet(new_gw))
1286                 goto reject_redirect;
1287
1288         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1289                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1290                         goto reject_redirect;
1291                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1292                         goto reject_redirect;
1293         } else {
1294                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1295                         goto reject_redirect;
1296         }
1297
1298         peer = inet_getpeer_v4(daddr, 1);
1299         if (peer) {
1300                 peer->redirect_learned.a4 = new_gw;
1301
1302                 inet_putpeer(peer);
1303
1304                 atomic_inc(&__rt_peer_genid);
1305         }
1306         return;
1307
1308 reject_redirect:
1309 #ifdef CONFIG_IP_ROUTE_VERBOSE
1310         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1311                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1312                         "  Advised path = %pI4 -> %pI4\n",
1313                        &old_gw, dev->name, &new_gw,
1314                        &saddr, &daddr);
1315 #endif
1316         ;
1317 }
1318
1319 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1320 {
1321         struct rtable *rt = (struct rtable *)dst;
1322         struct dst_entry *ret = dst;
1323
1324         if (rt) {
1325                 if (dst->obsolete > 0) {
1326                         ip_rt_put(rt);
1327                         ret = NULL;
1328                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1329                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1330                                                 rt->rt_oif,
1331                                                 rt_genid(dev_net(dst->dev)));
1332                         rt_del(hash, rt);
1333                         ret = NULL;
1334                 } else if (rt->peer &&
1335                            rt->peer->pmtu_expires &&
1336                            time_after_eq(jiffies, rt->peer->pmtu_expires)) {
1337                         unsigned long orig = rt->peer->pmtu_expires;
1338
1339                         if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1340                                 dst_metric_set(dst, RTAX_MTU,
1341                                                rt->peer->pmtu_orig);
1342                 }
1343         }
1344         return ret;
1345 }
1346
1347 /*
1348  * Algorithm:
1349  *      1. The first ip_rt_redirect_number redirects are sent
1350  *         with exponential backoff, then we stop sending them at all,
1351  *         assuming that the host ignores our redirects.
1352  *      2. If we did not see packets requiring redirects
1353  *         during ip_rt_redirect_silence, we assume that the host
1354  *         forgot redirected route and start to send redirects again.
1355  *
1356  * This algorithm is much cheaper and more intelligent than dumb load limiting
1357  * in icmp.c.
1358  *
1359  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1360  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1361  */
1362
1363 void ip_rt_send_redirect(struct sk_buff *skb)
1364 {
1365         struct rtable *rt = skb_rtable(skb);
1366         struct in_device *in_dev;
1367         struct inet_peer *peer;
1368         int log_martians;
1369
1370         rcu_read_lock();
1371         in_dev = __in_dev_get_rcu(rt->dst.dev);
1372         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1373                 rcu_read_unlock();
1374                 return;
1375         }
1376         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1377         rcu_read_unlock();
1378
1379         if (!rt->peer)
1380                 rt_bind_peer(rt, rt->rt_dst, 1);
1381         peer = rt->peer;
1382         if (!peer) {
1383                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1384                 return;
1385         }
1386
1387         /* No redirected packets during ip_rt_redirect_silence;
1388          * reset the algorithm.
1389          */
1390         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1391                 peer->rate_tokens = 0;
1392
1393         /* Too many ignored redirects; do not send anything
1394          * set dst.rate_last to the last seen redirected packet.
1395          */
1396         if (peer->rate_tokens >= ip_rt_redirect_number) {
1397                 peer->rate_last = jiffies;
1398                 return;
1399         }
1400
1401         /* Check for load limit; set rate_last to the latest sent
1402          * redirect.
1403          */
1404         if (peer->rate_tokens == 0 ||
1405             time_after(jiffies,
1406                        (peer->rate_last +
1407                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1408                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1409                 peer->rate_last = jiffies;
1410                 ++peer->rate_tokens;
1411 #ifdef CONFIG_IP_ROUTE_VERBOSE
1412                 if (log_martians &&
1413                     peer->rate_tokens == ip_rt_redirect_number &&
1414                     net_ratelimit())
1415                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1416                                &ip_hdr(skb)->saddr, rt->rt_iif,
1417                                 &rt->rt_dst, &rt->rt_gateway);
1418 #endif
1419         }
1420 }
1421
1422 static int ip_error(struct sk_buff *skb)
1423 {
1424         struct rtable *rt = skb_rtable(skb);
1425         struct inet_peer *peer;
1426         unsigned long now;
1427         bool send;
1428         int code;
1429
1430         switch (rt->dst.error) {
1431                 case EINVAL:
1432                 default:
1433                         goto out;
1434                 case EHOSTUNREACH:
1435                         code = ICMP_HOST_UNREACH;
1436                         break;
1437                 case ENETUNREACH:
1438                         code = ICMP_NET_UNREACH;
1439                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1440                                         IPSTATS_MIB_INNOROUTES);
1441                         break;
1442                 case EACCES:
1443                         code = ICMP_PKT_FILTERED;
1444                         break;
1445         }
1446
1447         if (!rt->peer)
1448                 rt_bind_peer(rt, rt->rt_dst, 1);
1449         peer = rt->peer;
1450
1451         send = true;
1452         if (peer) {
1453                 now = jiffies;
1454                 peer->rate_tokens += now - peer->rate_last;
1455                 if (peer->rate_tokens > ip_rt_error_burst)
1456                         peer->rate_tokens = ip_rt_error_burst;
1457                 peer->rate_last = now;
1458                 if (peer->rate_tokens >= ip_rt_error_cost)
1459                         peer->rate_tokens -= ip_rt_error_cost;
1460                 else
1461                         send = false;
1462         }
1463         if (send)
1464                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1465
1466 out:    kfree_skb(skb);
1467         return 0;
1468 }
1469
/*
 *	MTU plateau values, in decreasing order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the first (i.e. largest) plateau that is strictly smaller than
 * @old_mtu, or 68 when no plateau is smaller.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau < end; plateau++) {
		if (*plateau < old_mtu)
			return *plateau;
	}

	return 68;
}
1487
1488 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1489                                  unsigned short new_mtu,
1490                                  struct net_device *dev)
1491 {
1492         unsigned short old_mtu = ntohs(iph->tot_len);
1493         unsigned short est_mtu = 0;
1494         struct inet_peer *peer;
1495
1496         peer = inet_getpeer_v4(iph->daddr, 1);
1497         if (peer) {
1498                 unsigned short mtu = new_mtu;
1499
1500                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1501                         /* BSD 4.2 derived systems incorrectly adjust
1502                          * tot_len by the IP header length, and report
1503                          * a zero MTU in the ICMP message.
1504                          */
1505                         if (mtu == 0 &&
1506                             old_mtu >= 68 + (iph->ihl << 2))
1507                                 old_mtu -= iph->ihl << 2;
1508                         mtu = guess_mtu(old_mtu);
1509                 }
1510
1511                 if (mtu < ip_rt_min_pmtu)
1512                         mtu = ip_rt_min_pmtu;
1513                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1514                         unsigned long pmtu_expires;
1515
1516                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1517                         if (!pmtu_expires)
1518                                 pmtu_expires = 1UL;
1519
1520                         est_mtu = mtu;
1521                         peer->pmtu_learned = mtu;
1522                         peer->pmtu_expires = pmtu_expires;
1523                 }
1524
1525                 inet_putpeer(peer);
1526
1527                 atomic_inc(&__rt_peer_genid);
1528         }
1529         return est_mtu ? : new_mtu;
1530 }
1531
1532 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1533 {
1534         unsigned long expires = peer->pmtu_expires;
1535
1536         if (time_before(jiffies, expires)) {
1537                 u32 orig_dst_mtu = dst_mtu(dst);
1538                 if (peer->pmtu_learned < orig_dst_mtu) {
1539                         if (!peer->pmtu_orig)
1540                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1541                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1542                 }
1543         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1544                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1545 }
1546
1547 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1548 {
1549         struct rtable *rt = (struct rtable *) dst;
1550         struct inet_peer *peer;
1551
1552         dst_confirm(dst);
1553
1554         if (!rt->peer)
1555                 rt_bind_peer(rt, rt->rt_dst, 1);
1556         peer = rt->peer;
1557         if (peer) {
1558                 if (mtu < ip_rt_min_pmtu)
1559                         mtu = ip_rt_min_pmtu;
1560                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1561                         unsigned long pmtu_expires;
1562
1563                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1564                         if (!pmtu_expires)
1565                                 pmtu_expires = 1UL;
1566
1567                         peer->pmtu_learned = mtu;
1568                         peer->pmtu_expires = pmtu_expires;
1569
1570                         atomic_inc(&__rt_peer_genid);
1571                         rt->rt_peer_genid = rt_peer_genid();
1572                 }
1573                 check_peer_pmtu(dst, peer);
1574         }
1575 }
1576
1577 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1578 {
1579         struct rtable *rt = (struct rtable *) dst;
1580         __be32 orig_gw = rt->rt_gateway;
1581
1582         dst_confirm(&rt->dst);
1583
1584         neigh_release(rt->dst.neighbour);
1585         rt->dst.neighbour = NULL;
1586
1587         rt->rt_gateway = peer->redirect_learned.a4;
1588         if (arp_bind_neighbour(&rt->dst) ||
1589             !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1590                 if (rt->dst.neighbour)
1591                         neigh_event_send(rt->dst.neighbour, NULL);
1592                 rt->rt_gateway = orig_gw;
1593                 return -EAGAIN;
1594         } else {
1595                 rt->rt_flags |= RTCF_REDIRECTED;
1596                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1597                                         rt->dst.neighbour);
1598         }
1599         return 0;
1600 }
1601
1602 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1603 {
1604         struct rtable *rt = (struct rtable *) dst;
1605
1606         if (rt_is_expired(rt))
1607                 return NULL;
1608         if (rt->rt_peer_genid != rt_peer_genid()) {
1609                 struct inet_peer *peer;
1610
1611                 if (!rt->peer)
1612                         rt_bind_peer(rt, rt->rt_dst, 0);
1613
1614                 peer = rt->peer;
1615                 if (peer && peer->pmtu_expires)
1616                         check_peer_pmtu(dst, peer);
1617
1618                 if (peer && peer->redirect_learned.a4 &&
1619                     peer->redirect_learned.a4 != rt->rt_gateway) {
1620                         if (check_peer_redir(dst, peer))
1621                                 return NULL;
1622                 }
1623
1624                 rt->rt_peer_genid = rt_peer_genid();
1625         }
1626         return dst;
1627 }
1628
1629 static void ipv4_dst_destroy(struct dst_entry *dst)
1630 {
1631         struct rtable *rt = (struct rtable *) dst;
1632         struct inet_peer *peer = rt->peer;
1633
1634         if (rt->fi) {
1635                 fib_info_put(rt->fi);
1636                 rt->fi = NULL;
1637         }
1638         if (peer) {
1639                 rt->peer = NULL;
1640                 inet_putpeer(peer);
1641         }
1642 }
1643
1644
1645 static void ipv4_link_failure(struct sk_buff *skb)
1646 {
1647         struct rtable *rt;
1648
1649         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1650
1651         rt = skb_rtable(skb);
1652         if (rt &&
1653             rt->peer &&
1654             rt->peer->pmtu_expires) {
1655                 unsigned long orig = rt->peer->pmtu_expires;
1656
1657                 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1658                         dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1659         }
1660 }
1661
1662 static int ip_rt_bug(struct sk_buff *skb)
1663 {
1664         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1665                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1666                 skb->dev ? skb->dev->name : "?");
1667         kfree_skb(skb);
1668         WARN_ON(1);
1669         return 0;
1670 }
1671
1672 /*
1673    We do not cache source address of outgoing interface,
1674    because it is used only by IP RR, TS and SRR options,
1675    so that it out of fast path.
1676
1677    BTW remember: "addr" is allowed to be not aligned
1678    in IP options!
1679  */
1680
1681 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1682 {
1683         __be32 src;
1684
1685         if (rt_is_output_route(rt))
1686                 src = ip_hdr(skb)->saddr;
1687         else {
1688                 struct fib_result res;
1689                 struct flowi4 fl4;
1690                 struct iphdr *iph;
1691
1692                 iph = ip_hdr(skb);
1693
1694                 memset(&fl4, 0, sizeof(fl4));
1695                 fl4.daddr = iph->daddr;
1696                 fl4.saddr = iph->saddr;
1697                 fl4.flowi4_tos = iph->tos;
1698                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1699                 fl4.flowi4_iif = skb->dev->ifindex;
1700                 fl4.flowi4_mark = skb->mark;
1701
1702                 rcu_read_lock();
1703                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1704                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1705                 else
1706                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1707                                         RT_SCOPE_UNIVERSE);
1708                 rcu_read_unlock();
1709         }
1710         memcpy(addr, &src, 4);
1711 }
1712
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in the halves of the route's tclassid that are still zero from
 * @tag.  The low and high 16 bits are handled independently; a half
 * that is already non-zero is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *classid = &rt->dst.tclassid;

	if ((*classid & 0xFFFF) == 0)
		*classid |= tag & 0xFFFF;
	if ((*classid & 0xFFFF0000) == 0)
		*classid |= tag & 0xFFFF0000;
}
#endif
1722
1723 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1724 {
1725         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1726
1727         if (advmss == 0) {
1728                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1729                                ip_rt_min_advmss);
1730                 if (advmss > 65535 - 40)
1731                         advmss = 65535 - 40;
1732         }
1733         return advmss;
1734 }
1735
1736 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1737 {
1738         unsigned int mtu = dst->dev->mtu;
1739
1740         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1741                 const struct rtable *rt = (const struct rtable *) dst;
1742
1743                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1744                         mtu = 576;
1745         }
1746
1747         if (mtu > IP_MAX_MTU)
1748                 mtu = IP_MAX_MTU;
1749
1750         return mtu;
1751 }
1752
1753 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1754                             struct fib_info *fi)
1755 {
1756         struct inet_peer *peer;
1757         int create = 0;
1758
1759         /* If a peer entry exists for this destination, we must hook
1760          * it up in order to get at cached metrics.
1761          */
1762         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1763                 create = 1;
1764
1765         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1766         if (peer) {
1767                 rt->rt_peer_genid = rt_peer_genid();
1768                 if (inet_metrics_new(peer))
1769                         memcpy(peer->metrics, fi->fib_metrics,
1770                                sizeof(u32) * RTAX_MAX);
1771                 dst_init_metrics(&rt->dst, peer->metrics, false);
1772
1773                 if (peer->pmtu_expires)
1774                         check_peer_pmtu(&rt->dst, peer);
1775                 if (peer->redirect_learned.a4 &&
1776                     peer->redirect_learned.a4 != rt->rt_gateway) {
1777                         rt->rt_gateway = peer->redirect_learned.a4;
1778                         rt->rt_flags |= RTCF_REDIRECTED;
1779                 }
1780         } else {
1781                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1782                         rt->fi = fi;
1783                         atomic_inc(&fi->fib_clntref);
1784                 }
1785                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1786         }
1787 }
1788
1789 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1790                            const struct fib_result *res,
1791                            struct fib_info *fi, u16 type, u32 itag)
1792 {
1793         struct dst_entry *dst = &rt->dst;
1794
1795         if (fi) {
1796                 if (FIB_RES_GW(*res) &&
1797                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1798                         rt->rt_gateway = FIB_RES_GW(*res);
1799                 rt_init_metrics(rt, fl4, fi);
1800 #ifdef CONFIG_IP_ROUTE_CLASSID
1801                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1802 #endif
1803         }
1804
1805         if (dst_mtu(dst) > IP_MAX_MTU)
1806                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1807         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1808                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1809
1810 #ifdef CONFIG_IP_ROUTE_CLASSID
1811 #ifdef CONFIG_IP_MULTIPLE_TABLES
1812         set_class_tag(rt, fib_rules_tclass(res));
1813 #endif
1814         set_class_tag(rt, itag);
1815 #endif
1816 }
1817
1818 static struct rtable *rt_dst_alloc(struct net_device *dev,
1819                                    bool nopolicy, bool noxfrm)
1820 {
1821         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1822                          DST_HOST |
1823                          (nopolicy ? DST_NOPOLICY : 0) |
1824                          (noxfrm ? DST_NOXFRM : 0));
1825 }
1826
1827 /* called in rcu_read_lock() section */
1828 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1829                                 u8 tos, struct net_device *dev, int our)
1830 {
1831         unsigned int hash;
1832         struct rtable *rth;
1833         __be32 spec_dst;
1834         struct in_device *in_dev = __in_dev_get_rcu(dev);
1835         u32 itag = 0;
1836         int err;
1837
1838         /* Primary sanity checks. */
1839
1840         if (in_dev == NULL)
1841                 return -EINVAL;
1842
1843         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1844             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1845                 goto e_inval;
1846
1847         if (ipv4_is_zeronet(saddr)) {
1848                 if (!ipv4_is_local_multicast(daddr))
1849                         goto e_inval;
1850                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1851         } else {
1852                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1853                                           &itag);
1854                 if (err < 0)
1855                         goto e_err;
1856         }
1857         rth = rt_dst_alloc(init_net.loopback_dev,
1858                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1859         if (!rth)
1860                 goto e_nobufs;
1861
1862 #ifdef CONFIG_IP_ROUTE_CLASSID
1863         rth->dst.tclassid = itag;
1864 #endif
1865         rth->dst.output = ip_rt_bug;
1866
1867         rth->rt_key_dst = daddr;
1868         rth->rt_key_src = saddr;
1869         rth->rt_genid   = rt_genid(dev_net(dev));
1870         rth->rt_flags   = RTCF_MULTICAST;
1871         rth->rt_type    = RTN_MULTICAST;
1872         rth->rt_key_tos = tos;
1873         rth->rt_dst     = daddr;
1874         rth->rt_src     = saddr;
1875         rth->rt_route_iif = dev->ifindex;
1876         rth->rt_iif     = dev->ifindex;
1877         rth->rt_oif     = 0;
1878         rth->rt_mark    = skb->mark;
1879         rth->rt_gateway = daddr;
1880         rth->rt_spec_dst= spec_dst;
1881         rth->rt_peer_genid = 0;
1882         rth->peer = NULL;
1883         rth->fi = NULL;
1884         if (our) {
1885                 rth->dst.input= ip_local_deliver;
1886                 rth->rt_flags |= RTCF_LOCAL;
1887         }
1888
1889 #ifdef CONFIG_IP_MROUTE
1890         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1891                 rth->dst.input = ip_mr_input;
1892 #endif
1893         RT_CACHE_STAT_INC(in_slow_mc);
1894
1895         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1896         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1897         err = 0;
1898         if (IS_ERR(rth))
1899                 err = PTR_ERR(rth);
1900
1901 e_nobufs:
1902         return -ENOBUFS;
1903 e_inval:
1904         return -EINVAL;
1905 e_err:
1906         return err;
1907 }
1908
1909
1910 static void ip_handle_martian_source(struct net_device *dev,
1911                                      struct in_device *in_dev,
1912                                      struct sk_buff *skb,
1913                                      __be32 daddr,
1914                                      __be32 saddr)
1915 {
1916         RT_CACHE_STAT_INC(in_martian_src);
1917 #ifdef CONFIG_IP_ROUTE_VERBOSE
1918         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1919                 /*
1920                  *      RFC1812 recommendation, if source is martian,
1921                  *      the only hint is MAC header.
1922                  */
1923                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1924                         &daddr, &saddr, dev->name);
1925                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1926                         int i;
1927                         const unsigned char *p = skb_mac_header(skb);
1928                         printk(KERN_WARNING "ll header: ");
1929                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1930                                 printk("%02x", *p);
1931                                 if (i < (dev->hard_header_len - 1))
1932                                         printk(":");
1933                         }
1934                         printk("\n");
1935                 }
1936         }
1937 #endif
1938 }
1939
1940 /* called in rcu_read_lock() section */
1941 static int __mkroute_input(struct sk_buff *skb,
1942                            const struct fib_result *res,
1943                            struct in_device *in_dev,
1944                            __be32 daddr, __be32 saddr, u32 tos,
1945                            struct rtable **result)
1946 {
1947         struct rtable *rth;
1948         int err;
1949         struct in_device *out_dev;
1950         unsigned int flags = 0;
1951         __be32 spec_dst;
1952         u32 itag;
1953
1954         /* get a working reference to the output device */
1955         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1956         if (out_dev == NULL) {
1957                 if (net_ratelimit())
1958                         printk(KERN_CRIT "Bug in ip_route_input" \
1959                                "_slow(). Please, report\n");
1960                 return -EINVAL;
1961         }
1962
1963
1964         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1965                                   in_dev->dev, &spec_dst, &itag);
1966         if (err < 0) {
1967                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1968                                          saddr);
1969
1970                 goto cleanup;
1971         }
1972
1973         if (err)
1974                 flags |= RTCF_DIRECTSRC;
1975
1976         if (out_dev == in_dev && err &&
1977             (IN_DEV_SHARED_MEDIA(out_dev) ||
1978              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1979                 flags |= RTCF_DOREDIRECT;
1980
1981         if (skb->protocol != htons(ETH_P_IP)) {
1982                 /* Not IP (i.e. ARP). Do not create route, if it is
1983                  * invalid for proxy arp. DNAT routes are always valid.
1984                  *
1985                  * Proxy arp feature have been extended to allow, ARP
1986                  * replies back to the same interface, to support
1987                  * Private VLAN switch technologies. See arp.c.
1988                  */
1989                 if (out_dev == in_dev &&
1990                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1991                         err = -EINVAL;
1992                         goto cleanup;
1993                 }
1994         }
1995
1996         rth = rt_dst_alloc(out_dev->dev,
1997                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1998                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1999         if (!rth) {
2000                 err = -ENOBUFS;
2001                 goto cleanup;
2002         }
2003
2004         rth->rt_key_dst = daddr;
2005         rth->rt_key_src = saddr;
2006         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2007         rth->rt_flags = flags;
2008         rth->rt_type = res->type;
2009         rth->rt_key_tos = tos;
2010         rth->rt_dst     = daddr;
2011         rth->rt_src     = saddr;
2012         rth->rt_route_iif = in_dev->dev->ifindex;
2013         rth->rt_iif     = in_dev->dev->ifindex;
2014         rth->rt_oif     = 0;
2015         rth->rt_mark    = skb->mark;
2016         rth->rt_gateway = daddr;
2017         rth->rt_spec_dst= spec_dst;
2018         rth->rt_peer_genid = 0;
2019         rth->peer = NULL;
2020         rth->fi = NULL;
2021
2022         rth->dst.input = ip_forward;
2023         rth->dst.output = ip_output;
2024
2025         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2026
2027         *result = rth;
2028         err = 0;
2029  cleanup:
2030         return err;
2031 }
2032
2033 static int ip_mkroute_input(struct sk_buff *skb,
2034                             struct fib_result *res,
2035                             const struct flowi4 *fl4,
2036                             struct in_device *in_dev,
2037                             __be32 daddr, __be32 saddr, u32 tos)
2038 {
2039         struct rtable* rth = NULL;
2040         int err;
2041         unsigned hash;
2042
2043 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2044         if (res->fi && res->fi->fib_nhs > 1)
2045                 fib_select_multipath(res);
2046 #endif
2047
2048         /* create a routing cache entry */
2049         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2050         if (err)
2051                 return err;
2052
2053         /* put it into the cache */
2054         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2055                        rt_genid(dev_net(rth->dst.dev)));
2056         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2057         if (IS_ERR(rth))
2058                 return PTR_ERR(rth);
2059         return 0;
2060 }
2061
/*
 *	NOTE. We drop all packets that have local source
 *	addresses, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Called with rcu_read_lock() held.
 */
2072
2073 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2074                                u8 tos, struct net_device *dev)
2075 {
2076         struct fib_result res;
2077         struct in_device *in_dev = __in_dev_get_rcu(dev);
2078         struct flowi4   fl4;
2079         unsigned        flags = 0;
2080         u32             itag = 0;
2081         struct rtable * rth;
2082         unsigned        hash;
2083         __be32          spec_dst;
2084         int             err = -EINVAL;
2085         struct net    * net = dev_net(dev);
2086
2087         /* IP on this device is disabled. */
2088
2089         if (!in_dev)
2090                 goto out;
2091
2092         /* Check for the most weird martians, which can be not detected
2093            by fib_lookup.
2094          */
2095
2096         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2097             ipv4_is_loopback(saddr))
2098                 goto martian_source;
2099
2100         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2101                 goto brd_input;
2102
2103         /* Accept zero addresses only to limited broadcast;
2104          * I even do not know to fix it or not. Waiting for complains :-)
2105          */
2106         if (ipv4_is_zeronet(saddr))
2107                 goto martian_source;
2108
2109         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2110                 goto martian_destination;
2111
2112         /*
2113          *      Now we are ready to route packet.
2114          */
2115         fl4.flowi4_oif = 0;
2116         fl4.flowi4_iif = dev->ifindex;
2117         fl4.flowi4_mark = skb->mark;
2118         fl4.flowi4_tos = tos;
2119         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2120         fl4.daddr = daddr;
2121         fl4.saddr = saddr;
2122         err = fib_lookup(net, &fl4, &res);
2123         if (err != 0) {
2124                 if (!IN_DEV_FORWARD(in_dev))
2125                         goto e_hostunreach;
2126                 goto no_route;
2127         }
2128
2129         RT_CACHE_STAT_INC(in_slow_tot);
2130
2131         if (res.type == RTN_BROADCAST)
2132                 goto brd_input;
2133
2134         if (res.type == RTN_LOCAL) {
2135                 err = fib_validate_source(skb, saddr, daddr, tos,
2136                                           net->loopback_dev->ifindex,
2137                                           dev, &spec_dst, &itag);
2138                 if (err < 0)
2139                         goto martian_source_keep_err;
2140                 if (err)
2141                         flags |= RTCF_DIRECTSRC;
2142                 spec_dst = daddr;
2143                 goto local_input;
2144         }
2145
2146         if (!IN_DEV_FORWARD(in_dev))
2147                 goto e_hostunreach;
2148         if (res.type != RTN_UNICAST)
2149                 goto martian_destination;
2150
2151         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2152 out:    return err;
2153
2154 brd_input:
2155         if (skb->protocol != htons(ETH_P_IP))
2156                 goto e_inval;
2157
2158         if (ipv4_is_zeronet(saddr))
2159                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2160         else {
2161                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2162                                           &itag);
2163                 if (err < 0)
2164                         goto martian_source_keep_err;
2165                 if (err)
2166                         flags |= RTCF_DIRECTSRC;
2167         }
2168         flags |= RTCF_BROADCAST;
2169         res.type = RTN_BROADCAST;
2170         RT_CACHE_STAT_INC(in_brd);
2171
2172 local_input:
2173         rth = rt_dst_alloc(net->loopback_dev,
2174                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2175         if (!rth)
2176                 goto e_nobufs;
2177
2178         rth->dst.input= ip_local_deliver;
2179         rth->dst.output= ip_rt_bug;
2180 #ifdef CONFIG_IP_ROUTE_CLASSID
2181         rth->dst.tclassid = itag;
2182 #endif
2183
2184         rth->rt_key_dst = daddr;
2185         rth->rt_key_src = saddr;
2186         rth->rt_genid = rt_genid(net);
2187         rth->rt_flags   = flags|RTCF_LOCAL;
2188         rth->rt_type    = res.type;
2189         rth->rt_key_tos = tos;
2190         rth->rt_dst     = daddr;
2191         rth->rt_src     = saddr;
2192 #ifdef CONFIG_IP_ROUTE_CLASSID
2193         rth->dst.tclassid = itag;
2194 #endif
2195         rth->rt_route_iif = dev->ifindex;
2196         rth->rt_iif     = dev->ifindex;
2197         rth->rt_oif     = 0;
2198         rth->rt_mark    = skb->mark;
2199         rth->rt_gateway = daddr;
2200         rth->rt_spec_dst= spec_dst;
2201         rth->rt_peer_genid = 0;
2202         rth->peer = NULL;
2203         rth->fi = NULL;
2204         if (res.type == RTN_UNREACHABLE) {
2205                 rth->dst.input= ip_error;
2206                 rth->dst.error= -err;
2207                 rth->rt_flags   &= ~RTCF_LOCAL;
2208         }
2209         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2210         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2211         err = 0;
2212         if (IS_ERR(rth))
2213                 err = PTR_ERR(rth);
2214         goto out;
2215
2216 no_route:
2217         RT_CACHE_STAT_INC(in_no_route);
2218         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2219         res.type = RTN_UNREACHABLE;
2220         if (err == -ESRCH)
2221                 err = -ENETUNREACH;
2222         goto local_input;
2223
2224         /*
2225          *      Do not cache martian addresses: they should be logged (RFC1812)
2226          */
2227 martian_destination:
2228         RT_CACHE_STAT_INC(in_martian_dst);
2229 #ifdef CONFIG_IP_ROUTE_VERBOSE
2230         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2231                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2232                         &daddr, &saddr, dev->name);
2233 #endif
2234
2235 e_hostunreach:
2236         err = -EHOSTUNREACH;
2237         goto out;
2238
2239 e_inval:
2240         err = -EINVAL;
2241         goto out;
2242
2243 e_nobufs:
2244         err = -ENOBUFS;
2245         goto out;
2246
2247 martian_source:
2248         err = -EINVAL;
2249 martian_source_keep_err:
2250         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2251         goto out;
2252 }
2253
2254 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2255                            u8 tos, struct net_device *dev, bool noref)
2256 {
2257         struct rtable * rth;
2258         unsigned        hash;
2259         int iif = dev->ifindex;
2260         struct net *net;
2261         int res;
2262
2263         net = dev_net(dev);
2264
2265         rcu_read_lock();
2266
2267         if (!rt_caching(net))
2268                 goto skip_cache;
2269
2270         tos &= IPTOS_RT_MASK;
2271         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2272
2273         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2274              rth = rcu_dereference(rth->dst.rt_next)) {
2275                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2276                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2277                      (rth->rt_iif ^ iif) |
2278                      rth->rt_oif |
2279                      (rth->rt_key_tos ^ tos)) == 0 &&
2280                     rth->rt_mark == skb->mark &&
2281                     net_eq(dev_net(rth->dst.dev), net) &&
2282                     !rt_is_expired(rth)) {
2283                         if (noref) {
2284                                 dst_use_noref(&rth->dst, jiffies);
2285                                 skb_dst_set_noref(skb, &rth->dst);
2286                         } else {
2287                                 dst_use(&rth->dst, jiffies);
2288                                 skb_dst_set(skb, &rth->dst);
2289                         }
2290                         RT_CACHE_STAT_INC(in_hit);
2291                         rcu_read_unlock();
2292                         return 0;
2293                 }
2294                 RT_CACHE_STAT_INC(in_hlist_search);
2295         }
2296
2297 skip_cache:
2298         /* Multicast recognition logic is moved from route cache to here.
2299            The problem was that too many Ethernet cards have broken/missing
2300            hardware multicast filters :-( As result the host on multicasting
2301            network acquires a lot of useless route cache entries, sort of
2302            SDR messages from all the world. Now we try to get rid of them.
2303            Really, provided software IP multicast filter is organized
2304            reasonably (at least, hashed), it does not result in a slowdown
2305            comparing with route cache reject entries.
2306            Note, that multicast routers are not affected, because
2307            route cache entry is created eventually.
2308          */
2309         if (ipv4_is_multicast(daddr)) {
2310                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2311
2312                 if (in_dev) {
2313                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2314                                                   ip_hdr(skb)->protocol);
2315                         if (our
2316 #ifdef CONFIG_IP_MROUTE
2317                                 ||
2318                             (!ipv4_is_local_multicast(daddr) &&
2319                              IN_DEV_MFORWARD(in_dev))
2320 #endif
2321                            ) {
2322                                 int res = ip_route_input_mc(skb, daddr, saddr,
2323                                                             tos, dev, our);
2324                                 rcu_read_unlock();
2325                                 return res;
2326                         }
2327                 }
2328                 rcu_read_unlock();
2329                 return -EINVAL;
2330         }
2331         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2332         rcu_read_unlock();
2333         return res;
2334 }
2335 EXPORT_SYMBOL(ip_route_input_common);
2336
2337 /* called with rcu_read_lock() */
2338 static struct rtable *__mkroute_output(const struct fib_result *res,
2339                                        const struct flowi4 *fl4,
2340                                        __be32 orig_daddr, __be32 orig_saddr,
2341                                        int orig_oif, struct net_device *dev_out,
2342                                        unsigned int flags)
2343 {
2344         struct fib_info *fi = res->fi;
2345         u32 tos = RT_FL_TOS(fl4);
2346         struct in_device *in_dev;
2347         u16 type = res->type;
2348         struct rtable *rth;
2349
2350         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2351                 return ERR_PTR(-EINVAL);
2352
2353         if (ipv4_is_lbcast(fl4->daddr))
2354                 type = RTN_BROADCAST;
2355         else if (ipv4_is_multicast(fl4->daddr))
2356                 type = RTN_MULTICAST;
2357         else if (ipv4_is_zeronet(fl4->daddr))
2358                 return ERR_PTR(-EINVAL);
2359
2360         if (dev_out->flags & IFF_LOOPBACK)
2361                 flags |= RTCF_LOCAL;
2362
2363         in_dev = __in_dev_get_rcu(dev_out);
2364         if (!in_dev)
2365                 return ERR_PTR(-EINVAL);
2366
2367         if (type == RTN_BROADCAST) {
2368                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2369                 fi = NULL;
2370         } else if (type == RTN_MULTICAST) {
2371                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2372                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2373                                      fl4->flowi4_proto))
2374                         flags &= ~RTCF_LOCAL;
2375                 /* If multicast route do not exist use
2376                  * default one, but do not gateway in this case.
2377                  * Yes, it is hack.
2378                  */
2379                 if (fi && res->prefixlen < 4)
2380                         fi = NULL;
2381         }
2382
2383         rth = rt_dst_alloc(dev_out,
2384                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2385                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2386         if (!rth)
2387                 return ERR_PTR(-ENOBUFS);
2388
2389         rth->dst.output = ip_output;
2390
2391         rth->rt_key_dst = orig_daddr;
2392         rth->rt_key_src = orig_saddr;
2393         rth->rt_genid = rt_genid(dev_net(dev_out));
2394         rth->rt_flags   = flags;
2395         rth->rt_type    = type;
2396         rth->rt_key_tos = tos;
2397         rth->rt_dst     = fl4->daddr;
2398         rth->rt_src     = fl4->saddr;
2399         rth->rt_route_iif = 0;
2400         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2401         rth->rt_oif     = orig_oif;
2402         rth->rt_mark    = fl4->flowi4_mark;
2403         rth->rt_gateway = fl4->daddr;
2404         rth->rt_spec_dst= fl4->saddr;
2405         rth->rt_peer_genid = 0;
2406         rth->peer = NULL;
2407         rth->fi = NULL;
2408
2409         RT_CACHE_STAT_INC(out_slow_tot);
2410
2411         if (flags & RTCF_LOCAL) {
2412                 rth->dst.input = ip_local_deliver;
2413                 rth->rt_spec_dst = fl4->daddr;
2414         }
2415         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2416                 rth->rt_spec_dst = fl4->saddr;
2417                 if (flags & RTCF_LOCAL &&
2418                     !(dev_out->flags & IFF_LOOPBACK)) {
2419                         rth->dst.output = ip_mc_output;
2420                         RT_CACHE_STAT_INC(out_slow_mc);
2421                 }
2422 #ifdef CONFIG_IP_MROUTE
2423                 if (type == RTN_MULTICAST) {
2424                         if (IN_DEV_MFORWARD(in_dev) &&
2425                             !ipv4_is_local_multicast(fl4->daddr)) {
2426                                 rth->dst.input = ip_mr_input;
2427                                 rth->dst.output = ip_mc_output;
2428                         }
2429                 }
2430 #endif
2431         }
2432
2433         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2434
2435         return rth;
2436 }
2437
2438 /*
2439  * Major route resolver routine.
2440  * called with rcu_read_lock();
2441  */
2442
2443 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2444 {
2445         struct net_device *dev_out = NULL;
2446         u32 tos = RT_FL_TOS(fl4);
2447         unsigned int flags = 0;
2448         struct fib_result res;
2449         struct rtable *rth;
2450         __be32 orig_daddr;
2451         __be32 orig_saddr;
2452         int orig_oif;
2453
2454         res.fi          = NULL;
2455 #ifdef CONFIG_IP_MULTIPLE_TABLES
2456         res.r           = NULL;
2457 #endif
2458
2459         orig_daddr = fl4->daddr;
2460         orig_saddr = fl4->saddr;
2461         orig_oif = fl4->flowi4_oif;
2462
2463         fl4->flowi4_iif = net->loopback_dev->ifindex;
2464         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2465         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2466                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2467
2468         rcu_read_lock();
2469         if (fl4->saddr) {
2470                 rth = ERR_PTR(-EINVAL);
2471                 if (ipv4_is_multicast(fl4->saddr) ||
2472                     ipv4_is_lbcast(fl4->saddr) ||
2473                     ipv4_is_zeronet(fl4->saddr))
2474                         goto out;
2475
2476                 /* I removed check for oif == dev_out->oif here.
2477                    It was wrong for two reasons:
2478                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2479                       is assigned to multiple interfaces.
2480                    2. Moreover, we are allowed to send packets with saddr
2481                       of another iface. --ANK
2482                  */
2483
2484                 if (fl4->flowi4_oif == 0 &&
2485                     (ipv4_is_multicast(fl4->daddr) ||
2486                      ipv4_is_lbcast(fl4->daddr))) {
2487                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2488                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2489                         if (dev_out == NULL)
2490                                 goto out;
2491
2492                         /* Special hack: user can direct multicasts
2493                            and limited broadcast via necessary interface
2494                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2495                            This hack is not just for fun, it allows
2496                            vic,vat and friends to work.
2497                            They bind socket to loopback, set ttl to zero
2498                            and expect that it will work.
2499                            From the viewpoint of routing cache they are broken,
2500                            because we are not allowed to build multicast path
2501                            with loopback source addr (look, routing cache
2502                            cannot know, that ttl is zero, so that packet
2503                            will not leave this host and route is valid).
2504                            Luckily, this hack is good workaround.
2505                          */
2506
2507                         fl4->flowi4_oif = dev_out->ifindex;
2508                         goto make_route;
2509                 }
2510
2511                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2512                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2513                         if (!__ip_dev_find(net, fl4->saddr, false))
2514                                 goto out;
2515                 }
2516         }
2517
2518
2519         if (fl4->flowi4_oif) {
2520                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2521                 rth = ERR_PTR(-ENODEV);
2522                 if (dev_out == NULL)
2523                         goto out;
2524
2525                 /* RACE: Check return value of inet_select_addr instead. */
2526                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2527                         rth = ERR_PTR(-ENETUNREACH);
2528                         goto out;
2529                 }
2530                 if (ipv4_is_local_multicast(fl4->daddr) ||
2531                     ipv4_is_lbcast(fl4->daddr)) {
2532                         if (!fl4->saddr)
2533                                 fl4->saddr = inet_select_addr(dev_out, 0,
2534                                                               RT_SCOPE_LINK);
2535                         goto make_route;
2536                 }
2537                 if (fl4->saddr) {
2538                         if (ipv4_is_multicast(fl4->daddr))
2539                                 fl4->saddr = inet_select_addr(dev_out, 0,
2540                                                               fl4->flowi4_scope);
2541                         else if (!fl4->daddr)
2542                                 fl4->saddr = inet_select_addr(dev_out, 0,
2543                                                               RT_SCOPE_HOST);
2544                 }
2545         }
2546
2547         if (!fl4->daddr) {
2548                 fl4->daddr = fl4->saddr;
2549                 if (!fl4->daddr)
2550                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2551                 dev_out = net->loopback_dev;
2552                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2553                 res.type = RTN_LOCAL;
2554                 flags |= RTCF_LOCAL;
2555                 goto make_route;
2556         }
2557
2558         if (fib_lookup(net, fl4, &res)) {
2559                 res.fi = NULL;
2560                 if (fl4->flowi4_oif) {
2561                         /* Apparently, routing tables are wrong. Assume,
2562                            that the destination is on link.
2563
2564                            WHY? DW.
2565                            Because we are allowed to send to iface
2566                            even if it has NO routes and NO assigned
2567                            addresses. When oif is specified, routing
2568                            tables are looked up with only one purpose:
2569                            to catch if destination is gatewayed, rather than
2570                            direct. Moreover, if MSG_DONTROUTE is set,
2571                            we send packet, ignoring both routing tables
2572                            and ifaddr state. --ANK
2573
2574
2575                            We could make it even if oif is unknown,
2576                            likely IPv6, but we do not.
2577                          */
2578
2579                         if (fl4->saddr == 0)
2580                                 fl4->saddr = inet_select_addr(dev_out, 0,
2581                                                               RT_SCOPE_LINK);
2582                         res.type = RTN_UNICAST;
2583                         goto make_route;
2584                 }
2585                 rth = ERR_PTR(-ENETUNREACH);
2586                 goto out;
2587         }
2588
2589         if (res.type == RTN_LOCAL) {
2590                 if (!fl4->saddr) {
2591                         if (res.fi->fib_prefsrc)
2592                                 fl4->saddr = res.fi->fib_prefsrc;
2593                         else
2594                                 fl4->saddr = fl4->daddr;
2595                 }
2596                 dev_out = net->loopback_dev;
2597                 fl4->flowi4_oif = dev_out->ifindex;
2598                 res.fi = NULL;
2599                 flags |= RTCF_LOCAL;
2600                 goto make_route;
2601         }
2602
2603 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2604         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2605                 fib_select_multipath(&res);
2606         else
2607 #endif
2608         if (!res.prefixlen &&
2609             res.table->tb_num_default > 1 &&
2610             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2611                 fib_select_default(&res);
2612
2613         if (!fl4->saddr)
2614                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2615
2616         dev_out = FIB_RES_DEV(res);
2617         fl4->flowi4_oif = dev_out->ifindex;
2618
2619
2620 make_route:
2621         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2622                                dev_out, flags);
2623         if (!IS_ERR(rth)) {
2624                 unsigned int hash;
2625
2626                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2627                                rt_genid(dev_net(dev_out)));
2628                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2629         }
2630
2631 out:
2632         rcu_read_unlock();
2633         return rth;
2634 }
2635
2636 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2637 {
2638         struct rtable *rth;
2639         unsigned int hash;
2640
2641         if (!rt_caching(net))
2642                 goto slow_output;
2643
2644         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2645
2646         rcu_read_lock_bh();
2647         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2648                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2649                 if (rth->rt_key_dst == flp4->daddr &&
2650                     rth->rt_key_src == flp4->saddr &&
2651                     rt_is_output_route(rth) &&
2652                     rth->rt_oif == flp4->flowi4_oif &&
2653                     rth->rt_mark == flp4->flowi4_mark &&
2654                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2655                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2656                     net_eq(dev_net(rth->dst.dev), net) &&
2657                     !rt_is_expired(rth)) {
2658                         dst_use(&rth->dst, jiffies);
2659                         RT_CACHE_STAT_INC(out_hit);
2660                         rcu_read_unlock_bh();
2661                         if (!flp4->saddr)
2662                                 flp4->saddr = rth->rt_src;
2663                         if (!flp4->daddr)
2664                                 flp4->daddr = rth->rt_dst;
2665                         return rth;
2666                 }
2667                 RT_CACHE_STAT_INC(out_hlist_search);
2668         }
2669         rcu_read_unlock_bh();
2670
2671 slow_output:
2672         return ip_route_output_slow(net, flp4);
2673 }
2674 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2675
2676 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2677 {
2678         return NULL;
2679 }
2680
/* dst_ops.default_mtu hook for blackhole routes: always reports 0. */
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
2685
2686 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2687 {
2688 }
2689
2690 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2691                                           unsigned long old)
2692 {
2693         return NULL;
2694 }
2695
2696 static struct dst_ops ipv4_dst_blackhole_ops = {
2697         .family                 =       AF_INET,
2698         .protocol               =       cpu_to_be16(ETH_P_IP),
2699         .destroy                =       ipv4_dst_destroy,
2700         .check                  =       ipv4_blackhole_dst_check,
2701         .default_mtu            =       ipv4_blackhole_default_mtu,
2702         .default_advmss         =       ipv4_default_advmss,
2703         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2704         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2705 };
2706
2707 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2708 {
2709         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2710         struct rtable *ort = (struct rtable *) dst_orig;
2711
2712         if (rt) {
2713                 struct dst_entry *new = &rt->dst;
2714
2715                 new->__use = 1;
2716                 new->input = dst_discard;
2717                 new->output = dst_discard;
2718                 dst_copy_metrics(new, &ort->dst);
2719
2720                 new->dev = ort->dst.dev;
2721                 if (new->dev)
2722                         dev_hold(new->dev);
2723
2724                 rt->rt_key_dst = ort->rt_key_dst;
2725                 rt->rt_key_src = ort->rt_key_src;
2726                 rt->rt_key_tos = ort->rt_key_tos;
2727                 rt->rt_route_iif = ort->rt_route_iif;
2728                 rt->rt_iif = ort->rt_iif;
2729                 rt->rt_oif = ort->rt_oif;
2730                 rt->rt_mark = ort->rt_mark;
2731
2732                 rt->rt_genid = rt_genid(net);
2733                 rt->rt_flags = ort->rt_flags;
2734                 rt->rt_type = ort->rt_type;
2735                 rt->rt_dst = ort->rt_dst;
2736                 rt->rt_src = ort->rt_src;
2737                 rt->rt_gateway = ort->rt_gateway;
2738                 rt->rt_spec_dst = ort->rt_spec_dst;
2739                 rt->peer = ort->peer;
2740                 if (rt->peer)
2741                         atomic_inc(&rt->peer->refcnt);
2742                 rt->fi = ort->fi;
2743                 if (rt->fi)
2744                         atomic_inc(&rt->fi->fib_clntref);
2745
2746                 dst_free(new);
2747         }
2748
2749         dst_release(dst_orig);
2750
2751         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2752 }
2753
2754 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2755                                     struct sock *sk)
2756 {
2757         struct rtable *rt = __ip_route_output_key(net, flp4);
2758
2759         if (IS_ERR(rt))
2760                 return rt;
2761
2762         if (flp4->flowi4_proto)
2763                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2764                                                    flowi4_to_flowi(flp4),
2765                                                    sk, 0);
2766
2767         return rt;
2768 }
2769 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2770
2771 static int rt_fill_info(struct net *net,
2772                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2773                         int nowait, unsigned int flags)
2774 {
2775         struct rtable *rt = skb_rtable(skb);
2776         struct rtmsg *r;
2777         struct nlmsghdr *nlh;
2778         long expires;
2779         u32 id = 0, ts = 0, tsage = 0, error;
2780
2781         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2782         if (nlh == NULL)
2783                 return -EMSGSIZE;
2784
2785         r = nlmsg_data(nlh);
2786         r->rtm_family    = AF_INET;
2787         r->rtm_dst_len  = 32;
2788         r->rtm_src_len  = 0;
2789         r->rtm_tos      = rt->rt_key_tos;
2790         r->rtm_table    = RT_TABLE_MAIN;
2791         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2792         r->rtm_type     = rt->rt_type;
2793         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2794         r->rtm_protocol = RTPROT_UNSPEC;
2795         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2796         if (rt->rt_flags & RTCF_NOTIFY)
2797                 r->rtm_flags |= RTM_F_NOTIFY;
2798
2799         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2800
2801         if (rt->rt_key_src) {
2802                 r->rtm_src_len = 32;
2803                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2804         }
2805         if (rt->dst.dev)
2806                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2807 #ifdef CONFIG_IP_ROUTE_CLASSID
2808         if (rt->dst.tclassid)
2809                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2810 #endif
2811         if (rt_is_input_route(rt))
2812                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2813         else if (rt->rt_src != rt->rt_key_src)
2814                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2815
2816         if (rt->rt_dst != rt->rt_gateway)
2817                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2818
2819         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2820                 goto nla_put_failure;
2821
2822         if (rt->rt_mark)
2823                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2824
2825         error = rt->dst.error;
2826         expires = (rt->peer && rt->peer->pmtu_expires) ?
2827                 rt->peer->pmtu_expires - jiffies : 0;
2828         if (rt->peer) {
2829                 inet_peer_refcheck(rt->peer);
2830                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2831                 if (rt->peer->tcp_ts_stamp) {
2832                         ts = rt->peer->tcp_ts;
2833                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2834                 }
2835         }
2836
2837         if (rt_is_input_route(rt)) {
2838 #ifdef CONFIG_IP_MROUTE
2839                 __be32 dst = rt->rt_dst;
2840
2841                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2842                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2843                         int err = ipmr_get_route(net, skb,
2844                                                  rt->rt_src, rt->rt_dst,
2845                                                  r, nowait);
2846                         if (err <= 0) {
2847                                 if (!nowait) {
2848                                         if (err == 0)
2849                                                 return 0;
2850                                         goto nla_put_failure;
2851                                 } else {
2852                                         if (err == -EMSGSIZE)
2853                                                 goto nla_put_failure;
2854                                         error = err;
2855                                 }
2856                         }
2857                 } else
2858 #endif
2859                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2860         }
2861
2862         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2863                                expires, error) < 0)
2864                 goto nla_put_failure;
2865
2866         return nlmsg_end(skb, nlh);
2867
2868 nla_put_failure:
2869         nlmsg_cancel(skb, nlh);
2870         return -EMSGSIZE;
2871 }
2872
2873 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2874 {
2875         struct net *net = sock_net(in_skb->sk);
2876         struct rtmsg *rtm;
2877         struct nlattr *tb[RTA_MAX+1];
2878         struct rtable *rt = NULL;
2879         __be32 dst = 0;
2880         __be32 src = 0;
2881         u32 iif;
2882         int err;
2883         int mark;
2884         struct sk_buff *skb;
2885
2886         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2887         if (err < 0)
2888                 goto errout;
2889
2890         rtm = nlmsg_data(nlh);
2891
2892         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2893         if (skb == NULL) {
2894                 err = -ENOBUFS;
2895                 goto errout;
2896         }
2897
2898         /* Reserve room for dummy headers, this skb can pass
2899            through good chunk of routing engine.
2900          */
2901         skb_reset_mac_header(skb);
2902         skb_reset_network_header(skb);
2903
2904         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2905         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2906         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2907
2908         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2909         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2910         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2911         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2912
2913         if (iif) {
2914                 struct net_device *dev;
2915
2916                 dev = __dev_get_by_index(net, iif);
2917                 if (dev == NULL) {
2918                         err = -ENODEV;
2919                         goto errout_free;
2920                 }
2921
2922                 skb->protocol   = htons(ETH_P_IP);
2923                 skb->dev        = dev;
2924                 skb->mark       = mark;
2925                 local_bh_disable();
2926                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2927                 local_bh_enable();
2928
2929                 rt = skb_rtable(skb);
2930                 if (err == 0 && rt->dst.error)
2931                         err = -rt->dst.error;
2932         } else {
2933                 struct flowi4 fl4 = {
2934                         .daddr = dst,
2935                         .saddr = src,
2936                         .flowi4_tos = rtm->rtm_tos,
2937                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2938                         .flowi4_mark = mark,
2939                 };
2940                 rt = ip_route_output_key(net, &fl4);
2941
2942                 err = 0;
2943                 if (IS_ERR(rt))
2944                         err = PTR_ERR(rt);
2945         }
2946
2947         if (err)
2948                 goto errout_free;
2949
2950         skb_dst_set(skb, &rt->dst);
2951         if (rtm->rtm_flags & RTM_F_NOTIFY)
2952                 rt->rt_flags |= RTCF_NOTIFY;
2953
2954         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2955                            RTM_NEWROUTE, 0, 0);
2956         if (err <= 0)
2957                 goto errout_free;
2958
2959         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2960 errout:
2961         return err;
2962
2963 errout_free:
2964         kfree_skb(skb);
2965         goto errout;
2966 }
2967
2968 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2969 {
2970         struct rtable *rt;
2971         int h, s_h;
2972         int idx, s_idx;
2973         struct net *net;
2974
2975         net = sock_net(skb->sk);
2976
2977         s_h = cb->args[0];
2978         if (s_h < 0)
2979                 s_h = 0;
2980         s_idx = idx = cb->args[1];
2981         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2982                 if (!rt_hash_table[h].chain)
2983                         continue;
2984                 rcu_read_lock_bh();
2985                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2986                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2987                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2988                                 continue;
2989                         if (rt_is_expired(rt))
2990                                 continue;
2991                         skb_dst_set_noref(skb, &rt->dst);
2992                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2993                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2994                                          1, NLM_F_MULTI) <= 0) {
2995                                 skb_dst_drop(skb);
2996                                 rcu_read_unlock_bh();
2997                                 goto done;
2998                         }
2999                         skb_dst_drop(skb);
3000                 }
3001                 rcu_read_unlock_bh();
3002         }
3003
3004 done:
3005         cb->args[0] = h;
3006         cb->args[1] = idx;
3007         return skb->len;
3008 }
3009
3010 void ip_rt_multicast_event(struct in_device *in_dev)
3011 {
3012         rt_cache_flush(dev_net(in_dev->dev), 0);
3013 }
3014
3015 #ifdef CONFIG_SYSCTL
3016 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3017                                         void __user *buffer,
3018                                         size_t *lenp, loff_t *ppos)
3019 {
3020         if (write) {
3021                 int flush_delay;
3022                 ctl_table ctl;
3023                 struct net *net;
3024
3025                 memcpy(&ctl, __ctl, sizeof(ctl));
3026                 ctl.data = &flush_delay;
3027                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3028
3029                 net = (struct net *)__ctl->extra1;
3030                 rt_cache_flush(net, flush_delay);
3031                 return 0;
3032         }
3033
3034         return -EINVAL;
3035 }
3036
3037 static ctl_table ipv4_route_table[] = {
3038         {
3039                 .procname       = "gc_thresh",
3040                 .data           = &ipv4_dst_ops.gc_thresh,
3041                 .maxlen         = sizeof(int),
3042                 .mode           = 0644,
3043                 .proc_handler   = proc_dointvec,
3044         },
3045         {
3046                 .procname       = "max_size",
3047                 .data           = &ip_rt_max_size,
3048                 .maxlen         = sizeof(int),
3049                 .mode           = 0644,
3050                 .proc_handler   = proc_dointvec,
3051         },
3052         {
3053                 /*  Deprecated. Use gc_min_interval_ms */
3054
3055                 .procname       = "gc_min_interval",
3056                 .data           = &ip_rt_gc_min_interval,
3057                 .maxlen         = sizeof(int),
3058                 .mode           = 0644,
3059                 .proc_handler   = proc_dointvec_jiffies,
3060         },
3061         {
3062                 .procname       = "gc_min_interval_ms",
3063                 .data           = &ip_rt_gc_min_interval,
3064                 .maxlen         = sizeof(int),
3065                 .mode           = 0644,
3066                 .proc_handler   = proc_dointvec_ms_jiffies,
3067         },
3068         {
3069                 .procname       = "gc_timeout",
3070                 .data           = &ip_rt_gc_timeout,
3071                 .maxlen         = sizeof(int),
3072                 .mode           = 0644,
3073                 .proc_handler   = proc_dointvec_jiffies,
3074         },
3075         {
3076                 .procname       = "gc_interval",
3077                 .data           = &ip_rt_gc_interval,
3078                 .maxlen         = sizeof(int),
3079                 .mode           = 0644,
3080                 .proc_handler   = proc_dointvec_jiffies,
3081         },
3082         {
3083                 .procname       = "redirect_load",
3084                 .data           = &ip_rt_redirect_load,
3085                 .maxlen         = sizeof(int),
3086                 .mode           = 0644,
3087                 .proc_handler   = proc_dointvec,
3088         },
3089         {
3090                 .procname       = "redirect_number",
3091                 .data           = &ip_rt_redirect_number,
3092                 .maxlen         = sizeof(int),
3093                 .mode           = 0644,
3094                 .proc_handler   = proc_dointvec,
3095         },
3096         {
3097                 .procname       = "redirect_silence",
3098                 .data           = &ip_rt_redirect_silence,
3099                 .maxlen         = sizeof(int),
3100                 .mode           = 0644,
3101                 .proc_handler   = proc_dointvec,
3102         },
3103         {
3104                 .procname       = "error_cost",
3105                 .data           = &ip_rt_error_cost,
3106                 .maxlen         = sizeof(int),
3107                 .mode           = 0644,
3108                 .proc_handler   = proc_dointvec,
3109         },
3110         {
3111                 .procname       = "error_burst",
3112                 .data           = &ip_rt_error_burst,
3113                 .maxlen         = sizeof(int),
3114                 .mode           = 0644,
3115                 .proc_handler   = proc_dointvec,
3116         },
3117         {
3118                 .procname       = "gc_elasticity",
3119                 .data           = &ip_rt_gc_elasticity,
3120                 .maxlen         = sizeof(int),
3121                 .mode           = 0644,
3122                 .proc_handler   = proc_dointvec,
3123         },
3124         {
3125                 .procname       = "mtu_expires",
3126                 .data           = &ip_rt_mtu_expires,
3127                 .maxlen         = sizeof(int),
3128                 .mode           = 0644,
3129                 .proc_handler   = proc_dointvec_jiffies,
3130         },
3131         {
3132                 .procname       = "min_pmtu",
3133                 .data           = &ip_rt_min_pmtu,
3134                 .maxlen         = sizeof(int),
3135                 .mode           = 0644,
3136                 .proc_handler   = proc_dointvec,
3137         },
3138         {
3139                 .procname       = "min_adv_mss",
3140                 .data           = &ip_rt_min_advmss,
3141                 .maxlen         = sizeof(int),
3142                 .mode           = 0644,
3143                 .proc_handler   = proc_dointvec,
3144         },
3145         { }
3146 };
3147
3148 static struct ctl_table empty[1];
3149
3150 static struct ctl_table ipv4_skeleton[] =
3151 {
3152         { .procname = "route",
3153           .mode = 0555, .child = ipv4_route_table},
3154         { .procname = "neigh",
3155           .mode = 0555, .child = empty},
3156         { }
3157 };
3158
3159 static __net_initdata struct ctl_path ipv4_path[] = {
3160         { .procname = "net", },
3161         { .procname = "ipv4", },
3162         { },
3163 };
3164
3165 static struct ctl_table ipv4_route_flush_table[] = {
3166         {
3167                 .procname       = "flush",
3168                 .maxlen         = sizeof(int),
3169                 .mode           = 0200,
3170                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3171         },
3172         { },
3173 };
3174
3175 static __net_initdata struct ctl_path ipv4_route_path[] = {
3176         { .procname = "net", },
3177         { .procname = "ipv4", },
3178         { .procname = "route", },
3179         { },
3180 };
3181
3182 static __net_init int sysctl_route_net_init(struct net *net)
3183 {
3184         struct ctl_table *tbl;
3185
3186         tbl = ipv4_route_flush_table;
3187         if (!net_eq(net, &init_net)) {
3188                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3189                 if (tbl == NULL)
3190                         goto err_dup;
3191         }
3192         tbl[0].extra1 = net;
3193
3194         net->ipv4.route_hdr =
3195                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3196         if (net->ipv4.route_hdr == NULL)
3197                 goto err_reg;
3198         return 0;
3199
3200 err_reg:
3201         if (tbl != ipv4_route_flush_table)
3202                 kfree(tbl);
3203 err_dup:
3204         return -ENOMEM;
3205 }
3206
3207 static __net_exit void sysctl_route_net_exit(struct net *net)
3208 {
3209         struct ctl_table *tbl;
3210
3211         tbl = net->ipv4.route_hdr->ctl_table_arg;
3212         unregister_net_sysctl_table(net->ipv4.route_hdr);
3213         BUG_ON(tbl == ipv4_route_flush_table);
3214         kfree(tbl);
3215 }
3216
3217 static __net_initdata struct pernet_operations sysctl_route_ops = {
3218         .init = sysctl_route_net_init,
3219         .exit = sysctl_route_net_exit,
3220 };
3221 #endif
3222
3223 static __net_init int rt_genid_init(struct net *net)
3224 {
3225         get_random_bytes(&net->ipv4.rt_genid,
3226                          sizeof(net->ipv4.rt_genid));
3227         get_random_bytes(&net->ipv4.dev_addr_genid,
3228                          sizeof(net->ipv4.dev_addr_genid));
3229         return 0;
3230 }
3231
3232 static __net_initdata struct pernet_operations rt_genid_ops = {
3233         .init = rt_genid_init,
3234 };
3235
3236
3237 #ifdef CONFIG_IP_ROUTE_CLASSID
3238 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3239 #endif /* CONFIG_IP_ROUTE_CLASSID */
3240
3241 static __initdata unsigned long rhash_entries;
3242 static int __init set_rhash_entries(char *str)
3243 {
3244         if (!str)
3245                 return 0;
3246         rhash_entries = simple_strtoul(str, &str, 0);
3247         return 1;
3248 }
3249 __setup("rhash_entries=", set_rhash_entries);
3250
3251 int __init ip_rt_init(void)
3252 {
3253         int rc = 0;
3254
3255 #ifdef CONFIG_IP_ROUTE_CLASSID
3256         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3257         if (!ip_rt_acct)
3258                 panic("IP: failed to allocate ip_rt_acct\n");
3259 #endif
3260
3261         ipv4_dst_ops.kmem_cachep =
3262                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3263                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3264
3265         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3266
3267         if (dst_entries_init(&ipv4_dst_ops) < 0)
3268                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3269
3270         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3271                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3272
3273         rt_hash_table = (struct rt_hash_bucket *)
3274                 alloc_large_system_hash("IP route cache",
3275                                         sizeof(struct rt_hash_bucket),
3276                                         rhash_entries,
3277                                         (totalram_pages >= 128 * 1024) ?
3278                                         15 : 17,
3279                                         0,
3280                                         &rt_hash_log,
3281                                         &rt_hash_mask,
3282                                         rhash_entries ? 0 : 512 * 1024);
3283         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3284         rt_hash_lock_init();
3285
3286         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3287         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3288
3289         devinet_init();
3290         ip_fib_init();
3291
3292         if (ip_rt_proc_init())
3293                 printk(KERN_ERR "Unable to create route proc files\n");
3294 #ifdef CONFIG_XFRM
3295         xfrm_init();
3296         xfrm4_init(ip_rt_max_size);
3297 #endif
3298         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3299
3300 #ifdef CONFIG_SYSCTL
3301         register_pernet_subsys(&sysctl_route_ops);
3302 #endif
3303         register_pernet_subsys(&rt_genid_ops);
3304         return rc;
3305 }
3306
3307 #ifdef CONFIG_SYSCTL
3308 /*
3309  * We really need to sanitize the damn ipv4 init order, then all
3310  * this nonsense will go away.
3311  */
3312 void __init ip_static_sysctl_init(void)
3313 {
3314         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3315 }
3316 #endif