mfd: Copy the device pointer to the twl4030-madc structure
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / ipv4 / route.c
blob4845bfe02d24edfeef0d1c1015ac4b1ac0074cc8
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
114 #define RT_FL_TOS(oldflp4) \
115 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
117 #define IP_MAX_MTU 0xFFF0
119 #define RT_GC_TIMEOUT (300*HZ)
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly = 8;
131 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly = 256;
134 static int rt_chain_length_max __read_mostly = 20;
136 static struct delayed_work expires_work;
137 static unsigned long expires_ljiffies;
140 * Interface to generic destination cache.
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
145 static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void ipv4_link_failure(struct sk_buff *skb);
149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 int how)
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 struct rtable *rt = (struct rtable *) dst;
160 struct inet_peer *peer;
161 u32 *p = NULL;
163 if (!rt->peer)
164 rt_bind_peer(rt, rt->rt_dst, 1);
166 peer = rt->peer;
167 if (peer) {
168 u32 *old_p = __DST_METRICS_PTR(old);
169 unsigned long prev, new;
171 p = peer->metrics;
172 if (inet_metrics_new(peer))
173 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175 new = (unsigned long) p;
176 prev = cmpxchg(&dst->_metrics, old, new);
178 if (prev != old) {
179 p = __DST_METRICS_PTR(prev);
180 if (prev & DST_METRICS_READ_ONLY)
181 p = NULL;
182 } else {
183 if (rt->fi) {
184 fib_info_put(rt->fi);
185 rt->fi = NULL;
189 return p;
192 static struct dst_ops ipv4_dst_ops = {
193 .family = AF_INET,
194 .protocol = cpu_to_be16(ETH_P_IP),
195 .gc = rt_garbage_collect,
196 .check = ipv4_dst_check,
197 .default_advmss = ipv4_default_advmss,
198 .default_mtu = ipv4_default_mtu,
199 .cow_metrics = ipv4_cow_metrics,
200 .destroy = ipv4_dst_destroy,
201 .ifdown = ipv4_dst_ifdown,
202 .negative_advice = ipv4_negative_advice,
203 .link_failure = ipv4_link_failure,
204 .update_pmtu = ip_rt_update_pmtu,
205 .local_out = __ip_local_out,
208 #define ECN_OR_COST(class) TC_PRIO_##class
210 const __u8 ip_tos2prio[16] = {
211 TC_PRIO_BESTEFFORT,
212 ECN_OR_COST(BESTEFFORT),
213 TC_PRIO_BESTEFFORT,
214 ECN_OR_COST(BESTEFFORT),
215 TC_PRIO_BULK,
216 ECN_OR_COST(BULK),
217 TC_PRIO_BULK,
218 ECN_OR_COST(BULK),
219 TC_PRIO_INTERACTIVE,
220 ECN_OR_COST(INTERACTIVE),
221 TC_PRIO_INTERACTIVE,
222 ECN_OR_COST(INTERACTIVE),
223 TC_PRIO_INTERACTIVE_BULK,
224 ECN_OR_COST(INTERACTIVE_BULK),
225 TC_PRIO_INTERACTIVE_BULK,
226 ECN_OR_COST(INTERACTIVE_BULK)
231 * Route cache.
234 /* The locking scheme is rather straight forward:
236 * 1) Read-Copy Update protects the buckets of the central route hash.
237 * 2) Only writers remove entries, and they hold the lock
238 * as they look at rtable reference counts.
239 * 3) Only readers acquire references to rtable entries,
240 * they do so with atomic increments and with the
241 * lock held.
244 struct rt_hash_bucket {
245 struct rtable __rcu *chain;
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249 defined(CONFIG_PROVE_LOCKING)
251 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
252 * The size of this table is a power of two and depends on the number of CPUS.
253 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ 256
257 #else
258 # if NR_CPUS >= 32
259 # define RT_HASH_LOCK_SZ 4096
260 # elif NR_CPUS >= 16
261 # define RT_HASH_LOCK_SZ 2048
262 # elif NR_CPUS >= 8
263 # define RT_HASH_LOCK_SZ 1024
264 # elif NR_CPUS >= 4
265 # define RT_HASH_LOCK_SZ 512
266 # else
267 # define RT_HASH_LOCK_SZ 256
268 # endif
269 #endif
271 static spinlock_t *rt_hash_locks;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
274 static __init void rt_hash_lock_init(void)
276 int i;
278 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
279 GFP_KERNEL);
280 if (!rt_hash_locks)
281 panic("IP: failed to allocate rt_hash_locks\n");
283 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
284 spin_lock_init(&rt_hash_locks[i]);
286 #else
287 # define rt_hash_lock_addr(slot) NULL
289 static inline void rt_hash_lock_init(void)
292 #endif
294 static struct rt_hash_bucket *rt_hash_table __read_mostly;
295 static unsigned rt_hash_mask __read_mostly;
296 static unsigned int rt_hash_log __read_mostly;
298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
302 int genid)
304 return jhash_3words((__force u32)daddr, (__force u32)saddr,
305 idx, genid)
306 & rt_hash_mask;
309 static inline int rt_genid(struct net *net)
311 return atomic_read(&net->ipv4.rt_genid);
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state {
316 struct seq_net_private p;
317 int bucket;
318 int genid;
321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
323 struct rt_cache_iter_state *st = seq->private;
324 struct rtable *r = NULL;
326 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
327 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
328 continue;
329 rcu_read_lock_bh();
330 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
331 while (r) {
332 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
333 r->rt_genid == st->genid)
334 return r;
335 r = rcu_dereference_bh(r->dst.rt_next);
337 rcu_read_unlock_bh();
339 return r;
342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
343 struct rtable *r)
345 struct rt_cache_iter_state *st = seq->private;
347 r = rcu_dereference_bh(r->dst.rt_next);
348 while (!r) {
349 rcu_read_unlock_bh();
350 do {
351 if (--st->bucket < 0)
352 return NULL;
353 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
354 rcu_read_lock_bh();
355 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
357 return r;
360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
361 struct rtable *r)
363 struct rt_cache_iter_state *st = seq->private;
364 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
365 if (dev_net(r->dst.dev) != seq_file_net(seq))
366 continue;
367 if (r->rt_genid == st->genid)
368 break;
370 return r;
373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
375 struct rtable *r = rt_cache_get_first(seq);
377 if (r)
378 while (pos && (r = rt_cache_get_next(seq, r)))
379 --pos;
380 return pos ? NULL : r;
383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
385 struct rt_cache_iter_state *st = seq->private;
386 if (*pos)
387 return rt_cache_get_idx(seq, *pos - 1);
388 st->genid = rt_genid(seq_file_net(seq));
389 return SEQ_START_TOKEN;
392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
394 struct rtable *r;
396 if (v == SEQ_START_TOKEN)
397 r = rt_cache_get_first(seq);
398 else
399 r = rt_cache_get_next(seq, v);
400 ++*pos;
401 return r;
404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
406 if (v && v != SEQ_START_TOKEN)
407 rcu_read_unlock_bh();
410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
412 if (v == SEQ_START_TOKEN)
413 seq_printf(seq, "%-127s\n",
414 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416 "HHUptod\tSpecDst");
417 else {
418 struct rtable *r = v;
419 int len;
421 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
422 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
423 r->dst.dev ? r->dst.dev->name : "*",
424 (__force u32)r->rt_dst,
425 (__force u32)r->rt_gateway,
426 r->rt_flags, atomic_read(&r->dst.__refcnt),
427 r->dst.__use, 0, (__force u32)r->rt_src,
428 dst_metric_advmss(&r->dst) + 40,
429 dst_metric(&r->dst, RTAX_WINDOW),
430 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
431 dst_metric(&r->dst, RTAX_RTTVAR)),
432 r->rt_key_tos,
433 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
434 r->dst.hh ? (r->dst.hh->hh_output ==
435 dev_queue_xmit) : 0,
436 r->rt_spec_dst, &len);
438 seq_printf(seq, "%*s\n", 127 - len, "");
440 return 0;
443 static const struct seq_operations rt_cache_seq_ops = {
444 .start = rt_cache_seq_start,
445 .next = rt_cache_seq_next,
446 .stop = rt_cache_seq_stop,
447 .show = rt_cache_seq_show,
450 static int rt_cache_seq_open(struct inode *inode, struct file *file)
452 return seq_open_net(inode, file, &rt_cache_seq_ops,
453 sizeof(struct rt_cache_iter_state));
456 static const struct file_operations rt_cache_seq_fops = {
457 .owner = THIS_MODULE,
458 .open = rt_cache_seq_open,
459 .read = seq_read,
460 .llseek = seq_lseek,
461 .release = seq_release_net,
465 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
467 int cpu;
469 if (*pos == 0)
470 return SEQ_START_TOKEN;
472 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
473 if (!cpu_possible(cpu))
474 continue;
475 *pos = cpu+1;
476 return &per_cpu(rt_cache_stat, cpu);
478 return NULL;
481 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
483 int cpu;
485 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
486 if (!cpu_possible(cpu))
487 continue;
488 *pos = cpu+1;
489 return &per_cpu(rt_cache_stat, cpu);
491 return NULL;
495 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
500 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
502 struct rt_cache_stat *st = v;
504 if (v == SEQ_START_TOKEN) {
505 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
506 return 0;
509 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
510 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
511 dst_entries_get_slow(&ipv4_dst_ops),
512 st->in_hit,
513 st->in_slow_tot,
514 st->in_slow_mc,
515 st->in_no_route,
516 st->in_brd,
517 st->in_martian_dst,
518 st->in_martian_src,
520 st->out_hit,
521 st->out_slow_tot,
522 st->out_slow_mc,
524 st->gc_total,
525 st->gc_ignored,
526 st->gc_goal_miss,
527 st->gc_dst_overflow,
528 st->in_hlist_search,
529 st->out_hlist_search
531 return 0;
534 static const struct seq_operations rt_cpu_seq_ops = {
535 .start = rt_cpu_seq_start,
536 .next = rt_cpu_seq_next,
537 .stop = rt_cpu_seq_stop,
538 .show = rt_cpu_seq_show,
542 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
544 return seq_open(file, &rt_cpu_seq_ops);
547 static const struct file_operations rt_cpu_seq_fops = {
548 .owner = THIS_MODULE,
549 .open = rt_cpu_seq_open,
550 .read = seq_read,
551 .llseek = seq_lseek,
552 .release = seq_release,
555 #ifdef CONFIG_IP_ROUTE_CLASSID
556 static int rt_acct_proc_show(struct seq_file *m, void *v)
558 struct ip_rt_acct *dst, *src;
559 unsigned int i, j;
561 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
562 if (!dst)
563 return -ENOMEM;
565 for_each_possible_cpu(i) {
566 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
567 for (j = 0; j < 256; j++) {
568 dst[j].o_bytes += src[j].o_bytes;
569 dst[j].o_packets += src[j].o_packets;
570 dst[j].i_bytes += src[j].i_bytes;
571 dst[j].i_packets += src[j].i_packets;
575 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
576 kfree(dst);
577 return 0;
580 static int rt_acct_proc_open(struct inode *inode, struct file *file)
582 return single_open(file, rt_acct_proc_show, NULL);
585 static const struct file_operations rt_acct_proc_fops = {
586 .owner = THIS_MODULE,
587 .open = rt_acct_proc_open,
588 .read = seq_read,
589 .llseek = seq_lseek,
590 .release = single_release,
592 #endif
594 static int __net_init ip_rt_do_proc_init(struct net *net)
596 struct proc_dir_entry *pde;
598 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
599 &rt_cache_seq_fops);
600 if (!pde)
601 goto err1;
603 pde = proc_create("rt_cache", S_IRUGO,
604 net->proc_net_stat, &rt_cpu_seq_fops);
605 if (!pde)
606 goto err2;
608 #ifdef CONFIG_IP_ROUTE_CLASSID
609 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
610 if (!pde)
611 goto err3;
612 #endif
613 return 0;
615 #ifdef CONFIG_IP_ROUTE_CLASSID
616 err3:
617 remove_proc_entry("rt_cache", net->proc_net_stat);
618 #endif
619 err2:
620 remove_proc_entry("rt_cache", net->proc_net);
621 err1:
622 return -ENOMEM;
625 static void __net_exit ip_rt_do_proc_exit(struct net *net)
627 remove_proc_entry("rt_cache", net->proc_net_stat);
628 remove_proc_entry("rt_cache", net->proc_net);
629 #ifdef CONFIG_IP_ROUTE_CLASSID
630 remove_proc_entry("rt_acct", net->proc_net);
631 #endif
634 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
635 .init = ip_rt_do_proc_init,
636 .exit = ip_rt_do_proc_exit,
639 static int __init ip_rt_proc_init(void)
641 return register_pernet_subsys(&ip_rt_proc_ops);
644 #else
645 static inline int ip_rt_proc_init(void)
647 return 0;
649 #endif /* CONFIG_PROC_FS */
651 static inline void rt_free(struct rtable *rt)
653 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
656 static inline void rt_drop(struct rtable *rt)
658 ip_rt_put(rt);
659 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
662 static inline int rt_fast_clean(struct rtable *rth)
664 /* Kill broadcast/multicast entries very aggresively, if they
665 collide in hash table with more useful entries */
666 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
667 rt_is_input_route(rth) && rth->dst.rt_next;
670 static inline int rt_valuable(struct rtable *rth)
672 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
673 (rth->peer && rth->peer->pmtu_expires);
676 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
678 unsigned long age;
679 int ret = 0;
681 if (atomic_read(&rth->dst.__refcnt))
682 goto out;
684 age = jiffies - rth->dst.lastuse;
685 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
686 (age <= tmo2 && rt_valuable(rth)))
687 goto out;
688 ret = 1;
689 out: return ret;
692 /* Bits of score are:
693 * 31: very valuable
694 * 30: not quite useless
695 * 29..0: usage counter
697 static inline u32 rt_score(struct rtable *rt)
699 u32 score = jiffies - rt->dst.lastuse;
701 score = ~score & ~(3<<30);
703 if (rt_valuable(rt))
704 score |= (1<<31);
706 if (rt_is_output_route(rt) ||
707 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
708 score |= (1<<30);
710 return score;
713 static inline bool rt_caching(const struct net *net)
715 return net->ipv4.current_rt_cache_rebuild_count <=
716 net->ipv4.sysctl_rt_cache_rebuild_count;
719 static inline bool compare_hash_inputs(const struct rtable *rt1,
720 const struct rtable *rt2)
722 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
723 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
724 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
727 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
729 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
731 (rt1->rt_mark ^ rt2->rt_mark) |
732 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
733 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
734 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
737 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
739 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
742 static inline int rt_is_expired(struct rtable *rth)
744 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
748 * Perform a full scan of hash table and free all entries.
749 * Can be called by a softirq or a process.
750 * In the later case, we want to be reschedule if necessary
752 static void rt_do_flush(struct net *net, int process_context)
754 unsigned int i;
755 struct rtable *rth, *next;
757 for (i = 0; i <= rt_hash_mask; i++) {
758 struct rtable __rcu **pprev;
759 struct rtable *list;
761 if (process_context && need_resched())
762 cond_resched();
763 rth = rcu_dereference_raw(rt_hash_table[i].chain);
764 if (!rth)
765 continue;
767 spin_lock_bh(rt_hash_lock_addr(i));
769 list = NULL;
770 pprev = &rt_hash_table[i].chain;
771 rth = rcu_dereference_protected(*pprev,
772 lockdep_is_held(rt_hash_lock_addr(i)));
774 while (rth) {
775 next = rcu_dereference_protected(rth->dst.rt_next,
776 lockdep_is_held(rt_hash_lock_addr(i)));
778 if (!net ||
779 net_eq(dev_net(rth->dst.dev), net)) {
780 rcu_assign_pointer(*pprev, next);
781 rcu_assign_pointer(rth->dst.rt_next, list);
782 list = rth;
783 } else {
784 pprev = &rth->dst.rt_next;
786 rth = next;
789 spin_unlock_bh(rt_hash_lock_addr(i));
791 for (; list; list = next) {
792 next = rcu_dereference_protected(list->dst.rt_next, 1);
793 rt_free(list);
799 * While freeing expired entries, we compute average chain length
800 * and standard deviation, using fixed-point arithmetic.
801 * This to have an estimation of rt_chain_length_max
802 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
803 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
806 #define FRACT_BITS 3
807 #define ONE (1UL << FRACT_BITS)
810 * Given a hash chain and an item in this hash chain,
811 * find if a previous entry has the same hash_inputs
812 * (but differs on tos, mark or oif)
813 * Returns 0 if an alias is found.
814 * Returns ONE if rth has no alias before itself.
816 static int has_noalias(const struct rtable *head, const struct rtable *rth)
818 const struct rtable *aux = head;
820 while (aux != rth) {
821 if (compare_hash_inputs(aux, rth))
822 return 0;
823 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
825 return ONE;
828 static void rt_check_expire(void)
830 static unsigned int rover;
831 unsigned int i = rover, goal;
832 struct rtable *rth;
833 struct rtable __rcu **rthp;
834 unsigned long samples = 0;
835 unsigned long sum = 0, sum2 = 0;
836 unsigned long delta;
837 u64 mult;
839 delta = jiffies - expires_ljiffies;
840 expires_ljiffies = jiffies;
841 mult = ((u64)delta) << rt_hash_log;
842 if (ip_rt_gc_timeout > 1)
843 do_div(mult, ip_rt_gc_timeout);
844 goal = (unsigned int)mult;
845 if (goal > rt_hash_mask)
846 goal = rt_hash_mask + 1;
847 for (; goal > 0; goal--) {
848 unsigned long tmo = ip_rt_gc_timeout;
849 unsigned long length;
851 i = (i + 1) & rt_hash_mask;
852 rthp = &rt_hash_table[i].chain;
854 if (need_resched())
855 cond_resched();
857 samples++;
859 if (rcu_dereference_raw(*rthp) == NULL)
860 continue;
861 length = 0;
862 spin_lock_bh(rt_hash_lock_addr(i));
863 while ((rth = rcu_dereference_protected(*rthp,
864 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
865 prefetch(rth->dst.rt_next);
866 if (rt_is_expired(rth)) {
867 *rthp = rth->dst.rt_next;
868 rt_free(rth);
869 continue;
871 if (rth->dst.expires) {
872 /* Entry is expired even if it is in use */
873 if (time_before_eq(jiffies, rth->dst.expires)) {
874 nofree:
875 tmo >>= 1;
876 rthp = &rth->dst.rt_next;
878 * We only count entries on
879 * a chain with equal hash inputs once
880 * so that entries for different QOS
881 * levels, and other non-hash input
882 * attributes don't unfairly skew
883 * the length computation
885 length += has_noalias(rt_hash_table[i].chain, rth);
886 continue;
888 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
889 goto nofree;
891 /* Cleanup aged off entries. */
892 *rthp = rth->dst.rt_next;
893 rt_free(rth);
895 spin_unlock_bh(rt_hash_lock_addr(i));
896 sum += length;
897 sum2 += length*length;
899 if (samples) {
900 unsigned long avg = sum / samples;
901 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
902 rt_chain_length_max = max_t(unsigned long,
903 ip_rt_gc_elasticity,
904 (avg + 4*sd) >> FRACT_BITS);
906 rover = i;
910 * rt_worker_func() is run in process context.
911 * we call rt_check_expire() to scan part of the hash table
913 static void rt_worker_func(struct work_struct *work)
915 rt_check_expire();
916 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
920 * Perturbation of rt_genid by a small quantity [1..256]
921 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
922 * many times (2^24) without giving recent rt_genid.
923 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
925 static void rt_cache_invalidate(struct net *net)
927 unsigned char shuffle;
929 get_random_bytes(&shuffle, sizeof(shuffle));
930 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
934 * delay < 0 : invalidate cache (fast : entries will be deleted later)
935 * delay >= 0 : invalidate & flush cache (can be long)
937 void rt_cache_flush(struct net *net, int delay)
939 rt_cache_invalidate(net);
940 if (delay >= 0)
941 rt_do_flush(net, !in_softirq());
944 /* Flush previous cache invalidated entries from the cache */
945 void rt_cache_flush_batch(struct net *net)
947 rt_do_flush(net, !in_softirq());
950 static void rt_emergency_hash_rebuild(struct net *net)
952 if (net_ratelimit())
953 printk(KERN_WARNING "Route hash chain too long!\n");
954 rt_cache_invalidate(net);
958 Short description of GC goals.
960 We want to build algorithm, which will keep routing cache
961 at some equilibrium point, when number of aged off entries
962 is kept approximately equal to newly generated ones.
964 Current expiration strength is variable "expire".
965 We try to adjust it dynamically, so that if networking
966 is idle expires is large enough to keep enough of warm entries,
967 and when load increases it reduces to limit cache size.
970 static int rt_garbage_collect(struct dst_ops *ops)
972 static unsigned long expire = RT_GC_TIMEOUT;
973 static unsigned long last_gc;
974 static int rover;
975 static int equilibrium;
976 struct rtable *rth;
977 struct rtable __rcu **rthp;
978 unsigned long now = jiffies;
979 int goal;
980 int entries = dst_entries_get_fast(&ipv4_dst_ops);
983 * Garbage collection is pretty expensive,
984 * do not make it too frequently.
987 RT_CACHE_STAT_INC(gc_total);
989 if (now - last_gc < ip_rt_gc_min_interval &&
990 entries < ip_rt_max_size) {
991 RT_CACHE_STAT_INC(gc_ignored);
992 goto out;
995 entries = dst_entries_get_slow(&ipv4_dst_ops);
996 /* Calculate number of entries, which we want to expire now. */
997 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
998 if (goal <= 0) {
999 if (equilibrium < ipv4_dst_ops.gc_thresh)
1000 equilibrium = ipv4_dst_ops.gc_thresh;
1001 goal = entries - equilibrium;
1002 if (goal > 0) {
1003 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1004 goal = entries - equilibrium;
1006 } else {
1007 /* We are in dangerous area. Try to reduce cache really
1008 * aggressively.
1010 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011 equilibrium = entries - goal;
1014 if (now - last_gc >= ip_rt_gc_min_interval)
1015 last_gc = now;
1017 if (goal <= 0) {
1018 equilibrium += goal;
1019 goto work_done;
1022 do {
1023 int i, k;
1025 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1026 unsigned long tmo = expire;
1028 k = (k + 1) & rt_hash_mask;
1029 rthp = &rt_hash_table[k].chain;
1030 spin_lock_bh(rt_hash_lock_addr(k));
1031 while ((rth = rcu_dereference_protected(*rthp,
1032 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1033 if (!rt_is_expired(rth) &&
1034 !rt_may_expire(rth, tmo, expire)) {
1035 tmo >>= 1;
1036 rthp = &rth->dst.rt_next;
1037 continue;
1039 *rthp = rth->dst.rt_next;
1040 rt_free(rth);
1041 goal--;
1043 spin_unlock_bh(rt_hash_lock_addr(k));
1044 if (goal <= 0)
1045 break;
1047 rover = k;
1049 if (goal <= 0)
1050 goto work_done;
1052 /* Goal is not achieved. We stop process if:
1054 - if expire reduced to zero. Otherwise, expire is halfed.
1055 - if table is not full.
1056 - if we are called from interrupt.
1057 - jiffies check is just fallback/debug loop breaker.
1058 We will not spin here for long time in any case.
1061 RT_CACHE_STAT_INC(gc_goal_miss);
1063 if (expire == 0)
1064 break;
1066 expire >>= 1;
1068 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1069 goto out;
1070 } while (!in_softirq() && time_before_eq(jiffies, now));
1072 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1073 goto out;
1074 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1075 goto out;
1076 if (net_ratelimit())
1077 printk(KERN_WARNING "dst cache overflow\n");
1078 RT_CACHE_STAT_INC(gc_dst_overflow);
1079 return 1;
1081 work_done:
1082 expire += ip_rt_gc_min_interval;
1083 if (expire > ip_rt_gc_timeout ||
1084 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1085 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1086 expire = ip_rt_gc_timeout;
1087 out: return 0;
1091 * Returns number of entries in a hash chain that have different hash_inputs
1093 static int slow_chain_length(const struct rtable *head)
1095 int length = 0;
1096 const struct rtable *rth = head;
1098 while (rth) {
1099 length += has_noalias(head, rth);
1100 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1102 return length >> FRACT_BITS;
1105 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1106 struct sk_buff *skb, int ifindex)
1108 struct rtable *rth, *cand;
1109 struct rtable __rcu **rthp, **candp;
1110 unsigned long now;
1111 u32 min_score;
1112 int chain_length;
1113 int attempts = !in_softirq();
1115 restart:
1116 chain_length = 0;
1117 min_score = ~(u32)0;
1118 cand = NULL;
1119 candp = NULL;
1120 now = jiffies;
1122 if (!rt_caching(dev_net(rt->dst.dev))) {
1124 * If we're not caching, just tell the caller we
1125 * were successful and don't touch the route. The
1126 * caller hold the sole reference to the cache entry, and
1127 * it will be released when the caller is done with it.
1128 * If we drop it here, the callers have no way to resolve routes
1129 * when we're not caching. Instead, just point *rp at rt, so
1130 * the caller gets a single use out of the route
1131 * Note that we do rt_free on this new route entry, so that
1132 * once its refcount hits zero, we are still able to reap it
1133 * (Thanks Alexey)
1134 * Note: To avoid expensive rcu stuff for this uncached dst,
1135 * we set DST_NOCACHE so that dst_release() can free dst without
1136 * waiting a grace period.
1139 rt->dst.flags |= DST_NOCACHE;
1140 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1141 int err = arp_bind_neighbour(&rt->dst);
1142 if (err) {
1143 if (net_ratelimit())
1144 printk(KERN_WARNING
1145 "Neighbour table failure & not caching routes.\n");
1146 ip_rt_put(rt);
1147 return ERR_PTR(err);
1151 goto skip_hashing;
1154 rthp = &rt_hash_table[hash].chain;
1156 spin_lock_bh(rt_hash_lock_addr(hash));
1157 while ((rth = rcu_dereference_protected(*rthp,
1158 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1159 if (rt_is_expired(rth)) {
1160 *rthp = rth->dst.rt_next;
1161 rt_free(rth);
1162 continue;
1164 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1165 /* Put it first */
1166 *rthp = rth->dst.rt_next;
1168 * Since lookup is lockfree, the deletion
1169 * must be visible to another weakly ordered CPU before
1170 * the insertion at the start of the hash chain.
1172 rcu_assign_pointer(rth->dst.rt_next,
1173 rt_hash_table[hash].chain);
1175 * Since lookup is lockfree, the update writes
1176 * must be ordered for consistency on SMP.
1178 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1180 dst_use(&rth->dst, now);
1181 spin_unlock_bh(rt_hash_lock_addr(hash));
1183 rt_drop(rt);
1184 if (skb)
1185 skb_dst_set(skb, &rth->dst);
1186 return rth;
1189 if (!atomic_read(&rth->dst.__refcnt)) {
1190 u32 score = rt_score(rth);
1192 if (score <= min_score) {
1193 cand = rth;
1194 candp = rthp;
1195 min_score = score;
1199 chain_length++;
1201 rthp = &rth->dst.rt_next;
1204 if (cand) {
1205 /* ip_rt_gc_elasticity used to be average length of chain
1206 * length, when exceeded gc becomes really aggressive.
1208 * The second limit is less certain. At the moment it allows
1209 * only 2 entries per bucket. We will see.
1211 if (chain_length > ip_rt_gc_elasticity) {
1212 *candp = cand->dst.rt_next;
1213 rt_free(cand);
1215 } else {
1216 if (chain_length > rt_chain_length_max &&
1217 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1218 struct net *net = dev_net(rt->dst.dev);
1219 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1220 if (!rt_caching(net)) {
1221 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1222 rt->dst.dev->name, num);
1224 rt_emergency_hash_rebuild(net);
1225 spin_unlock_bh(rt_hash_lock_addr(hash));
1227 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1228 ifindex, rt_genid(net));
1229 goto restart;
1233 /* Try to bind route to arp only if it is output
1234 route or unicast forwarding path.
1236 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1237 int err = arp_bind_neighbour(&rt->dst);
1238 if (err) {
1239 spin_unlock_bh(rt_hash_lock_addr(hash));
1241 if (err != -ENOBUFS) {
1242 rt_drop(rt);
1243 return ERR_PTR(err);
1246 /* Neighbour tables are full and nothing
1247 can be released. Try to shrink route cache,
1248 it is most likely it holds some neighbour records.
1250 if (attempts-- > 0) {
1251 int saved_elasticity = ip_rt_gc_elasticity;
1252 int saved_int = ip_rt_gc_min_interval;
1253 ip_rt_gc_elasticity = 1;
1254 ip_rt_gc_min_interval = 0;
1255 rt_garbage_collect(&ipv4_dst_ops);
1256 ip_rt_gc_min_interval = saved_int;
1257 ip_rt_gc_elasticity = saved_elasticity;
1258 goto restart;
1261 if (net_ratelimit())
1262 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1263 rt_drop(rt);
1264 return ERR_PTR(-ENOBUFS);
1268 rt->dst.rt_next = rt_hash_table[hash].chain;
1271 * Since lookup is lockfree, we must make sure
1272 * previous writes to rt are committed to memory
1273 * before making rt visible to other CPUS.
1275 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1277 spin_unlock_bh(rt_hash_lock_addr(hash));
1279 skip_hashing:
1280 if (skb)
1281 skb_dst_set(skb, &rt->dst);
1282 return rt;
1285 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1287 static u32 rt_peer_genid(void)
1289 return atomic_read(&__rt_peer_genid);
1292 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1294 struct inet_peer *peer;
1296 peer = inet_getpeer_v4(daddr, create);
1298 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1299 inet_putpeer(peer);
1300 else
1301 rt->rt_peer_genid = rt_peer_genid();
1305 * Peer allocation may fail only in serious out-of-memory conditions. However
1306 * we still can generate some output.
1307 * Random ID selection looks a bit dangerous because we have no chances to
1308 * select ID being unique in a reasonable period of time.
1309 * But broken packet identifier may be better than no packet at all.
1311 static void ip_select_fb_ident(struct iphdr *iph)
1313 static DEFINE_SPINLOCK(ip_fb_id_lock);
1314 static u32 ip_fallback_id;
1315 u32 salt;
1317 spin_lock_bh(&ip_fb_id_lock);
1318 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1319 iph->id = htons(salt & 0xFFFF);
1320 ip_fallback_id = salt;
1321 spin_unlock_bh(&ip_fb_id_lock);
1324 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1326 struct rtable *rt = (struct rtable *) dst;
1328 if (rt) {
1329 if (rt->peer == NULL)
1330 rt_bind_peer(rt, rt->rt_dst, 1);
1332 /* If peer is attached to destination, it is never detached,
1333 so that we need not to grab a lock to dereference it.
1335 if (rt->peer) {
1336 iph->id = htons(inet_getid(rt->peer, more));
1337 return;
1339 } else
1340 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1341 __builtin_return_address(0));
1343 ip_select_fb_ident(iph);
1345 EXPORT_SYMBOL(__ip_select_ident);
1347 static void rt_del(unsigned hash, struct rtable *rt)
1349 struct rtable __rcu **rthp;
1350 struct rtable *aux;
1352 rthp = &rt_hash_table[hash].chain;
1353 spin_lock_bh(rt_hash_lock_addr(hash));
1354 ip_rt_put(rt);
1355 while ((aux = rcu_dereference_protected(*rthp,
1356 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1357 if (aux == rt || rt_is_expired(aux)) {
1358 *rthp = aux->dst.rt_next;
1359 rt_free(aux);
1360 continue;
1362 rthp = &aux->dst.rt_next;
1364 spin_unlock_bh(rt_hash_lock_addr(hash));
1367 /* called in rcu_read_lock() section */
1368 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1369 __be32 saddr, struct net_device *dev)
1371 struct in_device *in_dev = __in_dev_get_rcu(dev);
1372 struct inet_peer *peer;
1373 struct net *net;
1375 if (!in_dev)
1376 return;
1378 net = dev_net(dev);
1379 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1380 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1381 ipv4_is_zeronet(new_gw))
1382 goto reject_redirect;
1384 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1385 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1386 goto reject_redirect;
1387 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1388 goto reject_redirect;
1389 } else {
1390 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1391 goto reject_redirect;
1394 peer = inet_getpeer_v4(daddr, 1);
1395 if (peer) {
1396 peer->redirect_learned.a4 = new_gw;
1398 inet_putpeer(peer);
1400 atomic_inc(&__rt_peer_genid);
1402 return;
1404 reject_redirect:
1405 #ifdef CONFIG_IP_ROUTE_VERBOSE
1406 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1407 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1408 " Advised path = %pI4 -> %pI4\n",
1409 &old_gw, dev->name, &new_gw,
1410 &saddr, &daddr);
1411 #endif
1415 static bool peer_pmtu_expired(struct inet_peer *peer)
1417 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1419 return orig &&
1420 time_after_eq(jiffies, orig) &&
1421 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1424 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1426 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1428 return orig &&
1429 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1432 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1434 struct rtable *rt = (struct rtable *)dst;
1435 struct dst_entry *ret = dst;
1437 if (rt) {
1438 if (dst->obsolete > 0) {
1439 ip_rt_put(rt);
1440 ret = NULL;
1441 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1442 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1443 rt->rt_oif,
1444 rt_genid(dev_net(dst->dev)));
1445 rt_del(hash, rt);
1446 ret = NULL;
1447 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1448 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1451 return ret;
1455 * Algorithm:
1456 * 1. The first ip_rt_redirect_number redirects are sent
1457 * with exponential backoff, then we stop sending them at all,
1458 * assuming that the host ignores our redirects.
1459 * 2. If we did not see packets requiring redirects
1460 * during ip_rt_redirect_silence, we assume that the host
1461 * forgot redirected route and start to send redirects again.
1463 * This algorithm is much cheaper and more intelligent than dumb load limiting
1464 * in icmp.c.
1466 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1467 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1470 void ip_rt_send_redirect(struct sk_buff *skb)
1472 struct rtable *rt = skb_rtable(skb);
1473 struct in_device *in_dev;
1474 struct inet_peer *peer;
1475 int log_martians;
1477 rcu_read_lock();
1478 in_dev = __in_dev_get_rcu(rt->dst.dev);
1479 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1480 rcu_read_unlock();
1481 return;
1483 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1484 rcu_read_unlock();
1486 if (!rt->peer)
1487 rt_bind_peer(rt, rt->rt_dst, 1);
1488 peer = rt->peer;
1489 if (!peer) {
1490 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1491 return;
1494 /* No redirected packets during ip_rt_redirect_silence;
1495 * reset the algorithm.
1497 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1498 peer->rate_tokens = 0;
1500 /* Too many ignored redirects; do not send anything
1501 * set dst.rate_last to the last seen redirected packet.
1503 if (peer->rate_tokens >= ip_rt_redirect_number) {
1504 peer->rate_last = jiffies;
1505 return;
1508 /* Check for load limit; set rate_last to the latest sent
1509 * redirect.
1511 if (peer->rate_tokens == 0 ||
1512 time_after(jiffies,
1513 (peer->rate_last +
1514 (ip_rt_redirect_load << peer->rate_tokens)))) {
1515 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1516 peer->rate_last = jiffies;
1517 ++peer->rate_tokens;
1518 #ifdef CONFIG_IP_ROUTE_VERBOSE
1519 if (log_martians &&
1520 peer->rate_tokens == ip_rt_redirect_number &&
1521 net_ratelimit())
1522 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1523 &ip_hdr(skb)->saddr, rt->rt_iif,
1524 &rt->rt_dst, &rt->rt_gateway);
1525 #endif
1529 static int ip_error(struct sk_buff *skb)
1531 struct rtable *rt = skb_rtable(skb);
1532 struct inet_peer *peer;
1533 unsigned long now;
1534 bool send;
1535 int code;
1537 switch (rt->dst.error) {
1538 case EINVAL:
1539 default:
1540 goto out;
1541 case EHOSTUNREACH:
1542 code = ICMP_HOST_UNREACH;
1543 break;
1544 case ENETUNREACH:
1545 code = ICMP_NET_UNREACH;
1546 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1547 IPSTATS_MIB_INNOROUTES);
1548 break;
1549 case EACCES:
1550 code = ICMP_PKT_FILTERED;
1551 break;
1554 if (!rt->peer)
1555 rt_bind_peer(rt, rt->rt_dst, 1);
1556 peer = rt->peer;
1558 send = true;
1559 if (peer) {
1560 now = jiffies;
1561 peer->rate_tokens += now - peer->rate_last;
1562 if (peer->rate_tokens > ip_rt_error_burst)
1563 peer->rate_tokens = ip_rt_error_burst;
1564 peer->rate_last = now;
1565 if (peer->rate_tokens >= ip_rt_error_cost)
1566 peer->rate_tokens -= ip_rt_error_cost;
1567 else
1568 send = false;
1570 if (send)
1571 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1573 out: kfree_skb(skb);
1574 return 0;
1578 * The last two values are not from the RFC but
1579 * are needed for AMPRnet AX.25 paths.
1582 static const unsigned short mtu_plateau[] =
1583 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1585 static inline unsigned short guess_mtu(unsigned short old_mtu)
1587 int i;
1589 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1590 if (old_mtu > mtu_plateau[i])
1591 return mtu_plateau[i];
1592 return 68;
1595 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1596 unsigned short new_mtu,
1597 struct net_device *dev)
1599 unsigned short old_mtu = ntohs(iph->tot_len);
1600 unsigned short est_mtu = 0;
1601 struct inet_peer *peer;
1603 peer = inet_getpeer_v4(iph->daddr, 1);
1604 if (peer) {
1605 unsigned short mtu = new_mtu;
1607 if (new_mtu < 68 || new_mtu >= old_mtu) {
1608 /* BSD 4.2 derived systems incorrectly adjust
1609 * tot_len by the IP header length, and report
1610 * a zero MTU in the ICMP message.
1612 if (mtu == 0 &&
1613 old_mtu >= 68 + (iph->ihl << 2))
1614 old_mtu -= iph->ihl << 2;
1615 mtu = guess_mtu(old_mtu);
1618 if (mtu < ip_rt_min_pmtu)
1619 mtu = ip_rt_min_pmtu;
1620 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1621 unsigned long pmtu_expires;
1623 pmtu_expires = jiffies + ip_rt_mtu_expires;
1624 if (!pmtu_expires)
1625 pmtu_expires = 1UL;
1627 est_mtu = mtu;
1628 peer->pmtu_learned = mtu;
1629 peer->pmtu_expires = pmtu_expires;
1632 inet_putpeer(peer);
1634 atomic_inc(&__rt_peer_genid);
1636 return est_mtu ? : new_mtu;
1639 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1641 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1643 if (!expires)
1644 return;
1645 if (time_before(jiffies, expires)) {
1646 u32 orig_dst_mtu = dst_mtu(dst);
1647 if (peer->pmtu_learned < orig_dst_mtu) {
1648 if (!peer->pmtu_orig)
1649 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1650 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1652 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1653 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1656 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1658 struct rtable *rt = (struct rtable *) dst;
1659 struct inet_peer *peer;
1661 dst_confirm(dst);
1663 if (!rt->peer)
1664 rt_bind_peer(rt, rt->rt_dst, 1);
1665 peer = rt->peer;
1666 if (peer) {
1667 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1669 if (mtu < ip_rt_min_pmtu)
1670 mtu = ip_rt_min_pmtu;
1671 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1673 pmtu_expires = jiffies + ip_rt_mtu_expires;
1674 if (!pmtu_expires)
1675 pmtu_expires = 1UL;
1677 peer->pmtu_learned = mtu;
1678 peer->pmtu_expires = pmtu_expires;
1680 atomic_inc(&__rt_peer_genid);
1681 rt->rt_peer_genid = rt_peer_genid();
1683 check_peer_pmtu(dst, peer);
1687 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1689 struct rtable *rt = (struct rtable *) dst;
1690 __be32 orig_gw = rt->rt_gateway;
1692 dst_confirm(&rt->dst);
1694 neigh_release(rt->dst.neighbour);
1695 rt->dst.neighbour = NULL;
1697 rt->rt_gateway = peer->redirect_learned.a4;
1698 if (arp_bind_neighbour(&rt->dst) ||
1699 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1700 if (rt->dst.neighbour)
1701 neigh_event_send(rt->dst.neighbour, NULL);
1702 rt->rt_gateway = orig_gw;
1703 return -EAGAIN;
1704 } else {
1705 rt->rt_flags |= RTCF_REDIRECTED;
1706 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1707 rt->dst.neighbour);
1709 return 0;
1712 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1714 struct rtable *rt = (struct rtable *) dst;
1716 if (rt_is_expired(rt))
1717 return NULL;
1718 if (rt->rt_peer_genid != rt_peer_genid()) {
1719 struct inet_peer *peer;
1721 if (!rt->peer)
1722 rt_bind_peer(rt, rt->rt_dst, 0);
1724 peer = rt->peer;
1725 if (peer) {
1726 check_peer_pmtu(dst, peer);
1728 if (peer->redirect_learned.a4 &&
1729 peer->redirect_learned.a4 != rt->rt_gateway) {
1730 if (check_peer_redir(dst, peer))
1731 return NULL;
1735 rt->rt_peer_genid = rt_peer_genid();
1737 return dst;
1740 static void ipv4_dst_destroy(struct dst_entry *dst)
1742 struct rtable *rt = (struct rtable *) dst;
1743 struct inet_peer *peer = rt->peer;
1745 if (rt->fi) {
1746 fib_info_put(rt->fi);
1747 rt->fi = NULL;
1749 if (peer) {
1750 rt->peer = NULL;
1751 inet_putpeer(peer);
1756 static void ipv4_link_failure(struct sk_buff *skb)
1758 struct rtable *rt;
1760 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1762 rt = skb_rtable(skb);
1763 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1764 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1767 static int ip_rt_bug(struct sk_buff *skb)
1769 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1770 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1771 skb->dev ? skb->dev->name : "?");
1772 kfree_skb(skb);
1773 WARN_ON(1);
1774 return 0;
1778 We do not cache source address of outgoing interface,
1779 because it is used only by IP RR, TS and SRR options,
1780 so that it out of fast path.
1782 BTW remember: "addr" is allowed to be not aligned
1783 in IP options!
1786 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1788 __be32 src;
1790 if (rt_is_output_route(rt))
1791 src = ip_hdr(skb)->saddr;
1792 else {
1793 struct fib_result res;
1794 struct flowi4 fl4;
1795 struct iphdr *iph;
1797 iph = ip_hdr(skb);
1799 memset(&fl4, 0, sizeof(fl4));
1800 fl4.daddr = iph->daddr;
1801 fl4.saddr = iph->saddr;
1802 fl4.flowi4_tos = RT_TOS(iph->tos);
1803 fl4.flowi4_oif = rt->dst.dev->ifindex;
1804 fl4.flowi4_iif = skb->dev->ifindex;
1805 fl4.flowi4_mark = skb->mark;
1807 rcu_read_lock();
1808 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1809 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1810 else
1811 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1812 RT_SCOPE_UNIVERSE);
1813 rcu_read_unlock();
1815 memcpy(addr, &src, 4);
1818 #ifdef CONFIG_IP_ROUTE_CLASSID
1819 static void set_class_tag(struct rtable *rt, u32 tag)
1821 if (!(rt->dst.tclassid & 0xFFFF))
1822 rt->dst.tclassid |= tag & 0xFFFF;
1823 if (!(rt->dst.tclassid & 0xFFFF0000))
1824 rt->dst.tclassid |= tag & 0xFFFF0000;
1826 #endif
1828 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1830 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1832 if (advmss == 0) {
1833 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1834 ip_rt_min_advmss);
1835 if (advmss > 65535 - 40)
1836 advmss = 65535 - 40;
1838 return advmss;
1841 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1843 unsigned int mtu = dst->dev->mtu;
1845 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1846 const struct rtable *rt = (const struct rtable *) dst;
1848 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1849 mtu = 576;
1852 if (mtu > IP_MAX_MTU)
1853 mtu = IP_MAX_MTU;
1855 return mtu;
1858 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1859 struct fib_info *fi)
1861 struct inet_peer *peer;
1862 int create = 0;
1864 /* If a peer entry exists for this destination, we must hook
1865 * it up in order to get at cached metrics.
1867 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1868 create = 1;
1870 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1871 if (peer) {
1872 rt->rt_peer_genid = rt_peer_genid();
1873 if (inet_metrics_new(peer))
1874 memcpy(peer->metrics, fi->fib_metrics,
1875 sizeof(u32) * RTAX_MAX);
1876 dst_init_metrics(&rt->dst, peer->metrics, false);
1878 check_peer_pmtu(&rt->dst, peer);
1879 if (peer->redirect_learned.a4 &&
1880 peer->redirect_learned.a4 != rt->rt_gateway) {
1881 rt->rt_gateway = peer->redirect_learned.a4;
1882 rt->rt_flags |= RTCF_REDIRECTED;
1884 } else {
1885 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1886 rt->fi = fi;
1887 atomic_inc(&fi->fib_clntref);
1889 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1893 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1894 const struct fib_result *res,
1895 struct fib_info *fi, u16 type, u32 itag)
1897 struct dst_entry *dst = &rt->dst;
1899 if (fi) {
1900 if (FIB_RES_GW(*res) &&
1901 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1902 rt->rt_gateway = FIB_RES_GW(*res);
1903 rt_init_metrics(rt, fl4, fi);
1904 #ifdef CONFIG_IP_ROUTE_CLASSID
1905 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1906 #endif
1909 if (dst_mtu(dst) > IP_MAX_MTU)
1910 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1911 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1912 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1914 #ifdef CONFIG_IP_ROUTE_CLASSID
1915 #ifdef CONFIG_IP_MULTIPLE_TABLES
1916 set_class_tag(rt, fib_rules_tclass(res));
1917 #endif
1918 set_class_tag(rt, itag);
1919 #endif
1922 static struct rtable *rt_dst_alloc(struct net_device *dev,
1923 bool nopolicy, bool noxfrm)
1925 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1926 DST_HOST |
1927 (nopolicy ? DST_NOPOLICY : 0) |
1928 (noxfrm ? DST_NOXFRM : 0));
1931 /* called in rcu_read_lock() section */
1932 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1933 u8 tos, struct net_device *dev, int our)
1935 unsigned int hash;
1936 struct rtable *rth;
1937 __be32 spec_dst;
1938 struct in_device *in_dev = __in_dev_get_rcu(dev);
1939 u32 itag = 0;
1940 int err;
1942 /* Primary sanity checks. */
1944 if (in_dev == NULL)
1945 return -EINVAL;
1947 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1948 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1949 goto e_inval;
1951 if (ipv4_is_zeronet(saddr)) {
1952 if (!ipv4_is_local_multicast(daddr))
1953 goto e_inval;
1954 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1955 } else {
1956 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1957 &itag);
1958 if (err < 0)
1959 goto e_err;
1961 rth = rt_dst_alloc(init_net.loopback_dev,
1962 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1963 if (!rth)
1964 goto e_nobufs;
1966 #ifdef CONFIG_IP_ROUTE_CLASSID
1967 rth->dst.tclassid = itag;
1968 #endif
1969 rth->dst.output = ip_rt_bug;
1971 rth->rt_key_dst = daddr;
1972 rth->rt_key_src = saddr;
1973 rth->rt_genid = rt_genid(dev_net(dev));
1974 rth->rt_flags = RTCF_MULTICAST;
1975 rth->rt_type = RTN_MULTICAST;
1976 rth->rt_key_tos = tos;
1977 rth->rt_dst = daddr;
1978 rth->rt_src = saddr;
1979 rth->rt_route_iif = dev->ifindex;
1980 rth->rt_iif = dev->ifindex;
1981 rth->rt_oif = 0;
1982 rth->rt_mark = skb->mark;
1983 rth->rt_gateway = daddr;
1984 rth->rt_spec_dst= spec_dst;
1985 rth->rt_peer_genid = 0;
1986 rth->peer = NULL;
1987 rth->fi = NULL;
1988 if (our) {
1989 rth->dst.input= ip_local_deliver;
1990 rth->rt_flags |= RTCF_LOCAL;
1993 #ifdef CONFIG_IP_MROUTE
1994 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1995 rth->dst.input = ip_mr_input;
1996 #endif
1997 RT_CACHE_STAT_INC(in_slow_mc);
1999 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2000 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2001 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2003 e_nobufs:
2004 return -ENOBUFS;
2005 e_inval:
2006 return -EINVAL;
2007 e_err:
2008 return err;
2012 static void ip_handle_martian_source(struct net_device *dev,
2013 struct in_device *in_dev,
2014 struct sk_buff *skb,
2015 __be32 daddr,
2016 __be32 saddr)
2018 RT_CACHE_STAT_INC(in_martian_src);
2019 #ifdef CONFIG_IP_ROUTE_VERBOSE
2020 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2022 * RFC1812 recommendation, if source is martian,
2023 * the only hint is MAC header.
2025 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2026 &daddr, &saddr, dev->name);
2027 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2028 int i;
2029 const unsigned char *p = skb_mac_header(skb);
2030 printk(KERN_WARNING "ll header: ");
2031 for (i = 0; i < dev->hard_header_len; i++, p++) {
2032 printk("%02x", *p);
2033 if (i < (dev->hard_header_len - 1))
2034 printk(":");
2036 printk("\n");
2039 #endif
2042 /* called in rcu_read_lock() section */
2043 static int __mkroute_input(struct sk_buff *skb,
2044 const struct fib_result *res,
2045 struct in_device *in_dev,
2046 __be32 daddr, __be32 saddr, u32 tos,
2047 struct rtable **result)
2049 struct rtable *rth;
2050 int err;
2051 struct in_device *out_dev;
2052 unsigned int flags = 0;
2053 __be32 spec_dst;
2054 u32 itag;
2056 /* get a working reference to the output device */
2057 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2058 if (out_dev == NULL) {
2059 if (net_ratelimit())
2060 printk(KERN_CRIT "Bug in ip_route_input" \
2061 "_slow(). Please, report\n");
2062 return -EINVAL;
2066 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2067 in_dev->dev, &spec_dst, &itag);
2068 if (err < 0) {
2069 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2070 saddr);
2072 goto cleanup;
2075 if (err)
2076 flags |= RTCF_DIRECTSRC;
2078 if (out_dev == in_dev && err &&
2079 (IN_DEV_SHARED_MEDIA(out_dev) ||
2080 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2081 flags |= RTCF_DOREDIRECT;
2083 if (skb->protocol != htons(ETH_P_IP)) {
2084 /* Not IP (i.e. ARP). Do not create route, if it is
2085 * invalid for proxy arp. DNAT routes are always valid.
2087 * Proxy arp feature have been extended to allow, ARP
2088 * replies back to the same interface, to support
2089 * Private VLAN switch technologies. See arp.c.
2091 if (out_dev == in_dev &&
2092 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2093 err = -EINVAL;
2094 goto cleanup;
2098 rth = rt_dst_alloc(out_dev->dev,
2099 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2100 IN_DEV_CONF_GET(out_dev, NOXFRM));
2101 if (!rth) {
2102 err = -ENOBUFS;
2103 goto cleanup;
2106 rth->rt_key_dst = daddr;
2107 rth->rt_key_src = saddr;
2108 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2109 rth->rt_flags = flags;
2110 rth->rt_type = res->type;
2111 rth->rt_key_tos = tos;
2112 rth->rt_dst = daddr;
2113 rth->rt_src = saddr;
2114 rth->rt_route_iif = in_dev->dev->ifindex;
2115 rth->rt_iif = in_dev->dev->ifindex;
2116 rth->rt_oif = 0;
2117 rth->rt_mark = skb->mark;
2118 rth->rt_gateway = daddr;
2119 rth->rt_spec_dst= spec_dst;
2120 rth->rt_peer_genid = 0;
2121 rth->peer = NULL;
2122 rth->fi = NULL;
2124 rth->dst.input = ip_forward;
2125 rth->dst.output = ip_output;
2127 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2129 *result = rth;
2130 err = 0;
2131 cleanup:
2132 return err;
2135 static int ip_mkroute_input(struct sk_buff *skb,
2136 struct fib_result *res,
2137 const struct flowi4 *fl4,
2138 struct in_device *in_dev,
2139 __be32 daddr, __be32 saddr, u32 tos)
2141 struct rtable* rth = NULL;
2142 int err;
2143 unsigned hash;
2145 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2146 if (res->fi && res->fi->fib_nhs > 1)
2147 fib_select_multipath(res);
2148 #endif
2150 /* create a routing cache entry */
2151 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2152 if (err)
2153 return err;
2155 /* put it into the cache */
2156 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2157 rt_genid(dev_net(rth->dst.dev)));
2158 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2159 if (IS_ERR(rth))
2160 return PTR_ERR(rth);
2161 return 0;
2165 * NOTE. We drop all packets that have a local source
2166 * address, because every properly looped-back packet
2167 * must have the correct destination already attached by the output routine.
2169 * Such an approach solves two big problems:
2170 * 1. Non-simplex devices are handled properly.
2171 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2172 * called with rcu_read_lock()
2175 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2176 u8 tos, struct net_device *dev)
2178 struct fib_result res;
2179 struct in_device *in_dev = __in_dev_get_rcu(dev);
2180 struct flowi4 fl4;
2181 unsigned flags = 0;
2182 u32 itag = 0;
2183 struct rtable * rth;
2184 unsigned hash;
2185 __be32 spec_dst;
2186 int err = -EINVAL;
2187 struct net * net = dev_net(dev);
2189 /* IP on this device is disabled. */
2191 if (!in_dev)
2192 goto out;
2194 /* Check for the most weird martians, which cannot be detected
2195 by fib_lookup.
2198 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2199 ipv4_is_loopback(saddr))
2200 goto martian_source;
2202 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2203 goto brd_input;
2205 /* Accept zero addresses only for limited broadcast;
2206 * I do not even know whether to fix this or not. Waiting for complaints :-)
2208 if (ipv4_is_zeronet(saddr))
2209 goto martian_source;
2211 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2212 goto martian_destination;
2215 * Now we are ready to route packet.
2217 fl4.flowi4_oif = 0;
2218 fl4.flowi4_iif = dev->ifindex;
2219 fl4.flowi4_mark = skb->mark;
2220 fl4.flowi4_tos = tos;
2221 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2222 fl4.daddr = daddr;
2223 fl4.saddr = saddr;
2224 err = fib_lookup(net, &fl4, &res);
2225 if (err != 0) {
2226 if (!IN_DEV_FORWARD(in_dev))
2227 goto e_hostunreach;
2228 goto no_route;
2231 RT_CACHE_STAT_INC(in_slow_tot);
2233 if (res.type == RTN_BROADCAST)
2234 goto brd_input;
2236 if (res.type == RTN_LOCAL) {
2237 err = fib_validate_source(skb, saddr, daddr, tos,
2238 net->loopback_dev->ifindex,
2239 dev, &spec_dst, &itag);
2240 if (err < 0)
2241 goto martian_source_keep_err;
2242 if (err)
2243 flags |= RTCF_DIRECTSRC;
2244 spec_dst = daddr;
2245 goto local_input;
2248 if (!IN_DEV_FORWARD(in_dev))
2249 goto e_hostunreach;
2250 if (res.type != RTN_UNICAST)
2251 goto martian_destination;
2253 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2254 out: return err;
2256 brd_input:
2257 if (skb->protocol != htons(ETH_P_IP))
2258 goto e_inval;
2260 if (ipv4_is_zeronet(saddr))
2261 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2262 else {
2263 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2264 &itag);
2265 if (err < 0)
2266 goto martian_source_keep_err;
2267 if (err)
2268 flags |= RTCF_DIRECTSRC;
2270 flags |= RTCF_BROADCAST;
2271 res.type = RTN_BROADCAST;
2272 RT_CACHE_STAT_INC(in_brd);
2274 local_input:
2275 rth = rt_dst_alloc(net->loopback_dev,
2276 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2277 if (!rth)
2278 goto e_nobufs;
2280 rth->dst.input= ip_local_deliver;
2281 rth->dst.output= ip_rt_bug;
2282 #ifdef CONFIG_IP_ROUTE_CLASSID
2283 rth->dst.tclassid = itag;
2284 #endif
2286 rth->rt_key_dst = daddr;
2287 rth->rt_key_src = saddr;
2288 rth->rt_genid = rt_genid(net);
2289 rth->rt_flags = flags|RTCF_LOCAL;
2290 rth->rt_type = res.type;
2291 rth->rt_key_tos = tos;
2292 rth->rt_dst = daddr;
2293 rth->rt_src = saddr;
2294 #ifdef CONFIG_IP_ROUTE_CLASSID
2295 rth->dst.tclassid = itag;
2296 #endif
2297 rth->rt_route_iif = dev->ifindex;
2298 rth->rt_iif = dev->ifindex;
2299 rth->rt_oif = 0;
2300 rth->rt_mark = skb->mark;
2301 rth->rt_gateway = daddr;
2302 rth->rt_spec_dst= spec_dst;
2303 rth->rt_peer_genid = 0;
2304 rth->peer = NULL;
2305 rth->fi = NULL;
2306 if (res.type == RTN_UNREACHABLE) {
2307 rth->dst.input= ip_error;
2308 rth->dst.error= -err;
2309 rth->rt_flags &= ~RTCF_LOCAL;
2311 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2312 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2313 err = 0;
2314 if (IS_ERR(rth))
2315 err = PTR_ERR(rth);
2316 goto out;
2318 no_route:
2319 RT_CACHE_STAT_INC(in_no_route);
2320 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2321 res.type = RTN_UNREACHABLE;
2322 if (err == -ESRCH)
2323 err = -ENETUNREACH;
2324 goto local_input;
2327 * Do not cache martian addresses: they should be logged (RFC1812)
2329 martian_destination:
2330 RT_CACHE_STAT_INC(in_martian_dst);
2331 #ifdef CONFIG_IP_ROUTE_VERBOSE
2332 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2333 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2334 &daddr, &saddr, dev->name);
2335 #endif
2337 e_hostunreach:
2338 err = -EHOSTUNREACH;
2339 goto out;
2341 e_inval:
2342 err = -EINVAL;
2343 goto out;
2345 e_nobufs:
2346 err = -ENOBUFS;
2347 goto out;
2349 martian_source:
2350 err = -EINVAL;
2351 martian_source_keep_err:
2352 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2353 goto out;
2356 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2357 u8 tos, struct net_device *dev, bool noref)
2359 struct rtable * rth;
2360 unsigned hash;
2361 int iif = dev->ifindex;
2362 struct net *net;
2363 int res;
2365 net = dev_net(dev);
2367 rcu_read_lock();
2369 if (!rt_caching(net))
2370 goto skip_cache;
2372 tos &= IPTOS_RT_MASK;
2373 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
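/*
 * Note on the lookup below: the whole cache key is compared in one
 * branch-free expression. Each key field (daddr, saddr, iif, tos) is
 * XORed with the candidate entry and the results are ORed together,
 * which is zero only when every field matches; mark, namespace and
 * expiry are then checked separately.
 */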
2375 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2376 rth = rcu_dereference(rth->dst.rt_next)) {
2377 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2378 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2379 (rth->rt_route_iif ^ iif) |
2380 (rth->rt_key_tos ^ tos)) == 0 &&
2381 rth->rt_mark == skb->mark &&
2382 net_eq(dev_net(rth->dst.dev), net) &&
2383 !rt_is_expired(rth)) {
2384 if (noref) {
2385 dst_use_noref(&rth->dst, jiffies);
2386 skb_dst_set_noref(skb, &rth->dst);
2387 } else {
2388 dst_use(&rth->dst, jiffies);
2389 skb_dst_set(skb, &rth->dst);
2391 RT_CACHE_STAT_INC(in_hit);
2392 rcu_read_unlock();
2393 return 0;
2395 RT_CACHE_STAT_INC(in_hlist_search);
2398 skip_cache:
2399 /* Multicast recognition logic is moved from the route cache to here.
2400 The problem was that too many Ethernet cards have broken/missing
2401 hardware multicast filters :-( As a result, a host on a multicast
2402 network acquires a lot of useless route cache entries, e.g. from
2403 SDR messages from all over the world. Now we try to get rid of them.
2404 Really, provided the software IP multicast filter is organized
2405 reasonably (at least, hashed), this does not result in a slowdown
2406 compared with route cache reject entries.
2407 Note that multicast routers are not affected, because a
2408 route cache entry is created eventually.
2410 if (ipv4_is_multicast(daddr)) {
2411 struct in_device *in_dev = __in_dev_get_rcu(dev);
2413 if (in_dev) {
2414 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2415 ip_hdr(skb)->protocol);
2416 if (our
2417 #ifdef CONFIG_IP_MROUTE
2418 ||
2419 (!ipv4_is_local_multicast(daddr) &&
2420 IN_DEV_MFORWARD(in_dev))
2421 #endif
2422 ) {
2423 int res = ip_route_input_mc(skb, daddr, saddr,
2424 tos, dev, our);
2425 rcu_read_unlock();
2426 return res;
2429 rcu_read_unlock();
2430 return -EINVAL;
2432 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2433 rcu_read_unlock();
2434 return res;
2436 EXPORT_SYMBOL(ip_route_input_common);
2438 /* called with rcu_read_lock() */
2439 static struct rtable *__mkroute_output(const struct fib_result *res,
2440 const struct flowi4 *fl4,
2441 __be32 orig_daddr, __be32 orig_saddr,
2442 int orig_oif, struct net_device *dev_out,
2443 unsigned int flags)
2445 struct fib_info *fi = res->fi;
2446 u32 tos = RT_FL_TOS(fl4);
2447 struct in_device *in_dev;
2448 u16 type = res->type;
2449 struct rtable *rth;
2451 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2452 return ERR_PTR(-EINVAL);
2454 if (ipv4_is_lbcast(fl4->daddr))
2455 type = RTN_BROADCAST;
2456 else if (ipv4_is_multicast(fl4->daddr))
2457 type = RTN_MULTICAST;
2458 else if (ipv4_is_zeronet(fl4->daddr))
2459 return ERR_PTR(-EINVAL);
2461 if (dev_out->flags & IFF_LOOPBACK)
2462 flags |= RTCF_LOCAL;
2464 in_dev = __in_dev_get_rcu(dev_out);
2465 if (!in_dev)
2466 return ERR_PTR(-EINVAL);
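/* Broadcast destinations never carry a fib_info. Multicast destinations
 * keep RTCF_LOCAL only if ip_check_mc_rcu() says the group has actually
 * been joined on this device; otherwise the packet is not delivered
 * locally.
 */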
2468 if (type == RTN_BROADCAST) {
2469 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2470 fi = NULL;
2471 } else if (type == RTN_MULTICAST) {
2472 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2473 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2474 fl4->flowi4_proto))
2475 flags &= ~RTCF_LOCAL;
2476 /* If a multicast route does not exist, use
2477 * the default one, but do not gateway in this case.
2478 * Yes, it is a hack.
2480 if (fi && res->prefixlen < 4)
2481 fi = NULL;
2484 rth = rt_dst_alloc(dev_out,
2485 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2486 IN_DEV_CONF_GET(in_dev, NOXFRM));
2487 if (!rth)
2488 return ERR_PTR(-ENOBUFS);
2490 rth->dst.output = ip_output;
2492 rth->rt_key_dst = orig_daddr;
2493 rth->rt_key_src = orig_saddr;
2494 rth->rt_genid = rt_genid(dev_net(dev_out));
2495 rth->rt_flags = flags;
2496 rth->rt_type = type;
2497 rth->rt_key_tos = tos;
2498 rth->rt_dst = fl4->daddr;
2499 rth->rt_src = fl4->saddr;
2500 rth->rt_route_iif = 0;
2501 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2502 rth->rt_oif = orig_oif;
2503 rth->rt_mark = fl4->flowi4_mark;
2504 rth->rt_gateway = fl4->daddr;
2505 rth->rt_spec_dst= fl4->saddr;
2506 rth->rt_peer_genid = 0;
2507 rth->peer = NULL;
2508 rth->fi = NULL;
2510 RT_CACHE_STAT_INC(out_slow_tot);
2512 if (flags & RTCF_LOCAL) {
2513 rth->dst.input = ip_local_deliver;
2514 rth->rt_spec_dst = fl4->daddr;
2516 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2517 rth->rt_spec_dst = fl4->saddr;
2518 if (flags & RTCF_LOCAL &&
2519 !(dev_out->flags & IFF_LOOPBACK)) {
2520 rth->dst.output = ip_mc_output;
2521 RT_CACHE_STAT_INC(out_slow_mc);
2523 #ifdef CONFIG_IP_MROUTE
2524 if (type == RTN_MULTICAST) {
2525 if (IN_DEV_MFORWARD(in_dev) &&
2526 !ipv4_is_local_multicast(fl4->daddr)) {
2527 rth->dst.input = ip_mr_input;
2528 rth->dst.output = ip_mc_output;
2531 #endif
2534 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2536 return rth;
2540 * Major route resolver routine.
2541 * called with rcu_read_lock();
2544 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2546 struct net_device *dev_out = NULL;
2547 u32 tos = RT_FL_TOS(fl4);
2548 unsigned int flags = 0;
2549 struct fib_result res;
2550 struct rtable *rth;
2551 __be32 orig_daddr;
2552 __be32 orig_saddr;
2553 int orig_oif;
2555 res.fi = NULL;
2556 #ifdef CONFIG_IP_MULTIPLE_TABLES
2557 res.r = NULL;
2558 #endif
2560 orig_daddr = fl4->daddr;
2561 orig_saddr = fl4->saddr;
2562 orig_oif = fl4->flowi4_oif;
2564 fl4->flowi4_iif = net->loopback_dev->ifindex;
2565 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2566 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2567 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2569 rcu_read_lock();
2570 if (fl4->saddr) {
2571 rth = ERR_PTR(-EINVAL);
2572 if (ipv4_is_multicast(fl4->saddr) ||
2573 ipv4_is_lbcast(fl4->saddr) ||
2574 ipv4_is_zeronet(fl4->saddr))
2575 goto out;
2577 /* I removed the check for oif == dev_out->oif here.
2578 It was wrong for two reasons:
2579 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2580 is assigned to multiple interfaces.
2581 2. Moreover, we are allowed to send packets with a saddr
2582 of another iface. --ANK
2585 if (fl4->flowi4_oif == 0 &&
2586 (ipv4_is_multicast(fl4->daddr) ||
2587 ipv4_is_lbcast(fl4->daddr))) {
2588 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2589 dev_out = __ip_dev_find(net, fl4->saddr, false);
2590 if (dev_out == NULL)
2591 goto out;
2593 /* Special hack: the user can direct multicasts
2594 and limited broadcast via the necessary interface
2595 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2596 This hack is not just for fun, it allows
2597 vic, vat and friends to work.
2598 They bind a socket to loopback, set ttl to zero
2599 and expect that it will work.
2600 From the viewpoint of the routing cache they are broken,
2601 because we are not allowed to build a multicast path
2602 with a loopback source addr (look, the routing cache
2603 cannot know that ttl is zero, so the packet
2604 will not leave this host and the route is valid).
2605 Luckily, this hack is a good workaround.
2608 fl4->flowi4_oif = dev_out->ifindex;
2609 goto make_route;
2612 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2613 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2614 if (!__ip_dev_find(net, fl4->saddr, false))
2615 goto out;
2620 if (fl4->flowi4_oif) {
2621 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2622 rth = ERR_PTR(-ENODEV);
2623 if (dev_out == NULL)
2624 goto out;
2626 /* RACE: Check return value of inet_select_addr instead. */
2627 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2628 rth = ERR_PTR(-ENETUNREACH);
2629 goto out;
2631 if (ipv4_is_local_multicast(fl4->daddr) ||
2632 ipv4_is_lbcast(fl4->daddr)) {
2633 if (!fl4->saddr)
2634 fl4->saddr = inet_select_addr(dev_out, 0,
2635 RT_SCOPE_LINK);
2636 goto make_route;
2638 if (fl4->saddr) {
2639 if (ipv4_is_multicast(fl4->daddr))
2640 fl4->saddr = inet_select_addr(dev_out, 0,
2641 fl4->flowi4_scope);
2642 else if (!fl4->daddr)
2643 fl4->saddr = inet_select_addr(dev_out, 0,
2644 RT_SCOPE_HOST);
2648 if (!fl4->daddr) {
2649 fl4->daddr = fl4->saddr;
2650 if (!fl4->daddr)
2651 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2652 dev_out = net->loopback_dev;
2653 fl4->flowi4_oif = net->loopback_dev->ifindex;
2654 res.type = RTN_LOCAL;
2655 flags |= RTCF_LOCAL;
2656 goto make_route;
2659 if (fib_lookup(net, fl4, &res)) {
2660 res.fi = NULL;
2661 if (fl4->flowi4_oif) {
2662 /* Apparently, the routing tables are wrong. Assume
2663 that the destination is on-link.
2665 WHY? DW.
2666 Because we are allowed to send to an iface
2667 even if it has NO routes and NO assigned
2668 addresses. When oif is specified, the routing
2669 tables are looked up with only one purpose:
2670 to catch whether the destination is gatewayed, rather than
2671 direct. Moreover, if MSG_DONTROUTE is set,
2672 we send the packet, ignoring both routing tables
2673 and ifaddr state. --ANK
2676 We could do this even if oif is unknown,
2677 as IPv6 likely does, but we do not.
2680 if (fl4->saddr == 0)
2681 fl4->saddr = inet_select_addr(dev_out, 0,
2682 RT_SCOPE_LINK);
2683 res.type = RTN_UNICAST;
2684 goto make_route;
2686 rth = ERR_PTR(-ENETUNREACH);
2687 goto out;
2690 if (res.type == RTN_LOCAL) {
2691 if (!fl4->saddr) {
2692 if (res.fi->fib_prefsrc)
2693 fl4->saddr = res.fi->fib_prefsrc;
2694 else
2695 fl4->saddr = fl4->daddr;
2697 dev_out = net->loopback_dev;
2698 fl4->flowi4_oif = dev_out->ifindex;
2699 res.fi = NULL;
2700 flags |= RTCF_LOCAL;
2701 goto make_route;
2704 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2705 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2706 fib_select_multipath(&res);
2707 else
2708 #endif
2709 if (!res.prefixlen &&
2710 res.table->tb_num_default > 1 &&
2711 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2712 fib_select_default(&res);
2714 if (!fl4->saddr)
2715 fl4->saddr = FIB_RES_PREFSRC(net, res);
2717 dev_out = FIB_RES_DEV(res);
2718 fl4->flowi4_oif = dev_out->ifindex;
2721 make_route:
2722 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2723 dev_out, flags);
2724 if (!IS_ERR(rth)) {
2725 unsigned int hash;
2727 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2728 rt_genid(dev_net(dev_out)));
2729 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2732 out:
2733 rcu_read_unlock();
2734 return rth;
2737 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2739 struct rtable *rth;
2740 unsigned int hash;
2742 if (!rt_caching(net))
2743 goto slow_output;
2745 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2747 rcu_read_lock_bh();
2748 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2749 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2750 if (rth->rt_key_dst == flp4->daddr &&
2751 rth->rt_key_src == flp4->saddr &&
2752 rt_is_output_route(rth) &&
2753 rth->rt_oif == flp4->flowi4_oif &&
2754 rth->rt_mark == flp4->flowi4_mark &&
2755 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2756 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2757 net_eq(dev_net(rth->dst.dev), net) &&
2758 !rt_is_expired(rth)) {
2759 dst_use(&rth->dst, jiffies);
2760 RT_CACHE_STAT_INC(out_hit);
2761 rcu_read_unlock_bh();
2762 if (!flp4->saddr)
2763 flp4->saddr = rth->rt_src;
2764 if (!flp4->daddr)
2765 flp4->daddr = rth->rt_dst;
2766 return rth;
2768 RT_CACHE_STAT_INC(out_hlist_search);
2770 rcu_read_unlock_bh();
2772 slow_output:
2773 return ip_route_output_slow(net, flp4);
2775 EXPORT_SYMBOL_GPL(__ip_route_output_key);
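/*
 * Blackhole dst: ipv4_blackhole_route() below clones an existing route but
 * points input/output at dst_discard, so anything sent through it is
 * silently dropped. The stub ops make the clone inert - dst_check() always
 * fails, PMTU updates are ignored and metrics are never COWed. Used e.g.
 * by the XFRM code when a flow must be held rather than routed normally.
 */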
2777 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2779 return NULL;
2782 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2784 return 0;
2787 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2791 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2792 unsigned long old)
2794 return NULL;
2797 static struct dst_ops ipv4_dst_blackhole_ops = {
2798 .family = AF_INET,
2799 .protocol = cpu_to_be16(ETH_P_IP),
2800 .destroy = ipv4_dst_destroy,
2801 .check = ipv4_blackhole_dst_check,
2802 .default_mtu = ipv4_blackhole_default_mtu,
2803 .default_advmss = ipv4_default_advmss,
2804 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2805 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2808 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2810 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2811 struct rtable *ort = (struct rtable *) dst_orig;
2813 if (rt) {
2814 struct dst_entry *new = &rt->dst;
2816 new->__use = 1;
2817 new->input = dst_discard;
2818 new->output = dst_discard;
2819 dst_copy_metrics(new, &ort->dst);
2821 new->dev = ort->dst.dev;
2822 if (new->dev)
2823 dev_hold(new->dev);
2825 rt->rt_key_dst = ort->rt_key_dst;
2826 rt->rt_key_src = ort->rt_key_src;
2827 rt->rt_key_tos = ort->rt_key_tos;
2828 rt->rt_route_iif = ort->rt_route_iif;
2829 rt->rt_iif = ort->rt_iif;
2830 rt->rt_oif = ort->rt_oif;
2831 rt->rt_mark = ort->rt_mark;
2833 rt->rt_genid = rt_genid(net);
2834 rt->rt_flags = ort->rt_flags;
2835 rt->rt_type = ort->rt_type;
2836 rt->rt_dst = ort->rt_dst;
2837 rt->rt_src = ort->rt_src;
2838 rt->rt_gateway = ort->rt_gateway;
2839 rt->rt_spec_dst = ort->rt_spec_dst;
2840 rt->peer = ort->peer;
2841 if (rt->peer)
2842 atomic_inc(&rt->peer->refcnt);
2843 rt->fi = ort->fi;
2844 if (rt->fi)
2845 atomic_inc(&rt->fi->fib_clntref);
2847 dst_free(new);
2850 dst_release(dst_orig);
2852 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2855 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2856 struct sock *sk)
2858 struct rtable *rt = __ip_route_output_key(net, flp4);
2860 if (IS_ERR(rt))
2861 return rt;
2863 if (flp4->flowi4_proto)
2864 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2865 flowi4_to_flowi(flp4),
2866 sk, 0);
2868 return rt;
2870 EXPORT_SYMBOL_GPL(ip_route_output_flow);
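/*
 * rt_fill_info() translates a cached rtable into an RTM_NEWROUTE netlink
 * message: an rtmsg header plus RTA_* attributes (table, destination,
 * source, output interface, gateway, metrics, mark) and the cache info
 * (id, timestamps, expiry, error), taken from the inet_peer entry when
 * one is attached.
 */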
2872 static int rt_fill_info(struct net *net,
2873 struct sk_buff *skb, u32 pid, u32 seq, int event,
2874 int nowait, unsigned int flags)
2876 struct rtable *rt = skb_rtable(skb);
2877 struct rtmsg *r;
2878 struct nlmsghdr *nlh;
2879 long expires = 0;
2880 const struct inet_peer *peer = rt->peer;
2881 u32 id = 0, ts = 0, tsage = 0, error;
2883 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2884 if (nlh == NULL)
2885 return -EMSGSIZE;
2887 r = nlmsg_data(nlh);
2888 r->rtm_family = AF_INET;
2889 r->rtm_dst_len = 32;
2890 r->rtm_src_len = 0;
2891 r->rtm_tos = rt->rt_key_tos;
2892 r->rtm_table = RT_TABLE_MAIN;
2893 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2894 r->rtm_type = rt->rt_type;
2895 r->rtm_scope = RT_SCOPE_UNIVERSE;
2896 r->rtm_protocol = RTPROT_UNSPEC;
2897 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2898 if (rt->rt_flags & RTCF_NOTIFY)
2899 r->rtm_flags |= RTM_F_NOTIFY;
2901 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2903 if (rt->rt_key_src) {
2904 r->rtm_src_len = 32;
2905 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2907 if (rt->dst.dev)
2908 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2909 #ifdef CONFIG_IP_ROUTE_CLASSID
2910 if (rt->dst.tclassid)
2911 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2912 #endif
2913 if (rt_is_input_route(rt))
2914 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2915 else if (rt->rt_src != rt->rt_key_src)
2916 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2918 if (rt->rt_dst != rt->rt_gateway)
2919 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2921 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2922 goto nla_put_failure;
2924 if (rt->rt_mark)
2925 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2927 error = rt->dst.error;
2928 if (peer) {
2929 inet_peer_refcheck(rt->peer);
2930 id = atomic_read(&peer->ip_id_count) & 0xffff;
2931 if (peer->tcp_ts_stamp) {
2932 ts = peer->tcp_ts;
2933 tsage = get_seconds() - peer->tcp_ts_stamp;
2935 expires = ACCESS_ONCE(peer->pmtu_expires);
2936 if (expires)
2937 expires -= jiffies;
2940 if (rt_is_input_route(rt)) {
2941 #ifdef CONFIG_IP_MROUTE
2942 __be32 dst = rt->rt_dst;
2944 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2945 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2946 int err = ipmr_get_route(net, skb,
2947 rt->rt_src, rt->rt_dst,
2948 r, nowait);
2949 if (err <= 0) {
2950 if (!nowait) {
2951 if (err == 0)
2952 return 0;
2953 goto nla_put_failure;
2954 } else {
2955 if (err == -EMSGSIZE)
2956 goto nla_put_failure;
2957 error = err;
2960 } else
2961 #endif
2962 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2965 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2966 expires, error) < 0)
2967 goto nla_put_failure;
2969 return nlmsg_end(skb, nlh);
2971 nla_put_failure:
2972 nlmsg_cancel(skb, nlh);
2973 return -EMSGSIZE;
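/*
 * RTM_GETROUTE handler, used e.g. by "ip route get". A dummy skb is routed
 * exactly as a real packet would be - through the input path when RTA_IIF
 * is supplied, through the output path otherwise - and the resulting cache
 * entry is reported back via rt_fill_info().
 */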
2976 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2978 struct net *net = sock_net(in_skb->sk);
2979 struct rtmsg *rtm;
2980 struct nlattr *tb[RTA_MAX+1];
2981 struct rtable *rt = NULL;
2982 __be32 dst = 0;
2983 __be32 src = 0;
2984 u32 iif;
2985 int err;
2986 int mark;
2987 struct sk_buff *skb;
2989 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2990 if (err < 0)
2991 goto errout;
2993 rtm = nlmsg_data(nlh);
2995 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2996 if (skb == NULL) {
2997 err = -ENOBUFS;
2998 goto errout;
3001 /* Reserve room for dummy headers; this skb can pass
3002 through a good chunk of the routing engine.
3004 skb_reset_mac_header(skb);
3005 skb_reset_network_header(skb);
3007 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3008 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3009 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3011 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3012 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3013 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3014 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3016 if (iif) {
3017 struct net_device *dev;
3019 dev = __dev_get_by_index(net, iif);
3020 if (dev == NULL) {
3021 err = -ENODEV;
3022 goto errout_free;
3025 skb->protocol = htons(ETH_P_IP);
3026 skb->dev = dev;
3027 skb->mark = mark;
3028 local_bh_disable();
3029 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3030 local_bh_enable();
3032 rt = skb_rtable(skb);
3033 if (err == 0 && rt->dst.error)
3034 err = -rt->dst.error;
3035 } else {
3036 struct flowi4 fl4 = {
3037 .daddr = dst,
3038 .saddr = src,
3039 .flowi4_tos = rtm->rtm_tos,
3040 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3041 .flowi4_mark = mark,
3043 rt = ip_route_output_key(net, &fl4);
3045 err = 0;
3046 if (IS_ERR(rt))
3047 err = PTR_ERR(rt);
3050 if (err)
3051 goto errout_free;
3053 skb_dst_set(skb, &rt->dst);
3054 if (rtm->rtm_flags & RTM_F_NOTIFY)
3055 rt->rt_flags |= RTCF_NOTIFY;
3057 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3058 RTM_NEWROUTE, 0, 0);
3059 if (err <= 0)
3060 goto errout_free;
3062 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3063 errout:
3064 return err;
3066 errout_free:
3067 kfree_skb(skb);
3068 goto errout;
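/*
 * Dump the whole route cache over netlink. cb->args[0] and cb->args[1]
 * remember the current hash bucket and the index within it, so a partial
 * dump can be resumed where it left off.
 */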
3071 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3073 struct rtable *rt;
3074 int h, s_h;
3075 int idx, s_idx;
3076 struct net *net;
3078 net = sock_net(skb->sk);
3080 s_h = cb->args[0];
3081 if (s_h < 0)
3082 s_h = 0;
3083 s_idx = idx = cb->args[1];
3084 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3085 if (!rt_hash_table[h].chain)
3086 continue;
3087 rcu_read_lock_bh();
3088 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3089 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3090 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3091 continue;
3092 if (rt_is_expired(rt))
3093 continue;
3094 skb_dst_set_noref(skb, &rt->dst);
3095 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3096 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3097 1, NLM_F_MULTI) <= 0) {
3098 skb_dst_drop(skb);
3099 rcu_read_unlock_bh();
3100 goto done;
3102 skb_dst_drop(skb);
3104 rcu_read_unlock_bh();
3107 done:
3108 cb->args[0] = h;
3109 cb->args[1] = idx;
3110 return skb->len;
3113 void ip_rt_multicast_event(struct in_device *in_dev)
3115 rt_cache_flush(dev_net(in_dev->dev), 0);
3118 #ifdef CONFIG_SYSCTL
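/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush invokes this
 * handler; the value is passed as the delay to rt_cache_flush() for the
 * table's network namespace. Illustrative usage from userspace:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */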
3119 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3120 void __user *buffer,
3121 size_t *lenp, loff_t *ppos)
3123 if (write) {
3124 int flush_delay;
3125 ctl_table ctl;
3126 struct net *net;
3128 memcpy(&ctl, __ctl, sizeof(ctl));
3129 ctl.data = &flush_delay;
3130 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3132 net = (struct net *)__ctl->extra1;
3133 rt_cache_flush(net, flush_delay);
3134 return 0;
3137 return -EINVAL;
3140 static ctl_table ipv4_route_table[] = {
3142 .procname = "gc_thresh",
3143 .data = &ipv4_dst_ops.gc_thresh,
3144 .maxlen = sizeof(int),
3145 .mode = 0644,
3146 .proc_handler = proc_dointvec,
3149 .procname = "max_size",
3150 .data = &ip_rt_max_size,
3151 .maxlen = sizeof(int),
3152 .mode = 0644,
3153 .proc_handler = proc_dointvec,
3156 /* Deprecated. Use gc_min_interval_ms */
3158 .procname = "gc_min_interval",
3159 .data = &ip_rt_gc_min_interval,
3160 .maxlen = sizeof(int),
3161 .mode = 0644,
3162 .proc_handler = proc_dointvec_jiffies,
3165 .procname = "gc_min_interval_ms",
3166 .data = &ip_rt_gc_min_interval,
3167 .maxlen = sizeof(int),
3168 .mode = 0644,
3169 .proc_handler = proc_dointvec_ms_jiffies,
3172 .procname = "gc_timeout",
3173 .data = &ip_rt_gc_timeout,
3174 .maxlen = sizeof(int),
3175 .mode = 0644,
3176 .proc_handler = proc_dointvec_jiffies,
3179 .procname = "gc_interval",
3180 .data = &ip_rt_gc_interval,
3181 .maxlen = sizeof(int),
3182 .mode = 0644,
3183 .proc_handler = proc_dointvec_jiffies,
3193 .procname = "redirect_load",
3194 .data = &ip_rt_redirect_load,
3195 .maxlen = sizeof(int),
3196 .mode = 0644,
3197 .proc_handler = proc_dointvec,
3200 .procname = "redirect_number",
3201 .data = &ip_rt_redirect_number,
3202 .maxlen = sizeof(int),
3203 .mode = 0644,
3204 .proc_handler = proc_dointvec,
3207 .procname = "redirect_silence",
3208 .data = &ip_rt_redirect_silence,
3209 .maxlen = sizeof(int),
3210 .mode = 0644,
3211 .proc_handler = proc_dointvec,
3214 .procname = "error_cost",
3215 .data = &ip_rt_error_cost,
3216 .maxlen = sizeof(int),
3217 .mode = 0644,
3218 .proc_handler = proc_dointvec,
3221 .procname = "error_burst",
3222 .data = &ip_rt_error_burst,
3223 .maxlen = sizeof(int),
3224 .mode = 0644,
3225 .proc_handler = proc_dointvec,
3228 .procname = "gc_elasticity",
3229 .data = &ip_rt_gc_elasticity,
3230 .maxlen = sizeof(int),
3231 .mode = 0644,
3232 .proc_handler = proc_dointvec,
3235 .procname = "mtu_expires",
3236 .data = &ip_rt_mtu_expires,
3237 .maxlen = sizeof(int),
3238 .mode = 0644,
3239 .proc_handler = proc_dointvec_jiffies,
3242 .procname = "min_pmtu",
3243 .data = &ip_rt_min_pmtu,
3244 .maxlen = sizeof(int),
3245 .mode = 0644,
3246 .proc_handler = proc_dointvec,
3249 .procname = "min_adv_mss",
3250 .data = &ip_rt_min_advmss,
3251 .maxlen = sizeof(int),
3252 .mode = 0644,
3253 .proc_handler = proc_dointvec,
3258 static struct ctl_table empty[1];
3260 static struct ctl_table ipv4_skeleton[] =
3262 { .procname = "route",
3263 .mode = 0555, .child = ipv4_route_table},
3264 { .procname = "neigh",
3265 .mode = 0555, .child = empty},
3269 static __net_initdata struct ctl_path ipv4_path[] = {
3270 { .procname = "net", },
3271 { .procname = "ipv4", },
3272 { },
3275 static struct ctl_table ipv4_route_flush_table[] = {
3277 .procname = "flush",
3278 .maxlen = sizeof(int),
3279 .mode = 0200,
3280 .proc_handler = ipv4_sysctl_rtcache_flush,
3282 { },
3285 static __net_initdata struct ctl_path ipv4_route_path[] = {
3286 { .procname = "net", },
3287 { .procname = "ipv4", },
3288 { .procname = "route", },
3289 { },
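/*
 * Every non-init network namespace gets its own copy of the flush table so
 * that each table's extra1 pointer can name its own namespace; the init
 * namespace uses the static table itself.
 */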
3292 static __net_init int sysctl_route_net_init(struct net *net)
3294 struct ctl_table *tbl;
3296 tbl = ipv4_route_flush_table;
3297 if (!net_eq(net, &init_net)) {
3298 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3299 if (tbl == NULL)
3300 goto err_dup;
3302 tbl[0].extra1 = net;
3304 net->ipv4.route_hdr =
3305 register_net_sysctl_table(net, ipv4_route_path, tbl);
3306 if (net->ipv4.route_hdr == NULL)
3307 goto err_reg;
3308 return 0;
3310 err_reg:
3311 if (tbl != ipv4_route_flush_table)
3312 kfree(tbl);
3313 err_dup:
3314 return -ENOMEM;
3317 static __net_exit void sysctl_route_net_exit(struct net *net)
3319 struct ctl_table *tbl;
3321 tbl = net->ipv4.route_hdr->ctl_table_arg;
3322 unregister_net_sysctl_table(net->ipv4.route_hdr);
3323 BUG_ON(tbl == ipv4_route_flush_table);
3324 kfree(tbl);
3327 static __net_initdata struct pernet_operations sysctl_route_ops = {
3328 .init = sysctl_route_net_init,
3329 .exit = sysctl_route_net_exit,
3331 #endif
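/*
 * Seed the per-namespace generation counters with random values. Bumping
 * rt_genid later (see rt_cache_flush) invalidates every cached route in
 * that namespace without having to walk the hash table.
 */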
3333 static __net_init int rt_genid_init(struct net *net)
3335 get_random_bytes(&net->ipv4.rt_genid,
3336 sizeof(net->ipv4.rt_genid));
3337 get_random_bytes(&net->ipv4.dev_addr_genid,
3338 sizeof(net->ipv4.dev_addr_genid));
3339 return 0;
3342 static __net_initdata struct pernet_operations rt_genid_ops = {
3343 .init = rt_genid_init,
3347 #ifdef CONFIG_IP_ROUTE_CLASSID
3348 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3349 #endif /* CONFIG_IP_ROUTE_CLASSID */
3351 static __initdata unsigned long rhash_entries;
3352 static int __init set_rhash_entries(char *str)
3354 if (!str)
3355 return 0;
3356 rhash_entries = simple_strtoul(str, &str, 0);
3357 return 1;
3359 __setup("rhash_entries=", set_rhash_entries);
3361 int __init ip_rt_init(void)
3363 int rc = 0;
3365 #ifdef CONFIG_IP_ROUTE_CLASSID
3366 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3367 if (!ip_rt_acct)
3368 panic("IP: failed to allocate ip_rt_acct\n");
3369 #endif
3371 ipv4_dst_ops.kmem_cachep =
3372 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3373 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3375 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3377 if (dst_entries_init(&ipv4_dst_ops) < 0)
3378 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3380 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3381 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
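/*
 * The route cache hash table is sized from available memory by
 * alloc_large_system_hash() and, unless overridden with the rhash_entries=
 * boot parameter, capped at 512K buckets; gc_thresh and ip_rt_max_size are
 * then derived from the resulting table size.
 */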
3383 rt_hash_table = (struct rt_hash_bucket *)
3384 alloc_large_system_hash("IP route cache",
3385 sizeof(struct rt_hash_bucket),
3386 rhash_entries,
3387 (totalram_pages >= 128 * 1024) ?
3388 15 : 17,
3389 0,
3390 &rt_hash_log,
3391 &rt_hash_mask,
3392 rhash_entries ? 0 : 512 * 1024);
3393 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3394 rt_hash_lock_init();
3396 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3397 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3399 devinet_init();
3400 ip_fib_init();
3402 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3403 expires_ljiffies = jiffies;
3404 schedule_delayed_work(&expires_work,
3405 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3407 if (ip_rt_proc_init())
3408 printk(KERN_ERR "Unable to create route proc files\n");
3409 #ifdef CONFIG_XFRM
3410 xfrm_init();
3411 xfrm4_init(ip_rt_max_size);
3412 #endif
3413 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3415 #ifdef CONFIG_SYSCTL
3416 register_pernet_subsys(&sysctl_route_ops);
3417 #endif
3418 register_pernet_subsys(&rt_genid_ops);
3419 return rc;
3422 #ifdef CONFIG_SYSCTL
3424 * We really need to sanitize the damn ipv4 init order, then all
3425 * this nonsense will go away.
3427 void __init ip_static_sysctl_init(void)
3429 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3431 #endif