net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_max_size;
 120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_gc_elasticity __read_mostly    = 8;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static struct delayed_work expires_work;
 135 static unsigned long expires_ljiffies;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149
 150
 151 static struct dst_ops ipv4_dst_ops = {
 152         .family =               AF_INET,
 153         .protocol =             cpu_to_be16(ETH_P_IP),
 154         .gc =                   rt_garbage_collect,
 155         .check =                ipv4_dst_check,
 156         .destroy =              ipv4_dst_destroy,
 157         .ifdown =               ipv4_dst_ifdown,
 158         .negative_advice =      ipv4_negative_advice,
 159         .link_failure =         ipv4_link_failure,
 160         .update_pmtu =          ip_rt_update_pmtu,
 161         .local_out =            __ip_local_out,
 162         .entries =              ATOMIC_INIT(0),
 163 };
 164
 165 #define ECN_OR_COST(class)      TC_PRIO_##class
 166
 167 const __u8 ip_tos2prio[16] = {
 168         TC_PRIO_BESTEFFORT,
 169         ECN_OR_COST(FILLER),
 170         TC_PRIO_BESTEFFORT,
 171         ECN_OR_COST(BESTEFFORT),
 172         TC_PRIO_BULK,
 173         ECN_OR_COST(BULK),
 174         TC_PRIO_BULK,
 175         ECN_OR_COST(BULK),
 176         TC_PRIO_INTERACTIVE,
 177         ECN_OR_COST(INTERACTIVE),
 178         TC_PRIO_INTERACTIVE,
 179         ECN_OR_COST(INTERACTIVE),
 180         TC_PRIO_INTERACTIVE_BULK,
 181         ECN_OR_COST(INTERACTIVE_BULK),
 182         TC_PRIO_INTERACTIVE_BULK,
 183         ECN_OR_COST(INTERACTIVE_BULK)
 184 };
 185
 186
 187 /*
 188  * Route cache.
 189  */
 190
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
 200
 201 struct rt_hash_bucket {
 202         struct rtable   *chain;
 203 };
 204
 205 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 206         defined(CONFIG_PROVE_LOCKING)
 207 /*
 208  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 209  * The size of this table is a power of two and depends on the number of CPUS.
 210  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 211  */
 212 #ifdef CONFIG_LOCKDEP
 213 # define RT_HASH_LOCK_SZ        256
 214 #else
 215 # if NR_CPUS >= 32
 216 #  define RT_HASH_LOCK_SZ       4096
 217 # elif NR_CPUS >= 16
 218 #  define RT_HASH_LOCK_SZ       2048
 219 # elif NR_CPUS >= 8
 220 #  define RT_HASH_LOCK_SZ       1024
 221 # elif NR_CPUS >= 4
 222 #  define RT_HASH_LOCK_SZ       512
 223 # else
 224 #  define RT_HASH_LOCK_SZ       256
 225 # endif
 226 #endif
 227
 228 static spinlock_t       *rt_hash_locks;
 229 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 230
 231 static __init void rt_hash_lock_init(void)
 232 {
 233         int i;
 234
 235         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 236                         GFP_KERNEL);
 237         if (!rt_hash_locks)
 238                 panic("IP: failed to allocate rt_hash_locks\n");
 239
 240         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 241                 spin_lock_init(&rt_hash_locks[i]);
 242 }
 243 #else
 244 # define rt_hash_lock_addr(slot) NULL
 245
 246 static inline void rt_hash_lock_init(void)
 247 {
 248 }
 249 #endif
 250
 251 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 252 static unsigned                 rt_hash_mask __read_mostly;
 253 static unsigned int             rt_hash_log  __read_mostly;
 254
 255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 256 #define RT_CACHE_STAT_INC(field) \
 257         (__raw_get_cpu_var(rt_cache_stat).field++)
 258
 259 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 260                                    int genid)
 261 {
 262         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 263                             idx, genid)
 264                 & rt_hash_mask;
 265 }
 266
 267 static inline int rt_genid(struct net *net)
 268 {
 269         return atomic_read(&net->ipv4.rt_genid);
 270 }
 271
 272 #ifdef CONFIG_PROC_FS
 273 struct rt_cache_iter_state {
 274         struct seq_net_private p;
 275         int bucket;
 276         int genid;
 277 };
 278
 279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 280 {
 281         struct rt_cache_iter_state *st = seq->private;
 282         struct rtable *r = NULL;
 283
 284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 285                 if (!rt_hash_table[st->bucket].chain)
 286                         continue;
 287                 rcu_read_lock_bh();
 288                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 289                 while (r) {
 290                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 291                             r->rt_genid == st->genid)
 292                                 return r;
 293                         r = rcu_dereference_bh(r->u.dst.rt_next);
 294                 }
 295                 rcu_read_unlock_bh();
 296         }
 297         return r;
 298 }
 299
 300 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 301                                           struct rtable *r)
 302 {
 303         struct rt_cache_iter_state *st = seq->private;
 304
 305         r = r->u.dst.rt_next;
 306         while (!r) {
 307                 rcu_read_unlock_bh();
 308                 do {
 309                         if (--st->bucket < 0)
 310                                 return NULL;
 311                 } while (!rt_hash_table[st->bucket].chain);
 312                 rcu_read_lock_bh();
 313                 r = rt_hash_table[st->bucket].chain;
 314         }
 315         return rcu_dereference_bh(r);
 316 }
 317
 318 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 319                                         struct rtable *r)
 320 {
 321         struct rt_cache_iter_state *st = seq->private;
 322         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 323                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 324                         continue;
 325                 if (r->rt_genid == st->genid)
 326                         break;
 327         }
 328         return r;
 329 }
 330
 331 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 332 {
 333         struct rtable *r = rt_cache_get_first(seq);
 334
 335         if (r)
 336                 while (pos && (r = rt_cache_get_next(seq, r)))
 337                         --pos;
 338         return pos ? NULL : r;
 339 }
 340
 341 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 342 {
 343         struct rt_cache_iter_state *st = seq->private;
 344         if (*pos)
 345                 return rt_cache_get_idx(seq, *pos - 1);
 346         st->genid = rt_genid(seq_file_net(seq));
 347         return SEQ_START_TOKEN;
 348 }
 349
 350 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 351 {
 352         struct rtable *r;
 353
 354         if (v == SEQ_START_TOKEN)
 355                 r = rt_cache_get_first(seq);
 356         else
 357                 r = rt_cache_get_next(seq, v);
 358         ++*pos;
 359         return r;
 360 }
 361
 362 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 363 {
 364         if (v && v != SEQ_START_TOKEN)
 365                 rcu_read_unlock_bh();
 366 }
 367
 368 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 369 {
 370         if (v == SEQ_START_TOKEN)
 371                 seq_printf(seq, "%-127s\n",
 372                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 373                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 374                            "HHUptod\tSpecDst");
 375         else {
 376                 struct rtable *r = v;
 377                 int len;
 378
 379                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 380                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 381                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 382                         (__force u32)r->rt_dst,
 383                         (__force u32)r->rt_gateway,
 384                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 385                         r->u.dst.__use, 0, (__force u32)r->rt_src,
 386                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 387                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 388                         dst_metric(&r->u.dst, RTAX_WINDOW),
 389                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 390                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 391                         r->fl.fl4_tos,
 392                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 393                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 394                                        dev_queue_xmit) : 0,
 395                         r->rt_spec_dst, &len);
 396
 397                 seq_printf(seq, "%*s\n", 127 - len, "");
 398         }
 399         return 0;
 400 }
 401
 402 static const struct seq_operations rt_cache_seq_ops = {
 403         .start  = rt_cache_seq_start,
 404         .next   = rt_cache_seq_next,
 405         .stop   = rt_cache_seq_stop,
 406         .show   = rt_cache_seq_show,
 407 };
 408
 409 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 410 {
 411         return seq_open_net(inode, file, &rt_cache_seq_ops,
 412                         sizeof(struct rt_cache_iter_state));
 413 }
 414
 415 static const struct file_operations rt_cache_seq_fops = {
 416         .owner   = THIS_MODULE,
 417         .open    = rt_cache_seq_open,
 418         .read    = seq_read,
 419         .llseek  = seq_lseek,
 420         .release = seq_release_net,
 421 };
 422
 423
 424 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 425 {
 426         int cpu;
 427
 428         if (*pos == 0)
 429                 return SEQ_START_TOKEN;
 430
 431         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 432                 if (!cpu_possible(cpu))
 433                         continue;
 434                 *pos = cpu+1;
 435                 return &per_cpu(rt_cache_stat, cpu);
 436         }
 437         return NULL;
 438 }
 439
 440 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 441 {
 442         int cpu;
 443
 444         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 445                 if (!cpu_possible(cpu))
 446                         continue;
 447                 *pos = cpu+1;
 448                 return &per_cpu(rt_cache_stat, cpu);
 449         }
 450         return NULL;
 451
 452 }
 453
 454 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 455 {
 456
 457 }
 458
 459 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 460 {
 461         struct rt_cache_stat *st = v;
 462
 463         if (v == SEQ_START_TOKEN) {
 464                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 465                 return 0;
 466         }
 467
 468         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 469                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 470                    atomic_read(&ipv4_dst_ops.entries),
 471                    st->in_hit,
 472                    st->in_slow_tot,
 473                    st->in_slow_mc,
 474                    st->in_no_route,
 475                    st->in_brd,
 476                    st->in_martian_dst,
 477                    st->in_martian_src,
 478
 479                    st->out_hit,
 480                    st->out_slow_tot,
 481                    st->out_slow_mc,
 482
 483                    st->gc_total,
 484                    st->gc_ignored,
 485                    st->gc_goal_miss,
 486                    st->gc_dst_overflow,
 487                    st->in_hlist_search,
 488                    st->out_hlist_search
 489                 );
 490         return 0;
 491 }
 492
 493 static const struct seq_operations rt_cpu_seq_ops = {
 494         .start  = rt_cpu_seq_start,
 495         .next   = rt_cpu_seq_next,
 496         .stop   = rt_cpu_seq_stop,
 497         .show   = rt_cpu_seq_show,
 498 };
 499
 500
 501 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 502 {
 503         return seq_open(file, &rt_cpu_seq_ops);
 504 }
 505
 506 static const struct file_operations rt_cpu_seq_fops = {
 507         .owner   = THIS_MODULE,
 508         .open    = rt_cpu_seq_open,
 509         .read    = seq_read,
 510         .llseek  = seq_lseek,
 511         .release = seq_release,
 512 };
 513
 514 #ifdef CONFIG_NET_CLS_ROUTE
 515 static int rt_acct_proc_show(struct seq_file *m, void *v)
 516 {
 517         struct ip_rt_acct *dst, *src;
 518         unsigned int i, j;
 519
 520         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 521         if (!dst)
 522                 return -ENOMEM;
 523
 524         for_each_possible_cpu(i) {
 525                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 526                 for (j = 0; j < 256; j++) {
 527                         dst[j].o_bytes   += src[j].o_bytes;
 528                         dst[j].o_packets += src[j].o_packets;
 529                         dst[j].i_bytes   += src[j].i_bytes;
 530                         dst[j].i_packets += src[j].i_packets;
 531                 }
 532         }
 533
 534         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 535         kfree(dst);
 536         return 0;
 537 }
 538
 539 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 540 {
 541         return single_open(file, rt_acct_proc_show, NULL);
 542 }
 543
 544 static const struct file_operations rt_acct_proc_fops = {
 545         .owner          = THIS_MODULE,
 546         .open           = rt_acct_proc_open,
 547         .read           = seq_read,
 548         .llseek         = seq_lseek,
 549         .release        = single_release,
 550 };
 551 #endif
 552
 553 static int __net_init ip_rt_do_proc_init(struct net *net)
 554 {
 555         struct proc_dir_entry *pde;
 556
 557         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 558                         &rt_cache_seq_fops);
 559         if (!pde)
 560                 goto err1;
 561
 562         pde = proc_create("rt_cache", S_IRUGO,
 563                           net->proc_net_stat, &rt_cpu_seq_fops);
 564         if (!pde)
 565                 goto err2;
 566
 567 #ifdef CONFIG_NET_CLS_ROUTE
 568         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 569         if (!pde)
 570                 goto err3;
 571 #endif
 572         return 0;
 573
 574 #ifdef CONFIG_NET_CLS_ROUTE
 575 err3:
 576         remove_proc_entry("rt_cache", net->proc_net_stat);
 577 #endif
 578 err2:
 579         remove_proc_entry("rt_cache", net->proc_net);
 580 err1:
 581         return -ENOMEM;
 582 }
 583
 584 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 585 {
 586         remove_proc_entry("rt_cache", net->proc_net_stat);
 587         remove_proc_entry("rt_cache", net->proc_net);
 588 #ifdef CONFIG_NET_CLS_ROUTE
 589         remove_proc_entry("rt_acct", net->proc_net);
 590 #endif
 591 }
 592
 593 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 594         .init = ip_rt_do_proc_init,
 595         .exit = ip_rt_do_proc_exit,
 596 };
 597
 598 static int __init ip_rt_proc_init(void)
 599 {
 600         return register_pernet_subsys(&ip_rt_proc_ops);
 601 }
 602
 603 #else
 604 static inline int ip_rt_proc_init(void)
 605 {
 606         return 0;
 607 }
 608 #endif /* CONFIG_PROC_FS */
 609
 610 static inline void rt_free(struct rtable *rt)
 611 {
 612         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 613 }
 614
 615 static inline void rt_drop(struct rtable *rt)
 616 {
 617         ip_rt_put(rt);
 618         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 619 }
 620
 621 static inline int rt_fast_clean(struct rtable *rth)
 622 {
 623         /* Kill broadcast/multicast entries very aggresively, if they
 624            collide in hash table with more useful entries */
 625         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 626                 rth->fl.iif && rth->u.dst.rt_next;
 627 }
 628
 629 static inline int rt_valuable(struct rtable *rth)
 630 {
 631         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 632                 rth->u.dst.expires;
 633 }
 634
 635 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 636 {
 637         unsigned long age;
 638         int ret = 0;
 639
 640         if (atomic_read(&rth->u.dst.__refcnt))
 641                 goto out;
 642
 643         ret = 1;
 644         if (rth->u.dst.expires &&
 645             time_after_eq(jiffies, rth->u.dst.expires))
 646                 goto out;
 647
 648         age = jiffies - rth->u.dst.lastuse;
 649         ret = 0;
 650         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 651             (age <= tmo2 && rt_valuable(rth)))
 652                 goto out;
 653         ret = 1;
 654 out:    return ret;
 655 }
 656
 657 /* Bits of score are:
 658  * 31: very valuable
 659  * 30: not quite useless
 660  * 29..0: usage counter
 661  */
 662 static inline u32 rt_score(struct rtable *rt)
 663 {
 664         u32 score = jiffies - rt->u.dst.lastuse;
 665
 666         score = ~score & ~(3<<30);
 667
 668         if (rt_valuable(rt))
 669                 score |= (1<<31);
 670
 671         if (!rt->fl.iif ||
 672             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 673                 score |= (1<<30);
 674
 675         return score;
 676 }
 677
 678 static inline bool rt_caching(const struct net *net)
 679 {
 680         return net->ipv4.current_rt_cache_rebuild_count <=
 681                 net->ipv4.sysctl_rt_cache_rebuild_count;
 682 }
 683
 684 static inline bool compare_hash_inputs(const struct flowi *fl1,
 685                                         const struct flowi *fl2)
 686 {
 687         return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
 688                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 689                 (fl1->iif ^ fl2->iif)) == 0);
 690 }
 691
 692 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 693 {
 694         return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
 695                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 696                 (fl1->mark ^ fl2->mark) |
 697                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
 698                 (fl1->oif ^ fl2->oif) |
 699                 (fl1->iif ^ fl2->iif)) == 0;
 700 }
 701
 702 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 703 {
 704         return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
 705 }
 706
 707 static inline int rt_is_expired(struct rtable *rth)
 708 {
 709         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 710 }
 711
 712 /*
 713  * Perform a full scan of hash table and free all entries.
 714  * Can be called by a softirq or a process.
 715  * In the later case, we want to be reschedule if necessary
 716  */
 717 static void rt_do_flush(int process_context)
 718 {
 719         unsigned int i;
 720         struct rtable *rth, *next;
 721         struct rtable * tail;
 722
 723         for (i = 0; i <= rt_hash_mask; i++) {
 724                 if (process_context && need_resched())
 725                         cond_resched();
 726                 rth = rt_hash_table[i].chain;
 727                 if (!rth)
 728                         continue;
 729
 730                 spin_lock_bh(rt_hash_lock_addr(i));
 731 #ifdef CONFIG_NET_NS
 732                 {
 733                 struct rtable ** prev, * p;
 734
 735                 rth = rt_hash_table[i].chain;
 736
 737                 /* defer releasing the head of the list after spin_unlock */
 738                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
 739                         if (!rt_is_expired(tail))
 740                                 break;
 741                 if (rth != tail)
 742                         rt_hash_table[i].chain = tail;
 743
 744                 /* call rt_free on entries after the tail requiring flush */
 745                 prev = &rt_hash_table[i].chain;
 746                 for (p = *prev; p; p = next) {
 747                         next = p->u.dst.rt_next;
 748                         if (!rt_is_expired(p)) {
 749                                 prev = &p->u.dst.rt_next;
 750                         } else {
 751                                 *prev = next;
 752                                 rt_free(p);
 753                         }
 754                 }
 755                 }
 756 #else
 757                 rth = rt_hash_table[i].chain;
 758                 rt_hash_table[i].chain = NULL;
 759                 tail = NULL;
 760 #endif
 761                 spin_unlock_bh(rt_hash_lock_addr(i));
 762
 763                 for (; rth != tail; rth = next) {
 764                         next = rth->u.dst.rt_next;
 765                         rt_free(rth);
 766                 }
 767         }
 768 }
 769
/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives us an estimate for rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */
 777
 778 #define FRACT_BITS 3
 779 #define ONE (1UL << FRACT_BITS)
 780
 781 /*
 782  * Given a hash chain and an item in this hash chain,
 783  * find if a previous entry has the same hash_inputs
 784  * (but differs on tos, mark or oif)
 785  * Returns 0 if an alias is found.
 786  * Returns ONE if rth has no alias before itself.
 787  */
 788 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 789 {
 790         const struct rtable *aux = head;
 791
 792         while (aux != rth) {
 793                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 794                         return 0;
 795                 aux = aux->u.dst.rt_next;
 796         }
 797         return ONE;
 798 }
 799
     /*
      * rt_check_expire() - age out entries from a slice of the route cache.
      *
      * Resumes after the bucket saved in the static 'rover' and visits a number
      * of buckets proportional to the jiffies elapsed since the previous call
      * ((delta << rt_hash_log) / ip_rt_gc_timeout), capped at the table size.
      * Within each non-empty bucket, under the bucket lock, an entry is freed
      * when rt_is_expired() reports it, when its dst.expires deadline has
      * passed, or when it has no deadline and rt_may_expire() lets it age off.
      * Entries that stay add has_noalias() to the bucket's length; the lengths
      * gathered over all visited buckets are then used to recompute
      * rt_chain_length_max.
      */
 800 static void rt_check_expire(void)
 801 {
 802         static unsigned int rover;
 803         unsigned int i = rover, goal;
 804         struct rtable *rth, **rthp;
 805         unsigned long samples = 0;
 806         unsigned long sum = 0, sum2 = 0;
 807         unsigned long delta;
 808         u64 mult;
 809
             /* Scale the number of buckets to scan by the time since the last run. */
 810         delta = jiffies - expires_ljiffies;
 811         expires_ljiffies = jiffies;
 812         mult = ((u64)delta) << rt_hash_log;
 813         if (ip_rt_gc_timeout > 1)
 814                 do_div(mult, ip_rt_gc_timeout);
 815         goal = (unsigned int)mult;
 816         if (goal > rt_hash_mask)
 817                 goal = rt_hash_mask + 1;
 818         for (; goal > 0; goal--) {
 819                 unsigned long tmo = ip_rt_gc_timeout;
 820                 unsigned long length;
 821
 822                 i = (i + 1) & rt_hash_mask;
 823                 rthp = &rt_hash_table[i].chain;
 824
 825                 if (need_resched())
 826                         cond_resched();
 827
                     /* Counted before the emptiness test: empty buckets are samples too. */
 828                 samples++;
 829
 830                 if (*rthp == NULL)
 831                         continue;
 832                 length = 0;
 833                 spin_lock_bh(rt_hash_lock_addr(i));
 834                 while ((rth = *rthp) != NULL) {
 835                         prefetch(rth->u.dst.rt_next);
 836                         if (rt_is_expired(rth)) {
 837                                 *rthp = rth->u.dst.rt_next;
 838                                 rt_free(rth);
 839                                 continue;
 840                         }
 841                         if (rth->u.dst.expires) {
 842                                 /* Entry is expired even if it is in use */
 843                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 844 nofree:
                                             /*
                                              * Entry is kept: halve the timeout passed to
                                              * rt_may_expire() for the rest of this chain
                                              * and step past the entry.
                                              */
 845                                         tmo >>= 1;
 846                                         rthp = &rth->u.dst.rt_next;
 847                                         /*
 848                                          * We only count entries on
 849                                          * a chain with equal hash inputs once
 850                                          * so that entries for different QOS
 851                                          * levels, and other non-hash input
 852                                          * attributes don't unfairly skew
 853                                          * the length computation
 854                                          */
 855                                         length += has_noalias(rt_hash_table[i].chain, rth);
 856                                         continue;
 857                                 }
 858                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 859                                 goto nofree;
 860
 861                         /* Cleanup aged off entries. */
 862                         *rthp = rth->u.dst.rt_next;
 863                         rt_free(rth);
 864                 }
 865                 spin_unlock_bh(rt_hash_lock_addr(i));
 866                 sum += length;
 867                 sum2 += length*length;
 868         }
             /*
              * New limit: the larger of ip_rt_gc_elasticity and the mean sampled
              * length plus four standard deviations, scaled down by FRACT_BITS.
              */
 869         if (samples) {
 870                 unsigned long avg = sum / samples;
 871                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 872                 rt_chain_length_max = max_t(unsigned long,
 873                                         ip_rt_gc_elasticity,
 874                                         (avg + 4*sd) >> FRACT_BITS);
 875         }
             /* Remember where to resume on the next call. */
 876         rover = i;
 877 }
 878
 879 /*
 880  * rt_worker_func() is run in process context.
 881  * It calls rt_check_expire() to scan part of the hash table and then
      * queues expires_work again with a delay of ip_rt_gc_interval, so the
      * scan repeats periodically.  The @work argument is not used.
 882  */
 883 static void rt_worker_func(struct work_struct *work)
 884 {
 885         rt_check_expire();
 886         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 887 }
 888
 889 /*
 890  * Perturbation of rt_genid by a small quantity [1..256].
 891  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 892  * many times (2^24) without giving a recent rt_genid.
 893  * Jenkins hash is strong enough that little changes of rt_genid are OK.
      *
      * The increment is one random byte plus one, added atomically to
      * net->ipv4.rt_genid.
 894  */
 895 static void rt_cache_invalidate(struct net *net)
 896 {
 897         unsigned char shuffle;
 898
 899         get_random_bytes(&shuffle, sizeof(shuffle));
 900         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 901 }
 902
 903 /*
 904  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 905  * delay >= 0 : invalidate & flush cache (can be long)
      *
      * Invalidation always happens first, through rt_cache_invalidate(net).
      * Only the sign of @delay is examined.  For the flush, rt_do_flush()
      * receives !in_softirq(), i.e. non-zero when we are not in softirq
      * context (presumably to tell it whether it may reschedule - confirm
      * against rt_do_flush()).
 906  */
 907 void rt_cache_flush(struct net *net, int delay)
 908 {
 909         rt_cache_invalidate(net);
 910         if (delay >= 0)
 911                 rt_do_flush(!in_softirq());
 912 }
 913
 914 /* Flush previously invalidated entries from the cache.
      * Unlike rt_cache_flush() this does not invalidate anything itself; it
      * only calls rt_do_flush(), passing !in_softirq().
      */
 915 void rt_cache_flush_batch(void)
 916 {
 917         rt_do_flush(!in_softirq());
 918 }
 919
     /*
      * Called when a hash chain has grown too long: emits a rate-limited
      * warning and invalidates the cache of @net by perturbing its rt_genid
      * through rt_cache_invalidate().
      */
 920 static void rt_emergency_hash_rebuild(struct net *net)
 921 {
 922         if (net_ratelimit())
 923                 printk(KERN_WARNING "Route hash chain too long!\n");
 924         rt_cache_invalidate(net);
 925 }
 926
 927 /*
 928    Short description of GC goals.
 929
 930    We want an algorithm that keeps the routing cache at an
 931    equilibrium point, where the number of aged-off entries
 932    stays approximately equal to the number of newly created ones.
 933
 934    The current expiration strength is the variable "expire".
 935    We adjust it dynamically: when networking is idle, "expire"
 936    is large enough to keep plenty of warm entries, and when load
 937    increases it shrinks to limit the cache size.

        rt_garbage_collect() itself:
        - @ops is not referenced; the function works on ipv4_dst_ops directly.
        - Returns 0 when the run was skipped (too soon after the last one and
          the cache is below ip_rt_max_size), when the goal was met, or when
          the cache is below ip_rt_max_size afterwards.
        - Returns 1 when the cache is still at or above ip_rt_max_size after
          the scan; a rate-limited "dst cache overflow" warning is logged.
        - "expire" is halved after every full pass that misses the goal and is
          raised by ip_rt_gc_min_interval (capped at ip_rt_gc_timeout) when
          the goal is met.
 938  */
 939
 940 static int rt_garbage_collect(struct dst_ops *ops)
 941 {
 942         static unsigned long expire = RT_GC_TIMEOUT;
 943         static unsigned long last_gc;
 944         static int rover;
 945         static int equilibrium;
 946         struct rtable *rth, **rthp;
 947         unsigned long now = jiffies;
 948         int goal;
 949
 950         /*
 951          * Garbage collection is pretty expensive,
 952          * do not make it too frequently.
 953          */
 954
 955         RT_CACHE_STAT_INC(gc_total);
 956
 957         if (now - last_gc < ip_rt_gc_min_interval &&
 958             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 959                 RT_CACHE_STAT_INC(gc_ignored);
 960                 goto out;
 961         }
 962
 963         /* Calculate number of entries, which we want to expire now. */
 964         goal = atomic_read(&ipv4_dst_ops.entries) -
 965                 (ip_rt_gc_elasticity << rt_hash_log);
 966         if (goal <= 0) {
                     /*
                      * Below elasticity * table size: aim for "equilibrium",
                      * which never drops under gc_thresh and creeps upwards by
                      * at most half the excess (bounded by the table size).
                      */
 967                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 968                         equilibrium = ipv4_dst_ops.gc_thresh;
 969                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 970                 if (goal > 0) {
 971                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 972                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 973                 }
 974         } else {
 975                 /* We are in dangerous area. Try to reduce cache really
 976                  * aggressively.
 977                  */
 978                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 979                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 980         }
 981
 982         if (now - last_gc >= ip_rt_gc_min_interval)
 983                 last_gc = now;
 984
 985         if (goal <= 0) {
 986                 equilibrium += goal;
 987                 goto work_done;
 988         }
 989
 990         do {
 991                 int i, k;
 992
                     /* One pass over at most the whole table, resuming after 'rover'. */
 993                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 994                         unsigned long tmo = expire;
 995
 996                         k = (k + 1) & rt_hash_mask;
 997                         rthp = &rt_hash_table[k].chain;
 998                         spin_lock_bh(rt_hash_lock_addr(k));
 999                         while ((rth = *rthp) != NULL) {
1000                                 if (!rt_is_expired(rth) &&
1001                                         !rt_may_expire(rth, tmo, expire)) {
                                             /* Kept: halve the timeout for the rest of the chain. */
1002                                         tmo >>= 1;
1003                                         rthp = &rth->u.dst.rt_next;
1004                                         continue;
1005                                 }
1006                                 *rthp = rth->u.dst.rt_next;
1007                                 rt_free(rth);
1008                                 goal--;
1009                         }
1010                         spin_unlock_bh(rt_hash_lock_addr(k));
1011                         if (goal <= 0)
1012                                 break;
1013                 }
1014                 rover = k;
1015
1016                 if (goal <= 0)
1017                         goto work_done;
1018
1019                 /* Goal is not achieved. We stop process if:
1020
1021                    - if expire reduced to zero. Otherwise, expire is halved.
1022                    - if table is not full.
1023                    - if we are called from interrupt.
1024                    - jiffies check is just fallback/debug loop breaker.
1025                      We will not spin here for long time in any case.
1026                  */
1027
1028                 RT_CACHE_STAT_INC(gc_goal_miss);
1029
1030                 if (expire == 0)
1031                         break;
1032
1033                 expire >>= 1;
1034 #if RT_CACHE_DEBUG >= 2
                     /* 'expire' is unsigned long, hence %lu. */
1035                 printk(KERN_DEBUG "expire>> %lu %d %d %d\n", expire,
1036                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1037 #endif
1038
1039                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1040                         goto out;
1041         } while (!in_softirq() && time_before_eq(jiffies, now));
1042
1043         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1044                 goto out;
1045         if (net_ratelimit())
1046                 printk(KERN_WARNING "dst cache overflow\n");
1047         RT_CACHE_STAT_INC(gc_dst_overflow);
1048         return 1;
1049
1050 work_done:
             /* Goal met: relax "expire" again, never beyond ip_rt_gc_timeout. */
1051         expire += ip_rt_gc_min_interval;
1052         if (expire > ip_rt_gc_timeout ||
1053             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1054                 expire = ip_rt_gc_timeout;
1055 #if RT_CACHE_DEBUG >= 2
             /* 'expire' is unsigned long, hence %lu. */
1056         printk(KERN_DEBUG "expire++ %lu %d %d %d\n", expire,
1057                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1058 #endif
1059 out:    return 0;
1060 }
1061
1062 /*
1063  * Returns number of entries in a hash chain that have different hash_inputs
      *
      * Walks the chain starting at @head, sums has_noalias(head, rth) over
      * every entry and returns that sum shifted right by FRACT_BITS.
      * A NULL @head yields 0.
1064  */
1065 static int slow_chain_length(const struct rtable *head)
1066 {
1067         int length = 0;
1068         const struct rtable *rth = head;
1069
1070         while (rth) {
1071                 length += has_noalias(head, rth);
1072                 rth = rth->u.dst.rt_next;
1073         }
1074         return length >> FRACT_BITS;
1075 }
1076
     /*
      * rt_intern_hash() - insert a freshly built route into the route cache.
      *
      * @hash:    index of the hash bucket the route belongs to
      * @rt:      the new route; the caller's reference is consumed on failure
      *           (rt_drop()) and when an equivalent cached entry is found
      * @rp:      if non-NULL, receives the route to use
      * @skb:     used only when @rp is NULL: the route is attached with
      *           skb_dst_set()
      * @ifindex: interface index used to recompute @hash after an emergency
      *           rebuild changed rt_genid
      *
      * If an entry with the same keys and namespace is already in the bucket,
      * it is moved to the front of the chain and handed back instead of @rt.
      * Otherwise @rt is bound to a neighbour where applicable and published
      * at the head of the chain.  While walking the chain, expired entries
      * are freed and the lowest-scoring unreferenced entry is remembered so
      * it can be evicted if the chain is longer than ip_rt_gc_elasticity.
      *
      * Returns 0 on success, the error from arp_bind_neighbour(), or -ENOBUFS
      * when the neighbour table stays full after one garbage-collection retry
      * (the retry is only attempted outside softirq context).
      */
1077 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1078                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1079 {
1080         struct rtable   *rth, **rthp;
1081         unsigned long   now;
1082         struct rtable *cand, **candp;
1083         u32             min_score;
1084         int             chain_length;
1085         int attempts = !in_softirq();
1086
1087 restart:
1088         chain_length = 0;
1089         min_score = ~(u32)0;
1090         cand = NULL;
1091         candp = NULL;
1092         now = jiffies;
1093
1094         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1095                 /*
1096                  * If we're not caching, just tell the caller we
1097                  * were successful and don't touch the route.  The
1098                  * caller holds the sole reference to the cache entry, and
1099                  * it will be released when the caller is done with it.
1100                  * If we drop it here, the callers have no way to resolve routes
1101                  * when we're not caching.  Instead, just point *rp at rt, so
1102                  * the caller gets a single use out of the route.
1103                  * Note that we do rt_free on this new route entry, so that
1104                  * once its refcount hits zero, we are still able to reap it
1105                  * (Thanks Alexey)
1106                  * Note also the rt_free uses call_rcu.  We don't actually
1107                  * need rcu protection here, this is just our path to get
1108                  * on the route gc list.
1109                  */
1110
1111                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1112                         int err = arp_bind_neighbour(&rt->u.dst);
1113                         if (err) {
1114                                 if (net_ratelimit())
1115                                         printk(KERN_WARNING
1116                                             "Neighbour table failure & not caching routes.\n");
1117                                 rt_drop(rt);
1118                                 return err;
1119                         }
1120                 }
1121
1122                 rt_free(rt);
1123                 goto skip_hashing;
1124         }
1125
1126         rthp = &rt_hash_table[hash].chain;
1127
1128         spin_lock_bh(rt_hash_lock_addr(hash));
1129         while ((rth = *rthp) != NULL) {
1130                 if (rt_is_expired(rth)) {
1131                         *rthp = rth->u.dst.rt_next;
1132                         rt_free(rth);
1133                         continue;
1134                 }
1135                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1136                         /* Put it first */
1137                         *rthp = rth->u.dst.rt_next;
1138                         /*
1139                          * Since lookup is lockfree, the deletion
1140                          * must be visible to another weakly ordered CPU before
1141                          * the insertion at the start of the hash chain.
1142                          */
1143                         rcu_assign_pointer(rth->u.dst.rt_next,
1144                                            rt_hash_table[hash].chain);
1145                         /*
1146                          * Since lookup is lockfree, the update writes
1147                          * must be ordered for consistency on SMP.
1148                          */
1149                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1150
1151                         dst_use(&rth->u.dst, now);
1152                         spin_unlock_bh(rt_hash_lock_addr(hash));
1153
                             /* The existing entry wins; the new route is discarded. */
1154                         rt_drop(rt);
1155                         if (rp)
1156                                 *rp = rth;
1157                         else
1158                                 skb_dst_set(skb, &rth->u.dst);
1159                         return 0;
1160                 }
1161
                     /* Track the lowest-scoring unreferenced entry as eviction candidate. */
1162                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1163                         u32 score = rt_score(rth);
1164
1165                         if (score <= min_score) {
1166                                 cand = rth;
1167                                 candp = rthp;
1168                                 min_score = score;
1169                         }
1170                 }
1171
1172                 chain_length++;
1173
1174                 rthp = &rth->u.dst.rt_next;
1175         }
1176
1177         if (cand) {
1178                 /* ip_rt_gc_elasticity used to be average length of chain
1179                  * length, when exceeded gc becomes really aggressive.
1180                  *
1181                  * The second limit is less certain. At the moment it allows
1182                  * only 2 entries per bucket. We will see.
1183                  */
1184                 if (chain_length > ip_rt_gc_elasticity) {
1185                         *candp = cand->u.dst.rt_next;
1186                         rt_free(cand);
1187                 }
1188         } else {
                     /*
                      * No unreferenced entry to evict.  If the chain is too long
                      * even when counted by slow_chain_length(), invalidate the
                      * cache, recompute the hash with the new rt_genid and retry.
                      */
1189                 if (chain_length > rt_chain_length_max &&
1190                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1191                         struct net *net = dev_net(rt->u.dst.dev);
1192                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1193                         if (!rt_caching(net)) {
1194                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1195                                         rt->u.dst.dev->name, num);
1196                         }
1197                         rt_emergency_hash_rebuild(net);
1198                         spin_unlock_bh(rt_hash_lock_addr(hash));
1199
1200                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1201                                         ifindex, rt_genid(net));
1202                         goto restart;
1203                 }
1204         }
1205
1206         /* Try to bind route to arp only if it is output
1207            route or unicast forwarding path.
1208          */
1209         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1210                 int err = arp_bind_neighbour(&rt->u.dst);
1211                 if (err) {
1212                         spin_unlock_bh(rt_hash_lock_addr(hash));
1213
1214                         if (err != -ENOBUFS) {
1215                                 rt_drop(rt);
1216                                 return err;
1217                         }
1218
1219                         /* Neighbour tables are full and nothing
1220                            can be released. Try to shrink route cache,
1221                            it is most likely it holds some neighbour records.
1222                          */
1223                         if (attempts-- > 0) {
                                     /*
                                      * Temporarily force the most aggressive GC
                                      * settings, then restore the saved values.
                                      */
1224                                 int saved_elasticity = ip_rt_gc_elasticity;
1225                                 int saved_int = ip_rt_gc_min_interval;
1226                                 ip_rt_gc_elasticity     = 1;
1227                                 ip_rt_gc_min_interval   = 0;
1228                                 rt_garbage_collect(&ipv4_dst_ops);
1229                                 ip_rt_gc_min_interval   = saved_int;
1230                                 ip_rt_gc_elasticity     = saved_elasticity;
1231                                 goto restart;
1232                         }
1233
1234                         if (net_ratelimit())
1235                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1236                         rt_drop(rt);
1237                         return -ENOBUFS;
1238                 }
1239         }
1240
1241         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1242
1243 #if RT_CACHE_DEBUG >= 2
1244         if (rt->u.dst.rt_next) {
1245                 struct rtable *trt;
1246                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1247                        hash, &rt->rt_dst);
1248                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1249                         printk(" . %pI4", &trt->rt_dst);
1250                 printk("\n");
1251         }
1252 #endif
1253         /*
1254          * Since lookup is lockfree, we must make sure
1255          * previous writes to rt are committed to memory
1256          * before making rt visible to other CPUS.
1257          */
1258         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1259
1260         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262 skip_hashing:
1263         if (rp)
1264                 *rp = rt;
1265         else
1266                 skb_dst_set(skb, &rt->u.dst);
1267         return 0;
1268 }
1269
     /*
      * rt_bind_peer() - attach an inet_peer for rt->rt_dst to @rt.
      *
      * The peer is looked up (and, depending on @create, possibly created by
      * inet_getpeer()) before the lock is taken.  Under rt_peer_lock it is
      * stored in rt->peer only if that field is still NULL; if another
      * context attached a peer first, the one obtained here is released
      * again with inet_putpeer().  rt->peer may remain NULL when
      * inet_getpeer() returns NULL.
      */
1270 void rt_bind_peer(struct rtable *rt, int create)
1271 {
1272         static DEFINE_SPINLOCK(rt_peer_lock);
1273         struct inet_peer *peer;
1274
1275         peer = inet_getpeer(rt->rt_dst, create);
1276
1277         spin_lock_bh(&rt_peer_lock);
1278         if (rt->peer == NULL) {
1279                 rt->peer = peer;
                     /* Ownership moved to rt; nothing left to release below. */
1280                 peer = NULL;
1281         }
1282         spin_unlock_bh(&rt_peer_lock);
1283         if (peer)
1284                 inet_putpeer(peer);
1285 }
1286
1287 /*
1288  * Peer allocation may fail only in serious out-of-memory conditions.  However
1289  * we still can generate some output.
1290  * Random ID selection looks a bit dangerous because we have no chance of
1291  * selecting an ID that is unique within a reasonable period of time.
1292  * But a broken packet identifier may be better than no packet at all.
      *
      * The fallback ID is derived with secure_ip_id() from the previous
      * fallback value XORed with the destination address; its low 16 bits
      * become iph->id and the full value is kept for the next call.  The
      * shared state is protected by ip_fb_id_lock.
1293  */
1294 static void ip_select_fb_ident(struct iphdr *iph)
1295 {
1296         static DEFINE_SPINLOCK(ip_fb_id_lock);
1297         static u32 ip_fallback_id;
1298         u32 salt;
1299
1300         spin_lock_bh(&ip_fb_id_lock);
1301         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1302         iph->id = htons(salt & 0xFFFF);
1303         ip_fallback_id = salt;
1304         spin_unlock_bh(&ip_fb_id_lock);
1305 }
1306
     /*
      * __ip_select_ident() - choose the IP identification field for @iph.
      *
      * @dst is treated as a struct rtable.  If it has (or can be given, via
      * rt_bind_peer(rt, 1)) an inet_peer, the ID comes from inet_getid() on
      * that peer, with @more passed through.  If @dst is NULL a debug message
      * is printed; in that case, and when no peer could be attached, the ID
      * is taken from ip_select_fb_ident() instead.
      */
1307 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1308 {
1309         struct rtable *rt = (struct rtable *) dst;
1310
1311         if (rt) {
1312                 if (rt->peer == NULL)
1313                         rt_bind_peer(rt, 1);
1314
1315                 /* If peer is attached to destination, it is never detached,
1316                    so that we need not to grab a lock to dereference it.
1317                  */
1318                 if (rt->peer) {
1319                         iph->id = htons(inet_getid(rt->peer, more));
1320                         return;
1321                 }
1322         } else
1323                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1324                        __builtin_return_address(0));
1325
1326         ip_select_fb_ident(iph);
1327 }
1328
     /*
      * rt_del() - remove @rt from hash bucket @hash.
      *
      * Drops one reference on @rt with ip_rt_put() and then, under the bucket
      * lock, unlinks and frees @rt itself together with every entry in the
      * chain that rt_is_expired() reports.  If @rt is not on the chain only
      * the reference drop and the expired-entry cleanup take effect.
      */
1329 static void rt_del(unsigned hash, struct rtable *rt)
1330 {
1331         struct rtable **rthp, *aux;
1332
1333         rthp = &rt_hash_table[hash].chain;
1334         spin_lock_bh(rt_hash_lock_addr(hash));
1335         ip_rt_put(rt);
1336         while ((aux = *rthp) != NULL) {
1337                 if (aux == rt || rt_is_expired(aux)) {
1338                         *rthp = aux->u.dst.rt_next;
1339                         rt_free(aux);
1340                         continue;
1341                 }
1342                 rthp = &aux->u.dst.rt_next;
1343         }
1344         spin_unlock_bh(rt_hash_lock_addr(hash));
1345 }
1346
     /*
      * ip_rt_redirect() - act on a redirect telling us to reach @daddr through
      * @new_gw instead of @old_gw.
      *
      * @old_gw: gateway currently in use (the address the redirect came from,
      *          as the reject message words it)
      * @daddr:  destination the redirect is about
      * @new_gw: advised gateway
      * @saddr:  source address of the affected traffic
      * @dev:    device the redirect arrived on
      *
      * The redirect is ignored (and logged under CONFIG_IP_ROUTE_VERBOSE when
      * martian logging is on) if @new_gw equals @old_gw, the device does not
      * accept redirects, @new_gw is multicast, limited broadcast or zeronet,
      * route caching is disabled, or the on-link / address-type checks fail.
      *
      * Otherwise, for every combination of source key {saddr, 0} and
      * interface key {dev->ifindex, 0}, the cache bucket is searched for an
      * unexpired entry with fl.iif == 0 that matches the keys.  If that entry
      * really routes @daddr from @saddr via @old_gw on @dev, it is cloned with
      * rt_gateway set to @new_gw and RTCF_REDIRECTED set.  When the clone binds
      * to a NUD_VALID neighbour, NETEVENT_REDIRECT is raised, the old entry is
      * removed with rt_del() and the clone inserted with rt_intern_hash().
      *
      * The in_device reference taken at entry is released on every exit path.
      */
1347 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1348                     __be32 saddr, struct net_device *dev)
1349 {
1350         int i, k;
1351         struct in_device *in_dev = in_dev_get(dev);
1352         struct rtable *rth, **rthp;
1353         __be32  skeys[2] = { saddr, 0 };
1354         int  ikeys[2] = { dev->ifindex, 0 };
1355         struct netevent_redirect netevent;
1356         struct net *net;
1357
1358         if (!in_dev)
1359                 return;
1360
1361         net = dev_net(dev);
1362         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1363             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1364             ipv4_is_zeronet(new_gw))
1365                 goto reject_redirect;
1366
1367         if (!rt_caching(net))
1368                 goto reject_redirect;
1369
1370         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1371                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1372                         goto reject_redirect;
1373                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1374                         goto reject_redirect;
1375         } else {
1376                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1377                         goto reject_redirect;
1378         }
1379
1380         for (i = 0; i < 2; i++) {
1381                 for (k = 0; k < 2; k++) {
1382                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1383                                                 rt_genid(net));
1384
1385                         rthp=&rt_hash_table[hash].chain;
1386
1387                         rcu_read_lock();
1388                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1389                                 struct rtable *rt;
1390
                                     /* Skip entries whose lookup keys do not match. */
1391                                 if (rth->fl.fl4_dst != daddr ||
1392                                     rth->fl.fl4_src != skeys[i] ||
1393                                     rth->fl.oif != ikeys[k] ||
1394                                     rth->fl.iif != 0 ||
1395                                     rt_is_expired(rth) ||
1396                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1397                                         rthp = &rth->u.dst.rt_next;
1398                                         continue;
1399                                 }
1400
                                     /*
                                      * Keys match but the entry does not use old_gw on
                                      * this device (or carries an error): give up on
                                      * this bucket.
                                      */
1401                                 if (rth->rt_dst != daddr ||
1402                                     rth->rt_src != saddr ||
1403                                     rth->u.dst.error ||
1404                                     rth->rt_gateway != old_gw ||
1405                                     rth->u.dst.dev != dev)
1406                                         break;
1407
                                     /* Pin the entry before leaving the RCU read section. */
1408                                 dst_hold(&rth->u.dst);
1409                                 rcu_read_unlock();
1410
1411                                 rt = dst_alloc(&ipv4_dst_ops);
1412                                 if (rt == NULL) {
1413                                         ip_rt_put(rth);
1414                                         in_dev_put(in_dev);
1415                                         return;
1416                                 }
1417
1418                                 /* Copy all the information. */
1419                                 *rt = *rth;
1420                                 rt->u.dst.__use         = 1;
1421                                 atomic_set(&rt->u.dst.__refcnt, 1);
1422                                 rt->u.dst.child         = NULL;
1423                                 if (rt->u.dst.dev)
1424                                         dev_hold(rt->u.dst.dev);
1425                                 if (rt->idev)
1426                                         in_dev_hold(rt->idev);
1427                                 rt->u.dst.obsolete      = -1;
1428                                 rt->u.dst.lastuse       = jiffies;
1429                                 rt->u.dst.path          = &rt->u.dst;
1430                                 rt->u.dst.neighbour     = NULL;
1431                                 rt->u.dst.hh            = NULL;
1432 #ifdef CONFIG_XFRM
1433                                 rt->u.dst.xfrm          = NULL;
1434 #endif
1435                                 rt->rt_genid            = rt_genid(net);
1436                                 rt->rt_flags            |= RTCF_REDIRECTED;
1437
1438                                 /* Gateway is different ... */
1439                                 rt->rt_gateway          = new_gw;
1440
1441                                 /* Redirect received -> path was valid */
1442                                 dst_confirm(&rth->u.dst);
1443
                                     /* The structure copy duplicated the peer pointer. */
1444                                 if (rt->peer)
1445                                         atomic_inc(&rt->peer->refcnt);
1446
1447                                 if (arp_bind_neighbour(&rt->u.dst) ||
1448                                     !(rt->u.dst.neighbour->nud_state &
1449                                             NUD_VALID)) {
1450                                         if (rt->u.dst.neighbour)
1451                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1452                                         ip_rt_put(rth);
1453                                         rt_drop(rt);
1454                                         goto do_next;
1455                                 }
1456
1457                                 netevent.old = &rth->u.dst;
1458                                 netevent.new = &rt->u.dst;
1459                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1460                                                         &netevent);
1461
                                     /* rt_del() also drops the reference taken by dst_hold(). */
1462                                 rt_del(hash, rth);
1463                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1464                                         ip_rt_put(rt);
1465                                 goto do_next;
1466                         }
1467                         rcu_read_unlock();
1468                 do_next:
1469                         ;
1470                 }
1471         }
1472         in_dev_put(in_dev);
1473         return;
1474
1475 reject_redirect:
1476 #ifdef CONFIG_IP_ROUTE_VERBOSE
1477         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1478                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1479                         "  Advised path = %pI4 -> %pI4\n",
1480                        &old_gw, dev->name, &new_gw,
1481                        &saddr, &daddr);
1482 #endif
1483         in_dev_put(in_dev);
1484 }
1485
     /*
      * ipv4_negative_advice() - react to a hint that @dst is not working well.
      *
      * Returns @dst unchanged when nothing is done (including @dst == NULL),
      * or NULL after giving the route up:
      *  - dst->obsolete > 0: the reference is released with ip_rt_put();
      *  - the route was created by a redirect (RTCF_REDIRECTED) or its
      *    dst.expires deadline has been reached: it is removed from the cache
      *    with rt_del(), which also drops the reference.
      */
1486 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1487 {
1488         struct rtable *rt = (struct rtable *)dst;
1489         struct dst_entry *ret = dst;
1490
1491         if (rt) {
1492                 if (dst->obsolete > 0) {
1493                         ip_rt_put(rt);
1494                         ret = NULL;
1495                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1496                            (rt->u.dst.expires &&
1497                             time_after_eq(jiffies, rt->u.dst.expires))) {
1498                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1499                                                 rt->fl.oif,
1500                                                 rt_genid(dev_net(dst->dev)));
1501 #if RT_CACHE_DEBUG >= 1
1502                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1503                                 &rt->rt_dst, rt->fl.fl4_tos);
1504 #endif
1505                         rt_del(hash, rt);
1506                         ret = NULL;
1507                 }
1508         }
1509         return ret;
1510 }
1511
1512 /*
1513  * Algorithm:
1514  *      1. The first ip_rt_redirect_number redirects are sent
1515  *         with exponential backoff, then we stop sending them at all,
1516  *         assuming that the host ignores our redirects.
1517  *      2. If we did not see packets requiring redirects
1518  *         during ip_rt_redirect_silence, we assume that the host
1519  *         forgot the redirected route and start to send redirects again.
1520  *
1521  * This algorithm is much cheaper and more intelligent than dumb load limiting
1522  * in icmp.c.
1523  *
1524  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1525  * and "frag. need" (breaks PMTU discovery) in icmp.c.
      *
      * State is kept in the route attached to @skb: dst.rate_tokens counts the
      * redirects sent so far and dst.rate_last holds the time of the last
      * redirect sent (or, once the limit is reached, of the last packet that
      * would have needed one).  Nothing is sent when the device has no
      * in_device or transmitting redirects is disabled on it.
1526  */
1527
1528 void ip_rt_send_redirect(struct sk_buff *skb)
1529 {
1530         struct rtable *rt = skb_rtable(skb);
1531         struct in_device *in_dev;
1532         int log_martians;
1533
1534         rcu_read_lock();
1535         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1536         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1537                 rcu_read_unlock();
1538                 return;
1539         }
             /* Read what we need from in_dev before leaving the RCU section. */
1540         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1541         rcu_read_unlock();
1542
1543         /* No redirected packets during ip_rt_redirect_silence;
1544          * reset the algorithm.
1545          */
1546         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1547                 rt->u.dst.rate_tokens = 0;
1548
1549         /* Too many ignored redirects; do not send anything
1550          * set u.dst.rate_last to the last seen redirected packet.
1551          */
1552         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1553                 rt->u.dst.rate_last = jiffies;
1554                 return;
1555         }
1556
1557         /* Check for load limit; set rate_last to the latest sent
1558          * redirect.
1559          */
1560         if (rt->u.dst.rate_tokens == 0 ||
1561             time_after(jiffies,
1562                        (rt->u.dst.rate_last +
1563                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1564                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1565                 rt->u.dst.rate_last = jiffies;
1566                 ++rt->u.dst.rate_tokens;
1567 #ifdef CONFIG_IP_ROUTE_VERBOSE
1568                 if (log_martians &&
1569                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1570                     net_ratelimit())
1571                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1572                                 &rt->rt_src, rt->rt_iif,
1573                                 &rt->rt_dst, &rt->rt_gateway);
1574 #endif
1575         }
1576 }
1577
     /*
      * ip_error() - handle a packet whose route carries an error.
      *
      * Maps rt->u.dst.error to an ICMP destination-unreachable code
      * (EHOSTUNREACH, ENETUNREACH and EACCES are handled; ENETUNREACH also
      * bumps IPSTATS_MIB_INNOROUTES).  EINVAL and any other value produce no
      * ICMP at all.  Sending is rate limited per route: rate_tokens grows by
      * the jiffies elapsed since rate_last, is capped at ip_rt_error_burst,
      * and each ICMP sent costs ip_rt_error_cost.
      *
      * The skb is always freed.  Returns 0.
      */
1578 static int ip_error(struct sk_buff *skb)
1579 {
1580         struct rtable *rt = skb_rtable(skb);
1581         unsigned long now;
1582         int code;
1583
1584         switch (rt->u.dst.error) {
1585                 case EINVAL:
1586                 default:
1587                         goto out;
1588                 case EHOSTUNREACH:
1589                         code = ICMP_HOST_UNREACH;
1590                         break;
1591                 case ENETUNREACH:
1592                         code = ICMP_NET_UNREACH;
1593                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1594                                         IPSTATS_MIB_INNOROUTES);
1595                         break;
1596                 case EACCES:
1597                         code = ICMP_PKT_FILTERED;
1598                         break;
1599         }
1600
             /* Refill the per-route budget, then spend from it if enough is left. */
1601         now = jiffies;
1602         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1603         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1604                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1605         rt->u.dst.rate_last = now;
1606         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1607                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1608                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1609         }
1610
1611 out:    kfree_skb(skb);
1612         return 0;
1613 }
1614
/*
 *      MTU plateau table, largest value first.
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * guess_mtu - pick the next plateau strictly below @old_mtu.
 * @old_mtu: size the estimate has to stay below
 *
 * Returns the first (i.e. largest) entry of mtu_plateau[] that is smaller
 * than @old_mtu, or 68 when @old_mtu is not larger than any entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        unsigned int i; /* unsigned: compared against ARRAY_SIZE() */

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) {
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        }
        return 68;
}
1632
/*
 * ip_rt_frag_needed - lower the cached path MTU for a destination.
 * @net:     namespace whose route cache generation is used for hashing
 * @iph:     IP header supplying saddr, daddr, tot_len and ihl
 * @new_mtu: MTU proposed by the caller
 * @dev:     device whose ifindex is used as one of the output-interface keys
 *
 * Walks the route cache chains for every combination of source key
 * (iph->saddr or 0) and output-interface key (dev->ifindex or 0).  Entries
 * that are input routes, belong to another namespace, are expired or have
 * a locked MTU metric are skipped.  When @new_mtu is below 68 or not smaller
 * than the length taken from iph->tot_len, a value is guessed with
 * guess_mtu() instead.  A smaller MTU is written to the entry (clamped to
 * ip_rt_min_pmtu, in which case the metric is also locked) and the entry is
 * given an expiry of ip_rt_mtu_expires.
 *
 * Returns the last MTU accepted for a matching entry, or @new_mtu when no
 * entry accepted one.
 */
1633 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1634                                  unsigned short new_mtu,
1635                                  struct net_device *dev)
1636 {
1637         int i, k;
1638         unsigned short old_mtu = ntohs(iph->tot_len);
1639         struct rtable *rth;
1640         int  ikeys[2] = { dev->ifindex, 0 };
1641         __be32  skeys[2] = { iph->saddr, 0, };
1642         __be32  daddr = iph->daddr;
1643         unsigned short est_mtu = 0;
1644
             /* k selects the output-interface key, i the source-address key. */
1645         for (k = 0; k < 2; k++) {
1646                 for (i = 0; i < 2; i++) {
1647                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1648                                                 rt_genid(net));
1649
1650                         rcu_read_lock();
1651                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1652                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1653                                 unsigned short mtu = new_mtu;
1654
                                     /* Only matching, unlocked, live output routes qualify. */
1655                                 if (rth->fl.fl4_dst != daddr ||
1656                                     rth->fl.fl4_src != skeys[i] ||
1657                                     rth->rt_dst != daddr ||
1658                                     rth->rt_src != iph->saddr ||
1659                                     rth->fl.oif != ikeys[k] ||
1660                                     rth->fl.iif != 0 ||
1661                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1662                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1663                                     rt_is_expired(rth))
1664                                         continue;
1665
                                     /* Implausible proposal: fall back to the plateau table. */
1666                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1667
1668                                         /* BSD 4.2 compatibility hack :-( */
1669                                         if (mtu == 0 &&
1670                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1671                                             old_mtu >= 68 + (iph->ihl << 2))
1672                                                 old_mtu -= iph->ihl << 2;
1673
1674                                         mtu = guess_mtu(old_mtu);
1675                                 }
1676                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1677                                         if (mtu < dst_mtu(&rth->u.dst)) {
1678                                                 dst_confirm(&rth->u.dst);
                                                     /* Never go below the floor; lock the metric when clamped. */
1679                                                 if (mtu < ip_rt_min_pmtu) {
1680                                                         mtu = ip_rt_min_pmtu;
1681                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1682                                                                 (1 << RTAX_MTU);
1683                                                 }
1684                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1685                                                 dst_set_expires(&rth->u.dst,
1686                                                         ip_rt_mtu_expires);
1687                                         }
1688                                         est_mtu = mtu;
1689                                 }
1690                         }
1691                         rcu_read_unlock();
1692                 }
1693         }
1694         return est_mtu ? : new_mtu;
1695 }
1696
1697 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1698 {
1699         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1700             !(dst_metric_locked(dst, RTAX_MTU))) {
1701                 if (mtu < ip_rt_min_pmtu) {
1702                         mtu = ip_rt_min_pmtu;
1703                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1704                 }
1705                 dst->metrics[RTAX_MTU-1] = mtu;
1706                 dst_set_expires(dst, ip_rt_mtu_expires);
1707                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1708         }
1709 }
1710
1711 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1712 {
1713         if (rt_is_expired((struct rtable *)dst))
1714                 return NULL;
1715         return dst;
1716 }
1717
1718 static void ipv4_dst_destroy(struct dst_entry *dst)
1719 {
1720         struct rtable *rt = (struct rtable *) dst;
1721         struct inet_peer *peer = rt->peer;
1722         struct in_device *idev = rt->idev;
1723
1724         if (peer) {
1725                 rt->peer = NULL;
1726                 inet_putpeer(peer);
1727         }
1728
1729         if (idev) {
1730                 rt->idev = NULL;
1731                 in_dev_put(idev);
1732         }
1733 }
1734
1735 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1736                             int how)
1737 {
1738         struct rtable *rt = (struct rtable *) dst;
1739         struct in_device *idev = rt->idev;
1740         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1741                 struct in_device *loopback_idev =
1742                         in_dev_get(dev_net(dev)->loopback_dev);
1743                 if (loopback_idev) {
1744                         rt->idev = loopback_idev;
1745                         in_dev_put(idev);
1746                 }
1747         }
1748 }
1749
1750 static void ipv4_link_failure(struct sk_buff *skb)
1751 {
1752         struct rtable *rt;
1753
1754         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1755
1756         rt = skb_rtable(skb);
1757         if (rt)
1758                 dst_set_expires(&rt->u.dst, 0);
1759 }
1760
1761 static int ip_rt_bug(struct sk_buff *skb)
1762 {
1763         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1764                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1765                 skb->dev ? skb->dev->name : "?");
1766         kfree_skb(skb);
1767         return 0;
1768 }
1769
1770 /*
1771    We do not cache the source address of the outgoing interface,
1772    because it is used only by the IP RR, TS and SRR options,
1773    so it is out of the fast path.
1774
1775    BTW remember: "addr" is allowed to be unaligned
1776    in IP options!
1777  */
1778
1779 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1780 {
1781         __be32 src;
1782         struct fib_result res;
1783
1784         if (rt->fl.iif == 0)
1785                 src = rt->rt_src;
1786         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1787                 src = FIB_RES_PREFSRC(res);
1788                 fib_res_put(&res);
1789         } else
1790                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1791                                         RT_SCOPE_UNIVERSE);
1792         memcpy(addr, &src, 4);
1793 }
1794
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - fill in the unset halves of a route's class id.
 * @rt:  route whose u.dst.tclassid is updated
 * @tag: value supplying the missing bits
 *
 * The low and the high 16 bits are handled independently: a half that is
 * still zero is taken from @tag, a half that is already set is kept.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->u.dst.tclassid & low_half) == 0)
                rt->u.dst.tclassid |= tag & low_half;
        if ((rt->u.dst.tclassid & high_half) == 0)
                rt->u.dst.tclassid |= tag & high_half;
}
#endif
1804
/*
 * rt_set_nexthop - fill gateway, metrics, class id and type of a new route.
 * @rt:   route being initialised; rt->u.dst.dev must already be set because
 *        the device MTU is read here
 * @res:  FIB lookup result the route is derived from
 * @itag: class tag merged in last (only with CONFIG_NET_CLS_ROUTE)
 *
 * With FIB info present the gateway is taken from the next hop when it has
 * link scope, and the metrics are copied from the FIB entry.  Afterwards
 * missing or out-of-range MTU, hop limit and advertised MSS metrics are
 * given defaults or clamped.
 */
1805 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1806 {
1807         struct fib_info *fi = res->fi;
1808
1809         if (fi) {
1810                 if (FIB_RES_GW(*res) &&
1811                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1812                         rt->rt_gateway = FIB_RES_GW(*res);
1813                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1814                        sizeof(rt->u.dst.metrics));
                     /* No MTU configured on the FIB entry: use the device MTU. */
1815                 if (fi->fib_mtu == 0) {
1816                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                             /* Locked MTU on a gatewayed route is capped at 576. */
1817                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1818                             rt->rt_gateway != rt->rt_dst &&
1819                             rt->u.dst.dev->mtu > 576)
1820                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1821                 }
1822 #ifdef CONFIG_NET_CLS_ROUTE
1823                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1824 #endif
1825         } else
1826                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1827
             /* Defaults and upper bounds for metrics left unset above. */
1828         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1829                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1830         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1831                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1832         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1833                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1834                                        ip_rt_min_advmss);
1835         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1836                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1837
1838 #ifdef CONFIG_NET_CLS_ROUTE
1839 #ifdef CONFIG_IP_MULTIPLE_TABLES
1840         set_class_tag(rt, fib_rules_tclass(res));
1841 #endif
1842         set_class_tag(rt, itag);
1843 #endif
1844         rt->rt_type = res->type;
1845 }
1846
1847 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1848                                 u8 tos, struct net_device *dev, int our)
1849 {
1850         unsigned hash;
1851         struct rtable *rth;
1852         __be32 spec_dst;
1853         struct in_device *in_dev = in_dev_get(dev);
1854         u32 itag = 0;
1855
1856         /* Primary sanity checks. */
1857
1858         if (in_dev == NULL)
1859                 return -EINVAL;
1860
1861         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1862             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1863                 goto e_inval;
1864
1865         if (ipv4_is_zeronet(saddr)) {
1866                 if (!ipv4_is_local_multicast(daddr))
1867                         goto e_inval;
1868                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1869         } else if (fib_validate_source(saddr, 0, tos, 0,
1870                                         dev, &spec_dst, &itag, 0) < 0)
1871                 goto e_inval;
1872
1873         rth = dst_alloc(&ipv4_dst_ops);
1874         if (!rth)
1875                 goto e_nobufs;
1876
1877         rth->u.dst.output = ip_rt_bug;
1878         rth->u.dst.obsolete = -1;
1879
1880         atomic_set(&rth->u.dst.__refcnt, 1);
1881         rth->u.dst.flags= DST_HOST;
1882         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1883                 rth->u.dst.flags |= DST_NOPOLICY;
1884         rth->fl.fl4_dst = daddr;
1885         rth->rt_dst     = daddr;
1886         rth->fl.fl4_tos = tos;
1887         rth->fl.mark    = skb->mark;
1888         rth->fl.fl4_src = saddr;
1889         rth->rt_src     = saddr;
1890 #ifdef CONFIG_NET_CLS_ROUTE
1891         rth->u.dst.tclassid = itag;
1892 #endif
1893         rth->rt_iif     =
1894         rth->fl.iif     = dev->ifindex;
1895         rth->u.dst.dev  = init_net.loopback_dev;
1896         dev_hold(rth->u.dst.dev);
1897         rth->idev       = in_dev_get(rth->u.dst.dev);
1898         rth->fl.oif     = 0;
1899         rth->rt_gateway = daddr;
1900         rth->rt_spec_dst= spec_dst;
1901         rth->rt_genid   = rt_genid(dev_net(dev));
1902         rth->rt_flags   = RTCF_MULTICAST;
1903         rth->rt_type    = RTN_MULTICAST;
1904         if (our) {
1905                 rth->u.dst.input= ip_local_deliver;
1906                 rth->rt_flags |= RTCF_LOCAL;
1907         }
1908
1909 #ifdef CONFIG_IP_MROUTE
1910         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1911                 rth->u.dst.input = ip_mr_input;
1912 #endif
1913         RT_CACHE_STAT_INC(in_slow_mc);
1914
1915         in_dev_put(in_dev);
1916         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1917         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1918
1919 e_nobufs:
1920         in_dev_put(in_dev);
1921         return -ENOBUFS;
1922
1923 e_inval:
1924         in_dev_put(in_dev);
1925         return -EINVAL;
1926 }
1927
1928
1929 static void ip_handle_martian_source(struct net_device *dev,
1930                                      struct in_device *in_dev,
1931                                      struct sk_buff *skb,
1932                                      __be32 daddr,
1933                                      __be32 saddr)
1934 {
1935         RT_CACHE_STAT_INC(in_martian_src);
1936 #ifdef CONFIG_IP_ROUTE_VERBOSE
1937         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1938                 /*
1939                  *      RFC1812 recommendation, if source is martian,
1940                  *      the only hint is MAC header.
1941                  */
1942                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1943                         &daddr, &saddr, dev->name);
1944                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1945                         int i;
1946                         const unsigned char *p = skb_mac_header(skb);
1947                         printk(KERN_WARNING "ll header: ");
1948                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1949                                 printk("%02x", *p);
1950                                 if (i < (dev->hard_header_len - 1))
1951                                         printk(":");
1952                         }
1953                         printk("\n");
1954                 }
1955         }
1956 #endif
1957 }
1958
/*
 * __mkroute_input - allocate and fill a forwarding route for an input packet.
 * @skb:    received packet; its mark and protocol are read here
 * @res:    FIB lookup result naming the output device and next hop
 * @in_dev: in_device of the receiving interface
 * @daddr:  destination address
 * @saddr:  source address
 * @tos:    type-of-service value stored in the route key
 * @result: receives the new route on success
 *
 * The source address is validated against the FIB first; a failure is
 * reported as a martian source.  RTCF_DOREDIRECT is set when the packet
 * would leave through the interface it arrived on and the medium is shared
 * or the source is on-link with the gateway.  Non-IP callers are refused
 * a route back out of the incoming interface unless the proxy-ARP private
 * VLAN setting is enabled there.
 *
 * Returns 0 with *@result set, -EINVAL for a missing output in_device,
 * failed source validation or a refused non-IP route, -ENOBUFS when
 * allocation fails.  The new route is not inserted into the cache here.
 */
1959 static int __mkroute_input(struct sk_buff *skb,
1960                            struct fib_result *res,
1961                            struct in_device *in_dev,
1962                            __be32 daddr, __be32 saddr, u32 tos,
1963                            struct rtable **result)
1964 {
1965
1966         struct rtable *rth;
1967         int err;
1968         struct in_device *out_dev;
1969         unsigned flags = 0;
1970         __be32 spec_dst;
1971         u32 itag;
1972
1973         /* get a working reference to the output device */
1974         out_dev = in_dev_get(FIB_RES_DEV(*res));
1975         if (out_dev == NULL) {
1976                 if (net_ratelimit())
1977                         printk(KERN_CRIT "Bug in ip_route_input" \
1978                                "_slow(). Please, report\n");
1979                 return -EINVAL;
1980         }
1981
1982
1983         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1984                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1985         if (err < 0) {
1986                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1987                                          saddr);
1988
1989                 err = -EINVAL;
1990                 goto cleanup;
1991         }
1992
             /* A positive validation result marks the source as directly reachable. */
1993         if (err)
1994                 flags |= RTCF_DIRECTSRC;
1995
1996         if (out_dev == in_dev && err &&
1997             (IN_DEV_SHARED_MEDIA(out_dev) ||
1998              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1999                 flags |= RTCF_DOREDIRECT;
2000
2001         if (skb->protocol != htons(ETH_P_IP)) {
2002                 /* Not IP (i.e. ARP). Do not create route, if it is
2003                  * invalid for proxy arp. DNAT routes are always valid.
2004                  *
2005                  * Proxy arp feature have been extended to allow, ARP
2006                  * replies back to the same interface, to support
2007                  * Private VLAN switch technologies. See arp.c.
2008                  */
2009                 if (out_dev == in_dev &&
2010                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2011                         err = -EINVAL;
2012                         goto cleanup;
2013                 }
2014         }
2015
2016
2017         rth = dst_alloc(&ipv4_dst_ops);
2018         if (!rth) {
2019                 err = -ENOBUFS;
2020                 goto cleanup;
2021         }
2022
2023         atomic_set(&rth->u.dst.__refcnt, 1);
2024         rth->u.dst.flags= DST_HOST;
2025         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026                 rth->u.dst.flags |= DST_NOPOLICY;
2027         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028                 rth->u.dst.flags |= DST_NOXFRM;
2029         rth->fl.fl4_dst = daddr;
2030         rth->rt_dst     = daddr;
2031         rth->fl.fl4_tos = tos;
2032         rth->fl.mark    = skb->mark;
2033         rth->fl.fl4_src = saddr;
2034         rth->rt_src     = saddr;
2035         rth->rt_gateway = daddr;
2036         rth->rt_iif     =
2037                 rth->fl.iif     = in_dev->dev->ifindex;
             /* The route takes its own references on the output device. */
2038         rth->u.dst.dev  = (out_dev)->dev;
2039         dev_hold(rth->u.dst.dev);
2040         rth->idev       = in_dev_get(rth->u.dst.dev);
2041         rth->fl.oif     = 0;
2042         rth->rt_spec_dst= spec_dst;
2043
2044         rth->u.dst.obsolete = -1;
2045         rth->u.dst.input = ip_forward;
2046         rth->u.dst.output = ip_output;
2047         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2048
2049         rt_set_nexthop(rth, res, itag);
2050
2051         rth->rt_flags = flags;
2052
2053         *result = rth;
2054         err = 0;
2055  cleanup:
2056         /* release the working reference to the output device */
2057         in_dev_put(out_dev);
2058         return err;
2059 }
2060
2061 static int ip_mkroute_input(struct sk_buff *skb,
2062                             struct fib_result *res,
2063                             const struct flowi *fl,
2064                             struct in_device *in_dev,
2065                             __be32 daddr, __be32 saddr, u32 tos)
2066 {
2067         struct rtable* rth = NULL;
2068         int err;
2069         unsigned hash;
2070
2071 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2072         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2073                 fib_select_multipath(fl, res);
2074 #endif
2075
2076         /* create a routing cache entry */
2077         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2078         if (err)
2079                 return err;
2080
2081         /* put it into the cache */
2082         hash = rt_hash(daddr, saddr, fl->iif,
2083                        rt_genid(dev_net(rth->u.dst.dev)));
2084         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2085 }
2086
2087 /*
2088  *      NOTE. We drop all packets that have local source
2089  *      addresses, because every properly looped back packet
2090  *      must have the correct destination already attached by the output routine.
2091  *
2092  *      This approach solves two big problems:
2093  *      1. Non-simplex devices are handled properly.
2094  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2095  */
2096
/*
 * ip_route_input_slow - resolve an input route when the cache has no entry.
 * @skb:   received packet; its mark and protocol are read here
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type-of-service value
 * @dev:   device the packet arrived on
 *
 * Outline:
 *  - obviously bad sources go to martian_source (logged, -EINVAL);
 *  - limited broadcast, or all-zero source and destination, go to brd_input;
 *  - bad destinations go to martian_destination (-EHOSTUNREACH);
 *  - otherwise the FIB is consulted:
 *      lookup failure  -> -EHOSTUNREACH when forwarding is off on the
 *                         device, else an ip_error route (no_route);
 *      RTN_BROADCAST   -> brd_input;
 *      RTN_LOCAL       -> local_input after validating the source;
 *      RTN_UNICAST     -> ip_mkroute_input() when forwarding is enabled;
 *      anything else   -> martian_destination.
 *
 * brd_input, local_input and no_route all build the entry in local_input
 * and insert it with rt_intern_hash().
 *
 * Returns 0 or a negative errno.  -EINVAL is returned right away when @dev
 * has no in_device.
 */
2097 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098                                u8 tos, struct net_device *dev)
2099 {
2100         struct fib_result res;
2101         struct in_device *in_dev = in_dev_get(dev);
2102         struct flowi fl = { .nl_u = { .ip4_u =
2103                                       { .daddr = daddr,
2104                                         .saddr = saddr,
2105                                         .tos = tos,
2106                                         .scope = RT_SCOPE_UNIVERSE,
2107                                       } },
2108                             .mark = skb->mark,
2109                             .iif = dev->ifindex };
2110         unsigned        flags = 0;
2111         u32             itag = 0;
2112         struct rtable * rth;
2113         unsigned        hash;
2114         __be32          spec_dst;
2115         int             err = -EINVAL;
2116         int             free_res = 0;
2117         struct net    * net = dev_net(dev);
2118
2119         /* IP on this device is disabled. */
2120
2121         if (!in_dev)
2122                 goto out;
2123
2124         /* Check for the most weird martians, which can be not detected
2125            by fib_lookup.
2126          */
2127
2128         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2129             ipv4_is_loopback(saddr))
2130                 goto martian_source;
2131
2132         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2133                 goto brd_input;
2134
2135         /* Accept zero addresses only to limited broadcast;
2136          * I even do not know to fix it or not. Waiting for complains :-)
2137          */
2138         if (ipv4_is_zeronet(saddr))
2139                 goto martian_source;
2140
2141         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2142             ipv4_is_loopback(daddr))
2143                 goto martian_destination;
2144
2145         /*
2146          *      Now we are ready to route packet.
2147          */
2148         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2149                 if (!IN_DEV_FORWARD(in_dev))
2150                         goto e_hostunreach;
2151                 goto no_route;
2152         }
             /* From here on "res" holds a reference that "done" must drop. */
2153         free_res = 1;
2154
2155         RT_CACHE_STAT_INC(in_slow_tot);
2156
2157         if (res.type == RTN_BROADCAST)
2158                 goto brd_input;
2159
2160         if (res.type == RTN_LOCAL) {
2161                 int result;
2162                 result = fib_validate_source(saddr, daddr, tos,
2163                                              net->loopback_dev->ifindex,
2164                                              dev, &spec_dst, &itag, skb->mark);
2165                 if (result < 0)
2166                         goto martian_source;
2167                 if (result)
2168                         flags |= RTCF_DIRECTSRC;
2169                 spec_dst = daddr;
2170                 goto local_input;
2171         }
2172
2173         if (!IN_DEV_FORWARD(in_dev))
2174                 goto e_hostunreach;
2175         if (res.type != RTN_UNICAST)
2176                 goto martian_destination;
2177
2178         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
     /* Common exit: drop the in_device and, if taken, the FIB result reference. */
2179 done:
2180         in_dev_put(in_dev);
2181         if (free_res)
2182                 fib_res_put(&res);
2183 out:    return err;
2184
2185 brd_input:
2186         if (skb->protocol != htons(ETH_P_IP))
2187                 goto e_inval;
2188
2189         if (ipv4_is_zeronet(saddr))
2190                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2191         else {
2192                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2193                                           &itag, skb->mark);
2194                 if (err < 0)
2195                         goto martian_source;
2196                 if (err)
2197                         flags |= RTCF_DIRECTSRC;
2198         }
2199         flags |= RTCF_BROADCAST;
2200         res.type = RTN_BROADCAST;
2201         RT_CACHE_STAT_INC(in_brd);
2202
     /* Build an entry bound to this namespace's loopback device. */
2203 local_input:
2204         rth = dst_alloc(&ipv4_dst_ops);
2205         if (!rth)
2206                 goto e_nobufs;
2207
2208         rth->u.dst.output= ip_rt_bug;
2209         rth->u.dst.obsolete = -1;
2210         rth->rt_genid = rt_genid(net);
2211
2212         atomic_set(&rth->u.dst.__refcnt, 1);
2213         rth->u.dst.flags= DST_HOST;
2214         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2215                 rth->u.dst.flags |= DST_NOPOLICY;
2216         rth->fl.fl4_dst = daddr;
2217         rth->rt_dst     = daddr;
2218         rth->fl.fl4_tos = tos;
2219         rth->fl.mark    = skb->mark;
2220         rth->fl.fl4_src = saddr;
2221         rth->rt_src     = saddr;
2222 #ifdef CONFIG_NET_CLS_ROUTE
2223         rth->u.dst.tclassid = itag;
2224 #endif
2225         rth->rt_iif     =
2226         rth->fl.iif     = dev->ifindex;
2227         rth->u.dst.dev  = net->loopback_dev;
2228         dev_hold(rth->u.dst.dev);
2229         rth->idev       = in_dev_get(rth->u.dst.dev);
2230         rth->rt_gateway = daddr;
2231         rth->rt_spec_dst= spec_dst;
2232         rth->u.dst.input= ip_local_deliver;
2233         rth->rt_flags   = flags|RTCF_LOCAL;
             /* Reached via no_route: hand packets to ip_error with the stored errno. */
2234         if (res.type == RTN_UNREACHABLE) {
2235                 rth->u.dst.input= ip_error;
2236                 rth->u.dst.error= -err;
2237                 rth->rt_flags   &= ~RTCF_LOCAL;
2238         }
2239         rth->rt_type    = res.type;
2240         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2241         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2242         goto done;
2243
2244 no_route:
2245         RT_CACHE_STAT_INC(in_no_route);
2246         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2247         res.type = RTN_UNREACHABLE;
2248         if (err == -ESRCH)
2249                 err = -ENETUNREACH;
2250         goto local_input;
2251
2252         /*
2253          *      Do not cache martian addresses: they should be logged (RFC1812)
2254          */
2255 martian_destination:
2256         RT_CACHE_STAT_INC(in_martian_dst);
2257 #ifdef CONFIG_IP_ROUTE_VERBOSE
2258         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2259                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2260                         &daddr, &saddr, dev->name);
2261 #endif
2262
2263 e_hostunreach:
2264         err = -EHOSTUNREACH;
2265         goto done;
2266
2267 e_inval:
2268         err = -EINVAL;
2269         goto done;
2270
2271 e_nobufs:
2272         err = -ENOBUFS;
2273         goto done;
2274
2275 martian_source:
2276         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2277         goto e_inval;
2278 }
2279
/*
 * ip_route_input_common - attach an input route to a received packet.
 * @skb:   received packet
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type-of-service value
 * @dev:   device the packet arrived on
 * @noref: when true a cache hit is attached with skb_dst_set_noref() and
 *         dst_use_noref(); otherwise skb_dst_set() and dst_use() are used
 *
 * When route caching is enabled for the namespace, the hash chain is
 * searched under rcu_read_lock() for a live input entry matching
 * addresses, input interface, tos and mark.  On a miss, multicast
 * destinations are handled here: a route is created only if the group is
 * joined on @dev or (with CONFIG_IP_MROUTE) multicast forwarding applies,
 * otherwise -EINVAL is returned.  Everything else goes to
 * ip_route_input_slow().
 *
 * Returns 0 on a cache hit, otherwise the result of the slow path.
 *
 * NOTE(review): tos is masked with IPTOS_RT_MASK only after the
 * rt_caching() test, so with caching disabled the unmasked value is passed
 * on to the slow path - confirm this is intended.
 */
2280 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2281                            u8 tos, struct net_device *dev, bool noref)
2282 {
2283         struct rtable * rth;
2284         unsigned        hash;
2285         int iif = dev->ifindex;
2286         struct net *net;
2287
2288         net = dev_net(dev);
2289
2290         if (!rt_caching(net))
2291                 goto skip_cache;
2292
2293         tos &= IPTOS_RT_MASK;
2294         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2295
2296         rcu_read_lock();
2297         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2298              rth = rcu_dereference(rth->u.dst.rt_next)) {
                     /* The OR of XORs is zero only if every key matches and oif is 0. */
2299                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2300                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2301                      (rth->fl.iif ^ iif) |
2302                      rth->fl.oif |
2303                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2304                     rth->fl.mark == skb->mark &&
2305                     net_eq(dev_net(rth->u.dst.dev), net) &&
2306                     !rt_is_expired(rth)) {
2307                         if (noref) {
2308                                 dst_use_noref(&rth->u.dst, jiffies);
2309                                 skb_dst_set_noref(skb, &rth->u.dst);
2310                         } else {
2311                                 dst_use(&rth->u.dst, jiffies);
2312                                 skb_dst_set(skb, &rth->u.dst);
2313                         }
2314                         RT_CACHE_STAT_INC(in_hit);
2315                         rcu_read_unlock();
2316                         return 0;
2317                 }
2318                 RT_CACHE_STAT_INC(in_hlist_search);
2319         }
2320         rcu_read_unlock();
2321
2322 skip_cache:
2323         /* Multicast recognition logic is moved from route cache to here.
2324            The problem was that too many Ethernet cards have broken/missing
2325            hardware multicast filters :-( As result the host on multicasting
2326            network acquires a lot of useless route cache entries, sort of
2327            SDR messages from all the world. Now we try to get rid of them.
2328            Really, provided software IP multicast filter is organized
2329            reasonably (at least, hashed), it does not result in a slowdown
2330            comparing with route cache reject entries.
2331            Note, that multicast routers are not affected, because
2332            route cache entry is created eventually.
2333          */
2334         if (ipv4_is_multicast(daddr)) {
2335                 struct in_device *in_dev;
2336
2337                 rcu_read_lock();
2338                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2339                         int our = ip_check_mc(in_dev, daddr, saddr,
2340                                 ip_hdr(skb)->protocol);
2341                         if (our
2342 #ifdef CONFIG_IP_MROUTE
2343                                 ||
2344                             (!ipv4_is_local_multicast(daddr) &&
2345                              IN_DEV_MFORWARD(in_dev))
2346 #endif
2347                            ) {
2348                                 rcu_read_unlock();
2349                                 return ip_route_input_mc(skb, daddr, saddr,
2350                                                          tos, dev, our);
2351                         }
2352                 }
2353                 rcu_read_unlock();
2354                 return -EINVAL;
2355         }
2356         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2357 }
2358 EXPORT_SYMBOL(ip_route_input_common);
2359
/*
 * Allocate and fill in an output route cache entry for dev_out and hand
 * it back through *result.  The entry is not inserted into the hash
 * table here; ip_mkroute_output() does that with rt_intern_hash().
 *
 * @result:  receives the new rtable on success (left untouched on error)
 * @res:     FIB lookup result; may be modified: res->type is forced to
 *           RTN_BROADCAST / RTN_MULTICAST for such destinations, and
 *           res->fi is released and cleared for broadcast routes and for
 *           multicast routes matched with prefixlen < 4
 * @fl:      resolved flow (source, destination) used for rt_dst/rt_src
 * @oldflp:  flow as originally requested; becomes the cache key (rth->fl)
 * @dev_out: output device
 * @flags:   initial RTCF_* flags, extended here before being stored
 *
 * Returns 0 on success, -EINVAL for a rejected source/destination or a
 * device without an in_device, -ENOBUFS if dst_alloc() fails.
 */
2360 static int __mkroute_output(struct rtable **result,
2361                             struct fib_result *res,
2362                             const struct flowi *fl,
2363                             const struct flowi *oldflp,
2364                             struct net_device *dev_out,
2365                             unsigned flags)
2366 {
2367         struct rtable *rth;
2368         struct in_device *in_dev;
2369         u32 tos = RT_FL_TOS(oldflp);
2370         int err = 0;
2371
             /* A loopback source address is only accepted on a loopback device. */
2372         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2373                 return -EINVAL;
2374
             /* Classify the destination; lbcast/zeronet destinations are refused. */
2375         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2376                 res->type = RTN_BROADCAST;
2377         else if (ipv4_is_multicast(fl->fl4_dst))
2378                 res->type = RTN_MULTICAST;
2379         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2380                 return -EINVAL;
2381
2382         if (dev_out->flags & IFF_LOOPBACK)
2383                 flags |= RTCF_LOCAL;
2384
2385         /* get work reference to inet device */
2386         in_dev = in_dev_get(dev_out);
2387         if (!in_dev)
2388                 return -EINVAL;
2389
2390         if (res->type == RTN_BROADCAST) {
2391                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2392                 if (res->fi) {
2393                         fib_info_put(res->fi);
2394                         res->fi = NULL;
2395                 }
2396         } else if (res->type == RTN_MULTICAST) {
2397                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only if ip_check_mc() reports a match on in_dev. */
2398                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2399                                  oldflp->proto))
2400                         flags &= ~RTCF_LOCAL;
2401                 /* If multicast route do not exist use
2402                    default one, but do not gateway in this case.
2403                    Yes, it is hack.
2404                  */
2405                 if (res->fi && res->prefixlen < 4) {
2406                         fib_info_put(res->fi);
2407                         res->fi = NULL;
2408                 }
2409         }
2410
2411
2412         rth = dst_alloc(&ipv4_dst_ops);
2413         if (!rth) {
2414                 err = -ENOBUFS;
2415                 goto cleanup;
2416         }
2417
2418         atomic_set(&rth->u.dst.__refcnt, 1);
2419         rth->u.dst.flags= DST_HOST;
2420         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2421                 rth->u.dst.flags |= DST_NOXFRM;
2422         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2423                 rth->u.dst.flags |= DST_NOPOLICY;
2424
             /*
              * The cache key (rth->fl) is copied from the original request
              * oldflp, while rt_dst/rt_src come from the resolved flow fl.
              */
2425         rth->fl.fl4_dst = oldflp->fl4_dst;
2426         rth->fl.fl4_tos = tos;
2427         rth->fl.fl4_src = oldflp->fl4_src;
2428         rth->fl.oif     = oldflp->oif;
2429         rth->fl.mark    = oldflp->mark;
2430         rth->rt_dst     = fl->fl4_dst;
2431         rth->rt_src     = fl->fl4_src;
2432         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2433         /* get references to the devices that are to be hold by the routing
2434            cache entry */
2435         rth->u.dst.dev  = dev_out;
2436         dev_hold(dev_out);
2437         rth->idev       = in_dev_get(dev_out);
2438         rth->rt_gateway = fl->fl4_dst;
2439         rth->rt_spec_dst= fl->fl4_src;
2440
2441         rth->u.dst.output=ip_output;
2442         rth->u.dst.obsolete = -1;
2443         rth->rt_genid = rt_genid(dev_net(dev_out));
2444
2445         RT_CACHE_STAT_INC(out_slow_tot);
2446
             /* Local routes also get an input handler and use the destination as spec_dst. */
2447         if (flags & RTCF_LOCAL) {
2448                 rth->u.dst.input = ip_local_deliver;
2449                 rth->rt_spec_dst = fl->fl4_dst;
2450         }
             /*
              * Broadcast/multicast: spec_dst goes back to the source address,
              * and ip_mc_output replaces ip_output when the route is also
              * local on a non-loopback device.
              */
2451         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2452                 rth->rt_spec_dst = fl->fl4_src;
2453                 if (flags & RTCF_LOCAL &&
2454                     !(dev_out->flags & IFF_LOOPBACK)) {
2455                         rth->u.dst.output = ip_mc_output;
2456                         RT_CACHE_STAT_INC(out_slow_mc);
2457                 }
2458 #ifdef CONFIG_IP_MROUTE
2459                 if (res->type == RTN_MULTICAST) {
2460                         if (IN_DEV_MFORWARD(in_dev) &&
2461                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2462                                 rth->u.dst.input = ip_mr_input;
2463                                 rth->u.dst.output = ip_mc_output;
2464                         }
2465                 }
2466 #endif
2467         }
2468
2469         rt_set_nexthop(rth, res, 0);
2470
2471         rth->rt_flags = flags;
2472
2473         *result = rth;
             /* Reached on success as well as on dst_alloc() failure. */
2474  cleanup:
2475         /* release work reference to inet device */
2476         in_dev_put(in_dev);
2477
2478         return err;
2479 }
2480
/*
 * Build an output route with __mkroute_output() and, if that succeeds,
 * insert it into the route cache with rt_intern_hash().
 *
 * The hash is computed from the originally requested flow (oldflp: dst,
 * src, oif) and the current rt_genid of dev_out's network namespace.
 *
 * Returns the error from __mkroute_output() if it failed, otherwise the
 * result of rt_intern_hash(), which is also given rp to fill in.
 */
2481 static int ip_mkroute_output(struct rtable **rp,
2482                              struct fib_result *res,
2483                              const struct flowi *fl,
2484                              const struct flowi *oldflp,
2485                              struct net_device *dev_out,
2486                              unsigned flags)
2487 {
2488         struct rtable *rth = NULL;
2489         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2490         unsigned hash;
2491         if (err == 0) {
2492                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2493                                rt_genid(dev_net(dev_out)));
2494                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2495         }
2496
2497         return err;
2498 }
2499
2500 /*
2501  * Major route resolver routine.
      *
      * Resolves an output route for the request in oldflp when the cache
      * lookup in __ip_route_output_key() did not produce one.  A private
      * copy of the flow (fl) is built from oldflp and refined step by step
      * (source address, output interface) before the route is created by
      * ip_mkroute_output(), which also fills in *rp.
      *
      * Every path that reaches make_route holds one reference on dev_out;
      * it is dropped again after ip_mkroute_output() returns.  If
      * fib_lookup() succeeded, res is released with fib_res_put() there too.
      *
      * Returns 0 on success or a negative errno: -EINVAL for an unusable
      * source address, -ENODEV when oif does not name a usable device,
      * -ENETUNREACH when fib_lookup() fails and no oif was given, or the
      * error returned by ip_mkroute_output().
2502  */
2503
2504 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2505                                 const struct flowi *oldflp)
2506 {
2507         u32 tos = RT_FL_TOS(oldflp);
2508         struct flowi fl = { .nl_u = { .ip4_u =
2509                                       { .daddr = oldflp->fl4_dst,
2510                                         .saddr = oldflp->fl4_src,
2511                                         .tos = tos & IPTOS_RT_MASK,
2512                                         .scope = ((tos & RTO_ONLINK) ?
2513                                                   RT_SCOPE_LINK :
2514                                                   RT_SCOPE_UNIVERSE),
2515                                       } },
2516                             .mark = oldflp->mark,
2517                             .iif = net->loopback_dev->ifindex,
2518                             .oif = oldflp->oif };
2519         struct fib_result res;
2520         unsigned flags = 0;
2521         struct net_device *dev_out = NULL;
2522         int free_res = 0;
2523         int err;
2524
2525
2526         res.fi          = NULL;
2527 #ifdef CONFIG_IP_MULTIPLE_TABLES
2528         res.r           = NULL;
2529 #endif
2530
             /* Step 1: validate a caller-supplied source address. */
2531         if (oldflp->fl4_src) {
2532                 err = -EINVAL;
2533                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2534                     ipv4_is_lbcast(oldflp->fl4_src) ||
2535                     ipv4_is_zeronet(oldflp->fl4_src))
2536                         goto out;
2537
2538                 /* I removed check for oif == dev_out->oif here.
2539                    It was wrong for two reasons:
2540                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2541                       is assigned to multiple interfaces.
2542                    2. Moreover, we are allowed to send packets with saddr
2543                       of another iface. --ANK
2544                  */
2545
2546                 if (oldflp->oif == 0 &&
2547                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2548                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2549                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2550                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2551                         if (dev_out == NULL)
2552                                 goto out;
2553
2554                         /* Special hack: user can direct multicasts
2555                            and limited broadcast via necessary interface
2556                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2557                            This hack is not just for fun, it allows
2558                            vic,vat and friends to work.
2559                            They bind socket to loopback, set ttl to zero
2560                            and expect that it will work.
2561                            From the viewpoint of routing cache they are broken,
2562                            because we are not allowed to build multicast path
2563                            with loopback source addr (look, routing cache
2564                            cannot know, that ttl is zero, so that packet
2565                            will not leave this host and route is valid).
2566                            Luckily, this hack is good workaround.
2567                          */
2568
2569                         fl.oif = dev_out->ifindex;
2570                         goto make_route;
2571                 }
2572
                     /*
                      * Unless FLOWI_FLAG_ANYSRC is set, the source address must be
                      * known to ip_dev_find(); the device found is only used as an
                      * existence check and is released again immediately.
                      */
2573                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2574                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2575                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2576                         if (dev_out == NULL)
2577                                 goto out;
2578                         dev_put(dev_out);
2579                         dev_out = NULL;
2580                 }
2581         }
2582
2583
             /* Step 2: the caller pinned an output interface. */
2584         if (oldflp->oif) {
2585                 dev_out = dev_get_by_index(net, oldflp->oif);
2586                 err = -ENODEV;
2587                 if (dev_out == NULL)
2588                         goto out;
2589
2590                 /* RACE: Check return value of inet_select_addr instead. */
2591                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2592                         dev_put(dev_out);
2593                         goto out;       /* Wrong error code */
2594                 }
2595
                     /*
                      * Local multicast and limited broadcast skip the FIB lookup;
                      * only a missing source address is filled in from the device.
                      */
2596                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2597                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2598                         if (!fl.fl4_src)
2599                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2600                                                               RT_SCOPE_LINK);
2601                         goto make_route;
2602                 }
2603                 if (!fl.fl4_src) {
2604                         if (ipv4_is_multicast(oldflp->fl4_dst))
2605                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2606                                                               fl.fl4_scope);
2607                         else if (!oldflp->fl4_dst)
2608                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2609                                                               RT_SCOPE_HOST);
2610                 }
2611         }
2612
             /*
              * Step 3: no destination.  Route to the source address (or to
              * INADDR_LOOPBACK if that is unset as well) via the loopback
              * device, replacing any device reference taken above.
              */
2613         if (!fl.fl4_dst) {
2614                 fl.fl4_dst = fl.fl4_src;
2615                 if (!fl.fl4_dst)
2616                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2617                 if (dev_out)
2618                         dev_put(dev_out);
2619                 dev_out = net->loopback_dev;
2620                 dev_hold(dev_out);
2621                 fl.oif = net->loopback_dev->ifindex;
2622                 res.type = RTN_LOCAL;
2623                 flags |= RTCF_LOCAL;
2624                 goto make_route;
2625         }
2626
             /* Step 4: consult the FIB. */
2627         if (fib_lookup(net, &fl, &res)) {
2628                 res.fi = NULL;
2629                 if (oldflp->oif) {
2630                         /* Apparently, routing tables are wrong. Assume,
2631                            that the destination is on link.
2632
2633                            WHY? DW.
2634                            Because we are allowed to send to iface
2635                            even if it has NO routes and NO assigned
2636                            addresses. When oif is specified, routing
2637                            tables are looked up with only one purpose:
2638                            to catch if destination is gatewayed, rather than
2639                            direct. Moreover, if MSG_DONTROUTE is set,
2640                            we send packet, ignoring both routing tables
2641                            and ifaddr state. --ANK
2642
2643
2644                            We could make it even if oif is unknown,
2645                            likely IPv6, but we do not.
2646                          */
2647
2648                         if (fl.fl4_src == 0)
2649                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2650                                                               RT_SCOPE_LINK);
2651                         res.type = RTN_UNICAST;
2652                         goto make_route;
2653                 }
2654                 if (dev_out)
2655                         dev_put(dev_out);
2656                 err = -ENETUNREACH;
2657                 goto out;
2658         }
             /* fib_lookup() succeeded: res must be released with fib_res_put() below. */
2659         free_res = 1;
2660
             /* Destination is local: send via loopback and drop the fib_info. */
2661         if (res.type == RTN_LOCAL) {
2662                 if (!fl.fl4_src)
2663                         fl.fl4_src = fl.fl4_dst;
2664                 if (dev_out)
2665                         dev_put(dev_out);
2666                 dev_out = net->loopback_dev;
2667                 dev_hold(dev_out);
2668                 fl.oif = dev_out->ifindex;
2669                 if (res.fi)
2670                         fib_info_put(res.fi);
2671                 res.fi = NULL;
2672                 flags |= RTCF_LOCAL;
2673                 goto make_route;
2674         }
2675
             /*
              * With no oif pinned, let the FIB choose among several next hops
              * (multipath) or among default routes (prefixlen 0, unicast).
              */
2676 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2677         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2678                 fib_select_multipath(&fl, &res);
2679         else
2680 #endif
2681         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2682                 fib_select_default(net, &fl, &res);
2683
2684         if (!fl.fl4_src)
2685                 fl.fl4_src = FIB_RES_PREFSRC(res);
2686
             /* Switch dev_out to the device of the selected FIB result. */
2687         if (dev_out)
2688                 dev_put(dev_out);
2689         dev_out = FIB_RES_DEV(res);
2690         dev_hold(dev_out);
2691         fl.oif = dev_out->ifindex;
2692
2693
             /* Common tail: create the route, then drop res and the dev_out reference. */
2694 make_route:
2695         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2696
2697
2698         if (free_res)
2699                 fib_res_put(&res);
2700         if (dev_out)
2701                 dev_put(dev_out);
2702 out:    return err;
2703 }
2704
/*
 * Look up an output route for flp, trying the route cache first.
 *
 * When rt_caching(net) is true, the hash chain selected by
 * rt_hash(dst, src, oif, genid) is walked under rcu_read_lock_bh().  An
 * entry matches when destination, source, oif and mark are equal, its
 * fl.iif is 0, the TOS bits covered by IPTOS_RT_MASK | RTO_ONLINK agree,
 * its device belongs to net and it is not expired.  On a hit the entry is
 * marked used with dst_use(), stored in *rp and 0 is returned.
 *
 * Otherwise (caching disabled or no match) the request is handed to
 * ip_route_output_slow() and its result is returned.
 */
2705 int __ip_route_output_key(struct net *net, struct rtable **rp,
2706                           const struct flowi *flp)
2707 {
2708         unsigned hash;
2709         struct rtable *rth;
2710
2711         if (!rt_caching(net))
2712                 goto slow_output;
2713
2714         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2715
2716         rcu_read_lock_bh();
2717         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2718                 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
                     /* NOTE(review): fl.iif == 0 presumably restricts the match to output routes - confirm. */
2719                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2720                     rth->fl.fl4_src == flp->fl4_src &&
2721                     rth->fl.iif == 0 &&
2722                     rth->fl.oif == flp->oif &&
2723                     rth->fl.mark == flp->mark &&
2724                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2725                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2726                     net_eq(dev_net(rth->u.dst.dev), net) &&
2727                     !rt_is_expired(rth)) {
2728                         dst_use(&rth->u.dst, jiffies);
2729                         RT_CACHE_STAT_INC(out_hit);
2730                         rcu_read_unlock_bh();
2731                         *rp = rth;
2732                         return 0;
2733                 }
                     /* Counts every chain entry examined without a match. */
2734                 RT_CACHE_STAT_INC(out_hlist_search);
2735         }
2736         rcu_read_unlock_bh();
2737
2738 slow_output:
2739         return ip_route_output_slow(net, rp, flp);
2740 }
2741
2742 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2743
/*
 * No-op .update_pmtu callback installed in ipv4_dst_blackhole_ops: both
 * arguments are ignored and nothing is changed.
 */
2744 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2745 {
2746 }
2747
/*
 * dst_ops used for the entries allocated by ipv4_dst_blackhole().  It
 * reuses ipv4_dst_destroy and ipv4_dst_check, and installs the no-op
 * ipv4_rt_blackhole_update_pmtu as .update_pmtu.
 */
2748 static struct dst_ops ipv4_dst_blackhole_ops = {
2749         .family                 =       AF_INET,
2750         .protocol               =       cpu_to_be16(ETH_P_IP),
2751         .destroy                =       ipv4_dst_destroy,
2752         .check                  =       ipv4_dst_check,
2753         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2754         .entries                =       ATOMIC_INIT(0),
2755 };
2756
2757
/*
 * Replace the route in *rp with a "blackhole" copy whose input and output
 * handlers are both dst_discard.
 *
 * The copy is allocated from ipv4_dst_blackhole_ops and inherits the
 * metrics, device, in_device, flow key, addresses, flags and inet peer of
 * the original, taking its own reference on the device, the in_device
 * and the peer where present.  The reference the caller held on the
 * original route is dropped with dst_release() whether or not the
 * allocation succeeded.
 *
 * @net: namespace whose rt_genid is stamped on the copy
 * @rp:  in: original route; out: the copy, or NULL on allocation failure
 * @flp: not used by this function
 *
 * Returns 0 on success, -ENOMEM if dst_alloc() failed.
 */
2758 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2759 {
2760         struct rtable *ort = *rp;
2761         struct rtable *rt = (struct rtable *)
2762                 dst_alloc(&ipv4_dst_blackhole_ops);
2763
2764         if (rt) {
2765                 struct dst_entry *new = &rt->u.dst;
2766
2767                 atomic_set(&new->__refcnt, 1);
2768                 new->__use = 1;
2769                 new->input = dst_discard;
2770                 new->output = dst_discard;
2771                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2772
2773                 new->dev = ort->u.dst.dev;
2774                 if (new->dev)
2775                         dev_hold(new->dev);
2776
2777                 rt->fl = ort->fl;
2778
2779                 rt->idev = ort->idev;
2780                 if (rt->idev)
2781                         in_dev_hold(rt->idev);
2782                 rt->rt_genid = rt_genid(net);
2783                 rt->rt_flags = ort->rt_flags;
2784                 rt->rt_type = ort->rt_type;
2785                 rt->rt_dst = ort->rt_dst;
2786                 rt->rt_src = ort->rt_src;
2787                 rt->rt_iif = ort->rt_iif;
2788                 rt->rt_gateway = ort->rt_gateway;
2789                 rt->rt_spec_dst = ort->rt_spec_dst;
2790                 rt->peer = ort->peer;
2791                 if (rt->peer)
2792                         atomic_inc(&rt->peer->refcnt);
2793
                     /*
                      * NOTE(review): dst_free() is called on an entry that is
                      * handed back to the caller with __refcnt == 1; presumably
                      * this only queues it for freeing once that reference is
                      * gone - confirm against dst_free().
                      */
2794                 dst_free(new);
2795         }
2796
2797         dst_release(&(*rp)->u.dst);
2798         *rp = rt;
2799         return (rt ? 0 : -ENOMEM);
2800 }
2801
/*
 * Resolve an output route for flp and, when flp->proto is set, pass the
 * result through __xfrm_lookup().
 *
 * @net:   network namespace
 * @rp:    receives the route
 * @flp:   flow to resolve; when flp->proto is non-zero, an unset fl4_src
 *         or fl4_dst is filled in from the route found before the xfrm
 *         lookup
 * @sk:    socket forwarded to __xfrm_lookup() (may be NULL, see
 *         ip_route_output_key())
 * @flags: non-zero requests XFRM_LOOKUP_WAIT for the xfrm lookup
 *
 * If __xfrm_lookup() returns -EREMOTE, the route is replaced by a
 * blackhole route via ipv4_dst_blackhole().
 *
 * Returns 0 on success or a negative errno from __ip_route_output_key(),
 * __xfrm_lookup() or ipv4_dst_blackhole().
 */
2802 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2803                          struct sock *sk, int flags)
2804 {
2805         int err;
2806
2807         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2808                 return err;
2809
2810         if (flp->proto) {
2811                 if (!flp->fl4_src)
2812                         flp->fl4_src = (*rp)->rt_src;
2813                 if (!flp->fl4_dst)
2814                         flp->fl4_dst = (*rp)->rt_dst;
2815                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2816                                     flags ? XFRM_LOOKUP_WAIT : 0);
2817                 if (err == -EREMOTE)
2818                         err = ipv4_dst_blackhole(net, rp, flp);
2819
2820                 return err;
2821         }
2822
2823         return 0;
2824 }
2825
2826 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2827
/*
 * Convenience wrapper: resolve an output route for flp by calling
 * ip_route_output_flow() with no socket (NULL) and flags 0.  Returns
 * whatever ip_route_output_flow() returns.
 */
2828 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2829 {
2830         return ip_route_output_flow(net, rp, flp, NULL, 0);
2831 }
2832
/*
 * Append one netlink route message describing the route attached to skb
 * (skb_rtable(skb)) to that same skb.
 *
 * @net:    namespace, consulted for the MC_FORWARDING setting
 * @skb:    message buffer; must already carry the route as its dst
 * @pid:    netlink pid placed in the message header
 * @seq:    netlink sequence number placed in the message header
 * @event:  message type (callers in this file pass RTM_NEWROUTE)
 * @nowait: forwarded to ipmr_get_route(); also selects how its errors are
 *          handled (see below)
 * @flags:  netlink message flags (e.g. NLM_F_MULTI for dumps)
 *
 * Returns the result of nlmsg_end() on success, or -EMSGSIZE after
 * cancelling the partially built message.  With CONFIG_IP_MROUTE it can
 * also return 0 when ipmr_get_route() returns 0 and nowait is not set.
 *
 * NOTE(review): the NLA_PUT*() macros are defined elsewhere; they are
 * presumably what jumps to nla_put_failure when an attribute does not
 * fit - confirm.
 */
2833 static int rt_fill_info(struct net *net,
2834                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2835                         int nowait, unsigned int flags)
2836 {
2837         struct rtable *rt = skb_rtable(skb);
2838         struct rtmsg *r;
2839         struct nlmsghdr *nlh;
2840         long expires;
2841         u32 id = 0, ts = 0, tsage = 0, error;
2842
2843         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2844         if (nlh == NULL)
2845                 return -EMSGSIZE;
2846
2847         r = nlmsg_data(nlh);
2848         r->rtm_family    = AF_INET;
2849         r->rtm_dst_len  = 32;
2850         r->rtm_src_len  = 0;
2851         r->rtm_tos      = rt->fl.fl4_tos;
2852         r->rtm_table    = RT_TABLE_MAIN;
2853         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2854         r->rtm_type     = rt->rt_type;
2855         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2856         r->rtm_protocol = RTPROT_UNSPEC;
             /*
              * The low 16 bits of rt_flags are cleared before RTM_F_CLONED
              * (and, below, RTM_F_NOTIFY) are OR-ed in.
              */
2857         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2858         if (rt->rt_flags & RTCF_NOTIFY)
2859                 r->rtm_flags |= RTM_F_NOTIFY;
2860
2861         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2862
2863         if (rt->fl.fl4_src) {
2864                 r->rtm_src_len = 32;
2865                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2866         }
2867         if (rt->u.dst.dev)
2868                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2869 #ifdef CONFIG_NET_CLS_ROUTE
2870         if (rt->u.dst.tclassid)
2871                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2872 #endif
             /*
              * Preferred source: rt_spec_dst when the key has an input
              * interface, otherwise rt_src if it differs from the keyed source.
              */
2873         if (rt->fl.iif)
2874                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2875         else if (rt->rt_src != rt->fl.fl4_src)
2876                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2877
2878         if (rt->rt_dst != rt->rt_gateway)
2879                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2880
2881         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2882                 goto nla_put_failure;
2883
             /* Values for the cacheinfo attribute written at the end. */
2884         error = rt->u.dst.error;
2885         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2886         if (rt->peer) {
2887                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2888                 if (rt->peer->tcp_ts_stamp) {
2889                         ts = rt->peer->tcp_ts;
2890                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2891                 }
2892         }
2893
2894         if (rt->fl.iif) {
2895 #ifdef CONFIG_IP_MROUTE
2896                 __be32 dst = rt->rt_dst;
2897
                     /*
                      * Non-local multicast destination with MC_FORWARDING enabled:
                      * ipmr_get_route() is called instead of emitting RTA_IIF.
                      * A result <= 0 ends the function when !nowait; with nowait,
                      * only -EMSGSIZE aborts and any other result is reported
                      * through the cacheinfo error field.
                      */
2898                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2899                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2900                         int err = ipmr_get_route(net, skb, r, nowait);
2901                         if (err <= 0) {
2902                                 if (!nowait) {
2903                                         if (err == 0)
2904                                                 return 0;
2905                                         goto nla_put_failure;
2906                                 } else {
2907                                         if (err == -EMSGSIZE)
2908                                                 goto nla_put_failure;
2909                                         error = err;
2910                                 }
2911                         }
2912                 } else
2913 #endif
2914                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2915         }
2916
2917         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2918                                expires, error) < 0)
2919                 goto nla_put_failure;
2920
2921         return nlmsg_end(skb, nlh);
2922
2923 nla_put_failure:
2924         nlmsg_cancel(skb, nlh);
2925         return -EMSGSIZE;
2926 }
2927
/*
 * Netlink handler answering a "get route" request.
 *
 * The request attributes RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF (all
 * optional, defaulting to 0) and rtm_tos select the lookup:
 *   - with RTA_IIF, the reply skb is run through ip_route_input() as if it
 *     had been received on that interface;
 *   - without it, an output lookup is done with ip_route_output_key().
 * The resulting route is attached to a freshly allocated skb, described
 * with rt_fill_info() as an RTM_NEWROUTE message and unicast back to the
 * requesting netlink pid.
 *
 * @in_skb: request skb (supplies the namespace and the requester's pid)
 * @nlh:    request header
 * @arg:    not used by this function
 *
 * Returns the result of rtnl_unicast() on success.  On failure the reply
 * skb (if allocated) is freed and the error is returned: the
 * nlmsg_parse() error, -ENOBUFS, -ENODEV for an unknown iif, the routing
 * error, or the rt_fill_info() result when that is <= 0.
 */
2928 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2929 {
2930         struct net *net = sock_net(in_skb->sk);
2931         struct rtmsg *rtm;
2932         struct nlattr *tb[RTA_MAX+1];
2933         struct rtable *rt = NULL;
2934         __be32 dst = 0;
2935         __be32 src = 0;
2936         u32 iif;
2937         int err;
2938         struct sk_buff *skb;
2939
2940         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2941         if (err < 0)
2942                 goto errout;
2943
2944         rtm = nlmsg_data(nlh);
2945
2946         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2947         if (skb == NULL) {
2948                 err = -ENOBUFS;
2949                 goto errout;
2950         }
2951
2952         /* Reserve room for dummy headers, this skb can pass
2953            through good chunk of routing engine.
2954          */
2955         skb_reset_mac_header(skb);
2956         skb_reset_network_header(skb);
2957
2958         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2959         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2960         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2961
2962         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2963         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2964         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2965
2966         if (iif) {
2967                 struct net_device *dev;
2968
                     /* NOTE(review): no reference is taken on dev; presumably safe because the caller holds RTNL - confirm. */
2969                 dev = __dev_get_by_index(net, iif);
2970                 if (dev == NULL) {
2971                         err = -ENODEV;
2972                         goto errout_free;
2973                 }
2974
2975                 skb->protocol   = htons(ETH_P_IP);
2976                 skb->dev        = dev;
                     /* The input lookup is performed with bottom halves disabled. */
2977                 local_bh_disable();
2978                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2979                 local_bh_enable();
2980
                     /* A route carrying a dst error is reported as that error (negated). */
2981                 rt = skb_rtable(skb);
2982                 if (err == 0 && rt->u.dst.error)
2983                         err = -rt->u.dst.error;
2984         } else {
2985                 struct flowi fl = {
2986                         .nl_u = {
2987                                 .ip4_u = {
2988                                         .daddr = dst,
2989                                         .saddr = src,
2990                                         .tos = rtm->rtm_tos,
2991                                 },
2992                         },
2993                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2994                 };
2995                 err = ip_route_output_key(net, &rt, &fl);
2996         }
2997
2998         if (err)
2999                 goto errout_free;
3000
3001         skb_dst_set(skb, &rt->u.dst);
3002         if (rtm->rtm_flags & RTM_F_NOTIFY)
3003                 rt->rt_flags |= RTCF_NOTIFY;
3004
3005         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3006                            RTM_NEWROUTE, 0, 0);
3007         if (err <= 0)
3008                 goto errout_free;
3009
3010         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3011 errout:
3012         return err;
3013
3014 errout_free:
3015         kfree_skb(skb);
3016         goto errout;
3017 }
3018
/*
 * Netlink dump callback: append one RTM_NEWROUTE message (NLM_F_MULTI) to
 * skb for every route cache entry that belongs to the socket's namespace
 * and is not expired.
 *
 * The resume position is kept in cb: args[0] is the hash bucket and
 * args[1] the index within that bucket's chain.  Entries with an index
 * below the saved one are skipped in the first bucket visited; for later
 * buckets the start index is reset to 0.  The walk stops as soon as
 * rt_fill_info() returns <= 0, and the position reached is stored back
 * into cb->args before returning.
 *
 * Each chain is traversed under rcu_read_lock_bh().  The route is
 * attached to skb with skb_dst_set_noref() only for the duration of the
 * rt_fill_info() call, which reads it through skb_rtable(), and is
 * dropped again afterwards.
 *
 * Returns skb->len.
 */
3019 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3020 {
3021         struct rtable *rt;
3022         int h, s_h;
3023         int idx, s_idx;
3024         struct net *net;
3025
3026         net = sock_net(skb->sk);
3027
3028         s_h = cb->args[0];
3029         if (s_h < 0)
3030                 s_h = 0;
3031         s_idx = idx = cb->args[1];
3032         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
                     /* Empty bucket: skip it without entering the RCU read section. */
3033                 if (!rt_hash_table[h].chain)
3034                         continue;
3035                 rcu_read_lock_bh();
3036                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3037                      rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3038                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3039                                 continue;
3040                         if (rt_is_expired(rt))
3041                                 continue;
3042                         skb_dst_set_noref(skb, &rt->u.dst);
3043                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3044                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3045                                          1, NLM_F_MULTI) <= 0) {
3046                                 skb_dst_drop(skb);
3047                                 rcu_read_unlock_bh();
3048                                 goto done;
3049                         }
3050                         skb_dst_drop(skb);
3051                 }
3052                 rcu_read_unlock_bh();
3053         }
3054
3055 done:
             /* Remember where to resume on the next invocation. */
3056         cb->args[0] = h;
3057         cb->args[1] = idx;
3058         return skb->len;
3059 }
3060
/*
 * Multicast event hook for in_dev: calls rt_cache_flush() on the network
 * namespace of in_dev's device with a delay argument of 0.
 */
3061 void ip_rt_multicast_event(struct in_device *in_dev)
3062 {
3063         rt_cache_flush(dev_net(in_dev->dev), 0);
3064 }
3065
3066 #ifdef CONFIG_SYSCTL
/*
 * proc handler for the write-only "flush" sysctl.
 *
 * On a write, the value written is parsed as an int with proc_dointvec()
 * and passed as the delay argument to rt_cache_flush() for the namespace
 * stored in __ctl->extra1.  The ctl_table is copied to the stack so that
 * .data can point at the local flush_delay without modifying *__ctl.
 *
 * The result of proc_dointvec() is not checked, so a write always
 * triggers a flush and returns 0.  flush_delay is therefore initialised
 * to 0 (the same delay ip_rt_multicast_event() uses): if parsing stores
 * nothing, rt_cache_flush() gets a defined value instead of an
 * indeterminate stack value.
 *
 * Reads are rejected with -EINVAL.
 */
3067 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3068                                         void __user *buffer,
3069                                         size_t *lenp, loff_t *ppos)
3070 {
3071         if (write) {
                     /* Defined default in case proc_dointvec() stores no value. */
3072                 int flush_delay = 0;
3073                 ctl_table ctl;
3074                 struct net *net;
3075
3076                 memcpy(&ctl, __ctl, sizeof(ctl));
3077                 ctl.data = &flush_delay;
3078                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3079
3080                 net = (struct net *)__ctl->extra1;
3081                 rt_cache_flush(net, flush_delay);
3082                 return 0;
3083         }
3084
3085         return -EINVAL;
3086 }
3087
/*
 * Route tunables, attached as the children of the "route" entry in
 * ipv4_skeleton.  Every entry is an int with mode 0644.  Entries using
 * proc_dointvec_jiffies or proc_dointvec_ms_jiffies as handler go through
 * the corresponding conversion helper; the rest use plain proc_dointvec.
 *
 * gc_min_interval and gc_min_interval_ms are two views of the same
 * variable (ip_rt_gc_min_interval) with different handlers.
 */
3088 static ctl_table ipv4_route_table[] = {
3089         {
3090                 .procname       = "gc_thresh",
3091                 .data           = &ipv4_dst_ops.gc_thresh,
3092                 .maxlen         = sizeof(int),
3093                 .mode           = 0644,
3094                 .proc_handler   = proc_dointvec,
3095         },
3096         {
3097                 .procname       = "max_size",
3098                 .data           = &ip_rt_max_size,
3099                 .maxlen         = sizeof(int),
3100                 .mode           = 0644,
3101                 .proc_handler   = proc_dointvec,
3102         },
3103         {
3104                 /*  Deprecated. Use gc_min_interval_ms */
3105
3106                 .procname       = "gc_min_interval",
3107                 .data           = &ip_rt_gc_min_interval,
3108                 .maxlen         = sizeof(int),
3109                 .mode           = 0644,
3110                 .proc_handler   = proc_dointvec_jiffies,
3111         },
3112         {
3113                 .procname       = "gc_min_interval_ms",
3114                 .data           = &ip_rt_gc_min_interval,
3115                 .maxlen         = sizeof(int),
3116                 .mode           = 0644,
3117                 .proc_handler   = proc_dointvec_ms_jiffies,
3118         },
3119         {
3120                 .procname       = "gc_timeout",
3121                 .data           = &ip_rt_gc_timeout,
3122                 .maxlen         = sizeof(int),
3123                 .mode           = 0644,
3124                 .proc_handler   = proc_dointvec_jiffies,
3125         },
3126         {
3127                 .procname       = "gc_interval",
3128                 .data           = &ip_rt_gc_interval,
3129                 .maxlen         = sizeof(int),
3130                 .mode           = 0644,
3131                 .proc_handler   = proc_dointvec_jiffies,
3132         },
3133         {
3134                 .procname       = "redirect_load",
3135                 .data           = &ip_rt_redirect_load,
3136                 .maxlen         = sizeof(int),
3137                 .mode           = 0644,
3138                 .proc_handler   = proc_dointvec,
3139         },
3140         {
3141                 .procname       = "redirect_number",
3142                 .data           = &ip_rt_redirect_number,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec,
3146         },
3147         {
3148                 .procname       = "redirect_silence",
3149                 .data           = &ip_rt_redirect_silence,
3150                 .maxlen         = sizeof(int),
3151                 .mode           = 0644,
3152                 .proc_handler   = proc_dointvec,
3153         },
3154         {
3155                 .procname       = "error_cost",
3156                 .data           = &ip_rt_error_cost,
3157                 .maxlen         = sizeof(int),
3158                 .mode           = 0644,
3159                 .proc_handler   = proc_dointvec,
3160         },
3161         {
3162                 .procname       = "error_burst",
3163                 .data           = &ip_rt_error_burst,
3164                 .maxlen         = sizeof(int),
3165                 .mode           = 0644,
3166                 .proc_handler   = proc_dointvec,
3167         },
3168         {
3169                 .procname       = "gc_elasticity",
3170                 .data           = &ip_rt_gc_elasticity,
3171                 .maxlen         = sizeof(int),
3172                 .mode           = 0644,
3173                 .proc_handler   = proc_dointvec,
3174         },
3175         {
3176                 .procname       = "mtu_expires",
3177                 .data           = &ip_rt_mtu_expires,
3178                 .maxlen         = sizeof(int),
3179                 .mode           = 0644,
3180                 .proc_handler   = proc_dointvec_jiffies,
3181         },
3182         {
3183                 .procname       = "min_pmtu",
3184                 .data           = &ip_rt_min_pmtu,
3185                 .maxlen         = sizeof(int),
3186                 .mode           = 0644,
3187                 .proc_handler   = proc_dointvec,
3188         },
3189         {
3190                 .procname       = "min_adv_mss",
3191                 .data           = &ip_rt_min_advmss,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec,
3195         },
             /* Empty entry terminating the table. */
3196         { }
3197 };
3198
3199 static struct ctl_table empty[1];
3200
3201 static struct ctl_table ipv4_skeleton[] =
3202 {
3203         { .procname = "route",
3204           .mode = 0555, .child = ipv4_route_table},
3205         { .procname = "neigh",
3206           .mode = 0555, .child = empty},
3207         { }
3208 };
3209
3210 static __net_initdata struct ctl_path ipv4_path[] = {
3211         { .procname = "net", },
3212         { .procname = "ipv4", },
3213         { },
3214 };
3215
3216 static struct ctl_table ipv4_route_flush_table[] = {
3217         {
3218                 .procname       = "flush",
3219                 .maxlen         = sizeof(int),
3220                 .mode           = 0200,
3221                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3222         },
3223         { },
3224 };
3225
3226 static __net_initdata struct ctl_path ipv4_route_path[] = {
3227         { .procname = "net", },
3228         { .procname = "ipv4", },
3229         { .procname = "route", },
3230         { },
3231 };
3232
3233 static __net_init int sysctl_route_net_init(struct net *net)
3234 {
3235         struct ctl_table *tbl;
3236
3237         tbl = ipv4_route_flush_table;
3238         if (!net_eq(net, &init_net)) {
3239                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3240                 if (tbl == NULL)
3241                         goto err_dup;
3242         }
3243         tbl[0].extra1 = net;
3244
3245         net->ipv4.route_hdr =
3246                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3247         if (net->ipv4.route_hdr == NULL)
3248                 goto err_reg;
3249         return 0;
3250
3251 err_reg:
3252         if (tbl != ipv4_route_flush_table)
3253                 kfree(tbl);
3254 err_dup:
3255         return -ENOMEM;
3256 }
3257
3258 static __net_exit void sysctl_route_net_exit(struct net *net)
3259 {
3260         struct ctl_table *tbl;
3261
3262         tbl = net->ipv4.route_hdr->ctl_table_arg;
3263         unregister_net_sysctl_table(net->ipv4.route_hdr);
3264         BUG_ON(tbl == ipv4_route_flush_table);
3265         kfree(tbl);
3266 }
3267
3268 static __net_initdata struct pernet_operations sysctl_route_ops = {
3269         .init = sysctl_route_net_init,
3270         .exit = sysctl_route_net_exit,
3271 };
3272 #endif
3273
3274 static __net_init int rt_genid_init(struct net *net)
3275 {
3276         get_random_bytes(&net->ipv4.rt_genid,
3277                          sizeof(net->ipv4.rt_genid));
3278         return 0;
3279 }
3280
3281 static __net_initdata struct pernet_operations rt_genid_ops = {
3282         .init = rt_genid_init,
3283 };
3284
3285
#if defined(CONFIG_NET_CLS_ROUTE)
/*
 * Per-CPU accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3289
3290 static __initdata unsigned long rhash_entries;
3291 static int __init set_rhash_entries(char *str)
3292 {
3293         if (!str)
3294                 return 0;
3295         rhash_entries = simple_strtoul(str, &str, 0);
3296         return 1;
3297 }
3298 __setup("rhash_entries=", set_rhash_entries);
3299
3300 int __init ip_rt_init(void)
3301 {
3302         int rc = 0;
3303
3304 #ifdef CONFIG_NET_CLS_ROUTE
3305         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3306         if (!ip_rt_acct)
3307                 panic("IP: failed to allocate ip_rt_acct\n");
3308 #endif
3309
3310         ipv4_dst_ops.kmem_cachep =
3311                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3312                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3313
3314         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3315
3316         rt_hash_table = (struct rt_hash_bucket *)
3317                 alloc_large_system_hash("IP route cache",
3318                                         sizeof(struct rt_hash_bucket),
3319                                         rhash_entries,
3320                                         (totalram_pages >= 128 * 1024) ?
3321                                         15 : 17,
3322                                         0,
3323                                         &rt_hash_log,
3324                                         &rt_hash_mask,
3325                                         rhash_entries ? 0 : 512 * 1024);
3326         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3327         rt_hash_lock_init();
3328
3329         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3330         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3331
3332         devinet_init();
3333         ip_fib_init();
3334
3335         /* All the timers, started at system startup tend
3336            to synchronize. Perturb it a bit.
3337          */
3338         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3339         expires_ljiffies = jiffies;
3340         schedule_delayed_work(&expires_work,
3341                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3342
3343         if (ip_rt_proc_init())
3344                 printk(KERN_ERR "Unable to create route proc files\n");
3345 #ifdef CONFIG_XFRM
3346         xfrm_init();
3347         xfrm4_init(ip_rt_max_size);
3348 #endif
3349         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3350
3351 #ifdef CONFIG_SYSCTL
3352         register_pernet_subsys(&sysctl_route_ops);
3353 #endif
3354         register_pernet_subsys(&rt_genid_ops);
3355         return rc;
3356 }
3357
3358 #ifdef CONFIG_SYSCTL
3359 /*
3360  * We really need to sanitize the damn ipv4 init order, then all
3361  * this nonsense will go away.
3362  */
3363 void __init ip_static_sysctl_init(void)
3364 {
3365         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3366 }
3367 #endif
3368
3369 EXPORT_SYMBOL(__ip_select_ident);
3370 EXPORT_SYMBOL(ip_route_output_key);