net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 #define IP_MAX_MTU 0xFFF0
116 #define RT_GC_TIMEOUT (300*HZ)
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly = 8;
128 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly = 256;
131 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly = 20;
134 static void rt_worker_func(struct work_struct *work);
135 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
138 * Interface to generic destination cache.
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void ipv4_dst_destroy(struct dst_entry *dst);
143 static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
152 static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
154 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out,
163 .entries = ATOMIC_INIT(0),
166 #define ECN_OR_COST(class) TC_PRIO_##class
168 const __u8 ip_tos2prio[16] = {
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
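/*
 * Illustrative sketch (not part of the original source): assuming the
 * rt_tos2priority() helper from include/net/route.h, a packet with
 * TOS 0x10 (IPTOS_LOWDELAY) is mapped as
 *	ip_tos2prio[(0x10 & IPTOS_TOS_MASK) >> 1] == ip_tos2prio[8]
 * which is TC_PRIO_INTERACTIVE.
 */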
189 * Route cache.
192 /* The locking scheme is rather straightforward:
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
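/*
 * A minimal sketch of the two access patterns described above; the
 * identifiers mirror the real code further down in this file:
 *
 *	// reader: lockless RCU walk of one hash chain
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		...;
 *	rcu_read_unlock();
 *
 *	// writer: unlink under the per-bucket lock, free via RCU
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */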
202 struct rt_hash_bucket {
203 struct rtable *chain;
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ 256
215 #else
216 # if NR_CPUS >= 32
217 # define RT_HASH_LOCK_SZ 4096
218 # elif NR_CPUS >= 16
219 # define RT_HASH_LOCK_SZ 2048
220 # elif NR_CPUS >= 8
221 # define RT_HASH_LOCK_SZ 1024
222 # elif NR_CPUS >= 4
223 # define RT_HASH_LOCK_SZ 512
224 # else
225 # define RT_HASH_LOCK_SZ 256
226 # endif
227 #endif
229 static spinlock_t *rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
232 static __init void rt_hash_lock_init(void)
234 int i;
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 GFP_KERNEL);
238 if (!rt_hash_locks)
239 panic("IP: failed to allocate rt_hash_locks\n");
241 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 spin_lock_init(&rt_hash_locks[i]);
244 #else
245 # define rt_hash_lock_addr(slot) NULL
247 static inline void rt_hash_lock_init(void)
250 #endif
252 static struct rt_hash_bucket *rt_hash_table __read_mostly;
253 static unsigned rt_hash_mask __read_mostly;
254 static unsigned int rt_hash_log __read_mostly;
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258 (__raw_get_cpu_var(rt_cache_stat).field++)
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
263 return jhash_3words((__force u32)(__be32)(daddr),
264 (__force u32)(__be32)(saddr),
265 idx, genid)
266 & rt_hash_mask;
269 static inline int rt_genid(struct net *net)
271 return atomic_read(&net->ipv4.rt_genid);
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276 struct seq_net_private p;
277 int bucket;
278 int genid;
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
283 struct rt_cache_iter_state *st = seq->private;
284 struct rtable *r = NULL;
286 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 if (!rt_hash_table[st->bucket].chain)
288 continue;
289 rcu_read_lock_bh();
290 r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid)
294 return r;
295 r = rcu_dereference(r->u.dst.rt_next);
297 rcu_read_unlock_bh();
299 return r;
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303 struct rtable *r)
305 struct rt_cache_iter_state *st = seq->private;
307 r = r->u.dst.rt_next;
308 while (!r) {
309 rcu_read_unlock_bh();
310 do {
311 if (--st->bucket < 0)
312 return NULL;
313 } while (!rt_hash_table[st->bucket].chain);
314 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain;
317 return rcu_dereference(r);
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321 struct rtable *r)
323 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326 continue;
327 if (r->rt_genid == st->genid)
328 break;
330 return r;
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
335 struct rtable *r = rt_cache_get_first(seq);
337 if (r)
338 while (pos && (r = rt_cache_get_next(seq, r)))
339 --pos;
340 return pos ? NULL : r;
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
345 struct rt_cache_iter_state *st = seq->private;
346 if (*pos)
347 return rt_cache_get_idx(seq, *pos - 1);
348 st->genid = rt_genid(seq_file_net(seq));
349 return SEQ_START_TOKEN;
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
354 struct rtable *r;
356 if (v == SEQ_START_TOKEN)
357 r = rt_cache_get_first(seq);
358 else
359 r = rt_cache_get_next(seq, v);
360 ++*pos;
361 return r;
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
366 if (v && v != SEQ_START_TOKEN)
367 rcu_read_unlock_bh();
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
372 if (v == SEQ_START_TOKEN)
373 seq_printf(seq, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 "HHUptod\tSpecDst");
377 else {
378 struct rtable *r = v;
379 int len;
381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r->u.dst.dev ? r->u.dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len);
398 seq_printf(seq, "%*s\n", 127 - len, "");
400 return 0;
403 static const struct seq_operations rt_cache_seq_ops = {
404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
412 return seq_open_net(inode, file, &rt_cache_seq_ops,
413 sizeof(struct rt_cache_iter_state));
416 static const struct file_operations rt_cache_seq_fops = {
417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
421 .release = seq_release_net,
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
427 int cpu;
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
436 return &per_cpu(rt_cache_stat, cpu);
438 return NULL;
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
443 int cpu;
445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
449 return &per_cpu(rt_cache_stat, cpu);
451 return NULL;
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
462 struct rt_cache_stat *st = v;
464 if (v == SEQ_START_TOKEN) {
465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466 return 0;
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
480 st->out_hit,
481 st->out_slow_tot,
482 st->out_slow_mc,
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
491 return 0;
494 static const struct seq_operations rt_cpu_seq_ops = {
495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
504 return seq_open(file, &rt_cpu_seq_ops);
507 static const struct file_operations rt_cpu_seq_fops = {
508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
517 int length, int *eof, void *data)
519 unsigned int i;
521 if ((offset & 3) || (length & 3))
522 return -EIO;
524 if (offset >= sizeof(struct ip_rt_acct) * 256) {
525 *eof = 1;
526 return 0;
529 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
530 length = sizeof(struct ip_rt_acct) * 256 - offset;
531 *eof = 1;
534 offset /= sizeof(u32);
536 if (length > 0) {
537 u32 *dst = (u32 *) buffer;
539 *start = buffer;
540 memset(dst, 0, length);
542 for_each_possible_cpu(i) {
543 unsigned int j;
544 u32 *src;
546 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
547 for (j = 0; j < length/4; j++)
548 dst[j] += src[j];
551 return length;
553 #endif
555 static int __net_init ip_rt_do_proc_init(struct net *net)
557 struct proc_dir_entry *pde;
559 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
560 &rt_cache_seq_fops);
561 if (!pde)
562 goto err1;
564 pde = proc_create("rt_cache", S_IRUGO,
565 net->proc_net_stat, &rt_cpu_seq_fops);
566 if (!pde)
567 goto err2;
569 #ifdef CONFIG_NET_CLS_ROUTE
570 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
571 ip_rt_acct_read, NULL);
572 if (!pde)
573 goto err3;
574 #endif
575 return 0;
577 #ifdef CONFIG_NET_CLS_ROUTE
578 err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat);
580 #endif
581 err2:
582 remove_proc_entry("rt_cache", net->proc_net);
583 err1:
584 return -ENOMEM;
587 static void __net_exit ip_rt_do_proc_exit(struct net *net)
589 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net);
591 remove_proc_entry("rt_acct", net->proc_net);
594 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
595 .init = ip_rt_do_proc_init,
596 .exit = ip_rt_do_proc_exit,
599 static int __init ip_rt_proc_init(void)
601 return register_pernet_subsys(&ip_rt_proc_ops);
604 #else
605 static inline int ip_rt_proc_init(void)
607 return 0;
609 #endif /* CONFIG_PROC_FS */
611 static inline void rt_free(struct rtable *rt)
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
616 static inline void rt_drop(struct rtable *rt)
618 ip_rt_put(rt);
619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
622 static inline int rt_fast_clean(struct rtable *rth)
624 /* Kill broadcast/multicast entries very aggressively, if they
625 collide in the hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next;
630 static inline int rt_valuable(struct rtable *rth)
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
638 unsigned long age;
639 int ret = 0;
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655 out: return ret;
658 /* Bits of score are:
659 * 31: very valuable
660 * 30: not quite useless
661 * 29..0: usage counter
663 static inline u32 rt_score(struct rtable *rt)
665 u32 score = jiffies - rt->u.dst.lastuse;
667 score = ~score & ~(3<<30);
669 if (rt_valuable(rt))
670 score |= (1<<31);
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
676 return score;
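/*
 * Worked example (a sketch): an output route (fl.iif == 0) that is not
 * "valuable" and was last used 10 jiffies ago gets
 *	score = (~10UL & ~(3 << 30)) | (1 << 30)
 * i.e. bit 30 set and low bits that shrink as the entry ages, so the
 * oldest, least useful entry in a chain ends up with the smallest score
 * and becomes the eviction candidate (cand) in rt_intern_hash().
 */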
679 static inline bool rt_caching(const struct net *net)
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0);
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
695 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
699 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 (fl1->oif ^ fl2->oif) |
701 (fl1->iif ^ fl2->iif)) == 0;
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
706 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
709 static inline int rt_is_expired(struct rtable *rth)
711 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
715 * Perform a full scan of the hash table and free all entries.
716 * Can be called by a softirq or a process.
717 * In the latter case, we want to be rescheduled if necessary.
719 static void rt_do_flush(int process_context)
721 unsigned int i;
722 struct rtable *rth, *next;
723 struct rtable * tail;
725 for (i = 0; i <= rt_hash_mask; i++) {
726 if (process_context && need_resched())
727 cond_resched();
728 rth = rt_hash_table[i].chain;
729 if (!rth)
730 continue;
732 spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
735 struct rtable ** prev, * p;
737 rth = rt_hash_table[i].chain;
739 /* defer releasing the head of the list after spin_unlock */
740 for (tail = rth; tail; tail = tail->u.dst.rt_next)
741 if (!rt_is_expired(tail))
742 break;
743 if (rth != tail)
744 rt_hash_table[i].chain = tail;
746 /* call rt_free on entries after the tail requiring flush */
747 prev = &rt_hash_table[i].chain;
748 for (p = *prev; p; p = next) {
749 next = p->u.dst.rt_next;
750 if (!rt_is_expired(p)) {
751 prev = &p->u.dst.rt_next;
752 } else {
753 *prev = next;
754 rt_free(p);
758 #else
759 rth = rt_hash_table[i].chain;
760 rt_hash_table[i].chain = NULL;
761 tail = NULL;
762 #endif
763 spin_unlock_bh(rt_hash_lock_addr(i));
765 for (; rth != tail; rth = next) {
766 next = rth->u.dst.rt_next;
767 rt_free(rth);
773 * While freeing expired entries, we compute average chain length
774 * and standard deviation, using fixed-point arithmetic.
775 * This is to have an estimate of rt_chain_length_max:
776 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
777 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
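/*
 * Worked example (a sketch): with FRACT_BITS == 3, a chain of length 5
 * contributes 5 * ONE == 40 to "sum" in rt_check_expire(), so avg and sd
 * are kept in eighths of an entry; (avg + 4*sd) >> FRACT_BITS converts
 * the bound back to whole entries before it is compared with
 * ip_rt_gc_elasticity.
 */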
783 static void rt_check_expire(void)
785 static unsigned int rover;
786 unsigned int i = rover, goal;
787 struct rtable *rth, *aux, **rthp;
788 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0;
790 u64 mult;
792 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
793 if (ip_rt_gc_timeout > 1)
794 do_div(mult, ip_rt_gc_timeout);
795 goal = (unsigned int)mult;
796 if (goal > rt_hash_mask)
797 goal = rt_hash_mask + 1;
798 for (; goal > 0; goal--) {
799 unsigned long tmo = ip_rt_gc_timeout;
800 unsigned long length;
802 i = (i + 1) & rt_hash_mask;
803 rthp = &rt_hash_table[i].chain;
805 if (need_resched())
806 cond_resched();
808 samples++;
810 if (*rthp == NULL)
811 continue;
812 length = 0;
813 spin_lock_bh(rt_hash_lock_addr(i));
814 while ((rth = *rthp) != NULL) {
815 prefetch(rth->u.dst.rt_next);
816 if (rt_is_expired(rth)) {
817 *rthp = rth->u.dst.rt_next;
818 rt_free(rth);
819 continue;
821 if (rth->u.dst.expires) {
822 /* Entry is expired even if it is in use */
823 if (time_before_eq(jiffies, rth->u.dst.expires)) {
824 nofree:
825 tmo >>= 1;
826 rthp = &rth->u.dst.rt_next;
828 * We only count entries on
829 * a chain with equal hash inputs once
830 * so that entries for different QOS
831 * levels, and other non-hash input
832 * attributes don't unfairly skew
833 * the length computation
835 for (aux = rt_hash_table[i].chain;;) {
836 if (aux == rth) {
837 length += ONE;
838 break;
840 if (compare_hash_inputs(&aux->fl, &rth->fl))
841 break;
842 aux = aux->u.dst.rt_next;
844 continue;
846 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
847 goto nofree;
849 /* Cleanup aged off entries. */
850 *rthp = rth->u.dst.rt_next;
851 rt_free(rth);
853 spin_unlock_bh(rt_hash_lock_addr(i));
854 sum += length;
855 sum2 += length*length;
857 if (samples) {
858 unsigned long avg = sum / samples;
859 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
860 rt_chain_length_max = max_t(unsigned long,
861 ip_rt_gc_elasticity,
862 (avg + 4*sd) >> FRACT_BITS);
864 rover = i;
868 * rt_worker_func() is run in process context.
869 * we call rt_check_expire() to scan part of the hash table
871 static void rt_worker_func(struct work_struct *work)
873 rt_check_expire();
874 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 * Perturbation of rt_genid by a small quantity [1..256].
879 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
880 * many times (2^24) without reusing a recent rt_genid.
881 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
883 static void rt_cache_invalidate(struct net *net)
885 unsigned char shuffle;
887 get_random_bytes(&shuffle, sizeof(shuffle));
888 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 * delay < 0 : invalidate cache (fast : entries will be deleted later)
893 * delay >= 0 : invalidate & flush cache (can be long)
895 void rt_cache_flush(struct net *net, int delay)
897 rt_cache_invalidate(net);
898 if (delay >= 0)
899 rt_do_flush(!in_softirq());
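/*
 * Typical usage (a sketch, not from this file): callers that only need
 * old entries to stop matching pass a negative delay, e.g.
 *	rt_cache_flush(net, -1);
 * while callers that want the hash table walked and emptied right away
 * pass
 *	rt_cache_flush(net, 0);
 */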
903 * We change rt_genid and let gc do the cleanup
905 static void rt_secret_rebuild(unsigned long __net)
907 struct net *net = (struct net *)__net;
908 rt_cache_invalidate(net);
909 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
912 static void rt_secret_rebuild_oneshot(struct net *net)
914 del_timer_sync(&net->ipv4.rt_secret_timer);
915 rt_cache_invalidate(net);
916 if (ip_rt_secret_interval) {
917 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
918 add_timer(&net->ipv4.rt_secret_timer);
922 static void rt_emergency_hash_rebuild(struct net *net)
924 if (net_ratelimit()) {
925 printk(KERN_WARNING "Route hash chain too long!\n");
926 printk(KERN_WARNING "Adjust your secret_interval!\n");
929 rt_secret_rebuild_oneshot(net);
933 Short description of GC goals.
935 We want to build an algorithm which will keep the routing cache
936 at some equilibrium point, where the number of aged-off entries
937 is kept approximately equal to the number of newly generated ones.
939 The current expiration strength is the variable "expire".
940 We try to adjust it dynamically, so that when networking
941 is idle "expire" is large enough to keep enough warm entries,
942 and when load increases it shrinks to limit the cache size.
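/*
 * Numeric sketch of the "goal" computation below: assuming a hash table
 * with rt_hash_log == 17 and the default ip_rt_gc_elasticity of 8, the
 * collector only works towards a real goal once the cache holds more
 * than 8 << 17 == 1048576 entries; below that, goal <= 0 and the code
 * falls back to trimming towards the "equilibrium" size instead.
 */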
945 static int rt_garbage_collect(struct dst_ops *ops)
947 static unsigned long expire = RT_GC_TIMEOUT;
948 static unsigned long last_gc;
949 static int rover;
950 static int equilibrium;
951 struct rtable *rth, **rthp;
952 unsigned long now = jiffies;
953 int goal;
956 * Garbage collection is pretty expensive,
957 * do not make it too frequently.
960 RT_CACHE_STAT_INC(gc_total);
962 if (now - last_gc < ip_rt_gc_min_interval &&
963 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
964 RT_CACHE_STAT_INC(gc_ignored);
965 goto out;
968 /* Calculate number of entries, which we want to expire now. */
969 goal = atomic_read(&ipv4_dst_ops.entries) -
970 (ip_rt_gc_elasticity << rt_hash_log);
971 if (goal <= 0) {
972 if (equilibrium < ipv4_dst_ops.gc_thresh)
973 equilibrium = ipv4_dst_ops.gc_thresh;
974 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
975 if (goal > 0) {
976 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
977 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
979 } else {
980 /* We are in dangerous area. Try to reduce cache really
981 * aggressively.
983 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
984 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
987 if (now - last_gc >= ip_rt_gc_min_interval)
988 last_gc = now;
990 if (goal <= 0) {
991 equilibrium += goal;
992 goto work_done;
995 do {
996 int i, k;
998 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
999 unsigned long tmo = expire;
1001 k = (k + 1) & rt_hash_mask;
1002 rthp = &rt_hash_table[k].chain;
1003 spin_lock_bh(rt_hash_lock_addr(k));
1004 while ((rth = *rthp) != NULL) {
1005 if (!rt_is_expired(rth) &&
1006 !rt_may_expire(rth, tmo, expire)) {
1007 tmo >>= 1;
1008 rthp = &rth->u.dst.rt_next;
1009 continue;
1011 *rthp = rth->u.dst.rt_next;
1012 rt_free(rth);
1013 goal--;
1015 spin_unlock_bh(rt_hash_lock_addr(k));
1016 if (goal <= 0)
1017 break;
1019 rover = k;
1021 if (goal <= 0)
1022 goto work_done;
1024 /* Goal is not achieved. We stop the process if:
1026 - expire has been reduced to zero; otherwise expire is halved.
1027 - the table is not full.
1028 - we are called from interrupt context.
1029 - the jiffies check is just a fallback/debug loop breaker.
1030 We will not spin here for a long time in any case.
1033 RT_CACHE_STAT_INC(gc_goal_miss);
1035 if (expire == 0)
1036 break;
1038 expire >>= 1;
1039 #if RT_CACHE_DEBUG >= 2
1040 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1041 atomic_read(&ipv4_dst_ops.entries), goal, i);
1042 #endif
1044 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1045 goto out;
1046 } while (!in_softirq() && time_before_eq(jiffies, now));
1048 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1049 goto out;
1050 if (net_ratelimit())
1051 printk(KERN_WARNING "dst cache overflow\n");
1052 RT_CACHE_STAT_INC(gc_dst_overflow);
1053 return 1;
1055 work_done:
1056 expire += ip_rt_gc_min_interval;
1057 if (expire > ip_rt_gc_timeout ||
1058 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1059 expire = ip_rt_gc_timeout;
1060 #if RT_CACHE_DEBUG >= 2
1061 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1062 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1063 #endif
1064 out: return 0;
1067 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
1069 struct rtable *rth, **rthp;
1070 unsigned long now;
1071 struct rtable *cand, **candp;
1072 u32 min_score;
1073 int chain_length;
1074 int attempts = !in_softirq();
1076 restart:
1077 chain_length = 0;
1078 min_score = ~(u32)0;
1079 cand = NULL;
1080 candp = NULL;
1081 now = jiffies;
1083 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1085 * If we're not caching, just tell the caller we
1086 * were successful and don't touch the route. The
1087 * caller holds the sole reference to the cache entry, and
1088 * it will be released when the caller is done with it.
1089 * If we drop it here, the callers have no way to resolve routes
1090 * when we're not caching. Instead, just point *rp at rt, so
1091 * the caller gets a single use out of the route
1092 * Note that we do rt_free on this new route entry, so that
1093 * once its refcount hits zero, we are still able to reap it
1094 * (Thanks Alexey)
1095 * Note also the rt_free uses call_rcu. We don't actually
1096 * need rcu protection here, this is just our path to get
1097 * on the route gc list.
1100 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1101 int err = arp_bind_neighbour(&rt->u.dst);
1102 if (err) {
1103 if (net_ratelimit())
1104 printk(KERN_WARNING
1105 "Neighbour table failure & not caching routes.\n");
1106 rt_drop(rt);
1107 return err;
1111 rt_free(rt);
1112 goto skip_hashing;
1115 rthp = &rt_hash_table[hash].chain;
1117 spin_lock_bh(rt_hash_lock_addr(hash));
1118 while ((rth = *rthp) != NULL) {
1119 if (rt_is_expired(rth)) {
1120 *rthp = rth->u.dst.rt_next;
1121 rt_free(rth);
1122 continue;
1124 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1125 /* Put it first */
1126 *rthp = rth->u.dst.rt_next;
1128 * Since lookup is lockfree, the deletion
1129 * must be visible to another weakly ordered CPU before
1130 * the insertion at the start of the hash chain.
1132 rcu_assign_pointer(rth->u.dst.rt_next,
1133 rt_hash_table[hash].chain);
1135 * Since lookup is lockfree, the update writes
1136 * must be ordered for consistency on SMP.
1138 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1140 dst_use(&rth->u.dst, now);
1141 spin_unlock_bh(rt_hash_lock_addr(hash));
1143 rt_drop(rt);
1144 *rp = rth;
1145 return 0;
1148 if (!atomic_read(&rth->u.dst.__refcnt)) {
1149 u32 score = rt_score(rth);
1151 if (score <= min_score) {
1152 cand = rth;
1153 candp = rthp;
1154 min_score = score;
1158 chain_length++;
1160 rthp = &rth->u.dst.rt_next;
1163 if (cand) {
1164 /* ip_rt_gc_elasticity used to be average length of chain
1165 * length, when exceeded gc becomes really aggressive.
1167 * The second limit is less certain. At the moment it allows
1168 * only 2 entries per bucket. We will see.
1170 if (chain_length > ip_rt_gc_elasticity) {
1171 *candp = cand->u.dst.rt_next;
1172 rt_free(cand);
1174 } else {
1175 if (chain_length > rt_chain_length_max) {
1176 struct net *net = dev_net(rt->u.dst.dev);
1177 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1178 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1179 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1180 rt->u.dst.dev->name, num);
1182 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1186 /* Try to bind the route to an ARP entry only if it is an output
1187 route or a unicast forwarding path.
1189 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1190 int err = arp_bind_neighbour(&rt->u.dst);
1191 if (err) {
1192 spin_unlock_bh(rt_hash_lock_addr(hash));
1194 if (err != -ENOBUFS) {
1195 rt_drop(rt);
1196 return err;
1199 /* Neighbour tables are full and nothing
1200 can be released. Try to shrink the route cache;
1201 it most likely holds some neighbour records.
1203 if (attempts-- > 0) {
1204 int saved_elasticity = ip_rt_gc_elasticity;
1205 int saved_int = ip_rt_gc_min_interval;
1206 ip_rt_gc_elasticity = 1;
1207 ip_rt_gc_min_interval = 0;
1208 rt_garbage_collect(&ipv4_dst_ops);
1209 ip_rt_gc_min_interval = saved_int;
1210 ip_rt_gc_elasticity = saved_elasticity;
1211 goto restart;
1214 if (net_ratelimit())
1215 printk(KERN_WARNING "Neighbour table overflow.\n");
1216 rt_drop(rt);
1217 return -ENOBUFS;
1221 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1223 #if RT_CACHE_DEBUG >= 2
1224 if (rt->u.dst.rt_next) {
1225 struct rtable *trt;
1226 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1227 hash, &rt->rt_dst);
1228 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1229 printk(" . %pI4", &trt->rt_dst);
1230 printk("\n");
1232 #endif
1234 * Since lookup is lockfree, we must make sure
1235 * previous writes to rt are committed to memory
1236 * before making rt visible to other CPUs.
1238 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1240 spin_unlock_bh(rt_hash_lock_addr(hash));
1242 skip_hashing:
1243 *rp = rt;
1244 return 0;
1247 void rt_bind_peer(struct rtable *rt, int create)
1249 static DEFINE_SPINLOCK(rt_peer_lock);
1250 struct inet_peer *peer;
1252 peer = inet_getpeer(rt->rt_dst, create);
1254 spin_lock_bh(&rt_peer_lock);
1255 if (rt->peer == NULL) {
1256 rt->peer = peer;
1257 peer = NULL;
1259 spin_unlock_bh(&rt_peer_lock);
1260 if (peer)
1261 inet_putpeer(peer);
1265 * Peer allocation may fail only in serious out-of-memory conditions. However
1266 * we can still generate some output.
1267 * Random ID selection looks a bit dangerous because we have no chance of
1268 * selecting an ID that is unique over a reasonable period of time.
1269 * But a broken packet identifier may be better than no packet at all.
1271 static void ip_select_fb_ident(struct iphdr *iph)
1273 static DEFINE_SPINLOCK(ip_fb_id_lock);
1274 static u32 ip_fallback_id;
1275 u32 salt;
1277 spin_lock_bh(&ip_fb_id_lock);
1278 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1279 iph->id = htons(salt & 0xFFFF);
1280 ip_fallback_id = salt;
1281 spin_unlock_bh(&ip_fb_id_lock);
1284 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1286 struct rtable *rt = (struct rtable *) dst;
1288 if (rt) {
1289 if (rt->peer == NULL)
1290 rt_bind_peer(rt, 1);
1292 /* If peer is attached to destination, it is never detached,
1293 so we do not need to grab a lock to dereference it.
1295 if (rt->peer) {
1296 iph->id = htons(inet_getid(rt->peer, more));
1297 return;
1299 } else
1300 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1301 __builtin_return_address(0));
1303 ip_select_fb_ident(iph);
1306 static void rt_del(unsigned hash, struct rtable *rt)
1308 struct rtable **rthp, *aux;
1310 rthp = &rt_hash_table[hash].chain;
1311 spin_lock_bh(rt_hash_lock_addr(hash));
1312 ip_rt_put(rt);
1313 while ((aux = *rthp) != NULL) {
1314 if (aux == rt || rt_is_expired(aux)) {
1315 *rthp = aux->u.dst.rt_next;
1316 rt_free(aux);
1317 continue;
1319 rthp = &aux->u.dst.rt_next;
1321 spin_unlock_bh(rt_hash_lock_addr(hash));
1324 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1325 __be32 saddr, struct net_device *dev)
1327 int i, k;
1328 struct in_device *in_dev = in_dev_get(dev);
1329 struct rtable *rth, **rthp;
1330 __be32 skeys[2] = { saddr, 0 };
1331 int ikeys[2] = { dev->ifindex, 0 };
1332 struct netevent_redirect netevent;
1333 struct net *net;
1335 if (!in_dev)
1336 return;
1338 net = dev_net(dev);
1339 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1340 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1341 || ipv4_is_zeronet(new_gw))
1342 goto reject_redirect;
1344 if (!rt_caching(net))
1345 goto reject_redirect;
1347 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1348 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1349 goto reject_redirect;
1350 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1351 goto reject_redirect;
1352 } else {
1353 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1354 goto reject_redirect;
1357 for (i = 0; i < 2; i++) {
1358 for (k = 0; k < 2; k++) {
1359 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1360 rt_genid(net));
1362 rthp=&rt_hash_table[hash].chain;
1364 rcu_read_lock();
1365 while ((rth = rcu_dereference(*rthp)) != NULL) {
1366 struct rtable *rt;
1368 if (rth->fl.fl4_dst != daddr ||
1369 rth->fl.fl4_src != skeys[i] ||
1370 rth->fl.oif != ikeys[k] ||
1371 rth->fl.iif != 0 ||
1372 rt_is_expired(rth) ||
1373 !net_eq(dev_net(rth->u.dst.dev), net)) {
1374 rthp = &rth->u.dst.rt_next;
1375 continue;
1378 if (rth->rt_dst != daddr ||
1379 rth->rt_src != saddr ||
1380 rth->u.dst.error ||
1381 rth->rt_gateway != old_gw ||
1382 rth->u.dst.dev != dev)
1383 break;
1385 dst_hold(&rth->u.dst);
1386 rcu_read_unlock();
1388 rt = dst_alloc(&ipv4_dst_ops);
1389 if (rt == NULL) {
1390 ip_rt_put(rth);
1391 in_dev_put(in_dev);
1392 return;
1395 /* Copy all the information. */
1396 *rt = *rth;
1397 rt->u.dst.__use = 1;
1398 atomic_set(&rt->u.dst.__refcnt, 1);
1399 rt->u.dst.child = NULL;
1400 if (rt->u.dst.dev)
1401 dev_hold(rt->u.dst.dev);
1402 if (rt->idev)
1403 in_dev_hold(rt->idev);
1404 rt->u.dst.obsolete = 0;
1405 rt->u.dst.lastuse = jiffies;
1406 rt->u.dst.path = &rt->u.dst;
1407 rt->u.dst.neighbour = NULL;
1408 rt->u.dst.hh = NULL;
1409 #ifdef CONFIG_XFRM
1410 rt->u.dst.xfrm = NULL;
1411 #endif
1412 rt->rt_genid = rt_genid(net);
1413 rt->rt_flags |= RTCF_REDIRECTED;
1415 /* Gateway is different ... */
1416 rt->rt_gateway = new_gw;
1418 /* Redirect received -> path was valid */
1419 dst_confirm(&rth->u.dst);
1421 if (rt->peer)
1422 atomic_inc(&rt->peer->refcnt);
1424 if (arp_bind_neighbour(&rt->u.dst) ||
1425 !(rt->u.dst.neighbour->nud_state &
1426 NUD_VALID)) {
1427 if (rt->u.dst.neighbour)
1428 neigh_event_send(rt->u.dst.neighbour, NULL);
1429 ip_rt_put(rth);
1430 rt_drop(rt);
1431 goto do_next;
1434 netevent.old = &rth->u.dst;
1435 netevent.new = &rt->u.dst;
1436 call_netevent_notifiers(NETEVENT_REDIRECT,
1437 &netevent);
1439 rt_del(hash, rth);
1440 if (!rt_intern_hash(hash, rt, &rt))
1441 ip_rt_put(rt);
1442 goto do_next;
1444 rcu_read_unlock();
1445 do_next:
1449 in_dev_put(in_dev);
1450 return;
1452 reject_redirect:
1453 #ifdef CONFIG_IP_ROUTE_VERBOSE
1454 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1455 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1456 " Advised path = %pI4 -> %pI4\n",
1457 &old_gw, dev->name, &new_gw,
1458 &saddr, &daddr);
1459 #endif
1460 in_dev_put(in_dev);
1463 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1465 struct rtable *rt = (struct rtable *)dst;
1466 struct dst_entry *ret = dst;
1468 if (rt) {
1469 if (dst->obsolete) {
1470 ip_rt_put(rt);
1471 ret = NULL;
1472 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1473 rt->u.dst.expires) {
1474 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1475 rt->fl.oif,
1476 rt_genid(dev_net(dst->dev)));
1477 #if RT_CACHE_DEBUG >= 1
1478 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1479 &rt->rt_dst, rt->fl.fl4_tos);
1480 #endif
1481 rt_del(hash, rt);
1482 ret = NULL;
1485 return ret;
1489 * Algorithm:
1490 * 1. The first ip_rt_redirect_number redirects are sent
1491 * with exponential backoff, then we stop sending them at all,
1492 * assuming that the host ignores our redirects.
1493 * 2. If we did not see packets requiring redirects
1494 * during ip_rt_redirect_silence, we assume that the host
1495 * forgot redirected route and start to send redirects again.
1497 * This algorithm is much cheaper and more intelligent than dumb load limiting
1498 * in icmp.c.
1500 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1501 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1504 void ip_rt_send_redirect(struct sk_buff *skb)
1506 struct rtable *rt = skb->rtable;
1507 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1509 if (!in_dev)
1510 return;
1512 if (!IN_DEV_TX_REDIRECTS(in_dev))
1513 goto out;
1515 /* No redirected packets during ip_rt_redirect_silence;
1516 * reset the algorithm.
1518 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1519 rt->u.dst.rate_tokens = 0;
1521 /* Too many ignored redirects; do not send anything
1522 * set u.dst.rate_last to the last seen redirected packet.
1524 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1525 rt->u.dst.rate_last = jiffies;
1526 goto out;
1529 /* Check for load limit; set rate_last to the latest sent
1530 * redirect.
1532 if (rt->u.dst.rate_tokens == 0 ||
1533 time_after(jiffies,
1534 (rt->u.dst.rate_last +
1535 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1536 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1537 rt->u.dst.rate_last = jiffies;
1538 ++rt->u.dst.rate_tokens;
1539 #ifdef CONFIG_IP_ROUTE_VERBOSE
1540 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1541 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1542 net_ratelimit())
1543 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1544 &rt->rt_src, rt->rt_iif,
1545 &rt->rt_dst, &rt->rt_gateway);
1546 #endif
1548 out:
1549 in_dev_put(in_dev);
1552 static int ip_error(struct sk_buff *skb)
1554 struct rtable *rt = skb->rtable;
1555 unsigned long now;
1556 int code;
1558 switch (rt->u.dst.error) {
1559 case EINVAL:
1560 default:
1561 goto out;
1562 case EHOSTUNREACH:
1563 code = ICMP_HOST_UNREACH;
1564 break;
1565 case ENETUNREACH:
1566 code = ICMP_NET_UNREACH;
1567 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1568 IPSTATS_MIB_INNOROUTES);
1569 break;
1570 case EACCES:
1571 code = ICMP_PKT_FILTERED;
1572 break;
1575 now = jiffies;
1576 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1577 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1578 rt->u.dst.rate_tokens = ip_rt_error_burst;
1579 rt->u.dst.rate_last = now;
1580 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1581 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1582 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1585 out: kfree_skb(skb);
1586 return 0;
1590 * The last two values are not from the RFC but
1591 * are needed for AMPRnet AX.25 paths.
1594 static const unsigned short mtu_plateau[] =
1595 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1597 static inline unsigned short guess_mtu(unsigned short old_mtu)
1599 int i;
1601 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1602 if (old_mtu > mtu_plateau[i])
1603 return mtu_plateau[i];
1604 return 68;
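/*
 * Worked example (a sketch): if a router reports "fragmentation needed"
 * with a next-hop MTU of 0 (old BSD behaviour) for a 1500 byte packet,
 * guess_mtu(1500) walks the plateau table and returns 1492, the largest
 * plateau strictly below the old MTU; 68 is the protocol minimum.
 */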
1607 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1608 unsigned short new_mtu,
1609 struct net_device *dev)
1611 int i, k;
1612 unsigned short old_mtu = ntohs(iph->tot_len);
1613 struct rtable *rth;
1614 int ikeys[2] = { dev->ifindex, 0 };
1615 __be32 skeys[2] = { iph->saddr, 0, };
1616 __be32 daddr = iph->daddr;
1617 unsigned short est_mtu = 0;
1619 if (ipv4_config.no_pmtu_disc)
1620 return 0;
1622 for (k = 0; k < 2; k++) {
1623 for (i = 0; i < 2; i++) {
1624 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1625 rt_genid(net));
1627 rcu_read_lock();
1628 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1629 rth = rcu_dereference(rth->u.dst.rt_next)) {
1630 unsigned short mtu = new_mtu;
1632 if (rth->fl.fl4_dst != daddr ||
1633 rth->fl.fl4_src != skeys[i] ||
1634 rth->rt_dst != daddr ||
1635 rth->rt_src != iph->saddr ||
1636 rth->fl.oif != ikeys[k] ||
1637 rth->fl.iif != 0 ||
1638 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1639 !net_eq(dev_net(rth->u.dst.dev), net) ||
1640 rt_is_expired(rth))
1641 continue;
1643 if (new_mtu < 68 || new_mtu >= old_mtu) {
1645 /* BSD 4.2 compatibility hack :-( */
1646 if (mtu == 0 &&
1647 old_mtu >= dst_mtu(&rth->u.dst) &&
1648 old_mtu >= 68 + (iph->ihl << 2))
1649 old_mtu -= iph->ihl << 2;
1651 mtu = guess_mtu(old_mtu);
1653 if (mtu <= dst_mtu(&rth->u.dst)) {
1654 if (mtu < dst_mtu(&rth->u.dst)) {
1655 dst_confirm(&rth->u.dst);
1656 if (mtu < ip_rt_min_pmtu) {
1657 mtu = ip_rt_min_pmtu;
1658 rth->u.dst.metrics[RTAX_LOCK-1] |=
1659 (1 << RTAX_MTU);
1661 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1662 dst_set_expires(&rth->u.dst,
1663 ip_rt_mtu_expires);
1665 est_mtu = mtu;
1668 rcu_read_unlock();
1671 return est_mtu ? : new_mtu;
1674 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1676 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1677 !(dst_metric_locked(dst, RTAX_MTU))) {
1678 if (mtu < ip_rt_min_pmtu) {
1679 mtu = ip_rt_min_pmtu;
1680 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1682 dst->metrics[RTAX_MTU-1] = mtu;
1683 dst_set_expires(dst, ip_rt_mtu_expires);
1684 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1688 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1690 return NULL;
1693 static void ipv4_dst_destroy(struct dst_entry *dst)
1695 struct rtable *rt = (struct rtable *) dst;
1696 struct inet_peer *peer = rt->peer;
1697 struct in_device *idev = rt->idev;
1699 if (peer) {
1700 rt->peer = NULL;
1701 inet_putpeer(peer);
1704 if (idev) {
1705 rt->idev = NULL;
1706 in_dev_put(idev);
1710 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1711 int how)
1713 struct rtable *rt = (struct rtable *) dst;
1714 struct in_device *idev = rt->idev;
1715 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1716 struct in_device *loopback_idev =
1717 in_dev_get(dev_net(dev)->loopback_dev);
1718 if (loopback_idev) {
1719 rt->idev = loopback_idev;
1720 in_dev_put(idev);
1725 static void ipv4_link_failure(struct sk_buff *skb)
1727 struct rtable *rt;
1729 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1731 rt = skb->rtable;
1732 if (rt)
1733 dst_set_expires(&rt->u.dst, 0);
1736 static int ip_rt_bug(struct sk_buff *skb)
1738 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1739 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1740 skb->dev ? skb->dev->name : "?");
1741 kfree_skb(skb);
1742 return 0;
1746 We do not cache the source address of the outgoing interface,
1747 because it is used only by IP RR, TS and SRR options,
1748 so it is out of the fast path.
1750 BTW remember: "addr" is allowed to be unaligned
1751 in IP options!
1754 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1756 __be32 src;
1757 struct fib_result res;
1759 if (rt->fl.iif == 0)
1760 src = rt->rt_src;
1761 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1762 src = FIB_RES_PREFSRC(res);
1763 fib_res_put(&res);
1764 } else
1765 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1766 RT_SCOPE_UNIVERSE);
1767 memcpy(addr, &src, 4);
1770 #ifdef CONFIG_NET_CLS_ROUTE
1771 static void set_class_tag(struct rtable *rt, u32 tag)
1773 if (!(rt->u.dst.tclassid & 0xFFFF))
1774 rt->u.dst.tclassid |= tag & 0xFFFF;
1775 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1776 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1778 #endif
1780 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1782 struct fib_info *fi = res->fi;
1784 if (fi) {
1785 if (FIB_RES_GW(*res) &&
1786 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1787 rt->rt_gateway = FIB_RES_GW(*res);
1788 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1789 sizeof(rt->u.dst.metrics));
1790 if (fi->fib_mtu == 0) {
1791 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1792 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1793 rt->rt_gateway != rt->rt_dst &&
1794 rt->u.dst.dev->mtu > 576)
1795 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1797 #ifdef CONFIG_NET_CLS_ROUTE
1798 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1799 #endif
1800 } else
1801 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1803 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1804 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1805 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1806 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1807 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1808 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1809 ip_rt_min_advmss);
1810 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1811 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1813 #ifdef CONFIG_NET_CLS_ROUTE
1814 #ifdef CONFIG_IP_MULTIPLE_TABLES
1815 set_class_tag(rt, fib_rules_tclass(res));
1816 #endif
1817 set_class_tag(rt, itag);
1818 #endif
1819 rt->rt_type = res->type;
1822 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1823 u8 tos, struct net_device *dev, int our)
1825 unsigned hash;
1826 struct rtable *rth;
1827 __be32 spec_dst;
1828 struct in_device *in_dev = in_dev_get(dev);
1829 u32 itag = 0;
1831 /* Primary sanity checks. */
1833 if (in_dev == NULL)
1834 return -EINVAL;
1836 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1837 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1838 goto e_inval;
1840 if (ipv4_is_zeronet(saddr)) {
1841 if (!ipv4_is_local_multicast(daddr))
1842 goto e_inval;
1843 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1844 } else if (fib_validate_source(saddr, 0, tos, 0,
1845 dev, &spec_dst, &itag) < 0)
1846 goto e_inval;
1848 rth = dst_alloc(&ipv4_dst_ops);
1849 if (!rth)
1850 goto e_nobufs;
1852 rth->u.dst.output= ip_rt_bug;
1854 atomic_set(&rth->u.dst.__refcnt, 1);
1855 rth->u.dst.flags= DST_HOST;
1856 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1857 rth->u.dst.flags |= DST_NOPOLICY;
1858 rth->fl.fl4_dst = daddr;
1859 rth->rt_dst = daddr;
1860 rth->fl.fl4_tos = tos;
1861 rth->fl.mark = skb->mark;
1862 rth->fl.fl4_src = saddr;
1863 rth->rt_src = saddr;
1864 #ifdef CONFIG_NET_CLS_ROUTE
1865 rth->u.dst.tclassid = itag;
1866 #endif
1867 rth->rt_iif =
1868 rth->fl.iif = dev->ifindex;
1869 rth->u.dst.dev = init_net.loopback_dev;
1870 dev_hold(rth->u.dst.dev);
1871 rth->idev = in_dev_get(rth->u.dst.dev);
1872 rth->fl.oif = 0;
1873 rth->rt_gateway = daddr;
1874 rth->rt_spec_dst= spec_dst;
1875 rth->rt_genid = rt_genid(dev_net(dev));
1876 rth->rt_flags = RTCF_MULTICAST;
1877 rth->rt_type = RTN_MULTICAST;
1878 if (our) {
1879 rth->u.dst.input= ip_local_deliver;
1880 rth->rt_flags |= RTCF_LOCAL;
1883 #ifdef CONFIG_IP_MROUTE
1884 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1885 rth->u.dst.input = ip_mr_input;
1886 #endif
1887 RT_CACHE_STAT_INC(in_slow_mc);
1889 in_dev_put(in_dev);
1890 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1891 return rt_intern_hash(hash, rth, &skb->rtable);
1893 e_nobufs:
1894 in_dev_put(in_dev);
1895 return -ENOBUFS;
1897 e_inval:
1898 in_dev_put(in_dev);
1899 return -EINVAL;
1903 static void ip_handle_martian_source(struct net_device *dev,
1904 struct in_device *in_dev,
1905 struct sk_buff *skb,
1906 __be32 daddr,
1907 __be32 saddr)
1909 RT_CACHE_STAT_INC(in_martian_src);
1910 #ifdef CONFIG_IP_ROUTE_VERBOSE
1911 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1913 * RFC1812 recommendation, if source is martian,
1914 * the only hint is MAC header.
1916 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1917 &daddr, &saddr, dev->name);
1918 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1919 int i;
1920 const unsigned char *p = skb_mac_header(skb);
1921 printk(KERN_WARNING "ll header: ");
1922 for (i = 0; i < dev->hard_header_len; i++, p++) {
1923 printk("%02x", *p);
1924 if (i < (dev->hard_header_len - 1))
1925 printk(":");
1927 printk("\n");
1930 #endif
1933 static int __mkroute_input(struct sk_buff *skb,
1934 struct fib_result *res,
1935 struct in_device *in_dev,
1936 __be32 daddr, __be32 saddr, u32 tos,
1937 struct rtable **result)
1940 struct rtable *rth;
1941 int err;
1942 struct in_device *out_dev;
1943 unsigned flags = 0;
1944 __be32 spec_dst;
1945 u32 itag;
1947 /* get a working reference to the output device */
1948 out_dev = in_dev_get(FIB_RES_DEV(*res));
1949 if (out_dev == NULL) {
1950 if (net_ratelimit())
1951 printk(KERN_CRIT "Bug in ip_route_input" \
1952 "_slow(). Please, report\n");
1953 return -EINVAL;
1957 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1958 in_dev->dev, &spec_dst, &itag);
1959 if (err < 0) {
1960 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1961 saddr);
1963 err = -EINVAL;
1964 goto cleanup;
1967 if (err)
1968 flags |= RTCF_DIRECTSRC;
1970 if (out_dev == in_dev && err &&
1971 (IN_DEV_SHARED_MEDIA(out_dev) ||
1972 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1973 flags |= RTCF_DOREDIRECT;
1975 if (skb->protocol != htons(ETH_P_IP)) {
1976 /* Not IP (i.e. ARP). Do not create route, if it is
1977 * invalid for proxy arp. DNAT routes are always valid.
1979 if (out_dev == in_dev) {
1980 err = -EINVAL;
1981 goto cleanup;
1986 rth = dst_alloc(&ipv4_dst_ops);
1987 if (!rth) {
1988 err = -ENOBUFS;
1989 goto cleanup;
1992 atomic_set(&rth->u.dst.__refcnt, 1);
1993 rth->u.dst.flags= DST_HOST;
1994 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1995 rth->u.dst.flags |= DST_NOPOLICY;
1996 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1997 rth->u.dst.flags |= DST_NOXFRM;
1998 rth->fl.fl4_dst = daddr;
1999 rth->rt_dst = daddr;
2000 rth->fl.fl4_tos = tos;
2001 rth->fl.mark = skb->mark;
2002 rth->fl.fl4_src = saddr;
2003 rth->rt_src = saddr;
2004 rth->rt_gateway = daddr;
2005 rth->rt_iif =
2006 rth->fl.iif = in_dev->dev->ifindex;
2007 rth->u.dst.dev = (out_dev)->dev;
2008 dev_hold(rth->u.dst.dev);
2009 rth->idev = in_dev_get(rth->u.dst.dev);
2010 rth->fl.oif = 0;
2011 rth->rt_spec_dst= spec_dst;
2013 rth->u.dst.input = ip_forward;
2014 rth->u.dst.output = ip_output;
2015 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2017 rt_set_nexthop(rth, res, itag);
2019 rth->rt_flags = flags;
2021 *result = rth;
2022 err = 0;
2023 cleanup:
2024 /* release the working reference to the output device */
2025 in_dev_put(out_dev);
2026 return err;
2029 static int ip_mkroute_input(struct sk_buff *skb,
2030 struct fib_result *res,
2031 const struct flowi *fl,
2032 struct in_device *in_dev,
2033 __be32 daddr, __be32 saddr, u32 tos)
2035 struct rtable* rth = NULL;
2036 int err;
2037 unsigned hash;
2039 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2040 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2041 fib_select_multipath(fl, res);
2042 #endif
2044 /* create a routing cache entry */
2045 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2046 if (err)
2047 return err;
2049 /* put it into the cache */
2050 hash = rt_hash(daddr, saddr, fl->iif,
2051 rt_genid(dev_net(rth->u.dst.dev)));
2052 return rt_intern_hash(hash, rth, &skb->rtable);
2056 * NOTE. We drop all packets that have local source
2057 * addresses, because every properly looped-back packet
2058 * must already have the correct destination attached by the output routine.
2060 * Such an approach solves two big problems:
2061 * 1. Non-simplex devices are handled properly.
2062 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2065 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2066 u8 tos, struct net_device *dev)
2068 struct fib_result res;
2069 struct in_device *in_dev = in_dev_get(dev);
2070 struct flowi fl = { .nl_u = { .ip4_u =
2071 { .daddr = daddr,
2072 .saddr = saddr,
2073 .tos = tos,
2074 .scope = RT_SCOPE_UNIVERSE,
2075 } },
2076 .mark = skb->mark,
2077 .iif = dev->ifindex };
2078 unsigned flags = 0;
2079 u32 itag = 0;
2080 struct rtable * rth;
2081 unsigned hash;
2082 __be32 spec_dst;
2083 int err = -EINVAL;
2084 int free_res = 0;
2085 struct net * net = dev_net(dev);
2087 /* IP on this device is disabled. */
2089 if (!in_dev)
2090 goto out;
2092 /* Check for the most weird martians, which may not be detected
2093 by fib_lookup.
2096 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2097 ipv4_is_loopback(saddr))
2098 goto martian_source;
2100 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2101 goto brd_input;
2103 /* Accept zero addresses only for the limited broadcast;
2104 * I do not even know whether to fix this or not. Waiting for complaints :-)
2106 if (ipv4_is_zeronet(saddr))
2107 goto martian_source;
2109 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2110 ipv4_is_loopback(daddr))
2111 goto martian_destination;
2114 * Now we are ready to route packet.
2116 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2117 if (!IN_DEV_FORWARD(in_dev))
2118 goto e_hostunreach;
2119 goto no_route;
2121 free_res = 1;
2123 RT_CACHE_STAT_INC(in_slow_tot);
2125 if (res.type == RTN_BROADCAST)
2126 goto brd_input;
2128 if (res.type == RTN_LOCAL) {
2129 int result;
2130 result = fib_validate_source(saddr, daddr, tos,
2131 net->loopback_dev->ifindex,
2132 dev, &spec_dst, &itag);
2133 if (result < 0)
2134 goto martian_source;
2135 if (result)
2136 flags |= RTCF_DIRECTSRC;
2137 spec_dst = daddr;
2138 goto local_input;
2141 if (!IN_DEV_FORWARD(in_dev))
2142 goto e_hostunreach;
2143 if (res.type != RTN_UNICAST)
2144 goto martian_destination;
2146 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2147 done:
2148 in_dev_put(in_dev);
2149 if (free_res)
2150 fib_res_put(&res);
2151 out: return err;
2153 brd_input:
2154 if (skb->protocol != htons(ETH_P_IP))
2155 goto e_inval;
2157 if (ipv4_is_zeronet(saddr))
2158 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2159 else {
2160 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2161 &itag);
2162 if (err < 0)
2163 goto martian_source;
2164 if (err)
2165 flags |= RTCF_DIRECTSRC;
2167 flags |= RTCF_BROADCAST;
2168 res.type = RTN_BROADCAST;
2169 RT_CACHE_STAT_INC(in_brd);
2171 local_input:
2172 rth = dst_alloc(&ipv4_dst_ops);
2173 if (!rth)
2174 goto e_nobufs;
2176 rth->u.dst.output= ip_rt_bug;
2177 rth->rt_genid = rt_genid(net);
2179 atomic_set(&rth->u.dst.__refcnt, 1);
2180 rth->u.dst.flags= DST_HOST;
2181 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2182 rth->u.dst.flags |= DST_NOPOLICY;
2183 rth->fl.fl4_dst = daddr;
2184 rth->rt_dst = daddr;
2185 rth->fl.fl4_tos = tos;
2186 rth->fl.mark = skb->mark;
2187 rth->fl.fl4_src = saddr;
2188 rth->rt_src = saddr;
2189 #ifdef CONFIG_NET_CLS_ROUTE
2190 rth->u.dst.tclassid = itag;
2191 #endif
2192 rth->rt_iif =
2193 rth->fl.iif = dev->ifindex;
2194 rth->u.dst.dev = net->loopback_dev;
2195 dev_hold(rth->u.dst.dev);
2196 rth->idev = in_dev_get(rth->u.dst.dev);
2197 rth->rt_gateway = daddr;
2198 rth->rt_spec_dst= spec_dst;
2199 rth->u.dst.input= ip_local_deliver;
2200 rth->rt_flags = flags|RTCF_LOCAL;
2201 if (res.type == RTN_UNREACHABLE) {
2202 rth->u.dst.input= ip_error;
2203 rth->u.dst.error= -err;
2204 rth->rt_flags &= ~RTCF_LOCAL;
2206 rth->rt_type = res.type;
2207 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2208 err = rt_intern_hash(hash, rth, &skb->rtable);
2209 goto done;
2211 no_route:
2212 RT_CACHE_STAT_INC(in_no_route);
2213 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2214 res.type = RTN_UNREACHABLE;
2215 if (err == -ESRCH)
2216 err = -ENETUNREACH;
2217 goto local_input;
2220 * Do not cache martian addresses: they should be logged (RFC1812)
2222 martian_destination:
2223 RT_CACHE_STAT_INC(in_martian_dst);
2224 #ifdef CONFIG_IP_ROUTE_VERBOSE
2225 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2226 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2227 &daddr, &saddr, dev->name);
2228 #endif
2230 e_hostunreach:
2231 err = -EHOSTUNREACH;
2232 goto done;
2234 e_inval:
2235 err = -EINVAL;
2236 goto done;
2238 e_nobufs:
2239 err = -ENOBUFS;
2240 goto done;
2242 martian_source:
2243 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2244 goto e_inval;
2247 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2248 u8 tos, struct net_device *dev)
2250 struct rtable * rth;
2251 unsigned hash;
2252 int iif = dev->ifindex;
2253 struct net *net;
2255 net = dev_net(dev);
2257 if (!rt_caching(net))
2258 goto skip_cache;
2260 tos &= IPTOS_RT_MASK;
2261 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2263 rcu_read_lock();
2264 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2265 rth = rcu_dereference(rth->u.dst.rt_next)) {
2266 if (((rth->fl.fl4_dst ^ daddr) |
2267 (rth->fl.fl4_src ^ saddr) |
2268 (rth->fl.iif ^ iif) |
2269 rth->fl.oif |
2270 (rth->fl.fl4_tos ^ tos)) == 0 &&
2271 rth->fl.mark == skb->mark &&
2272 net_eq(dev_net(rth->u.dst.dev), net) &&
2273 !rt_is_expired(rth)) {
2274 dst_use(&rth->u.dst, jiffies);
2275 RT_CACHE_STAT_INC(in_hit);
2276 rcu_read_unlock();
2277 skb->rtable = rth;
2278 return 0;
2280 RT_CACHE_STAT_INC(in_hlist_search);
2282 rcu_read_unlock();
2284 skip_cache:
2285 /* Multicast recognition logic was moved from the route cache to here.
2286 The problem was that too many Ethernet cards have broken/missing
2287 hardware multicast filters :-( As a result, a host on a multicast
2288 network acquires a lot of useless route cache entries, e.g. from
2289 SDR messages from all over the world. Now we try to get rid of them.
2290 Really, provided the software IP multicast filter is organized
2291 reasonably (at least, hashed), it does not result in a slowdown
2292 compared with route cache reject entries.
2293 Note that multicast routers are not affected, because a
2294 route cache entry is created eventually.
2296 if (ipv4_is_multicast(daddr)) {
2297 struct in_device *in_dev;
2299 rcu_read_lock();
2300 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2301 int our = ip_check_mc(in_dev, daddr, saddr,
2302 ip_hdr(skb)->protocol);
2303 if (our
2304 #ifdef CONFIG_IP_MROUTE
2305 || (!ipv4_is_local_multicast(daddr) &&
2306 IN_DEV_MFORWARD(in_dev))
2307 #endif
2309 rcu_read_unlock();
2310 return ip_route_input_mc(skb, daddr, saddr,
2311 tos, dev, our);
2314 rcu_read_unlock();
2315 return -EINVAL;
2317 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
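/*
 * Editor's illustrative sketch (not part of the original source): how a
 * receive-path caller is expected to use ip_route_input().  The helper name
 * example_route_incoming() is hypothetical; the real caller of this form is
 * ip_rcv_finish() in net/ipv4/ip_input.c.
 *
 *	static int example_route_incoming(struct sk_buff *skb)
 *	{
 *		const struct iphdr *iph = ip_hdr(skb);
 *		int err;
 *
 *		// Attach a route to the skb; on success skb->rtable holds a
 *		// referenced cache entry, either found in the hash table or
 *		// built by ip_route_input_slow()/ip_route_input_mc().
 *		err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				     iph->tos, skb->dev);
 *		if (err)
 *			return err;
 *
 *		// Dispatch via the route's input hook: ip_local_deliver,
 *		// ip_forward, ip_mr_input or ip_error.
 *		return dst_input(skb);
 *	}
 */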
2320 static int __mkroute_output(struct rtable **result,
2321 struct fib_result *res,
2322 const struct flowi *fl,
2323 const struct flowi *oldflp,
2324 struct net_device *dev_out,
2325 unsigned flags)
2327 struct rtable *rth;
2328 struct in_device *in_dev;
2329 u32 tos = RT_FL_TOS(oldflp);
2330 int err = 0;
2332 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2333 return -EINVAL;
2335 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2336 res->type = RTN_BROADCAST;
2337 else if (ipv4_is_multicast(fl->fl4_dst))
2338 res->type = RTN_MULTICAST;
2339 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2340 return -EINVAL;
2342 if (dev_out->flags & IFF_LOOPBACK)
2343 flags |= RTCF_LOCAL;
2345 /* get a work reference to the inet device */
2346 in_dev = in_dev_get(dev_out);
2347 if (!in_dev)
2348 return -EINVAL;
2350 if (res->type == RTN_BROADCAST) {
2351 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2352 if (res->fi) {
2353 fib_info_put(res->fi);
2354 res->fi = NULL;
2356 } else if (res->type == RTN_MULTICAST) {
2357 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2358 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2359 oldflp->proto))
2360 flags &= ~RTCF_LOCAL;
2361 /* If a multicast route does not exist, use the
2362 default one, but do not gateway in this case.
2363 Yes, it is a hack.
2365 if (res->fi && res->prefixlen < 4) {
2366 fib_info_put(res->fi);
2367 res->fi = NULL;
2372 rth = dst_alloc(&ipv4_dst_ops);
2373 if (!rth) {
2374 err = -ENOBUFS;
2375 goto cleanup;
2378 atomic_set(&rth->u.dst.__refcnt, 1);
2379 rth->u.dst.flags= DST_HOST;
2380 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2381 rth->u.dst.flags |= DST_NOXFRM;
2382 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2383 rth->u.dst.flags |= DST_NOPOLICY;
2385 rth->fl.fl4_dst = oldflp->fl4_dst;
2386 rth->fl.fl4_tos = tos;
2387 rth->fl.fl4_src = oldflp->fl4_src;
2388 rth->fl.oif = oldflp->oif;
2389 rth->fl.mark = oldflp->mark;
2390 rth->rt_dst = fl->fl4_dst;
2391 rth->rt_src = fl->fl4_src;
2392 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2393 /* get references to the devices that are to be held by the routing
2394 cache entry */
2395 rth->u.dst.dev = dev_out;
2396 dev_hold(dev_out);
2397 rth->idev = in_dev_get(dev_out);
2398 rth->rt_gateway = fl->fl4_dst;
2399 rth->rt_spec_dst= fl->fl4_src;
2401 rth->u.dst.output=ip_output;
2402 rth->rt_genid = rt_genid(dev_net(dev_out));
2404 RT_CACHE_STAT_INC(out_slow_tot);
2406 if (flags & RTCF_LOCAL) {
2407 rth->u.dst.input = ip_local_deliver;
2408 rth->rt_spec_dst = fl->fl4_dst;
2410 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2411 rth->rt_spec_dst = fl->fl4_src;
2412 if (flags & RTCF_LOCAL &&
2413 !(dev_out->flags & IFF_LOOPBACK)) {
2414 rth->u.dst.output = ip_mc_output;
2415 RT_CACHE_STAT_INC(out_slow_mc);
2417 #ifdef CONFIG_IP_MROUTE
2418 if (res->type == RTN_MULTICAST) {
2419 if (IN_DEV_MFORWARD(in_dev) &&
2420 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2421 rth->u.dst.input = ip_mr_input;
2422 rth->u.dst.output = ip_mc_output;
2425 #endif
2428 rt_set_nexthop(rth, res, 0);
2430 rth->rt_flags = flags;
2432 *result = rth;
2433 cleanup:
2434 /* release the work reference to the inet device */
2435 in_dev_put(in_dev);
2437 return err;
2440 static int ip_mkroute_output(struct rtable **rp,
2441 struct fib_result *res,
2442 const struct flowi *fl,
2443 const struct flowi *oldflp,
2444 struct net_device *dev_out,
2445 unsigned flags)
2447 struct rtable *rth = NULL;
2448 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2449 unsigned hash;
2450 if (err == 0) {
2451 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2452 rt_genid(dev_net(dev_out)));
2453 err = rt_intern_hash(hash, rth, rp);
2456 return err;
2460 * Major route resolver routine.
2463 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2464 const struct flowi *oldflp)
2466 u32 tos = RT_FL_TOS(oldflp);
2467 struct flowi fl = { .nl_u = { .ip4_u =
2468 { .daddr = oldflp->fl4_dst,
2469 .saddr = oldflp->fl4_src,
2470 .tos = tos & IPTOS_RT_MASK,
2471 .scope = ((tos & RTO_ONLINK) ?
2472 RT_SCOPE_LINK :
2473 RT_SCOPE_UNIVERSE),
2474 } },
2475 .mark = oldflp->mark,
2476 .iif = net->loopback_dev->ifindex,
2477 .oif = oldflp->oif };
2478 struct fib_result res;
2479 unsigned flags = 0;
2480 struct net_device *dev_out = NULL;
2481 int free_res = 0;
2482 int err;
2485 res.fi = NULL;
2486 #ifdef CONFIG_IP_MULTIPLE_TABLES
2487 res.r = NULL;
2488 #endif
2490 if (oldflp->fl4_src) {
2491 err = -EINVAL;
2492 if (ipv4_is_multicast(oldflp->fl4_src) ||
2493 ipv4_is_lbcast(oldflp->fl4_src) ||
2494 ipv4_is_zeronet(oldflp->fl4_src))
2495 goto out;
2497 /* I removed the check for oif == dev_out->oif here.
2498 It was wrong for two reasons:
2499 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2500 is assigned to multiple interfaces.
2501 2. Moreover, we are allowed to send packets with a saddr
2502 of another iface. --ANK
2505 if (oldflp->oif == 0
2506 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2507 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2508 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2509 dev_out = ip_dev_find(net, oldflp->fl4_src);
2510 if (dev_out == NULL)
2511 goto out;
2513 /* Special hack: the user can direct multicasts
2514 and limited broadcast via the necessary interface
2515 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2516 This hack is not just for fun, it allows
2517 vic, vat and friends to work.
2518 They bind the socket to loopback, set the ttl to zero
2519 and expect that it will work.
2520 From the viewpoint of the routing cache they are broken,
2521 because we are not allowed to build a multicast path
2522 with a loopback source addr (the routing cache
2523 cannot know that the ttl is zero, so the packet
2524 will not leave this host and the route is valid).
2525 Luckily, this hack is a good workaround.
2528 fl.oif = dev_out->ifindex;
2529 goto make_route;
2532 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2533 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2534 dev_out = ip_dev_find(net, oldflp->fl4_src);
2535 if (dev_out == NULL)
2536 goto out;
2537 dev_put(dev_out);
2538 dev_out = NULL;
2543 if (oldflp->oif) {
2544 dev_out = dev_get_by_index(net, oldflp->oif);
2545 err = -ENODEV;
2546 if (dev_out == NULL)
2547 goto out;
2549 /* RACE: Check return value of inet_select_addr instead. */
2550 if (__in_dev_get_rtnl(dev_out) == NULL) {
2551 dev_put(dev_out);
2552 goto out; /* Wrong error code */
2555 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2556 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2557 if (!fl.fl4_src)
2558 fl.fl4_src = inet_select_addr(dev_out, 0,
2559 RT_SCOPE_LINK);
2560 goto make_route;
2562 if (!fl.fl4_src) {
2563 if (ipv4_is_multicast(oldflp->fl4_dst))
2564 fl.fl4_src = inet_select_addr(dev_out, 0,
2565 fl.fl4_scope);
2566 else if (!oldflp->fl4_dst)
2567 fl.fl4_src = inet_select_addr(dev_out, 0,
2568 RT_SCOPE_HOST);
2572 if (!fl.fl4_dst) {
2573 fl.fl4_dst = fl.fl4_src;
2574 if (!fl.fl4_dst)
2575 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2576 if (dev_out)
2577 dev_put(dev_out);
2578 dev_out = net->loopback_dev;
2579 dev_hold(dev_out);
2580 fl.oif = net->loopback_dev->ifindex;
2581 res.type = RTN_LOCAL;
2582 flags |= RTCF_LOCAL;
2583 goto make_route;
2586 if (fib_lookup(net, &fl, &res)) {
2587 res.fi = NULL;
2588 if (oldflp->oif) {
2589 /* Apparently, the routing tables are wrong. Assume
2590 that the destination is on-link.
2592 WHY? DW.
2593 Because we are allowed to send to an iface
2594 even if it has NO routes and NO assigned
2595 addresses. When oif is specified, the routing
2596 tables are looked up with only one purpose:
2597 to catch whether the destination is gatewayed, rather than
2598 direct. Moreover, if MSG_DONTROUTE is set,
2599 we send the packet, ignoring both the routing tables
2600 and the ifaddr state. --ANK
2603 We could do this even if oif is unknown
2604 (as IPv6 likely does), but we do not.
2607 if (fl.fl4_src == 0)
2608 fl.fl4_src = inet_select_addr(dev_out, 0,
2609 RT_SCOPE_LINK);
2610 res.type = RTN_UNICAST;
2611 goto make_route;
2613 if (dev_out)
2614 dev_put(dev_out);
2615 err = -ENETUNREACH;
2616 goto out;
2618 free_res = 1;
2620 if (res.type == RTN_LOCAL) {
2621 if (!fl.fl4_src)
2622 fl.fl4_src = fl.fl4_dst;
2623 if (dev_out)
2624 dev_put(dev_out);
2625 dev_out = net->loopback_dev;
2626 dev_hold(dev_out);
2627 fl.oif = dev_out->ifindex;
2628 if (res.fi)
2629 fib_info_put(res.fi);
2630 res.fi = NULL;
2631 flags |= RTCF_LOCAL;
2632 goto make_route;
2635 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2636 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2637 fib_select_multipath(&fl, &res);
2638 else
2639 #endif
2640 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2641 fib_select_default(net, &fl, &res);
2643 if (!fl.fl4_src)
2644 fl.fl4_src = FIB_RES_PREFSRC(res);
2646 if (dev_out)
2647 dev_put(dev_out);
2648 dev_out = FIB_RES_DEV(res);
2649 dev_hold(dev_out);
2650 fl.oif = dev_out->ifindex;
2653 make_route:
2654 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2657 if (free_res)
2658 fib_res_put(&res);
2659 if (dev_out)
2660 dev_put(dev_out);
2661 out: return err;
2664 int __ip_route_output_key(struct net *net, struct rtable **rp,
2665 const struct flowi *flp)
2667 unsigned hash;
2668 struct rtable *rth;
2670 if (!rt_caching(net))
2671 goto slow_output;
2673 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2675 rcu_read_lock_bh();
2676 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2677 rth = rcu_dereference(rth->u.dst.rt_next)) {
2678 if (rth->fl.fl4_dst == flp->fl4_dst &&
2679 rth->fl.fl4_src == flp->fl4_src &&
2680 rth->fl.iif == 0 &&
2681 rth->fl.oif == flp->oif &&
2682 rth->fl.mark == flp->mark &&
2683 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2684 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2685 net_eq(dev_net(rth->u.dst.dev), net) &&
2686 !rt_is_expired(rth)) {
2687 dst_use(&rth->u.dst, jiffies);
2688 RT_CACHE_STAT_INC(out_hit);
2689 rcu_read_unlock_bh();
2690 *rp = rth;
2691 return 0;
2693 RT_CACHE_STAT_INC(out_hlist_search);
2695 rcu_read_unlock_bh();
2697 slow_output:
2698 return ip_route_output_slow(net, rp, flp);
2701 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2703 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2707 static struct dst_ops ipv4_dst_blackhole_ops = {
2708 .family = AF_INET,
2709 .protocol = cpu_to_be16(ETH_P_IP),
2710 .destroy = ipv4_dst_destroy,
2711 .check = ipv4_dst_check,
2712 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2713 .entries = ATOMIC_INIT(0),
2717 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2719 struct rtable *ort = *rp;
2720 struct rtable *rt = (struct rtable *)
2721 dst_alloc(&ipv4_dst_blackhole_ops);
2723 if (rt) {
2724 struct dst_entry *new = &rt->u.dst;
2726 atomic_set(&new->__refcnt, 1);
2727 new->__use = 1;
2728 new->input = dst_discard;
2729 new->output = dst_discard;
2730 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2732 new->dev = ort->u.dst.dev;
2733 if (new->dev)
2734 dev_hold(new->dev);
2736 rt->fl = ort->fl;
2738 rt->idev = ort->idev;
2739 if (rt->idev)
2740 in_dev_hold(rt->idev);
2741 rt->rt_genid = rt_genid(net);
2742 rt->rt_flags = ort->rt_flags;
2743 rt->rt_type = ort->rt_type;
2744 rt->rt_dst = ort->rt_dst;
2745 rt->rt_src = ort->rt_src;
2746 rt->rt_iif = ort->rt_iif;
2747 rt->rt_gateway = ort->rt_gateway;
2748 rt->rt_spec_dst = ort->rt_spec_dst;
2749 rt->peer = ort->peer;
2750 if (rt->peer)
2751 atomic_inc(&rt->peer->refcnt);
2753 dst_free(new);
2756 dst_release(&(*rp)->u.dst);
2757 *rp = rt;
2758 return (rt ? 0 : -ENOMEM);
2761 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2762 struct sock *sk, int flags)
2764 int err;
2766 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2767 return err;
2769 if (flp->proto) {
2770 if (!flp->fl4_src)
2771 flp->fl4_src = (*rp)->rt_src;
2772 if (!flp->fl4_dst)
2773 flp->fl4_dst = (*rp)->rt_dst;
2774 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2775 flags ? XFRM_LOOKUP_WAIT : 0);
2776 if (err == -EREMOTE)
2777 err = ipv4_dst_blackhole(net, rp, flp);
2779 return err;
2782 return 0;
2785 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2787 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2789 return ip_route_output_flow(net, rp, flp, NULL, 0);
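/*
 * Editor's illustrative sketch (not part of the original source): a typical
 * output lookup through ip_route_output_key().  example_route_to() is a
 * hypothetical helper; real callers (e.g. ICMP, ARP) follow the same shape.
 *
 *	static struct rtable *example_route_to(struct net *net,
 *					       __be32 daddr, __be32 saddr, int oif)
 *	{
 *		struct rtable *rt;
 *		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
 *							 .saddr = saddr } },
 *				    .oif = oif };
 *
 *		if (ip_route_output_key(net, &rt, &fl))
 *			return NULL;
 *		// ... transmit via rt->u.dst ...
 *		// The caller drops the reference with ip_rt_put(rt) when done.
 *		return rt;
 *	}
 */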
2792 static int rt_fill_info(struct net *net,
2793 struct sk_buff *skb, u32 pid, u32 seq, int event,
2794 int nowait, unsigned int flags)
2796 struct rtable *rt = skb->rtable;
2797 struct rtmsg *r;
2798 struct nlmsghdr *nlh;
2799 long expires;
2800 u32 id = 0, ts = 0, tsage = 0, error;
2802 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2803 if (nlh == NULL)
2804 return -EMSGSIZE;
2806 r = nlmsg_data(nlh);
2807 r->rtm_family = AF_INET;
2808 r->rtm_dst_len = 32;
2809 r->rtm_src_len = 0;
2810 r->rtm_tos = rt->fl.fl4_tos;
2811 r->rtm_table = RT_TABLE_MAIN;
2812 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2813 r->rtm_type = rt->rt_type;
2814 r->rtm_scope = RT_SCOPE_UNIVERSE;
2815 r->rtm_protocol = RTPROT_UNSPEC;
2816 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2817 if (rt->rt_flags & RTCF_NOTIFY)
2818 r->rtm_flags |= RTM_F_NOTIFY;
2820 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2822 if (rt->fl.fl4_src) {
2823 r->rtm_src_len = 32;
2824 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2826 if (rt->u.dst.dev)
2827 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2828 #ifdef CONFIG_NET_CLS_ROUTE
2829 if (rt->u.dst.tclassid)
2830 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2831 #endif
2832 if (rt->fl.iif)
2833 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2834 else if (rt->rt_src != rt->fl.fl4_src)
2835 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2837 if (rt->rt_dst != rt->rt_gateway)
2838 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2840 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2841 goto nla_put_failure;
2843 error = rt->u.dst.error;
2844 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2845 if (rt->peer) {
2846 id = rt->peer->ip_id_count;
2847 if (rt->peer->tcp_ts_stamp) {
2848 ts = rt->peer->tcp_ts;
2849 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2853 if (rt->fl.iif) {
2854 #ifdef CONFIG_IP_MROUTE
2855 __be32 dst = rt->rt_dst;
2857 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2858 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2859 int err = ipmr_get_route(net, skb, r, nowait);
2860 if (err <= 0) {
2861 if (!nowait) {
2862 if (err == 0)
2863 return 0;
2864 goto nla_put_failure;
2865 } else {
2866 if (err == -EMSGSIZE)
2867 goto nla_put_failure;
2868 error = err;
2871 } else
2872 #endif
2873 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2876 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2877 expires, error) < 0)
2878 goto nla_put_failure;
2880 return nlmsg_end(skb, nlh);
2882 nla_put_failure:
2883 nlmsg_cancel(skb, nlh);
2884 return -EMSGSIZE;
2887 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2889 struct net *net = sock_net(in_skb->sk);
2890 struct rtmsg *rtm;
2891 struct nlattr *tb[RTA_MAX+1];
2892 struct rtable *rt = NULL;
2893 __be32 dst = 0;
2894 __be32 src = 0;
2895 u32 iif;
2896 int err;
2897 struct sk_buff *skb;
2899 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2900 if (err < 0)
2901 goto errout;
2903 rtm = nlmsg_data(nlh);
2905 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2906 if (skb == NULL) {
2907 err = -ENOBUFS;
2908 goto errout;
2911 /* Reserve room for dummy headers; this skb can pass
2912 through a good chunk of the routing engine.
2914 skb_reset_mac_header(skb);
2915 skb_reset_network_header(skb);
2917 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2918 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2919 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2921 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2922 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2923 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2925 if (iif) {
2926 struct net_device *dev;
2928 dev = __dev_get_by_index(net, iif);
2929 if (dev == NULL) {
2930 err = -ENODEV;
2931 goto errout_free;
2934 skb->protocol = htons(ETH_P_IP);
2935 skb->dev = dev;
2936 local_bh_disable();
2937 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2938 local_bh_enable();
2940 rt = skb->rtable;
2941 if (err == 0 && rt->u.dst.error)
2942 err = -rt->u.dst.error;
2943 } else {
2944 struct flowi fl = {
2945 .nl_u = {
2946 .ip4_u = {
2947 .daddr = dst,
2948 .saddr = src,
2949 .tos = rtm->rtm_tos,
2952 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2954 err = ip_route_output_key(net, &rt, &fl);
2957 if (err)
2958 goto errout_free;
2960 skb->rtable = rt;
2961 if (rtm->rtm_flags & RTM_F_NOTIFY)
2962 rt->rt_flags |= RTCF_NOTIFY;
2964 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2965 RTM_NEWROUTE, 0, 0);
2966 if (err <= 0)
2967 goto errout_free;
2969 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2970 errout:
2971 return err;
2973 errout_free:
2974 kfree_skb(skb);
2975 goto errout;
2978 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2980 struct rtable *rt;
2981 int h, s_h;
2982 int idx, s_idx;
2983 struct net *net;
2985 net = sock_net(skb->sk);
2987 s_h = cb->args[0];
2988 if (s_h < 0)
2989 s_h = 0;
2990 s_idx = idx = cb->args[1];
2991 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2992 if (!rt_hash_table[h].chain)
2993 continue;
2994 rcu_read_lock_bh();
2995 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2996 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2997 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2998 continue;
2999 if (rt_is_expired(rt))
3000 continue;
3001 skb->dst = dst_clone(&rt->u.dst);
3002 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3003 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3004 1, NLM_F_MULTI) <= 0) {
3005 dst_release(xchg(&skb->dst, NULL));
3006 rcu_read_unlock_bh();
3007 goto done;
3009 dst_release(xchg(&skb->dst, NULL));
3011 rcu_read_unlock_bh();
3014 done:
3015 cb->args[0] = h;
3016 cb->args[1] = idx;
3017 return skb->len;
3020 void ip_rt_multicast_event(struct in_device *in_dev)
3022 rt_cache_flush(dev_net(in_dev->dev), 0);
3025 #ifdef CONFIG_SYSCTL
3026 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3027 struct file *filp, void __user *buffer,
3028 size_t *lenp, loff_t *ppos)
3030 if (write) {
3031 int flush_delay;
3032 ctl_table ctl;
3033 struct net *net;
3035 memcpy(&ctl, __ctl, sizeof(ctl));
3036 ctl.data = &flush_delay;
3037 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
3039 net = (struct net *)__ctl->extra1;
3040 rt_cache_flush(net, flush_delay);
3041 return 0;
3044 return -EINVAL;
3047 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3048 void __user *oldval,
3049 size_t __user *oldlenp,
3050 void __user *newval,
3051 size_t newlen)
3053 int delay;
3054 struct net *net;
3055 if (newlen != sizeof(int))
3056 return -EINVAL;
3057 if (get_user(delay, (int __user *)newval))
3058 return -EFAULT;
3059 net = (struct net *)table->extra1;
3060 rt_cache_flush(net, delay);
3061 return 0;
3064 static void rt_secret_reschedule(int old)
3066 struct net *net;
3067 int new = ip_rt_secret_interval;
3068 int diff = new - old;
3070 if (!diff)
3071 return;
3073 rtnl_lock();
3074 for_each_net(net) {
3075 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3077 if (!new)
3078 continue;
3080 if (deleted) {
3081 long time = net->ipv4.rt_secret_timer.expires - jiffies;
3083 if (time <= 0 || (time += diff) <= 0)
3084 time = 0;
3086 net->ipv4.rt_secret_timer.expires = time;
3087 } else
3088 net->ipv4.rt_secret_timer.expires = new;
3090 net->ipv4.rt_secret_timer.expires += jiffies;
3091 add_timer(&net->ipv4.rt_secret_timer);
3093 rtnl_unlock();
3096 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3097 struct file *filp,
3098 void __user *buffer, size_t *lenp,
3099 loff_t *ppos)
3101 int old = ip_rt_secret_interval;
3102 int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3104 rt_secret_reschedule(old);
3106 return ret;
3109 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3110 void __user *oldval,
3111 size_t __user *oldlenp,
3112 void __user *newval,
3113 size_t newlen)
3115 int old = ip_rt_secret_interval;
3116 int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3118 rt_secret_reschedule(old);
3120 return ret;
3123 static ctl_table ipv4_route_table[] = {
3125 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
3126 .procname = "gc_thresh",
3127 .data = &ipv4_dst_ops.gc_thresh,
3128 .maxlen = sizeof(int),
3129 .mode = 0644,
3130 .proc_handler = proc_dointvec,
3133 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
3134 .procname = "max_size",
3135 .data = &ip_rt_max_size,
3136 .maxlen = sizeof(int),
3137 .mode = 0644,
3138 .proc_handler = proc_dointvec,
3141 /* Deprecated. Use gc_min_interval_ms */
3143 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3144 .procname = "gc_min_interval",
3145 .data = &ip_rt_gc_min_interval,
3146 .maxlen = sizeof(int),
3147 .mode = 0644,
3148 .proc_handler = proc_dointvec_jiffies,
3149 .strategy = sysctl_jiffies,
3152 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3153 .procname = "gc_min_interval_ms",
3154 .data = &ip_rt_gc_min_interval,
3155 .maxlen = sizeof(int),
3156 .mode = 0644,
3157 .proc_handler = proc_dointvec_ms_jiffies,
3158 .strategy = sysctl_ms_jiffies,
3161 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3162 .procname = "gc_timeout",
3163 .data = &ip_rt_gc_timeout,
3164 .maxlen = sizeof(int),
3165 .mode = 0644,
3166 .proc_handler = proc_dointvec_jiffies,
3167 .strategy = sysctl_jiffies,
3170 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3171 .procname = "gc_interval",
3172 .data = &ip_rt_gc_interval,
3173 .maxlen = sizeof(int),
3174 .mode = 0644,
3175 .proc_handler = proc_dointvec_jiffies,
3176 .strategy = sysctl_jiffies,
3179 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3180 .procname = "redirect_load",
3181 .data = &ip_rt_redirect_load,
3182 .maxlen = sizeof(int),
3183 .mode = 0644,
3184 .proc_handler = proc_dointvec,
3187 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3188 .procname = "redirect_number",
3189 .data = &ip_rt_redirect_number,
3190 .maxlen = sizeof(int),
3191 .mode = 0644,
3192 .proc_handler = proc_dointvec,
3195 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3196 .procname = "redirect_silence",
3197 .data = &ip_rt_redirect_silence,
3198 .maxlen = sizeof(int),
3199 .mode = 0644,
3200 .proc_handler = proc_dointvec,
3203 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3204 .procname = "error_cost",
3205 .data = &ip_rt_error_cost,
3206 .maxlen = sizeof(int),
3207 .mode = 0644,
3208 .proc_handler = proc_dointvec,
3211 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3212 .procname = "error_burst",
3213 .data = &ip_rt_error_burst,
3214 .maxlen = sizeof(int),
3215 .mode = 0644,
3216 .proc_handler = proc_dointvec,
3219 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3220 .procname = "gc_elasticity",
3221 .data = &ip_rt_gc_elasticity,
3222 .maxlen = sizeof(int),
3223 .mode = 0644,
3224 .proc_handler = proc_dointvec,
3227 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3228 .procname = "mtu_expires",
3229 .data = &ip_rt_mtu_expires,
3230 .maxlen = sizeof(int),
3231 .mode = 0644,
3232 .proc_handler = proc_dointvec_jiffies,
3233 .strategy = sysctl_jiffies,
3236 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3237 .procname = "min_pmtu",
3238 .data = &ip_rt_min_pmtu,
3239 .maxlen = sizeof(int),
3240 .mode = 0644,
3241 .proc_handler = proc_dointvec,
3244 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3245 .procname = "min_adv_mss",
3246 .data = &ip_rt_min_advmss,
3247 .maxlen = sizeof(int),
3248 .mode = 0644,
3249 .proc_handler = proc_dointvec,
3252 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3253 .procname = "secret_interval",
3254 .data = &ip_rt_secret_interval,
3255 .maxlen = sizeof(int),
3256 .mode = 0644,
3257 .proc_handler = ipv4_sysctl_rt_secret_interval,
3258 .strategy = ipv4_sysctl_rt_secret_interval_strategy,
3260 { .ctl_name = 0 }
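/*
 * Editor's note (not part of the original source): the table above is
 * registered under the "net/ipv4/route" sysctl path, so e.g. gc_thresh is
 * tunable as /proc/sys/net/ipv4/route/gc_thresh (net.ipv4.route.gc_thresh).
 */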
3263 static struct ctl_table empty[1];
3265 static struct ctl_table ipv4_skeleton[] =
3267 { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3268 .mode = 0555, .child = ipv4_route_table},
3269 { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3270 .mode = 0555, .child = empty},
3274 static __net_initdata struct ctl_path ipv4_path[] = {
3275 { .procname = "net", .ctl_name = CTL_NET, },
3276 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3277 { },
3280 static struct ctl_table ipv4_route_flush_table[] = {
3282 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3283 .procname = "flush",
3284 .maxlen = sizeof(int),
3285 .mode = 0200,
3286 .proc_handler = ipv4_sysctl_rtcache_flush,
3287 .strategy = ipv4_sysctl_rtcache_flush_strategy,
3289 { .ctl_name = 0 },
3292 static __net_initdata struct ctl_path ipv4_route_path[] = {
3293 { .procname = "net", .ctl_name = CTL_NET, },
3294 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3295 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3296 { },
3299 static __net_init int sysctl_route_net_init(struct net *net)
3301 struct ctl_table *tbl;
3303 tbl = ipv4_route_flush_table;
3304 if (net != &init_net) {
3305 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3306 if (tbl == NULL)
3307 goto err_dup;
3309 tbl[0].extra1 = net;
3311 net->ipv4.route_hdr =
3312 register_net_sysctl_table(net, ipv4_route_path, tbl);
3313 if (net->ipv4.route_hdr == NULL)
3314 goto err_reg;
3315 return 0;
3317 err_reg:
3318 if (tbl != ipv4_route_flush_table)
3319 kfree(tbl);
3320 err_dup:
3321 return -ENOMEM;
3324 static __net_exit void sysctl_route_net_exit(struct net *net)
3326 struct ctl_table *tbl;
3328 tbl = net->ipv4.route_hdr->ctl_table_arg;
3329 unregister_net_sysctl_table(net->ipv4.route_hdr);
3330 BUG_ON(tbl == ipv4_route_flush_table);
3331 kfree(tbl);
3334 static __net_initdata struct pernet_operations sysctl_route_ops = {
3335 .init = sysctl_route_net_init,
3336 .exit = sysctl_route_net_exit,
3338 #endif
3341 static __net_init int rt_secret_timer_init(struct net *net)
3343 atomic_set(&net->ipv4.rt_genid,
3344 (int) ((num_physpages ^ (num_physpages>>8)) ^
3345 (jiffies ^ (jiffies >> 7))));
3347 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3348 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3349 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3351 if (ip_rt_secret_interval) {
3352 net->ipv4.rt_secret_timer.expires =
3353 jiffies + net_random() % ip_rt_secret_interval +
3354 ip_rt_secret_interval;
3355 add_timer(&net->ipv4.rt_secret_timer);
3357 return 0;
3360 static __net_exit void rt_secret_timer_exit(struct net *net)
3362 del_timer_sync(&net->ipv4.rt_secret_timer);
3365 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3366 .init = rt_secret_timer_init,
3367 .exit = rt_secret_timer_exit,
3371 #ifdef CONFIG_NET_CLS_ROUTE
3372 struct ip_rt_acct *ip_rt_acct __read_mostly;
3373 #endif /* CONFIG_NET_CLS_ROUTE */
3375 static __initdata unsigned long rhash_entries;
3376 static int __init set_rhash_entries(char *str)
3378 if (!str)
3379 return 0;
3380 rhash_entries = simple_strtoul(str, &str, 0);
3381 return 1;
3383 __setup("rhash_entries=", set_rhash_entries);
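/*
 * Editor's note (not part of the original source): the handler above parses
 * a kernel command-line override for the route-cache hash size, e.g. booting
 * with "rhash_entries=262144"; when it is absent, alloc_large_system_hash()
 * below sizes the table from available memory.
 */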
3385 int __init ip_rt_init(void)
3387 int rc = 0;
3389 #ifdef CONFIG_NET_CLS_ROUTE
3390 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3391 if (!ip_rt_acct)
3392 panic("IP: failed to allocate ip_rt_acct\n");
3393 #endif
3395 ipv4_dst_ops.kmem_cachep =
3396 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3397 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3399 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3401 rt_hash_table = (struct rt_hash_bucket *)
3402 alloc_large_system_hash("IP route cache",
3403 sizeof(struct rt_hash_bucket),
3404 rhash_entries,
3405 (num_physpages >= 128 * 1024) ?
3406 15 : 17,
3408 &rt_hash_log,
3409 &rt_hash_mask,
3410 rhash_entries ? 0 : 512 * 1024);
3411 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3412 rt_hash_lock_init();
3414 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3415 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3417 devinet_init();
3418 ip_fib_init();
3420 /* All the timers started at system startup tend
3421 to synchronize. Perturb them a bit.
3423 schedule_delayed_work(&expires_work,
3424 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3426 if (register_pernet_subsys(&rt_secret_timer_ops))
3427 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3429 if (ip_rt_proc_init())
3430 printk(KERN_ERR "Unable to create route proc files\n");
3431 #ifdef CONFIG_XFRM
3432 xfrm_init();
3433 xfrm4_init();
3434 #endif
3435 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3437 #ifdef CONFIG_SYSCTL
3438 register_pernet_subsys(&sysctl_route_ops);
3439 #endif
3440 return rc;
3443 #ifdef CONFIG_SYSCTL
3445 * We really need to sanitize the damn ipv4 init order, then all
3446 * this nonsense will go away.
3448 void __init ip_static_sysctl_init(void)
3450 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3452 #endif
3454 EXPORT_SYMBOL(__ip_select_ident);
3455 EXPORT_SYMBOL(ip_route_input);
3456 EXPORT_SYMBOL(ip_route_output_key);